## gmm sandbox

focus on making modules and importing

use kb's stuff as a base

for each log in dir:

1. load logs 
2. process logs
3. construct gmm
4. run gmm
5. plot results

look for messed up logs

consider results and think about what it means

In [1]:
import os
from pathlib import Path

# run cell once or path will break
print(os.getcwd())
os.chdir(Path(os.getcwd()).parent)
print(os.getcwd())

/home/mlr/Desktop/gmm_sandbox/notebooks
/home/mlr/Desktop/gmm_sandbox


In [5]:
from common.input import load_log
from common.model import interval, scale, pca, pca_rank, gmm
from common.plot import plot_pca_2D, plot_pca_3D, plot_pca_rank, plot_curves_prob
from common.output import combine_curves_prob, combine_pca_prob

In [6]:
# load log from data and init data dictionary
cols = ["SP", "GR", "RT90", "NPHI_COMP", "RHOB", "PE"]
data = "./logs/Lazy_D_400222042.las"
lazy = load_log(data, cols)

# grab interval of log by index
lazy = interval(lazy, top=8000, bot=9000)

# # standard scale log with defaults
lazy = scale(lazy)

# # run 3 component pca with defaults
lazy = pca(lazy)

# # run feature rank on pca
lazy = pca_rank(lazy)

# # run gmm on scaled curves
lazy = gmm(lazy, n=2)

# # merge gmm clusters to base curves 
lazy = combine_curves_prob(lazy)

# # merge gmm clusters to pca components
lazy = combine_pca_prob(lazy)

LOG LOADED
PCA COMPLETE
GMM COMPLETE
LOGS AND CLUSTERS MERGED
PCA AND CLUSTERS MERGED


In [7]:
for k,v in lazy.items():
    if not isinstance(v, str):
        print(f"{k}: {type(v)}")
    else:
        print(f"{k}: {v}")

print("-"*50)

# TODO: work into loop above
print(f"base df: {lazy['base_curves'].shape}")
print(f"soft arr: {lazy['soft_clusters'].shape}")
print(f"hard arr: {lazy['hard_clusters'].shape}")
print(f"pca arr: {lazy['pca_curves'].shape}")
print(f"pca expvar: {lazy['pca_curves'].shape}")
print(f"pca rank: {lazy['pca_rank'].shape}")
print(f"merged df: {lazy['merged_curves'].shape}")
print(f"merged pca: {lazy['merged_pca'].shape}")

las: <class 'lasio.las.LASFile'>
well_name: LAZY-D ZN 3-9
base_curves: <class 'pandas.core.frame.DataFrame'>
interval_top: <class 'int'>
interval_bot: <class 'int'>
scaled_curves: <class 'numpy.ndarray'>
pca_curves: <class 'numpy.ndarray'>
pca_expvar: <class 'numpy.ndarray'>
pca_rank: <class 'pandas.core.frame.DataFrame'>
soft_clusters: <class 'numpy.ndarray'>
hard_clusters: <class 'numpy.ndarray'>
cluster_n: <class 'int'>
merged_curves: <class 'pandas.core.frame.DataFrame'>
merged_pca: <class 'pandas.core.frame.DataFrame'>
--------------------------------------------------
base df: (2001, 6)
soft arr: (2001, 2)
hard arr: (2001,)
pca arr: (2001, 6)
pca expvar: (2001, 6)
pca rank: (6, 6)
merged df: (2001, 8)
merged pca: (2001, 8)


In [8]:
df = lazy["merged_curves"]
test = list(range(0,len(df["soft_clusters"].iloc[0])))
[str(t) for t in test]

['0', '1']

In [9]:
df["soft_clusters"].iloc[0]

[0.9999999999791986, 2.0801267322674972e-11]

In [10]:
df.head()

Unnamed: 0_level_0,SP,GR,RT90,NPHI_COMP,RHOB,PE,soft_clusters,hard_clusters
DEPT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8000.0,60.0192,109.0224,3.503,0.2423,2.5642,3.0618,"[0.9999999999791986, 2.0801267322674972e-11]",0
8000.5,59.712,110.2059,3.5513,0.2369,2.5743,3.0574,"[0.9999999999883162, 1.1683742195077445e-11]",0
8001.0,59.3496,105.2788,3.6874,0.2282,2.5809,3.2409,"[0.9999999999913798, 8.620215993188577e-12]",0
8001.5,59.0965,99.6273,3.8342,0.2292,2.5701,3.3143,"[0.9999999999772258, 2.277430636696767e-11]",0
8002.0,58.7665,101.3662,3.9282,0.2203,2.5604,3.1588,"[0.9999999999983471, 1.6527950687074862e-12]",0


In [20]:
df_pca = lazy["merged_pca"]
df_pca.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2001 entries, 0 to 2000
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PC_0          2001 non-null   float64
 1   PC_1          2001 non-null   float64
 2   PC_2          2001 non-null   float64
 3   PC_3          2001 non-null   float64
 4   PC_4          2001 non-null   float64
 5   PC_5          2001 non-null   float64
 6   hard_cluster  2001 non-null   float64
 7   dept          2001 non-null   float64
dtypes: float64(8)
memory usage: 125.2 KB


In [None]:
depth = lazy["base_curves"].index.values.reshape((-1,1))
depth

In [None]:
# plot 2D pca with clusters
plot_pca_2D(lazy, key="merged_pca")

# plot 3D pca with clusters
plot_pca_3D(lazy, key="merged_pca")

# plot pca feature rank
plot_pca_rank(lazy, key="pca_rank")

# FIXME: adjust curve names in plot
# plot base log curves and clusters in log view
plot_curves_prob(lazy)

In [None]:
import plotly
print(plotly.__version__)
import numpy as np
import plotly.graph_objects as go

def discrete_colorscale(bvals, colors):
    """
    bvals - list of values bounding intervals/ranges of interest
    colors - list of rgb or hex colorcodes for values in [bvals[k], bvals[k+1]],0<=k < len(bvals)-1
    returns the plotly  discrete colorscale
    """
    if len(bvals) != len(colors)+1:
        raise ValueError('len(boundary values) should be equal to  len(colors)+1')
    bvals = sorted(bvals)     
    nvals = [(v-bvals[0])/(bvals[-1]-bvals[0]) for v in bvals]  #normalized values
    
    dcolorscale = [] #discrete colorscale
    for k in range(len(colors)):
        dcolorscale.extend([[nvals[k], colors[k]], [nvals[k+1], colors[k]]])
    return dcolorscale 

bvals = [2, 15, 40, 65, 90]

# NOTE: still need to define the colors manually...
colors = ['#09ffff', '#19d3f3', '#e763fa' , '#ab63fa']

dcolorsc = discrete_colorscale(bvals, colors)

bvals = np.array(bvals)

tickvals = [np.mean(bvals[k:k+2]) for k in range(len(bvals)-1)] #position with respect to bvals where ticktext is displayed
ticktext = [f'<{bvals[1]}'] + [f'{bvals[k]}-{bvals[k+1]}' for k in range(1, len(bvals)-2)]+[f'>{bvals[-2]}']

In [None]:
z = np.random.randint(bvals[0],  bvals[-1]+1, size=(20, 20))

heatmap = go.Heatmap(z=z, 
                     colorscale = dcolorsc, 
                     colorbar = dict(thickness=25, 
                                     tickvals=tickvals, 
                                     ticktext=ticktext))

fig = go.Figure(data=[heatmap])
fig.update_layout(width=500, height=500)

fig.show()

In [None]:
import plotly.express as px

g10 = px.colors.qualitative.G10
print(type(g10))

unique = df["hard_clusters"].unique()

colors = [g10[u] for u in unique]
print(colors)