## gmm sandbox

focus on making modules and importing

use kb's stuff as a base

for each log in dir:

1. load logs 
2. process logs
3. construct gmm
4. run gmm
5. plot results

look for messed up logs

consider results and think about what it means

In [1]:
import os
from pathlib import Path

# run cell once or path will break
print(os.getcwd())
os.chdir(Path(os.getcwd()).parent)
print(os.getcwd())

c:\Users\mrhoa\Desktop\gmm_sandbox\notebooks
c:\Users\mrhoa\Desktop\gmm_sandbox


In [2]:
from common.input import load_log
from common.model import interval, scale, pca, pca_rank, gmm
from common.plot import plot_pca_2D, plot_pca_3D, plot_pca_rank, plot_curves_prob
from common.output import combine_curves_prob, combine_pca_prob

In [3]:
# load log from data and init data dictionary
cols = ["SP", "GR", "RT90", "NPHI_COMP", "RHOB", "PE"]
data = "./logs/Lazy_D_400222042.las"
lazy = load_log(data, cols)

# grab interval of log by index
lazy = interval(lazy, top=8000, bot=9000)

# # standard scale log with defaults
lazy = scale(lazy)

# # run 3 component pca with defaults
lazy = pca(lazy)

# # run feature rank on pca
lazy = pca_rank(lazy)

# # run gmm on scaled curves
lazy = gmm(lazy, n=4)

# # merge gmm clusters to base curves 
lazy = combine_curves_prob(lazy)

# # merge gmm clusters to pca components
lazy = combine_pca_prob(lazy)

LOG LOADED
PCA COMPLETE
GMM COMPLETE
LOGS AND CLUSTERS MERGED
PCA AND CLUSTERS MERGED


In [4]:
for k,v in lazy.items():
    if not isinstance(v, str):
        print(f"{k}: {type(v)}")
    else:
        print(f"{k}: {v}")

print("-"*50)

# TODO: work into loop above
print(f"base df: {lazy['base_curves'].shape}")
print(f"soft arr: {lazy['soft_clusters'].shape}")
print(f"hard arr: {lazy['hard_clusters'].shape}")
print(f"pca arr: {lazy['pca_curves'].shape}")
print(f"pca expvar: {lazy['pca_curves'].shape}")
print(f"pca rank: {lazy['pca_rank'].shape}")
print(f"merged df: {lazy['merged_curves'].shape}")
print(f"merged pca: {lazy['merged_pca'].shape}")

las: <class 'lasio.las.LASFile'>
well_name: LAZY-D ZN 3-9
base_curves: <class 'pandas.core.frame.DataFrame'>
interval_top: <class 'int'>
interval_bot: <class 'int'>
scaled_curves: <class 'numpy.ndarray'>
pca_curves: <class 'numpy.ndarray'>
pca_expvar: <class 'numpy.ndarray'>
pca_rank: <class 'pandas.core.frame.DataFrame'>
soft_clusters: <class 'numpy.ndarray'>
hard_clusters: <class 'numpy.ndarray'>
cluster_n: <class 'int'>
merged_curves: <class 'pandas.core.frame.DataFrame'>
merged_pca: <class 'pandas.core.frame.DataFrame'>
--------------------------------------------------
base df: (2001, 6)
soft arr: (2001, 4)
hard arr: (2001,)
pca arr: (2001, 6)
pca expvar: (2001, 6)
pca rank: (6, 6)
merged df: (2001, 8)
merged pca: (2001, 8)


In [5]:
pca_var = lazy["pca_expvar"]
print(pca_var[0:2]*100)
print(pca_var*100)

pca_var[0:2].sum()

[37.95522952 35.45408361]
[37.95522952 35.45408361 15.14142878  6.93065833  2.90519228  1.61340748]


0.7340931313391792

In [6]:
df = lazy["merged_curves"]
test = list(range(0,len(df["soft_clusters"].iloc[0])))

In [7]:
df["soft_clusters"].iloc[0]

[1.0039525895920558e-157, 3.585934934702599e-69, 1.0, 1.1861264879369834e-41]

In [11]:
df_pca = lazy["merged_pca"]
df_pca["dept"].astype(str).tolist()


['8000.0',
 '8000.5',
 '8001.0',
 '8001.5',
 '8002.0',
 '8002.5',
 '8003.0',
 '8003.5',
 '8004.0',
 '8004.5',
 '8005.0',
 '8005.5',
 '8006.0',
 '8006.5',
 '8007.0',
 '8007.5',
 '8008.0',
 '8008.5',
 '8009.0',
 '8009.5',
 '8010.0',
 '8010.5',
 '8011.0',
 '8011.5',
 '8012.0',
 '8012.5',
 '8013.0',
 '8013.5',
 '8014.0',
 '8014.5',
 '8015.0',
 '8015.5',
 '8016.0',
 '8016.5',
 '8017.0',
 '8017.5',
 '8018.0',
 '8018.5',
 '8019.0',
 '8019.5',
 '8020.0',
 '8020.5',
 '8021.0',
 '8021.5',
 '8022.0',
 '8022.5',
 '8023.0',
 '8023.5',
 '8024.0',
 '8024.5',
 '8025.0',
 '8025.5',
 '8026.0',
 '8026.5',
 '8027.0',
 '8027.5',
 '8028.0',
 '8028.5',
 '8029.0',
 '8029.5',
 '8030.0',
 '8030.5',
 '8031.0',
 '8031.5',
 '8032.0',
 '8032.5',
 '8033.0',
 '8033.5',
 '8034.0',
 '8034.5',
 '8035.0',
 '8035.5',
 '8036.0',
 '8036.5',
 '8037.0',
 '8037.5',
 '8038.0',
 '8038.5',
 '8039.0',
 '8039.5',
 '8040.0',
 '8040.5',
 '8041.0',
 '8041.5',
 '8042.0',
 '8042.5',
 '8043.0',
 '8043.5',
 '8044.0',
 '8044.5',
 '8045.0',

In [None]:
depth = lazy["base_curves"].index.values.reshape((-1,1))
depth

In [None]:
# plot 2D pca with clusters
plot_pca_2D(lazy, key="merged_pca")

# plot 3D pca with clusters
plot_pca_3D(lazy, key="merged_pca")

# plot pca feature rank
plot_pca_rank(lazy, key="pca_rank")

# FIXME: adjust curve names in plot
# plot base log curves and clusters in log view
plot_curves_prob(lazy)

In [None]:
import plotly
print(plotly.__version__)
import numpy as np
import plotly.graph_objects as go

def discrete_colorscale(bvals, colors):
    """
    bvals - list of values bounding intervals/ranges of interest
    colors - list of rgb or hex colorcodes for values in [bvals[k], bvals[k+1]],0<=k < len(bvals)-1
    returns the plotly  discrete colorscale
    """
    if len(bvals) != len(colors)+1:
        raise ValueError('len(boundary values) should be equal to  len(colors)+1')
    bvals = sorted(bvals)     
    nvals = [(v-bvals[0])/(bvals[-1]-bvals[0]) for v in bvals]  #normalized values
    
    dcolorscale = [] #discrete colorscale
    for k in range(len(colors)):
        dcolorscale.extend([[nvals[k], colors[k]], [nvals[k+1], colors[k]]])
    return dcolorscale 

bvals = [2, 15, 40, 65, 90]

# NOTE: still need to define the colors manually...
colors = ['#09ffff', '#19d3f3', '#e763fa' , '#ab63fa']

dcolorsc = discrete_colorscale(bvals, colors)

bvals = np.array(bvals)

tickvals = [np.mean(bvals[k:k+2]) for k in range(len(bvals)-1)] #position with respect to bvals where ticktext is displayed
ticktext = [f'<{bvals[1]}'] + [f'{bvals[k]}-{bvals[k+1]}' for k in range(1, len(bvals)-2)]+[f'>{bvals[-2]}']

In [None]:
z = np.random.randint(bvals[0],  bvals[-1]+1, size=(20, 20))

heatmap = go.Heatmap(z=z, 
                     colorscale = dcolorsc, 
                     colorbar = dict(thickness=25, 
                                     tickvals=tickvals, 
                                     ticktext=ticktext))

fig = go.Figure(data=[heatmap])
fig.update_layout(width=500, height=500)

fig.show()

In [None]:
import plotly.express as px

g10 = px.colors.qualitative.G10
print(type(g10))

unique = df["hard_clusters"].unique()

colors = [g10[u] for u in unique]
print(colors)