## gmm sandbox

focus on making modules and importing

use kb's stuff as a base

for each log in dir:

1. load logs 
2. process logs
3. construct gmm
4. run gmm
5. plot results

look for messed up logs

consider results and think about what it means

In [1]:
import os
from pathlib import Path

# run cell once or path will break
print(os.getcwd())
os.chdir(Path(os.getcwd()).parent)
print(os.getcwd())

/home/mlr/Desktop/gmm_sandbox/notebooks
/home/mlr/Desktop/gmm_sandbox


In [2]:
from common.load import load_log
from common.model import interval, scale, pca, pca_rank, gmm
# from common.plot import plot_pca_2D, plot_pca_3D, plot_pca_rank, plot_curves_prob
from common.output import combine_curves_prob, combine_pca_prob

In [9]:
# load log from data and init data dictionary
cols = ["SP", "GR", "RT90", "NPHI_COMP", "RHOB", "PE"]
data = "./logs/Lazy_D_400222042.las"
lazy = load_log(data, cols)

# grab interval of log by index
lazy = interval(lazy, top=8000, bot=9000)

# # standard scale log with defaults
lazy = scale(lazy)

# # run 3 component pca with defaults
lazy = pca(lazy)

# # run feature rank on pca
lazy = pca_rank(lazy)

# # run gmm on scaled curves
lazy = gmm(lazy, n=4)

# # merge gmm clusters to base curves 
lazy = combine_curves_prob(lazy)

# # merge gmm clusters to pca components
lazy = combine_pca_prob(lazy)

LOG LOADED
PCA COMPLETE
GMM COMPLETE
LOGS AND CLUSTERS MERGED
PCA AND CLUSTERS MERGED


In [10]:
for k,v in lazy.items():
    if not isinstance(v, str):
        print(f"{k}: {type(v)}")
    else:
        print(f"{k}: {v}")

print("-"*50)

# TODO: work into loop above
print(f"base df: {lazy['base_curves'].shape}")
print(f"soft arr: {lazy['soft_clusters'].shape}")
print(f"hard arr: {lazy['hard_clusters'].shape}")
print(f"pca arr: {lazy['pca_curves'].shape}")
print(f"pca expvar: {lazy['pca_curves'].shape}")
print(f"pca rank: {lazy['pca_rank'].shape}")
print(f"merged df: {lazy['merged_curves'].shape}")
print(f"merged pca: {lazy['merged_pca'].shape}")

las: <class 'lasio.las.LASFile'>
well_name: LAZY-D ZN 3-9
base_curves: <class 'pandas.core.frame.DataFrame'>
interval_top: <class 'int'>
interval_bot: <class 'int'>
scaled_curves: <class 'numpy.ndarray'>
pca_curves: <class 'numpy.ndarray'>
pca_expvar: <class 'numpy.ndarray'>
pca_rank: <class 'pandas.core.frame.DataFrame'>
soft_clusters: <class 'numpy.ndarray'>
hard_clusters: <class 'numpy.ndarray'>
cluster_n: <class 'int'>
merged_curves: <class 'pandas.core.frame.DataFrame'>
merged_pca: <class 'pandas.core.frame.DataFrame'>
--------------------------------------------------
base df: (2001, 6)
soft arr: (2001, 4)
hard arr: (2001,)
pca arr: (2001, 6)
pca expvar: (2001, 6)
pca rank: (6, 6)
merged df: (2001, 8)
merged pca: (2001, 8)


In [25]:
import lasio
import numpy as np

hard_clusters = lazy["hard_clusters"]
las = lazy["las"]
name = lazy["well_name"]
merged_df = lazy["merged_curves"]

print(las.index)
print(hard_clusters)

print(las.index.shape)
print(hard_clusters.shape)

top_idx = np.where(las.index==8000)
bot_idx = np.where(las.index==9000)

print(las.index[top_idx])
print(las.index[bot_idx])

print(merged_df.index.values[0])
print(merged_df.index.values[-1])

# need to make NaN array down to index top and from index bot to td. 
# concat or stack NaN arrays and cluster array
# test that new cluster array has the same depth dimension as las.index

# then merge try and output a modifed LAS

[2295.  2295.5 2296.  ... 9675.  9675.5 9676. ]
[0 0 0 ... 1 1 1]
(14763,)
(2001,)
[8000.]
[9000.]
8000.0
9000.0
(array([11410]),) (array([13410]),)


In [8]:
from common.load import cd

try:
    with cd(os.path.join(os.getcwd(),"logs")):
        print(os.getcwd())
        new_las = las.add_curve("HARD_CLUSTER", hard_clusters, unit="int", descr="zone clusters")
        print(las.data.shape)
        new_las.write(f"MODIFIED_{name}.las")
except Exception as e:
    print(e)

print(os.getcwd())

/home/mlr/Desktop/gmm_sandbox/logs
all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 14763 and the array at index 37 has size 14659
/home/mlr/Desktop/gmm_sandbox


In [30]:
import numpy as np
depths = np.arange(10, 50, 0.5)
synth = np.log10(depths)*5+np.random.random(len(depths))
synth[:8] = np.nan
synth.shape

(80,)

In [24]:
pca_var = lazy["pca_expvar"]*100
print(pca_var[0:2]*100)
print(pca_var*100)

rank_df = lazy["pca_rank"]
cols = rank_df.columns.values.tolist()
print(cols)

print(len(cols)==len(pca_var))

combine = [f"{col} ({var:.1f}%)" for col, var in zip(cols,pca_var)]
print(combine)

[3795.52295228 3545.40836111]
[3795.52295228 3545.40836111 1514.1428778   693.06583307  290.51922796
  161.34074779]
['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
True
['PC1 (38.0%)', 'PC2 (35.5%)', 'PC3 (15.1%)', 'PC4 (6.9%)', 'PC5 (2.9%)', 'PC6 (1.6%)']


In [6]:
df = lazy["merged_curves"]
test = list(range(0,len(df["soft_clusters"].iloc[0])))

In [7]:
df["soft_clusters"].iloc[0]

[1.0039525895920558e-157, 3.585934934702599e-69, 1.0, 1.1861264879369834e-41]

In [7]:
# df_pca = lazy["merged_pca"]
# df_pca["dept"].astype(str).tolist()


In [None]:
depth = lazy["base_curves"].index.values.reshape((-1,1))
depth

In [None]:
# plot 2D pca with clusters
plot_pca_2D(lazy, key="merged_pca")

# plot 3D pca with clusters
plot_pca_3D(lazy, key="merged_pca")

# plot pca feature rank
plot_pca_rank(lazy, key="pca_rank")

# FIXME: adjust curve names in plot
# plot base log curves and clusters in log view
plot_curves_prob(lazy)

In [None]:
import plotly
print(plotly.__version__)
import numpy as np
import plotly.graph_objects as go

def discrete_colorscale(bvals, colors):
    """
    bvals - list of values bounding intervals/ranges of interest
    colors - list of rgb or hex colorcodes for values in [bvals[k], bvals[k+1]],0<=k < len(bvals)-1
    returns the plotly  discrete colorscale
    """
    if len(bvals) != len(colors)+1:
        raise ValueError('len(boundary values) should be equal to  len(colors)+1')
    bvals = sorted(bvals)     
    nvals = [(v-bvals[0])/(bvals[-1]-bvals[0]) for v in bvals]  #normalized values
    
    dcolorscale = [] #discrete colorscale
    for k in range(len(colors)):
        dcolorscale.extend([[nvals[k], colors[k]], [nvals[k+1], colors[k]]])
    return dcolorscale 

bvals = [2, 15, 40, 65, 90]

# NOTE: still need to define the colors manually...
colors = ['#09ffff', '#19d3f3', '#e763fa' , '#ab63fa']

dcolorsc = discrete_colorscale(bvals, colors)

bvals = np.array(bvals)

tickvals = [np.mean(bvals[k:k+2]) for k in range(len(bvals)-1)] #position with respect to bvals where ticktext is displayed
ticktext = [f'<{bvals[1]}'] + [f'{bvals[k]}-{bvals[k+1]}' for k in range(1, len(bvals)-2)]+[f'>{bvals[-2]}']

In [None]:
z = np.random.randint(bvals[0],  bvals[-1]+1, size=(20, 20))

heatmap = go.Heatmap(z=z, 
                     colorscale = dcolorsc, 
                     colorbar = dict(thickness=25, 
                                     tickvals=tickvals, 
                                     ticktext=ticktext))

fig = go.Figure(data=[heatmap])
fig.update_layout(width=500, height=500)

fig.show()

In [None]:
import plotly.express as px

g10 = px.colors.qualitative.G10
print(type(g10))

unique = df["hard_clusters"].unique()

colors = [g10[u] for u in unique]
print(colors)