In [1]:
def make_dwt_vars(wells_df,logs,levels,wavelet='db3'):

    from scipy.interpolate import interp1d
    import pywt
    
    wave= pywt.Wavelet(wavelet)
    max_level = max(levels)
    
    grouped = wells_df.groupby(['Well Name'])
    new_df = pd.DataFrame()
    for key in grouped.groups.keys():
    
        depth = grouped.get_group(key)['Depth']
        temp_df = pd.DataFrame()
        temp_df['Depth'] = depth
        for log in logs:
      
            temp_data = grouped.get_group(key)[log]
        
            for i in range(1,max_level+1):

                cA,cD = pywt.dwt(temp_data,wave,mode='symmetric')
                new_depth = np.linspace(min(depth),max(depth),len(cD))
                f = interp1d(new_depth,cD,kind='nearest')
                
                if (i in levels):
                    temp_df[log + '_cD_step_level_' + str(i)] = f(depth)
    
        temp_df['Well Name'] = [key for _ in range(len(depth))]
        new_df = new_df.append(temp_df)
        
    new_df = new_df.sort_index()
    new_df = new_df.drop(['Well Name','Depth'],axis=1)
    return pd.concat([wells_df,new_df],axis=1)

In [2]:
def make_entropy_vars(wells_df,logs,l_foot):
    
    from skimage.filters.rank import entropy
    from skimage.morphology import rectangle
    from skimage.util import img_as_ubyte
    
    new_df = pd.DataFrame()
    grouped = wells_df.groupby(['Well Name'])
    
    for key in grouped.groups.keys():
    
        depth = grouped.get_group(key)['Depth']
        temp_df = pd.DataFrame()
        temp_df['Depth'] = depth
        
        for log in logs:
      
            temp_data = grouped.get_group(key)[log]       
            footprint = rectangle(l_foot,3)

            image = np.vstack((temp_data,temp_data,temp_data))
            image -= np.median(image) 
            image /= np.max(np.abs(image))
            image = img_as_ubyte(image)

            temp_df[log + '_entropy_foot' + str(l_foot)] = entropy(image,footprint)[0,:]
    
        temp_df['Well Name'] = [key for _ in range(len(depth))]
        new_df = new_df.append(temp_df)
    
    new_df = new_df.sort_index()
    new_df = new_df.drop(['Well Name','Depth'],axis=1)
    return pd.concat([wells_df,new_df],axis=1)

In [3]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

from pandas import set_option
set_option("display.max_rows", 20)
pd.options.mode.chained_assignment = None

filename = 'data/facies_vectors.csv'
training_data = pd.read_csv(filename)
training_data.head(10)

Unnamed: 0,Facies,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
0,3,A1 SH,SHRIMPLIN,2793.0,77.45,0.664,9.9,11.915,4.6,1,1.0
1,3,A1 SH,SHRIMPLIN,2793.5,78.26,0.661,14.2,12.565,4.1,1,0.979
2,3,A1 SH,SHRIMPLIN,2794.0,79.05,0.658,14.8,13.05,3.6,1,0.957
3,3,A1 SH,SHRIMPLIN,2794.5,86.1,0.655,13.9,13.115,3.5,1,0.936
4,3,A1 SH,SHRIMPLIN,2795.0,74.58,0.647,13.5,13.3,3.4,1,0.915
5,3,A1 SH,SHRIMPLIN,2795.5,73.97,0.636,14.0,13.385,3.6,1,0.894
6,3,A1 SH,SHRIMPLIN,2796.0,73.72,0.63,15.6,13.93,3.7,1,0.872
7,3,A1 SH,SHRIMPLIN,2796.5,75.65,0.625,16.5,13.92,3.5,1,0.83
8,3,A1 SH,SHRIMPLIN,2797.0,73.79,0.624,16.2,13.98,3.4,1,0.809
9,3,A1 SH,SHRIMPLIN,2797.5,76.89,0.615,16.9,14.22,3.5,1,0.787


In [4]:
grouped = training_data.groupby(['Well Name'])

In [5]:
new_vars = pd.DataFrame()

for key in grouped.groups.keys():
    
    NM_M = grouped.get_group(key)['NM_M'].values
    
    temp_df = pd.DataFrame()
    temp_df['Depth'] = grouped.get_group(key)['Depth']
    temp_df['Well Name'] = [key for _ in range(len(NM_M))]
        
    dh_1 = np.zeros(len(NM_M))
    dh_2 = np.zeros(len(NM_M))
    count1 = 0
    count2 = 0
    
    for i in range(len(NM_M)):
               
        if NM_M[i] == 1:
            
            count1=0
            count2+=0.5
            dh_2[i] += count2
            
        elif NM_M[i] == 2:
            
            count2=0
            count1+=0.5
            dh_1[i] += count1
    
    temp_df['dh_1'] = dh_1
    temp_df['dh_2'] = dh_2
        
    new_vars = new_vars.append(temp_df)

In [6]:
new_vars = new_vars.sort_index()
new_vars =new_vars.drop(['Well Name','Depth'],axis=1)
training_data =  pd.concat([training_data,new_vars],axis=1)
training_data

Unnamed: 0,Facies,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,dh_1,dh_2
0,3,A1 SH,SHRIMPLIN,2793.0,77.450,0.664,9.900,11.915,4.600,1,1.000,0.0,0.5
1,3,A1 SH,SHRIMPLIN,2793.5,78.260,0.661,14.200,12.565,4.100,1,0.979,0.0,1.0
2,3,A1 SH,SHRIMPLIN,2794.0,79.050,0.658,14.800,13.050,3.600,1,0.957,0.0,1.5
3,3,A1 SH,SHRIMPLIN,2794.5,86.100,0.655,13.900,13.115,3.500,1,0.936,0.0,2.0
4,3,A1 SH,SHRIMPLIN,2795.0,74.580,0.647,13.500,13.300,3.400,1,0.915,0.0,2.5
5,3,A1 SH,SHRIMPLIN,2795.5,73.970,0.636,14.000,13.385,3.600,1,0.894,0.0,3.0
6,3,A1 SH,SHRIMPLIN,2796.0,73.720,0.630,15.600,13.930,3.700,1,0.872,0.0,3.5
7,3,A1 SH,SHRIMPLIN,2796.5,75.650,0.625,16.500,13.920,3.500,1,0.830,0.0,4.0
8,3,A1 SH,SHRIMPLIN,2797.0,73.790,0.624,16.200,13.980,3.400,1,0.809,0.0,4.5
9,3,A1 SH,SHRIMPLIN,2797.5,76.890,0.615,16.900,14.220,3.500,1,0.787,0.0,5.0


In [7]:
new_vars = pd.DataFrame()

for key in grouped.groups.keys():
    
    NM_M = grouped.get_group(key)['NM_M'].values
    
    temp_df = pd.DataFrame()
    temp_df['Depth'] = grouped.get_group(key)['Depth']
    temp_df['Well Name'] = [key for _ in range(len(NM_M))]
        
    db_1 = np.zeros(len(NM_M))
    db_2 = np.zeros(len(NM_M))
    count1 = 0
    count2 = 0
    
    for i in range(len(NM_M)-1,-1,-1):
               
        if NM_M[i] == 1:
            
            count1=0
            count2+=0.5
            db_2[i] += count2
            
        elif NM_M[i] == 2:
            
            count2=0
            count1+=0.5
            db_1[i] += count1
    
    temp_df['db_1'] = db_1
    temp_df['db_2'] = db_2
        
    new_vars = new_vars.append(temp_df)

In [8]:
new_vars = new_vars.sort_index()
new_vars = new_vars.drop(['Well Name','Depth'],axis=1)
training_data =  pd.concat([training_data,new_vars],axis=1)
training_data

Unnamed: 0,Facies,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,dh_1,dh_2,db_1,db_2
0,3,A1 SH,SHRIMPLIN,2793.0,77.450,0.664,9.900,11.915,4.600,1,1.000,0.0,0.5,0.0,21.5
1,3,A1 SH,SHRIMPLIN,2793.5,78.260,0.661,14.200,12.565,4.100,1,0.979,0.0,1.0,0.0,21.0
2,3,A1 SH,SHRIMPLIN,2794.0,79.050,0.658,14.800,13.050,3.600,1,0.957,0.0,1.5,0.0,20.5
3,3,A1 SH,SHRIMPLIN,2794.5,86.100,0.655,13.900,13.115,3.500,1,0.936,0.0,2.0,0.0,20.0
4,3,A1 SH,SHRIMPLIN,2795.0,74.580,0.647,13.500,13.300,3.400,1,0.915,0.0,2.5,0.0,19.5
5,3,A1 SH,SHRIMPLIN,2795.5,73.970,0.636,14.000,13.385,3.600,1,0.894,0.0,3.0,0.0,19.0
6,3,A1 SH,SHRIMPLIN,2796.0,73.720,0.630,15.600,13.930,3.700,1,0.872,0.0,3.5,0.0,18.5
7,3,A1 SH,SHRIMPLIN,2796.5,75.650,0.625,16.500,13.920,3.500,1,0.830,0.0,4.0,0.0,18.0
8,3,A1 SH,SHRIMPLIN,2797.0,73.790,0.624,16.200,13.980,3.400,1,0.809,0.0,4.5,0.0,17.5
9,3,A1 SH,SHRIMPLIN,2797.5,76.890,0.615,16.900,14.220,3.500,1,0.787,0.0,5.0,0.0,17.0


In [9]:
logs = ['PHIND','ILD_log10','DeltaPHI']
levels = [4]

# training_full = make_dwt_vars(training_data,['GR'],levels)
training_full = make_entropy_vars(training_data,logs,20)
training_full

Unnamed: 0,Facies,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,dh_1,dh_2,db_1,db_2,PHIND_entropy_foot20,ILD_log10_entropy_foot20,DeltaPHI_entropy_foot20
0,3,A1 SH,SHRIMPLIN,2793.0,77.450,0.664,9.900,11.915,4.600,1,1.000,0.0,0.5,0.0,21.5,1.000000,1.000000,1.000000
1,3,A1 SH,SHRIMPLIN,2793.5,78.260,0.661,14.200,12.565,4.100,1,0.979,0.0,1.0,0.0,21.0,1.584963,1.584963,1.584963
2,3,A1 SH,SHRIMPLIN,2794.0,79.050,0.658,14.800,13.050,3.600,1,0.957,0.0,1.5,0.0,20.5,1.584963,1.584963,1.584963
3,3,A1 SH,SHRIMPLIN,2794.5,86.100,0.655,13.900,13.115,3.500,1,0.936,0.0,2.0,0.0,20.0,1.584963,1.584963,1.584963
4,3,A1 SH,SHRIMPLIN,2795.0,74.580,0.647,13.500,13.300,3.400,1,0.915,0.0,2.5,0.0,19.5,1.584963,1.584963,1.584963
5,3,A1 SH,SHRIMPLIN,2795.5,73.970,0.636,14.000,13.385,3.600,1,0.894,0.0,3.0,0.0,19.0,1.584963,1.584963,1.584963
6,3,A1 SH,SHRIMPLIN,2796.0,73.720,0.630,15.600,13.930,3.700,1,0.872,0.0,3.5,0.0,18.5,0.918296,1.584963,1.584963
7,3,A1 SH,SHRIMPLIN,2796.5,75.650,0.625,16.500,13.920,3.500,1,0.830,0.0,4.0,0.0,18.0,0.000000,1.584963,1.584963
8,3,A1 SH,SHRIMPLIN,2797.0,73.790,0.624,16.200,13.980,3.400,1,0.809,0.0,4.5,0.0,17.5,0.918296,1.584963,1.584963
9,3,A1 SH,SHRIMPLIN,2797.5,76.890,0.615,16.900,14.220,3.500,1,0.787,0.0,5.0,0.0,17.0,1.584963,1.584963,1.584963


In [10]:
training_full.replace(to_replace=np.nan,value=-99999,inplace=True)

In [11]:
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
from sklearn import ensemble

In [19]:
names = list(np.unique(training_full['Well Name']))
grouped = training_full.groupby(['Well Name'])
scores = []

for name in names:

    test = grouped.get_group(name)
    X_test = test.drop(['Formation', 'Well Name', 'Depth','Facies','db_1','dh_1','DeltaPHI','ILD_log10'], axis=1).values
    y_test = test['Facies'].values

    train_names = names.copy()
    train_names.remove(name)
    train = pd.DataFrame()

    for train_name in train_names:
        train = train.append(grouped.get_group(train_name))

    X_train = train.drop(['Formation', 'Well Name', 'Depth','Facies','db_1','dh_1','DeltaPHI','ILD_log10'], axis=1).values
    y_train = train['Facies'].values

    #The prediction begins

    #We start with oversampling minority classes
    smt = SMOTE()
    X_train, y_train = smt.fit_sample(X_train,y_train)

    clf = ensemble.RandomForestClassifier(n_estimators=800,n_jobs=-1)
#     clf = ensemble.GradientBoostingClassifier(n_estimators=300)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    #Scoring
    conf_mat = metrics.confusion_matrix(y_test,y_pred)
    print(conf_mat)
    try:
        score = metrics.f1_score(y_test, y_pred,average='weighted')
    except:
        score = conf_mat[1,1]/np.sum(conf_mat)
    scores.append(score)
    print('********')
    print('Blind well is {0}, F1 score : {1:.1%}\n'.format(name,score))

print("="*30)
print('*********** RESULT ***********')
print("="*30)
print('\nAverage  F1-score is {:.1%}'.format(np.mean(scores)))

[[ 0  0  0  0  0  0  0  0  0]
 [ 4 96 17  0  0  0  0  0  0]
 [ 0 26 65  0  0  0  0  0  0]
 [ 0  0  0 30  3  2  9  0  0]
 [ 0  0  0  0 12  6  2  6  0]
 [ 0  0  0 13 10 20  7 19  0]
 [ 0  0  0  0  1  1 11  3  0]
 [ 0  0  0 11  8  7 15 55  2]
 [ 0  0  0  0  1  0  0  0  4]]
********
Blind well is ALEXANDER D, F1 score : 63.3%

[[ 0  7  1  0  0  0  0  0  0]
 [ 0 34 21  0  0  1  0  0  0]
 [ 0  7 37  0  5  0  0  2  0]
 [ 0  0  0  8  2  3  0  0  0]
 [ 0  0  1  3  3 16  0  7  0]
 [ 0  0  0 11  5 57  1 13  0]
 [ 0  0  1  8  5  5  2  9  4]
 [ 0  0  3  2  5 26  2 32  5]
 [ 0  0  0  0  0  2  0  5 43]]
********
Blind well is CHURCHMAN BIBLE, F1 score : 51.2%

[[74 61 23  0  0  0  0  0]
 [ 7 59 72  2  1  0  0  1]
 [ 0  5 36  0  0  2  0  4]
 [ 0  0  4  7  0 12  0  2]
 [ 0  4  3  0  3 16  0  2]
 [ 0  0  0  4  2 19  0  6]
 [ 0  0  0  0  0  0  0  2]
 [ 0  0  4  6  0 27  3 28]]
********
Blind well is CROSS H CATTLE, F1 score : 46.9%

[[ 5  0  4  0  0  0  0  0  0]
 [ 1 75  9  0  0  0  0  0  0]
 [ 1 35 38  

### It looks like it is better to drop information about position of Marine sediments relative to Non-marine units. However, i is better to keep info about position of Non-marine sediments realtive to marine unit. This conclusion is to verify because the average score is better but it looks as the variance from well to well is greater

Not sure if DeltaPHI is good. The average F1 is lower with it
