In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the `watermark` extension and print the interpreter (-v), machine (-m)
# and listed package (-p) versions this notebook was executed with.
%load_ext watermark
%watermark -v -m -p numpy,scipy,pandas,matplotlib,seaborn,sklearn,pywt 

CPython 3.5.2
IPython 5.1.0

numpy 1.11.1
scipy 0.18.0
pandas 0.18.1
matplotlib 1.5.1
seaborn 0.7.1
sklearn 0.19.dev0
pywt 0.4.0

compiler   : GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)
system     : Darwin
release    : 16.0.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit


In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
# import seaborn as sns

from pandas import set_option
# Cap DataFrame reprs at 10 rows so displayed frames stay short.
pd.set_option("display.max_rows", 10)
# Switch off pandas' chained-assignment (SettingWithCopy) checks for this session.
pd.set_option("mode.chained_assignment", None)

### Load variables

In [3]:
# Load the three pickled tables and swap every NaN for the -99999.0 sentinel.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
# NOTE(review): the sentinel stays in the data, so it feeds into any later
# statistics or scaling computed on these columns - confirm that is intended.
raw_logs = pd.read_pickle('data/training_data_raw.pkl').replace(to_replace=np.nan, value=-99999.0)

dwt = pd.read_pickle('data/vars_from_dwt.pkl').replace(to_replace=np.nan, value=-99999.0)

entropy = pd.read_pickle('data/vars_from_log_entropy.pkl').replace(to_replace=np.nan, value=-99999.0)


In [4]:
# Join the raw logs with the DWT and entropy tables on (Depth, Well Name).
# merge() defaults to an inner join, so only samples present in all three
# tables are kept.
training_data = (
    raw_logs
    .merge(dwt, on=['Depth', 'Well Name'])
    .merge(entropy, on=['Depth', 'Well Name'])
)

training_data

Unnamed: 0,Facies,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,...,PE_cD_step_level_2,PE_cD_step_level_3,PE_cD_step_level_4,PE_cD_step_level_5,PE_cD_step_level_6,GR_entropy_foot5,ILD_log10_entropy_foot5,DeltaPHI_entropy_foot5,PHIND_entropy_foot5,PE_entropy_foot5
0,3,A1 SH,SHRIMPLIN,2793.0,77.450,0.664,9.900,11.915,4.600,1,...,0.109651,0.109651,0.109651,0.109651,0.109651,1.000000,1.000000,1.000000,1.000000,1.000000
1,3,A1 SH,SHRIMPLIN,2793.5,78.260,0.661,14.200,12.565,4.100,1,...,-0.228628,-0.228628,-0.228628,-0.228628,-0.228628,0.918296,1.584963,1.584963,1.584963,1.584963
2,3,A1 SH,SHRIMPLIN,2794.0,79.050,0.658,14.800,13.050,3.600,1,...,-0.228628,-0.228628,-0.228628,-0.228628,-0.228628,0.918296,1.584963,1.584963,1.584963,0.918296
3,3,A1 SH,SHRIMPLIN,2794.5,86.100,0.655,13.900,13.115,3.500,1,...,-0.037444,-0.037444,-0.037444,-0.037444,-0.037444,1.584963,1.584963,1.584963,1.584963,0.000000
4,3,A1 SH,SHRIMPLIN,2795.0,74.580,0.647,13.500,13.300,3.400,1,...,-0.037444,-0.037444,-0.037444,-0.037444,-0.037444,0.918296,1.584963,1.584963,1.584963,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4162,5,C LM,CHURCHMAN BIBLE,3120.5,46.719,0.947,1.828,7.254,3.617,2,...,-0.030984,-0.030984,-0.030984,-0.030984,-0.030984,0.000000,1.584963,0.918296,0.000000,0.000000
4163,5,C LM,CHURCHMAN BIBLE,3121.0,44.563,0.953,2.241,8.013,3.344,2,...,0.059633,0.059633,0.059633,0.059633,0.059633,0.000000,1.584963,1.584963,0.000000,0.000000
4164,5,C LM,CHURCHMAN BIBLE,3121.5,49.719,0.964,2.925,8.013,3.190,2,...,0.059633,0.059633,0.059633,0.059633,0.059633,0.000000,1.584963,1.584963,0.000000,0.000000
4165,5,C LM,CHURCHMAN BIBLE,3122.0,51.469,0.965,3.083,7.708,3.152,2,...,-0.040299,-0.040299,-0.040299,-0.040299,-0.040299,0.000000,1.584963,1.584963,0.000000,0.000000


In [5]:
# Integer facies codes and the rock type each one stands for:
# 1=sandstone  2=c_siltstone   3=f_siltstone
# 4=marine_silt_shale 5=mudstone 6=wackestone 7=dolomite
# 8=packstone 9=bafflestone
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00',
       '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']

# Short labels, listed in facies-code order (code 1 -> index 0).
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D','PS', 'BS']

# zip() silently truncates to the shorter list, so check that every label
# has exactly one colour before pairing them up.
assert len(facies_colors) == len(facies_labels), \
    "facies_colors and facies_labels must have the same length"

# facies_color_map is a dictionary that maps facies labels
# to their respective colors
facies_color_map = dict(zip(facies_labels, facies_colors))

def label_facies(row, labels):
    """Return the text label for the facies code stored in ``row``.

    ``row`` must support ``row['Facies']`` and hold a 1-based integer code;
    ``labels`` is the sequence of labels ordered by code, so code ``k``
    selects ``labels[k - 1]``.
    """
    facies_code = row['Facies']
    return labels[facies_code - 1]
    
# Translate the 1-based integer facies codes into their short text labels.
# Mapping the single 'Facies' column gives the same labels as a row-wise
# DataFrame.apply(axis=1) without building a Series for every row.
training_data['FaciesLabels'] = training_data['Facies'].map(
    lambda code: facies_labels[code - 1])

# Replace any NaN left in the frame with the -99999.0 sentinel.
training_data = training_data.replace(to_replace=np.nan, value=-99999.0)
training_data.describe()

Unnamed: 0,Facies,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,GR_cD_step_level_1,...,PE_cD_step_level_2,PE_cD_step_level_3,PE_cD_step_level_4,PE_cD_step_level_5,PE_cD_step_level_6,GR_entropy_foot5,ILD_log10_entropy_foot5,DeltaPHI_entropy_foot5,PHIND_entropy_foot5,PE_entropy_foot5
count,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,...,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0,4167.0
mean,4.50252,2906.351092,65.248225,0.659125,4.412696,13.203811,-22003.113383,1.519078,0.523053,-0.198926,...,-21934.026972,-21934.026972,-21934.026972,-21934.026972,-21934.026972,0.803668,0.778121,0.869436,0.831015,0.546265
std,2.475018,133.443259,30.951925,0.252315,5.267093,7.119856,41434.94857,0.499696,0.287208,5.096861,...,41384.66058,41384.66058,41384.66058,41384.66058,41384.66058,0.72758,0.740063,0.719423,0.738078,0.684827
min,1.0,2573.5,10.149,-0.025949,-21.832,0.55,-99999.0,1.0,0.0,-96.538551,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,2820.75,44.912,0.497034,1.602,8.51225,2.4425,1.0,0.278,-1.206098,...,-0.228628,-0.228628,-0.228628,-0.228628,-0.228628,0.0,0.0,0.0,0.0,0.0
50%,4.0,2932.5,65.125,0.639,4.3,12.036,3.3,2.0,0.528,0.015539,...,-0.027043,-0.027043,-0.027043,-0.027043,-0.027043,0.918296,0.918296,0.918296,0.918296,0.0
75%,6.0,3006.5,79.535,0.821,7.5,16.057,4.0,2.0,0.769,1.140359,...,0.024787,0.024787,0.024787,0.024787,0.024787,1.584963,1.584963,1.584963,1.584963,1.584963
max,9.0,3138.0,361.15,1.8,19.312,84.4,8.094,2.0,1.0,41.034509,...,1.523253,1.523253,1.523253,1.523253,1.523253,1.584963,1.584963,1.584963,1.584963,1.584963


## Test set
Remove a single well to use as a blind test later.

In [6]:
# Hold out every sample of one well as the blind test set and keep the rest
# for training.
is_blind_well = training_data['Well Name'] == 'NEWBY'

# This cell overwrites training_data, so running it a second time would find no
# 'NEWBY' rows and silently leave `blind` empty. Fail loudly instead, before
# anything is reassigned.
assert is_blind_well.any(), (
    "No rows for well 'NEWBY' in training_data - "
    "restart the kernel and run the cells in order")

blind = training_data[is_blind_well]
training_data = training_data[~is_blind_well]
blind

Unnamed: 0,Facies,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,...,PE_cD_step_level_3,PE_cD_step_level_4,PE_cD_step_level_5,PE_cD_step_level_6,GR_entropy_foot5,ILD_log10_entropy_foot5,DeltaPHI_entropy_foot5,PHIND_entropy_foot5,PE_entropy_foot5,FaciesLabels
3300,3,A1 SH,NEWBY,2826.0,76.34,0.719,7.8,11.00,3.7,1,...,0.104104,0.104104,0.104104,0.104104,1.000000,1.000000,1.000000,1.000000,0.000000,FSiS
3301,3,A1 SH,NEWBY,2826.5,83.74,0.688,9.7,12.55,3.4,1,...,-0.014871,-0.014871,-0.014871,-0.014871,0.918296,1.584963,1.584963,1.584963,0.000000,FSiS
3302,3,A1 SH,NEWBY,2827.0,83.19,0.664,10.1,11.95,3.4,1,...,-0.014871,-0.014871,-0.014871,-0.014871,0.918296,1.584963,0.918296,1.584963,0.000000,FSiS
3303,3,A1 SH,NEWBY,2827.5,80.44,0.648,10.1,11.15,3.4,1,...,-0.036854,-0.036854,-0.036854,-0.036854,1.584963,0.918296,0.918296,1.584963,0.000000,FSiS
3304,3,A1 SH,NEWBY,2828.0,75.42,0.648,9.3,11.45,3.3,1,...,-0.036854,-0.036854,-0.036854,-0.036854,1.584963,0.918296,1.584963,1.584963,0.000000,FSiS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3758,6,C LM,NEWBY,3055.0,66.94,0.838,4.0,8.00,4.2,2,...,-0.114737,-0.114737,-0.114737,-0.114737,1.584963,1.584963,1.584963,0.918296,1.584963,WS
3759,6,C LM,NEWBY,3055.5,54.06,0.823,1.9,5.45,4.3,2,...,-0.114737,-0.114737,-0.114737,-0.114737,0.918296,1.584963,0.918296,0.000000,1.584963,WS
3760,6,C LM,NEWBY,3056.0,47.87,0.797,0.7,4.85,4.4,2,...,0.029744,0.029744,0.029744,0.029744,0.000000,1.584963,0.000000,0.000000,1.584963,WS
3761,6,C LM,NEWBY,3056.5,49.34,0.763,2.3,4.85,4.1,2,...,0.029744,0.029744,0.029744,0.029744,0.000000,1.584963,0.000000,0.000000,1.584963,WS


## Split into training and test set

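Now we extract just the feature variables we need to perform the classification. The predictor variables are every column that remains once `Formation`, `Well Name`, `Facies` and `FaciesLabels` are dropped: depth, the five wireline values, the two geologic constraining variables, and the wavelet (DWT) and entropy features merged in above. We also get a vector of the facies labels that correspond to each feature vector.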

In [17]:
from sklearn.preprocessing import StandardScaler

# Columns that identify or label a sample; every other column (Depth included)
# is used as a feature.
non_feature_cols = ['Formation', 'Well Name', 'Facies', 'FaciesLabels']

# Targets: the integer facies code of each sample.
y_train = training_data['Facies'].values
y_test = blind['Facies'].values

# Features as plain NumPy arrays.
X_train = training_data.drop(non_feature_cols, axis=1).values
X_test = blind.drop(non_feature_cols, axis=1).values

# Fit the scaler on the training wells only, then apply the same transform
# to both the training and the blind-well features.
X_scaler = StandardScaler().fit(X_train)
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

# One shape per line: X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(3704, 43)
(463, 43)
(3704,)
(463,)


## Comparing classifiers

In [18]:
from sklearn import cross_validation
from sklearn import tree
from sklearn import svm
from sklearn import ensemble
from sklearn import neighbors
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing

from sklearn.metrics import confusion_matrix
from classification_utilities import display_cm, display_adj_cm
from sklearn.metrics import classification_report

def accuracy(conf):
    """Return the overall classification accuracy of a confusion matrix.

    Parameters
    ----------
    conf : array-like, shape (n_classes, n_classes)
        Confusion matrix of sample counts.

    Returns
    -------
    float
        Sum of the diagonal divided by the sum of all entries, or NaN when
        the matrix contains no samples.
    """
    conf = np.asarray(conf)
    total = conf.sum()
    if total == 0:
        # No samples at all: accuracy is undefined.
        return np.nan
    # float() keeps this a true division regardless of the matrix dtype.
    return float(np.trace(conf)) / total

# adjacent_facies[i] lists the 0-based class indices whose predictions are
# counted as "adjacent" (acceptable) for true class i.
# dtype=object is required because the rows have different lengths: NumPy >= 1.24
# raises ValueError when asked to build such a ragged array implicitly.
adjacent_facies = np.array([[1], [0,2], [1], [4], [3,5], [4,6,7], [5,7], [5,6,8], [6,7]],
                           dtype=object)

def accuracy_adjacent(conf, adjacent_facies):
    """Return the accuracy when predictions of adjacent classes count as correct.

    Parameters
    ----------
    conf : array-like, shape (n_classes, n_classes)
        Confusion matrix of sample counts.
    adjacent_facies : sequence of sequences of int
        ``adjacent_facies[i]`` holds the column indices that are also
        accepted as correct for row ``i``.

    Returns
    -------
    float
        (diagonal + accepted off-diagonal counts) / total count, or NaN
        when the matrix contains no samples.
    """
    conf = np.asarray(conf)
    total = conf.sum()
    if total == 0:
        # No samples at all: accuracy is undefined.
        return np.nan

    # Exact hits first, then add the accepted near-misses row by row.
    correct = float(np.trace(conf))
    for i in range(conf.shape[0]):
        for j in adjacent_facies[i]:
            correct += conf[i, j]
    return correct / total

In [19]:
# Fixed seed so the stochastic ensembles give the same result on every
# Restart & Run All. NOTE(review): scores will differ slightly from outputs
# recorded with unseeded estimators.
RANDOM_STATE = 42

# Candidate models to compare on the blind well.
# min_samples_split=2 is the smallest integer value scikit-learn accepts
# (integer values below 2 are rejected by current releases).
classifiers = [
    ensemble.RandomForestClassifier(n_estimators=1000, random_state=RANDOM_STATE),
    ensemble.GradientBoostingClassifier(min_samples_split=2, random_state=RANDOM_STATE),
    ensemble.BaggingClassifier(random_state=RANDOM_STATE)]

In [20]:
# Logging for Visual Comparison
# NOTE(review): `log` is created here but no rows are added to it in this cell.
log_cols=["Classifier", "Accuracy", "Precision", "Recall", "F1-score"]
log = pd.DataFrame(columns=log_cols)

# Pin the confusion-matrix rows/columns to every facies code (1..n) in label
# order. Without `labels=`, a class that is absent from both y_test and the
# predictions is dropped, which shrinks the matrix and misaligns it with
# `facies_labels` and the index-based `adjacent_facies`.
facies_codes = np.arange(1, len(facies_labels) + 1)

# Fit each candidate on the training wells and score it on the blind well.
for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****\n')
    predicted_labels = clf.predict(X_test)

    conf = confusion_matrix(y_test, predicted_labels, labels=facies_codes)

    print('Optimized facies classification accuracy = %.2f' % accuracy(conf))
    print('Optimized adjacent facies classification accuracy = %.2f\n' % accuracy_adjacent(conf, adjacent_facies))

    display_adj_cm(conf, facies_labels, adjacent_facies,display_metrics=True, hide_zeros=True)

print("="*30)

RandomForestClassifier
****Results****

Optimized facies classification accuracy = 0.47
Optimized adjacent facies classification accuracy = 0.91

     Pred    SS  CSiS  FSiS  SiSh    MS    WS     D    PS    BS Total
     True
       SS                                                           0
     CSiS          98                                              98
     FSiS                80                                        80
     SiSh                      39          17     1     1          58
       MS                 4          10           4    10          28
       WS                       2          94                      96
        D                                        16                16
       PS                                              56          56
       BS                                   4                27    31

Precision  0.00  1.00  0.95  0.95  1.00  0.82  0.76  0.84  1.00  0.92
   Recall  0.00  1.00  1.00  0.67  0.36  0.98  1.00  1.00  0.87  0.91
   