In [47]:
%matplotlib inline
%load_ext watermark
%watermark -v -m -p numpy,scipy,pandas,matplotlib,seaborn,sklearn,pywt 

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
CPython 3.5.2
IPython 5.1.0

numpy 1.11.1
scipy 0.18.0
pandas 0.18.1
matplotlib 1.5.1
seaborn 0.7.1
sklearn 0.19.dev0
pywt 0.4.0

compiler   : GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)
system     : Darwin
release    : 16.1.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit


In [108]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
# import seaborn as sns

from pandas import set_option
set_option("display.max_rows", 30)
pd.options.mode.chained_assignment = None

### Load variables

In [109]:
raw_logs = pd.read_pickle('data/training_data_raw.pkl')
raw_logs.replace(to_replace=np.nan, value=-99999.0, inplace=True)

dwt = pd.read_pickle('data/vars_from_dwt.pkl')
dwt.replace(to_replace=np.nan, value=-99999.0, inplace=True)

entropy = pd.read_pickle('data/vars_from_log_entropy.pkl')
entropy.replace(to_replace=np.nan, value=-99999.0, inplace=True)


In [110]:
training_data = pd.merge(raw_logs,dwt,on=['Depth','Well Name'])
training_data = pd.merge(training_data,entropy, on=['Depth','Well Name'])

# training_data = raw_logs
training_data.head()
# training_data.isnull().sum()

Unnamed: 0,Facies,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,...,PE_cD_step_level_2,PE_cD_step_level_3,PE_cD_step_level_4,PE_cD_step_level_5,PE_cD_step_level_6,GR_entropy_foot5,ILD_log10_entropy_foot5,DeltaPHI_entropy_foot5,PHIND_entropy_foot5,PE_entropy_foot5
0,3,A1 SH,SHRIMPLIN,2793.0,77.45,0.664,9.9,11.915,4.6,1,...,0.109651,0.109651,0.109651,0.109651,0.109651,1.0,1.0,1.0,1.0,1.0
1,3,A1 SH,SHRIMPLIN,2793.5,78.26,0.661,14.2,12.565,4.1,1,...,-0.228628,-0.228628,-0.228628,-0.228628,-0.228628,0.918296,1.584963,1.584963,1.584963,1.584963
2,3,A1 SH,SHRIMPLIN,2794.0,79.05,0.658,14.8,13.05,3.6,1,...,-0.228628,-0.228628,-0.228628,-0.228628,-0.228628,0.918296,1.584963,1.584963,1.584963,0.918296
3,3,A1 SH,SHRIMPLIN,2794.5,86.1,0.655,13.9,13.115,3.5,1,...,-0.037444,-0.037444,-0.037444,-0.037444,-0.037444,1.584963,1.584963,1.584963,1.584963,0.0
4,3,A1 SH,SHRIMPLIN,2795.0,74.58,0.647,13.5,13.3,3.4,1,...,-0.037444,-0.037444,-0.037444,-0.037444,-0.037444,0.918296,1.584963,1.584963,1.584963,0.0


In [111]:
PE_mask = training_data['PE'].notnull().values
training_data = training_data[PE_mask]

In [112]:
# 1=sandstone  2=c_siltstone   3=f_siltstone 
# 4=marine_silt_shale 5=mudstone 6=wackestone 7=dolomite
# 8=packstone 9=bafflestone
facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00',
       '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']

facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D','PS', 'BS']
#facies_color_map is a dictionary that maps facies labels
#to their respective colors
facies_color_map = {}
for ind, label in enumerate(facies_labels):
    facies_color_map[label] = facies_colors[ind]

def label_facies(row, labels):
    return labels[ row['Facies'] -1]
    
training_data.loc[:,'FaciesLabels'] = training_data.apply(lambda row: label_facies(row, facies_labels), axis=1)
training_data.replace(to_replace=np.nan, value=-99999.0, inplace=True)
training_data['FaciesLabels'].value_counts()

CSiS    946
FSiS    786
PS      692
WS      582
MS      296
SiSh    271
SS      268
BS      185
D       141
Name: FaciesLabels, dtype: int64

## Test set
Remove a single well to use as a blind test later.

In [113]:
blind = training_data[training_data['Well Name'] == 'SHANKLE']
training_data = training_data[training_data['Well Name'] != 'SHANKLE']
training_data['Well Name'].unique()


array(['SHRIMPLIN', 'ALEXANDER D', 'LUKE G U', 'KIMZEY A',
       'CROSS H CATTLE', 'NOLAN', 'Recruit F9', 'NEWBY', 'CHURCHMAN BIBLE'], dtype=object)

## Split into training and test set

Now we extract just the feature variables we need to perform the classification.  The predictor variables are the five wireline values and two geologic constraining variables. We also get a vector of the facies labels that correspond to each feature vector.

In [114]:
X_train = training_data.drop(['Formation', 'Well Name','Facies','FaciesLabels'], axis=1).values
y_train = training_data['Facies'].values
X_test = blind.drop(['Formation', 'Well Name','Facies','FaciesLabels'], axis=1).values
y_test = blind['Facies'].values

from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
X_test = X_scaler.transform(X_test)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
# feature_vectors = training_data.drop(['Formation', 'Well Name', 'Depth','Facies','FaciesLabels'], axis=1)
# feature_vectors.describe()

(3718, 43)
(449, 43)
(3718,)
(449,)


In [115]:
# from sklearn.model_selection import cross_val_score

# scores = cross_val_score(estimator=clf,
#                          X=X_train,
#                          y=y_train,
#                          cv=10,
#                          n_jobs=1,
#                          scoring='f1_micro')
# print('CV f1 scores: %s' % scores)
# print('CV f1: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))




from sklearn.model_selection import StratifiedKFold
    
kfold = StratifiedKFold(n_splits=10,
                            random_state=1).split(X_train, y_train)

# clf = ensemble.RandomForestClassifier(n_estimators=1000)
# clf = svm.LinearSVC(class_weight='balanced', tol=1e-03) 
clf = ensemble.GradientBoostingClassifier()


scores = []
for k, (train, test) in enumerate(kfold):
    clf.fit(X_train[train], y_train[train])
    score = clf.score(X_train[test], y_train[test])
    scores.append(score)
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train]), score))
    
print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Fold: 1, Class dist.: [  0 161 771 602 237 249 459 111 586 166], Acc: 0.556
Fold: 2, Class dist.: [  0 161 771 602 237 249 460 111 586 166], Acc: 0.589
Fold: 3, Class dist.: [  0 161 771 602 237 249 460 111 587 166], Acc: 0.580
Fold: 4, Class dist.: [  0 161 771 602 237 249 460 111 587 166], Acc: 0.521
Fold: 5, Class dist.: [  0 161 771 602 238 249 460 112 587 166], Acc: 0.433
Fold: 6, Class dist.: [  0 161 771 602 238 249 460 112 587 167], Acc: 0.345
Fold: 7, Class dist.: [  0 161 771 602 238 249 460 112 587 167], Acc: 0.383
Fold: 8, Class dist.: [  0 161 772 602 238 250 460 112 587 167], Acc: 0.585
Fold: 9, Class dist.: [  0 161 772 602 238 250 460 112 587 167], Acc: 0.612
Fold: 10, Class dist.: [  0 162 772 603 238 250 460 112 587 167], Acc: 0.466

CV accuracy: 0.507 +/- 0.090


In [116]:
print('****Results****\n')
predicted_labels = clf.predict(X_test)
    
accuracy = metrics.accuracy_score(y_test, predicted_labels)
print("Accuracy: {:.4%}".format(accuracy))
precision = metrics.precision_score(y_test, predicted_labels, average='micro')
print("Precision: {:.4%}".format(precision))
    
recall = metrics.recall_score(y_test, predicted_labels, average='micro')
print("Recall: {:.4%}".format(recall))
    
f1 = metrics.f1_score(y_test, predicted_labels, average='micro')
print("F1 score: {:.4%}".format(f1))
    
print(metrics.classification_report(y_test, predicted_labels,target_names=facies_labels))

****Results****

Accuracy: 47.6615%
Precision: 47.6615%
Recall: 47.6615%
F1 score: 47.6615%
             precision    recall  f1-score   support

         SS       0.00      0.00      0.00        89
       CSiS       0.38      0.78      0.51        89
       FSiS       0.77      0.73      0.75       117
       SiSh       0.12      0.14      0.13         7
         MS       0.29      0.63      0.39        19
         WS       0.53      0.42      0.47        71
          D       0.50      0.06      0.11        17
         PS       0.37      0.40      0.39        40
         BS       0.00      0.00      0.00         0

avg / total       0.43      0.48      0.43       449



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [89]:
blind['Well Name'].value_counts()

SHANKLE    449
Name: Well Name, dtype: int64

## Comparing classifier

In [55]:
from sklearn import cross_validation
from sklearn import tree
from sklearn import svm
from sklearn import ensemble
from sklearn import neighbors
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing

from sklearn.metrics import confusion_matrix
from classification_utilities import display_cm, display_adj_cm
from sklearn.metrics import classification_report

def accuracy(conf):
    total_correct = 0.
    nb_classes = conf.shape[0]
    for i in np.arange(0,nb_classes):
        total_correct += conf[i][i]
    acc = total_correct/sum(sum(conf))
    return acc

adjacent_facies = np.array([[1], [0,2], [1], [4], [3,5], [4,6,7], [5,7], [5,6,8], [6,7]])

def accuracy_adjacent(conf, adjacent_facies):
    nb_classes = conf.shape[0]
    total_correct = 0.
    for i in np.arange(0,nb_classes):
        total_correct += conf[i][i]
        for j in adjacent_facies[i]:
            total_correct += conf[i][j]
    return total_correct / sum(sum(conf))

In [56]:
classifiers = [
    
    ensemble.RandomForestClassifier(n_estimators=1000),
    ensemble.GradientBoostingClassifier(min_samples_split=1),
    ensemble.BaggingClassifier()]

In [57]:
# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Precision", "Recall", "F1-score"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****\n')
    predicted_labels = clf.predict(X_test)
    
    accuracy = metrics.accuracy_score(y_test, predicted_labels)
    print("Accuracy: {:.4%}".format(accuracy))
#     conf = confusion_matrix(y_test, predicted_labels)

#     print('Optimized facies classification accuracy = %.2f' % accuracy(conf))
#     print('Optimized adjacent facies classification accuracy = %.2f\n' % accuracy_adjacent(conf, adjacent_facies))
    
#     display_adj_cm(conf, facies_labels, adjacent_facies,display_metrics=True, hide_zeros=True)
    
    
    

    precision = metrics.precision_score(y_test, predicted_labels, average='micro')
    print("Precision: {:.4%}".format(precision))
    
    recall = metrics.recall_score(y_test, predicted_labels, average='micro')
    print("Recall: {:.4%}".format(recall))
    
    f1 = metrics.f1_score(y_test, predicted_labels, average='micro')
    print("F1 score: {:.4%}".format(f1))
    
    print(metrics.classification_report(y_test, predicted_labels))
       
    log_entry = pd.DataFrame([[name, accuracy*100, precision*100, recall*100, f1*100]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

LinearSVC
****Results****

Accuracy: 48.3801%
Precision: 48.3801%
Recall: 48.3801%
F1 score: 48.3801%
             precision    recall  f1-score   support

          1       0.00      0.00      0.00         0
          2       0.64      0.66      0.65        98
          3       0.65      0.53      0.58        80
          4       0.64      0.40      0.49        58
          5       0.00      0.00      0.00        28
          6       0.52      0.30      0.38        96
          7       0.41      0.88      0.56        16
          8       0.38      0.54      0.44        56
          9       0.30      0.68      0.41        31

avg / total       0.51      0.48      0.48       463



  'recall', 'true', average, warn_for)


RandomForestClassifier
****Results****

Accuracy: 54.2117%
Precision: 54.2117%
Recall: 54.2117%
F1 score: 54.2117%
             precision    recall  f1-score   support

          1       0.00      0.00      0.00         0
          2       0.64      0.71      0.68        98
          3       0.58      0.50      0.54        80
          4       0.90      0.66      0.76        58
          5       0.14      0.07      0.10        28
          6       0.55      0.44      0.49        96
          7       0.55      0.38      0.44        16
          8       0.36      0.86      0.51        56
          9       1.00      0.16      0.28        31

avg / total       0.60      0.54      0.53       463



  'recall', 'true', average, warn_for)


GradientBoostingClassifier
****Results****

Accuracy: 51.8359%
Precision: 51.8359%
Recall: 51.8359%
F1 score: 51.8359%
             precision    recall  f1-score   support

          1       0.00      0.00      0.00         0
          2       0.68      0.72      0.70        98
          3       0.62      0.57      0.60        80
          4       0.81      0.66      0.72        58
          5       0.00      0.00      0.00        28
          6       0.49      0.38      0.42        96
          7       0.37      0.44      0.40        16
          8       0.35      0.75      0.47        56
          9       0.00      0.00      0.00        31

avg / total       0.51      0.52      0.50       463

BaggingClassifier
****Results****

Accuracy: 46.8683%
Precision: 46.8683%
Recall: 46.8683%
F1 score: 46.8683%
             precision    recall  f1-score   support

          1       0.00      0.00      0.00         0
          2       0.60      0.60      0.60        98
          3       0.59   

  'recall', 'true', average, warn_for)
