## Exploring the dataset
First, we will examine the data set we will use to train the classifier.

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

from pandas import set_option
set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None

filename = 'data/facies_vectors.csv'
training_data_raw = pd.read_csv(filename)
training_data_raw.describe()



Unnamed: 0,Facies,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
count,4149.0,4149.0,4149.0,4149.0,4149.0,4149.0,3232.0,4149.0,4149.0
mean,4.503254,2906.867438,64.933985,0.659566,4.402484,13.201066,3.725014,1.518438,0.521852
std,2.474324,133.300164,30.30253,0.252703,5.274947,7.132846,0.896152,0.49972,0.286644
min,1.0,2573.5,10.149,-0.025949,-21.832,0.55,0.2,1.0,0.0
25%,2.0,2821.5,44.73,0.498,1.6,8.5,,1.0,0.277
50%,4.0,2932.5,64.99,0.639,4.3,12.02,,2.0,0.528
75%,6.0,3007.0,79.438,0.822,7.5,16.05,,2.0,0.769
max,9.0,3138.0,361.15,1.8,19.312,84.4,8.094,2.0,1.0


## K fold cross-validation 
Adapted from [@LukasMosser code](https://gist.github.com/LukasMosser/cd645bad2bdbbb419098ac3ea363f2b3) to fit python 3.5.
Doing a cross-validation on each well in order to see how each well perform as a blind text

In [2]:
new_vars = pd.read_pickle('data/vars_from_dwt.pkl')
new_vars2 = pd.read_pickle('data/vars_from_log_entropy.pkl')
new_vars = new_vars.drop(['Well Name','Depth'],axis=1)
new_vars2 = new_vars2.drop(['Well Name','Depth'],axis=1)
training_data = pd.concat([training_data_raw,new_vars,new_vars2],axis=1)
training_data

Unnamed: 0,Facies,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,GR_cD_step_level_4,ILD_log10_entropy_foot20,DeltaPHI_entropy_foot20,PHIND_entropy_foot20
0,3,A1 SH,SHRIMPLIN,2793.0,77.450,0.664,9.900,11.915,4.600,1,1.000,0.062591,1.000000,1.000000,1.000000
1,3,A1 SH,SHRIMPLIN,2793.5,78.260,0.661,14.200,12.565,4.100,1,0.979,-1.930541,1.584963,1.584963,1.584963
2,3,A1 SH,SHRIMPLIN,2794.0,79.050,0.658,14.800,13.050,3.600,1,0.957,-1.930541,1.584963,1.584963,1.584963
3,3,A1 SH,SHRIMPLIN,2794.5,86.100,0.655,13.900,13.115,3.500,1,0.936,-5.282836,1.584963,1.584963,1.584963
4,3,A1 SH,SHRIMPLIN,2795.0,74.580,0.647,13.500,13.300,3.400,1,0.915,-5.282836,1.584963,1.584963,1.584963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4144,5,C LM,CHURCHMAN BIBLE,3120.5,46.719,0.947,1.828,7.254,3.617,2,0.685,1.743657,1.584963,0.918296,0.000000
4145,5,C LM,CHURCHMAN BIBLE,3121.0,44.563,0.953,2.241,8.013,3.344,2,0.677,-0.891801,1.584963,1.584963,0.000000
4146,5,C LM,CHURCHMAN BIBLE,3121.5,49.719,0.964,2.925,8.013,3.190,2,0.669,-0.891801,1.584963,1.584963,0.000000
4147,5,C LM,CHURCHMAN BIBLE,3122.0,51.469,0.965,3.083,7.708,3.152,2,0.661,0.956644,1.584963,1.584963,0.000000


In [3]:
# PE_mask = training_data['PE'].notnull().values
# training_data = training_data[PE_mask]
training_data.replace(to_replace=np.nan,value=-99999,inplace=True)

In [5]:
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
from sklearn import ensemble
#Create a set of unique well names

names = list(set(training_data["Well Name"]))

#Create a dicitionary of the well datasets, continued from original contest notebook 
#But perform dropping for each well individually
#Maybe not necessary.

well_datas = {}
for name in names:
    well = training_data[training_data["Well Name"]==name] 
    well_labels = well['Facies'].values.astype(np.int64)
    well = well.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1).values
    well_datas[name] = [well, well_labels]
    
    
X_data = {}
y_data = {}
for name, (data, labels) in well_datas.items():
    y_data[name] = np.array(labels, dtype=np.int64)
    X_data[name] = np.array(data, dtype=np.float32)

training_sets = []
test_sets = []

for i in range(len(names)):
    X_train = []
    y_train = []

    X_test = []
    y_test = []

    for name, data in X_data.items():
        if name is not names[i]:
            for row in data:
                X_train.append(row)
        else:
            for row in data:
                X_test.append(row)

    for name, labels in y_data.items():
        if name is not names[i]:
            for val in labels:
                y_train.append(val)
        else:
            for val in labels:
                y_test.append(val)

    X_train = np.array(X_train, dtype=np.float32)
    y_train = np.array(y_train, dtype=np.int64).reshape(len(y_train), 1)
    y_train = y_train.ravel()
    
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)


    X_test = np.array(X_test, dtype=np.float32)
    X_test = scaler.transform(X_test)

    y_test = np.array(y_test, dtype=np.int32)
    training_sets.append([X_train, y_train, X_test, y_test])
    
#Use as follows:
scores = []
for i, (X_train, y_train, X_test, y_test) in enumerate(training_sets):
#     clf = svm.LinearSVC(class_weight='balanced', tol=1e-03, random_state=42, C=10)
    smt = SMOTE()
    X_train, y_train = smt.fit_sample(X_train,y_train)
    clf = ensemble.RandomForestClassifier(n_estimators=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    #Scoring
    score = metrics.f1_score(y_test, y_pred, average='weighted')
    scores.append(score)
    print('********')
    print('Blind well is {0}, F1 score : {1:.4%}\n'.format(names[i],score))
#     print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    pass
print("="*30)
print('*********** RESULT ***********')
print("="*30)
print('\nAverage  F1-score is {:.4%}'.format(np.mean(scores)))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


********
Blind well is SHRIMPLIN, F1 score : 55.1869%

********
Blind well is ALEXANDER D, F1 score : 59.2707%



  'recall', 'true', average, warn_for)


********
Blind well is NOLAN, F1 score : 51.9693%

********
Blind well is SHANKLE, F1 score : 48.8206%



  'recall', 'true', average, warn_for)


********
Blind well is LUKE G U, F1 score : 64.0076%



  'precision', 'predicted', average, warn_for)


********
Blind well is KIMZEY A, F1 score : 47.7982%

********
Blind well is CROSS H CATTLE, F1 score : 40.1015%



  'recall', 'true', average, warn_for)


********
Blind well is Recruit F9, F1 score : 94.0397%



  'precision', 'predicted', average, warn_for)


********
Blind well is CHURCHMAN BIBLE, F1 score : 55.0195%

********
Blind well is NEWBY, F1 score : 47.7420%

*********** RESULT ***********

Average  F1-score is 56.3956%


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


K fold cross validation shows that the F1-score for each well is highly variable. For example, the model fit well for SHANKLE but less SHRIMPLIN. This is way, as @LukasMosser and me suggest is to use the average F1-score as a metricsto evaluate the performance of the submission.