In [1]:
%matplotlib inline
# to install watermark magic command: pip install ipyext
%load_ext watermark 
%watermark -v -p numpy,scipy,pandas,matplotlib,seaborn,sklearn

CPython 3.5.2
IPython 5.1.0

numpy 1.11.1
scipy 0.18.0
pandas 0.18.1
matplotlib 1.5.1
seaborn 0.7.1
sklearn 0.18


## Exploring the dataset
First, we will examine the data set we will use to train the classifier.

In [19]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

from pandas import set_option
set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None

filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
training_data.describe()



Unnamed: 0,Facies,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
count,4149.0,4149.0,4149.0,4149.0,4149.0,4149.0,3232.0,4149.0,4149.0
mean,4.503254,2906.867438,64.933985,0.659566,4.402484,13.201066,3.725014,1.518438,0.521852
std,2.474324,133.300164,30.30253,0.252703,5.274947,7.132846,0.896152,0.49972,0.286644
min,1.0,2573.5,10.149,-0.025949,-21.832,0.55,0.2,1.0,0.0
25%,2.0,2821.5,44.73,0.498,1.6,8.5,,1.0,0.277
50%,4.0,2932.5,64.99,0.639,4.3,12.02,,2.0,0.528
75%,6.0,3007.0,79.438,0.822,7.5,16.05,,2.0,0.769
max,9.0,3138.0,361.15,1.8,19.312,84.4,8.094,2.0,1.0


In [20]:
training_data['Well Name'] = training_data['Well Name'].astype('category')
training_data['Formation'] = training_data['Formation'].astype('category')
# training_data = training_data[training_data['Well Name'] != 'Recruit F9']
training_data['Well Name'].unique()


[SHRIMPLIN, ALEXANDER D, SHANKLE, LUKE G U, KIMZEY A, CROSS H CATTLE, NOLAN, NEWBY, CHURCHMAN BIBLE]
Categories (9, object): [SHRIMPLIN, ALEXANDER D, SHANKLE, LUKE G U, ..., CROSS H CATTLE, NOLAN, NEWBY, CHURCHMAN BIBLE]

In [21]:
# PE_mask = training_data['PE'].notnull().values
# training_data = training_data[PE_mask]
training_data.replace(to_replace=np.nan,value=-99999,inplace=True)

## K fold cross-validation 
Adapted from [@LukasMosser code](https://gist.github.com/LukasMosser/cd645bad2bdbbb419098ac3ea363f2b3) to fit python 3.5.
Doing a cross-validation on each well in order to see how each well perform as a blind text

In [28]:
training_data = pd.read_pickle('../../2016-ml-contest_liamlearn/data/training_data.pkl')
training_data.drop_duplicates(inplace=True)
# training_data = training_data[training_data['Well Name'] != 'Recruit F9']
training_data.describe()


Unnamed: 0,Facies,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,GR_cD_step_level_1,...,PHIND_cD_step_level_2,PHIND_cD_step_level_3,PE_cD_step_level_1,PE_cD_step_level_2,PE_cD_step_level_3,GR_entropy_foot10,ILD_log10_entropy_foot10,DeltaPHI_entropy_foot10,PHIND_entropy_foot10,PE_entropy_foot10
count,4151.0,4151.0,4151.0,4151.0,4151.0,4151.0,4151.0,4151.0,4151.0,4151.0,...,4151.0,4151.0,4151.0,4151.0,4151.0,4151.0,4151.0,4151.0,4151.0,4151.0
mean,4.502048,2906.778126,64.938671,0.659477,4.404699,13.203233,-22087.939466,1.518429,0.522082,-0.149963,...,-0.022784,-0.022784,-22018.571435,-22018.571435,-22018.571435,0.803226,0.779793,0.870575,0.83267,0.545273
std,2.474337,133.330124,30.29598,0.252675,5.274641,7.13181,41492.164833,0.49972,0.286767,4.924412,...,1.268329,1.268329,41441.902455,41441.902455,41441.902455,0.728947,0.740483,0.720045,0.738475,0.6857
min,1.0,2573.5,10.149,-0.025949,-21.832,0.55,-99999.0,1.0,0.0,-96.538551,...,-14.212082,-14.212082,-99999.0,-99999.0,-99999.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,2821.5,44.74,0.497034,1.6,8.5,2.423,1.0,0.277,-1.202735,...,-0.272629,-0.272629,-0.231496,-0.231496,-0.231496,0.0,0.0,0.0,0.0,0.0
50%,4.0,2932.5,65.0,0.639,4.3,12.03,3.3,2.0,0.528,0.013825,...,-0.001953,-0.001953,-0.027656,-0.027656,-0.027656,0.918296,0.918296,0.918296,0.918296,0.0
75%,6.0,3007.0,79.438,0.822,7.5,16.057,4.0,2.0,0.769,1.16237,...,0.280288,0.280288,0.025482,0.025482,0.025482,1.584963,1.584963,1.584963,1.584963,1.584963
max,9.0,3138.0,361.15,1.8,19.312,84.4,8.094,2.0,1.0,41.034509,...,22.766611,22.766611,1.523253,1.523253,1.523253,1.584963,1.584963,1.584963,1.584963,1.584963


In [30]:

from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
from sklearn import ensemble
#Create a set of unique well names

names = list(set(training_data["Well Name"]))

#Create a dicitionary of the well datasets, continued from original contest notebook 
#But perform dropping for each well individually
#Maybe not necessary.

well_datas = {}
for name in names:
    well = training_data[training_data["Well Name"]==name] 
    well_labels = well['Facies'].values.astype(np.int64)
    well = well.drop(['Formation', 'Well Name', 'Depth','Facies','FaciesLabels'], axis=1).values
    well_datas[name] = [well, well_labels]
    
    
X_data = {}
y_data = {}
for name, (data, labels) in well_datas.items():
    y_data[name] = np.array(labels, dtype=np.int64)
    X_data[name] = np.array(data, dtype=np.float32)

training_sets = []
test_sets = []

for i in range(len(names)):
    X_train = []
    y_train = []

    X_test = []
    y_test = []

    for name, data in X_data.items():
        if name is not names[i]:
            for row in data:
                X_train.append(row)
        else:
            for row in data:
                X_test.append(row)

    for name, labels in y_data.items():
        if name is not names[i]:
            for val in labels:
                y_train.append(val)
        else:
            for val in labels:
                y_test.append(val)

    X_train = np.array(X_train, dtype=np.float32)
    y_train = np.array(y_train, dtype=np.int64).reshape(len(y_train), 1)
    y_train = y_train.ravel()
    
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)


    X_test = np.array(X_test, dtype=np.float32)
    X_test = scaler.transform(X_test)

    y_test = np.array(y_test, dtype=np.int32)
    training_sets.append([X_train, y_train, X_test, y_test])
    
#Use as follows:
scores = []
for i, (X_train, y_train, X_test, y_test) in enumerate(training_sets):
#     clf = svm.LinearSVC(class_weight='balanced', tol=1e-03, random_state=42, C=10)
    clf = ensemble.RandomForestClassifier(n_estimators=300, class_weight='balanced')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    #Scoring
    score = metrics.f1_score(y_test, y_pred, average='weighted')
    scores.append(score)
    print('********')
    print('Blind well is {0}, F1 score : {1:.4%}\n'.format(names[i],score))
#     print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    pass
print("="*30)
print('*********** RESULT ***********')
print("="*30)
print('\nAverage  F1-score is {:.4%}'.format(np.mean(scores)))

  'recall', 'true', average, warn_for)


********
Blind well is LUKE G U, F1 score : 64.5898%



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


********
Blind well is NEWBY, F1 score : 48.1006%



  'precision', 'predicted', average, warn_for)


********
Blind well is KIMZEY A, F1 score : 48.4954%



  'recall', 'true', average, warn_for)


********
Blind well is NOLAN, F1 score : 51.4632%

********
Blind well is CROSS H CATTLE, F1 score : 33.7938%



  'precision', 'predicted', average, warn_for)


********
Blind well is CHURCHMAN BIBLE, F1 score : 40.5155%



  'precision', 'predicted', average, warn_for)


********
Blind well is ALEXANDER D, F1 score : 58.0558%



  'recall', 'true', average, warn_for)


********
Blind well is SHRIMPLIN, F1 score : 54.9580%



  'recall', 'true', average, warn_for)


********
Blind well is Recruit F9, F1 score : 60.8696%

********
Blind well is SHANKLE, F1 score : 46.8888%

*********** RESULT ***********

Average  F1-score is 50.7730%


  'recall', 'true', average, warn_for)


K fold cross validation shows that the F1-score for each well is highly variable. For example, the model fit well for SHANKLE but less SHRIMPLIN. This is way, as @LukasMosser and me suggest is to use the average F1-score as a metricsto evaluate the performance of the submission.