In [2]:
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import learning_curve
from sklearn import tree

from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

plt.style.use('seaborn')


path = "training_data.csv"

data = pd.read_csv(path)
data = data.sample(frac=1)

categorical = ["Well Name", "Formation"]
for cats in categorical:
    try:
        dummed = pd.get_dummies(data[cats], prefix = cats)
        data = data.drop(cats, axis=1)
        data = data.join(dummed)
    except KeyError:
        pass

target = 'Facies'
features = [feature for feature in list(data.head(0)) if feature != target]

X = data[features]

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)


y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)



train_sizes = list(range(295,2065,295))

def tv_scores_mean(estimator, X, y, train_sizes, scoring='accuracy', shuffle=True, cv=5):
    train_sizes, train_scores, validation_scores = learning_curve(estimator = estimator, X=X, y=y, 
                                                                  train_sizes=train_sizes, scoring=scoring, 
                                                                  shuffle=shuffle, cv=cv)
    train_scores_mean = train_scores.mean(axis = 1)
    validation_scores_mean = validation_scores.mean(axis = 1)
    return train_sizes, train_scores_mean, validation_scores_mean

def plot_learning_curve(train_sizes, train_scores_mean, validation_scores_mean):
    plt.plot(train_sizes, train_scores_mean, label = 'Training accuracy')
    plt.plot(train_sizes, validation_scores_mean, label = 'Validation accuracy')
    plt.ylabel('Accuracy', fontsize = 14)
    plt.xlabel('Training set size', fontsize = 14)
    plt.title('Learning curves', fontsize = 18, y = 1.03)
    plt.legend()
    plt.ylim(0,1)
    
def gridsearch_helper(grid, estimator, X, y):
    gridsearch = model_selection.GridSearchCV(estimator = estimator, param_grid=grid)
    gridsearch.fit(X,y)
    return (gridsearch.best_estimator_, gridsearch.best_score_, gridsearch.best_params_)

data.head()

Unnamed: 0,Facies,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,Well Name_CHURCHMAN BIBLE,...,Formation_B2 LM,Formation_B2 SH,Formation_B3 LM,Formation_B3 SH,Formation_B4 LM,Formation_B4 SH,Formation_B5 LM,Formation_B5 SH,Formation_C LM,Formation_C SH
31,2,2808.5,70.98,0.401,11.5,18.395,3.1,1,0.277,0,...,0,0,0,0,0,0,0,0,0,0
1058,2,2679.5,60.3,0.624,4.3,11.05,3.6,1,0.538,0,...,0,0,0,0,0,0,0,0,0,0
2975,8,2991.0,32.469,0.799,0.865,8.461,4.391,2,0.656,1,...,0,0,0,0,0,0,0,0,0,0
3166,3,3090.0,62.344,0.535,2.334,38.486,2.443,1,0.765,1,...,0,0,0,0,0,0,0,0,0,1
1593,1,2682.0,61.163,0.819083,6.2,7.0,3.511,1,0.537,0,...,0,1,0,0,0,0,0,0,0,0


In [3]:
tree_grid = {'max_depth' : range(1,20), 'splitter':["best","random"], 'max_features':["auto","log2",None]}
print(gridsearch_helper(tree_grid, tree.DecisionTreeClassifier(random_state=1), X_train, y_train))

NameError: name 'model_selection' is not defined

In [None]:
knn_grid = {'n_neighbors' : range(1,10), 'p':[1,2], 'weights':["uniform","distance"]}
print(gridsearch_helper(knn_grid, sklearn.neighbors.KNeighborsClassifier(), X_train, y_train))

In [None]:
rfc_grid = {'n_estimators' : range(10,30), 'max_features':["auto","log2",None], 'warm_start':[True, False]}
print(gridsearch_helper(rfc_grid, RandomForestClassifier(random_state=1), X_train, y_train))

In [None]:
from sklearn.model_selection import learning_curve
from sklearn import tree

dtc_est = tree.DecisionTreeClassifier(max_depth=10)

plot_learning_curve(*tv_scores_mean(dtc_est,X_train,y_train,train_sizes))


In [None]:
import sklearn

knn_est = sklearn.neighbors.KNeighborsClassifier(n_neighbors=1)

plot_learning_curve(*tv_scores_mean(knn_est,X_train,y_train,train_sizes))

In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc_est = RandomForestClassifier(n_estimators = 25, warm_start = True, random_state=1)

plot_learning_curve(*tv_scores_mean(rfc_est,X_train,y_train,train_sizes))

In [None]:
from sklearn.neural_network import MLPClassifier


mlp_est = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(30,30), random_state=1)


plot_learning_curve(*tv_scores_mean(mlp_est,X_train,y_train,train_sizes))

In [None]:
from sklearn import svm

svm_est = svm.SVC(kernel='linear', C=1, gamma=1)

plot_learning_curve(*tv_scores_mean(svm_est,X_train,y_train,train_sizes))

In [None]:
from sklearn import svm, model_selection


param_grid = {'max_leaf_nodes' : range(10,500,5)}

gridsearch = model_selection.GridSearchCV(estimator = tree.DecisionTreeClassifier(), param_grid = param_grid)

gridsearch.fit(X_train,y_train)

def plot_complexity_curve(hyperparameter_values, time_complexity, train_scores_mean, validation_scores_mean):
    """
    https://matplotlib.org/examples/api/two_scales.html
    """
    fig, ax1 = plt.subplots()
    ax1.plot(hyperparameter_values, train_scores_mean, label = 'Training F1')
    ax1.plot(hyperparameter_values, validation_scores_mean, label = 'Validation F1')
    ax2 = ax1.twinx()
    ax2.plot(hyperparameter_values, time_complexity, label = 'Time Complexity')

plot_complexity_curve(param_grid['max_leaf_nodes'], gridsearch.cv_results_['mean_fit_time'], gridsearch.cv_results_['mean_train_score'], gridsearch.cv_results_['mean_test_score'])