In [1]:
%matplotlib inline
%load_ext watermark
%watermark -v -m -p numpy,scipy,matplotlib,seaborn,sklearn -g
import warnings
# Suppress every warning for the rest of the session. This also hides library
# deprecation notices, so drop this line when debugging or upgrading packages.
warnings.filterwarnings('ignore')

CPython 3.5.2
IPython 5.1.0

numpy 1.11.1
scipy 0.18.0
matplotlib 1.5.1
seaborn 0.7.1
sklearn 0.18

compiler   : GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)
system     : Darwin
release    : 16.1.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit
Git hash   : db0162ae88094a468d0c24924737999d98a07dba


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from IPython.display import Image
import seaborn as sns

# Raise the pandas display limits so frames with up to 150 columns / 150 rows
# are rendered in full instead of being truncated.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 150)

## Load Data

In [None]:
# Location of the input CSV, relative to the notebook's working directory.
filename = 'data/facies_vectors.csv'

# Parse the CSV into a DataFrame.
# NOTE(review): later cells reference `X` and `Y`, which are not defined in any
# visible cell -- they presumably have to be derived from `training_data`
# right after this load; confirm and add that step.
training_data = pd.read_csv(filepath_or_buffer=filename)



## Pipeline and GridSearchCV

In [6]:
from time import time

import sklearn.grid_search  # deprecated since scikit-learn 0.18; kept only for backward compatibility
import sklearn.pipeline
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Base estimators tuned by the grid-search cells below.
# The tree-based models are stochastic (random feature sub-sampling, random
# splits, bootstrap samples), so they are seeded to make every grid-search
# run reproducible. The value matches the seed used for the train/test split
# in the final evaluation cell.
seed = 7

clf1 = KNeighborsClassifier()
clf2 = GaussianNB()
clf3 = SVC()
clf4 = DecisionTreeClassifier(random_state=seed)
clf5 = RandomForestClassifier(random_state=seed)
clf6 = GradientBoostingClassifier(random_state=seed)

# Shared scaler instance, used as the first step of the SVM pipeline.
s_scaler = preprocessing.StandardScaler()

## KNN

In [None]:
# Exhaustive hyper-parameter search for the k-nearest-neighbours classifier,
# scored with F1 over 10 cross-validation folds.
# NOTE(review): X_train / y_train are only assigned in the final evaluation
# cell of this notebook; on a fresh kernel that split must run before this cell.
steps = [('kNN', clf1)]

pipeline = sklearn.pipeline.Pipeline(steps)

### gridsearchCV
start = time()
# `p` is only read by the Minkowski metric (p=1 is Manhattan, p=2 is
# Euclidean). Searching it under 'minkowski' replaces the previous cross
# product with aliased metric names ('cityblock'/'l1'/'manhattan',
# 'euclidean'/'l2'), which ignored `p` and just repeated identical fits.
# Estimator-level n_jobs is left at its default: the search below already
# parallelises over candidates, and nesting both oversubscribes the CPU.
parameters = dict(kNN__n_neighbors=[2, 5, 10, 15, 20],
                  kNN__weights=['uniform', 'distance'],
                  kNN__algorithm=['ball_tree', 'kd_tree', 'brute'],
                  kNN__leaf_size=[10, 30, 50],
                  kNN__metric=['minkowski'],
                  kNN__p=[1, 2, 5])


# GridSearchCV comes from sklearn.model_selection; the sklearn.grid_search
# module used previously is deprecated.
cv = GridSearchCV(pipeline, param_grid=parameters, scoring='f1', cv=10, n_jobs=-1)
cv = cv.fit(X_train, y_train)
print(cv.best_score_)
print(cv.best_params_)
print(cv.best_estimator_)
print('temps: {:.2f}'.format(time() - start))

## NB (no grid search)

## SVM

In [None]:
# Exhaustive hyper-parameter search for the SVM, scored with F1 over 10
# cross-validation folds. The scaler is part of the pipeline so it is re-fit
# on the training portion of every fold.
# NOTE(review): X_train / y_train are only assigned in the final evaluation
# cell of this notebook; on a fresh kernel that split must run before this cell.
steps = [('standardscaler', s_scaler),
         ('SVC', clf3)]

pipeline = sklearn.pipeline.Pipeline(steps)

### gridsearchCV
start = time()
# Two sub-grids: `gamma` only influences the RBF kernel, so crossing it with
# the linear kernel would fit the same linear model once per gamma value.
parameters = [
    dict(SVC__kernel=['rbf'],
         SVC__C=[0.1, 1, 10, 100, 1000],
         SVC__gamma=[0.0001, 0.001, 0.01, 0.1, 1, 10]),
    dict(SVC__kernel=['linear'],
         SVC__C=[0.1, 1, 10, 100, 1000]),
]


# GridSearchCV comes from sklearn.model_selection; the sklearn.grid_search
# module used previously is deprecated.
cv = GridSearchCV(pipeline, param_grid=parameters, scoring='f1', cv=10, n_jobs=-1)
cv = cv.fit(X_train, y_train)
print(cv.best_score_)
print(cv.best_params_)
print(cv.best_estimator_)
print('temps: {:.2f}'.format(time() - start))

## DTREE

In [7]:
# Exhaustive hyper-parameter search for the decision tree, scored with F1
# over 10 cross-validation folds.
# NOTE(review): X_train / y_train are only assigned in the final evaluation
# cell of this notebook; running this cell first is what produced the
# recorded NameError. On a fresh kernel that split must run before this cell.
steps = [('decision_tree', clf4)]

pipeline = sklearn.pipeline.Pipeline(steps)

### gridsearchCV
start = time()
parameters = dict(decision_tree__criterion=['gini', 'entropy'],
                  decision_tree__splitter=['best', 'random'],
                  decision_tree__max_features=['sqrt', 1.0],
                  decision_tree__max_depth=[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                  decision_tree__min_samples_split=[2, 6, 10, 14, 18, 22, 26, 30, 34, 38],
                  decision_tree__min_samples_leaf=[1, 2, 3, 4, 5, 6, 7, 8, 9])


# GridSearchCV comes from sklearn.model_selection; the sklearn.grid_search
# module used previously is deprecated.
cv = GridSearchCV(pipeline, param_grid=parameters, scoring='f1', cv=10, n_jobs=-1)
cv = cv.fit(X_train, y_train)
print(cv.best_score_)
print(cv.best_params_)
print(cv.best_estimator_)
print('temps: {:.2f}'.format(time() - start))

NameError: name 'X_train' is not defined

## Random Forest

In [None]:
# Exhaustive hyper-parameter search for the random forest, scored with F1
# over 10 cross-validation folds.
# NOTE(review): X_train / y_train are only assigned in the final evaluation
# cell of this notebook; on a fresh kernel that split must run before this cell.
steps = [('random_forest', clf5)]

pipeline = sklearn.pipeline.Pipeline(steps)

### gridsearchCV
start = time()
# Estimator-level n_jobs is left at its default: the search below already
# parallelises over candidates, and nesting both oversubscribes the CPU.
parameters = dict(random_forest__n_estimators=[10, 50, 100, 500],
                  random_forest__criterion=['gini', 'entropy'],
                  random_forest__max_features=['sqrt', 1.0],
                  random_forest__max_depth=[5, 10, 15],
                  random_forest__min_samples_split=[15, 30, 45],
                  random_forest__min_samples_leaf=[1, 5, 10])


# GridSearchCV comes from sklearn.model_selection; the sklearn.grid_search
# module used previously is deprecated.
cv = GridSearchCV(pipeline, param_grid=parameters, scoring='f1', cv=10, n_jobs=-1)
cv = cv.fit(X_train, y_train)
print(cv.best_score_)
print(cv.best_params_)
print(cv.best_estimator_)
print('temps: {:.2f}'.format(time() - start))

## Gradient Boosting

In [None]:
# Exhaustive hyper-parameter search for gradient boosting, scored with F1
# over 10 cross-validation folds. The grid has 2592 candidates, so this is
# by far the most expensive search in the notebook.
# NOTE(review): X_train / y_train are only assigned in the final evaluation
# cell of this notebook; on a fresh kernel that split must run before this cell.
steps = [('gradient_boosting', clf6)]

pipeline = sklearn.pipeline.Pipeline(steps)

### gridsearchCV
start = time()
parameters = dict(gradient_boosting__n_estimators=[10, 50, 100, 200],
                  gradient_boosting__learning_rate=[0.01, 0.1, 1, 2],
                  gradient_boosting__loss=['deviance', 'exponential'],
                  gradient_boosting__max_features=['sqrt', 1.0],
                  gradient_boosting__max_depth=[5, 10, 15],
                  gradient_boosting__min_samples_split=[15, 30, 45],
                  gradient_boosting__min_samples_leaf=[1, 5, 10],
                  gradient_boosting__subsample=[0.2, 0.5, 1.0])


# GridSearchCV comes from sklearn.model_selection; the sklearn.grid_search
# module used previously is deprecated.
cv = GridSearchCV(pipeline, param_grid=parameters, scoring='f1', cv=10, n_jobs=-1)
cv = cv.fit(X_train, y_train)
print(cv.best_score_)
print(cv.best_params_)
print(cv.best_estimator_)
print('temps: {:.2f}'.format(time() - start))

In [None]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import sklearn.pipeline
from sklearn import preprocessing
from time import time
import sklearn.grid_search
from sklearn import metrics





# Final model comparison: hold out a test set, balance the training classes by
# under-sampling, cross-validate each tuned model on recall, and plot the
# per-fold scores.
#
# NOTE(review): `X` and `Y` are not defined in any visible cell of this
# notebook -- they must exist in the kernel before this cell runs.
# NOTE(review): `num_folds` is not used by the evaluation below, which runs
# 20-fold cross-validation (`n_cv_folds`) -- confirm which value is intended.
num_folds = 10
num_instances = len(X)
seed = 7
n_cv_folds = 20

# --- Hold-out split ----------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=seed)

# --- Balance the training set ------------------------------------------------
# Re-assemble features + label into one frame so both classes can be filtered
# on the label column. `.iloc` replaces the deprecated `.ix` indexer; the slice
# is positional (columns from position 30 onwards, label last).
columns_name = list(training_data.iloc[:, 30:].columns)
train_values = np.zeros((X_train.shape[0], X_train.shape[1] + 1))
train_values[:, :-1] = X_train
train_values[:, -1] = y_train
train = pd.DataFrame(train_values, columns=columns_name)

train_positive = train.loc[train['Fertility'] == 1]
train_negative = train.loc[train['Fertility'] == 0]
n = len(train_positive.index)
# Draw as many negatives as there are positives. The draw is seeded so the
# balanced set (and every score below) is reproducible, and capped so that it
# cannot fail when there are fewer negatives than positives.
train_negative = train_negative.sample(n=min(n, len(train_negative.index)), random_state=seed)
train = pd.concat([train_positive, train_negative])

X_train = train.iloc[:, :-1]
y_train = train['Fertility']

# --- Candidate models --------------------------------------------------------
# Hyper-parameters come from the grid searches (optimised for F1 score).
# Stochastic models are seeded for reproducibility.
models = []
models.append(('KNN', KNeighborsClassifier(algorithm='ball_tree', leaf_size=10, metric='euclidean',
                                           metric_params=None, n_jobs=-1, n_neighbors=5, p=1,
                                           weights='distance')))
models.append(('NB', GaussianNB()))
# The SVM is wrapped in a pipeline so the scaler is fit on the training part of
# each fold only; scaling the whole training set up front would leak
# information from the validation folds into the model.
models.append(('SVM', sklearn.pipeline.Pipeline([
    ('standardscaler', preprocessing.StandardScaler()),
    ('SVC', SVC(gamma=0.1, kernel='rbf', C=0.1)),
])))
models.append(('DTREE', DecisionTreeClassifier(criterion='entropy', max_depth=5,
                                               max_features='sqrt', min_samples_leaf=3,
                                               min_samples_split=2, splitter='best',
                                               random_state=seed)))
models.append(('RF', RandomForestClassifier(criterion='gini',
                                            max_depth=10, max_features=1.0, min_samples_leaf=5,
                                            min_samples_split=30,
                                            n_estimators=50, n_jobs=-1,
                                            random_state=seed)))
models.append(('GBM', GradientBoostingClassifier(criterion='friedman_mse',
                                                 learning_rate=0.1, loss='deviance', max_depth=10,
                                                 max_features='sqrt', min_samples_leaf=10,
                                                 min_samples_split=2,
                                                 n_estimators=200,
                                                 random_state=seed)))

# --- Cross-validated evaluation ----------------------------------------------
results = []
names = []
scoring = 'recall'
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=n_cv_folds,
                                 scoring=scoring, n_jobs=-1)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# --- Box plot of the per-fold scores, one box per model ----------------------
sns.set(style="whitegrid")
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results, showmeans=True)
ax.set_xticklabels(names)
ax.set_ylabel(scoring)
# plt.savefig('fig/algorithm-comparison_f1_recall.pdf', dpi=300)
plt.show()