### Ensembles Development

- Objective: test ensembles as an alternative to stacking

In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy.io import arff
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

In [3]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.utils.fixes import signature
# Render pandas floats with three decimals. Only one assignment is needed: a
# '{:.2f}' format that was set immediately before this one was overwritten at
# once and never took effect.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
# NOTE(review): this replaces the three-decimal float format configured in the
# previous cell with a two-decimal one — confirm which precision is intended.
pd.options.display.float_format = '{:.2f}'.format

  from numpy.core.umath_tests import inner1d


In [5]:
# Load the frame that was augmented in iteration 2 and discard 'duration'
# straight away: that variable cannot be used in our baseline.
df = (
    pd.read_pickle('../data/pickle_files/df_pickle_w_all_stats')
    .drop(columns=['duration'])
)

In [6]:
# Remove the 'Date' column by rebinding rather than mutating in place, and
# tolerate the column already being absent, so that re-running this cell does
# not raise KeyError.
df = df.drop(columns=['Date'], errors='ignore')

In [7]:
def scale_noncat_only(df):
    """Standardize the non-categorical features of ``df`` and split off the target.

    Columns of dtype ``int8`` are passed through unchanged; every other feature
    column is standardized with ``StandardScaler``. The target column ``'y'``
    is extracted before scaling and is never transformed.

    Note that the scaler is fitted on every row of ``df``; split the data
    before calling this function if fitting on held-out rows must be avoided.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns plus a ``'y'`` column.

    Returns
    -------
    X : numpy.ndarray
        Scaled non-``int8`` features followed by the ``int8`` features, with
        one row per row of ``df`` in the original row order.
    y : numpy.ndarray
        The values of ``df['y']``.

    Raises
    ------
    KeyError
        If ``df`` has no ``'y'`` column.
    """
    y = np.array(df['y'])
    features = df.drop(columns=['y'])

    noncat = features.select_dtypes(exclude=['int8'])
    cat = features.select_dtypes(include=['int8'])

    if noncat.shape[1] > 0:
        # Keep the original index on the scaled frame: concat(axis=1) aligns on
        # index labels, so a fresh RangeIndex would misalign rows (and inject
        # NaNs) whenever df is not indexed 0..n-1.
        scaled = pd.DataFrame(
            preprocessing.StandardScaler().fit_transform(noncat),
            columns=noncat.columns,
            index=noncat.index,
        )
    else:
        # Nothing to scale; StandardScaler would reject a zero-column frame.
        scaled = noncat

    X = pd.concat([scaled, cat], axis=1)

    # Downstream code works with numpy arrays.
    return np.array(X), y

# Build the feature matrix and target vector from the loaded frame.
# NOTE(review): scale_noncat_only fits its scaler on all rows of df, and this
# call happens before the train/test split, so test rows influence the scaling.
X, y = scale_noncat_only(df)

In [8]:
# Sanity check that the dtypes loaded correctly: every column should be
# float64, int64 or int8, so the Index displayed below should be empty.
df.select_dtypes(exclude = ['float64', 'int64', 'int8']).columns

Index([], dtype='object')

In [9]:
# Dimensions of the feature matrix returned by scale_noncat_only.
X.shape

(41188, 72)

In [10]:
# Hold out 33% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=4,
)

In [11]:
# Baseline model: a random forest of 100 depth-2 trees with a fixed seed.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
)

In [16]:
# 10-fold cross-validated accuracy of clf on the training split.
scores = cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
)

In [18]:
# Report which estimator was cross-validated, then the mean and the standard
# deviation of its per-fold scores.
print("CV results for %s" % type(clf).__name__)
print(scores.mean(), scores.std())

CV results for RandomForestClassifier
0.8979888191728167 0.0018249702348291836


In [20]:
# Fit clf on the whole training split so it can be scored on the held-out test split.
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [22]:
# Accuracy of the fitted clf on the held-out test split.
accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))

0.8959023026557787

In [23]:
# Hyper-parameter grid for tuning the forest: 3 * 2 * 3 * 3 = 54 combinations.
param_grid = dict(
    n_estimators=[10, 20, 30],
    max_depth=[5, 10],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[2, 10, 20],
)

In [24]:
# Grid search over param_grid with 10-fold cross-validation, using clf as the
# base estimator; fit() returns the fitted search object itself.
gs = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=10,
).fit(X_train, y_train)

In [26]:
# Show the estimator the grid search selected as best.
gs.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [27]:
# Held-out accuracy of the estimator selected by the grid search.
accuracy_score(y_true=y_test, y_pred=gs.best_estimator_.predict(X_test))

0.8979621864194807