# Pair Problem

In [46]:
# Import Relevant Libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df  = pd.read_pickle('data/classification.pkl')

In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,target
0,0.094176,1.311481,0.669518,0.46213,0.924826,0.320199,0.500238,-1.141416,0.946244,0.134583,...,-0.210488,-0.688082,-0.198602,-0.389638,0.847061,1.147521,-1.248303,0.819974,0.684105,1
1,0.039309,-0.282793,0.09144,0.348772,-1.222226,-1.270123,0.5652,0.654688,-1.243673,1.190361,...,-0.291332,0.636953,0.464488,0.815605,0.277057,0.274514,0.879307,-0.182923,0.470393,1
2,-0.566671,-0.865299,-1.869118,-0.708375,-1.363103,-1.322028,0.34696,-2.391982,0.693698,0.442282,...,0.160562,0.724428,1.359291,0.38884,-1.174396,0.28722,-0.1525,-1.97932,0.563817,0
3,0.229075,-2.189922,0.103816,-0.662714,0.34671,-0.630887,0.958386,1.194592,-0.032281,-0.756764,...,-1.299216,0.640543,0.123078,2.162666,-1.839366,2.511557,0.051661,-1.840078,0.232787,0
4,0.660343,-1.007195,1.097985,0.978973,1.39676,0.20198,-2.161304,0.168231,-0.161767,0.47994,...,-0.331865,0.330867,0.749767,-0.728429,1.780035,0.444968,0.482534,0.031063,0.324782,1


**Scale dataframe**

In [28]:
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df.iloc[:,:20]))

In [30]:
df_scaled['target'] = df['target']

In [32]:
df_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,target
0,0.023042,1.340729,0.133051,0.389986,0.952241,0.662449,0.488307,-1.070648,0.928763,0.087676,...,-0.222719,-0.721245,-0.22573,-0.694536,0.690836,1.066901,-1.271201,0.844603,0.694817,1
1,-0.031458,-0.262836,-0.313895,0.218794,-1.138267,-0.607613,0.55513,0.741511,-1.21209,1.14825,...,-0.302904,0.627403,0.442942,0.00249,0.215756,0.230091,0.808892,-0.183525,0.485669,1
2,-0.633379,-0.848736,-1.829716,-1.377693,-1.275434,-0.649065,0.330638,-2.332392,0.681875,0.396772,...,0.145304,0.716438,1.34528,-0.24432,-0.993984,0.24227,-0.199871,-2.025115,0.577098,0
3,0.157036,-2.181078,-0.304326,-1.308737,0.38935,-0.097106,0.95958,1.28624,-0.027838,-0.80772,...,-1.302567,0.631058,0.098658,0.781533,-1.548215,2.374382,-0.00027,-1.88237,0.253138,0
4,0.585416,-0.991459,0.464324,1.170515,1.411746,0.568037,-2.249484,0.250705,-0.154424,0.434601,...,-0.343106,0.315863,0.730623,-0.890469,1.46844,0.393478,0.42098,0.035845,0.343168,1


**Train Test Split**

In [36]:
X = df_scaled.iloc[:,:20]
y = df_scaled.iloc[:,20]

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

**Random Forest**

In [33]:
rf = RandomForestClassifier()

In [39]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [40]:
y_pred = rf.predict(X_test)

In [42]:
accuracy_score(y_test, y_pred)

0.91666666666666663

In [45]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.94      0.90      0.92       144
          1       0.89      0.93      0.91       120

avg / total       0.92      0.92      0.92       264



**Optimize Model**

In [69]:
rf_opt = RandomForestClassifier()

In [73]:
n_trees =      [100, 200, 400]
max_features = [10, 15, 20]
max_depth =    [50, 100, 200, 400, 600, 800]

parameters = {'n_estimators': n_trees,
              'max_features': max_features,
              'max_depth': max_depth}

In [74]:
clf = GridSearchCV(rf_opt, parameters, verbose=10, cv=5)

In [75]:
clf.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] n_estimators=100, max_depth=50, max_features=10 .................
[CV]  n_estimators=100, max_depth=50, max_features=10, score=0.962963 -   0.0s
[CV] n_estimators=100, max_depth=50, max_features=10 .................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  n_estimators=100, max_depth=50, max_features=10, score=0.953704 -   0.0s
[CV] n_estimators=100, max_depth=50, max_features=10 .................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV]  n_estimators=100, max_depth=50, max_features=10, score=0.925926 -   0.0s
[CV] n_estimators=100, max_depth=50, max_features=10 .................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.9s remaining:    0.0s


[CV]  n_estimators=100, max_depth=50, max_features=10, score=0.943396 -   0.0s
[CV] n_estimators=100, max_depth=50, max_features=10 .................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.2s remaining:    0.0s


[CV]  n_estimators=100, max_depth=50, max_features=10, score=0.943396 -   0.0s
[CV] n_estimators=200, max_depth=50, max_features=10 .................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.5s remaining:    0.0s


[CV]  n_estimators=200, max_depth=50, max_features=10, score=0.981481 -   0.0s
[CV] n_estimators=200, max_depth=50, max_features=10 .................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.2s remaining:    0.0s


[CV]  n_estimators=200, max_depth=50, max_features=10, score=0.935185 -   0.0s
[CV] n_estimators=200, max_depth=50, max_features=10 .................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    2.8s remaining:    0.0s


[CV]  n_estimators=200, max_depth=50, max_features=10, score=0.925926 -   0.0s
[CV] n_estimators=200, max_depth=50, max_features=10 .................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    3.4s remaining:    0.0s


[CV]  n_estimators=200, max_depth=50, max_features=10, score=0.924528 -   0.0s
[CV] n_estimators=200, max_depth=50, max_features=10 .................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    4.1s remaining:    0.0s


[CV]  n_estimators=200, max_depth=50, max_features=10, score=0.933962 -   0.0s
[CV] n_estimators=400, max_depth=50, max_features=10 .................
[CV]  n_estimators=400, max_depth=50, max_features=10, score=0.981481 -   0.0s
[CV] n_estimators=400, max_depth=50, max_features=10 .................
[CV]  n_estimators=400, max_depth=50, max_features=10, score=0.935185 -   0.0s
[CV] n_estimators=400, max_depth=50, max_features=10 .................
[CV]  n_estimators=400, max_depth=50, max_features=10, score=0.925926 -   0.0s
[CV] n_estimators=400, max_depth=50, max_features=10 .................
[CV]  n_estimators=400, max_depth=50, max_features=10, score=0.924528 -   0.0s
[CV] n_estimators=400, max_depth=50, max_features=10 .................
[CV]  n_estimators=400, max_depth=50, max_features=10, score=0.943396 -   0.0s
[CV] n_estimators=100, max_depth=50, max_features=15 .................
[CV]  n_estimators=100, max_depth=50, max_features=15, score=0.981481 -   0.0s
[CV] n_estimators=100

[Parallel(n_jobs=1)]: Done 270 out of 270 | elapsed:  3.7min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 200, 400], 'max_depth': [50, 100, 200, 400, 600, 800], 'max_features': [10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

In [76]:
y_pred_best = clf.predict(X_test)

In [77]:
print(classification_report(y_test, y_pred_best))

             precision    recall  f1-score   support

          0       0.96      0.90      0.93       144
          1       0.89      0.96      0.92       120

avg / total       0.93      0.93      0.93       264



In [78]:
clf.best_params_

{'max_depth': 200, 'max_features': 10, 'n_estimators': 200}