# Preliminaries

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.scorer import make_scorer
from sklearn.externals import joblib
import itertools
from sklearn.model_selection import train_test_split
from datetime import datetime

In [3]:
X = np.load('/tmp/X.npy')
y = np.load('/tmp/y.npy')
X_test = np.load('/tmp/X_test.npy')

In [4]:
X.shape, y.shape

((595212, 218), (595212,))

In [5]:
X_small = np.load('/tmp/X_small.npy')
y_small = np.load('/tmp/y_small.npy')

In [6]:
X_small.shape, y_small.shape

((17856, 114), (17856,))

In [5]:
def gini(pred, y):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def normalized_gini(pred, y):
    return gini(pred, y) / gini(y, y)

#my_scorer = make_scorer(normalized_gini, greater_is_better=True)

In [6]:
porto_test_id = pd.read_csv('data/test.csv',usecols=['id'])
def make_submission(name, pred):
    s = porto_test_id['id'].to_frame()
    s['target'] = pred
    s.to_csv('Submissions/'+name+'.csv',index=False)

In [7]:
def binarize(pred,threshold):
    s= pred
    s[s<threshold] = 0
    s[s>=threshold] = 1
    s.astype(int)
    return s

# RandomForest tuning directly by ginion kaggle

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)

In [13]:
rnd_clf.fit(X, y) # Data all, na=0, 218 var, dropfirst= false, time around 2m

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
joblib.dump(rnd_clf, "pkl/RandomForest/rndf_clf.pkl")

['pkl/rndf_clf.pkl']

In [18]:
rnd_clf = joblib.load("pkl/RandomForest/rndf_clf.pkl")

In [28]:
pred = cross_val_predict(rnd_clf, X, y, cv=5, method='predict_proba')

In [29]:
normalized_gini(pred[:,1],y)

0.25167130855006264

In [21]:
make_submission('df6', rnd_clf.predict_proba(X_test)[:,1]) 
# private kaggle :  0.25267, public kaggle : 0.24868

# RandomForest train test

In [13]:
X_train, X_t, y_train, y_t = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

In [27]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)

In [30]:
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [34]:
normalized_gini(rnd_clf.predict_proba(X_t)[:,1],y_t)

0.25756406541813154

In [36]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=2,max_leaf_nodes=16, n_jobs=-1)

In [37]:
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [38]:
normalized_gini(rnd_clf.predict_proba(X_t)[:,1],y_t)

0.25426623509808233

#### RandomForestClassifier(n_estimators=500, max_depth=10,max_leaf_nodes=16, n_jobs=-1)

In [9]:
def rnd_gini(clf):
    x = datetime.now()
    clf.fit(X_train, y_train)
    y = datetime.now()
    print('Time to fit : ' + str(x-y))
    tr = normalized_gini(clf.predict_proba(X_train)[:,1],y_train)
    te = normalized_gini(clf.predict_proba(X_t)[:,1],y_t)
    print('Train-gini :', tr)
    print('Test-gini :',  te)
    print('Difference-gini :', tr-te)
    x = str(datetime.now())
    joblib.dump(rnd_clf, "pkl/RandomForest/"+ x +".pkl")
    print("Model file : pkl/RandomForest/"+ x +".pkl")

In [13]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=10,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

Train-gini : 0.261927659016
Test-gini : 0.258120885529
Difference-gini : -0.00380677348716


In [15]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=10,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

Train-gini : 0.261860336923
Test-gini : 0.258169428921
Difference-gini : 0.00369090800209


In [17]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=30,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

2018-01-07 01:18:29.885247
2018-01-07 01:20:10.292170
Train-gini : 0.260397604716
Test-gini : 0.257670990197
Difference-gini : 0.00272661451886


In [42]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=30,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

0.25663494295778089

In [43]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=30,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

0.25880297038265065

In [29]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=80,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

Time to fit : -1 day, 23:58:20.183056
Train-gini : 0.260644307062
Test-gini : 0.257805524946
Difference-gini : 0.00283878211608
Model file file pkl/RandomForest/2018-01-07 02:15:07.259701.pkl


In [26]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_features=6, max_depth=80,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

Time to fit : -1 day, 23:59:05.385189
Train-gini : 0.26279929703
Test-gini : 0.258453448312
Difference-gini : 0.0043458487178
Model file file pkl/RandomForest/2018-01-07 02:09:32.891147.pkl


In [27]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_features=6, max_depth=80,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

Time to fit : -1 day, 23:59:05.110706
Train-gini : 0.263183328992
Test-gini : 0.257546120188
Difference-gini : 0.00563720880351
Model file file pkl/RandomForest/2018-01-07 02:11:04.863216.pkl


In [28]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_features=6, max_depth=80,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

Time to fit : -1 day, 23:59:02.111062
Train-gini : 0.262521498345
Test-gini : 0.257351880532
Difference-gini : 0.00516961781321
Model file file pkl/RandomForest/2018-01-07 02:12:42.979945.pkl


In [25]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_features=12, max_depth=80,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf) # selected

Time to fit : -1 day, 23:58:32.440399
Train-gini : 0.262308628944
Test-gini : 0.258140891951
Difference-gini : 0.00416773699325
Model file file pkl/RandomForest/2018-01-07 02:08:01.219178.pkl


In [20]:
rnd_clf = joblib.load("pkl/RandomForest/f1.pkl")
sub = rnd_clf.predict_proba(X_test)[:,1]
make_submission('f1',sub)

In [23]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_features=20, max_depth=80,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

Time to fit : -1 day, 23:57:38.032723
2018-01-07 02:00:09.446123
Train-gini : 0.259242296878
Test-gini : 0.256828821816
Difference-gini : 0.00241347506119
pkl/RandomForest/2018-01-07 02:00:21.988302.pkl


In [19]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_features=50, max_depth=80,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

2018-01-07 01:41:02.813677
2018-01-07 01:46:08.818712
Train-gini : 0.251520952016
Test-gini : 0.250380593557
Difference-gini : 0.00114035845915


In [22]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_features=100, max_depth=80,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

Time to fit : -1 day, 23:50:50.684120
2018-01-07 01:57:32.427713
Train-gini : 0.24353557253
Test-gini : 0.243786109975
Difference-gini : -0.0002505374449
pkl/RandomForest/2018-01-07 01:57:47.165039.pkl


In [18]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_features=150, max_depth=80,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

2018-01-07 01:26:06.539272
2018-01-07 01:39:21.273438
Train-gini : 0.238508124726
Test-gini : 0.238430168774
Difference-gini : 7.79559513794e-05


In [30]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_features=6, max_depth=200,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf)

Time to fit : -1 day, 23:59:02.967641
Train-gini : 0.263193986076
Test-gini : 0.258099464493
Difference-gini : 0.00509452158267
Model file file pkl/RandomForest/2018-01-07 02:18:40.616762.pkl


In [21]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_features=6, max_depth=10,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf) # selected

Time to fit : -1 day, 23:59:02.277795
Train-gini : 0.262543509623
Test-gini : 0.257240735132
Difference-gini : 0.00530277449123
Model file : pkl/RandomForest/2018-01-08 00:57:42.879064.pkl


In [22]:
sub = rnd_clf.predict_proba(X_test)[:,1]
make_submission('f2',sub)

In [23]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_features=6, max_depth=8,max_leaf_nodes=16, n_jobs=-1)
rnd_gini(rnd_clf) # selected

Time to fit : -1 day, 23:58:57.724715
Train-gini : 0.260405155152
Test-gini : 0.254705985221
Difference-gini : 0.00569916993028
Model file : pkl/RandomForest/2018-01-08 00:59:26.104094.pkl


In [24]:
sub = rnd_clf.predict_proba(X_test)[:,1]
make_submission('f3',sub)