In [1]:
import pandas as pd
import numpy as np
from validation import cross_val, Preprocessing, cross_gen #custom lib
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import math
from os import cpu_count
pd.options.display.max_columns = None

In [2]:
# Worker count for parallel jobs: all cores but one, never fewer than one.
# os.cpu_count() returns None when the core count cannot be determined, so
# fall back to 1 instead of failing on `None - 1`.
n_jobs = max((cpu_count() or 1) - 1, 1)

In [114]:
# Load the train and test tables from the tab-separated `*_cleaned.csv` files.
train, test = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in ('train_cleaned.csv', 'test_cleaned.csv')
)

In [95]:
# Alternative loader (disabled): read train.csv / test.csv and drop the 'Unnamed: 0' column.
#train = pd.read_csv('train.csv', sep='\t').drop("Unnamed: 0",axis=1)
#test = pd.read_csv('test.csv', sep='\t').drop("Unnamed: 0",axis=1)

In [4]:
# Stratified 5-fold splitter with shuffling; seeded so the folds are repeatable.
skf = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

In [5]:
# Baseline random forest, seeded for repeatability.
# n_estimators is pinned to 10, the value shown in this notebook's recorded fit
# output. scikit-learn changed the default to 100 in version 0.22, so leaving it
# implicit would make the baseline scores depend on the installed version.
# oob_score is left at its default (False).
rfc = RandomForestClassifier(n_estimators=10, random_state=42, n_jobs=n_jobs)

In [115]:
# Column '0' is the target; every other column is used as a feature.
y = train['0']
X = train.drop(labels=['0'], axis='columns')

In [7]:
# Mean ROC AUC of the baseline forest over cross-validation folds.
# cv is pinned to 3: this session's outputs show the pre-0.22 scikit-learn
# default of 3 folds, and the default became 5 in 0.22. For a classifier an
# integer cv means unshuffled stratified folds, exactly as the default did.
cv_score = np.mean(
    cross_val_score(rfc, X, y, scoring='roc_auc', cv=3, n_jobs=n_jobs)
)

In [8]:
# Report the mean cross-validated ROC AUC of the baseline forest.
baseline_label = "Pure RF with no data preprocessing:"
print(baseline_label, cv_score)

Pure RF with no data preprocessing: 0.6597779128921988


In [9]:
# NOTE(review): repeats the report from the previous cell; the printed score is
# identical, so this cell is a candidate for removal.
print("Pure RF with no data preprocessing:", cv_score)

Pure RF with no data preprocessing: 0.6597779128921988


# Hold-out validation

In [116]:
# Hold out 12% of the rows as a dev set; the seed keeps the split repeatable.
X_train, X_dev, y_train, y_dev = train_test_split(
    X,
    y,
    test_size=0.12,
    random_state=42,
)

In [117]:
# Mean ROC AUC of the baseline forest, cross-validated on the training portion
# of the hold-out split only.
# cv is pinned to 3: this session's outputs show the pre-0.22 scikit-learn
# default of 3 folds, and the default became 5 in 0.22. For a classifier an
# integer cv means unshuffled stratified folds, exactly as the default did.
cv_score_val = np.mean(
    cross_val_score(rfc, X_train, y_train, scoring='roc_auc', cv=3, n_jobs=n_jobs)
)

In [118]:
# Report the mean cross-validated ROC AUC obtained on the training portion.
split_label = "Pure RF with train_split and no data preprocessing:"
print(split_label, cv_score_val)

Pure RF with train_split and no data preprocessing: 0.6601739769210956


In [119]:
# Fit the baseline forest on the training portion of the hold-out split.
rfc.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=3,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [120]:
# Class-probability estimates of the baseline forest for the dev rows.
y_holdout_pred = rfc.predict_proba(X=X_dev)

In [121]:
# Second probability column; this is the score passed to roc_auc_score below.
y_holdout_pred[:, 1]

array([0.6, 0. , 0.1, ..., 0.1, 0.3, 0.5])

In [122]:
# Hold-out ROC AUC of the baseline forest.
roc_auc_score(y_true=y_dev, y_score=y_holdout_pred[:, 1])  # 0.673341515524867

0.673341515524867

# Cross-validation

In [123]:
# Splitter for the grid search below (rebinds the earlier 5-fold `skf`).
# n_splits is pinned to 3 to match the recorded run ("Fitting 3 folds ...").
# scikit-learn changed the default from 3 to 5 in version 0.22, so leaving it
# implicit would silently change the search on newer installs.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [124]:
# Search grid: forest size fixed at 150 trees; 3 x 2 x 2 = 12 candidates.
parameters = {
    'n_estimators': [150],
    'max_features': [13, 15, 20],
    'min_samples_leaf': [1, 7],
    'max_depth': [20, 25],
}
# Dedicated, unfitted estimator for the search, so the search no longer reuses
# the already-fitted baseline `rfc`. n_jobs follows the notebook-wide setting
# rather than -1, which matches the estimator shown in the recorded output.
# NOTE(review): the search also runs n_jobs workers, so parallelism is nested.
rfc_tuned = RandomForestClassifier(random_state=42, n_jobs=n_jobs)
gcv = GridSearchCV(
    rfc_tuned,
    parameters,
    n_jobs=n_jobs,
    cv=skf,
    verbose=1,
    scoring='roc_auc',
)
# Last expression: fits the search and displays the fitted GridSearchCV.
gcv.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=3)]: Done  36 out of  36 | elapsed:  2.8min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=3,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'n_estimators': [150], 'max_features': [13, 15, 20], 'min_samples_leaf': [1, 7], 'max_depth': [20, 25]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [138]:
# Display the estimator the grid search selected.
gcv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features=13, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=7, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=3,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [139]:
# Best score found by the search (the search was configured with scoring='roc_auc').
gcv.best_score_

0.7211786120184293

In [140]:
# Keep a handle on the selected model. Despite the `dt` suffix this is the
# RandomForestClassifier chosen by the grid search, not a decision tree.
best_dt = gcv.best_estimator_

In [141]:
# Class-probability estimates of the tuned forest for the dev rows.
y_pred_best = best_dt.predict_proba(X=X_dev)

In [142]:
# Re-scores the untuned baseline predictions (y_holdout_pred) for comparison.
# The recorded output is 0.673341515524867; the 0.736330981 previously noted
# here does not match what this expression evaluates to.
roc_auc_score(y_dev,y_holdout_pred[:, 1]) # baseline: 0.673341515524867

0.673341515524867

In [143]:
# Hold-out ROC AUC of the tuned forest.
roc_auc_score(y_true=y_dev, y_score=y_pred_best[:, 1])  # 0.737981617413134

0.737981617413134

In [144]:
# Split the test table the same way as train: column '0' vs. everything else.
y_test = test['0']
X_test = test.drop(labels=['0'], axis='columns')

In [146]:
# Class-probability estimates of the tuned forest for the test rows.
prediction = best_dt.predict_proba(X=X_test)

In [147]:
# Take the 'Unnamed: 0' column of test.csv; it becomes the submission's ID column.
# NOTE(review): assumes test.csv rows are in the same order as test_cleaned.csv — confirm.
test_index = pd.read_csv('test.csv', sep='\t')['Unnamed: 0']

In [148]:
# Build the submission: one ID per test row with the second probability
# column as its value. The frame is built in one step; the zero-filled
# placeholder column was overwritten immediately and served no purpose.
if len(test_index) != len(prediction):
    # Fail with a clear message if the ID source and the predicted rows differ.
    raise ValueError(
        "test.csv has %d rows but %d predictions were produced"
        % (len(test_index), len(prediction))
    )
submition = pd.DataFrame({'_ID_': test_index, '_VAL_': prediction[:, 1]})
submition.to_csv('submission_rf.csv', index=False)

# Leaderboard (LB) score of this submission: 0.74140859