In [1]:
import pandas as pd
import numpy as np
from validation import cross_val, Preprocessing, cross_gen #custom lib
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
import math
from os import cpu_count
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Worker count for the parallel scikit-learn calls in this notebook: every core
# but one, and never fewer than 1.  os.cpu_count() may return None when the
# core count cannot be determined, so fall back to a single core in that case
# instead of letting `None - 1` raise a TypeError.
n_jobs = max((cpu_count() or 1) - 1, 1)

In [3]:
# Read the tab-separated train_cleaned.csv / test_cleaned.csv tables.
train, test = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in ('train_cleaned.csv', 'test_cleaned.csv')
)

In [4]:
#train = pd.read_csv('train.csv', sep='\t').drop("Unnamed: 0",axis=1)
#test = pd.read_csv('test.csv', sep='\t').drop("Unnamed: 0",axis=1)

In [5]:
# 5-fold stratified splitter with shuffling and a fixed seed.
# NOTE(review): the cross_val_score calls in this notebook do not pass cv=skf,
# and `skf` is rebound before the grid search, so this splitter appears to be
# unused — confirm whether it was meant to be passed as `cv`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [6]:
# Baseline model: an ExtraTreesClassifier with a fixed seed and otherwise
# library-default hyperparameters (note the variable is named `rfc`, but the
# estimator is ExtraTrees, not a random forest).
rfc = ExtraTreesClassifier(random_state=42, n_jobs=n_jobs)#, oob_score=True

In [7]:
# Column '0' is used as the target; every other column is a feature.
y = train['0']
X = train.drop(columns=['0'])

In [8]:
# Mean ROC AUC of the untuned baseline over cross_val_score's default folds.
cv_score = cross_val_score(rfc, X, y, scoring='roc_auc', n_jobs=n_jobs).mean()

In [9]:
# Report the baseline's mean cross-validated ROC AUC on the full training table.
print("Pure EF with no data preprocessing:", cv_score)

Pure EF with no data preprocessing: 0.6778553551613279


# Hold-out validation

In [10]:
# Reserve 12% of the rows as a hold-out (dev) set.  The split is seeded for
# reproducibility; note that it is not stratified on y.
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.12, random_state=42)

In [11]:
# Same baseline cross-validation as before, restricted to the training part of
# the hold-out split.
cv_score_val = cross_val_score(
    rfc, X_train, y_train, scoring='roc_auc', n_jobs=n_jobs
).mean()

In [12]:
# Report the baseline's mean cross-validated ROC AUC on the training split only.
print("Pure EF with train_split and no data preprocessing:", cv_score_val)

Pure EF with train_split and no data preprocessing: 0.668952700843569


In [13]:
# Fit the baseline on the training part of the hold-out split (cross_val_score
# works on clones, so `rfc` itself has not been fitted before this call).
rfc.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=3,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [14]:
# Per-class probability predictions of the baseline for the hold-out rows.
y_holdout_pred = rfc.predict_proba(X_dev)

In [15]:
# Second predict_proba column — presumably P(class 1) for a binary target;
# confirm against rfc.classes_.
y_holdout_pred[:, 1]

array([0.4, 0. , 0. , ..., 0. , 0.2, 0.1])

In [16]:
# Hold-out ROC AUC of the baseline model.
# NOTE(review): an earlier inline note recorded 0.673341515524867, which differs
# from this cell's recorded output — presumably left over from a previous run.
roc_auc_score(y_dev,y_holdout_pred[:, 1])

0.6668302053277846

# Cross-validation

In [17]:
# Splitter used by the grid search below (rebinds the earlier `skf`).
# n_splits is pinned to 3 — the value the recorded run of this notebook used
# ("Fitting 3 folds ...") — because scikit-learn changed the default from 3 to 5
# in version 0.22, which would otherwise silently change the search and its
# scores on a newer installation.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [18]:
# Hyperparameter grid for the ExtraTrees search: 3 * 3 * 4 = 36 combinations.
parameters = {
    'max_features': [5, 6, 7],
    'min_samples_leaf': [1, 3, 5],
    'max_depth': [5, 10, 15, 20],
}
# Parallelise at the search level only.  Requesting all cores both inside each
# forest and across the grid-search fits nests two all-core pools and
# oversubscribes the CPU; it also ignored the notebook-wide `n_jobs` setting
# that deliberately leaves one core free.  With a fixed random_state the fitted
# models and scores do not depend on the number of workers.
etc = ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=1)
etcv = GridSearchCV(etc, parameters, n_jobs=n_jobs, cv=skf, verbose=1, scoring='roc_auc')
# ROC AUC is the selection metric; the search is fitted on the training split
# only, so the hold-out rows stay unseen.
etcv.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  3.3min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
       error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_features': [5, 6, 7], 'min_samples_leaf': [1, 3, 5], 'max_depth': [5, 10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [19]:
# Show the estimator with the best-scoring parameter combination from the search.
etcv.best_estimator_

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=20, max_features=7, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [20]:
# Cross-validated ROC AUC (the search's scoring metric) of the best combination.
etcv.best_score_

0.7242286626616734

In [48]:
# NOTE(review): duplicate of the previous cell; its execution count (48) is out
# of sequence with the surrounding cells, so it was run at a different time.
etcv.best_score_

0.7242286626616734

In [21]:
# Keep a handle on the tuned model (an ExtraTreesClassifier, despite the `dt`
# in the name).
best_dt = etcv.best_estimator_

In [22]:
# Per-class probability predictions of the tuned model for the hold-out rows.
y_pred_best = best_dt.predict_proba(X_dev)

In [23]:
# Baseline hold-out ROC AUC again (this scores y_holdout_pred, i.e. the untuned
# `rfc` predictions), shown for comparison with the tuned model in the next cell.
# NOTE(review): the previous inline note "0.736330981" does not match this
# cell's recorded output and looks stale.
roc_auc_score(y_dev,y_holdout_pred[:, 1])

0.6668302053277846

In [24]:
roc_auc_score(y_dev,y_pred_best[:, 1]) #0.747267563447479

0.747267563447479

In [57]:
# Split the test table the same way as the training table: column '0' apart
# from the feature columns.
y_test = test['0']
X_test = test.drop(columns=['0'])

In [58]:
# Per-class probability predictions of the tuned model for the test features.
prediction =best_dt.predict_proba(X_test)

In [59]:
# Row identifiers for the submission: the 'Unnamed: 0' column of test.csv.
test_index = pd.read_csv('test.csv', sep='\t')['Unnamed: 0']

In [60]:
# Assemble the submission table — one row per test id, with the second
# predict_proba column as the value — and write it out without the index.
submition = pd.DataFrame({'_ID_': test_index, '_VAL_': prediction[:, 1]})
submition.to_csv('submission_ef.csv', index=False)

# Leaderboard (LB) submission — LB score: 0.74110492
