In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("data/train.csv")

In [3]:
# a peek at data
df.head(2).T

Unnamed: 0,0,1
ID_code,train_0,train_1
target,0,0
var_0,8.9255,11.5006
var_1,-6.7863,-4.1473
var_2,11.9081,13.8588
var_3,5.093,5.389
var_4,11.4607,12.3622
var_5,-9.2834,7.0433
var_6,5.1187,5.6208
var_7,18.6266,16.5338


In [4]:
# basic stats

df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
target,200000.0,0.100490,0.300653,0.0000,0.000000,0.00000,0.000000,1.0000
var_0,200000.0,10.679914,3.040051,0.4084,8.453850,10.52475,12.758200,20.3150
var_1,200000.0,-1.627622,4.050044,-15.0434,-4.740025,-1.60805,1.358625,10.3768
var_2,200000.0,10.715192,2.640894,2.1171,8.722475,10.58000,12.516700,19.3530
var_3,200000.0,6.796529,2.043319,-0.0402,5.254075,6.82500,8.324100,13.1883
var_4,200000.0,11.078333,1.623150,5.0748,9.883175,11.10825,12.261125,16.6714
var_5,200000.0,-5.065317,7.863267,-32.5626,-11.200350,-4.83315,0.924800,17.2516
var_6,200000.0,5.408949,0.866607,2.3473,4.767700,5.38510,6.003000,8.4477
var_7,200000.0,16.545850,3.418076,5.3497,13.943800,16.45680,19.102900,27.6918
var_8,200000.0,0.284162,3.332634,-10.5055,-2.317800,0.39370,2.937900,10.1513


In [5]:
# corr analysis

df.corr().abs()['target'].sort_values(ascending=False)

target     1.000000
var_81     0.080917
var_139    0.074080
var_12     0.069489
var_6      0.066731
var_110    0.064275
var_146    0.063644
var_53     0.063399
var_26     0.062422
var_76     0.061917
var_174    0.061669
var_22     0.060558
var_21     0.058483
var_99     0.058367
var_166    0.057773
var_80     0.057609
var_190    0.055973
var_2      0.055870
var_165    0.055734
var_13     0.055156
var_148    0.055011
var_133    0.054548
var_198    0.053000
var_34     0.052692
var_0      0.052390
var_1      0.050343
var_115    0.050174
var_179    0.050002
var_109    0.049926
var_40     0.049530
             ...   
var_61     0.007407
var_182    0.007198
var_153    0.007103
var_73     0.006460
var_14     0.006332
var_60     0.006265
var_129    0.005880
var_46     0.005690
var_183    0.005467
var_160    0.005135
var_29     0.004682
var_124    0.004218
var_161    0.004168
var_39     0.004090
var_98     0.004074
var_158    0.003817
var_136    0.003554
var_96     0.003037
var_7      0.003025


Two main takeaways from the above. 1 - there is no strong univariate correlation between the variables and 2 - there is not much information provided in the data to do much of feature engineering

We shall see how Random Forest performs on this model

In [6]:
df.dtypes

ID_code     object
target       int64
var_0      float64
var_1      float64
var_2      float64
var_3      float64
var_4      float64
var_5      float64
var_6      float64
var_7      float64
var_8      float64
var_9      float64
var_10     float64
var_11     float64
var_12     float64
var_13     float64
var_14     float64
var_15     float64
var_16     float64
var_17     float64
var_18     float64
var_19     float64
var_20     float64
var_21     float64
var_22     float64
var_23     float64
var_24     float64
var_25     float64
var_26     float64
var_27     float64
            ...   
var_170    float64
var_171    float64
var_172    float64
var_173    float64
var_174    float64
var_175    float64
var_176    float64
var_177    float64
var_178    float64
var_179    float64
var_180    float64
var_181    float64
var_182    float64
var_183    float64
var_184    float64
var_185    float64
var_186    float64
var_187    float64
var_188    float64
var_189    float64
var_190    float64
var_191    f

In [7]:
# train test splits
X = df.drop(['target', 'ID_code'], axis=1).copy()
y = df.target.copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# baseline Rf model

In [8]:
model = RandomForestClassifier(n_estimators=100)

In [9]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
preds = model.predict(X_test)

### Metrics: Accuracy and ROC_AUC Score

In [11]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [12]:
accuracy_score(y_test, preds)

0.90114

In [13]:
proba = model.predict_proba(X_test)

roc auc score

In [14]:
roc_auc_score(y_test, proba[:,1])

0.8158061963646371