In [2]:
%pylab
%load_ext autoreload
%autoreload 2

from datetime import datetime, timedelta

import os
import sys

import numpy as np
import pandas as pd
from scipy import sparse
import sklearn as sl

from sklearn.cross_validation import StratifiedKFold
from sklearn.svm import 
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support

from cPickle import Pickler

Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib


## Load preprocessed train data

In [3]:
if os.name == 'nt':
    TRAIN_PATH = r'D:\train.csv'
    PTRAIN_PATH = r'D:\train_preprocessed2.csv'
    TEST_PATH = r'D:\test.csv'
    GOOGNEWS_PATH = r'D:\GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'D:\big.txt'
else:
    TRAIN_PATH = r'/media/speedy/train.csv'
    PTRAIN_PATH = r'/media/speedy/train_preprocessed2.csv'
    TEST_PATH = r'/media/speedy/test.csv'
    GOOGNEWS_PATH = r'/media/speedy/GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'/media/speedy/big.txt'
df = pd.read_csv(PTRAIN_PATH, dtype=np.float32)

In [4]:
print "Calculating medians"
df_median = df.median(skipna=True)
print "Filling NAs"
df = df.fillna(value=df_median)

Calculating medians
Filling NAs


In [5]:
df = df.astype(np.float32)
df = df.drop(['VAR_0207', 'VAR_0213', 'VAR_0840'], axis=1)

ValueError: labels ['VAR_0207' 'VAR_0213' 'VAR_0840'] not contained in axis

In [27]:
df.to_csv(r'D:\train_preprocessed2.csv', index=False)

In [6]:
X = df.loc[:, df.columns != 'target']
y = df['target']

In [7]:
print 'Building folds'
skf = StratifiedKFold(y, n_folds=5)

print 'Iterating folds'
classifiers = []
y_hats = []

for train_index, test_index in skf:
    classifier = RandomForestClassifier(n_estimators=1000, n_jobs=-1, verbose=1)
    classifier.fit(X.iloc[train_index, :], y.iloc[train_index])
    y_hat = classifier.predict_proba(X.iloc[test_index, :])

    y_hats.append(y_hat)
    classifiers.append(classifier)

[Parallel(n_jobs=-1)]: Done   1 out of 1000 | elapsed:    5.5s remaining: 91.0min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 10.9min finished
[Parallel(n_jobs=8)]: Done   1 out of 359 | elapsed:    0.0s remaining:   10.4s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    4.9s finished
[Parallel(n_jobs=-1)]: Done   1 out of 1000 | elapsed:    4.9s remaining: 81.9min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 10.5min finished
[Parallel(n_jobs=8)]: Done   1 out of 308 | elapsed:    0.0s remaining:    8.9s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    5.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of 1000 | elapsed:    5.1s remaining: 85.0min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 10.4min finished
[Parallel(n_jobs=8)]: Done   1 out of  26 | elapsed:    0.0s remaining:    0.7s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    5.0s finished
[Parallel(n_jobs=-1)]: Done   1 out of 1000 | elapsed:    5.1s remaining: 84.3m

Building folds
Iterating folds


In [39]:
y_hats = np.asarray(y_hats)
aucs = np.zeros(y_hats.shape)
precisions = np.zeros(y_hats.shape)
recalls = np.zeros(y_hats.shape)
f1_scores = np.zeros(y_hats.shape)

print '+---------+---------+---------+---------+'
print '|   auc   |precision| recall  |f1_score |'
print '+---------+---------+---------+---------+'
for i, (train_index, test_index) in enumerate(skf):
    y_hat_bin = y_hats[i].T[0] >= 0.5
    auc = roc_auc_score(y.iloc[test_index], y_hat_bin)
    precision, recall, f1_score, _ = precision_recall_fscore_support(y.iloc[test_index], y_hat_bin,
                                                                     pos_label=None, average='binary')
    aucs[i] = auc
    precisions[i] = precision
    recalls[i] = recall
    f1_scores[i] = f1_score

    print '+---------+---------+---------+---------+'
    print '|{:^9.4}|{:^9.4}|{:^9.4}|{:^9.4}|'.format(auc, precision, recall, f1_score)
    print '+---------+---------+---------+---------+'

+---------+---------+---------+---------+
|   auc   |precision| recall  |f1_score |
+---------+---------+---------+---------+
+---------+---------+---------+---------+
| 0.4156  | 0.2874  | 0.2078  | 0.1129  |
+---------+---------+---------+---------+
+---------+---------+---------+---------+
| 0.4175  | 0.2888  | 0.2085  | 0.1128  |
+---------+---------+---------+---------+
+---------+---------+---------+---------+
| 0.4145  | 0.2914  | 0.2079  | 0.1143  |
+---------+---------+---------+---------+
+---------+---------+---------+---------+
| 0.4137  | 0.2876  | 0.2073  | 0.1135  |
+---------+---------+---------+---------+
+---------+---------+---------+---------+
|  0.416  |  0.282  | 0.2074  | 0.1115  |
+---------+---------+---------+---------+


In [45]:
for classifier in classifiers:
    hundred_most_important_features = np.argpartition(classifier.feature_importances_, range(50))[:50]
    print df.columns[hundred_most_important_features]

Index([u'VAR_0179_year', u'VAR_0005=s', u'VAR_0214=fsi-0005-1',
       u'VAR_0214=hre-social security number-1289',
       u'VAR_0214=hre-social security number-15335',
       u'VAR_0214=hre-social security number-1855',
       u'VAR_0214=hre-social security number-2857', u'VAR_0237=al',
       u'VAR_0274=rn', u'VAR_0283=u', u'VAR_0342=ua', u'VAR_0354=u',
       u'VAR_0214=hre-social security number-1747',
       u'VAR_0214=hre-social security number-1397',
       u'VAR_0214=hre-social security number-10143',
       u'VAR_0214=hre-social security number-18823', u'VAR_0342=fu',
       u'VAR_0214=hre-home phone-0621', u'VAR_0342=af',
       u'VAR_0214=hre-social security number-1373', u'VAR_0283=s',
       u'VAR_0214=hre-home phone-0779', u'VAR_0193', u'VAR_0191',
       u'VAR_0237=mi', u'VAR_0237=co', u'VAR_0194', u'VAR_0114', u'VAR_0192',
       u'VAR_0098', u'VAR_0138', u'VAR_0237=ct', u'VAR_0130', u'VAR_0342=bf',
       u'VAR_0106', u'VAR_0325=f', u'VAR_0107', u'VAR_0342=eu', u'VAR_0