In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import time
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

import xgboost as xgb

# Single seed for the notebook: it seeds NumPy's global RNG here and is also
# passed as random_state to the classifiers trained further down.
SEED = 9719
np.random.seed(SEED)

# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation warnings from the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Execute the project helper scripts; with IPython's %run the names they define
# become available in this notebook's namespace.
# Presumably these provide load(), create_splits() and create_features(), which
# are called below but not defined in this notebook — confirm against the scripts.
%run ../src/data/load_dataset.py
%run ../src/models/cross_validation.py
%run ../src/features/build_features.py

**Load Dataset**

In [2]:
# Load the raw train/test CSVs through the project's load() helper and report
# how long it took.
# time.clock() was removed in Python 3.8; time.perf_counter() is the documented
# replacement for measuring elapsed (wall-clock) time.
st = time.perf_counter()
train, test = load('../data/raw/205e1808-6-dataset/train.csv',
                   '../data/raw/205e1808-6-dataset/test.csv',
                   include_dtypes=True
                  )
et = time.perf_counter()

print('Took: {} seconds to load dataset'.format((et - st)))

Took: 37.995780999999994 seconds to load dataset


**Prepare Features**

In [3]:
# Run the project's create_features() helper over both frames and report how
# long it took. Note that train/test are rebound to the helper's return values,
# so re-running this cell feeds already-processed frames back in.
# time.clock() was removed in Python 3.8; time.perf_counter() is the documented
# replacement for measuring elapsed (wall-clock) time.
st = time.perf_counter()
train, test = create_features(train, test)
et = time.perf_counter()

print('Took: {} seconds to prepare features'.format((et - st)))

Took: 25.46125 seconds to prepare features


**Create feature matrix and target vector**

In [4]:
# Columns fed to the model, in the order they appear in the feature matrix.
features = [
    'hour_of_day',
    'prime_time',
    'siteid_count',
]

In [4]:
# Target vector: the 'click' column of the training frame.
y = train['click']

# Feature matrices for training and for the test set, restricted to the
# selected feature columns.
X = train.loc[:, features]
Xtest = test.loc[:, features]

**Cross Validation Scheme**

In [9]:
# Time-based hold-out: rows strictly before '2017/01/14' or strictly after
# '2017/01/16' are used for training; every remaining row is held out for
# validation.
before_window = train['datetime'] < '2017/01/14'
after_window = train['datetime'] > '2017/01/16'
training_mask = before_window | after_window
validation_mask = ~training_mask

# create_splits() is a project helper; the meaning of the trailing None
# argument is not visible from this notebook.
X_train, X_val, y_train, y_val = create_splits(
    X, y, training_mask, validation_mask, None
)

**Train Models**

In [12]:
# Fit a shallow random forest on the training split and report the fit time.
# time.clock() was removed in Python 3.8; time.perf_counter() is the documented
# replacement for measuring elapsed (wall-clock) time.
st = time.perf_counter()
rf = RandomForestClassifier(max_depth=5, random_state=SEED, n_jobs=3)
rf.fit(X_train, y_train)
et = time.perf_counter()

print('Took: {} seconds to train model'.format((et - st)))

# Score the hold-out rows with the probability of the second class in
# rf.classes_ (presumably click == 1 — confirm the label encoding).
preds = rf.predict_proba(X_val)[:, 1]
print('ROC AUC score on validation set: {}'.format(roc_auc_score(y_val, preds)))

ROC AUC score on validation set: 0.841431183578659


**Full Training**

In [5]:
# Refit the same random forest configuration on the full training frame and
# report the fit time. This rebinds rf, replacing any previously fitted model.
# time.clock() was removed in Python 3.8; time.perf_counter() is the documented
# replacement for measuring elapsed (wall-clock) time.
st = time.perf_counter()
rf = RandomForestClassifier(max_depth=5, random_state=SEED, n_jobs=3)
rf.fit(train.loc[:, features], train['click'])
et = time.perf_counter()

print('Took: {} seconds to train model'.format((et - st)))

# Test-set predictions: probability of the second class in rf.classes_
# (presumably click == 1 — confirm the label encoding), in test row order.
final_preds = rf.predict_proba(test.loc[:, features])[:, 1]

Took: 138.86973 seconds to train model


In [7]:
# Drop the notebook's reference to the training frame and run a garbage
# collection pass. gc.collect() is the last expression, so its return value
# is what the cell displays.
# NOTE(review): any cell that reads `train` cannot be re-run after this one
# without reloading the data.
del train
gc.collect()

73

In [12]:
# Read the sample submission file into the frame that will hold the final
# predictions.
sub = pd.read_csv(
    '../data/raw/205e1808-6-dataset/sample_submission.csv'
)

# Disabled write-out path (fill, save, zip, remove the uncompressed CSV),
# kept for reference:
# sub.loc[:, 'ID']    = test.ID
# sub.loc[:, 'click'] = final_preds

# sub.to_csv('../submissions/rf_siteid_count.csv', index=False)
# !zip '../submissions/rf_siteid_count.csv.zip' '../submissions/rf_siteid_count.csv'
# !rm '../submissions/rf_siteid_count.csv'

In [14]:
# Show the first six rows of the submission frame.
sub.head(6)

Unnamed: 0,ID,click
0,IDE4beP,0.5
1,IDfo26Y,0.5
2,IDYZM6I,0.5
3,ID8CVw1,0.5
4,IDPltMK,0.5
5,IDaFVqz,0.5


In [15]:
# Fill the submission frame positionally.
# final_preds is a NumPy array in test-row order, so it is written row by row.
# Assigning the test.ID Series directly would instead align on index labels:
# if test's index is not the same 0..n-1 range as sub's, IDs could be paired
# with the wrong predictions or silently become NaN. Assigning the underlying
# values keeps ID and click in the same (test-row) order and raises on a
# length mismatch instead of hiding it.
sub.loc[:, 'ID']    = test.ID.values
sub.loc[:, 'click'] = final_preds

In [17]:
# Write the filled submission frame to disk without the index column.
sub.to_csv(
    '../submissions/rf_siteid_count.csv',
    index=False,
)