In [432]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

%run ../src/Munger.py
%run ../src/Models.py
%run ../src/utils.py

In [6]:
# load dataset
numer_ai = pd.read_csv('../data/numerai_datasets/numerai_training_data.csv')
test = pd.read_csv('../data/numerai_datasets/numerai_tournament_data.csv')

In [7]:
# split into training and validation sets as per the competition instructions
train = numer_ai[numer_ai.validation == 0]
validation = numer_ai[numer_ai.validation == 1]

In [442]:
munger = Munger(train, validation, test)

In [443]:
## remove correlated features
munger.remove_correlated_features()

In [444]:
## label encoding a categorical feature
munger.label_encoding()

In [359]:
## one hot encoding categorical feature
munger.one_hot_encoding()

In [445]:
X_train = munger.X
y_train = munger.y

X_validation = munger.X_validation
y_validation = munger.y_validation

X_test = munger.X_test

In [446]:
model = Models()
clf = model.logistic_regression_model()

In [None]:
## 5-fold cross validation
mean_score, mean_std = eval_models([clf], X_train, y_train)

In [435]:
print 'Mean AUC score %f and std %f ' %(mean_score, mean_std)

Mean AUC score 0.531229 and std 0.004915 


In [436]:
# fit a model
clf.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])

In [437]:
# calibrate predictions
sig_clf = CalibratedClassifierCV(clf, method='isotonic', cv='prefit')
sig_clf.fit(X_validation, y_validation)

CalibratedClassifierCV(base_estimator=Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))]),
            cv='prefit', method='isotonic')

In [438]:
## predictions
predsValidation = sig_clf.predict_proba(X_validation)[:, 1]

In [439]:
## check to see how this dummy classifier performs
print 'ROC AUC Score on the validation examples %f ' %(roc_auc_score(y_validation, predsValidation))

ROC AUC Score on the validation examples 0.534512 


In [440]:
# create full dataset
munger.concatenate_train_validation()

In [441]:
# full dataset
X_full = munger.X_full
y_full = munger.y_full

In [425]:
## fit on whole dataset
clf.fit(X_full, y_full)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])

In [426]:
# calibration
sig_clf.fit(X_full, y_full)

CalibratedClassifierCV(base_estimator=Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))]),
            cv='prefit', method='isotonic')

In [427]:
predictions = sig_clf.predict_proba(X_test)[:, 1]

In [430]:
# submission dataframe
submission_df = pd.read_csv('../data/numerai_datasets/numerai_example_predictions.csv')

In [431]:
prepare_submission(submission_df, predictions, 'logistic_reg_label_enc_calibrated.csv')