In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.calibration import CalibratedClassifierCV
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV

import matplotlib.pyplot as plt

from scipy.stats import randint

%matplotlib inline

%run ../src/Munger.py
%run ../src/Models.py
%run ../src/utils.py

In [2]:
# Load the Numerai CSVs (relative paths):
#   numer_ai - training data; its `validation` column is used in the next cell
#              to separate training rows from validation rows
#   test     - tournament data, handed to Munger alongside the other two frames
numer_ai = pd.read_csv('../data/numerai_datasets/numerai_training_data.csv')
test = pd.read_csv('../data/numerai_datasets/numerai_tournament_data.csv')

In [3]:
# Split into training and validation sets as per the competition instructions,
# using the `validation` flag column (0 -> training row, 1 -> validation row).
# .copy() turns each subset into an independent DataFrame instead of a slice of
# `numer_ai`. Column assignments made on a boolean-mask slice are the usual
# cause of pandas' SettingWithCopyWarning (printed by the Munger cell below) and
# are not guaranteed to take effect.
train = numer_ai[numer_ai.validation == 0].copy()
validation = numer_ai[numer_ai.validation == 1].copy()

In [4]:
# Munger is presumably defined by ../src/Munger.py (%run in the first cell).
# It receives the training, validation and tournament frames together.
munger = Munger(train, validation, test)
## remove correlated features
munger.remove_correlated_features()
## label encoding a categorical feature
# NOTE(review): which column is encoded is decided inside Munger - not visible here.
munger.label_encoding()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [None]:
## one hot encoding categorical feature
# munger.one_hot_encoding()

In [5]:
# Pull the prepared feature/target pairs off the munger, one split per line.
X_train, y_train = munger.X, munger.y
X_validation, y_validation = munger.X_validation, munger.y_validation

# Tournament features only - no target is read for this split.
X_test = munger.X_test

In [6]:
# Models is presumably the factory defined by ../src/Models.py (%run in the first cell).
model = Models()
# Single-model baseline used by the cross-validation, fit and plotting cells below.
clf = model.logistic_regression_model()
# Alternative baseline: uncomment to use the gradient-boosted model instead.
# clf = model.extreme_gbm()

In [None]:
## 5-fold cross validation
# NOTE(review): the fold count and metric are chosen inside eval_models
# (presumably from ../src/utils.py) - confirm the "5-fold" claim there.
# Returns the mean score and its spread for the supplied list of models.
mean_score, mean_std = eval_models([clf], X_train, y_train)

In [None]:
# Report the cross-validation summary returned by eval_models.
# print(...) with one parenthesised argument produces the same output under the
# Python 2 print statement and the Python 3 print function.
print('Mean AUC score %f and std %f ' % (mean_score, mean_std))

In [None]:
# Fit the baseline classifier on the training split only; the validation split
# stays untouched so it can be scored in the following cells.
clf.fit(X_train, y_train)

In [None]:
## predictions
# Keep only column 1 of predict_proba - presumably the probability of the
# positive class (sklearn orders columns by clf.classes_; confirm labels are 0/1).
predsValidation = clf.predict_proba(X_validation)[:, 1]

In [None]:
## check to see how this classifier performs
# print(...) with one parenthesised argument produces the same output under the
# Python 2 print statement and the Python 3 print function.
print('ROC AUC Score on the validation examples %f ' % (roc_auc_score(y_validation, predsValidation)))

## Stacked blending

In [8]:
# Two fresh base learners for the blend (independent of `clf` above).
linear_model = model.logistic_regression_model()
non_linear_model = model.extreme_gbm()

# NOTE: this rebinds predsValidation, replacing the single-model predictions
# from the earlier cell; any later scoring refers to the blended output.
# stacked_blending presumably comes from ../src/utils.py - its fold scheme and
# blender are not visible here.
predsValidation = stacked_blending([linear_model, non_linear_model], X_train, y_train, X_validation)

Creating train and test sets for blending.
0 Pipeline(steps=[('ft', FeatureTransformer()), ('select', SelectKBest(k=5, score_func=<function chi2 at 0x0000000016743588>)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=3.0, class_weight='auto', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])
Fold 0
Fold 1
Fold 2
1 Pipeline(steps=[('ft', FeatureTransformer()), ('select', SelectKBest(k=5, score_func=<function chi2 at 0x0000000016743588>)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.25...logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])
Fold 0
Fold 1
Fold 2

Blending.


In [9]:
# Score whatever predsValidation currently holds against the validation targets.
# print(...) with one parenthesised argument produces the same output under the
# Python 2 print statement and the Python 3 print function.
print('ROC AUC Score on the validation examples %f ' % (roc_auc_score(y_validation, predsValidation)))

ROC AUC Score on the validation examples 0.529126 


## Learning curves

In [None]:
from sklearn.learning_curve import learning_curve

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation ROC-AUC against training-set size.

    Opens a new pyplot figure, runs ``learning_curve`` with
    ``scoring='roc_auc'`` and draws, for both the training and the
    cross-validation scores, the mean across folds as a marked line with a
    shaded band of one standard deviation on either side.

    Parameters
    ----------
    estimator : estimator passed straight through to ``learning_curve``.
    title : text used as the figure title.
    X, y : features and targets passed to ``learning_curve``.
    ylim : optional pair unpacked into ``plt.ylim``; left alone when None.
    cv : forwarded to ``learning_curve``.
    n_jobs : forwarded to ``learning_curve``.
    train_sizes : forwarded to ``learning_curve``.

    Returns nothing; the figure is left as pyplot's current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring='roc_auc')

    # One entry per curve: (mean over folds, std over folds, colour, legend label).
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    ]
    plt.grid()

    # Shaded +/- one-std bands first, then the mean lines on top.
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for centre, _, colour, legend_text in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=legend_text)

    plt.legend(loc="best")

In [None]:
# Learning curve for the baseline classifier on the training split; cv=5 is
# forwarded to learning_curve, and n_jobs keeps its default of -1.
plot_learning_curve(clf, 'Learning Curve', X_train, y_train, cv=5)

## ROC curve

In [None]:
def make_roc(name, clf, ytest, xtest, ax=None, labe=5, proba=True, skip=0):
    """Plot the ROC curve of a fitted classifier and annotate its thresholds.

    Parameters
    ----------
    name : label for this classifier in the legend entry.
    clf : fitted classifier exposing ``predict_proba`` (when ``proba`` is
        true) or ``decision_function`` (otherwise).
    ytest : true targets passed to ``roc_curve``.
    xtest : features scored by ``clf``.
    ax : axes to draw on. When None, the current pyplot axes are used and the
        chance diagonal, axis limits, axis labels and title are added as well.
    labe : every ``labe``-th ROC point is annotated with its threshold,
        rounded to two decimals. A falsy value (0/None) disables annotation.
    proba : pick ``predict_proba(xtest)[:, 1]`` (True) or
        ``decision_function(xtest)`` (False) as the score.
    skip : when non-zero, only every ``skip``-th ROC point is plotted.

    Returns
    -------
    The axes that were drawn on, so further curves can be added to it.
    """
    # Test identity with None rather than truthiness, so a supplied axes object
    # is never mistaken for "not supplied".
    initial = ax is None
    if initial:
        ax = plt.gca()

    if proba:
        scores = clf.predict_proba(xtest)[:, 1]
    else:
        scores = clf.decision_function(xtest)
    fpr, tpr, thresholds = roc_curve(ytest, scores)
    roc_auc = auc(fpr, tpr)

    curve_label = 'ROC curve for %s (area = %0.2f)' % (name, roc_auc)
    if skip:
        n_points = fpr.shape[0]
        ax.plot(fpr[0:n_points:skip], tpr[0:n_points:skip], '.-', alpha=0.3,
                label=curve_label)
    else:
        ax.plot(fpr, tpr, '.-', alpha=0.3, label=curve_label)

    label_kwargs = {
        'bbox': dict(boxstyle='round,pad=0.3', alpha=0.2),
    }
    # range() works on both Python 2 and 3 (xrange exists only on Python 2).
    # A zero step would raise ValueError, so a falsy `labe` skips annotation.
    if labe:
        for k in range(0, fpr.shape[0], labe):
            # from https://gist.github.com/podshumok/c1d1c9394335d86255b8
            threshold = str(np.round(thresholds[k], 2))
            ax.annotate(threshold, (fpr[k], tpr[k]), **label_kwargs)

    if initial:
        # First curve on these axes: add the chance line and axis decorations.
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC')
    ax.legend(loc="lower right")
    return ax

In [None]:
# ROC of the baseline classifier on the validation split; no axes are passed,
# so make_roc draws on the current axes and returns them for reuse.
ax = make_roc('logistic', clf, y_validation, X_validation, labe=5)

## Train on full dataset

In [10]:
# create full dataset
# Called for its side effect on `munger`; the next cell reads X_full / y_full
# from it. NOTE(review): presumably stacks training and validation rows -
# confirm in Munger.
munger.concatenate_train_validation()

In [11]:
# Features and targets of the combined dataset, as exposed by the munger.
X_full, y_full = munger.X_full, munger.y_full

In [None]:
## fit on whole dataset
# Refits `clf` in place: after this cell it is no longer the model that was
# fitted on the training split alone.
clf.fit(X_full, y_full)

In [None]:
# Single-model tournament predictions: column 1 of predict_proba for X_test.
# NOTE: the stacked-blending cell further down assigns to `predictions` as well.
predictions = clf.predict_proba(X_test)[:, 1]

## Stacked blending on the full dataset

In [12]:
# Blend the two base learners on the full dataset and predict the tournament rows.
# NOTE: rebinds `predictions`, replacing any single-model output from the cell above.
predictions = stacked_blending([linear_model, non_linear_model], X_full, y_full, X_test)

Creating train and test sets for blending.
0 Pipeline(steps=[('ft', FeatureTransformer()), ('select', SelectKBest(k=5, score_func=<function chi2 at 0x0000000016743588>)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=3.0, class_weight='auto', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])
Fold 0
Fold 1
Fold 2
1 Pipeline(steps=[('ft', FeatureTransformer()), ('select', SelectKBest(k=5, score_func=<function chi2 at 0x0000000016743588>)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.25...logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])
Fold 0
Fold 1
Fold 2

Blending.


In [13]:
# Sanity check: display the first ten predictions.
predictions[:10]

array([ 0.52042906,  0.5155925 ,  0.49481546,  0.48975507,  0.50296483,
        0.52562474,  0.5157315 ,  0.47154946,  0.47487507,  0.50999284])

In [14]:
# submission dataframe
# Read the example predictions file shipped with the dataset; it is handed to
# prepare_submission in the next cell.
submission_df = pd.read_csv('../data/numerai_datasets/numerai_example_predictions.csv')

In [15]:
# Hand the template frame, the current `predictions` and a file name to
# prepare_submission (presumably from ../src/utils.py; where the file is
# written is decided there). `predictions` is whichever cell assigned it last.
prepare_submission(submission_df, predictions, 'stacked_blending.csv')