In [1]:
# Import all required libraries
from __future__ import division # For python 2.*

import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

from sklearn import *
from sklearn.metrics import roc_auc_score 

np.random.seed(0)
%matplotlib inline

In [2]:
# Data Loading
X = np.genfromtxt('data/X_train.txt', delimiter=None)
Y = np.genfromtxt('data/Y_train.txt', delimiter=None)

# The test data
Xte = np.genfromtxt('data/X_test.txt', delimiter=None)

Xtr, Xva, Ytr, Yva = ml.splitData(X, Y)
Xtr, Ytr = ml.shuffleData(Xtr, Ytr)

# Taking a subsample of the data so that trains faster.  You should train on whole data for homework and Kaggle.
Xt, Yt = Xtr[:4000], Ytr[:4000]

In [28]:
def predictValidation(classifier) -> (float, list):
    classifier.fit(Xt, Yt)
    YvaHat = classifier.predict_proba(Xva)[:,1]
    auc_score = roc_auc_score(Yva, YvaHat)
    return (auc_score, YvaHat)

In [29]:
def predictTest(classifier) -> list:
    classifier.fit(Xtr, Ytr)
    YteHat = classifier.predict(Xte)
    return YteHat

## Logistic Regression using Scalar Preprocessing

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

lreg = LogisticRegression(random_state=0, solver='sag', multi_class='multinomial', max_iter=1000)
print(predictValidation(lreg)[0])

0.6324625797116111




## KNN

In [6]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=20, weights="distance")
print(predictValidation(knn)[0])

0.5881487130765016


## Random Forests

In [60]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=1000, min_samples_leaf=5, n_jobs=-1, oob_score=True)
print(predictValidation(rfc)[0])

0.6694208535543542


## Test Weighing and Blending on Validation Data

In [61]:
classifiers = {"lreg": lreg, "knn": knn, "rfc": rfc}
weights = {"lreg": 6, "knn": 1, "rfc": 9}

def runValidation(weights):
    t = []
    for i in range(np.shape(Yva)[0]):
        t.append([])

    for name, classifier in classifiers.items():
        weight = weights[name]
        res = predictValidation(classifier)[1]

        for i, val in enumerate(res):
            for x in range(weight):
                t[i].append(val)

    YvaHat = []
    for vals in t:
        YvaHat.append(np.mean(vals))

    auc_score = roc_auc_score(Yva, YvaHat)
    print(weights, "->", auc_score)

runValidation(weights)



{'lreg': 6, 'knn': 1, 'rfc': 9} -> 0.6741458338314154


## Final Run on Test Data and Submission

In [9]:
import time
# a helper function to generate a file
def to_kaggle(YteHat):
    current_time_str = time.strftime("%H-%M-%S_%a_%b_%d", time.localtime())
    file_name = "results/{}.txt".format(current_time_str)
    np.savetxt(file_name,
           np.vstack( (np.arange(len(prediction)) , prediction) ).T,
           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');

In [62]:
def runFinalAndSaveToKaggle():
    t = []
    for i in range(np.shape(Xte)[0]):
        t.append([])

    for name, classifier in classifiers.items():
        weight = weights[name]
        res = predictTest(classifier)

        for i, val in enumerate(res):
            for x in range(weight):
                t[i].append(val)

    YteHat = []
    for vals in t:
        YteHat.append(np.mean(vals))

    to_kaggle(YteHat)

