In [23]:
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

X = np.genfromtxt("X_train.txt", delimiter=None)[:10000] 
Y = np.genfromtxt("Y_train.txt", delimiter=None)[:10000] 
Xtest = np.genfromtxt("X_test.txt",delimiter=None)
X,Y = ml.shuffleData(X,Y)
Xtr, Xte, Ytr, Yte = ml.splitData(X, Y, 0.75)

In [2]:
# Create a class with basic interface
class Ensemble:
    def __init__(self):
        self.learners = []
        self.phis = []
    
    def add(self, learner, phi=None):
        self.learners.append(learner)
        if phi == None:
            self.phis.append(lambda x : x)
        else:
            self.phis.append(phi)
    
    def predict(self, X):
        predictions = []
        for learner,phi in zip(self.learners, self.phis):
            predictions.append(learner.predict(phi(X))) # Predict using each learner once
        prediction = np.mean(predictions, axis=0) # Average the the predictions
        return prediction.ravel()

# Instantiate the class
ensemble = Ensemble()

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

K = [1,2,5,10,50,100,200,500,1000,1500,2000,5000,7500]
knnpredict = []
knnVMse = []
for k in K:
    knn = KNeighborsRegressor(n_neighbors = k)
    knn.fit(Xtr[:, 0:4], Ytr)
    knnpredict.append(knn.predict(Xte[:, 0:4]))
    
for p in knnpredict:
    knnVMse.append(mean_squared_error(Yte, p))
for i in range(len(knnVMse)):
    print("k =", K[i], " VMSE =", knnVMse[i])

bestK = K[knnVMse.index(min(knnVMse))]
print bestK


('k =', 1, ' VMSE =', 0.39800000000000002)
('k =', 2, ' VMSE =', 0.30209999999999998)
('k =', 5, ' VMSE =', 0.24729600000000002)
('k =', 10, ' VMSE =', 0.22753599999999996)
('k =', 50, ' VMSE =', 0.20863904000000003)
('k =', 100, ' VMSE =', 0.20903331999999999)
('k =', 200, ' VMSE =', 0.20965781000000003)
('k =', 500, ' VMSE =', 0.21018690560000006)
('k =', 1000, ' VMSE =', 0.21110304759999998)
('k =', 1500, ' VMSE =', 0.21226322186666666)
('k =', 2000, ' VMSE =', 0.21299201919999999)
('k =', 5000, ' VMSE =', 0.21961263390399999)
('k =', 7500, ' VMSE =', 0.22516288000000004)
50


In [32]:
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor

clfs = [KMeans(n_clusters=1000),
        KNeighborsRegressor(n_neighbors = 1000),
        GradientBoostingRegressor(n_estimators=1000)
       ]

X1,X2,Y1,Y2 = ml.splitData(X,Y,0.75)
temp_results_t = np.zeros((  len(Y1)        ,len(clfs) ))   #x1,x2
temp_results_v = np.zeros((  len(Y2)        ,len(clfs) ))   #y1,y2
test1        = np.zeros((  Xtest.shape[0] ,len(clfs) ))   
for i, clf in enumerate(clfs):
    clf.fit(X1,Y1)
    temp_results_v[:,i] = clf.predict(X2)
    temp_results_t[:,i] = clf.predict(X1)
    test1[:,i] = clf.predict(Xtest)

lr = LinearRegression(normalize=True)
#lr = LogisticRegression()
lr.fit(temp_results_v, Y2)

print "AUC on Train data: ",roc_auc_score(Y1, lr.predict(temp_results_t))
print "AUC on Valid data: ",roc_auc_score(Y2, lr.predict(temp_results_v))

AUC on Train data:  0.932311784512
AUC on Valid data:  0.681713159054


In [28]:
Ypred_lr = lr.predict(test1)
np.savetxt('Yhat_knn_lr.txt', np.vstack( (np.arange(len(Ypred_lr)) , Ypred_lr) ).T, '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');

In [13]:
n = Xtr.shape[1]
knnpredict = []
knnVMse = []
xs = [i for i in range(n)]
for i in range(n):
    knn = KNeighborsRegressor(n_neighbors = bestK)
    knn.fit(Xtr[:, i].reshape(Xtr.shape[0], 1), Ytr)
    p = knn.predict(Xte[:, i].reshape(Xte.shape[0], 1))
    knnpredict.append(p)
    knnVMse.append(mean_squared_error(Yte, p))
    
for i in range(n):
    print("x" + str(i), " VMSE =", knnVMse[i])
    
zipped = zip(xs, knnVMse)
featureRank = sorted(zipped, key = lambda x: x[1])
print(featureRank[0:5])

('x0', ' VMSE =', 0.21320479999999997)
('x1', ' VMSE =', 0.22272400000000003)
('x2', ' VMSE =', 0.22332879999999999)
('x3', ' VMSE =', 0.22399344000000002)
('x4', ' VMSE =', 0.22907887999999998)
('x5', ' VMSE =', 0.22382800000000003)
('x6', ' VMSE =', 0.23235296000000003)
('x7', ' VMSE =', 0.22498783999999999)
('x8', ' VMSE =', 0.22486319999999996)
('x9', ' VMSE =', 0.22528624)
('x10', ' VMSE =', 0.22863984000000001)
('x11', ' VMSE =', 0.22829807999999999)
('x12', ' VMSE =', 0.22322688000000002)
('x13', ' VMSE =', 0.22018703999999997)
[(0, 0.21320479999999997), (13, 0.22018703999999997), (1, 0.22272400000000003), (12, 0.22322688000000002), (2, 0.22332879999999999)]


In [6]:
features = []
for i in range(n):
    features.append(featureRank[i][0])

selectedf = tuple(features)

In [20]:
from sklearn.metrics import roc_auc_score

phi = lambda X: X[:, (0, 13, 1, 12, 2)]
knn = KNeighborsRegressor(n_neighbors = bestK)
knn.fit(Xtr[:, selectedf], Ytr)
p = knn.predict(Xte[:, selectedf])
ptr = knn.predict(Xtr[:, selectedf])
VMSE = mean_squared_error(Yte, p)
Tmse = mean_squared_error(Ytr, ptr)
print "AUC on Valid data: ",roc_auc_score(Yte, p)

ensemble.add(knn, phi)

 AUC on Valid data:  0.603843344248
