In [152]:
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
import seaborn as sns
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, ZeroPadding1D, Conv1D, MaxPooling1D, Flatten
from keras.optimizers import SGD, Adam, RMSprop

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import VarianceThreshold

from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.svm import SVC

from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

%matplotlib inline

## Data Import

In [2]:
# Read the training features and labels from their text files (features first).
X, Y = (
    np.genfromtxt(path, delimiter=None)
    for path in ('data/X_train.txt', 'data/Y_train.txt')
)
# Shuffling is currently disabled:
# X,Y = ml.shuffleData(X,Y)

# DataFrame wrappers around the same arrays, used for tabular display.
Xdf, Ydf = pd.DataFrame(X), pd.DataFrame(Y)

In [3]:
# Preview the first rows of the feature table.
Xdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,242.0,227.0,240.92,232.44,1195.0,253.0,0.0,1.2671,6.4128,1.9869,3.9756,2.3392,6.3537,0.0
1,249.0,230.0,242.31,233.68,1579.0,243.0,0.0,9.6831,6.0824,1.1964,3.4577,2.0416,7.6746,0.0
2,223.0,195.0,227.64,204.42,1034.0,603.0,318.0,1.5286,17.869,13.23,5.712,4.7216,6.603,0.0
3,234.0,221.0,236.27,229.73,7716.0,3907.0,0.0,0.60465,8.0497,3.4476,2.4845,1.5741,1.4205,0.0
4,234.0,233.0,245.51,234.1,545.0,21.0,0.0,6.7473,5.2649,0.86766,5.9626,4.245,3.1429,24.2


In [6]:
# Rows before index 180000 are used for training; everything after is held out
# for validation. No shuffling is applied, so the split follows file order.
X_train, X_val = np.split(X, [180000])
Y_train, Y_val = np.split(Y, [180000])
print(X_train.shape)
print(Y_train.shape)

(180000, 14)
(180000,)


### Try Reducing Dimension

In [54]:
# NOTE(review): X has 14 columns (see the (180000, 14) training shape printed earlier),
# so k=14 keeps every feature and performs no actual reduction. The selector is also
# fit on all rows of X, including the ones later used for validation.
X_kBest = SelectKBest(f_classif, k=14).fit_transform(X,Y)

In [55]:
# Apply the same 180000-row cut to the selected-feature matrix and its labels.
X_kBest_train = X_kBest[:180000]
X_kBest_val = X_kBest[180000:]
Y_kBest_train = Y[:180000]
Y_kBest_val = Y[180000:]

# Tabular preview of the selected features.
X_kBestdf = pd.DataFrame(data=X_kBest)
X_kBestdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,242.0,227.0,240.92,232.44,1195.0,253.0,0.0,1.2671,6.4128,1.9869,3.9756,2.3392,6.3537,0.0
1,249.0,230.0,242.31,233.68,1579.0,243.0,0.0,9.6831,6.0824,1.1964,3.4577,2.0416,7.6746,0.0
2,223.0,195.0,227.64,204.42,1034.0,603.0,318.0,1.5286,17.869,13.23,5.712,4.7216,6.603,0.0
3,234.0,221.0,236.27,229.73,7716.0,3907.0,0.0,0.60465,8.0497,3.4476,2.4845,1.5741,1.4205,0.0
4,234.0,233.0,245.51,234.1,545.0,21.0,0.0,6.7473,5.2649,0.86766,5.9626,4.245,3.1429,24.2


### Try Increasing Dimension

In [56]:
# Degree-2 polynomial expansion of the selected features; print the resulting shape.
X_poly = PolynomialFeatures(degree=2).fit_transform(X_kBest)
print(X_poly.shape)

(200000, 120)


In [57]:
# Train/validation split of the degree-2 features at row 180000, with matching labels.
X_poly_train, X_poly_val = X_poly[:180000], X_poly[180000:]
Y_poly_train, Y_poly_val = Y[:180000], Y[180000:]

In [11]:
# Degree-3 polynomial expansion of the selected features, split at row 180000.
# NOTE(review): the recorded (200000, 286) shape corresponds to a 10-column input;
# with the 14 columns that k=14 produces, a degree-3 expansion has 680 columns.
# The saved output therefore looks stale -- confirm before relying on 286 downstream.
X_poly_3rd = PolynomialFeatures(degree=3).fit_transform(X_kBest)
print(X_poly_3rd.shape)
X_poly_3rd_train, X_poly_3rd_val = X_poly_3rd[:180000], X_poly_3rd[180000:]
Y_poly_3rd_train, Y_poly_3rd_val = Y[:180000], Y[180000:]

(200000, 286)


## Keras - Neural Networks

In [114]:
# Fully connected binary classifier: batch normalisation on the 14 inputs,
# eighteen 32-unit ReLU layers, then a single sigmoid output unit.
nnModel = Sequential(
    [BatchNormalization(input_shape=(14,))]
    + list(Dense(32, activation="relu") for _ in range(18))
    + [Dense(1, activation='sigmoid')]
)

# Adam with a learning rate of 0.1, binary cross-entropy loss, accuracy metric.
adam = Adam(lr=.1)
nnModel.compile(loss="binary_crossentropy", optimizer=adam, metrics=["accuracy"])

In [115]:
# NOTE(review): this overwrites the optimizer's `lr` attribute with a plain float. The
# optimizer was already constructed with lr=.1, so the line looks redundant -- confirm intent.
nnModel.optimizer.lr = .1
# One epoch over the training split in mini-batches of 16, validating on the held-out split.
nnModel.fit(X_train, Y_train, epochs=1, batch_size=16, validation_data=(X_val, Y_val))

Train on 180000 samples, validate on 20000 samples
Epoch 1/1
 38064/180000 [=====>........................] - ETA: 2:12 - loss: 5.5343 - acc: 0.6564

KeyboardInterrupt: 

### Conv Net

In [283]:
def _build_conv_net(n_features):
    """Build the (uncompiled) 1-D convolutional classifier.

    The network takes inputs of shape ``(n_features, 1)`` and consists of seven
    stages, each ``[Conv1D, BatchNormalization] x 2`` followed by a stride-2
    max-pool, then a 516-unit ReLU layer and a single sigmoid output unit.

    Parameters
    ----------
    n_features : int
        Length of each input sequence (number of feature columns).

    Returns
    -------
    keras.models.Sequential
        The assembled, not yet compiled, model.
    """
    # (number of filters, padding mode of the stage's max-pool), in network order.
    stages = [
        (256, "same"),
        (256, "valid"),
        (512, "same"),
        (512, "same"),
        (256, "valid"),
        (256, "valid"),
        (128, "same"),
    ]

    net = Sequential()
    for stage_idx, (n_filters, pool_padding) in enumerate(stages):
        # Only the very first layer of the network declares the input shape.
        extra = {"input_shape": (n_features, 1)} if stage_idx == 0 else {}
        net.add(Conv1D(n_filters, 3, padding="same", activation='relu', **extra))
        net.add(BatchNormalization())
        net.add(Conv1D(n_filters, 3, padding="same", activation='relu'))
        net.add(BatchNormalization())
        net.add(MaxPooling1D(2, padding=pool_padding, strides=2))

    net.add(Flatten())
    net.add(Dense(516, activation='relu'))
    net.add(Dense(1, activation='sigmoid'))
    return net


# Size the input from the degree-3 feature matrix instead of hard-coding 286, so the
# network always matches the width the polynomial expansion actually produced.
model = _build_conv_net(X_poly_3rd_train.shape[1])

In [284]:
# Append a trailing length-1 axis to the 2-D degree-3 feature matrices, giving
# (samples, features, 1) arrays for the convolutional model.
X_train_expanded = X_poly_3rd_train[:, :, np.newaxis]
X_val_expanded = X_poly_3rd_val[:, :, np.newaxis]

In [287]:
# Compile with a dedicated Adam instance (same lr=.1) rather than the `adam` object that
# is already attached to nnModel: the two models then do not share optimizer state, and
# this cell no longer depends on the dense-network cell having been run first.
model.compile(loss="binary_crossentropy", optimizer=Adam(lr=.1), metrics=["accuracy"])

## Gradient Boosting

In [23]:
# Cross-validated search over the number of boosting stages only; every other
# GradientBoostingClassifier setting stays at its default.
param_grid = [dict(n_estimators=[205, 250, 300, 350, 400])]
xbGrid = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=param_grid)

In [24]:
# Run the grid search on the training split; the trailing ';' suppresses the cell output.
xbGrid.fit(X_train, Y_train);

In [25]:
# Show the configuration of the estimator the search selected.
xbGrid.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=400,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [26]:
# GridSearchCV was built with its default refit=True, so best_estimator_ has already
# been retrained on the full training split by the search itself. Fitting it a second
# time only repeats that expensive work, so score the existing model directly.
xbGrid.best_estimator_.score(X_val, Y_val)

0.7167

In [147]:
# Gradient boosting with 600 stages; report accuracy on the validation split.
xb600 = GradientBoostingClassifier(n_estimators=600).fit(X_train, Y_train)
print(xb600.score(X_val, Y_val))

0.7197


In [148]:
# Training-split accuracy of the 600-stage model, for comparison with its validation accuracy.
xb600.score(X_train, Y_train)

0.72189999999999999

In [124]:
# Gradient boosting with 1200 stages; report accuracy on the validation split.
xb = GradientBoostingClassifier(n_estimators=1200).fit(X_train, Y_train)
print(xb.score(X_val, Y_val))

0.7285


In [142]:
# Training-split accuracy of `xb`, for comparison with its validation accuracy.
xb.score(X_train, Y_train)

0.73397777777777773

In [143]:
# Display the hyperparameters of the fitted `xb` model.
xb

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1200,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [157]:
# Boosting variant with a smaller learning rate, deeper trees capped at 50 leaves,
# and a fixed random seed; fitted on the training split.
zBoost = GradientBoostingClassifier(
    n_estimators=600,
    learning_rate=.02,
    max_depth=6,
    max_leaf_nodes=50,
    random_state=0
).fit(X_train, Y_train)


In [168]:
# Validation-split accuracy of the tuned boosting model.
zBoost.score(X_val, Y_val)

0.72860000000000003

In [164]:
# Scratch 2x2 array for experimenting with NumPy indexing; not used by any model cell shown here.
a = np.array([[1,2],[3,4]])

In [165]:
# Display the scratch array.
a

array([[1, 2],
       [3, 4]])

In [166]:
# First row of the scratch array via single-index access.
a[0]

array([1, 2])

In [167]:
# Same first row via explicit row/column indexing; the recorded output matches a[0].
a[0,:]

array([1, 2])

## Random Forest

In [179]:
# Leftover interactive help lookup (IPython `?` syntax); it has no effect on the analysis.
RandomForestClassifier?

In [180]:
# Small grid over tree depth and features-per-split for a default random forest.
# Note that this rebinds `param_grid`, replacing the gradient-boosting grid.
param_grid = [dict(max_depth=[2, 10], max_features=[2, 14])]
randomForestClf = RandomForestClassifier()
rfGrid = GridSearchCV(estimator=randomForestClf, param_grid=param_grid)

In [182]:
# Run the random-forest grid search on the training split (the fitted search object is echoed).
rfGrid.fit(X_train, Y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_features': [2, 14], 'max_depth': [2, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [184]:
# Show the configuration of the forest the search selected.
rfGrid.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=14, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [29]:
# Random forest with library-default settings; report validation accuracy.
rf = RandomForestClassifier().fit(X_train, Y_train)
print(rf.score(X_val, Y_val))

0.7266


In [30]:
# Random forest with 600 trees (rebinds `rf`); report validation accuracy.
rf = RandomForestClassifier(n_estimators=600).fit(X_train, Y_train)
print(rf.score(X_val, Y_val))

0.735


In [118]:
# Random forest with 1200 trees (rebinds `rf` again); report validation accuracy.
rf = RandomForestClassifier(n_estimators=1200).fit(X_train, Y_train)
print(rf.score(X_val, Y_val))

0.7354


In [146]:
# Training-split accuracy of the current `rf`. The recorded value is far above the
# validation accuracies printed for the forests, which suggests the forest overfits.
rf.score(X_train, Y_train)

0.94516111111111112

## Ada Boosting

In [33]:
# AdaBoost with library-default settings; report validation accuracy.
ada = AdaBoostClassifier().fit(X_train, Y_train)
print(ada.score(X_val, Y_val))

0.7004


## Logistic Regression

In [150]:
# Logistic regression with library-default settings on the raw features;
# the cell displays its validation accuracy.
logReg = LogisticRegression().fit(X_train, Y_train)
logReg.score(X_val, Y_val)

0.68884999999999996

In [151]:
# Training-split accuracy of the logistic regression, for comparison with validation.
logReg.score(X_train, Y_train)

0.68593888888888888

## Naive Bayes

In [288]:
# Gaussian naive Bayes on the raw features; the cell displays its validation accuracy.
nb = GaussianNB().fit(X_train, Y_train)
nb.score(X_val, Y_val)

0.61285000000000001

## Combinations

In [31]:
def preds_from_ensemble(models, X):
    """Combine several fitted classifiers by majority vote.

    Each model's positive-class probability is collected for every row of ``X``.
    A row is labelled 1 when strictly more than half of the models report a
    probability >= 0.5, and 0 otherwise (so ties go to class 0). The soft score
    of a row is the mean probability over the models on the winning side only.

    Parameters
    ----------
    models : sequence
        Fitted objects exposing ``predict_proba(X)``. The output may have one
        column (taken as the positive-class probability) or several (column 1
        is taken as the positive-class probability).
    X : array-like
        Feature rows, passed unchanged to every model's ``predict_proba``.

    Returns
    -------
    avgs : numpy.ndarray, shape (n_samples,)
        Mean probability of the winning side for each row.
    hard_preds : numpy.ndarray of int, shape (n_samples,)
        Majority-vote label (0 or 1) for each row.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    n = len(models)
    if n == 0:
        raise ValueError("preds_from_ensemble needs at least one model")

    model_pred_probs = []
    for model in models:
        proba = np.asarray(model.predict_proba(X))
        if proba.ndim > 1 and proba.shape[1] > 1:
            # Multi-column output: column 1 is the positive class.
            model_pred_probs.append(proba[:, 1])
        else:
            # Single-column (or already flat) output: use it as is.
            model_pred_probs.append(proba.reshape(proba.shape[0],))

    # One row per sample, one column per model, sorted ascending within each row so
    # that the models voting for class 1 (prob >= 0.5) always occupy the trailing columns.
    pred_probs = np.sort(np.array(model_pred_probs).T, axis=1)

    vote1 = (pred_probs >= 0.5).sum(axis=1)
    vote_thresh = n / 2.0

    avgs = np.empty(len(pred_probs))
    hard_preds = np.zeros(len(pred_probs), dtype=int)
    for i, (row, n_pos) in enumerate(zip(pred_probs, vote1)):
        if n_pos > vote_thresh:
            # Average over ALL models that voted 1. The previous slice ended at -1,
            # which silently dropped the highest probability (and produced NaN when
            # only a single model had voted 1).
            avgs[i] = row[n - n_pos:].mean()
            hard_preds[i] = 1
        else:
            # Average over the models that voted 0 (the leading columns).
            avgs[i] = row[:n - n_pos].mean()
    return avgs, hard_preds

In [175]:
def hard_accuracy(preds, Y):
    """Return the fraction of predictions that equal the corresponding label.

    ``preds`` may be any array-like of predicted labels; ``Y`` must expose
    ``.shape`` (e.g. a numpy array) and its first dimension is the denominator.
    """
    n_correct = (np.array(preds) == Y).sum()
    return n_correct / float(Y.shape[0])

In [119]:
# First ensemble: the random forest, gradient boosting and AdaBoost models from earlier cells.
models = [rf, xb, ada]

In [120]:
# Majority-vote the three models on the validation split: `soft` receives the averaged
# probabilities, `hard` the 0/1 labels (the function's return order).
soft, hard = preds_from_ensemble(models, X_val)

In [121]:
# Validation accuracy of the three-model majority vote.
hard_accuracy(hard, Y_val)

0.72189999999999999

In [125]:
# Second ensemble: random forest plus gradient boosting only (AdaBoost dropped).
ensemble2 = [rf, xb]

In [126]:
# Vote with the two-model ensemble and display its validation accuracy. With two models
# a 1-1 split is not a strict majority, so preds_from_ensemble labels such rows 0.
soft2, hard2 = preds_from_ensemble(ensemble2, X_val)
hard_accuracy(hard2, Y_val)

0.73124999999999996

In [170]:
# NOTE(review): this soft-voting classifier is constructed but never fitted or used
# in the cells shown here.
vclf = VotingClassifier(estimators=[("rf", rf), ("zBoost", zBoost)], voting="soft")

In [176]:
hard, soft = preds_from_ensemble([rf, zBoost], X_val)

In [178]:
hard_accuracy(soft, Y_val)

0.73014999999999997

## Submission

In [127]:
# Load the test features, read with the same genfromtxt settings as the training data.
Xte = np.genfromtxt('data/X_test.txt', delimiter=None)

In [134]:
# Keep only the averaged probabilities for the test rows; the hard labels are discarded.
# NOTE(review): `ensemble3` is not defined in any cell shown here, and the recorded run of
# this cell was interrupted -- define the intended model list before running it.
prob_1, _ = preds_from_ensemble(ensemble3, Xte)

KeyboardInterrupt: 

In [138]:
# preds_from_ensemble already returns a 1-D vector of probabilities, on which [:, 1]
# would raise IndexError. Only slice out the positive-class column when prob_1 is still
# a 2-D predict_proba-style matrix; the guard also makes this cell safe to re-run.
if prob_1.ndim > 1:
    prob_1 = prob_1[:, 1]

In [139]:
# Spot-check five of the predicted probabilities.
prob_1[10:15]

array([ 0.12351356,  0.05458333,  0.72395833,  0.4825    ,  0.16815212])

In [140]:
# Two-column submission table: the row index as ID, the predicted probability as Prob1.
Yte = np.column_stack((np.arange(prob_1.shape[0]), prob_1[:]))

# Write the table with an 'ID,Prob1' header line and no comment prefix.
# NOTE(review): the output file name contains spaces, and probabilities are written
# with two decimals -- confirm both are intended.
np.savetxt(
    'Y _ submit.txt',
    Yte,
    fmt='%d, %.2f',
    header='ID,Prob1',
    comments='',
    delimiter=','
)

In [65]:
# Leftover interactive help lookup (IPython `?` syntax); it has no effect on the analysis.
np.vstack?