In [179]:
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
import seaborn as sns
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, ZeroPadding1D, Conv1D, MaxPooling1D, Flatten
from keras.optimizers import SGD, Adam, RMSprop


from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import VarianceThreshold
%matplotlib inline

## Data Import

In [183]:
X = np.genfromtxt('data/X_train.txt', delimiter=None)
Y = np.genfromtxt('data/Y_train.txt', delimiter=None)
X,Y = ml.shuffleData(X,Y)

Xdf = pd.DataFrame(X)
Ydf = pd.DataFrame(Y)

In [184]:
Xdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,230.0,226.0,241.12,232.53,3536.0,895.0,0.0,1.804,6.8566,2.0479,3.2205,1.9657,2.3371,28.8
1,252.0,227.0,237.44,231.62,1908.0,937.0,0.0,1.204,7.024,2.021,6.4398,6.1343,2.4893,0.0
2,238.5,226.0,241.12,232.53,8632.0,2227.0,0.0,2.1958,6.8124,2.0108,2.9833,2.1844,2.7493,0.0
3,238.0,234.5,244.43,234.85,450.0,5.0,0.0,6.1049,5.2759,0.2,5.26,2.8574,20.0,18.0
4,233.0,223.5,238.97,230.81,3614.0,1334.0,0.0,0.8704,7.5201,2.8912,2.8271,2.0687,1.7119,1.6


In [185]:
X_kBest = SelectKBest(f_classif, k=10).fit_transform(X,Y)

In [187]:
X_kBestdf = pd.DataFrame(X_kBest)
X_kBestdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,230.0,226.0,241.12,232.53,895.0,6.8566,2.0479,1.9657,2.3371,28.8
1,252.0,227.0,237.44,231.62,937.0,7.024,2.021,6.1343,2.4893,0.0
2,238.5,226.0,241.12,232.53,2227.0,6.8124,2.0108,2.1844,2.7493,0.0
3,238.0,234.5,244.43,234.85,5.0,5.2759,0.2,2.8574,20.0,18.0
4,233.0,223.5,238.97,230.81,1334.0,7.5201,2.8912,2.0687,1.7119,1.6


In [150]:
X_train, X_val = X[:30000,:], X[180000:,:]
Y_train, Y_val = Y[:30000], Y[180000:]
print(X_train.shape)
print(Y_train.shape)

(30000, 14)
(30000,)


## Keras - Neural Network

In [196]:
nnModel = Sequential()
nnModel.add(BatchNormalization(input_shape=(14,)))
nnModel.add(Dense(100, activation="sigmoid"))
nnModel.add(Dense(200, activation="sigmoid"))
nnModel.add(Dense(300, activation="sigmoid"))
nnModel.add(Dense(200, activation="relu"))
nnModel.add(Dense(100, activation="relu"))
nnModel.add(Dense(1, activation='sigmoid'))
adam = RMSprop(lr=1)
nnModel.compile(loss="binary_crossentropy", optimizer=adam, metrics=["accuracy"])

In [None]:
nnModel.optimizer.lr = 0.01
nnModel.fit(X_train, Y_train, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
 3808/30000 [==>...........................] - ETA: 7s - loss: 5.5364 - acc: 0.6565

In [27]:
model = Sequential()
model.add(Conv1D(128, 3, padding="same", activation='relu', input_shape=(14,1)))
model.add(BatchNormalization())
model.add(Conv1D(128, 3, padding="same", activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(2, padding="same", strides=2))

model.add(Conv1D(256, 3, padding="same", activation='relu'))
model.add(BatchNormalization())
model.add(Conv1D(256, 3, padding="same", activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(2, padding="same", strides=2))

model.add(Conv1D(256, 3, padding="same", activation='relu'))
model.add(BatchNormalization())
model.add(Conv1D(256, 3, padding="same", activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(2, strides=2))

model.add(Conv1D(128, 3, padding="same", activation='relu'))
model.add(BatchNormalization())
model.add(Conv1D(128, 3, padding="same", activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(2, padding="same", strides=2))

model.add(Flatten())
model.add(Dense(516, activation='relu'))
model.add(Dense(516, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [36]:
X_train_expanded = np.expand_dims(X_train, axis=2)
X_val_expanded = np.expand_dims(X_val, axis=2)

In [18]:
model.fit(X_train_expanded, Y_train, epochs=1, batch_size=64)

In [28]:
adam = Adam(lr=0.01)
model.compile(loss="binary_crossentropy", optimizer=adam, metrics=["accuracy"])
model.fit(X_train_expanded, Y_train, epochs=1, batch_size=32)

Epoch 1/1


<keras.callbacks.History at 0x1a223f5b50>

In [38]:
model.optimizer.lr = 0.001
model.fit(X_train_expanded, Y_train, epochs=1, validation_data=(X_val_expanded, Y_val), batch_size=16)

Train on 10000 samples, validate on 20000 samples
Epoch 1/1


<keras.callbacks.History at 0x1a23448890>

# Scikit-Learn Models

In [74]:
from sklearn.preprocessing import StandardScaler

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [272]:
print(X.shape)
lsvc = LinearSVC(C=0.5, penalty="l1", dual=False).fit(X, Y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
print(X_new.shape)

(200000, 14)
(200000, 13)


## Decision Tree

## Gradient Boosting

In [40]:
xBoostClf = Pipeline([('scaling', StandardScaler()),
  ('classification', GradientBoostingClassifier())
])


In [78]:
param_grid = [
  {'max_depth': [1, 5, 9, 13, 17, 21, 23], 'n_estimators': [3, 7, 11, 15, 19, 23]},
]
xBoostClf = GradientBoostingClassifier()
grid = GridSearchCV(xBoostClf, param_grid)

In [79]:
grid.fit(X_train, Y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 7, 11, 15, 19, 23], 'max_depth': [1, 5, 9, 13, 17, 21, 23]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [82]:
xBoostCLF = grid.best_estimator_

In [83]:
xBoostCLF.score(X_val, Y_val)

0.69084999999999996

## Random Forest

In [84]:
param_grid = [
  {'max_depth': [1, 5, 9, 13, 17, 21, 23], 'n_estimators': [3, 7, 11, 15, 19, 23]},
]
randomForestClf = GradientBoostingClassifier()
grid = GridSearchCV(randomForestClf, param_grid)

In [85]:
grid.fit(X_train, Y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 7, 11, 15, 19, 23], 'max_depth': [1, 5, 9, 13, 17, 21, 23]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [86]:
grid.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=19,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [277]:
depths = [1, 5, 9, 13, 17, 21, 23]
scores = []
for depth in depths:
    rfClf = RandomForestClassifier(n_estimators=10, max_depth=depth)
    xValScores = cross_val_score(rfClf, X_new, Y)
    scores.append(xValScores.mean())

In [278]:
print(scores)
print("best depth: %d" % depths[np.argmax(scores)])
best_depth = depths[np.argmax(scores)]

[0.6571050000328554, 0.6904250109595761, 0.7052950045602876, 0.7182350000859121, 0.7242300058112406, 0.7234000077112085]
best depth: 17


In [279]:
nEstimators = [3, 7, 11, 15, 19, 23]
n_est_scores = []
for n in nEstimators:
    rfClf = RandomForestClassifier(n_estimators=n, max_depth=best_depth)
    xValScores = cross_val_score(rfClf, X_new, Y)
    n_est_scores.append(xValScores.mean())

In [280]:
print(n_est_scores)
print("best n estimators: %d" % nEstimators[np.argmax(n_est_scores)])
best_n_est = nEstimators[np.argmax(n_est_scores)]

[0.6571050000328554, 0.6904250109595761, 0.7052950045602876, 0.7182350000859121, 0.7242300058112406, 0.7234000077112085]
best n estimators: 19


In [281]:
rfClf = RandomForestClassifier(n_estimators=best_n_est, max_depth=best_depth)
cross_val_score(rfClf, X_new, Y).mean()

In [285]:
min_samples_sp = [2, 7, 12, 17, 22, 27]
min_samples_scores = []
for m in min_samples_sp:
    rfClf = RandomForestClassifier(n_estimators=n, max_depth=best_depth, min_samples_split=m)
    xValScores = cross_val_score(rfClf, X_new, Y)
    min_samples_scores.append(xValScores.mean())

In [287]:
print(min_samples_scores)
print("best min_samples: %d" % min_samples_sp[np.argmax(min_samples_scores)])
best_min_samples = min_samples_sp[np.argmax(min_samples_scores)]

[0.7301700072365448, 0.728255001061418, 0.73007000331152, 0.7290600059614828, 0.7277900076614278, 0.7275100031363912]
best min_samples: 2


In [342]:
rlClf = Pipeline([('scaling', StandardScaler()),
  ('classification', RandomForestClassifier(n_estimators=best_n_est, max_depth=best_depth, min_samples_split=best_min_samples))
])

## Ada Boosting

In [334]:
adaClf = Pipeline([('scaling', StandardScaler()),
  ('classification', AdaBoostClassifier())
])

In [329]:
cross_val_score(clf, X, Y)

array([0.68743156, 0.69256154, 0.6897669 ])

## Combinations

In [46]:
xBoostClf = Pipeline([('scaling', StandardScaler()),
  ('classification', GradientBoostingClassifier())
])

rfClf = Pipeline([('scaling', StandardScaler()),
  ('classification', RandomForestClassifier())
])

adaClf = Pipeline([('scaling', StandardScaler()),
  ('classification', AdaBoostClassifier())
])

In [47]:
adaClf.fit(X_train, Y_train);

Pipeline(memory=None,
     steps=[('scaling', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classification', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))])

In [48]:
adaClf.score(X_val, Y_val)

0.68430000000000002

In [49]:
rfClf.fit(X_train, Y_train);

Pipeline(memory=None,
     steps=[('scaling', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classification', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [50]:
rfClf.score(X_val, Y_val)

0.67815000000000003

In [51]:
xBoostClf.fit(X_train, Y_train);

Pipeline(memory=None,
     steps=[('scaling', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classification', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impu...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))])

In [52]:
xBoostClf.score(X_val, Y_val);

0.69494999999999996

In [89]:
rf = RandomForestClassifier(n_estimators=19, max_depth=5).fit(X_train, Y_train)
ada = AdaBoostClassifier(n_estimators=19).fit(X_train, Y_train)
xb = GradientBoostingClassifier(n_estimators=19, max_depth=5).fit(X_train, Y_train)
models = [rf, ada, xb]
predictions = []
for x in X_val:
    preds = [0,0]
    for model in models:
        pred = model.predict(x.reshape(1,14)).astype(int)
        preds[pred[0]] = preds[pred[0]] + 1
    predictions.append(np.argmax(preds))

In [90]:
predictions = np.array(predictions)

In [91]:
acc = (predictions == Y_val).sum().astype(float) / Y_val.shape[0]
print(acc)

0.68855


In [69]:
predictions[10:20]

array([3, 3, 3, 3, 3, 3, 3, 3, 2, 3])