# Overview
This notebook generates the first layer of meta features.

In [8]:
import numpy as np
from sklearn.model_selection import KFold, train_test_split

In [3]:
# Load the training and test matrices from text files. `skiprows=1` drops the
# first line of each file (presumably a header row -- confirm against the data).
training_data = np.loadtxt('../data/training_data.txt', skiprows=1)
X_test = np.loadtxt('../data/test_data.txt', skiprows=1)

To generate the meta features, __always__ use 5-fold CV with random seed 214. Predict the training samples in each fold using a model trained on the other 4 folds. For the test data, use either a model trained on all of the training data or the best CV classifier, whichever makes more sense for the algorithm (are we just hyper-parameter tuning? is early stopping required?).

In [13]:
# Column 0 of training_data is used as the label; all remaining columns are features.
X_train = training_data[:,1:]
y_train = training_data[:,0]
# Set aside 10% of the labelled rows as a holdout set. X_train / y_train are
# rebound to the remaining 90%, which is what every model function below uses.
X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.1, random_state=100)
# Shared shuffled 5-fold splitter (seed 214) used for all out-of-fold predictions.
kf = KFold(n_splits=5, shuffle=True, random_state=214)

# K-Nearest Neighbors

In [16]:
from sklearn.neighbors import KNeighborsClassifier
def KNN(k, distance='braycurtis'):
    """Train a KNN model with given k and distance metric, and return predictions.

    Training predictions are out-of-fold: each fold of `kf` is predicted by a
    model fit on the other folds. Test predictions come from a model fit on
    all of X_train.

    Parameters
    ----------
    k : int
        Number of neighbors.
    distance : str
        One of 'manhattan', 'euclidean', or 'braycurtis'.

    Returns
    -------
    (train_predictions, test_predictions)
        Two numpy arrays holding the second column of `predict_proba`; the
        first has the shape of y_train, the second has one entry per row of
        X_test.

    Raises
    ------
    ValueError
        If `distance` is not one of the supported names.
    """
    # Keyword arguments for KNeighborsClassifier per supported distance.
    # With the default Minkowski metric, p=1 is Manhattan and p=2 is Euclidean.
    metric_kwargs = {
        'manhattan': {'p': 1},
        'euclidean': {'p': 2},
        'braycurtis': {'metric': 'braycurtis'},
    }
    if distance not in metric_kwargs:
        # Fail early with a clear message instead of hitting an unbound `knn` later.
        raise ValueError("distance must be one of %s, got %r"
                         % (sorted(metric_kwargs), distance))

    knn = KNeighborsClassifier(n_neighbors=k, **metric_kwargs[distance])
    train_predictions = np.empty(y_train.shape)

    for split, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
        print('KNN k=%d, split=%d' % (k, split))
        knn.fit(X_train[train_index], y_train[train_index])
        train_predictions[test_index] = knn.predict_proba(X_train[test_index])[:, 1]

    # Report the metric actually in use rather than a hard-coded name.
    print('KNN k=%d, test %s' % (k, distance))
    knn.fit(X_train, y_train)
    test_predictions = knn.predict_proba(X_test)[:, 1]

    return train_predictions, test_predictions

In [18]:
# k=128 with KNN's default distance, 'braycurtis' (the "bc" in the file names).
train_results, test_results = KNN(128)
# Write the training-set and test-set predictions as plain text, one value per line.
np.savetxt('../inferences/knn_128_bc_train.txt', train_results, fmt='%.6g')
np.savetxt('../inferences/knn_128_bc_test.txt', test_results, fmt='%.6g')

KNN k=128, split=0
KNN k=128, split=1
KNN k=128, split=2
KNN k=128, split=3
KNN k=128, split=4
KNN k=128, test manhattan


# Adaboost
Parameter tuning CV results are [here](https://github.com/veniversum/cs155-projects/wiki/AdaBoost).

In [4]:
from sklearn.ensemble import AdaBoostClassifier
def AdaBoost(n_estimators=250, learning_rate=1):
    """Generate AdaBoost meta features.

    Each fold of `kf` is predicted by a fresh classifier fit on the other
    folds; the test set is predicted by a classifier fit on all of X_train.

    Returns a pair of arrays with the second column of `predict_proba`:
    out-of-fold values shaped like y_train, and values for X_test.
    """
    def fresh_model():
        # A new, unfitted classifier with the requested hyper-parameters.
        return AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)

    oof_probs = np.empty(y_train.shape)

    for fold_no, (fit_idx, pred_idx) in enumerate(kf.split(X_train, y_train), start=1):
        print('Adaboost, split=%d' % (fold_no))
        fold_model = fresh_model()
        fold_model.fit(X_train[fit_idx], y_train[fit_idx])
        oof_probs[pred_idx] = fold_model.predict_proba(X_train[pred_idx])[:, 1]

    print('Adaboost, test')
    final_model = fresh_model()
    final_model.fit(X_train, y_train)
    test_probs = final_model.predict_proba(X_test)[:, 1]

    return oof_probs, test_probs

In [None]:
# AdaBoost meta features with 373 estimators; the results are saved in the next cell.
train_results1, test_results1 = AdaBoost(n_estimators=373, learning_rate=1)

In [10]:
# Persist the 373-estimator AdaBoost predictions.
np.savetxt('../inferences/adaboost_n373_lr1_train.txt' ,train_results1, fmt='%.6g')
np.savetxt('../inferences/adaboost_n373_lr1_test.txt' ,test_results1, fmt='%.6g')
# Accuracy on the training predictions: round each probability to the nearest
# integer and take 1 - mean absolute difference from y_train
# (assumes y_train holds 0/1 labels -- confirm).
print('Accuracy: %f' % (1 - np.sum(np.abs((np.round(train_results1)) - y_train)) / y_train.shape[0]))

Accuracy: 0.835600


In [None]:
# Same as the 373-estimator run above but with 374 estimators; saved in the next cell.
train_results2, test_results2 = AdaBoost(n_estimators=374, learning_rate=1)

In [9]:
# Persist the 374-estimator AdaBoost predictions.
np.savetxt('../inferences/adaboost_n374_lr1_train.txt' ,train_results2, fmt='%.6g')
np.savetxt('../inferences/adaboost_n374_lr1_test.txt' ,test_results2, fmt='%.6g')
# Accuracy on the training predictions: round each probability to the nearest
# integer and take 1 - mean absolute difference from y_train
# (assumes y_train holds 0/1 labels -- confirm).
print('Accuracy: %f' % (1 - np.sum(np.abs((np.round(train_results2)) - y_train)) / y_train.shape[0]))

Accuracy: 0.835600


# Random Forests
Parameter tuning CV results are [here](https://github.com/veniversum/cs155-projects/wiki/RandomForestClassifier-RandomizedSearchCV).

In [11]:
from sklearn.ensemble import RandomForestClassifier
def RandomForest(n_estimators=250, max_depth=144, min_samples_split=80):
    """Generate random-forest meta features.

    Each fold of `kf` is predicted by a fresh forest fit on the other folds;
    the test set is predicted by a forest fit on all of X_train.

    Returns a pair of arrays of `predict` outputs (class labels, not
    probabilities): out-of-fold values shaped like y_train, and values for
    X_test.
    """
    # Hyper-parameters shared by the per-fold forests and the final forest.
    forest_kwargs = dict(
        n_estimators=n_estimators,
        max_features='sqrt',
        criterion='gini',
        max_depth=max_depth,
        min_samples_split=min_samples_split,
    )

    oof_labels = np.empty(y_train.shape)

    for fold_no, (fit_idx, pred_idx) in enumerate(kf.split(X_train, y_train), start=1):
        print('RandomForest, split=%d' % (fold_no))
        fold_forest = RandomForestClassifier(**forest_kwargs)
        fold_forest.fit(X_train[fit_idx], y_train[fit_idx])
        oof_labels[pred_idx] = fold_forest.predict(X_train[pred_idx])

    print('RandomForest, test')
    full_forest = RandomForestClassifier(**forest_kwargs)
    full_forest.fit(X_train, y_train)
    test_labels = full_forest.predict(X_test)

    return oof_labels, test_labels

In [12]:
# NOTE: rebinds train_results / test_results, replacing the values from earlier sections.
train_results, test_results = RandomForest(n_estimators=250, max_depth=144, min_samples_split=80)
np.savetxt('../inferences/rf_md144_mss80_train.txt' ,train_results, fmt='%.6g')
np.savetxt('../inferences/rf_md144_mss80_test.txt' ,test_results, fmt='%.6g')
# RandomForest returns `predict` outputs (class labels), so no rounding is applied
# before comparing with y_train (assumes 0/1 labels -- confirm).
print('Accuracy: %f' % (1 - np.sum(np.abs((train_results) - y_train)) / y_train.shape[0]))

RandomForest, split=1
RandomForest, split=2
RandomForest, split=3
RandomForest, split=4
RandomForest, split=5
RandomForest, test
Accuracy: 0.829200


# Extra Trees

In [142]:
from sklearn.ensemble import ExtraTreesClassifier
def ExtraTrees(n_estimators=250):
    """Generate extra-trees meta features.

    Each fold of `kf` is predicted by a fresh ExtraTreesClassifier fit on the
    other folds; the test set is predicted by one fit on all of X_train.

    Returns a pair of arrays of `predict` outputs (class labels, not
    probabilities): out-of-fold values shaped like y_train, and values for
    X_test.
    """
    # Factory for an unfitted classifier with the fixed settings used throughout.
    new_ensemble = lambda: ExtraTreesClassifier(
        n_estimators=n_estimators, max_features='sqrt', criterion='gini', max_depth=None)

    oof_labels = np.empty(y_train.shape)

    fold_no = 0
    for fit_idx, pred_idx in kf.split(X_train, y_train):
        fold_no = fold_no + 1
        print('ExtraTreesClassifier, split=%d' % (fold_no))
        fold_ensemble = new_ensemble()
        fold_ensemble.fit(X_train[fit_idx], y_train[fit_idx])
        oof_labels[pred_idx] = fold_ensemble.predict(X_train[pred_idx])

    print('ExtraTreesClassifier, test')
    full_ensemble = new_ensemble()
    full_ensemble.fit(X_train, y_train)

    return oof_labels, full_ensemble.predict(X_test)

In [144]:
# 200 trees (overrides the function default of 250). Rebinds train_results / test_results.
train_results, test_results = ExtraTrees(200)
np.savetxt('../inferences/et_train.txt' ,train_results, fmt='%.6g')
np.savetxt('../inferences/et_test.txt' ,test_results, fmt='%.6g')
# ExtraTrees returns `predict` outputs (class labels), so they are compared with
# y_train directly (assumes 0/1 labels -- confirm).
print('Accuracy: %f' % (1 - np.sum(np.abs((train_results) - y_train)) / y_train.shape[0]))

ExtraTreesClassifier, split=1
ExtraTreesClassifier, split=2
ExtraTreesClassifier, split=3
ExtraTreesClassifier, split=4
ExtraTreesClassifier, split=5
ExtraTreesClassifier, test
Accuracy: 0.841300


# XGBoost

In [None]:
!pip install -q xgboost==0.7post3
import xgboost as xgb
def XGBoost():
    """Generate XGBoost meta features.

    For each fold of `kf`, a booster is trained on the other folds with early
    stopping monitored on the held-out fold, and the held-out fold is
    predicted using only the trees up to the best iteration.

    The final booster is trained on all of X_train for a fixed number of
    rounds: the mean of the per-fold best round counts. It is not
    early-stopped, because the only labelled evaluation data available here
    (the CV folds) is part of its training set.

    Returns
    -------
    (xgb_classes_train, xgb_classes_test, bst)
        Out-of-fold predictions shaped like y_train, predictions for X_test,
        and the final booster trained on all of X_train.
    """
    xgb_classes_train = np.empty(y_train.shape)
    param = {'booster':'gbtree', 'max_depth':18, 'eta':0.03, 'silent':1,
         'objective':'binary:logistic', 'eval_metric':['error', 'logloss'],
         'colsample_bytree':0.7, 'subsample':1, 'gamma':1,
         'min_child_weight':1, 'tree_method':'hist'}
    max_rounds = 200000  # upper bound only; early stopping ends training long before this
    best_round_counts = []

    for cursplit, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):
        print('XGBoost, split=%d' % (cursplit))
        dtrain = xgb.DMatrix(X_train[train_index], label=y_train[train_index])
        dvalid = xgb.DMatrix(X_train[test_index], label=y_train[test_index])
        # Early stopping watches the last metric ('logloss') on the last entry ('eval').
        evallist = [(dtrain, 'train'), (dvalid, 'eval')]
        bst = xgb.train(param, dtrain, max_rounds, evallist, verbose_eval=100, early_stopping_rounds=50)
        # xgb.train returns the model from the LAST round, not the best one, so
        # restrict prediction to the best iteration (API of the pinned xgboost 0.7).
        xgb_classes_train[test_index] = bst.predict(dvalid, ntree_limit=bst.best_ntree_limit)
        best_round_counts.append(bst.best_iteration + 1)  # best_iteration is 0-based

    print('XGBoost, test')
    # Do not reuse the last fold's evallist here: its 'eval' rows are part of
    # the full training set, so early stopping on them would be meaningless.
    final_rounds = max(1, int(round(np.mean(best_round_counts))))
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test)
    bst = xgb.train(param, dtrain, final_rounds, [(dtrain, 'train')], verbose_eval=100)
    xgb_classes_test = bst.predict(dtest)

    return xgb_classes_train, xgb_classes_test, bst

# Logistic Regression

In [129]:
from sklearn.linear_model import LogisticRegression
def LogReg(C=1, max_iter=100):
    """Generate logistic-regression meta features on log(1 + x) transformed inputs.

    Each fold of `kf` is predicted by a fresh model fit on the other folds;
    the test set is predicted by a model fit on all of X_train.

    Returns a pair of arrays of `predict` outputs (class labels, not
    probabilities): out-of-fold values shaped like y_train, and values for
    X_test.
    """
    def log_scale(rows):
        # Element-wise log(1 + x), applied to every matrix the model sees.
        return np.log(1 + rows)

    oof_labels = np.empty(y_train.shape)

    for fold_no, (fit_idx, pred_idx) in enumerate(kf.split(X_train, y_train), start=1):
        print('LogisticRegression, split=%d' % (fold_no))
        fold_model = LogisticRegression(C=C, max_iter=max_iter)
        fold_model.fit(log_scale(X_train[fit_idx]), y_train[fit_idx])
        oof_labels[pred_idx] = fold_model.predict(log_scale(X_train[pred_idx]))

    print('LogisticRegression, test')
    full_model = LogisticRegression(C=C, max_iter=max_iter)
    full_model.fit(log_scale(X_train), y_train)
    test_labels = full_model.predict(log_scale(X_test))

    return oof_labels, test_labels

In [130]:
# C=1 with max_iter raised from the function default of 100 to 500.
# Rebinds train_results / test_results.
train_results, test_results = LogReg(1, max_iter=500)
np.savetxt('../inferences/lr_train.txt' ,train_results, fmt='%.6g')
np.savetxt('../inferences/lr_test.txt' ,test_results, fmt='%.6g')
# LogReg returns `predict` outputs (class labels), so they are compared with
# y_train directly (assumes 0/1 labels -- confirm).
print('Accuracy: %f' % (1 - np.sum(np.abs((train_results) - y_train)) / y_train.shape[0]))

LogisticRegression, split=1
LogisticRegression, split=2
LogisticRegression, split=3
LogisticRegression, split=4
LogisticRegression, split=5
LogisticRegression, test
Accuracy: 0.845900


In [100]:
# Inspection cell: displays whatever train_results is currently bound to.
# NOTE(review): its execution count (100) is lower than the LogReg cell above (130),
# so the saved output may belong to a different model's results.
train_results

array([ 1.,  0.,  1., ...,  1.,  1.,  0.])

# Neural Nets
## <font color='red'>*RUN THIS ON COLAB w/ GPU!*</font>

In [None]:
def embeddings_model():
    ''' Projects BOW features into word vectors.
        Run on Colab. Refer to Colab notebook.
        nn_train/test

        Builds and compiles a two-input Keras model:
          * `vec` wraps the global `feature_vecs` as a backend variable fed
            through an Input with batch shape (None, 1000, 300);
          * `inp2` takes a length-1000 vector, which is reshaped to (1000, 1)
            and multiplied with `vec`.
        The product is reshaped to (1000, 300, 1) and passed through dropout,
        a 16-filter Conv2D with a (1000, 1) kernel, a 128-unit dense layer
        (each followed by batch norm, ReLU and dropout), and a 2-way softmax.

        Returns the compiled model (categorical cross-entropy loss, Adam).

        NOTE(review): `K`, `keras`, `feature_vecs` and the layer/Model names
        are not defined in this notebook; presumably they come from the Colab
        notebook mentioned above -- confirm before running here.
    '''
    # Embedding matrix exposed as a constant-valued model input.
    vec = K.variable(feature_vecs)
    vec = keras.layers.Input(batch_shape=(None, 1000, 300), tensor=vec)
    inp2 = keras.layers.Input(shape=(1000, ))
    # Add a trailing axis so the per-feature values broadcast across the 300 columns.
    inp2_ = keras.layers.Reshape((1000,1))(inp2)
    v = keras.layers.multiply([vec, inp2_])
    v = keras.layers.Reshape((1000,300,1))(v)
    v = Dropout(0.3)(v)

    # The kernel spans all 1000 rows, so each filter collapses that axis.
    x = Conv2D (16,(1000, 1))(v)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)

    x = Flatten()(x)
    x = Dense(128)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)

    x = Dense(2, activation='softmax')(x)

    model = Model(inputs=[vec,inp2], outputs=x)
    model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])
    return model

def build_model3():
    '''nn2_train/test

    Builds and compiles a small feed-forward classifier over 1000 input
    features: input dropout, one L2-regularised 128-unit ReLU layer, dropout,
    and a 2-way softmax output. Returns the compiled Sequential model.
    '''
    layer_stack = [
        Dropout(0.2, input_shape=(1000,)),
        Dense(128, kernel_regularizer=regularizers.l2(0.001)),
        Activation('relu'),
        Dropout(0.5),
        Dense(2),
        Activation('softmax'),
    ]
    net = Sequential(layer_stack)
    net.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])
    return net

In [None]:
from keras.callbacks import EarlyStopping
from sklearn.ensemble import BaggingClassifier
# NOTE(review): `cb` is not referenced anywhere else in the visible notebook.
cb = EarlyStopping(patience=7)
# NOTE(review): NN() below builds its own ensembles locally and never reads this
# module-level `model`; KerasClassifier is not imported in the visible cells.
model = BaggingClassifier(base_estimator=KerasClassifier(build_fn=build_model3), n_estimators=10)
def NN():
    """Generate neural-net meta features with a bagged ensemble of Keras models.

    For each fold of `kf`, a BaggingClassifier of 100 KerasClassifier
    (`build_model3`) estimators is fit on the other folds and used to predict
    the held-out fold; the fold's accuracy is printed. The test set is
    predicted by an ensemble fit on all of X_train.

    Returns a pair of arrays of `predict` outputs: out-of-fold values shaped
    like y_train, and values for X_test.
    """
    classes_train = np.empty(y_train.shape)

    for cursplit, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):
        print('NN, split=%d' % (cursplit))
        model = BaggingClassifier(base_estimator=KerasClassifier(build_fn=build_model3, epochs=20,
                                                                 batch_size=32, verbose=0), n_estimators=100)
        model.fit(X_train[train_index], y_train[train_index])
        classes_train[test_index] = model.predict(X_train[test_index])
        # Normalise by the size of the held-out fold, not the whole training
        # set; otherwise the reported accuracy is inflated.
        fold_errors = np.sum(np.abs(np.rint(classes_train[test_index]) - y_train[test_index]))
        print('Eval accuracy: %f' % (1 - fold_errors / len(test_index)))

    print('NN, test')
    model = BaggingClassifier(base_estimator=KerasClassifier(build_fn=build_model3, epochs=20, batch_size=32), n_estimators=100)
    model.fit(X_train, y_train)
    classes_test = model.predict(X_test)

    return classes_train, classes_test

In [None]:
# Rebinds train_results / test_results with the neural-net predictions.
train_results, test_results = NN()
# NOTE(review): these are written to the current working directory, unlike the
# other models, which write under '../inferences/'.
np.savetxt('nn_train.txt' ,train_results, fmt='%.6g')
np.savetxt('nn_test.txt' ,test_results, fmt='%.6g')
# Accuracy on the training predictions after rounding to the nearest integer
# (assumes y_train holds 0/1 labels -- confirm).
print('Accuracy: %f' % (1 - np.sum(np.abs(np.rint(train_results) - y_train)) / y_train.shape[0]))

## <font color='red'>Cells below this are for archival purposes.</font>

In [118]:
# Calculate TF-IDF weighted inputs for the training and test data.
# Term frequency: each row is divided by its own largest value; the divisor is
# floored at 1, so an all-zero row is divided by 1 rather than 0.
max_term_freqs = np.maximum(np.max(X_train, axis=1), 1)
term_freq = X_train / max_term_freqs[:,np.newaxis]
# Inverse document frequency per column, using train and test rows together:
# log(total rows / rows in which the column is non-zero).
# NOTE(review): a column that is zero in every row makes the denominator 0.
inverse_doc_freq = np.log((X_train.shape[0] + X_test.shape[0]) / (np.count_nonzero(X_train, axis=0) + np.count_nonzero(X_test, axis=0)))
X_train_tfidf = term_freq * inverse_doc_freq[np.newaxis,:]


# Same term-frequency scaling for the test rows, reusing the IDF computed above.
max_term_freqs_test = np.maximum(np.max(X_test, axis=1), 1)
term_freq_test = X_test / max_term_freqs_test[:,np.newaxis]
X_test_tfidf = term_freq_test * inverse_doc_freq[np.newaxis,:]

# Standardise each column with the mean and standard deviation of the combined
# train + test TF-IDF matrix.
# NOTE(review): a column with zero standard deviation divides by zero here.
X_mean = np.concatenate([X_train_tfidf,X_test_tfidf]).mean(axis=0)
X_std = np.concatenate([X_train_tfidf,X_test_tfidf]).std(axis=0)
X_train_tfidf_normed = (X_train_tfidf - X_mean) / X_std
X_test_tfidf_normed = (X_test_tfidf - X_mean) / X_std