# Overview
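This notebook generates the first layer of meta features: for each base model (K-Nearest Neighbors, AdaBoost), out-of-fold predicted probabilities on the training set and predicted probabilities on the test set from a model fit on the full training set.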

In [1]:
import numpy as np
from sklearn.model_selection import KFold

In [2]:
# Load the raw matrices; the first line of each file is skipped.
training_data = np.loadtxt(fname='../data/training_data.txt', skiprows=1)
X_test = np.loadtxt(fname='../data/test_data.txt', skiprows=1)

In [45]:
# Column 0 of the training matrix is the label; the remaining columns are features.
y_train, X_train = training_data[:, 0], training_data[:, 1:]
# Shared 5-fold splitter (fixed seed) used for every out-of-fold prediction below.
kf = KFold(random_state=214, shuffle=True, n_splits=5)

# K-Nearest Neighbors

In [91]:
from sklearn.neighbors import KNeighborsClassifier
def KNN(k):
    """Build KNN meta features for three distance metrics.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and
    ``kf``. For each metric (Manhattan, Euclidean, Bray-Curtis):

    * out-of-fold predictions for the training rows are produced by fitting
      on each ``kf`` training split and predicting the held-out split;
    * test predictions are produced by a model fit on all of ``X_train``.

    Each prediction is column 1 of ``predict_proba`` (the probability of the
    second class in the classifier's ``classes_``).

    Parameters
    ----------
    k : int
        Number of neighbors passed to ``KNeighborsClassifier``.

    Returns
    -------
    (list of ndarray, list of ndarray)
        ``[manhattan, euclidean, braycurtis]`` out-of-fold arrays of shape
        ``y_train.shape``, followed by the same three metrics' arrays of
        test-set predictions (one value per row of ``X_test``).
    """
    # (label, metric-specific KNeighborsClassifier kwargs), in returned order.
    # With the default Minkowski metric, p=1 is Manhattan and p=2 is Euclidean.
    metrics = [
        ('manhattan', {'p': 1}),
        ('euclidean', {'p': 2}),
        ('braycurtis', {'metric': 'braycurtis'}),
    ]

    # One out-of-fold buffer per metric; every row is filled exactly once
    # because the held-out splits partition the training rows.
    classes_train = [np.empty(y_train.shape) for _ in metrics]

    for cursplit, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):
        for (name, params), oof in zip(metrics, classes_train):
            print('KNN k=%d, split=%d %s' % (k, cursplit, name))
            knn = KNeighborsClassifier(n_neighbors=k, **params)
            knn.fit(X_train[train_index], y_train[train_index])
            oof[test_index] = knn.predict_proba(X_train[test_index])[:, 1]

    # Test predictions come from models refit on the full training set.
    classes_test = []
    for name, params in metrics:
        print('KNN k=%d, test %s' % (k, name))
        knn = KNeighborsClassifier(n_neighbors=k, **params)
        knn.fit(X_train, y_train)
        classes_test.append(knn.predict_proba(X_test)[:, 1])

    return classes_train, classes_test

In [None]:
# Sweep k over the powers of two 2**2 .. 2**10 and write one train file and
# one test file per k, with one column per distance metric.
for power in range(2, 11):
    k = 2 ** power
    print('Generating meta features for KNN k=%d' % k)
    # KNN returns (train columns, test columns); stack each into a 2-D array.
    train_results, test_results = (np.stack(columns, axis=-1) for columns in KNN(k))
    np.savetxt('../inferences/knn_%d_train.txt' % k, train_results, fmt='%.6g')
    np.savetxt('../inferences/knn_%d_test.txt' % k, test_results, fmt='%.6g')

Generating meta features for KNN k=4
KNN k=4, split=1 manhattan
KNN k=4, split=1 euclidean
KNN k=4, split=1 braycurtis
KNN k=4, split=2 manhattan


# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
def AdaBoost(n_estimators=250, learning_rate=1, random_state=None):
    """Build AdaBoost meta features for the training and test sets.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and
    ``kf``. Training-set predictions are out-of-fold: for every ``kf`` split
    a classifier is fit on the training part and predicts the held-out part.
    Test-set predictions come from a classifier fit on all of ``X_train``.
    Each prediction is column 1 of ``predict_proba``.

    Parameters
    ----------
    n_estimators : int, default 250
        Forwarded to ``AdaBoostClassifier``.
    learning_rate : float, default 1
        Forwarded to ``AdaBoostClassifier``.
    random_state : int, RandomState instance or None, default None
        Forwarded to every ``AdaBoostClassifier`` so results can be made
        reproducible; ``None`` keeps the previous unseeded behavior.

    Returns
    -------
    (ndarray, ndarray)
        Out-of-fold predictions with shape ``y_train.shape`` and test
        predictions with one value per row of ``X_test``.
    """
    # Every row is filled exactly once because the held-out splits
    # partition the training rows.
    adaboost_classes_train = np.empty(y_train.shape)

    for cursplit, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):
        print('Adaboost, split=%d' % (cursplit))
        clf = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                                 random_state=random_state)
        clf.fit(X_train[train_index], y_train[train_index])
        adaboost_classes_train[test_index] = clf.predict_proba(X_train[test_index])[:, 1]

    print('Adaboost, test')
    clf = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                             random_state=random_state)
    clf.fit(X_train, y_train)
    adaboost_classes_test = clf.predict_proba(X_test)[:, 1]

    return adaboost_classes_train, adaboost_classes_test

## <font color='red'>Do not run cells below this</font>

In [86]:
# Calculate TF-IDF weighted, standardized inputs for the training and test data.
# Document frequencies and the normalization statistics are computed over
# train and test rows together.

# Term frequency: scale each row by its largest entry. The divisor is floored
# at 1 so an all-zero row stays all-zero instead of dividing by zero.
max_term_freqs = np.maximum(np.max(X_train, axis=1), 1)
term_freq = X_train / max_term_freqs[:,np.newaxis]

# Inverse document frequency: log(total rows / rows with a nonzero entry in the column).
# The count is floored at 1 so a column that is zero everywhere gets a finite
# weight (its term frequency is 0, so its TF-IDF is 0 rather than 0 * inf = NaN).
doc_freq = np.count_nonzero(X_train, axis=0) + np.count_nonzero(X_test, axis=0)
inverse_doc_freq = np.log((X_train.shape[0] + X_test.shape[0]) / np.maximum(doc_freq, 1))
X_train_tfidf = term_freq * inverse_doc_freq[np.newaxis,:]


max_term_freqs_test = np.maximum(np.max(X_test, axis=1), 1)
term_freq_test = X_test / max_term_freqs_test[:,np.newaxis]
X_test_tfidf = term_freq_test * inverse_doc_freq[np.newaxis,:]

# Standardize each column with the mean/std of the combined train+test matrix.
X_all_tfidf = np.concatenate([X_train_tfidf,X_test_tfidf])
X_mean = X_all_tfidf.mean(axis=0)
X_std = X_all_tfidf.std(axis=0)
# Constant columns have std 0; divide those by 1 so they normalize to 0, not NaN.
X_std_safe = np.where(X_std == 0, 1, X_std)
X_train_tfidf_normed = (X_train_tfidf - X_mean) / X_std_safe
X_test_tfidf_normed = (X_test_tfidf - X_mean) / X_std_safe