# Overview
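This notebook generates the first layer of meta features: out-of-fold and test-set KNN class probabilities for several values of k and several distance metrics. It also computes TF-IDF weighted inputs and converts the data to the LibFFM and Vowpal Wabbit file formats.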

In [1]:
import numpy as np
from sklearn.model_selection import KFold

In [2]:
# Load the raw text matrices; the first line of each file is skipped (skiprows=1).
X_test = np.loadtxt(fname='../data/test_data.txt', skiprows=1)
training_data = np.loadtxt(fname='../data/training_data.txt', skiprows=1)

In [45]:
# Column 0 of the training file holds the label; the remaining columns are the features.
y_train, X_train = training_data[:, 0], training_data[:, 1:]

# Shuffled 5-fold splitter with a fixed seed so the out-of-fold splits are reproducible.
kf = KFold(shuffle=True, random_state=214, n_splits=5)

In [None]:
# Generate and save KNN meta features for k = 2, 4, 8, ..., 1024.
# NOTE: this cell calls KNN(), whose defining cell appears further down in the
# notebook; that cell must be executed before this one.
for power in range(1, 11):
    k = 2 ** power
    print('Generating meta features for KNN k=%d' % k)
    train_results, test_results = KNN(k)
    # Each result is a list of three 1-D arrays, so np.savetxt writes one row per metric.
    # NOTE(review): the scratch cell that stacks with axis=-1 writes one column per
    # metric instead -- confirm which layout downstream consumers expect.
    np.savetxt('../inferences/knn_%d_train.txt' % k, train_results, fmt='%.6g')
    # Test-set predictions get their own file; previously they were written to the
    # *_train.txt path and overwrote the train meta features saved just above.
    np.savetxt('../inferences/knn_%d_test.txt' % k, test_results, fmt='%.6g')

Generating meta features for KNN k=2
KNN k=2, split=1 manhattan


In [88]:
# Scratch check of the save format: write three all-ones columns (one per metric)
# to the k=2 train file. Note this is the same path the generation loop uses for k=2.
k = 2
manhattan_classes, euclidean_classes, braycurtis_classes = (
    np.ones(y_train.shape) for _ in range(3)
)
# Stack along the last axis so each row is a sample and each column a metric.
tosave = np.stack(
    [manhattan_classes, euclidean_classes, braycurtis_classes],
    axis=-1,
)
np.savetxt('../inferences/knn_%d_train.txt' % k, tosave, fmt='%.6g')


In [62]:
# Sanity check: a 1-D buffer with one slot per test row.
np.empty(len(X_test)).shape

(10000,)

In [91]:
from sklearn.neighbors import KNeighborsClassifier
def KNN(k):
    """Build KNN meta features for three distance metrics.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``kf``.

    For each metric (Manhattan, Euclidean, Bray-Curtis) this produces:

    * out-of-fold predictions for the training set: for every split of ``kf``
      a classifier is fitted on the fold's training indices and used to
      predict the held-out indices;
    * predictions for ``X_test`` from a classifier fitted on all of
      ``X_train``.

    Each prediction is column 1 of ``predict_proba``, i.e. the probability of
    the second class in the fitted classifier's class order.

    Parameters
    ----------
    k : int
        Number of neighbours passed to ``KNeighborsClassifier``.

    Returns
    -------
    tuple of (list, list)
        ``([manhattan, euclidean, braycurtis], [manhattan, euclidean,
        braycurtis])`` -- the first list holds the out-of-fold training
        predictions (each shaped like ``y_train``), the second the test
        predictions (one value per row of ``X_test``).
    """
    # (label, extra KNeighborsClassifier kwargs). With the default Minkowski
    # metric, p=1 is the Manhattan distance and p=2 the Euclidean distance
    # (the original code had these two swapped relative to their labels).
    metrics = [
        ('manhattan', {'p': 1}),
        ('euclidean', {'p': 2}),
        ('braycurtis', {'metric': 'braycurtis'}),
    ]

    def second_class_proba(fit_X, fit_y, query_X, params):
        # Fit a fresh classifier and return column 1 of its predicted probabilities.
        knn = KNeighborsClassifier(n_neighbors=k, **params)
        knn.fit(fit_X, fit_y)
        return knn.predict_proba(query_X)[:, 1]

    # Local out-of-fold buffers, one per metric. The original wrote into
    # same-named notebook globals, which only worked with leftover kernel state.
    train_preds = [np.empty(y_train.shape) for _ in metrics]

    for cursplit, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):
        for oof, (name, params) in zip(train_preds, metrics):
            print('KNN k=%d, split=%d %s' % (k, cursplit, name))
            oof[test_index] = second_class_proba(
                X_train[train_index], y_train[train_index], X_train[test_index], params
            )

    test_preds = []
    for name, params in metrics:
        print('KNN k=%d, test %s' % (k, name))
        test_preds.append(second_class_proba(X_train, y_train, X_test, params))

    return train_preds, test_preds

In [68]:
# Call predict_proba three separate times on the first ten training rows.
# NOTE(review): no visible cell assigns `knn` at notebook scope (it is only a
# local inside KNN()), so this cell depends on leftover kernel state.
a, b, c = (knn.predict_proba(X_train[:10]) for _ in range(3))

In [86]:
# Calculate TF-IDF weighted inputs from training data
# Term frequency: every row is divided by its largest entry, floored at 1.
max_term_freqs = np.maximum(np.max(X_train, axis=1), 1)
term_freq = X_train / max_term_freqs[:, np.newaxis]

# Document frequency is counted over the training and test rows together.
doc_freq = np.count_nonzero(X_train, axis=0) + np.count_nonzero(X_test, axis=0)
# Floor the document frequency at 1: a term that appears in no document would
# otherwise give log(N / 0) = inf, and 0 * inf = NaN in the weighted matrix.
inverse_doc_freq = np.log((X_train.shape[0] + X_test.shape[0]) / np.maximum(doc_freq, 1))
X_train_tfidf = term_freq * inverse_doc_freq[np.newaxis, :]

# Same weighting for the test rows, reusing the shared inverse document frequency.
max_term_freqs_test = np.maximum(np.max(X_test, axis=1), 1)
term_freq_test = X_test / max_term_freqs_test[:, np.newaxis]
X_test_tfidf = term_freq_test * inverse_doc_freq[np.newaxis, :]

# Standardise each column using statistics of train and test combined
# (the concatenation is built once instead of twice).
X_all_tfidf = np.concatenate([X_train_tfidf, X_test_tfidf])
X_mean = X_all_tfidf.mean(axis=0)
X_std = X_all_tfidf.std(axis=0)
del X_all_tfidf  # release the temporary combined matrix
# Constant columns have zero std; divide those by 1 so they become 0 rather than NaN.
X_scale = np.where(X_std > 0, X_std, 1)
X_train_tfidf_normed = (X_train_tfidf - X_mean) / X_scale
X_test_tfidf_normed = (X_test_tfidf - X_mean) / X_scale

# File format conversion for LibFFM

In [92]:
def _write_ffm(path, features, labels=None, keep_row=None):
    """Write rows of ``features`` to ``path`` in the ``label field:index:value`` text format.

    Every non-zero entry of a row is emitted as `` 0:<column>:<int(value)>``
    (the field is always 0). The label is ``'1'`` when ``labels[i] == 1`` and
    ``'0'`` otherwise; when ``labels`` is None every row gets the label ``'0'``.

    Parameters
    ----------
    path : str
        Output file, opened for writing (truncated if it exists).
    features : 2-D array
        One row per sample.
    labels : 1-D array or None
        Labels aligned with the rows of ``features``.
    keep_row : callable or None
        Optional predicate on the row index; rows for which it returns a
        false value are skipped.
    """
    with open(path, 'w') as f:
        for i, row in enumerate(features):
            if keep_row is not None and not keep_row(i):
                continue
            if labels is None:
                f.write('0')
            else:
                f.write('1' if labels[i] == 1 else '0')
            # Visit only the non-zero columns instead of testing every column.
            for j in np.flatnonzero(row):
                f.write(' 0:%d:%d' % (j, int(row[j])))
            f.write('\n')


# Row and column counts now come from the arrays rather than the hard-coded
# 20000 / 10000 / 1000, so the export follows the data actually loaded.
_write_ffm('../data/libffm/train_ffm.txt', X_train, y_train)
# Every 10th training row (i % 10 == 0) is held out as the validation set.
_write_ffm('../data/libffm/train_noval_ffm.txt', X_train, y_train,
           keep_row=lambda i: i % 10 != 0)
_write_ffm('../data/libffm/val_ffm.txt', X_train, y_train,
           keep_row=lambda i: i % 10 == 0)
# Test rows have no labels; a placeholder label of 0 is written.
_write_ffm('../data/libffm/test_ffm.txt', X_test)

# File format conversion for Vowpal Wabbit

In [78]:
# Write the training set in Vowpal Wabbit text format: "<label> | idx:count ...",
# with label 1 when y_train[i] == 1 and -1 otherwise.
with open('../data/vowpal-wabbit/train_vw.txt', 'w') as f:
    # Iterate over every training row. The loop was previously bounded by
    # X_test.shape[0], which dropped any training rows beyond the test-set size.
    for i in range(X_train.shape[0]):
        f.write(('1' if y_train[i] == 1 else '-1') + ' |')
        # The column count is taken from the data instead of a hard-coded 1000.
        for j in range(X_train.shape[1]):
            val = X_train[i][j]
            if val != 0:
                f.write(' ' + str(j) + ':' + str(int(val)))
        f.write('\n')

In [79]:
# Write the test set in Vowpal Wabbit text format: unlabeled lines "| idx:count ...".
n_test_rows = int(X_test.shape[0])
with open('../data/vowpal-wabbit/test_vw.txt', 'w') as vw_out:
    for row_idx in range(n_test_rows):
        sample = X_test[row_idx]
        pieces = ['|']
        # Only the first 1000 columns are scanned; zero entries are omitted.
        for col in range(1000):
            count = sample[col]
            if count == 0:
                continue
            pieces.append(' ' + str(col) + ':' + str(int(count)))
        pieces.append('\n')
        vw_out.write(''.join(pieces))