# Overview
This notebook loads the training and test data, computes TF-IDF weighted features, and converts the data into the input file formats expected by LibFFM and Vowpal Wabbit.

In [3]:
import numpy as np
# Read both whitespace-delimited matrices, skipping the first row of each file.
training_data, X_test = (
    np.loadtxt(data_path, skiprows=1)
    for data_path in ('../data/training_data.txt', '../data/test_data.txt')
)

In [5]:
import keras
# Column 0 of the loaded matrix is used as the label; the remaining columns are the features.
y_train, X_train = training_data[:, 0], training_data[:, 1:]
# Categorical (one column per class) encoding of the labels.
y_train_cat = keras.utils.to_categorical(y_train)

Using TensorFlow backend.


In [86]:
# Calculate TF-IDF weighted inputs for the training and test sets.
# Term frequency: each row divided by its largest entry.
# Inverse document frequency: computed over training and test documents together.
max_term_freqs = np.maximum(np.max(X_train, axis=1), 1)  # floor at 1 so all-zero rows do not divide by zero
term_freq = X_train / max_term_freqs[:,np.newaxis]

# float() keeps this a true division regardless of the interpreter's integer-division rules.
n_docs = float(X_train.shape[0] + X_test.shape[0])
doc_freq = np.count_nonzero(X_train, axis=0) + np.count_nonzero(X_test, axis=0)
# A term that occurs in no document has doc_freq == 0, which would give an infinite
# IDF and then 0 * inf = nan. Its term frequency is 0 in every row, so flooring the
# count at 1 keeps the IDF finite and makes that TF-IDF column exactly 0.
inverse_doc_freq = np.log(n_docs / np.maximum(doc_freq, 1))
X_train_tfidf = term_freq * inverse_doc_freq[np.newaxis,:]


max_term_freqs_test = np.maximum(np.max(X_test, axis=1), 1)
term_freq_test = X_test / max_term_freqs_test[:,np.newaxis]
X_test_tfidf = term_freq_test * inverse_doc_freq[np.newaxis,:]

# Standardise each column using the mean/std over training and test rows combined.
# The combined matrix is built once and released as soon as the statistics are taken.
X_all_tfidf = np.concatenate([X_train_tfidf,X_test_tfidf])
X_mean = X_all_tfidf.mean(axis=0)
X_std = X_all_tfidf.std(axis=0)
del X_all_tfidf
# Constant columns have zero std; divide those by 1 so they normalise to 0 instead of nan.
X_std_nonzero = np.where(X_std == 0, 1, X_std)
X_train_tfidf_normed = (X_train_tfidf - X_mean) / X_std_nonzero
X_test_tfidf_normed = (X_test_tfidf - X_mean) / X_std_nonzero

# File format conversion for LibFFM

In [92]:
def _write_ffm(path, features, labels=None, keep_row=None):
    """Write `features` to `path`, one line per row, as "<label> 0:<col>:<value> ...".

    Only non-zero entries are written, all under field 0, with values truncated
    to int. Row and column counts come from the array itself rather than from
    hard-coded sizes.

    path     -- output file path.
    features -- 2-D array of feature values.
    labels   -- per-row labels; a row is written with '1' when its label equals 1
                and '0' otherwise. When None, every row gets the placeholder '0'.
    keep_row -- optional predicate on the row index; rows for which it returns
                False are skipped.
    """
    with open(path, 'w') as f:
        for i, row in enumerate(features):
            if keep_row is not None and not keep_row(i):
                continue
            label = '1' if labels is not None and labels[i] == 1 else '0'
            entries = ''.join(' 0:' + str(j) + ':' + str(int(row[j])) for j in np.flatnonzero(row))
            f.write(label + entries + '\n')


# Full training set.
_write_ffm('../data/libffm/train_ffm.txt', X_train, y_train)
# Every 10th row (index % 10 == 0) is held out as the validation set; the
# remaining rows form the training set without validation rows.
_write_ffm('../data/libffm/train_noval_ffm.txt', X_train, y_train, keep_row=lambda i: i % 10 != 0)
_write_ffm('../data/libffm/val_ffm.txt', X_train, y_train, keep_row=lambda i: i % 10 == 0)
# The test set has no labels here; a constant '0' is written in the label position.
_write_ffm('../data/libffm/test_ffm.txt', X_test)

# File format conversion for Vowpal Wabbit

In [6]:
# Write the training set as one line per row: "<label> | <col>:<value> ...".
# Labels equal to 1 are written as '1', everything else as '-1'. Only non-zero
# entries are emitted, with values truncated to int. Columns are taken from each
# row's actual length instead of a hard-coded count.
with open('../data/vowpal-wabbit/train_vw.txt', 'w') as f:
    for row, label in zip(X_train, y_train):
        nonzero_cols = np.flatnonzero(row)
        entries = ''.join(' ' + str(j) + ':' + str(int(row[j])) for j in nonzero_cols)
        f.write(('1' if label == 1 else '-1') + ' |' + entries + '\n')

In [79]:
# Write the test set as one unlabeled line per row: "| <col>:<value> ...".
# Only non-zero entries are emitted, with values truncated to int. Columns are
# taken from each row's actual length instead of a hard-coded count.
with open('../data/vowpal-wabbit/test_vw.txt', 'w') as f:
    for row in X_test:
        nonzero_cols = np.flatnonzero(row)
        f.write('|' + ''.join(' ' + str(j) + ':' + str(int(row[j])) for j in nonzero_cols) + '\n')