In [1]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np

import math

# SciKit-Learn
import sklearn as skl
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier


# Plotting library
from matplotlib import pyplot
import matplotlib as mpl

# Optimization module in scipy
from scipy import optimize


## Load the Data

In [2]:
# Pre-allocate the data matrices. Rows are indexed by integer song id:
# 7868 training songs and 2705 test songs, 76 summary features per song,
# and 147 label columns per song.
X_train = np.zeros((7868, 76))
Y_train = np.zeros((7868, 147))
X_test = np.zeros((2705, 76))
# Zero placeholder; replaced by the model's output in the prediction cell.
predictions = np.zeros((2705, 147))


def _load_summaries(feature_dir, out):
    """Copy the 'summary' entry of every feature file in ``feature_dir`` into ``out``.

    The part of each file name before the first '.' is parsed as the integer
    song id and selects the row of ``out`` that receives that file's summary.

    Parameters:
        feature_dir: directory containing one feature file per song.
        out: pre-allocated 2-D array, filled in place.

    Returns:
        ``out``, for convenience.
    """
    for file_name in os.listdir(feature_dir):
        song_id = int(file_name.split('.')[0])
        features = np.load(os.path.join(feature_dir, file_name))
        try:
            out[song_id] = features['summary']
        finally:
            # .npz archives keep a file handle open until closed; plain
            # arrays have no close(), so only call it when it exists.
            if hasattr(features, 'close'):
                features.close()
    return out


# Load X_train
_load_summaries('../data/train_feature_files', X_train)

# Load Y_train: each data row is "<song id>,<label 0>,<label 1>,..."
with open('../data/cal10k_train_data.csv', 'r') as y_train_file:
    for line in y_train_file:
        line = line.strip()
        # Skip blank lines and the header row (any row containing 'id').
        if not line or 'id' in line:
            continue
        fields = line.split(',')
        Y_train[int(fields[0])] = [int(value) for value in fields[1:]]

# Load X_test
_load_summaries('../data/test_feature_files', X_test)

## Replace NaN feature values with zero

In [3]:
# Overwrite every NaN entry with 0, in place, in both feature matrices.
for feature_matrix in (X_train, X_test):
    feature_matrix[np.isnan(feature_matrix)] = 0

# Sanity check: the shapes must be unchanged by the clean-up.
print(X_train.shape)
print(X_test.shape)

(7868, 76)
(2705, 76)


## Scale & PCA

In [4]:
# Standardise the features. The scaler is fitted on the training set only and
# the same mean/variance is then applied to the test set, so both sets go
# through an identical transformation before PCA and the classifier.
# (Scaling the test set with its own statistics would shift it relative to
# the data the model was trained on.)
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reduce dimensionality: a fractional n_components with the 'full' solver
# keeps as many principal components as needed to explain 99% of the
# training-set variance. The projection is learned from the training data
# and then applied unchanged to the test data.
pca = PCA(n_components=0.99, svd_solver='full')
pca.fit(X_train_scaled)
X_train_approx = pca.transform(X_train_scaled)
print(X_train_approx.shape)
X_test_approx = pca.transform(X_test_scaled)
print(X_test_approx.shape)

(7868, 62)
(2705, 62)


## Train Multi-layer Perceptron (Neural Net)

In [5]:
# Earlier configurations tried (same activation and solver as below):
#   hidden_layer_sizes=(120, 120) -> got 89%
#   hidden_layer_sizes=(186, 186) -> got 89% but better
# NOTE(review): the metric behind the 89% figure is not recorded here.
neural_net = MLPClassifier(
    solver='lbfgs',
    activation='logistic',
    hidden_layer_sizes=(248, 248),
)
# Fit on the PCA-reduced training features against the label matrix.
neural_net.fit(X_train_approx, Y_train)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(248, 248), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

## Get Predictions

In [6]:
# Ask the trained network for probability estimates on the PCA-reduced test
# features. This rebinds `predictions`, replacing the zero placeholder that
# was allocated in the data-loading cell.
predictions = neural_net.predict_proba(X_test_approx)
# Show the (truncated) array as a quick visual sanity check.
print(predictions)

[[  1.55056157e-03   1.43595276e-03   1.43480820e-02 ...,   8.49919857e-03
    2.24365360e-03   1.25879112e-03]
 [  2.01260786e-05   1.28974080e-06   5.16671778e-04 ...,   8.34217966e-05
    9.38354658e-06   4.44585488e-04]
 [  4.44369815e-03   1.34166275e-04   3.21877230e-02 ...,   1.34324671e-03
    5.39572521e-04   1.29281649e-04]
 ..., 
 [  1.60593287e-04   2.29503744e-02   2.21575840e-03 ...,   1.98121577e-03
    1.85166674e-02   7.33457220e-05]
 [  2.23194396e-05   8.50176356e-03   1.76553361e-01 ...,   2.05473810e-03
    7.67774875e-05   1.42679646e-08]
 [  8.74717745e-05   5.24558130e-05   2.44343306e-06 ...,   2.82291782e-04
    3.97409839e-04   6.16579284e-03]]


## Write to File

In [7]:
# Build the submission CSV: the header row is copied from the sample
# submission file, followed by one row per test song of the form
# "<song id>,<probability 0>,<probability 1>,...".

# Find the header: the last line of the sample submission that contains 'id'.
id_line = None
with open('../data/cal10k_test_random_submission.csv', 'r') as sample_file:
    for line in sample_file:
        if 'id' in line:
            id_line = line
if id_line is None:
    raise ValueError("no header line containing 'id' found in the sample submission")

with open('../data/will_pca_help_MLP_probably_not.csv', 'w') as out_file:
    # Text-mode files translate '\n' to the platform line ending themselves,
    # so always write '\n' (writing os.linesep would give '\r\r\n' on Windows).
    out_file.write(id_line.rstrip('\r\n') + '\n')
    # The row index is the song id. `predictions` itself is left untouched,
    # so this cell can be re-run without adding a second id column.
    for song_id, song_probabilities in enumerate(predictions):
        row = [str(song_id)] + [str(probability) for probability in song_probabilities]
        out_file.write(','.join(row) + '\n')