## Imports

In [1]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np

# SciKit-Learn
from sklearn.linear_model import LogisticRegression

# Plotting library
from matplotlib import pyplot
import matplotlib as mpl

# Optimization module in scipy
from scipy import optimize

#Skynet
import sklearn as skl
from sklearn.neural_network import MLPClassifier

from sklearn import preprocessing

import math

## Load Data

In [3]:
X_train = np.zeros((7868, 76))
Y_train = np.zeros((7868, 147))
X_test = np.zeros((2705, 76))
predictions = np.zeros((2705, 147))

# Load X_train
dir_string = '../data/train_feature_files'
training_files = os.listdir(dir_string)

for file in training_files:
    file_path = dir_string + '/' + file
    song_id_str = file_path.split('/')[-1].split('.')[0]
    a = np.load(file_path)
    a_summary = a['summary']
    X_train[int(song_id_str)] = a_summary
    
# Load Y_train
with open('../data/cal10k_train_data.csv', 'r') as y_train_file:
    for line in y_train_file:
        if 'id' not in line:
            features_w_id = line.strip().split(',')
            features_no_id = features_w_id[1:]
            for idx in range(len(features_no_id)):
                features_no_id[idx] = int(features_no_id[idx])
            Y_train[int(features_w_id[0])] = np.array(features_no_id)
            
# Load X_test
dir_string = '../data/test_feature_files/'
testing_files = os.listdir(dir_string)

for file in testing_files:
    file_path = dir_string + file
    song_id_str = file_path.split('/')[-1].split('.')[0]
    data = np.load(file_path)
    data_summary = data['summary']
    X_test[int(song_id_str)] = data_summary

## Clean the baby

In [4]:
where_are_NaNs = np.isnan(X_train)
X_train[where_are_NaNs] = 0
where_are_NaNs = np.isnan(X_test)
X_test[where_are_NaNs] = 0
print(X_train.shape)
print(X_test.shape)

(7868, 76)
(2705, 76)


## Process the baby

In [5]:
scaled_X = preprocessing.scale(X_train)
scaled_test = preprocessing.scale(X_test)
print(scaled_X)

[[  1.04953433e+00  -1.82717419e+00  -3.14378223e-01 ...,  -7.78459164e-01
   -6.15547973e-01   3.63916714e-02]
 [ -5.89573467e-01  -5.43698480e-01   1.26742646e+00 ...,   3.02933909e+00
    6.36986019e+00  -3.53863479e-02]
 [ -8.71579002e-01  -1.45694081e+00   8.06538601e-01 ...,  -1.39196830e+00
    7.20935543e-02   7.04298404e-01]
 ..., 
 [ -6.80810552e-01   1.28278619e+00  -6.48520049e-01 ...,   1.12104502e+00
    4.24172839e-03  -6.16879755e-01]
 [ -8.78688385e-01   4.17609241e-01  -1.75760504e-01 ...,   6.52454338e-01
   -1.75135222e-01  -5.08273063e-01]
 [ -9.54521806e-01   4.17609241e-01   4.14536114e-01 ...,   1.37187461e-01
   -1.69513068e-01  -5.52251472e-01]]


## Fit the baby

In [7]:
genre_model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(27, 2), random_state=1)
genre_model.fit(scaled_X, Y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(27, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

## Teach the robot how to love

In [8]:
predictions = genre_model.predict_proba(scaled_test)
        
print(predictions)

[[ 0.0056245   0.01083739  0.05960931 ...,  0.00736552  0.01057328
   0.00252812]
 [ 0.00207128  0.00068827  0.00083464 ...,  0.0018892   0.00353662
   0.0218879 ]
 [ 0.0061763   0.0028434   0.00632428 ...,  0.00521109  0.00703503
   0.022561  ]
 ..., 
 [ 0.01432918  0.00942668  0.03530193 ...,  0.01175108  0.01236127
   0.01981373]
 [ 0.00900983  0.02149535  0.14886359 ...,  0.01166951  0.01457515
   0.0022697 ]
 [ 0.00383073  0.00119922  0.0017344  ...,  0.00311178  0.00481954
   0.03177182]]


In [9]:
print(np.sum(predictions))

6228.28682447


## Show the world

In [10]:
file = open('../data/cal10k_test_random_submission.csv', 'r')
id_line = None
for line in file:
    if 'id' in line:
        id_line = line
ids = []
for i in range(2705):
    ids.append(int(i))
predictions = np.insert(predictions, 0, ids, axis=1)
string_predictions = []
for song_idx in range(len(predictions)):
    song = predictions[song_idx]
    song_string = []
    for genre_idx in range(len(song)):
        if genre_idx != 0:
            song_string.append(str(predictions[song_idx, genre_idx]))
        else: 
            song_string.append(str(predictions[song_idx, genre_idx]))
    string_predictions.append(song_string)
with open('../data/skynet_will_rise_with_multilabel_shit.csv', 'w') as out_file:
    out_file.write(id_line)
    for song_array in string_predictions:
        song_array[0] = str(int(float(song_array[0])))
        out_file.write(",".join(song_array) + os.linesep)

## More complex

In [11]:
X_cv_train = X_train[0:-2000]
Y_cv_train = Y_train[0:-2000]
X_cv_test = X_train[-2000:]
Y_cv_test = Y_train[-2000:]
cv_predictions = np.zeros(Y_cv_test.shape)

scaled_cv_X = preprocessing.scale(X_cv_train)
scaled_cv_test = preprocessing.scale(X_cv_test)

genre_model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(27, 2), random_state=1)
genre_model.fit(scaled_cv_X, Y_cv_train)

cv_predictions = genre_model.predict_proba(scaled_cv_test)  # Get probability that it is class=1
        
print(cv_predictions)

[[  4.70533621e-06   3.38135998e-06   5.50615417e-09 ...,   2.50640947e-05
    3.57141381e-04   5.40106541e-03]
 [  7.61445256e-07   6.56751151e-07   2.79616909e-10 ...,   6.12246417e-06
    1.56471704e-04   2.42759467e-03]
 [  8.84339963e-05   5.21006421e-05   8.45600768e-07 ...,   2.48155461e-04
    1.36202354e-03   1.58435508e-02]
 ..., 
 [  9.26271265e-04   5.80707498e-04   8.19741488e-05 ...,   1.63794763e-03
    4.06466372e-03   2.32730994e-02]
 [  2.84602872e-04   1.60201919e-04   6.82471179e-06 ...,   6.23439608e-04
    2.32873588e-03   2.25657291e-02]
 [  1.27349445e-03   7.95651274e-04   1.48029487e-04 ...,   2.10918239e-03
    4.70722008e-03   2.51398493e-02]]


In [12]:
print(cv_predictions.shape)
print(skl.metrics.roc_auc_score(Y_cv_test, cv_predictions))

(2000, 147)
0.781023172152
