In [46]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np

import math

# SciKit-Learn
import sklearn as skl
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.decomposition import PCA


# Plotting library
from matplotlib import pyplot
import matplotlib as mpl

# Optimization module in scipy
from scipy import optimize

## Load the Data

In [47]:
X_train = np.zeros((7868, 76))
Y_train = np.zeros((7868, 147))
X_test = np.zeros((2705, 76))
predictions = np.zeros((2705, 147))

# Load X_train
dir_string = '../data/train_feature_files'
training_files = os.listdir(dir_string)

for file in training_files:
    file_path = dir_string + '/' + file
    song_id_str = file_path.split('/')[-1].split('.')[0]
    a = np.load(file_path)
    a_summary = a['summary']
    X_train[int(song_id_str)] = a_summary
    
# Load Y_train
with open('../data/cal10k_train_data.csv', 'r') as y_train_file:
    for line in y_train_file:
        if 'id' not in line:
            features_w_id = line.strip().split(',')
            features_no_id = features_w_id[1:]
            for idx in range(len(features_no_id)):
                features_no_id[idx] = int(features_no_id[idx])
            Y_train[int(features_w_id[0])] = np.array(features_no_id)
            
# Load X_test
dir_string = '../data/test_feature_files/'
testing_files = os.listdir(dir_string)

for file in testing_files:
    file_path = dir_string + file
    song_id_str = file_path.split('/')[-1].split('.')[0]
    data = np.load(file_path)
    data_summary = data['summary']
    X_test[int(song_id_str)] = data_summary

## Clean the Data

In [48]:
where_are_NaNs = np.isnan(X_train)
X_train[where_are_NaNs] = 0
where_are_NaNs = np.isnan(X_test)
X_test[where_are_NaNs] = 0

## Scale the Data

In [49]:
# scaled_X_train = preprocessing.scale(X_train)
# scaled_X_test = preprocessing.scale(X_test)

## Dimensionality Reduction (by Hand)

In [50]:
feature_indices = list()
feature_indices.append(1)
for i in range(3,15):
    feature_indices.append(i)
for i in range(27,40):
    feature_indices.append(i)
feature_indices.append(53)
for i in range(56,63):
    feature_indices.append(i)
feature_indices.append(70)
feature_indices.append(73)
X_train_tempo_chroma_mfcc = X_train[:, feature_indices]
X_test_tempo_chroma_mfcc = X_test[:, feature_indices]

## Train that Shit

In [51]:
genre_models = list()
for i in range(147):  # For each genre
    genre_model = LogisticRegression(solver='liblinear')
    genre_model.fit(X_train_tempo_chroma_mfcc, Y_train[:,i])
    genre_models.append(genre_model)
    
print(len(genre_models))

147


## Predict Some Things

In [52]:
for i in range(147):
    genre_prediction_prob = genre_models[i].predict_proba(X_test_tempo_chroma_mfcc)
    predictions[:, i] = genre_prediction_prob[:,1]  # Get probability that it is class=1
        
print(predictions)

[[  7.44797618e-03   4.50197360e-03   4.26336144e-02 ...,   2.03733491e-02
    5.63474908e-03   1.20326136e-03]
 [  1.87900928e-03   1.43071509e-05   1.07504938e-02 ...,   6.78784935e-04
    2.49255810e-04   1.03580289e-02]
 [  2.53776819e-03   4.98069435e-04   3.81272026e-02 ...,   2.85375914e-03
    2.49741263e-03   4.08414996e-03]
 ..., 
 [  5.86940679e-03   9.76081769e-03   2.38020817e-02 ...,   6.96657491e-03
    2.45944980e-03   3.72161250e-02]
 [  4.07595004e-02   2.61308736e-02   2.18998357e-02 ...,   2.21267362e-02
    4.28870188e-03   1.15018536e-03]
 [  5.81545199e-03   5.15934896e-04   3.83228284e-02 ...,   3.07788717e-03
    2.15716867e-03   5.75399530e-02]]


## Write to a File with a Great Name

In [53]:
print(predictions.shape)
file = open('../data/cal10k_test_random_submission.csv', 'r')
id_line = None
for line in file:
    if 'id' in line:
        id_line = line
ids = []
for i in range(2705):
    ids.append(int(i))
predictions = np.insert(predictions, 0, ids, axis=1)
string_predictions = []
for song_idx in range(len(predictions)):
    song = predictions[song_idx]
    song_string = []
    for genre_idx in range(len(song)):
        if genre_idx != 0:
            song_string.append(str(predictions[song_idx, genre_idx]))
        else: 
            song_string.append(str(predictions[song_idx, genre_idx]))
    string_predictions.append(song_string)
with open('../data/fuck_some_of_those_columns_tempo_chroma_mfcc_are_cool_tho.csv', 'w') as out_file:
    out_file.write(id_line)
    for song_array in string_predictions:
        song_array[0] = str(int(float(song_array[0])))
        out_file.write(",".join(song_array) + os.linesep)

(2705, 147)


## Evaluating this shit

In [54]:
X_cv_train = X_train_tempo_chroma_mfcc[:-1500]
Y_cv_train = Y_train[:-1500]
X_cv_test = X_train_tempo_chroma_mfcc[-1500:]
Y_cv_test = Y_train[-1500:]
cv_predictions = np.zeros(Y_cv_test.shape)

genre_models = list()
for i in range(147):  # For each genre
    genre_model = LogisticRegression(solver='liblinear')
    genre_model.fit(X_cv_train, Y_cv_train[:,i])
    genre_models.append(genre_model)


for i in range(147):
    genre_prediction_prob = genre_models[i].predict_proba(X_cv_test)
    cv_predictions[:, i] = genre_prediction_prob[:,1]  # Get probability that it is class=1
        
print(cv_predictions)

[[ 0.00131303  0.00038539  0.01094618 ...,  0.00113878  0.00152891
   0.00449887]
 [ 0.00099141  0.0016229   0.04806449 ...,  0.0009814   0.00200187
   0.05501445]
 [ 0.00236326  0.00483812  0.03486454 ...,  0.00609434  0.00733406
   0.00367785]
 ..., 
 [ 0.00177691  0.0003273   0.03175369 ...,  0.00128652  0.00465786
   0.0425281 ]
 [ 0.00073046  0.00037186  0.01049603 ...,  0.00235111  0.00165572
   0.12425217]
 [ 0.0021762   0.00106096  0.03207204 ...,  0.00131827  0.00697672
   0.00972884]]


In [55]:
print(skl.metrics.roc_auc_score(Y_cv_test, cv_predictions))

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.