In [39]:
# used for manipulating directory paths
import os

import math

# Scientific and vector computation for python
import numpy as np

# SciKit-Learn
import sklearn as skl
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Plotting library
from matplotlib import pyplot
import matplotlib as mpl

# Optimization module in scipy
from scipy import optimize


## Load the Data

In [40]:
# Pre-allocated containers. The row index doubles as the song id: every loader
# below writes a song's data into the row named by its integer id.
X_train = np.zeros((7868, 76))
Y_train = np.zeros((7868, 147))
X_test = np.zeros((2705, 76))
predictions = np.zeros((2705, 147))


def _load_summary_features(feature_dir, target):
    """Copy the 'summary' entry of every feature file in `feature_dir` into `target`.

    The part of each file name before the first '.' is taken as the song id and
    selects the row of `target` that is overwritten.

    Parameters
    ----------
    feature_dir : str
        Directory holding one feature file per song.
    target : numpy.ndarray
        Array that is filled in place, indexed by song id.

    Returns
    -------
    list of str
        The raw directory listing, including any entries that were skipped.
    """
    file_names = os.listdir(feature_dir)
    for file_name in file_names:
        song_id_str = file_name.split('.')[0]
        if not song_id_str.isdigit():
            # Stray entries (e.g. hidden files such as '.DS_Store') have no
            # numeric id; skip them instead of crashing on int('').
            continue
        file_path = os.path.join(feature_dir, file_name)
        # Assumes the feature files are .npz archives (np.load then returns a
        # closable NpzFile) - TODO confirm. The context manager releases the
        # underlying file handle, which otherwise stays open once per song.
        with np.load(file_path) as features:
            target[int(song_id_str)] = features['summary']
    return file_names


# Load X_train
dir_string = '../data/train_feature_files'
training_files = _load_summary_features(dir_string, X_train)

# Load Y_train
with open('../data/cal10k_train_data.csv', 'r') as y_train_file:
    for line in y_train_file:
        line = line.strip()
        # Skip blank lines and the header row (any row mentioning 'id').
        if not line or 'id' in line:
            continue
        features_w_id = line.split(',')
        # First field is the song id, the remaining fields are the integer labels.
        Y_train[int(features_w_id[0])] = [int(value) for value in features_w_id[1:]]

# Load X_test
dir_string = '../data/test_feature_files/'
testing_files = _load_summary_features(dir_string, X_test)

## Replace NaN Feature Values with 0 (No Column Is Actually Removed)

In [41]:
# Zero out every NaN entry, in place, in both feature matrices.
for feature_matrix in (X_train, X_test):
    feature_matrix[np.isnan(feature_matrix)] = 0

# Shapes are unchanged by the cleanup; print them as a sanity check.
for feature_matrix in (X_train, X_test):
    print(feature_matrix.shape)

(7868, 76)
(2705, 76)


## Do PCA (w/ K keeping 99% variance)

In [42]:
# Scale the Data for Train and Test set
X_train_scaled = preprocessing.scale(X_train)
X_test_scaled = preprocessing.scale(X_test)

# Perform Dimensionality reduction because fuckkkkkkkkk some of these dumbass features
pca = PCA(n_components=0.99, svd_solver='full')
pca.fit(X_train_scaled)
X_train_approx = pca.transform(X_train_scaled)
print(X_train_approx.shape)
X_test_approx = pca.transform(X_test_scaled)
print(X_test_approx.shape)

(7868, 62)
(2705, 62)


## Train 147 Logistic Regression Models (One per Genre)

In [43]:
# One independent binary classifier per label column of Y_train.
# fit() returns the estimator itself, so each fitted model can be collected directly.
genre_models = [
    LogisticRegression(solver='liblinear').fit(X_train_approx, Y_train[:, genre_idx])
    for genre_idx in range(147)
]

print(len(genre_models))


147


## Predict on Testing Data for all 147 models

In [44]:
# Column 1 of predict_proba is taken as the probability of class=1 for each song.
positive_probs = [
    genre_models[genre_idx].predict_proba(X_test_approx)[:, 1]
    for genre_idx in range(147)
]
# Fill the pre-allocated array in place: one column per genre model.
predictions[:, :147] = np.column_stack(positive_probs)

print(predictions)


[[  2.16169318e-03   5.47220032e-04   1.46417003e-01 ...,   8.52911028e-03
    2.70904474e-03   1.16139427e-03]
 [  1.98945135e-03   1.17831297e-04   2.76674609e-03 ...,   2.45018212e-04
    7.30310565e-04   2.04680447e-02]
 [  5.38803836e-02   2.01536354e-03   4.15018060e-02 ...,   4.28598186e-03
    8.86172419e-03   3.80692105e-03]
 ..., 
 [  1.95205435e-03   2.14800919e-02   6.76201201e-03 ...,   4.69014523e-03
    4.17695736e-03   6.06008084e-03]
 [  6.01795203e-04   2.38817520e-02   1.51914567e-02 ...,   1.24282922e-02
    5.35448848e-04   2.07086901e-04]
 [  6.25550439e-03   1.28913519e-03   7.39441591e-03 ...,   1.39868166e-03
    1.39552589e-03   7.73781788e-02]]


## Write to a CSV for Submission

In [45]:
# Reuse the header row of the sample submission so the output has the same columns.
# As before, the last line mentioning 'id' wins; the file is now closed afterwards.
id_line = None
with open('../data/cal10k_test_random_submission.csv', 'r') as sample_file:
    for line in sample_file:
        if 'id' in line:
            id_line = line
if id_line is None:
    raise ValueError("no header line containing 'id' found in the sample submission file")

# `predictions` is deliberately left untouched (no id column is inserted into it),
# so this cell can be re-run without producing a file with duplicated id columns.
with open('../data/fuck_this_project_sometimes_man.csv', 'w') as out_file:
    # Normalise the header's line ending in case it was the file's last line.
    out_file.write(id_line.rstrip('\r\n') + '\n')
    for song_id, song_probs in enumerate(predictions):
        # Row layout: integer song id (== row index), then one probability per genre.
        fields = [str(song_id)]
        fields.extend(str(prob) for prob in song_probs)
        # Text mode already translates '\n' to the platform line ending; writing
        # os.linesep here would yield '\r\r\n' on Windows.
        out_file.write(",".join(fields) + '\n')

## Evaluate on a Held-Out Half of the Training Data

In [28]:
# Hold-out evaluation: fit on the first half of the training rows and predict the
# second half.
# NOTE(review): this uses X_train directly, not the scaled/PCA-reduced
# X_train_approx that the submission models are trained on - confirm that the
# reported score is meant to describe the raw-feature model.
half_X_train_len = len(X_train) // 2  # integer division: always a valid slice index
half_Y_train_len = len(Y_train) // 2
X_cv_train = X_train[:half_X_train_len]
Y_cv_train = Y_train[:half_Y_train_len]
X_cv_test = X_train[half_X_train_len:]
Y_cv_test = Y_train[half_Y_train_len:]
cv_predictions = np.zeros(Y_cv_test.shape)

# Kept under their own name so the full-data models stored in `genre_models`
# are not overwritten by these half-data models when the notebook is re-run.
cv_genre_models = []
for i in range(147):  # For each genre
    cv_genre_model = LogisticRegression(solver='liblinear')
    cv_genre_model.fit(X_cv_train, Y_cv_train[:, i])
    cv_genre_models.append(cv_genre_model)

for i in range(147):
    cv_prediction_prob = cv_genre_models[i].predict_proba(X_cv_test)
    cv_predictions[:, i] = cv_prediction_prob[:, 1]  # Get probability that it is class=1

print(cv_predictions)

[[  1.90413028e-03   6.35661788e-06   3.51711131e-02 ...,   2.01218400e-03
    9.37349362e-06   3.63976344e-02]
 [  4.28641394e-04   2.88837216e-05   2.77470180e-02 ...,   1.13898490e-03
    1.06090562e-03   4.42270162e-04]
 [  9.99164611e-04   4.72838828e-06   1.50059174e-02 ...,   4.03653496e-03
    2.85722605e-04   2.91386925e-03]
 ..., 
 [  4.34333501e-04   6.37673799e-05   1.74543085e-02 ...,   1.19922678e-05
    3.07606572e-03   1.64472038e-02]
 [  1.90203732e-04   9.45692211e-05   1.16950095e-02 ...,   5.62015572e-04
    6.98205696e-04   1.40019376e-01]
 [  2.58452470e-03   3.17083508e-04   2.43683935e-02 ...,   4.19116767e-04
    5.10110380e-03   4.38808959e-02]]


In [32]:
print(cv_predictions.shape)
# roc_auc_score is imported explicitly at the top of the notebook: `skl.metrics`
# only resolves when some other import happens to have loaded that submodule.
# The score is computed over all label columns of the held-out half at once.
print(roc_auc_score(Y_cv_test, cv_predictions))

(3934, 147)
0.851911722097
