In [8]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np

# SciKit-Learn
from sklearn.linear_model import LogisticRegression

# Plotting library
from matplotlib import pyplot
import matplotlib as mpl

# Optimization module in scipy
from scipy import optimize



## Load the Data

In [9]:
# Dataset dimensions. Rows of every array are indexed by integer song id.
N_TRAIN_SONGS = 7868
N_TEST_SONGS = 2705
N_FEATURES = 76
N_GENRES = 147


def _load_summaries(feature_dir, out):
    """Copy the 'summary' array of every feature file in feature_dir into out.

    Parameters
    ----------
    feature_dir : str
        Directory whose entries are named ``<song_id>.<extension>``.
    out : numpy.ndarray
        Array filled in place; row ``int(song_id)`` receives that file's
        'summary' array.

    Notes
    -----
    Entries whose name does not start with a decimal song id (for example
    ``.DS_Store`` or ``.ipynb_checkpoints``) are skipped, because
    ``os.listdir`` returns every directory entry and ``int()`` would raise
    on them.
    """
    # Sorted so the processing order does not depend on the file system.
    for file_name in sorted(os.listdir(feature_dir)):
        song_id_str = file_name.split('.')[0]
        if not song_id_str.isdecimal():
            continue
        # The context manager closes the archive's file handle immediately
        # instead of leaving it to the garbage collector.
        with np.load(os.path.join(feature_dir, file_name)) as archive:
            out[int(song_id_str)] = archive['summary']


X_train = np.zeros((N_TRAIN_SONGS, N_FEATURES))
Y_train = np.zeros((N_TRAIN_SONGS, N_GENRES))
X_test = np.zeros((N_TEST_SONGS, N_FEATURES))
predictions = np.zeros((N_TEST_SONGS, N_GENRES))

# Load X_train
_load_summaries('../data/train_feature_files', X_train)

# Load Y_train: each data row is "<song_id>,<label>,<label>,..."
with open('../data/cal10k_train_data.csv', 'r') as y_train_file:
    for line in y_train_file:
        line = line.strip()
        # Skip blank lines (they would crash int('')) and the header row,
        # which is recognised by the 'id' column name it contains.
        if not line or 'id' in line:
            continue
        fields = line.split(',')
        Y_train[int(fields[0])] = [int(label) for label in fields[1:]]

# Load X_test
_load_summaries('../data/test_feature_files/', X_test)

## Remove Column 2 (Unwanted Feature)

In [10]:
# Drop the feature at column index 2 from the training and test matrices alike
# so both keep the same column layout, then report the resulting shapes.
# NOTE: both names are rebound to the reduced arrays, so running this cell a
# second time removes one more column.
X_train, X_test = (
    np.delete(feature_matrix, 2, axis=1)
    for feature_matrix in (X_train, X_test)
)
print('\n'.join(str(feature_matrix.shape) for feature_matrix in (X_train, X_test)))

(7868, 75)
(2705, 75)


## Train One Logistic Regression Model per Genre (147 Models)

In [11]:
# Fit one independent binary logistic-regression classifier per genre.
# Column i of Y_train holds the labels for genre i, so the number of models
# is taken from Y_train itself rather than repeated as a literal.
n_genres = Y_train.shape[1]

genre_models = []
for genre_idx in range(n_genres):
    # random_state is pinned so that any data shuffling performed by the
    # solver is repeatable from run to run.
    genre_model = LogisticRegression(solver='liblinear', random_state=0)
    genre_model.fit(X_train, Y_train[:, genre_idx])
    genre_models.append(genre_model)

print(len(genre_models))


147


## Predict on Testing Data for all 147 models

In [12]:
# Fill column i of `predictions` with model i's probability that each test
# song belongs to genre i. Only predict_proba is needed; the hard class
# predictions were never used, so they are no longer computed.
for genre_idx, genre_model in enumerate(genre_models):
    class_probabilities = genre_model.predict_proba(X_test)
    predictions[:, genre_idx] = class_probabilities[:, 1]  # Get probability that it is class=1

print(predictions)


[[  1.24762070e-03   3.47815595e-04   6.55657611e-02 ...,   1.20681305e-02
    2.16764134e-03   7.96490741e-04]
 [  1.63571679e-03   8.30661210e-06   5.76851095e-03 ...,   1.50735649e-04
    2.26812571e-04   8.49083043e-03]
 [  1.91250889e-02   6.58768310e-04   6.19809850e-02 ...,   6.28433550e-03
    5.16220674e-03   3.91267166e-03]
 ..., 
 [  1.17833693e-03   2.66751034e-02   7.32566917e-03 ...,   9.94670735e-03
    3.98396667e-03   8.75296968e-03]
 [  2.34765203e-04   3.52172337e-02   1.03609192e-02 ...,   7.83574398e-03
    3.32068591e-04   2.52458384e-05]
 [  2.37209948e-03   2.74298036e-04   1.87085039e-02 ...,   2.23590329e-03
    9.92621333e-04   7.69675357e-02]]


## Write to a CSV for Submission

In [14]:
# Reuse the header row of the sample submission: the line that names the 'id'
# column. The file is opened in a `with` block so its handle is always closed.
id_line = None
with open('../data/cal10k_test_random_submission.csv', 'r') as sample_file:
    for line in sample_file:
        if 'id' in line:
            id_line = line
if id_line is None:
    raise ValueError("no header line containing 'id' found in the sample submission file")

# Row i of `predictions` belongs to song id i (X_test was filled by song id and
# the models predict row by row), so the id column is simply the row index.
# The id is written as its own leading field; `predictions` is not modified,
# which keeps this cell safe to re-run and keeps every probability intact.
with open('../data/first_attempt_d_is_a_dumb_bird.csv', 'w') as out_file:
    out_file.write(id_line.rstrip('\r\n') + '\n')
    for song_id, song_probabilities in enumerate(predictions):
        row = [str(song_id)]
        row.extend(str(probability) for probability in song_probabilities)
        # '\n' rather than os.linesep: a text-mode file already translates
        # '\n' to the platform line ending.
        out_file.write(",".join(row) + '\n')