In [189]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
import math

def cost_estimate(genres, y_test):
    """Return the misclassification rate of predictions against true labels.

    Parameters
    ----------
    genres : array-like
        Predicted class labels, one per test sample (1-D).
    y_test : array-like
        True class labels (1-D); must have the same shape as ``genres``.

    Returns
    -------
    float
        Fraction of positions where ``genres`` and ``y_test`` differ
        (0.0 means every prediction is correct, 1.0 means none is).

    Raises
    ------
    ValueError
        If ``genres`` and ``y_test`` do not have the same shape.
    ZeroDivisionError
        If ``y_test`` is empty.
    """
    genres = np.asarray(genres)
    y_test = np.asarray(y_test)

    # Fail loudly instead of letting NumPy broadcast mismatched shapes into
    # a comparison that no longer lines predictions up with their labels.
    if genres.shape != y_test.shape:
        raise ValueError(
            f"genres has shape {genres.shape} but y_test has shape {y_test.shape}"
        )

    m = y_test.shape[0]
    # count_nonzero counts the True entries in one vectorised pass; the
    # builtin sum() would iterate over the array element by element.
    return np.count_nonzero(genres != y_test) / m

def take_m_rows_every_n(data, m, n):
    """Split ``data`` by taking the first ``m`` rows out of every ``n``.

    Rows are walked in order in consecutive groups of ``n``. Within each
    group the first ``m`` rows go to the first output and the remaining
    ``n - m`` rows go to the second. The final group may be shorter than
    ``n``; it is split by the same rule.

    Parameters
    ----------
    data : array-like
        1-D or 2-D data; the split is along the first axis.
    m : int
        Number of rows per group sent to the first output (0 <= m <= n).
    n : int
        Group length (n > 0).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(data_first, data_second)``. Both are new arrays with the same
        dtype and trailing dimensions as ``data``.

    Raises
    ------
    ValueError
        If ``n`` is not positive or ``m`` is outside ``[0, n]``.
    """
    if n <= 0 or not 0 <= m <= n:
        raise ValueError(f"need n > 0 and 0 <= m <= n, got m={m}, n={n}")

    data = np.asarray(data)

    # Position of each row inside its group of n decides which side it
    # lands on. A mask needs no pre-computed output sizes, so row counts
    # that are not a multiple of n are handled correctly.
    in_first = (np.arange(data.shape[0]) % n) < m

    # Boolean indexing returns copies and keeps the input dtype (e.g.
    # integer labels stay integers).
    return data[in_first], data[~in_first]


if __name__ == "__main__":

    # Load the feature table. The path is relative, so the CSV is expected
    # in the notebook's working directory.
    df = pd.read_csv(r'features_30_sec.csv')

    # Feature matrix: every column except the first two and the last.
    # Slicing the columns *before* converting keeps the string columns out
    # of the array, so the result is a real float matrix instead of an
    # object-dtype array.
    x = df.iloc[:, 2:-1].to_numpy(dtype=float)
    print(f"x shape:{x.shape}")

    # The last column holds the genre name of each row.
    y_as_word = df.iloc[:, -1].to_numpy()
    print(f"y shape:{y_as_word.shape}")

    # Genre name -> integer class id (0..9).
    classes_dict = {
        "blues" : 0,
        "classical" : 1,
        "country" : 2,
        "disco" : 3,
        "hiphop" : 4,
        "jazz" : 5,
        "metal" : 6,
        "pop" : 7,
        "reggae" : 8,
        "rock" : 9,
    }

    # A genre missing from classes_dict raises KeyError here.
    y = np.array([classes_dict[genre] for genre in y_as_word])

    # First 90 rows of every 100 go to training, the remaining 10 to test.
    # NOTE(review): this is only a per-genre 90/10 split if the CSV is
    # ordered in blocks of 100 rows per genre - confirm against the data.
    x_train, x_test = take_m_rows_every_n(x, 90, 100)
    y_train, y_test = take_m_rows_every_n(y, 90, 100)

    print(f"x train shape:{x_train.shape}")
    print(f"y train shape:{y_train.shape}")
    print(f"x test shape:{x_test.shape}")
    print(f"y test shape:{y_test.shape}")
    

x shape:(1000, 57)
y shape:(1000,)
x train shape:(900, 57)
y train shape:(900,)
x test shape:(100, 57)
y test shape:(100,)


In [190]:
    # Linear kernel. LinearSVC is used rather than SVC(kernel='linear')
    # because, from what I read, it appears to be somewhat optimised.
    # Features are standardised first by the StandardScaler step.
    clf = make_pipeline(
        StandardScaler(),
        LinearSVC(random_state=0, tol=1e-5, dual=False),
    )
    clf.fit(x_train, y_train)

    # One score column per class; the best-scoring column is the prediction.
    # NOTE(review): using the column index directly as the label relies on
    # the class labels being exactly 0..9 with every class present in
    # y_train - confirm if the label encoding ever changes.
    results = clf.decision_function(x_test)
    genres = results.argmax(axis=1)
    print(genres)
    print(cost_estimate(genres, y_test))

[0 2 0 0 0 0 2 2 6 5 1 1 1 1 1 1 1 1 1 1 2 2 6 2 2 9 9 2 2 0 3 3 9 3 3 6 6
 6 4 4 4 6 4 8 4 6 8 4 4 4 5 5 5 7 5 5 5 5 5 5 5 9 2 6 6 4 4 6 6 4 7 7 7 7
 7 7 7 7 3 7 7 8 8 2 2 7 8 8 9 6 5 6 6 9 6 2 9 6 2 6]
0.41


In [191]:
    # gaussian kernel
    #
    # The one-vs-rest classifier is handed to the pipeline unfitted.
    # Fitting it inline on `X` would raise NameError on a fresh kernel
    # (no `X` is defined) and, with a stale `X` around, would train on
    # unscaled data that includes the test rows. clf.fit() below fits the
    # scaler and the classifier together on the training split only.
    clf = make_pipeline(StandardScaler(), OneVsRestClassifier(SVC(kernel='rbf')))
    clf.fit(x_train, y_train)

    # One score column per class; the best-scoring column is the prediction.
    # NOTE(review): using the column index directly as the label relies on
    # the class labels being exactly 0..9 with every class present in
    # y_train - confirm if the label encoding ever changes.
    results = clf.decision_function(x_test)

    genres = results.argmax(axis=1)
    # Not read in this cell; kept in case a later cell uses it.
    m = y_test.shape[0]
    print(genres)
    print(cost_estimate(genres, y_test))
    # better but still pretty bad..

[0 2 0 1 0 0 0 2 6 5 1 1 1 1 1 1 1 1 1 5 2 2 6 2 2 2 2 2 2 0 3 3 9 3 3 3 4
 3 3 3 4 4 4 6 8 6 2 4 4 4 5 5 5 5 5 5 5 5 5 5 9 0 0 6 6 6 6 6 6 6 7 7 7 7
 7 7 7 7 7 7 8 8 8 8 2 2 8 5 9 6 5 6 6 9 6 2 9 6 2 9]
0.29


(900, 57)
(100, 57)
