In [15]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
import math
import librosa
import os


# good starting point but will use svc score instead
def accuracy_estimate(genres, y_test):
    # quick estimate
    # lower is better
    m = y_test.shape[0]
    cost = sum((genres!=y_test))/m
    return cost

# does the trick but is easier to do in pandas
# no longer using this function
def take_m_rows_every_n(data, m, n):
    # returns 2 1d or 2d matrices 
    # first one contains the first m out of every n datapoints (first 90 out of each 100)
    # second one contains the remainingi n-m out of every n datapoints (last 10 out of each 100)
    # bound to be a better way to do this... some kind of pandas mask?
    size = data.shape[0]
    train_size = math.ceil(size * m / n)   # if not exact division train on 1 more; test on 1 less
    test_size = math.floor(size * (n-m) / n)
    
    if len(data.shape) > 1:
        data_first = np.empty((train_size, data.shape[1])) 
        data_second = np.empty((test_size, data.shape[1]))
    else:
        data_first = np.empty((train_size))
        data_second = np.empty((test_size))
    i=0
    j=0
    for idx, row in enumerate(data):
        if idx%n < m:
            data_first[i] = row
            i+=1
        else:
            data_second[j] = row
            j+=1
    return data_first, data_second


# extract features from csv; code genres onto numbers
def read_csv_extract_features(name):
    df = pd.read_csv (name)
    
#     x = df.to_numpy()[:,2:-1] # all rows; don't care about the filename, length, label
#     y_as_word = df.to_numpy()[:,-1].T  # label only

    
    classes_dict = {
        "blues" : 0,
        "classical" : 1,
        "country" : 2,
        "disco" : 3,
        "hiphop" : 4,
        "jazz" : 5,
        "metal" : 6,
        "pop" : 7,
        "reggae" : 8,
        "rock" : 9,
    }

#   y = np.array([classes_dict[genre] for genre in y_as_word])

#   x_train, x_test = take_m_rows_every_n(x, 90, 100)
#   y_train, y_test = take_m_rows_every_n(y, 90, 100)
    
    x_train = df[df.index%100 <=90].to_numpy()[:,2:-1]
    x_test = df[df.index%100 >90].to_numpy()[:,2:-1]
    
    y_tr = df[df.index%100 <=90].to_numpy()[:,-1].T
    y_tst = df[df.index%100 >90].to_numpy()[:,-1].T
    
    y_train = np.array([classes_dict[genre] for genre in y_tr])
    y_test = np.array([classes_dict[genre] for genre in y_tst])
    
    return x_train, y_train, x_test, y_test


if __name__ == "__main__":
    # I skip jazz 0054 to have the same datapoints on both 20 mel features and using the original 30 sec features

    x_train, y_train, x_test, y_test = read_csv_extract_features(r'features_30_sec.csv')

In [16]:
    # linear kernel, from what I read appears to be somewhat optimised
    
    clf = make_pipeline(StandardScaler(),
                        LinearSVC(random_state=0, tol=1e-5, dual=False, C=0.01))
    clf.fit(x_train, y_train)

    score = clf.score(x_train, y_train)
    print("accuracy score linear kernel; original features; higher is better")
    print(f"training score is {score}")    
    score = clf.score(x_test, y_test)
    print(f"test score is {score}")    

accuracy score linear kernel; original features; higher is better
training score is 0.7604395604395604
test score is 0.6333333333333333


In [17]:
    # gaussian, ovr by default
    # gamma C optimisation seems to be beyond the scope;
    # essentialy useless without covariance samples
    clf = make_pipeline(StandardScaler(),
                        SVC(kernel="rbf", C=1, gamma=1/30))
    clf.fit(x_train, y_train)
    
    print("accuracy score gaussian kernel; original features; higher is better")    
    score = clf.score(x_train, y_train)
    print(f"training score is {score}")    
    score = clf.score(x_test, y_test)
    print(f"test score is {score}")   
    # overfitting because of my gamma choice; better than the linear kernel tho

accuracy score gaussian kernel; original features; higher is better
training score is 0.9516483516483516
test score is 0.7777777777777778


In [20]:
    # librosa install is broken on 3.10 atm :/
    # extract features from songs into csv
    rootdir = 'Data\\genres_original\\'
    csv_file = "20melfeatures.csv"
    j = 0
    lst = []
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            # librosa fails at jazz0054
            try:
                y, sr = librosa.load(subdir+"\\" +file)
            except:
                print(f"failed extraction on {file}")
                continue
            feat = np.mean(librosa.feature.mfcc(y=y, sr=sr), axis=1)
            # filename; length of file, features,   label
            row = [file, y.size] + feat.tolist() + [file.split(".")[0]] # make the same format as previous csv file
            lst.append(row)

    df = pd.DataFrame(lst)
    df.to_csv(csv_file, index = False)

In [21]:
    x_train_mel, y_train_mel, x_test_mel, y_test_mel = read_csv_extract_features(csv_file)

In [22]:
    clf = make_pipeline(StandardScaler(),
                        SVC(kernel="rbf", C=1))
    clf.fit(x_train_mel, y_train_mel)
    
    print("accuracy score gaussian kernel; 20 mel features; higher is better")    
    score = clf.score(x_train_mel, y_train_mel)
    print(f"training score is {score}")    
    score = clf.score(x_test_mel, y_test_mel)
    print(f"test score is {score}")  
    # not particularly good by themselves; 
    # makes sense because these features are a subset of the original features in the 30 sec CSV file

accuracy score gaussian kernel; 20 mel features; higher is better
training score is 0.7703296703296704
test score is 0.5280898876404494
