In [12]:

from typing import Tuple, Union
import sys
sys.path.append('../')
from ds4400_final_project.dataset.constants import DATASET_FOLDER
from pathlib import Path
from typing import Tuple, Dict
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
import numpy as np

number = Union[float, int]

In [13]:
def load_data_from_file(csv_filename: str) -> Tuple[np.ndarray, np.ndarray, Dict[int, str], Dict[str, int]]:
	""" Load the CSV file from the dataset folder. """
	file = str(Path(DATASET_FOLDER) / csv_filename)
	features_list = np.genfromtxt(file, dtype=None, encoding=None, delimiter=",", skip_header=1, usecols=range(2, 60))
	features = np.array([list(x) for x in features_list])

	# Create a mapping between a numeric value and genre
	index_genre_map = {i: genre for i, genre in enumerate(np.unique(features[:,-1]))}
	genre_index_map = {value: key for key, value in index_genre_map.items()}

	# split the inputs and their labels
	X = features[:,:57]
	y = np.array([genre_index_map[genre] for genre in features[:,-1]])

	# normalize the data
	X = normalize(X, axis=0)

	return X, y, index_genre_map, genre_index_map

In [14]:
def train_evaluate_classifier(
	csv_filename: str, 
	kernel: str,
	test_size: float = 0.33, 
	random_state: int = 42,
) -> Tuple[float, float, Dict[str, int]]:

	# import the data from the seconds features CSV
	X, y, index_genre_map, genre_index_map = load_data_from_file(csv_filename)

	# split all the data into training and testing sets
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

	# create and train the classifier
	classifier = svm.SVC(kernel=kernel)
	classifier.fit(X_train, y_train)
	
	# evaluate the model on training data
	train_pred = classifier.predict(X_train)
	train_failed = np.where(np.not_equal(y_train.ravel(), train_pred))
	y_train_failed = y_train[train_failed]

	# compute the accuracy / error on the training data
	train_accuracy = classifier.score(X_train, y_train)

	# evaluate the model on testing data
	test_pred = classifier.predict(X_test)
	test_failed = np.where(np.not_equal(y_test.ravel(), test_pred))
	y_test_failed = y_test[test_failed]

	# compute the accuracy / error on the testing data
	test_accuracy = classifier.score(X_test, y_test)

	failed_count: Dict[str, int] = {genre: 0 for genre in genre_index_map.keys()}

	for y_failed in [*y_train_failed, *y_test_failed]:
		genre = index_genre_map[y_failed]
		failed_count[genre] += 1

	return train_accuracy, test_accuracy, failed_count

In [21]:
def print_results(
	title:str, 
	train_accuracy: float, 
	test_accuracy: float, 
	failed_count: Dict[str, int]
) -> None:
	total_incorrect = sum(failed_count.values())

	print("=="*26)
	print(title)
	print("=="*26)
	print(f"Train accuracy: {round(train_accuracy * 100, 3)}")
	print(f"Train error:    {round((1-train_accuracy) * 100, 3)}")
	print(f"Test accuracy:  {round(test_accuracy * 100, 3)}")
	print(f"Test error:     {round((1-test_accuracy) * 100, 3)}")
	print()
	print("Genre".ljust(15), "| # of Incorrect | % of All Incorrect")
	print("--"*26)

	failed = sorted([(genre, count) for genre, count in failed_count.items()], key=lambda t: t[1], reverse=True)
	for i, (genre, failed_count) in enumerate(failed):
		print(f"[{i+1:02}]", genre.upper().ljust(10), "|", str(failed_count).ljust(14), f"| {round((failed_count / total_incorrect) * 100, 1)}%")
	print("\n")

In [17]:
kernels = {"linear", "poly", "rbf", "sigmoid"}

results_3s: Dict[str, Tuple] = {}
results_30s: Dict[str, Tuple] = {}

for kernel in kernels:
	# import the data from the 3 seconds features CSV
	train_accuracy_3s, test_accuracy_3s, failed_count_3s = train_evaluate_classifier("features_3_sec.csv", kernel=kernel)
	results_3s[kernel] = (train_accuracy_3s, test_accuracy_3s, failed_count_3s)
	
	# import the data from the 30 seconds features CSV
	train_accuracy_30s, test_accuracy_30s, failed_count_30s = train_evaluate_classifier("features_30_sec.csv", kernel=kernel)
	results_30s[kernel] = (train_accuracy_30s, test_accuracy_30s, failed_count_30s)

In [22]:
for kernel, (train_accuracy_3s, test_accuracy_3s, failed_count_3s) in results_3s.items():
	print_results(f"3 Second Clips ({kernel})", train_accuracy_3s, test_accuracy_3s, failed_count_3s)


for kernel, (train_accuracy_30s, test_accuracy_30s, failed_count_30s) in results_30s.items():
	print_results(f"30 Second Clips ({kernel})", train_accuracy_30s, test_accuracy_30s, failed_count_30s)

3 Second Clips (sigmoid)
Train accuracy: 41.147
Train error:    58.853
Test accuracy:  43.1
Test error:     56.9

Genre           | # of Incorrect | % of All Incorrect
----------------------------------------------------
[01] ROCK       | 886            | 15.2%
[02] DISCO      | 799            | 13.7%
[03] COUNTRY    | 683            | 11.7%
[04] BLUES      | 650            | 11.2%
[05] REGGAE     | 637            | 11.0%
[06] HIPHOP     | 629            | 10.8%
[07] JAZZ       | 603            | 10.4%
[08] POP        | 456            | 7.8%
[09] METAL      | 237            | 4.1%
[10] CLASSICAL  | 235            | 4.0%


3 Second Clips (linear)
Train accuracy: 27.387
Train error:    72.613
Test accuracy:  26.297
Test error:     73.703

Genre           | # of Incorrect | % of All Incorrect
----------------------------------------------------
[01] BLUES      | 1000           | 13.7%
[02] ROCK       | 998            | 13.7%
[03] CLASSICAL  | 997            | 13.7%
[04] HIPHOP     | 996  