In [34]:
from typing import Dict, List, Optional, Union, Tuple, Callable
import sys
sys.path.append('../')
from ds4400_final_project.dataset.constants import DATASET_FOLDER
from pathlib import Path
from typing import Tuple, Dict
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
import numpy as np
from collections import defaultdict

# Type alias for a value that may be either a float or an int.
number = Union[float, int]

In [27]:
def load_data_from_file(csv_filename: str) -> Tuple[np.ndarray, np.ndarray, Dict[int, str], Dict[str, int]]:
	"""Load a features CSV from the dataset folder and prepare it for training.

	The header row is skipped and columns 2 through 59 are read. The last of
	those columns is treated as the genre label; every column before it is
	treated as a numeric feature.

	Args:
		csv_filename: Name of the CSV file, resolved relative to DATASET_FOLDER.

	Returns:
		A tuple ``(X, y, index_genre_map, genre_index_map)``:
		  * ``X`` - float feature matrix, passed through sklearn's ``normalize``
		    along axis 0 (i.e. each feature column is scaled independently).
		  * ``y`` - integer class index for every row of ``X``.
		  * ``index_genre_map`` - class index -> genre name.
		  * ``genre_index_map`` - genre name -> class index.
	"""
	file = str(Path(DATASET_FOLDER) / csv_filename)
	records = np.genfromtxt(file, dtype=None, encoding=None, delimiter=",", skip_header=1, usecols=range(2, 60))

	# atleast_1d keeps a file that holds a single data row iterable
	rows = [list(record) for record in np.atleast_1d(records)]

	# Keep the label strings and the numeric features in separate arrays.
	# Packing both into one array would coerce every feature to a string.
	labels = np.array([row[-1] for row in rows])
	X = np.array([row[:-1] for row in rows], dtype=float)

	# Create a mapping between a numeric value and genre
	# (np.unique returns the genres sorted, so the indices are deterministic)
	index_genre_map = {i: genre for i, genre in enumerate(np.unique(labels))}
	genre_index_map = {value: key for key, value in index_genre_map.items()}

	y = np.array([genre_index_map[genre] for genre in labels])

	# normalize the data
	# NOTE(review): the scaling is computed over the whole file, i.e. before
	# any train/test split a caller performs - confirm this is intended.
	X = normalize(X, axis=0)

	return X, y, index_genre_map, genre_index_map

In [42]:
def train_evaluate_classifier(csv_filename: str, test_size: float = 0.33, random_state: int = 42) -> Tuple[float, float, Dict[str, int]]:
	"""Train an SVM genre classifier on one features CSV and measure its accuracy.

	Args:
		csv_filename: Features CSV to load through ``load_data_from_file``.
		test_size: Passed to ``train_test_split``; the share of samples held out
			for testing.
		random_state: Seed passed to ``train_test_split`` so the split is
			reproducible.

	Returns:
		A tuple ``(train_accuracy, test_accuracy, failed_count)``. The two
		accuracies are fractions in ``[0, 1]``. ``failed_count`` maps every genre
		name to the number of samples of that genre that were misclassified,
		counted over the training and the testing set combined.
	"""
	# import the data from the features CSV
	X, y, index_genre_map, genre_index_map = load_data_from_file(csv_filename)

	# split all the data into training and testing sets
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

	# create and train the classifier
	classifier = svm.SVC()
	classifier.fit(X_train, y_train)

	# Predict once per split; both the accuracy and the list of misses are
	# derived from these predictions instead of predicting a second time.
	train_pred = classifier.predict(X_train)
	test_pred = classifier.predict(X_test)

	# accuracy = fraction of samples whose prediction matches the true label
	train_accuracy = float(np.mean(train_pred == y_train))
	test_accuracy = float(np.mean(test_pred == y_test))

	# true labels of every misclassified sample, training misses first
	missed_labels = np.concatenate((y_train[train_pred != y_train], y_test[test_pred != y_test]))

	# start every genre at zero so genres without any miss are still reported
	failed_count: Dict[str, int] = {genre: 0 for genre in genre_index_map}
	for label_index in missed_labels:
		failed_count[index_genre_map[int(label_index)]] += 1

	return train_accuracy, test_accuracy, failed_count

In [74]:
def print_results(title: str, train_accuracy: float, test_accuracy: float, failed_count: Dict[str, int]) -> None:
	"""Print an accuracy summary followed by a per-genre misclassification table.

	Args:
		title: Heading printed between two rule lines.
		train_accuracy: Training accuracy as a fraction; printed as a percentage.
		test_accuracy: Testing accuracy as a fraction; printed as a percentage.
		failed_count: Genre name -> number of misclassified samples. Genres are
			listed from most to fewest misclassifications.
	"""
	total_incorrect = sum(failed_count.values())

	print("=="*26)
	print(title)
	print("=="*26)
	print(f"Train accuracy: {round(train_accuracy * 100, 3)}")
	print(f"Train error:    {round((1-train_accuracy) * 100, 3)}")
	print(f"Test accuracy:  {round(test_accuracy * 100, 3)}")
	print(f"Test error:     {round((1-test_accuracy) * 100, 3)}")
	print()
	print("Genre".ljust(15), "| # of Incorrect | % of All Incorrect")
	print("--"*26)

	# sorted() is stable, so genres with equal counts keep their dict order
	ranked = sorted(failed_count.items(), key=lambda t: t[1], reverse=True)
	for i, (genre, count) in enumerate(ranked):
		# with no misclassification at all, report 0% instead of dividing by zero
		share = (count / total_incorrect) * 100 if total_incorrect else 0.0
		print(f"[{i+1:02}]", genre.upper().ljust(10), "|", str(count).ljust(14), f"| {round(share, 3)}%")
	print()

In [66]:
# Fit and score one classifier per feature file: 3-second clips first, then 30-second clips
(
	train_accuracy_3s,
	test_accuracy_3s,
	failed_count_3s,
) = train_evaluate_classifier(csv_filename="features_3_sec.csv")
(
	train_accuracy_30s,
	test_accuracy_30s,
	failed_count_30s,
) = train_evaluate_classifier(csv_filename="features_30_sec.csv")

In [75]:
# Report the 3-second run, then the 30-second run
print_results(
	title="3 Second Clips",
	train_accuracy=train_accuracy_3s,
	test_accuracy=test_accuracy_3s,
	failed_count=failed_count_3s,
)
print_results(
	title="30 Second Clips",
	train_accuracy=train_accuracy_30s,
	test_accuracy=test_accuracy_30s,
	failed_count=failed_count_30s,
)

3 Second Clips
Train accuracy: 86.747
Train error:    13.253
Test accuracy:  82.681
Test error:     17.319

Genre           | # of Incorrect | % of All Incorrect
----------------------------------------------------
[01] ROCK       | 272            | 18.656%
[02] HIPHOP     | 206            | 14.129%
[03] DISCO      | 196            | 13.443%
[04] COUNTRY    | 165            | 11.317%
[05] REGGAE     | 157            | 10.768%
[06] BLUES      | 148            | 10.151%
[07] POP        | 99             | 6.79%
[08] JAZZ       | 94             | 6.447%
[09] METAL      | 94             | 6.447%
[10] CLASSICAL  | 27             | 1.852%

30 Second Clips
Train accuracy: 82.687
Train error:    17.313
Test accuracy:  62.727
Test error:     37.273

Genre           | # of Incorrect | % of All Incorrect
----------------------------------------------------
[01] ROCK       | 51             | 21.339%
[02] BLUES      | 46             | 19.247%
[03] HIPHOP     | 30             | 12.552%
[04] DISCO    