In [1]:
import gc
import random

from src.classifiers.knn import Knn
from src.data_representations.bow import BOW
from src.data_representations.structure import Structure
from src.data_representations.tf_idf import TfIdf
from src.data_representations.vector import Vector
from src.evaluation.evaluation import Evaluator
from src.preprocessing.preprocessing import Preprocessor

In [None]:
def report(evaluator):
    """Print accuracy and the micro-averaged precision, recall and F-score.

    Each value is obtained by calling the correspondingly named method on
    ``evaluator`` and is printed underneath its own heading.
    """
    headings_and_methods = (
        ("Accuracy:\n", "accuracy"),
        ("Micro Precision:\n", "micro_precision"),
        ("Micro Recall:\n", "micro_recall"),
        ("Micro F-Score:\n", "micro_fscore"),
    )
    for heading, method_name in headings_and_methods:
        # Look up and call one metric at a time so each is computed right
        # before it is printed.
        print(heading, getattr(evaluator, method_name)())

def run_prediction(classifier, test_examples, test_labels, measure, alpha=None, beta=None, k=4):
    """Predict labels for the test examples and print the evaluation report.

    Args:
        classifier: Object exposing ``predict(examples, k=..., measure=..., ...)``.
        test_examples: Examples to classify.
        test_labels: Reference labels, in the same order as ``test_examples``.
        measure: Name of the measure handed to ``classifier.predict``
            (the notebook uses ``"cosine"`` and ``"euclidean"``).
        alpha: Optional value forwarded to ``classifier.predict`` as ``alpha``.
            Left out of the call when ``None``.
        beta: Optional value forwarded to ``classifier.predict`` as ``beta``.
            Left out of the call when ``None``.
        k: Value forwarded as ``k``; defaults to 4, the setting used for all
            experiments in this notebook.

    Returns:
        None. The metrics are printed by ``report``.
    """
    # Every call in this notebook passes only four arguments, so alpha/beta must
    # be optional. They are forwarded only when given.
    # NOTE(review): assumes `predict` has its own defaults for alpha/beta, as the
    # tutorial call `classifier.predict(test_examples, k=4, measure=...)` suggests.
    optional_kwargs = {
        name: value
        for name, value in (("alpha", alpha), ("beta", beta))
        if value is not None
    }
    predictions = classifier.predict(test_examples, k=k, measure=measure, **optional_kwargs)
    evaluator = Evaluator(test_labels, predictions)
    report(evaluator)

# tf-idf and random choice experiments

This notebook contains experiments with a random-choice baseline and with tf-idf representations of the lyrics. The tf-idf experiments vary the training and test set sizes, the distance metric, and whether structural features are added.

For the random baseline, a random class is picked for each of the examples in the test set.
Because the outcome is random, the reported scores are averaged over **10 runs**.

The first part provides a tutorial on how to run the experiments and specify core hyperparameters and settings.

The second part showcases experiments with their results.

---
# Tutorial

To run the experiments, either add your datasets to the data folder or change these variables to the paths to your datasets.

In [None]:
# Locations of the training and test splits; point these at your own files if needed.
filepath_train, filepath_test = (
    "./data/songs_train.txt",
    "./data/songs_test.txt",
)

To run experiments with custom settings, set `read_limit` in the respective `Preprocessor` calls to the training and test sizes you want.

In [None]:
# Read dataset
dataset_train = Preprocessor(filepath=filepath_train, read_limit=10000)
dataset_test = Preprocessor(filepath=filepath_test, read_limit=100)

Create numerical representations of labels for mapping

In [None]:
# Union of the artists found in either split. It is built once and reused so that
# `artists[i]`, `label_to_num` and `num_to_label` all agree on the numbering.
artists = list(set(dataset_train.artists) | set(dataset_test.artists))
label_to_num = {artist: i for i, artist in enumerate(artists)}
num_to_label = {value: key for key, value in label_to_num.items()}

Specify the number of processes for multiprocessing

In [None]:
# Passed to `Knn` as its third argument when the classifier is built.
number_processes = 8

Compute the tf-idf representations

In [None]:
# `fit_transform` is called on the training split only; the test split goes
# through `transform` on the same `TfIdf` instance.
tf_idf = TfIdf()
train = tf_idf.fit_transform(dataset_train.tokenized)
test = tf_idf.transform(dataset_test.tokenized)

Create training and testing examples and labels

In [None]:
# Numeric labels for both splits, looked up in the artist-to-id mapping.
training_labels = list(map(label_to_num.__getitem__, dataset_train.artists))
test_labels = list(map(label_to_num.__getitem__, dataset_test.artists))

# Every tf-idf row is wrapped as `Vector([row])`.
training_examples = [Vector([row]) for row in train]
test_examples = [Vector([row]) for row in test]

Initialize the classifier

In [None]:
# Build the kNN classifier from the training vectors, their labels and the
# process count chosen above.
classifier = Knn(training_examples, training_labels, number_processes)

Choose the distance metric by changing the `measure` argument in `classifier.predict` from the following options

- Cosine similarity: `"cosine"`

- Euclidean distance: `"euclidean"`

In [None]:
# k=4 matches the setting used for all experiments in this notebook.
predictions = classifier.predict(test_examples, k=4, measure="euclidean")

Initialize evaluation with the following

In [None]:
# Reference labels go first, predictions second.
evaluator = Evaluator(test_labels, predictions)

and evaluate with accuracy, micro and macro $F_1$, precision and recall as follows 

In [None]:
# Collect every metric in one mapping so the cell displays all of them; a bare
# expression is only shown when it is the last line of a cell.
{
    "accuracy": evaluator.accuracy(),
    "micro_precision": evaluator.micro_precision(),
    "micro_recall": evaluator.micro_recall(),
    # `micro_fscore` is the method name used by `report` and the experiment cells.
    "micro_fscore": evaluator.micro_fscore(),
    "macro_precision": evaluator.macro_precision(),
    "macro_recall": evaluator.macro_recall(),
    # NOTE(review): assumed to mirror `micro_fscore` — confirm against Evaluator.
    "macro_fscore": evaluator.macro_fscore(),
}

To run a full experiment and evaluation with your preferred settings, run

In [None]:
# Classify the test examples with cosine similarity and print the metrics.
run_prediction(classifier, test_examples, test_labels, measure="cosine")

---
# Experiments

All experiments in 1) and 2) with kNN are run with $k=4$.
Both 1) and 2) follow this structure:
- Random choice baseline

- tf-idf

    - with euclidean distance

    - with cosine distance

- tf-idf + structural features

    - with euclidean distance

    - with cosine distance

## 1) 10k training/100 test sizes

In [18]:
# Read dataset
dataset_train = Preprocessor(filepath=filepath_train, read_limit=10000)
dataset_test = Preprocessor(filepath=filepath_test, read_limit=100)

# Create numerical representations of labels for mapping
artists = list(set(dataset_train.artists) | set(dataset_test.artists))
label_to_num = {artist:i for i, artist in enumerate(set(dataset_train.artists) | set(dataset_test.artists))}
num_to_label = {value:key for key, value in label_to_num.items()}

# how many process are gonna be run
number_processes = 8

## Random choice

In [19]:
# Random-choice baseline: give every test example a class drawn at random from
# all known artist ids and average the metrics over several runs.
test_labels = [label_to_num[label] for label in dataset_test.artists]

num_experiments = 10
# Seeded generator so that re-running the notebook reproduces the same averages.
rng = random.Random(42)

accuracy = []
micro_precision = []
micro_recall = []
micro_fscore = []

for _run in range(num_experiments):
    random_labels = rng.choices(range(len(artists)), k=len(test_labels))

    evaluator = Evaluator(test_labels, random_labels)
    accuracy.append(evaluator.accuracy())
    micro_precision.append(evaluator.micro_precision())
    micro_recall.append(evaluator.micro_recall())
    micro_fscore.append(evaluator.micro_fscore())

print("Accuracy:\n", sum(accuracy) / num_experiments)
print("Micro Precision:\n", sum(micro_precision) / num_experiments)
print("Micro Recall:\n", sum(micro_recall) / num_experiments)
print("Micro F-Score:\n", sum(micro_fscore) / num_experiments)

Accuracy:
 0.004
Micro Precision:
 0.02760854341736695
Micro Recall:
 0.004
Micro F-Score:
 0.006973639891062759


## TF-idf

In [23]:
# tf-idf features: `fit_transform` on the training lyrics, `transform` on the
# test lyrics with the same `TfIdf` instance.
tf_idf = TfIdf()
train = tf_idf.fit_transform(dataset_train.tokenized)
test = tf_idf.transform(dataset_test.tokenized)

# Training side: numeric artist labels and one `Vector([row])` per tf-idf row.
training_labels = list(map(label_to_num.__getitem__, dataset_train.artists))
training_examples = [Vector([row]) for row in train]
classifier = Knn(training_examples, training_labels, number_processes)

# Test side, built the same way.
test_labels = list(map(label_to_num.__getitem__, dataset_test.artists))
test_examples = [Vector([row]) for row in test]

# Drop the notebook-level names of the intermediates; the cells below only use
# `classifier`, `test_examples` and `test_labels`.
del tf_idf, train, test, training_examples, training_labels


### Prediction with cosine and euclidean distance, evaluation

#### Cosine similarity

In [24]:
# Classify the test examples with cosine similarity and print the metrics.
run_prediction(classifier, test_examples, test_labels, measure="cosine")

Accuracy:
 0.07
Micro Precision:
 0.23333333333333334
Micro Recall:
 0.07
Micro F-Score:
 0.1076923076923077


#### Euclidean distance

In [25]:
# Classify the test examples with Euclidean distance and print the metrics.
run_prediction(classifier, test_examples, test_labels, measure="euclidean")

Accuracy:
 0.01
Micro Precision:
 0.09090909090909091
Micro Recall:
 0.01
Micro F-Score:
 0.018018018018018018


Print the predicted artist next to the true artist for each test example:

In [26]:
# NOTE(review): `run_prediction` does not return its predictions, so `predictions`
# below is only defined if the tutorial cell that assigns it has been run.
#for pred, lab in zip(predictions, test_labels):
#    print(num_to_label[pred], " - ", num_to_label[lab])

## TF-idf + structural features

In [31]:
# tf-idf features: `fit_transform` on the training lyrics, `transform` on the
# test lyrics with the same `TfIdf` instance.
tf_idf = TfIdf()
train = tf_idf.fit_transform(dataset_train.tokenized)
test = tf_idf.transform(dataset_test.tokenized)

# Structural features are taken from the same tokenized lyrics.
train_struc = Structure(dataset_train.tokenized)
test_struc = Structure(dataset_test.tokenized)

# Training side: each example combines the tf-idf row with `number_lines` and
# `doc_length`, the latter two wrapped in one-element lists.
training_labels = list(map(label_to_num.__getitem__, dataset_train.artists))
training_examples = [
    Vector([row, [n_lines], [length]])
    for row, n_lines, length in zip(train, train_struc.number_lines, train_struc.doc_length)
]
classifier = Knn(training_examples, training_labels, number_processes)

# Test side, built the same way.
test_labels = list(map(label_to_num.__getitem__, dataset_test.artists))
test_examples = [
    Vector([row, [n_lines], [length]])
    for row, n_lines, length in zip(test, test_struc.number_lines, test_struc.doc_length)
]

# Drop the notebook-level names of the feature intermediates.
del tf_idf, train, test, train_struc, test_struc


### Prediction with cosine and euclidean distance, evaluation

#### Cosine similarity

In [32]:
# Classify the test examples with cosine similarity and print the metrics.
run_prediction(classifier, test_examples, test_labels, measure="cosine")

Accuracy:
 0.02
Micro Precision:
 0.08333333333333333
Micro Recall:
 0.02
Micro F-Score:
 0.03225806451612903


#### Euclidean distance

In [33]:
# Classify the test examples with Euclidean distance and print the metrics.
run_prediction(classifier, test_examples, test_labels, measure="euclidean")

Accuracy:
 0.01
Micro Precision:
 0.05555555555555555
Micro Recall:
 0.01
Micro F-Score:
 0.016949152542372885


-----
## 2) 20k training/100 test sizes 

In [3]:
# Dataset locations, then both splits: `read_limit` is 20,000 for training and
# 100 for testing in this run.
filepath_train = "./data/songs_train.txt"
filepath_test = "./data/songs_test.txt"
dataset_train = Preprocessor(read_limit=20_000, filepath=filepath_train)
dataset_test = Preprocessor(read_limit=100, filepath=filepath_test)

# Map every artist from either split to an integer id, and back again.
artists = list(set(dataset_train.artists) | set(dataset_test.artists))
label_to_num = dict(zip(artists, range(len(artists))))
num_to_label = dict(enumerate(artists))

# Number of processes passed to the kNN classifier
number_processes = 8

### Random choice

In [4]:
# Random-choice baseline: give every test example a class drawn at random from
# all known artist ids and average the metrics over several runs.
test_labels = [label_to_num[label] for label in dataset_test.artists]

num_experiments = 10
# Seeded generator so that re-running the notebook reproduces the same averages.
rng = random.Random(42)

accuracy = []
micro_precision = []
micro_recall = []
micro_fscore = []

for _run in range(num_experiments):
    random_labels = rng.choices(range(len(artists)), k=len(test_labels))

    evaluator = Evaluator(test_labels, random_labels)
    accuracy.append(evaluator.accuracy())
    micro_precision.append(evaluator.micro_precision())
    micro_recall.append(evaluator.micro_recall())
    micro_fscore.append(evaluator.micro_fscore())

print("Accuracy:\n", sum(accuracy) / num_experiments)
print("Micro Precision:\n", sum(micro_precision) / num_experiments)
print("Micro Recall:\n", sum(micro_recall) / num_experiments)
print("Micro F-Score:\n", sum(micro_fscore) / num_experiments)

Accuracy:
 0.001
Micro Precision:
 0.006666666666666666
Micro Recall:
 0.001
Micro F-Score:
 0.0017391304347826088


### TF-IDF

In [16]:
# tf-idf features: `fit_transform` on the training lyrics, `transform` on the
# test lyrics with the same `TfIdf` instance.
tf_idf = TfIdf()
train = tf_idf.fit_transform(dataset_train.tokenized)
test = tf_idf.transform(dataset_test.tokenized)
print('done')

# Training side: numeric artist labels and one `Vector([row])` per tf-idf row.
training_labels = list(map(label_to_num.__getitem__, dataset_train.artists))
training_examples = [Vector([row]) for row in train]
classifier = Knn(training_examples, training_labels, number_processes)

# Test side, built the same way.
test_labels = list(map(label_to_num.__getitem__, dataset_test.artists))
test_examples = [Vector([row]) for row in test]

# Drop the notebook-level names of the intermediates and run a collection pass;
# the return value of `gc.collect()` is what the cell displays.
del tf_idf, train, test, training_examples, training_labels
gc.collect()

done


12

#### Prediction with cosine and euclidean distance, evaluation

##### Cosine similarity

In [17]:
# Classify the test examples with cosine similarity and print the metrics.
run_prediction(classifier, test_examples, test_labels, measure="cosine")

Accuracy:
 0.08
Micro Precision:
 0.2857142857142857
Micro Recall:
 0.08
Micro F-Score:
 0.125


##### Euclidean distance

In [19]:
# Classify the test examples with Euclidean distance and print the metrics.
run_prediction(classifier, test_examples, test_labels, measure="euclidean")

Accuracy:
 0.04
Micro Precision:
 0.3333333333333333
Micro Recall:
 0.04
Micro F-Score:
 0.07142857142857142


### TF-idf + structural features

In [4]:
# tf-idf features: `fit_transform` on the training lyrics, `transform` on the
# test lyrics with the same `TfIdf` instance.
tf_idf = TfIdf()
train = tf_idf.fit_transform(dataset_train.tokenized)
test = tf_idf.transform(dataset_test.tokenized)

# Structural features are taken from the same tokenized lyrics.
train_struc = Structure(dataset_train.tokenized)
test_struc = Structure(dataset_test.tokenized)

# Training side: each example combines the tf-idf row with `number_lines` and
# `doc_length`, the latter two wrapped in one-element lists.
training_labels = list(map(label_to_num.__getitem__, dataset_train.artists))
training_examples = [
    Vector([row, [n_lines], [length]])
    for row, n_lines, length in zip(train, train_struc.number_lines, train_struc.doc_length)
]
classifier = Knn(training_examples, training_labels, number_processes)

# Test side, built the same way.
test_labels = list(map(label_to_num.__getitem__, dataset_test.artists))
test_examples = [
    Vector([row, [n_lines], [length]])
    for row, n_lines, length in zip(test, test_struc.number_lines, test_struc.doc_length)
]

# Drop the notebook-level names of the feature intermediates.
del tf_idf, train, test, train_struc, test_struc


#### Prediction with cosine and euclidean distance, evaluation

##### Cosine similarity

In [22]:
# Classify the test examples with cosine similarity and print the metrics.
run_prediction(classifier, test_examples, test_labels, measure="cosine")

Accuracy:
 0.01
Micro Precision:
 0.05555555555555555
Micro Recall:
 0.01
Micro F-Score:
 0.016949152542372885


##### Euclidean distance

In [5]:
# Classify the test examples with Euclidean distance and print the metrics.
run_prediction(classifier, test_examples, test_labels, measure="euclidean")

Accuracy:
 0.01
Micro Precision:
 0.07142857142857142
Micro Recall:
 0.01
Micro F-Score:
 0.01754385964912281
