In [4]:
from src.classifiers.knn import Knn
from src.evaluation.evaluation import Evaluator
from src.preprocessing.preprocessing import Preprocessor
from src.data_representations.bow import BOW

=== NOTE: This notebook is deprecated ===

# Experiments

## Set BOW approach

### Loading data and initializing KNN

In [2]:
# Read dataset
filepath_train = "./data/songs_train.txt"
dataset_train = Preprocessor(filepath=filepath_train, read_limit=10000)
filepath_test = "./data/songs_test.txt"
dataset_test = Preprocessor(filepath=filepath_test, read_limit=100)
# Create numerical representations of labels for mapping
label_to_num = {artist:i for i, artist in enumerate(set(dataset_train.artists) | set(dataset_test.artists))}
num_to_label = {value:key for key, value in label_to_num.items()}
# Initiate Knn classifier
training_examples = [BOW(tok) for tok in dataset_train.tokenized]
training_labels = [label_to_num[label] for label in dataset_train.artists]
classifier = Knn(training_examples, training_labels)

### Using Jaccard

In [3]:
# Test predictions
test_examples = dataset_test.BOW()
test_labels = [label_to_num[label] for label in dataset_test.artists]
predictions = classifier.predict(test_examples, k=4, measure="jaccard")
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.07
Micro Precision:
 0.3181818181818182
Micro Recall:
 0.07
Micro F-Score:
 0.11475409836065574


### Using Sørensen-Dice

In [4]:
predictions = classifier.predict(test_examples, k=4, measure="dsc")
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.07
Micro Precision:
 0.3181818181818182
Micro Recall:
 0.07
Micro F-Score:
 0.11475409836065574


### Using Overlap index

In [5]:
predictions = classifier.predict(test_examples, k=4, measure="overlap")
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.01
Micro Precision:
 0.047619047619047616
Micro Recall:
 0.01
Micro F-Score:
 0.01652892561983471


### Using different Tversky settings

#### $\alpha=0.1$, $\beta=0.9$

In [6]:
predictions = classifier.predict(test_examples, k=4, measure="tversky", alpha=0.1, beta=0.9)
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.03
Micro Precision:
 0.15789473684210525
Micro Recall:
 0.03
Micro F-Score:
 0.050420168067226885


#### $\alpha=0.2$, $\beta=0.8$

In [7]:
predictions = classifier.predict(test_examples, k=4, measure="tversky", alpha=0.2, beta=0.8)
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.05
Micro Precision:
 0.3125
Micro Recall:
 0.05
Micro F-Score:
 0.08620689655172414


#### $\alpha=0.3$, $\beta=0.7$

In [8]:
predictions = classifier.predict(test_examples, k=4, measure="tversky", alpha=0.3, beta=0.7)
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.05
Micro Precision:
 0.3125
Micro Recall:
 0.05
Micro F-Score:
 0.08620689655172414


#### $\alpha=\frac{1}{3}$, $\beta=\frac{2}{3}$

In [9]:
predictions = classifier.predict(test_examples, k=4, measure="tversky", alpha=1/3, beta=2/3)
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.05
Micro Precision:
 0.2777777777777778
Micro Recall:
 0.05
Micro F-Score:
 0.08474576271186442


#### $\alpha=0.4$, $\beta=0.6$

In [10]:
predictions = classifier.predict(test_examples, k=4, measure="tversky", alpha=0.4, beta=0.6)
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.05
Micro Precision:
 0.22727272727272727
Micro Recall:
 0.05
Micro F-Score:
 0.08196721311475409


#### $\alpha=0.6$, $\beta=0.4$

In [11]:
predictions = classifier.predict(test_examples, k=4, measure="tversky", alpha=0.6, beta=0.4)
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.05
Micro Precision:
 0.20833333333333334
Micro Recall:
 0.05
Micro F-Score:
 0.08064516129032258


#### $\alpha=\frac{2}{3}$, $\beta=\frac{1}{3}$

In [12]:
predictions = classifier.predict(test_examples, k=4, measure="tversky", alpha=2/3, beta=1/3)
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.05
Micro Precision:
 0.20833333333333334
Micro Recall:
 0.05
Micro F-Score:
 0.08064516129032258


#### $\alpha=0.7$, $\beta=0.3$

In [13]:
predictions = classifier.predict(test_examples, k=4, measure="tversky", alpha=0.7, beta=0.3)
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.06
Micro Precision:
 0.2222222222222222
Micro Recall:
 0.06
Micro F-Score:
 0.09448818897637795


#### $\alpha=0.8$, $\beta=0.2$

In [14]:
predictions = classifier.predict(test_examples, k=4, measure="tversky", alpha=0.8, beta=0.2)
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.06
Micro Precision:
 0.18181818181818182
Micro Recall:
 0.06
Micro F-Score:
 0.09022556390977443


#### $\alpha=0.1$, $\beta=0.9$

In [15]:
predictions = classifier.predict(test_examples, k=4, measure="tversky", alpha=0.9, beta=0.1)
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.01
Micro Precision:
 0.03125
Micro Recall:
 0.01
Micro F-Score:
 0.015151515151515152


#### $\alpha=1$, $\beta=0$

In [17]:
predictions = classifier.predict(test_examples, k=4, measure="tversky", alpha=1, beta=0)
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.01
Micro Precision:
 0.03225806451612903
Micro Recall:
 0.01
Micro F-Score:
 0.015267175572519083


#### $\alpha=0$, $\beta=1$

In [16]:
predictions = classifier.predict(test_examples, k=4, measure="tversky", alpha=0, beta=1)
# Run evaluation of algorithms performance
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.02
Micro Precision:
 0.11764705882352941
Micro Recall:
 0.02
Micro F-Score:
 0.03418803418803419


### Running fairly good performing distance measures on full train and test

#### Sørensen-Dice

In [4]:
# Read dataset
filepath_train = "./data/songs_train.txt"
dataset_train = Preprocessor(filepath=filepath_train, read_limit=46120)
filepath_test = "./data/songs_test.txt"
dataset_test = Preprocessor(filepath=filepath_test, read_limit=5765)
# Create numerical representations of labels for mapping
label_to_num = {artist:i for i, artist in enumerate(set(dataset_train.artists) | set(dataset_test.artists))}
num_to_label = {value:key for key, value in label_to_num.items()}
# Initiate Knn classifier
training_examples = [BOW(tok) for tok in dataset_train.tokenized]
training_labels = [label_to_num[label] for label in dataset_train.artists]
classifier = Knn(training_examples, training_labels)
# Run evaluation of algorithms performance
test_examples = [BOW(tok) for tok in dataset_test.tokenized]
test_labels = [label_to_num[label] for label in dataset_test.artists]
predictions = classifier.predict(test_examples, k=4, measure="dsc")
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.05013009540329575
Micro Precision:
 0.05029585798816568
Micro Recall:
 0.05013009540329575
Micro F-Score:
 0.05021283989227695


### Jaccard

In [5]:
# Read dataset
filepath_train = "./data/songs_train.txt"
dataset_train = Preprocessor(filepath=filepath_train, read_limit=46120)
filepath_test = "./data/songs_test.txt"
dataset_test = Preprocessor(filepath=filepath_test, read_limit=5765)
# Create numerical representations of labels for mapping
label_to_num = {artist:i for i, artist in enumerate(set(dataset_train.artists) | set(dataset_test.artists))}
num_to_label = {value:key for key, value in label_to_num.items()}
# Initiate Knn classifier
training_examples = [BOW(tok) for tok in dataset_train.tokenized]
training_labels = [label_to_num[label] for label in dataset_train.artists]
classifier = Knn(training_examples, training_labels)
# Run evaluation of algorithms performance
test_examples = [BOW(tok) for tok in dataset_test.tokenized]
test_labels = [label_to_num[label] for label in dataset_test.artists]

In [2]:
predictions = classifier.predict(test_examples, k=4, measure="jaccard")
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.05013009540329575
Micro Precision:
 0.05029585798816568
Micro Recall:
 0.05013009540329575
Micro F-Score:
 0.05021283989227695


In [6]:
predictions = classifier.predict(test_examples, k=3, measure="jaccard")
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.049262792714657416
Micro Precision:
 0.04941708717591787
Micro Recall:
 0.049262792714657416
Micro F-Score:
 0.04933981931897151


In [7]:
predictions = classifier.predict(test_examples, k=2, measure="jaccard")
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.04700780572419774
Micro Precision:
 0.04715503741082304
Micro Recall:
 0.04700780572419774
Micro F-Score:
 0.0470813064628214


In [8]:
predictions = classifier.predict(test_examples, k=1, measure="jaccard")
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.04700780572419774
Micro Precision:
 0.04715503741082304
Micro Recall:
 0.04700780572419774
Micro F-Score:
 0.0470813064628214


In [9]:
predictions = classifier.predict(test_examples, k=5, measure="jaccard")
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.05151777970511708
Micro Precision:
 0.051688130873651233
Micro Recall:
 0.05151777970511708
Micro F-Score:
 0.051602814698983576


## TF-IDF

In [None]:
from src.data_representations.tf_idf import TfIdf
from src.data_representations.vector import Vector
from src.classifiers.knn import Knn
from src.preprocessing.preprocessing import Preprocessor
from src.evaluation.evaluation import Evaluator

In [2]:
# Read dataset
filepath_train = "./data/songs_train.txt"
dataset_train = Preprocessor(filepath=filepath_train, read_limit=1000)
tf_idf = TfIdf()
train = tf_idf.fit_transform(dataset_train.tokenized)
filepath_test = "./data/songs_test.txt"
dataset_test = Preprocessor(filepath=filepath_test, read_limit=100)
test = tf_idf.transform(dataset_test.tokenized)
# Create numerical representations of labels for mapping
label_to_num = {artist:i for i, artist in enumerate(set(dataset_train.artists) | set(dataset_test.artists))}
num_to_label = {value:key for key, value in label_to_num.items()}
# Initiate Knn classifier
training_examples = [Vector([ex]) for ex in train]
training_labels = [label_to_num[label] for label in dataset_train.artists]
classifier = Knn(training_examples, training_labels)

In [3]:
test_examples = [Vector([ex]) for ex in test]
test_labels = [label_to_num[label] for label in dataset_test.artists]
predictions = classifier.predict(test_examples, k=4, measure="euclidean")
print(predictions)

[20, 316, 297, 20, 50, 11, 201, 20, 20, 20, 213, 138, 201, 264, 201, 342, 394, 316, 350, 20, 252, 136, 201, 20, 213, 243, 20, 20, 136, 20, 316, 408, 336, 20, 201, 20, 213, 20, 213, 20, 20, 264, 20, 213, 20, 20, 108, 20, 20, 20, 264, 20, 201, 306, 20, 160, 316, 20, 215, 333, 20, 20, 20, 20, 108, 135, 264, 333, 20, 304, 136, 20, 316, 342, 297, 213, 20, 63, 297, 20, 342, 20, 174, 215, 342, 201, 174, 213, 20, 213, 20, 342, 394, 201, 20, 213, 264, 213, 342, 215]


In [4]:
for pred, lab in zip(predictions, test_labels):
    print(num_to_label[pred], " - ", num_to_label[lab])

Kyla  -  Virgin Steele
Lil Wayne  -  Demi Lovato
Counting Crows  -  Indigo Girls
Kyla  -  Phil Collins
Lionel Richie  -  Extreme
The Temptations  -  Vince Gill
Bosson  -  Dusty Springfield
Kyla  -  Nat King Cole
Kyla  -  Arrogant Worms
Kyla  -  Kirsty Maccoll
LL Cool J  -  Chuck Berry
Avril Lavigne  -  Avril Lavigne
Bosson  -  Van Halen
Backstreet Boys  -  Phil Collins
Bosson  -  Rod Stewart
U2  -  Mark Ronson
Tim McGraw  -  Michael Bolton
Lil Wayne  -  Gipsy Kings
The Beatles  -  Linda Ronstadt
Kyla  -  Rihanna
Stone Temple Pilots  -  Queen
Outkast  -  Kanye West
Bosson  -  ABBA
Kyla  -  Religious Music
LL Cool J  -  Queen Latifah
Ice Cube  -  Snoop Dogg
Kyla  -  Kid Rock
Kyla  -  Michael Bolton
Outkast  -  Lil Wayne
Kyla  -  Alison Krauss
Lil Wayne  -  Michael Bolton
Bob Dylan  -  Veruca Salt
Everlast  -  Bryan White
Kyla  -  Cher
Bosson  -  George Jones
Kyla  -  Lauryn Hill
LL Cool J  -  Green Day
Kyla  -  Quietdrive
LL Cool J  -  Kid Rock
Kyla  -  John Prine
Kyla  -  Kinks
Backstre

In [5]:
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.01
Micro Precision:
 0.1111111111111111
Micro Recall:
 0.01
Micro F-Score:
 0.01834862385321101


In [6]:
test_examples = [Vector([ex]) for ex in test]
test_labels = [label_to_num[label] for label in dataset_test.artists]
predictions = classifier.predict(test_examples, k=4, measure="cosine")
print(predictions)

[330, 169, 297, 398, 157, 11, 428, 173, 118, 425, 440, 162, 92, 264, 289, 241, 352, 213, 350, 52, 252, 197, 197, 403, 213, 303, 414, 292, 303, 402, 65, 175, 164, 301, 28, 400, 175, 240, 86, 361, 6, 301, 106, 109, 45, 220, 108, 326, 169, 20, 196, 128, 329, 306, 81, 8, 169, 399, 84, 333, 224, 265, 99, 214, 197, 405, 264, 333, 196, 304, 359, 421, 385, 207, 312, 174, 148, 63, 65, 403, 250, 24, 178, 357, 301, 328, 174, 257, 9, 104, 26, 93, 8, 270, 343, 354, 264, 337, 23, 250]


In [7]:
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Accuracy:
 0.02
Micro Precision:
 0.125
Micro Recall:
 0.02
Micro F-Score:
 0.03448275862068966


### Train and test size as with BOW

In [6]:
# Read dataset
filepath_train = "./data/songs_train.txt"
dataset_train = Preprocessor(filepath=filepath_train, read_limit=10000)
tf_idf = TfIdf()
train = tf_idf.fit_transform(dataset_train.tokenized)
filepath_test = "./data/songs_test.txt"
dataset_test = Preprocessor(filepath=filepath_test, read_limit=100)
test = tf_idf.transform(dataset_test.tokenized)
# Create numerical representations of labels for mapping
label_to_num = {artist:i for i, artist in enumerate(set(dataset_train.artists) | set(dataset_test.artists))}
num_to_label = {value:key for key, value in label_to_num.items()}
# Initiate Knn classifier
training_examples = [Vector([ex]) for ex in train]
training_labels = [label_to_num[label] for label in dataset_train.artists]
classifier = Knn(training_examples, training_labels)

In [7]:
test_examples = [Vector([ex]) for ex in test]
test_labels = [label_to_num[label] for label in dataset_test.artists]
predictions = classifier.predict(test_examples, k=4, measure="cosine")

In [9]:
evaluator = Evaluator(test_labels, predictions)
print("Accuracy:\n", evaluator.accuracy())

Accuracy:
 0.07


In [10]:
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Micro Precision:
 0.23333333333333334
Micro Recall:
 0.07
Micro F-Score:
 0.1076923076923077


In [8]:
predictions2 = classifier.predict(test_examples, k=4, measure="euclidean")

In [12]:
evaluator = Evaluator(test_labels, predictions2)
print("Accuracy:\n", evaluator.accuracy())

Accuracy:
 0.01


In [13]:
print("Micro Precision:\n", evaluator.micro_precision())
print("Micro Recall:\n", evaluator.micro_recall())
print("Micro F-Score:\n", evaluator.micro_fscore())

Micro Precision:
 0.09090909090909091
Micro Recall:
 0.01
Micro F-Score:
 0.018018018018018018


### Full train and test

In [None]:
# Read dataset
filepath_train = "./data/songs_train.txt"
dataset_train = Preprocessor(filepath=filepath_train, read_limit=46120)
tf_idf = TfIdf()
train = tf_idf.fit_transform(dataset_train.tokenized)
filepath_test = "./data/songs_test.txt"
dataset_test = Preprocessor(filepath=filepath_test, read_limit=5765)
test = tf_idf.transform(dataset_test.tokenized)
# Create numerical representations of labels for mapping
label_to_num = {artist:i for i, artist in enumerate(set(dataset_train.artists) | set(dataset_test.artists))}
num_to_label = {value:key for key, value in label_to_num.items()}
# Initiate Knn classifier
training_examples = [Vector([ex]) for ex in train]
training_labels = [label_to_num[label] for label in dataset_train.artists]
classifier = Knn(training_examples, training_labels)