In [None]:
# Imports
import pandas
import numpy as np

In [None]:
# Helper for getting shapes of arrays
def get_shape(array):
    """Print the number of rows and columns of a two-dimensional array.

    Parameters
    ----------
    array : object with a ``shape`` attribute holding exactly two entries
        (for example a 2-D NumPy array).

    Returns
    -------
    None
        The counts are only written to standard output.
    """
    # Unpacking raises ValueError unless the shape has exactly two entries.
    rows, columns = array.shape
    print(f"Number of rows: {rows}, Number of columns: {columns}")
    
def generate_random_matrix(m, n, seed=None):
    """Build the random projection matrix R using the construction by Achlioptas.

    Every entry is drawn independently and equals ``-sqrt(3)`` with
    probability 1/6, ``0`` with probability 2/3 and ``+sqrt(3)`` with
    probability 1/6.

    Parameters
    ----------
    m : int
        Number of rows of the matrix.
    n : int
        Number of columns of the matrix.
    seed : optional
        Anything accepted by ``numpy.random.default_rng`` (for example an
        int). When given, the entries are drawn from a dedicated generator
        so the same seed always yields the same matrix. When ``None``
        (default) NumPy's global random state is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(m, n)``.
    """
    values = [-1, 0, 1]
    probabilities = [1/6, 2/3, 1/6]
    if seed is None:
        # Legacy path: honours a preceding np.random.seed(...) call.
        rij = np.random.choice(values, size=(m, n), p=probabilities)
    else:
        # Reproducible path that leaves the global random state untouched.
        rij = np.random.default_rng(seed).choice(values, size=(m, n), p=probabilities)
    return np.sqrt(3) * rij



In [None]:
# Load the track metadata and the audio features from CSV.
# tracks.csv has a two-level column header; only rows whose ('set', 'subset')
# value equals 'medium' are kept.
# NOTE(review): rows labelled with any other subset value are dropped here -
# confirm that an exact match on 'medium' is what is intended.
df_tracks = pandas.read_csv('tracks.csv', index_col=0, header=[0, 1]).loc[
    lambda frame: frame[('set', 'subset')] == 'medium'
]
# features.csv has a three-level column header and is loaded unfiltered.
df_feautures = pandas.read_csv('features.csv', index_col=0, header=[0, 1, 2])


In [None]:
# Building train, test, validation sets
# Keep only tracks whose top-level genre is one of
# Hip-Hop, Pop, Folk, Rock, Experimental,
# International, Electronic, and Instrumental.
# (original author's note: this filter apparently has no effect)
df_tracks = df_tracks[df_tracks[('track', 'genre_top')].isin(['Hip-Hop', 'Pop', 'Folk', 'Rock', 'Experimental', 'International', 'Electronic', 'Instrumental'])]

# Drop tracks that have no row in the feature table, so every label below
# is guaranteed to have a matching feature vector.
df_tracks = df_tracks[df_tracks.index.isin(df_feautures.index)]

# Split the tracks by the split column. It is addressed by name instead of by
# the positional index 30 used previously, so the split no longer depends on
# the exact column order of tracks.csv.
split_labels = df_tracks[('set', 'split')]
df_tracks_train = df_tracks[split_labels == 'training']
df_tracks_test = df_tracks[split_labels == 'test']
df_tracks_validation = df_tracks[split_labels == 'validation']

# Select the feature rows by track id, in the order of the track frames.
# This makes row i of each X matrix belong to the same track as entry i of
# the corresponding y series, even if the two CSV files are ordered differently.
df_features_train = df_feautures.loc[df_tracks_train.index]
df_features_test = df_feautures.loc[df_tracks_test.index]
df_features_validation = df_feautures.loc[df_tracks_validation.index]

# get X
X_train = df_features_train.values
X_test = df_features_test.values
X_validation = df_features_validation.values

# get y
y_train = df_tracks_train[('track', 'genre_top')]
y_test = df_tracks_test[('track', 'genre_top')]
y_validation = df_tracks_validation[('track', 'genre_top')]

In [None]:
# Sanity check: print the shape of a one-row slice of the training matrix
get_shape(X_train[:1])

In [None]:
# For the first test we want 32-bit representations, i.e. 32 random hyperplanes.
# The column count is read from X_train instead of being hard-coded as 518, so
# R always matches the feature dimension of the data.
# Seeding NumPy's global random state makes R, and every bucket derived from
# it, reproducible when the notebook is re-run.
np.random.seed(42)
R = generate_random_matrix(32, X_train.shape[1])

In [None]:
# Project every training vector onto the rows of R; the sign of each entry
# shows on which side of the corresponding hyperplane the vector lies.
X_train_zero_dot = X_train.dot(R.T)

In [None]:
# Binarise the projections: every entry greater than 0 becomes 1, all others 0.
# Each row is then the binary representation of one track with respect to R.
X_train_zero_dot = (X_train_zero_dot > 0).astype(int)
get_shape(X_train_zero_dot)
len(X_train_zero_dot)


In [None]:
# Bucketing approach taken from the YouTube video:
# group the training rows by their bit string. Keys are the hash strings,
# values are lists of row positions in X_train_zero_dot.

buckets = {}

for i, bits in enumerate(X_train_zero_dot):
    hash_str = ''.join(bits.astype(str))
    # setdefault creates the empty bucket the first time a hash is seen
    buckets.setdefault(hash_str, []).append(i)

# print each bucket on its own line for better readability
for key, value in buckets.items():
    print(key, value)

In [None]:
# Sanity check: how homogeneous are the buckets with respect to genre?

# Tally the genres of the training tracks in every bucket.
genre_counts = {}
for key, value in buckets.items():
    tally = {}
    for row in value:
        genre = y_train.iloc[row]
        tally[genre] = tally.get(genre, 0) + 1
    # Order the tally from most to least frequent genre; the sort is stable,
    # so genres with equal counts keep their first-seen order.
    genre_counts[key] = dict(sorted(tally.items(), key=lambda item: item[1], reverse=True))

# Print the ordered genre counts of every bucket.
for key, value in genre_counts.items():
    print(f"Key: {key}")
    for genre, count in value.items():
        print(f"Genre: {genre}, Count: {count}")

# Count, per genre, the buckets in which that genre makes up more than
# `majority_percentage` (75%) of the members.
majority_percentage = 0.75
genre_majority = {}
for key, value in genre_counts.items():
    bucket_size = len(buckets[key])
    for genre, count in value.items():
        if count/bucket_size > majority_percentage:
            genre_majority[genre] = genre_majority.get(genre, 0) + 1

# Share of buckets that have such a dominant genre (printed as "Accuracy").
accuracy = sum(genre_majority.values())/len(buckets)
print(f"Accuracy: {accuracy}")






In [None]:
# Label every bucket with its first genre in genre_counts, which is the most
# frequent one because each per-bucket dict is ordered by descending count.
bucket_genres = {key: list(value.keys())[0] for key, value in genre_counts.items()}

# Hash the test set with the same projection matrix R.
X_test_zero_dot = (np.dot(X_test, R.T) > 0).astype(int)

# Look up the bucket genre for every test row; hash strings that never
# occurred in the training data yield None.
majority_genres = [
    bucket_genres.get(''.join(bits.astype(str)))
    for bits in X_test_zero_dot
]

# Count the predictions that agree with the actual genre.
correct = sum(
    1
    for i in range(len(majority_genres))
    if majority_genres[i] == y_test.iloc[i]
)

accuracy = correct/len(majority_genres)
print(f"Accuracy Test set: {accuracy}")

In [None]:
# Use the combined validation and test set to evaluate the accuracy of the
# bucket -> majority-genre lookup.
# The sign of the projection onto each row of R gives one bit per hyperplane.
X_validation_zero_dot = (np.dot(X_validation, R.T) > 0).astype(int)

# Predictions are collected validation-first, then test.
majority_genres = []
for codes in (X_validation_zero_dot, X_test_zero_dot):
    for bits in codes:
        # .get returns None for hash strings never seen in training;
        # such rows can never match a genre and count as wrong.
        majority_genres.append(bucket_genres.get(''.join(bits.astype(str))))

# Concatenate the true labels in the same order as the predictions.
# Previously the test labels were compared against the first entries of
# majority_genres (the validation predictions), which made the score wrong.
actual_genres = list(y_validation) + list(y_test)

# compare the majority genre with the actual genre
correct = 0
for predicted, actual in zip(majority_genres, actual_genres):
    if predicted == actual:
        correct += 1

accuracy = correct/len(majority_genres)
print(f"Accuracy Validation+Test set: {accuracy}")