In [1]:
# Imports
import pandas
import numpy as np

In [2]:
# Helper for getting shapes of arrays
def get_shape(array):
    shape = array.shape
    rows, columns = shape
    print(f"Number of rows: {rows}, Number of columns: {columns}")
    
#For the random projection matrix R we use construction method introduced by Achlioptas
def generate_random_matrix(m, n):
    rij = np.random.choice([-1, 0, 1], size=(m, n), p=[1/6, 2/3, 1/6])
    return np.sqrt(3) * rij



In [3]:
# Getting the data
df_tracks = pandas.read_csv('tracks.csv',index_col=0, header=[0, 1])
df_tracks = df_tracks[df_tracks['set']['subset'] == 'medium']
df_feautures = pandas.read_csv('features.csv',index_col=0, header=[0, 1, 2])


In [4]:
# Building train, test, validation sets
# filter data only for these genres
# Hip-Hop, Pop, Folk, Rock, Experimental,
# International, Electronic, and Instrumental.
# tut nix eigentlich
df_tracks = df_tracks[df_tracks['track']['genre_top'].isin(['Hip-Hop', 'Pop', 'Folk', 'Rock', 'Experimental', 'International', 'Electronic', 'Instrumental'])]


# split track dataframe into training and test data based on column with index 30
df_tracks_train = df_tracks[df_tracks.iloc[:, 30] == 'training']
df_tracks_test = df_tracks[df_tracks.iloc[:, 30] == 'test']
df_tracks_validation = df_tracks[df_tracks.iloc[:, 30] == 'validation']

#filter features data accordingly to the track data
df_features_train = df_feautures[df_feautures.index.isin(df_tracks_train.index)]
df_features_test = df_feautures[df_feautures.index.isin(df_tracks_test.index)]
df_features_validation = df_feautures[df_feautures.index.isin(df_tracks_validation.index)]

# get X
X_train = df_features_train.values
X_test = df_features_test.values
X_validation = df_features_validation.values

# get y
y_train = df_tracks_train['track']['genre_top']
y_test = df_tracks_test['track']['genre_top']
y_validation = df_tracks_validation['track']['genre_top']

In [5]:
# evaluate shape
get_shape(X_train[0:1])

Number of rows: 1, Number of columns: 518


[[ 1.88896334e+00  7.60539293e-01  3.45296562e-01 ...  0.00000000e+00
   1.71672380e+00  6.93301633e-02]
 [ 9.18444753e-01  6.74147248e-01  5.77818275e-01 ...  0.00000000e+00
   1.80610597e+00  5.46228550e-02]
 [ 9.15000617e-01 -6.43476248e-01 -4.60507214e-01 ...  3.41796875e-03
   8.05020452e-01  1.69045236e-02]
 ...
 [ 6.44815028e-01 -8.79404128e-01 -1.14923191e+00 ...  4.88281250e-03
   3.17907929e+00  2.20229235e-02]
 [-4.46937442e-01  1.23500383e+00 -2.50854611e-01 ...  1.95312500e-03
   1.99608481e+00  4.30976301e-02]
 [-1.21936493e-01 -3.48523021e-01 -5.55810153e-01 ...  6.34765625e-03
   1.98907959e+00  1.92883536e-02]]


In [14]:
# for first test we want to have 32 bits represantations
R = generate_random_matrix(16, 518)

In [15]:
# dot product (this shows us, on which side of the hyperplane it is)
X_train_zero_dot = np.dot(X_train, R.T)

In [16]:
# now we basically say everything greateer 0 is true and therefore a one
# so we get a represantation in binary how the track data behaves with respect to R
X_train_zero_dot = X_train_zero_dot > 0
X_train_zero_dot = X_train_zero_dot.astype(int)
get_shape(X_train_zero_dot)
len(X_train_zero_dot)


Number of rows: 11912, Number of columns: 16


11912

In [17]:
# from the youtube video

buckets = {}

for i in range(len(X_train_zero_dot)):
    hash_str = ''.join(X_train_zero_dot[i].astype(str))
    if hash_str not in buckets.keys():
        buckets[hash_str] = []
    buckets[hash_str].append(i)

# print each bucket into owwn line for better readability
for key, value in buckets.items():
    print(key, value)

0111001010010101 [0, 16, 17, 26, 31, 32, 41, 42, 47, 48, 49, 50, 52, 55, 56, 62, 63, 64, 65, 72, 76, 79, 80, 86, 88, 111, 114, 115, 119, 122, 125, 126, 127, 140, 141, 143, 152, 153, 155, 156, 157, 175, 178, 179, 180, 181, 203, 224, 225, 227, 228, 229, 230, 231, 232, 235, 242, 260, 261, 262, 264, 277, 306, 335, 336, 339, 342, 347, 348, 373, 377, 380, 381, 384, 387, 413, 414, 415, 416, 417, 424, 426, 427, 441, 444, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 464, 471, 479, 486, 489, 490, 491, 492, 493, 494, 496, 497, 498, 500, 505, 506, 513, 514, 520, 522, 531, 532, 534, 564, 582, 583, 591, 592, 594, 596, 598, 603, 616, 630, 638, 644, 650, 651, 652, 657, 658, 665, 666, 667, 669, 670, 672, 673, 674, 676, 677, 679, 681, 683, 684, 685, 686, 687, 689, 690, 692, 693, 695, 698, 699, 700, 701, 702, 708, 723, 724, 725, 730, 736, 740, 748, 749, 750, 757, 759, 762, 763, 764, 767, 774, 775, 784, 787, 789, 791, 798, 802, 807, 812, 827, 841, 844, 845, 846, 847, 849, 851, 852, 853, 854, 856