In [6]:
# Imports
import pandas
import numpy as np

In [7]:
# Helper for getting shapes of arrays
def get_shape(array):
    shape = array.shape
    rows, columns = shape
    print(f"Number of rows: {rows}, Number of columns: {columns}")
    
#For the random projection matrix R we use construction method introduced by Achlioptas
def generate_random_matrix(m, n):
    rij = np.random.choice([-1, 0, 1], size=(m, n), p=[1/6, 2/3, 1/6])
    return np.sqrt(3) * rij



In [11]:
# Getting the data
df_tracks = pandas.read_csv('tracks.csv',index_col=0, header=[0, 1])
df_tracks = df_tracks[df_tracks['set']['subset'] == 'medium']
df_feautures = pandas.read_csv('features.csv',index_col=0, header=[0, 1, 2])


In [14]:
# Building train, test, validation sets
# filter data only for these genres
# Hip-Hop, Pop, Folk, Rock, Experimental,
# International, Electronic, and Instrumental.
# tut nix eigentlich
df_tracks = df_tracks[df_tracks['track']['genre_top'].isin(['Hip-Hop', 'Pop', 'Folk', 'Rock', 'Experimental', 'International', 'Electronic', 'Instrumental'])]


# split track dataframe into training and test data based on column with index 30
df_tracks_train = df_tracks[df_tracks.iloc[:, 30] == 'training']
df_tracks_test = df_tracks[df_tracks.iloc[:, 30] == 'test']
df_tracks_validation = df_tracks[df_tracks.iloc[:, 30] == 'validation']

#filter features data accordingly to the track data
df_features_train = df_feautures[df_feautures.index.isin(df_tracks_train.index)]
df_features_test = df_feautures[df_feautures.index.isin(df_tracks_test.index)]
df_features_validation = df_feautures[df_feautures.index.isin(df_tracks_validation.index)]

# get X
X_train = df_features_train.values
X_test = df_features_test.values
X_validation = df_features_validation.values

# get y
y_train = df_tracks_train['track']['genre_top']
y_test = df_tracks_test['track']['genre_top']
y_validation = df_tracks_validation['track']['genre_top']

In [20]:
# evaluate shape
get_shape(X_train[0:1])

Number of rows: 1, Number of columns: 518


In [37]:
# for first test we want to have 32 bits represantations
R = generate_random_matrix(8, 518)

In [38]:
# dot product (this shows us, on which side of the hyperplane it is)
X_train_zero_dot = np.dot(X_train, R.T)

In [39]:
# now we basically say everything greateer 0 is true and therefore a one
# so we get a represantation in binary how the track data behaves with respect to R
X_train_zero_dot = X_train_zero_dot > 0
X_train_zero_dot = X_train_zero_dot.astype(int)
get_shape(X_train_zero_dot)
len(X_train_zero_dot)

Number of rows: 11912, Number of columns: 8


11912

In [40]:
# from the youtube video

buckets = {}

for i in range(len(X_train_zero_dot)):
    hash_str = ''.join(X_train_zero_dot[i].astype(str))
    if hash_str not in buckets.keys():
        buckets[hash_str] = []
    buckets[hash_str].append(i)

print(buckets)

{'10101010': [0, 72, 563, 741, 757, 1380, 1539, 1704, 2693, 2990, 3029, 3467, 3474, 3890, 4094, 4395, 4605, 4726, 4985, 5430, 6500, 6517, 6583, 6896, 7155, 7251, 7428, 7651, 7898, 8005, 8456, 8740, 8818, 8820, 8871, 9026, 9259, 9345, 9737, 9926, 10063, 10461, 10634, 10791, 10917, 11011, 11174, 11314, 11805], '01001010': [1, 373, 960, 2083, 2471, 2567, 2748, 2749, 2926, 2929, 3928, 3954, 4056, 4375, 4615, 4880, 4966, 5040, 5194, 5223, 5377, 5428, 5553, 5653, 5663, 5669, 5709, 5773, 6042, 6105, 6210, 6385, 6410, 6716, 7196, 7225, 7395, 7574, 8087, 8212, 8472, 8698, 9242, 9416, 10443, 10601, 10612, 10618, 10713, 10793, 10899, 10981, 11041, 11495, 11906], '00100111': [2, 199, 217, 223, 226, 335, 336, 342, 382, 396, 931, 997, 1220, 1240, 1246, 1263, 1283, 1284, 1285, 1385, 1473, 1687, 1696, 1781, 1896, 2040, 2061, 2212, 2566, 2641, 2742, 2772, 2839, 3415, 3534, 3713, 3871, 3973, 4076, 4138, 4259, 4542, 4776, 4938, 5182, 5381, 5388, 5530, 5870, 6194, 6361, 6426, 6625, 6816, 7482, 7512, 7515,

In [16]:
# transform y to numerical values
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
y_train = labelencoder.fit_transform(y_train)
y_test = labelencoder.fit_transform(y_test)
y_validation = labelencoder.fit_transform(y_validation)

# transform x to numerical values
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
X_validation = scaler.fit_transform(X_validation)

print(X_train[0])

# create min hashing model without using any libaries

# create a hash function
def hash_function(x, a, b, c, prime):
    return (a * x + b) % prime % c

# create a function to create a signature matrix

def create_signature_matrix(X, num_hashes):
    # create random matrix with the same size as the input matrix
    random_matrix = generate_random_matrix(X.shape[1], num_hashes)

    # create a prime number
    prime = 7841	
    # create a list to store the signature matrix
    signature_matrix = []
    # create a list to store the hash functions
    hash_functions = []
    # create n hash functions
    for i in range(num_hashes):
        a = np.random.randint(1, prime)
        b = np.random.randint(0, prime)
        hash_functions.append((a, b))
    



# test signature matrix function
signature_matrix = create_signature_matrix(X_train, 4)
print(signature_matrix.shape)

[ 1.01726571e+00  4.60213336e-01  8.22902609e-03  6.37051737e-01
  7.83834132e-01 -6.58614344e-02  7.70422879e-01  4.52161936e-01
 -1.91689964e-02  2.03529780e-01  3.76915193e-01  4.88544648e-01
  7.21500111e-01 -1.12798948e-02 -2.88324314e-01 -3.25229944e-02
 -1.76476360e+00 -1.45023112e+00 -4.90161134e-01  6.60385045e-01
 -8.94255097e-01 -8.31766982e-01  2.32852592e-01 -5.11575080e-01
 -3.40987673e-01 -3.40889123e-01 -3.54730334e-01 -5.92176784e-01
 -5.31633875e-01 -9.88442946e-02  1.78399177e+00  2.22896787e+00
  8.48482748e-01 -6.05811008e-02  4.02673071e-01 -1.13203646e-01
 -2.29801595e-01 -2.42670463e-01 -3.93619009e-01 -6.23484328e-01
 -2.56403210e-01  2.33538736e-01  1.92960962e+00  2.26937665e+00
  8.13383356e-01  2.30100398e-02  3.56231544e-01  2.02758098e-02
 -3.06810386e-01 -3.16487139e-01 -2.98052486e-01 -3.40767945e-01
 -3.24611990e-01 -3.27754151e-01 -3.07835094e-01 -2.95492998e-01
 -3.13390864e-01 -2.91468029e-01 -3.04757240e-01 -2.75378478e-01
  6.61933981e-01  5.78589

AttributeError: 'NoneType' object has no attribute 'shape'