# Programming Assignment 1
## Genre Classification using Locality Sensitive Hashing (LSH)


In [4]:
#Imports

import pandas as pd
import numpy as np

### Data Loading and Preprocessing 

In [10]:
# Load data: tracks.csv has a two-level column header, features.csv a three-level one.
df_tracks = pd.read_csv('tracks.csv', index_col=0, header=[0, 1])
df_tracks = df_tracks[df_tracks[('set', 'subset')] == 'medium']
df_features = pd.read_csv('features.csv', index_col=0, header=[0, 1, 2])

# Filter by genres
TOP_GENRES = ['Hip-Hop', 'Pop', 'Folk', 'Rock', 'Experimental', 'International', 'Electronic', 'Instrumental']
df_tracks = df_tracks[df_tracks[('track', 'genre_top')].isin(TOP_GENRES)]

# Drop tracks that have no feature row, so every label below has a matching feature vector.
df_tracks = df_tracks[df_tracks.index.isin(df_features.index)]

# Split df_tracks into training, testing, and validation sets.
# NOTE(review): the split column used to be selected positionally (column 30);
# ('set', 'split') is assumed to be that same column -- confirm against tracks.csv.
track_split = df_tracks[('set', 'split')]
df_tracks_train = df_tracks[track_split == 'training']
df_tracks_test = df_tracks[track_split == 'test']
df_tracks_validation = df_tracks[track_split == 'validation']

# Match features with tracks for training, testing, and validation.
# Selecting by the track index (instead of filtering df_features with isin) makes the
# feature rows follow the order of the label rows rather than the order of features.csv.
df_features_train = df_features.loc[df_tracks_train.index]
df_features_test = df_features.loc[df_tracks_test.index]
df_features_validation = df_features.loc[df_tracks_validation.index]

# Fail loudly if features and labels are not aligned row by row (e.g. duplicated track ids).
assert df_features_train.index.equals(df_tracks_train.index)
assert df_features_test.index.equals(df_tracks_test.index)
assert df_features_validation.index.equals(df_tracks_validation.index)

# Extract feature values
X_train = df_features_train.values
X_test = df_features_test.values
X_validation = df_features_validation.values

# Extract genre labels (row i of y_* belongs to row i of the corresponding X_*)
y_train = df_tracks_train['track']['genre_top']
y_test = df_tracks_test['track']['genre_top']
y_validation = df_tracks_validation['track']['genre_top']

### Random Projection Matrix

In [5]:
def generate_random_matrix(r_i, r_j, rng=None):
    """Draw a sparse random projection matrix.

    Every entry is drawn independently from {-1, 0, +1} with probabilities
    1/6, 2/3 and 1/6, and the result is scaled by sqrt(3).

    Parameters
    ----------
    r_i : int
        Number of rows.
    r_j : int
        Number of columns.
    rng : numpy.random.Generator, optional
        Source of randomness, e.g. ``np.random.default_rng(42)``, for
        reproducible matrices. When omitted, NumPy's global random state is
        used (seedable via ``np.random.seed``), as before.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(r_i, r_j)`` whose entries are ``-sqrt(3)``,
        ``0`` or ``+sqrt(3)``.
    """
    sampler = np.random if rng is None else rng
    rij = sampler.choice([-1, 0, 1], size=(r_i, r_j), p=[1/6, 2/3, 1/6])
    return np.sqrt(3) * rij

### Hashtable generator function

We project the feature matrix onto random hyperplanes by taking the dot product of the feature matrix with the transposed random projection matrix. This reduces the dimensionality and tells us, for each track, on which side of each hyperplane its feature vector lies.
The projections are then binarized with the rule below, and the resulting bit string is used as the key of the bucket in which the track is stored.
$$
h(x) = \begin{cases}
0 & \text{if } x < 0 \\
1 & \text{otherwise}
\end{cases}
$$
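This works because $\mathbf{a} \cdot \mathbf{b} = \|\mathbf{a}\| \|\mathbf{b}\| \cos(\theta)$: the sign of the dot product depends only on the angle $\theta$ between the two vectors, so a positive value means the track lies on one side of the hyperplane and a negative value means it lies on the other.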
This whole process represents one hashtable.

In [16]:
"""
The binary representations (bucket keys) have length l.
The number of hash tables we create is equal to n.
"""
def hashtable_generator(X, l=8, n=10, random_matrices=None):
    """Build LSH hash tables by projecting the rows of X onto random hyperplanes.

    For each projection matrix, every row of X is multiplied with the
    transposed matrix; each resulting value is turned into a bit (0 if the
    value is negative, 1 otherwise) and the bits of a row are concatenated
    into a string that serves as the bucket key.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix, one row per item.
    l : int, default 8
        Number of hyperplanes per table, i.e. the length of each bucket key.
        Ignored when ``random_matrices`` is given.
    n : int, default 10
        Number of hash tables to create. Ignored when ``random_matrices`` is
        given.
    random_matrices : iterable of numpy.ndarray, optional
        Projection matrices, each with ``X.shape[1]`` columns, to use instead
        of freshly generated ones. Passing the same matrices for two datasets
        hashes both into the same bucket keys.

    Returns
    -------
    list of dict
        One dict per table, mapping bucket key (str of '0'/'1') to the list
        of row indices of X that fall into that bucket.
    """
    if random_matrices is None:
        random_matrices = [generate_random_matrix(l, X.shape[1]) for _ in range(n)]

    hash_tables = []
    for random_matrix in random_matrices:
        # ">= 0" implements "0 if x < 0, 1 otherwise": a projection of exactly
        # zero is assigned bit 1.
        bits = (np.dot(X, random_matrix.T) >= 0).astype(int)

        buckets = {}
        for row_idx, row_bits in enumerate(bits.tolist()):
            hash_str = ''.join(map(str, row_bits))
            buckets.setdefault(hash_str, []).append(row_idx)

        hash_tables.append(buckets)

    return hash_tables

[{'01110001': [0, 1, 2, 3, 4]},
 {'01011111': [0], '01011101': [1, 3, 4], '01010111': [2]},
 {'00111111': [0, 2, 3, 4], '00111110': [1]},
 {'00000111': [0], '00000101': [1, 4], '00010110': [2], '00010100': [3]},
 {'11110110': [0, 1, 2, 3, 4]},
 {'01011010': [0, 1, 3, 4], '01010010': [2]},
 {'11011110': [0, 1, 2, 3, 4]},
 {'01110000': [0, 1, 2, 3, 4]},
 {'01111111': [0, 1, 2, 3], '11111111': [4]},
 {'11000101': [0, 1], '11000001': [2, 3, 4]}]

In [ ]:
    for key, value in self.buckets.items():
        self.genre_counts[key] = {}
        for i in range(len(value)):
            genre = self.y_train.iloc[value[i]]
            if genre not in self.genre_counts[key]:
                self.genre_counts[key][genre] = 1
            else:
                self.genre_counts[key][genre] += 1

    for key, value in self.genre_counts.items():
        self.genre_counts[key] = {k: v for k, v in sorted(value.items(), key=lambda item: item[1], reverse=True)}

    for key, value in self.genre_counts.items():
        self.bucket_genres[key] = list(value.keys())[0]