# Programming Assignment 1
## Genre Classification using Locality Sensitive Hashing (LSH)


In [24]:
#Imports

import pandas as pd
import numpy as np

### Data Loading and Preprocessing 

In [25]:
"""
TODO: Do the indcies of X_train get mixed up? I think this
"""
# Load data
df_tracks = pd.read_csv('tracks.csv', index_col=0, header=[0, 1])
df_tracks = df_tracks[df_tracks['set']['subset'] == 'medium']
df_features = pd.read_csv('features.csv', index_col=0, header=[0, 1, 2])

# Filter by genres
df_tracks = df_tracks[df_tracks['track']['genre_top'].isin(['Hip-Hop', 'Pop', 'Folk', 'Rock', 'Experimental', 'International', 'Electronic', 'Instrumental'])]

# Split df_tracks into training, testing, and validation sets
df_tracks_train = df_tracks[df_tracks.iloc[:, 30] == 'training']
df_tracks_test = df_tracks[df_tracks.iloc[:, 30] == 'test']
df_tracks_validation = df_tracks[df_tracks.iloc[:, 30] == 'validation']

# Match features with tracks for training, testing, and validation
df_features_train = df_features[df_features.index.isin(df_tracks_train.index)]
df_features_test = df_features[df_features.index.isin(df_tracks_test.index)]
df_features_validation = df_features[df_features.index.isin(df_tracks_validation.index)]

# Extract feature values
X_train = df_features_train.values
X_test = df_features_test.values
X_validation = df_features_validation.values

# Extract genre labels
y_train = df_tracks_train['track']['genre_top']
y_test = df_tracks_test['track']['genre_top']
y_validation = df_tracks_validation['track']['genre_top']

In [26]:
print(y_train)
print(X_train)

track_id
3              Hip-Hop
134            Hip-Hop
136               Rock
139               Folk
198               Folk
              ...     
155297    Instrumental
155298            Folk
155306            Folk
155307    Experimental
155314            Rock
Name: genre_top, Length: 11912, dtype: object
[[ 1.88896334e+00  7.60539293e-01  3.45296562e-01 ...  0.00000000e+00
   1.71672380e+00  6.93301633e-02]
 [ 9.18444753e-01  6.74147248e-01  5.77818275e-01 ...  0.00000000e+00
   1.80610597e+00  5.46228550e-02]
 [ 9.15000617e-01 -6.43476248e-01 -4.60507214e-01 ...  3.41796875e-03
   8.05020452e-01  1.69045236e-02]
 ...
 [ 6.44815028e-01 -8.79404128e-01 -1.14923191e+00 ...  4.88281250e-03
   3.17907929e+00  2.20229235e-02]
 [-4.46937442e-01  1.23500383e+00 -2.50854611e-01 ...  1.95312500e-03
   1.99608481e+00  4.30976301e-02]
 [-1.21936493e-01 -3.48523021e-01 -5.55810153e-01 ...  6.34765625e-03
   1.98907959e+00  1.92883536e-02]]


### Random Projection Matrix

In [34]:
# r_i = rowsize, r_j) = columsize
def generate_random_matrix(r_i, r_j):
    rij = np.random.choice([-1, 0, 1], size=(r_i, r_j), p=[1/6, 2/3, 1/6])
    return np.sqrt(3) * rij

### Hashtable generator function

We use the transpose of the Random Projection Matrix to reduce the dimensionality  and determine the orientation of each track's data relative to the hyperplanes by using the dot Product of the feature matrix and the transposed Random Projection Matrix. 
Then we use the binary representations of the orientations as a bucket and put in the tracks accordingly. 
$ \begin{cases} 
0 & \text{ if } x < 0 \\
1 & \text{ else}
\end{cases}
$ 
We can do this because of $\mathbf{a} \cdot \mathbf{b} = \|\mathbf{a}\| \|\mathbf{b}\| \cos(\theta)$ positive means on one side and negative on the other.
This whole process represents one hashtable.

In [28]:
"""
The binary representations are of length l.
And the number of hashtables we creat is equal to n.
"""
def hashtable_generator(X, l=32, n=20):
    hash_tables_and_matrices = []  
    for _ in range(n):
        buckets = {}
        random_matrix = generate_random_matrix(l, X.shape[1])
        X_dot = np.dot(X, random_matrix.T)
        X_dot = X_dot > 0
        X_dot = X_dot.astype(int)

        for i in range(len(X_dot)):
            hash_str = ''.join(X_dot[i].astype(str))
            if hash_str not in buckets:
                buckets[hash_str] = []
            buckets[hash_str].append(i)
        
        hash_tables_and_matrices.append((buckets, random_matrix))
    
    return hash_tables_and_matrices


### Similar Songs Finder
In this step we use the computed hash_tables and the according matrices to find all similar songs of the input song. 
> A music track is defined as similar if it is in the same bucket as $t_i$ in one of the $n$ hash tables.


In [29]:
def find_similar_songs(song_input, hash_tables_and_matrices):
    similar_songs_indices = set()

    for buckets, random_matrix in hash_tables_and_matrices:
        song_projected = np.dot(song_input, random_matrix.T) > 0
        song_hash = ''.join(song_projected.astype(int).astype(str))

        if song_hash in buckets:
            similar_songs_indices.update(buckets[song_hash])

    return list(similar_songs_indices)

same but for multiple times

In [30]:
def find_matching_song_multiple( times, song_input):
    found_categories = []
    for _ in range(times):
        print("doing it times ", _)
        local_categories = find_similar_songs(song_input,hashtable_generator(X_train))
        if len(local_categories) == 0:
            continue
        genres = []
        for element in local_categories:
            genres.append(y_train.iloc[element[0]])
        found_categories.append(max(set(genres), key=genres.count))
        #TODO check if reset necessary
        #self.reset()
    return max(set(found_categories), key=found_categories.count)

### Distance Computation of Similar Songs
This function computes the distance of all similar Songs to the input Song.

In [31]:
def compute_distances(X, song_input, similar_songs, metric="euclid", cut=10):
    filtered_songs = []
    if metric == "euclid":
        for element in similar_songs:
            distance = np.linalg.norm(X[element] - song_input)
            filtered_songs.append((element, distance))
    elif metric == "cosine":
        for element in similar_songs:
            # cosine similarity
            dot_product = np.dot(X[element], song_input)
            norm_song = np.linalg.norm(X[element])
            norm_input = np.linalg.norm(song_input)
            similarity = dot_product / (norm_song * norm_input)
            
            # From similarity to distance (cosine distance)
            distance = 1 - similarity
            filtered_songs.append((element, distance))
    else:
        raise ValueError("Invalid metric specified. Use 'euclid' or 'cosine'.")
    
    sorted_songs = sorted(filtered_songs, key=lambda x: x[1])
    if cut is not None:
        sorted_songs = sorted_songs[:cut]
    
    return [index for index, _ in sorted_songs]

### Getting the Genre by Majority vote

In [32]:
def determine_genre_by_majority_vote(song_indices, Y):
    # Filter song_indices to ensure they are within the range of df_genres' index
    valid_indices = [i for i in song_indices if i in Y.index]
    
    # Extract the genres for the given (valid) indices
    genres = Y.loc[valid_indices].values
    
    # Count the occurrence of each genre
    genre_counts = {}
    for genre in genres:
        genre_counts[genre] = genre_counts.get(genre, 0) + 1
    
    # Determine the genre with the majority vote
    majority_genre = max(genre_counts, key=genre_counts.get)
    
    return majority_genre


In [33]:
determine_genre_by_majority_vote(matching_s, y_train)

NameError: name 'matching_s' is not defined

In [42]:
def test_and_validation_accuracy_with_find_matching_songs_multiple_optimized(times):
    correct = 0
    two_d_array = [[0 for _ in range(times)] for _ in range(len(X_test))]
    hash_tables_and_matrices = [hashtable_generator(X_train) for _ in range(times)]
    
    for _ in range(times):
        print("iteration", _)
        for i in range(len(X_test)):
            two_d_array[i][_] = determine_genre_by_majority_vote(X_test[i], hash_tables_and_matrices[_])
        for i in range(len(X_validation)):
            two_d_array[i][_] = determine_genre_by_majority_vote(X_validation[i], hash_tables_and_matrices[_])
    
    for i in range(len(X_test)):
        print("iteration part2")
        genres = [two_d_array[i][_] for _ in range(times)]
        if max(set(genres), key=genres.count) == y_test.iloc[i]:
            correct += 1
    for i in range(len(X_validation)):
        genres = [two_d_array[i][_] for _ in range(times)]
        if max(set(genres), key=genres.count) == y_validation.iloc[i]:
            correct += 1
    
    accuracy = correct / (len(X_test)+len(X_validation))
    #print(f"Accuracy Test set advanced: {accuracy}")
    return accuracy




In [43]:
def test_accuracy_with_find_matching_songs_multiple_optimized(times):
    correct = 0
    two_d_array = [[0 for _ in range(times)] for _ in range(len(X_test))]
    hash_tables_and_matrices = [hashtable_generator(X_train) for _ in range(times)]
    
    for _ in range(times):
        print("iteration", _)
        for i in range(len(X_test)):
            two_d_array[i][_] = determine_genre_by_majority_vote(X_test[i], hash_tables_and_matrices[_])
    
    for i in range(len(X_test)):
        print("iteration part2")
        genres = [two_d_array[i][_] for _ in range(times)]
        if max(set(genres), key=genres.count) == y_test.iloc[i]:
            correct += 1
    
    accuracy = correct / len(X_test)
    #print(f"Accuracy Test set advanced: {accuracy}")
    return accuracy

test_accuracy_with_find_matching_songs_multiple_optimized(2)


iteration 0


TypeError: argument of type 'builtin_function_or_method' is not iterable