<a href="https://colab.research.google.com/github/lovrodukic/music-recommendation/blob/main/notebooks/recommender_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

In [None]:
!pip install numpy pandas faiss-cpu requests sentence_transformers

In [None]:
import pandas as pd

# Read the song table from 'music.csv' and discard its 'link' column in one step.
df = pd.read_csv('music.csv').drop(columns=['link'])
df

In [None]:
def create_textual_representation(row):
    """Build the text description of one song.

    Args:
        row: Mapping-like record (for example a DataFrame row) that can be
            indexed with the keys 'artist', 'song' and 'text'.

    Returns:
        A string with three "Label: value" entries (Artist, Song, Text),
        separated by a comma followed by a newline.
    """
    labelled_fields = (
        ('Artist', row['artist']),
        ('Song', row['song']),
        ('Text', row['text']),
    )
    return ",\n".join(f"{label}: {value}" for label, value in labelled_fields)

# Compute the text description row by row, store it as a new column,
# and print the first entry as a quick sanity check.
df['textual_representation'] = df.apply(create_textual_representation, axis=1)
print(df['textual_representation'].iloc[0])

In [None]:
import faiss
import requests
import numpy as np
from sentence_transformers import SentenceTransformer

def create_database(texts=None,
                    model_name='sentence-transformers/all-MiniLM-L6-v2',
                    batch_size=100):
    """Embed song texts and load them into a flat L2 FAISS index.

    Args:
        texts: Iterable of strings to embed. When omitted, the notebook-level
            ``df['textual_representation']`` column is used, so the
            zero-argument call behaves as before.
        model_name: Name of the Sentence-Transformers model to load.
        batch_size: Number of texts passed to ``model.encode`` per call. A
            progress line is printed before each batch.

    Returns:
        A ``faiss.IndexFlatL2`` containing one vector per text, added in
        input order (vector ``i`` belongs to text ``i``).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if texts is None:
        texts = df['textual_representation']
    texts = list(texts)

    model = SentenceTransformer(model_name)
    # Take the vector width from the model itself so the index can never
    # disagree with the embeddings when a different model is chosen.
    dim = model.get_sentence_embedding_dimension()

    # Initialize FAISS index
    index = faiss.IndexFlatL2(dim)
    # Pre-allocated float32 matrix; encoded batches are cast on assignment.
    X = np.zeros((len(texts), dim), dtype='float32')

    # Encode in batches rather than one text per call: the model is invoked
    # far fewer times while the printed progress stays the same.
    for start in range(0, len(texts), batch_size):
        print(f"Processed {start} instances")
        stop = start + batch_size
        X[start:stop] = model.encode(texts[start:stop], convert_to_numpy=True)

    index.add(X)
    return index

# Build the FAISS index and write it to the file 'index' in the working directory.
faiss_index = create_database()
faiss.write_index(faiss_index, 'index')

In [None]:
# Load the FAISS index back from the file 'index'.
index = faiss.read_index('index')

In [None]:
# Show the rows whose song title contains 'Bohemian'.
df.loc[df['song'].str.contains('Bohemian')]

In [None]:
def find_song(favorite_song, index, df, n_recommendations=5, model=None):
    """Return the nearest neighbours of a song in the vector index.

    Args:
        favorite_song: Row (``pandas.Series``) of the query song. Its
            ``'textual_representation'`` entry is embedded, and its ``name``
            is compared against the ids returned by ``index`` so the song
            does not recommend itself.
        index: Object exposing a FAISS-style ``search(queries, k)`` method.
        df: Unused; kept so existing calls keep working.
        n_recommendations: Maximum number of neighbours to return.
        model: Optional pre-loaded encoder with an ``encode(text)`` method.
            When omitted, the MiniLM model is loaded on every call, as
            before; pass a model to avoid that repeated cost.

    Returns:
        ``(distances, indices)``: two 1-D arrays of equal length (at most
        ``n_recommendations``), in the order returned by ``index.search``.
        Both are empty when no neighbour qualifies.
    """
    if n_recommendations < 1:
        return np.array([], dtype='float32'), np.array([], dtype='int64')

    if model is None:
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Single query row, explicitly float32, shaped (1, dim) for the search call.
    query = np.asarray(
        model.encode(favorite_song['textual_representation']),
        dtype='float32',
    ).reshape(1, -1)

    # Ask for one extra neighbour because the query song itself is expected
    # to be among the hits and is removed below.
    D, I = index.search(query, n_recommendations + 1)
    distances = np.asarray(D).ravel()
    indices = np.asarray(I).ravel()

    # Drop the query song, and drop negative ids: FAISS pads with -1 when it
    # finds fewer than k neighbours, and -1 would otherwise select the last
    # row when the caller indexes positionally.
    keep = (indices >= 0) & (indices != favorite_song.name)
    return distances[keep][:n_recommendations], indices[keep][:n_recommendations]

# Pick the query song by its row position in df.
favorite_song = df.iloc[49284]

# Only the neighbour ids are needed here; the distances are discarded.
_, I = find_song(favorite_song, index, df, n_recommendations=5)

# Turn the returned ids into the matching rows of df (positional lookup).
top_indices = I.flatten()
top_recommendations = df.iloc[top_indices]

# Print recommendations
print(f"Top recommendations based on '{favorite_song.song}' by {favorite_song.artist}")
for i, row in enumerate(top_recommendations.to_dict('records'), start=1):
    print(f"{i}. {row['artist']} - {row['song']}")