<a href="https://colab.research.google.com/github/lovrodukic/music-recommendation/blob/main/notebooks/recommender_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

In [None]:
!pip install numpy pandas faiss-cpu requests sentence_transformers

In [21]:
import pandas as pd

# Load the lyrics dataset and discard its 'link' column in a single step.
df = pd.read_csv('music.csv').drop(columns=['link'])

# ---------------------------------------------------------------------------
# Disabled experiment (never executed): lower-case/strip the artist and song
# names, then inner-join a second CSV of audio features on those two keys.
# ---------------------------------------------------------------------------
# df['artist'] = df['artist'].str.lower().str.strip()
# df['song'] = df['song'].str.lower().str.strip()

# features_df = pd.read_csv('music_features.csv')
# features_df['artist_name'] = features_df['artist_name'].str.lower().str.strip()
# features_df['track_name'] = features_df['track_name'].str.lower().str.strip()

# merged_df = pd.merge(
#     df,  # Base dataset
#     features_df,  # Features dataset
#     left_on=['artist', 'song'],  # Columns from df
#     right_on=['artist_name', 'track_name'],  # Columns from features_df
#     how='inner'  # Keep only rows where there is a match
# )

# # Select and combine relevant columns
# final_df = merged_df[[
#     'artist', 'song', 'lyrics', 'genre',  # Keep these from the merged dataset
#     *[col for col in features_df.columns if col not in ['artist_name', 'track_name', 'lyrics', 'genre']]
# ]]

# Last expression of the cell: render the loaded frame as the cell output.
df

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...
...,...,...,...
57645,Ziggy Marley,Good Old Days,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,northern star \r\nam i frightened \r\nwhere ...


In [29]:
def create_textual_representation(row):
    """Render one song row as the text that gets embedded.

    Args:
        row: Mapping-like object (dict, pandas Series, ...) providing the
            keys 'artist', 'song' and 'text'.

    Returns:
        A three-line string: "Artist: ...,", "Song: ...," and "Text: ...",
        separated by newlines. The values are interpolated as-is, so any
        line breaks inside the lyrics are preserved.
    """
    lines = (
        f"Artist: {row['artist']},",
        f"Song: {row['song']},",
        f"Text: {row['text']}",
    )
    return "\n".join(lines)

# Row-wise apply: build the embedding-ready string for every song and keep it
# in a new 'textual_representation' column of the same frame.
df['textual_representation'] = df.apply(create_textual_representation, axis='columns')

# Sanity check: print the representation generated for the first row.
print(df['textual_representation'].iloc[0])

Artist: ABBA,
Song: Ahe's My Kind Of Girl,
Text: Look at her face, it's a wonderful face  
And it means something special to me  
Look at the way that she smiles when she sees me  
How lucky can one fellow be?  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?  
  
And when we go for a walk in the park  
And she holds me and squeezes my hand  
We'll go on walking for hours and talking  
About all the things that we plan  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?




In [None]:
import faiss
import requests
import numpy as np
from sentence_transformers import SentenceTransformer

def create_database(texts=None,
                    model_name='sentence-transformers/all-MiniLM-L6-v2',
                    batch_size=32):
    """Embed a corpus of texts and load the vectors into a flat L2 FAISS index.

    Args:
        texts: Iterable of strings to embed. When omitted, the notebook-level
            ``df['textual_representation']`` column is used, which is what
            the original zero-argument call did.
        model_name: Name of the SentenceTransformer model used for encoding.
        batch_size: Number of texts handed to the encoder per batch.

    Returns:
        A ``faiss.IndexFlatL2`` holding one vector per input text, in input
        order, so a vector's id in the index is the position of its text.
        The index is empty when ``texts`` is empty.
    """
    if texts is None:
        texts = df['textual_representation']
    texts = list(texts)

    model = SentenceTransformer(model_name)

    # Ask the model for its output width instead of hard-coding 384, so that
    # switching models cannot silently desynchronise the index dimension.
    dim = model.get_sentence_embedding_dimension()
    index = faiss.IndexFlatL2(dim)

    if not texts:
        return index

    # Encode in batches rather than one text per call; the encoder reports
    # progress itself, which replaces the manual "Processed i instances" prints.
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
    )

    # FAISS expects a C-contiguous float32 matrix of shape (n, dim).
    X = np.ascontiguousarray(embeddings, dtype='float32')
    index.add(X)
    return index

# Build the FAISS index from the song representations; this is the expensive
# step of the notebook because every text has to be embedded.
faiss_index = create_database()
# Persist the index to the file 'index' so it can be reloaded with
# faiss.read_index instead of being rebuilt.
faiss.write_index(faiss_index, 'index')

In [42]:
# Load the FAISS index stored in the file 'index' into the notebook-level
# name `index`.
index = faiss.read_index('index')

In [44]:
# Look up the rows whose song title contains 'Bohemian' (case-sensitive) to
# find the row label of the song used as the query further down.
df.loc[df['song'].str.contains('Bohemian')]

Unnamed: 0,artist,song,text,textual_representation
49284,Queen,Bohemian Rhapsody,Is this the real life? \r\nIs this just fanta...,"Artist: Queen,\nSong: Bohemian Rhapsody,\nText..."
51897,Soundtracks,Bohemian Like You - The Dandy Warhols,"You've got a great car \r\nYeah, what's wrong...","Artist: Soundtracks,\nSong: Bohemian Like You ..."
55888,Weird Al Yankovic,Bohemian Polka,Is this the real life \r\nIs this just fantas...,"Artist: Weird Al Yankovic,\nSong: Bohemian Pol..."


In [78]:
def find_song(favorite_song, index, df, n_recommendations=5, model=None):
    """Find the songs closest to ``favorite_song`` in the FAISS index.

    The song's ``'textual_representation'`` is embedded and used as the
    query. One extra neighbour is requested because the query song itself is
    expected to be among the hits; it is then removed from the results.

    Args:
        favorite_song: Row (e.g. a pandas Series) providing the key
            ``'textual_representation'`` and a ``.name`` label. The label is
            compared with the ids returned by the index, so it is assumed to
            equal the row's position in the index.
        index: Object exposing ``search(queries, k)``, e.g. a FAISS index.
        df: Not used by this function; kept so existing calls keep working.
        n_recommendations: Maximum number of neighbours to return.
        model: Optional encoder exposing ``encode(text)``. When omitted, the
            'sentence-transformers/all-MiniLM-L6-v2' model is loaded on every
            call; pass an already loaded model to avoid that cost.

    Returns:
        ``(distances, indices)``: two 1-D NumPy arrays of equal length,
        ordered as returned by the index. They hold fewer than
        ``n_recommendations`` entries (possibly none) when the index cannot
        supply enough neighbours.

    Raises:
        ValueError: If ``n_recommendations`` is negative.
    """
    if n_recommendations < 0:
        raise ValueError("n_recommendations must be non-negative")
    if n_recommendations == 0:
        # Nothing requested: skip loading the model and querying the index.
        return np.empty(0, dtype='float32'), np.empty(0, dtype='int64')

    if model is None:
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    favorite_song_text = favorite_song['textual_representation']
    # FAISS expects a float32 matrix of shape (n_queries, dim).
    favorite_song_embedding = np.asarray(
        model.encode(favorite_song_text), dtype='float32'
    ).reshape(1, -1)

    D, I = index.search(favorite_song_embedding, n_recommendations + 1)
    indices = I.flatten()
    distances = D.flatten()

    favorite_song_index = favorite_song.name

    # Drop the query song itself and the -1 ids FAISS uses as padding when
    # the index holds fewer than k vectors; a -1 passed on to df.iloc would
    # silently resolve to the last row.
    filtered_results = [
        (dist, idx)
        for dist, idx in zip(distances, indices)
        if idx >= 0 and idx != favorite_song_index
    ][:n_recommendations]

    if not filtered_results:
        # zip(*[]) cannot be unpacked into two names; return empty arrays.
        return (np.empty(0, dtype=distances.dtype),
                np.empty(0, dtype=indices.dtype))

    final_distances, final_indices = zip(*filtered_results)
    return np.array(final_distances), np.array(final_indices)

# Query song: the row at position 49284, which the title lookup earlier in
# the notebook lists as Queen's "Bohemian Rhapsody".
favorite_song = df.iloc[49284]

# Only the neighbour positions are needed here; the distances are discarded.
_, I = find_song(favorite_song, index, df, n_recommendations=5)

# Map the returned positions back to rows of the song table.
top_indices = I.ravel()
top_recommendations = df.iloc[top_indices]

# Print recommendations
print(f"Top recommendations based on '{favorite_song.song}' by {favorite_song.artist}")
for i, row in enumerate((record for _label, record in top_recommendations.iterrows()), start=1):
    print(f"{i}. {row['artist']} - {row['song']}")

Top recommendations based on 'Bohemian Rhapsody' by Queen
1. Weird Al Yankovic - Bohemian Polka
2. Queen - Mother Love
3. Roy Orbison - Mama
4. Ozzy Osbourne - Mama, I'm Coming Home
5. Genesis - Mama
