In [None]:
import os
import numpy as np
import pandas as pd
import polars as pl
from sklearn.model_selection import train_test_split
import logging
from colorama import Fore, Style, init
import warnings
import argparse

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices from pandas/polars/langchain.
warnings.filterwarnings('ignore')

# Root logger at INFO level; each record is wrapped in colorama's green/reset codes.
logging.basicConfig(level=logging.INFO, format=f'{Fore.GREEN}%(asctime)s - %(levelname)s - %(message)s{Style.RESET_ALL}')

# Initialize colorama
init(autoreset=True)

# Input locations; every path below is derived from global_path.
global_path = '../data'
triplet_path = f"{global_path}/train_triplets.txt"
unique_tracks_path = f"{global_path}/p02_unique_tracks.csv"
genre_path = f"{global_path}/p02_msd_tagtraum_cd2.cls"

In [None]:
def load_data(triplet_path, unique_tracks_path):
    """Read the listening triplets and track metadata and merge them.

    Args:
        triplet_path: Tab-separated file read as (user_id, song_id, play_count).
        unique_tracks_path: CSV file read as (track_id, song_id, artist, title).

    Returns:
        A pandas DataFrame with the columns user_id, song_id, track_id, song,
        play_count, user_idx and song_idx. Only triplets whose play_count is
        greater than 1 are kept. ``song`` is "<title> - <artist>".

    NOTE(review): both files are read with polars' default header handling and
    the columns are then renamed; if a file has no header row its first record
    would be consumed as the header -- confirm against the data files.
    NOTE(review): the left merge yields one row per matching track, so a
    song_id listed with several track_ids would duplicate plays -- confirm.
    """
    logging.info('Loading data...')

    plays = pl.read_csv(
        triplet_path,
        separator='\t',
        new_columns=['user_id', 'song_id', 'play_count'],
        use_pyarrow=True,
    )
    tracks = pl.read_csv(
        unique_tracks_path,
        new_columns=['track_id', 'song_id', 'artist', 'title'],
        use_pyarrow=True,
    )

    logging.info('Data loaded successfully.')

    logging.info('Merging songs...')

    # Drop single-play interactions before handing the data to pandas.
    plays = plays.filter(pl.col('play_count') > 1)
    songs = plays.to_pandas().merge(tracks.to_pandas(), on='song_id', how='left')
    songs['song'] = songs['title'] + ' - ' + songs['artist']
    songs = songs[['user_id', 'song_id', 'track_id', 'song', 'play_count']]

    # Integer codes assigned in order of first appearance, users first, then songs.
    for id_column, idx_column in (('user_id', 'user_idx'), ('song_id', 'song_idx')):
        codes, _ = pd.factorize(songs[id_column])
        songs[idx_column] = codes

    logging.info('Songs merged successfully.')

    del plays, tracks

    return songs

In [None]:
# Build the merged interaction table, then keep the index, title and count columns in X.
songs = load_data(triplet_path, unique_tracks_path)
X = songs[['user_idx', 'song_idx', 'song', 'play_count']]
X

[32m2024-05-29 13:32:54,990 - INFO - Loading data...[0m
[32m2024-05-29 13:32:58,590 - INFO - Data loaded successfully.[0m
[32m2024-05-29 13:32:58,598 - INFO - Merging songs...[0m
[32m2024-05-29 13:33:16,090 - INFO - Songs merged successfully.[0m


Unnamed: 0,user_idx,song_idx,song,play_count
0,0,0,Entre Dos Aguas - Paco De Lucia,2
1,0,1,12 segundos de oscuridad - Jorge Drexler,2
2,0,2,Apuesta Por El Rock 'N' Roll - Héroes del Sile...,5
3,0,3,I'll Be Missing You (Featuring Faith Evans & 1...,5
4,0,4,I?'m A Steady Rollin? Man - Robert Johnson,5
...,...,...,...,...
20151715,981153,1475,Représente - Alliance Ethnik,4
20151716,981153,2492,Addams Groove - MC Hammer,6
20151717,981153,2257,Go To Sleep - Eminem / DMX / Obie Trice,2
20151718,981153,55499,We're Back - Eminem / Obie Trice / Stat Quo / ...,2


In [None]:
# One row per song_idx: the first (song_idx, song) pair encountered in X is kept.
unique_songs_df = X[['song_idx', 'song']].drop_duplicates(subset='song_idx')
unique_songs_df

Unnamed: 0,song_idx,song
0,0,Entre Dos Aguas - Paco De Lucia
1,1,12 segundos de oscuridad - Jorge Drexler
2,2,Apuesta Por El Rock 'N' Roll - Héroes del Sile...
3,3,I'll Be Missing You (Featuring Faith Evans & 1...
4,4,I?'m A Steady Rollin? Man - Robert Johnson
...,...,...
20149147,333702,Whew - Simon & Garfunkel
20149475,333703,Then There's Me (LP Version) - Tim Rushlow
20149778,333704,Just For Me - Brio From Rio
20150189,333705,Rowena - Loudon Wainwright III


In [None]:
# Save two CSV files under global_path:
#   songs.csv   - the song_idx -> song lookup table
#   ratings.csv - the (user_idx, song_idx, play_count) interactions
unique_songs_df.to_csv(f"{global_path}/songs.csv", index=False)
songs[['user_idx', 'song_idx', 'play_count']].to_csv(f"{global_path}/ratings.csv", index=False)

In [None]:
# Per-song statistics of play_count: number of rows, mean and total.
plays_by_song = X.groupby('song_idx')['play_count']
num_ratings = plays_by_song.count()
mean_rating = plays_by_song.mean()
sum_ratings = plays_by_song.sum()

In [None]:
# Attach the per-song count and mean to the lookup table, matched on song_idx.
unique_songs_df['num_ratings'] = unique_songs_df['song_idx'].map(num_ratings)
unique_songs_df['mean_rating'] = unique_songs_df['song_idx'].map(mean_rating)

In [None]:
# Weight given to the global mean in the damped mean computed in the next cell.
damping_factor = 10
# Mean play_count over all rows of X.
global_mean_rating = X['play_count'].mean()

global_mean_rating, damping_factor

(5.588893900868015, 10)

In [None]:
# Damped mean per song:
#   (sum of play counts + damping_factor * global mean) / (number of rows + damping_factor)
# The fewer rows a song has, the closer its value is pulled to the global mean.
damped_numerator = sum_ratings + damping_factor * global_mean_rating
damped_denominator = num_ratings + damping_factor
damped_mean_rating = damped_numerator / damped_denominator
unique_songs_df['damped_mean_rating'] = unique_songs_df['song_idx'].map(damped_mean_rating)

In [None]:
# Ten songs with the most rows in X.
unique_songs_df.sort_values(by='num_ratings', ascending=False).head(10)

Unnamed: 0,song_idx,song,num_ratings,mean_rating,damped_mean_rating
79,76,Sehr kosmisch - Harmonia,65688,5.795153,5.795121
72,69,Undo - Björk,57660,10.673309,10.672427
74,71,You're The One - Dwight Yoakam,55035,12.681385,12.680096
73,70,Dog Days Are Over (Radio Edit) - Florence + Th...,52773,6.042143,6.042057
90,87,Use Somebody - Kings Of Leon,50044,4.742507,4.742676
87,84,Revelry - Kings Of Leon,48290,10.261483,10.260515
84,81,Secrets - OneRepublic,44874,5.775349,5.775307
77,74,Horn Concerto No. 4 in E flat K495: II. Romanc...,40470,8.916803,8.91598
101,97,Yellow - Coldplay,36272,4.905823,4.906011
55,55,Somebody To Love - Justin Bieber,35492,5.170855,5.170973


In [None]:
# Ten songs with the highest plain mean play_count.
unique_songs_df.sort_values(by='mean_rating', ascending=False).head(10)

Unnamed: 0,song_idx,song,num_ratings,mean_rating,damped_mean_rating
11508469,301284,Without Bill the Jedi Changed - Rolfe Kent,2,340.0,61.324078
17403049,325634,Thy Mercy - Caedmon's Call,1,339.0,35.898994
8291836,279839,T.K.O - James Taylor Quartet,1,300.0,32.35354
6852866,266800,Who Thought The Railroad Wouldn't Last - Jim L...,3,263.666667,65.145303
5499507,251661,Any Place I Hang My Hat Is Home - Audra McDonald,2,240.0,44.657412
1674231,170169,Craft (Dimensional Release) - Michael Stearns,1,233.0,26.262631
573632,107910,Je te laisse un sifflet - Garou,1,228.0,25.808085
19467714,331818,Get Up! - Helix,1,224.0,25.444449
7344834,271741,The Buzzard - Rye Coalition,1,191.0,22.444449
11158,8198,Words - Jack the Ripper,11,183.454545,98.756616


In [None]:
# Ten songs with the highest damped mean play_count.
unique_songs_df.sort_values(by='damped_mean_rating', ascending=False).head(10)

Unnamed: 0,song_idx,song,num_ratings,mean_rating,damped_mean_rating
11158,8198,Words - Jack the Ripper,11,183.454545,98.756616
6852866,266800,Who Thought The Railroad Wouldn't Last - Jim L...,3,263.666667,65.145303
66143,31926,My Prayer - Ray_ Goodman & Brown,6,159.333333,63.243059
11508469,301284,Without Bill the Jedi Changed - Rolfe Kent,2,340.0,61.324078
764195,123346,Eyen [Chosen by fans on Warp20.net] - Plaid,8,129.625,60.716052
975102,136819,Silhouettes - The Ronettes,27,70.481481,52.942944
176841,58615,Lied To - Against All Authority,6,131.0,52.618059
275429,75384,Looking For - Ann Lee,16,78.375,50.380344
218395,66279,Thais II - This Mortal Coil,18,72.277778,48.460319
2537962,197573,Hurry Xmas - L'Arc~en~Ciel,11,80.363636,44.756616


## Using LLMs to recommend songs

## Load the data & data preparation

In [None]:
def load_data(triplet_path, unique_tracks_path, genre_path):
    """Load the triplets, the track metadata and the genre annotations.

    This three-argument definition replaces the two-argument ``load_data``
    defined earlier in the notebook once this cell has run.

    Args:
        triplet_path: Tab-separated file read as (user_id, song_id, play_count).
        unique_tracks_path: CSV file read as (track_id, song_id, artist, title).
        genre_path: Tab-separated file read as
            (track_id, majority_genre, minority_genre); lines starting with
            '#' are treated as comments.

    Returns:
        A tuple ``(triplet_df, unique_tracks_df, genre_df)`` of polars
        DataFrames. ``genre_df`` is returned without its ``minority_genre``
        column.

    NOTE(review): the two polars reads use the default header handling and
    then rename the columns; a file without a header row would lose its first
    record -- confirm against the data files.
    """
    logging.info('Loading data...')
    triplet_columns = ['user_id', 'song_id', 'play_count']
    track_columns = ['track_id', 'song_id', 'artist', 'title']
    genre_column_names = ['track_id', 'majority_genre', 'minority_genre']

    triplet_df = pl.read_csv(triplet_path, separator='\t', new_columns=triplet_columns, use_pyarrow=True)
    unique_tracks_df = pl.read_csv(unique_tracks_path, new_columns=track_columns, use_pyarrow=True)
    # The genre file is parsed with pandas (comment='#', explicit names) and then converted.
    genre_df = pl.from_pandas(pd.read_csv(genre_path, sep='\t', comment='#', names=genre_column_names))

    logging.info('Data loaded successfully.')
    # Positional form: newer polars releases no longer accept drop(columns=[...]).
    return triplet_df, unique_tracks_df, genre_df.drop('minority_genre')

In [None]:
# Calls the three-argument load_data (the later of the two definitions in this notebook).
triplet_df, unique_tracks_df, genre_df = load_data(triplet_path, unique_tracks_path, genre_path)
# songs = pd.read_csv(f"{global_path}/songs.csv")

In [None]:
# Inspect the (user_id, song_id, play_count) triplets.
triplet_df

In [None]:
# Total play_count per song_id, summed over all users.
song_play_counts = triplet_df.group_by('song_id').agg(pl.sum('play_count').alias('play_count'))
song_play_counts

In [None]:
# Inspect the (track_id, song_id, artist, title) metadata.
unique_tracks_df

In [None]:
# Inspect the genre annotations (minority_genre was dropped by load_data).
genre_df

In [None]:
# Add track metadata to the per-song totals. No `how` is passed, so polars'
# default join type applies (inner): songs without metadata are dropped.
o = song_play_counts.join(unique_tracks_df, on='song_id')
o

In [None]:
# Add majority_genre by track_id, again with the default join type.
# NOTE(review): this rebinds `o`, so re-running the cell joins the genres a second time.
o = o.join(genre_df, on='track_id')
o

In [None]:

# Requires triplet_df and unique_tracks_df (polars DataFrames) from the cells above.
# Left join: every triplet is kept, with track metadata attached where song_id matches.
songs_df = triplet_df.join(unique_tracks_df, on="song_id", how="left")
songs_df

In [None]:
# Left join on track_id: attach majority_genre to each row of songs_df.
full_songs_df = songs_df.join(genre_df, on="track_id", how="left")
full_songs_df

In [None]:
# Persist the merged triplet/track/genre table as CSV under global_path.
full_songs_df.write_csv(f"{global_path}/full_songs.csv")

In [None]:
def read_lyrics_file(file_path, repeat_counts=False):
    """Parse a bag-of-words lyrics file into a (track_id, lyrics) DataFrame.

    File format, as handled by this parser:
      * lines starting with '#' and blank lines are skipped;
      * a line starting with '%' holds the comma-separated vocabulary;
      * every other line is ``track_id,<ignored field>,idx:count,idx:count,...``
        where ``idx`` is a 1-based index into the vocabulary.

    Args:
        file_path: Path of the UTF-8 text file to read.
        repeat_counts: When False (default) each listed word appears once in
            the lyrics text, whatever its count. When True each word is
            repeated ``count`` times.

    Returns:
        A polars DataFrame with the string columns ``track_id`` and ``lyrics``
        (the words joined by single spaces, in file order).

    Raises:
        ValueError: If a data line appears before the '%' vocabulary line, or
            if an ``idx:count`` entry is malformed.
    """
    lyrics_dataset = []
    word_list = None  # vocabulary; set when the '%' line is read

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            if line.startswith("#") or not line.strip():
                continue

            if line.startswith('%'):
                word_list = line[1:].strip().split(',')
                continue

            if word_list is None:
                # Without this guard the lookup below fails with an opaque UnboundLocalError.
                raise ValueError(
                    f"{file_path}: data on line {line_number} appears before the '%' vocabulary line"
                )

            parts = line.split(",")
            track_id = parts[0]

            lyrics = []
            # parts[1] is skipped; the remaining fields are idx:count pairs.
            for wc in parts[2:]:
                idx, count = map(int, wc.split(":"))
                word = word_list[idx - 1]  # Word index is 1-based
                lyrics.extend([word] * count if repeat_counts else [word])

            lyrics_dataset.append((track_id, ' '.join(lyrics)))

    # Each tuple is one row; say so explicitly instead of relying on inference.
    return pl.DataFrame(lyrics_dataset, schema=['track_id', 'lyrics'], orient='row')

# Usage example: the lyrics file lives next to the other inputs under global_path.
file_path = f"{global_path}/mxm_dataset_train.txt"
lyrics_df = read_lyrics_file(file_path)

In [None]:
# Inspect the parsed (track_id, lyrics) table.
lyrics_df

In [None]:
# Reload the merged table from the CSV written earlier, replacing the in-memory frame.
full_songs_df = pl.read_csv(f"{global_path}/full_songs.csv")

In [None]:
# Left join on track_id: rows without parsed lyrics are kept with a null lyrics value.
# NOTE(review): this rebinds full_songs_df, so re-running the cell joins the lyrics again.
full_songs_df = full_songs_df.join(lyrics_df, on="track_id", how="left")
full_songs_df

In [None]:
# Add lyrics to the per-song table; the default join type keeps only tracks that have lyrics.
o = o.join(lyrics_df, on='track_id')

In [None]:
# Persist the per-song table (metadata, genre, lyrics) for the LLM recommender.
o.write_csv(f"{global_path}/llm_RecSys_dataset.csv")

In [None]:
# Reload the dataset with pandas: from here on `o` is a pandas DataFrame, not a polars one.
o = pd.read_csv(f"{global_path}/llm_RecSys_dataset.csv")
o

In [None]:
# Combine song ID, artist, title, lyrics and genre into one text field per song.
def _describe_song(row):
    """Render one row of `o` as the multi-line text that is later embedded."""
    return f"Song ID: {row['song_id']}\n Artist : {row['artist']}\n Title : {row['title']}\n Lyrics: {row['lyrics']}.\n Genres: {row['majority_genre']}"


o['combined_info'] = o.apply(_describe_song, axis=1)
print(o['combined_info'][0])

In [None]:
# Write only the combined text column; this is the file the CSVLoader reads later.
o[['combined_info']].to_csv(f"{global_path}/llm_RecSys_dataset_updated.csv", index=False)

In [None]:
# Read the file back from where it was written (under global_path, not the working directory).
pd.read_csv(f"{global_path}/llm_RecSys_dataset_updated.csv")

## Build the LLM recommender.

In [None]:
import os
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import CSVLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import OpenAI
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

In [None]:
# Read the OpenAI key from the environment; .get() yields None when the variable is unset.
api_key = os.environ.get('OPENAI_API_KEY')

In [None]:
# Load the combined-info CSV as documents.
loader = CSVLoader(file_path=f"{global_path}/llm_RecSys_dataset_updated.csv")
data = loader.load()

# Split the documents into chunks of at most 1000 characters, without overlap.
# NOTE(review): no separator is passed, so the splitter's default applies; text that
# lacks that separator may stay in one chunk longer than chunk_size -- confirm.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [None]:
# Embedding model used for both the documents and the queries.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Vector DB: embed every chunk and index it in Chroma.
# NOTE(review): no persist_directory is given, so the index is presumably rebuilt on every run -- confirm.
docsearch = Chroma.from_documents(texts, embeddings)

In [None]:
# Quick retrieval check: fetch the single closest chunk for a sample query.
# `docs` is not referenced again in the visible cells.
query = "I'm looking for a song similar by rapper like Eminem, 50 Cent and Snopp Dog. What could you suggest to me?"
docs = docsearch.similarity_search(query, k=1)

In [None]:
# Baseline retrieval QA chain with the default prompt: retrieved chunks are
# "stuffed" into a single prompt and the source documents are returned with the answer.
llm = ChatOpenAI(model="gpt-4o", openai_api_key=api_key)
qa = RetrievalQA.from_chain_type(llm,
                                 chain_type="stuff", 
                                 retriever=docsearch.as_retriever(), 
                                 return_source_documents=True)

In [None]:
# Run the sample query through the baseline chain and show the raw answer string.
result = qa.invoke({"query": query})
result['result']

In [None]:
# Print the same answer so that embedded newlines are rendered.
print(result['result'])

In [None]:
from langchain.prompts import PromptTemplate

# System instructions; {context} is filled with the retrieved documents by the chain.
template_prefix = """You are a music recommender system that helps users find songs that match their preferences.
Use the following pieces of context to answer the question at the end.
For each question, suggest three songs, with a short description of the song's genre, mood, and the reason why the user might like it.
For each question, take into account the context and the personal information provided by the user.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}"""

# User profile section; {age} and {gender} are filled in below, before the prompt is assembled.
user_info = """This is what we know about the user, and you can use this information to better tune your research:
Age: {age}
Gender: {gender}"""

# {question} is filled with the user's query by the chain.
template_suffix= """Question: {question}
Your response:"""

# Hard-coded example profile; after this line only {context} and {question} remain as placeholders.
user_info = user_info.format(age = 18, gender = 'female')

COMBINED_PROMPT = template_prefix +'\n'+ user_info +'\n'+ template_suffix
print(COMBINED_PROMPT)

In [None]:
# Prompt made of the instructions, the pre-filled user profile and the question.
PROMPT = PromptTemplate(
    template=COMBINED_PROMPT, input_variables=["context", "question"])

# Rebuild the QA chain so that it uses the custom prompt instead of the default one.
chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)
query = "I'm looking for a song similar by rapper like Eminem, 50 Cent. What could you suggest to me?"
# Use .invoke() like the other cells; calling the chain object directly is the deprecated entry point.
result = qa.invoke({'query':query})
print(result['result'])

In [None]:
# Documents the retriever supplied as context for the last answer.
result.get('source_documents')

In [None]:
from langchain.prompts import PromptTemplate

# Same instructions as the profile-aware prompt, but without a user profile section.
template = """You are a music recommender system that helps users find songs that match their preferences.
Use the following pieces of context to answer the question at the end.
For each question, suggest three songs, with a short description of the song's genre, mood, and the reason why the user might like it.
For each question, take into account the context and the personal information provided by the user.
If you don't know the answer, just say that you don't know, don't try to make up an answer.


{context}

Question: {question}
Your response:"""


PROMPT = PromptTemplate(
    template=template, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}

# Rebinds `llm` to gpt-3.5-turbo with temperature 0; later cells that use `llm` get this model.
llm=ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0, openai_api_key=api_key) 

qa = RetrievalQA.from_chain_type(llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

query = "I'm looking for a song similar to pink floyd style. What could you suggest to me?"
result = qa.invoke({'query':query})
print(result['result'])

In [None]:
from langchain.prompts import PromptTemplate

# This cell repeats the earlier profile-aware prompt-building cell verbatim.
# It re-creates COMBINED_PROMPT so that the next cell can build the chain from it.
template_prefix = """You are a music recommender system that helps users find songs that match their preferences.
Use the following pieces of context to answer the question at the end.
For each question, suggest three songs, with a short description of the song's genre, mood, and the reason why the user might like it.
For each question, take into account the context and the personal information provided by the user.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}"""

# User profile section; {age} and {gender} are filled in below.
user_info = """This is what we know about the user, and you can use this information to better tune your research:
Age: {age}
Gender: {gender}"""

template_suffix= """Question: {question}
Your response:"""

# Hard-coded example profile; only {context} and {question} remain as placeholders afterwards.
user_info = user_info.format(age = 18, gender = 'female')

COMBINED_PROMPT = template_prefix +'\n'+ user_info +'\n'+ template_suffix
print(COMBINED_PROMPT)

In [None]:
# Wrap the combined text as a prompt whose inputs are the retrieved context and the question.
PROMPT = PromptTemplate(template=COMBINED_PROMPT, input_variables=["context", "question"])

# Rebuild the QA chain around the profile-aware prompt, using whichever `llm` is currently bound.
chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs
)
# Display the prompt object for inspection.
PROMPT

In [None]:
# Ask the profile-aware chain for rap recommendations, then print the answer
# followed by the documents that were retrieved as context.
query = "I'm looking for rap songs, artists like eminem and 50cent. What could you suggest to me?"
result = qa.invoke({'query':query})
print(result['result'])
print(result['source_documents'])