In [8]:
import os
import numpy as np
import pandas as pd
import polars as pl
from sklearn.model_selection import train_test_split
import logging
from colorama import Fore, Style, init
import warnings
import argparse

# filter warnings
warnings.filterwarnings('ignore')

# Set up basic configuration for logging
logging.basicConfig(level=logging.INFO, format=f'{Fore.GREEN}%(asctime)s - %(levelname)s - %(message)s{Style.RESET_ALL}')

# Initialize colorama
init(autoreset=True)

global_path = '../data'
triplet_path = f"{global_path}/train_triplets.txt"
unique_tracks_path = f"{global_path}/p02_unique_tracks.txt"
genre_path = f"{global_path}/p02_msd_tagtraum_cd2.cls"

In [11]:
def load_data(triplet_path, unique_tracks_path):
    logging.info('Loading data...')


    triplet_columns = ['user_id', 'song_id', 'play_count']
    track_columns = ['track_id', 'song_id', 'artist', 'title']

    triplet_df = pl.read_csv(triplet_path, separator='\t', new_columns=triplet_columns, use_pyarrow=True)
    unique_tracks_df = pl.read_csv(unique_tracks_path, new_columns=track_columns, use_pyarrow=True)

    logging.info('Data loaded successfully.')

    logging.info('Merging songs...')

    triplet_df = triplet_df.filter(pl.col('play_count') > 1)
    songs = pd.merge(triplet_df.to_pandas(), unique_tracks_df.to_pandas(), on='song_id', how='left')
    songs['song'] = songs['title']+' - ' + songs['artist']
    songs = songs[['user_id', 'song_id', 'track_id', 'song', 'play_count']]

    songs['user_idx'] = pd.factorize(songs['user_id'])[0]
    songs['song_idx'] = pd.factorize(songs['song_id'])[0]

    logging.info('Songs merged successfully.')

    del triplet_df, unique_tracks_df

    # save the data
    # songs.to_csv('data/songs.csv', index=False)

    return songs

In [None]:
songs = load_data(triplet_path, unique_tracks_path)
X = songs[['user_idx', 'song_idx', 'song', 'play_count']]
X

[32m2024-05-29 13:32:54,990 - INFO - Loading data...[0m
[32m2024-05-29 13:32:58,590 - INFO - Data loaded successfully.[0m
[32m2024-05-29 13:32:58,598 - INFO - Merging songs...[0m
[32m2024-05-29 13:33:16,090 - INFO - Songs merged successfully.[0m


Unnamed: 0,user_idx,song_idx,song,play_count
0,0,0,Entre Dos Aguas - Paco De Lucia,2
1,0,1,12 segundos de oscuridad - Jorge Drexler,2
2,0,2,Apuesta Por El Rock 'N' Roll - Héroes del Sile...,5
3,0,3,I'll Be Missing You (Featuring Faith Evans & 1...,5
4,0,4,I?'m A Steady Rollin? Man - Robert Johnson,5
...,...,...,...,...
20151715,981153,1475,Représente - Alliance Ethnik,4
20151716,981153,2492,Addams Groove - MC Hammer,6
20151717,981153,2257,Go To Sleep - Eminem / DMX / Obie Trice,2
20151718,981153,55499,We're Back - Eminem / Obie Trice / Stat Quo / ...,2


In [None]:
unique_songs_df = X[['song_idx', 'song']].drop_duplicates(subset='song_idx')
unique_songs_df

Unnamed: 0,song_idx,song
0,0,Entre Dos Aguas - Paco De Lucia
1,1,12 segundos de oscuridad - Jorge Drexler
2,2,Apuesta Por El Rock 'N' Roll - Héroes del Sile...
3,3,I'll Be Missing You (Featuring Faith Evans & 1...
4,4,I?'m A Steady Rollin? Man - Robert Johnson
...,...,...
20149147,333702,Whew - Simon & Garfunkel
20149475,333703,Then There's Me (LP Version) - Tim Rushlow
20149778,333704,Just For Me - Brio From Rio
20150189,333705,Rowena - Loudon Wainwright III


In [None]:
# save two csv files, ratings.csv and songs.csv
unique_songs_df.to_csv(f"{global_path}/songs.csv", index=False)
songs[['user_idx', 'song_idx', 'play_count']].to_csv(f"{global_path}/ratings.csv", index=False)

In [None]:
num_ratings = X.groupby('song_idx')['play_count'].count()
mean_rating = X.groupby('song_idx')['play_count'].mean()
sum_ratings = X.groupby('song_idx')['play_count'].sum()

In [None]:
unique_songs_df['num_ratings'] = unique_songs_df['song_idx'].map(num_ratings)
unique_songs_df['mean_rating'] = unique_songs_df['song_idx'].map(mean_rating)

In [None]:
damping_factor = 10
global_mean_rating = X['play_count'].mean()

global_mean_rating, damping_factor

(5.588893900868015, 10)

In [None]:
damped_numerator = sum_ratings + damping_factor * global_mean_rating
damped_denominator = num_ratings + damping_factor
damped_mean_rating = damped_numerator / damped_denominator
unique_songs_df['damped_mean_rating'] = unique_songs_df['song_idx'].map(damped_mean_rating)

In [None]:
unique_songs_df.sort_values(by='num_ratings', ascending=False).head(10)

Unnamed: 0,song_idx,song,num_ratings,mean_rating,damped_mean_rating
79,76,Sehr kosmisch - Harmonia,65688,5.795153,5.795121
72,69,Undo - Björk,57660,10.673309,10.672427
74,71,You're The One - Dwight Yoakam,55035,12.681385,12.680096
73,70,Dog Days Are Over (Radio Edit) - Florence + Th...,52773,6.042143,6.042057
90,87,Use Somebody - Kings Of Leon,50044,4.742507,4.742676
87,84,Revelry - Kings Of Leon,48290,10.261483,10.260515
84,81,Secrets - OneRepublic,44874,5.775349,5.775307
77,74,Horn Concerto No. 4 in E flat K495: II. Romanc...,40470,8.916803,8.91598
101,97,Yellow - Coldplay,36272,4.905823,4.906011
55,55,Somebody To Love - Justin Bieber,35492,5.170855,5.170973


In [None]:
unique_songs_df.sort_values(by='mean_rating', ascending=False).head(10)

Unnamed: 0,song_idx,song,num_ratings,mean_rating,damped_mean_rating
11508469,301284,Without Bill the Jedi Changed - Rolfe Kent,2,340.0,61.324078
17403049,325634,Thy Mercy - Caedmon's Call,1,339.0,35.898994
8291836,279839,T.K.O - James Taylor Quartet,1,300.0,32.35354
6852866,266800,Who Thought The Railroad Wouldn't Last - Jim L...,3,263.666667,65.145303
5499507,251661,Any Place I Hang My Hat Is Home - Audra McDonald,2,240.0,44.657412
1674231,170169,Craft (Dimensional Release) - Michael Stearns,1,233.0,26.262631
573632,107910,Je te laisse un sifflet - Garou,1,228.0,25.808085
19467714,331818,Get Up! - Helix,1,224.0,25.444449
7344834,271741,The Buzzard - Rye Coalition,1,191.0,22.444449
11158,8198,Words - Jack the Ripper,11,183.454545,98.756616


In [None]:
unique_songs_df.sort_values(by='damped_mean_rating', ascending=False).head(10)

Unnamed: 0,song_idx,song,num_ratings,mean_rating,damped_mean_rating
11158,8198,Words - Jack the Ripper,11,183.454545,98.756616
6852866,266800,Who Thought The Railroad Wouldn't Last - Jim L...,3,263.666667,65.145303
66143,31926,My Prayer - Ray_ Goodman & Brown,6,159.333333,63.243059
11508469,301284,Without Bill the Jedi Changed - Rolfe Kent,2,340.0,61.324078
764195,123346,Eyen [Chosen by fans on Warp20.net] - Plaid,8,129.625,60.716052
975102,136819,Silhouettes - The Ronettes,27,70.481481,52.942944
176841,58615,Lied To - Against All Authority,6,131.0,52.618059
275429,75384,Looking For - Ann Lee,16,78.375,50.380344
218395,66279,Thais II - This Mortal Coil,18,72.277778,48.460319
2537962,197573,Hurry Xmas - L'Arc~en~Ciel,11,80.363636,44.756616


## Using LLMs to recommend songs

## Load the data & data preparation

In [16]:
def load_data(triplet_path, unique_tracks_path, genre_path):
    logging.info('Loading data...')
    triplet_columns = ['user_id', 'song_id', 'play_count']
    track_columns = ['track_id', 'song_id', 'artist', 'title']
    genre_column_names = ['track_id', 'majority_genre', 'minority_genre']

    triplet_df = pl.read_csv(triplet_path, separator='\t', new_columns=triplet_columns, use_pyarrow=True)
    unique_tracks_df = pl.from_pandas(pd.read_csv(unique_tracks_path, names=track_columns, sep="<SEP>", engine='python'))
    genre_df = pl.from_pandas(pd.read_csv(genre_path, sep='\t', comment='#', names=genre_column_names))

    logging.info('Data loaded successfully.')
    return triplet_df, unique_tracks_df, genre_df.drop(columns=['minority_genre'])

In [17]:
triplet_df, unique_tracks_df, genre_df = load_data(triplet_path, unique_tracks_path, genre_path)
# songs = pd.read_csv(f"{global_path}/songs.csv")

[32m2024-05-29 16:34:04,316 - INFO - Loading data...[0m
[32m2024-05-29 16:34:13,884 - INFO - Data loaded successfully.[0m


In [18]:
triplet_df

user_id,song_id,play_count
str,str,i64
"""b80344d063b5cc…","""SOAPDEY12A81C2…",1
"""b80344d063b5cc…","""SOBBMDR12A8C13…",2
"""b80344d063b5cc…","""SOBFNSP12AF72A…",1
"""b80344d063b5cc…","""SOBFOVM12A58A7…",1
"""b80344d063b5cc…","""SOBNZDC12A6D4F…",1
…,…,…
"""b7815dbb206eb2…","""SOUHHHH12AF729…",2
"""b7815dbb206eb2…","""SOUJVIT12A8C14…",1
"""b7815dbb206eb2…","""SOUSMXX12AB018…",1
"""b7815dbb206eb2…","""SOWYSKH12AF72A…",3


In [19]:
song_play_counts = triplet_df.group_by('song_id').agg(pl.sum('play_count').alias('play_count'))
song_play_counts

song_id,play_count
str,i64
"""SOCIOHC12A8C13…",3
"""SOPWNOZ12AB018…",292
"""SOKGCNP12A8151…",259
"""SOMKOJV12A6310…",533
"""SODDNJZ12A6701…",6
…,…
"""SOHEIRO12A58A7…",100
"""SORBPPK12AC468…",5
"""SOUMMRN12AB018…",254
"""SOUATQL12AB018…",7


In [20]:
unique_tracks_df

track_id,song_id,artist,title
str,str,str,str
"""TRMMMYQ128F932…","""SOQMMHC12AB018…","""Faster Pussy c…","""Silent Night"""
"""TRMMMKD128F425…","""SOVFVAK12A8C13…","""Karkkiautomaat…","""Tanssi vaan"""
"""TRMMMRX128F931…","""SOGTUKN12AB017…","""Hudson Mohawke…","""No One Could E…"
"""TRMMMCH128F425…","""SOBNYVR12A8C13…","""Yerba Brava""","""Si Vos Querés"""
"""TRMMMWA128F426…","""SOHSBXH12A8C13…","""Der Mystic""","""Tangle Of Aspe…"
…,…,…,…
"""TRYYYUS12903CD…","""SOTXAME12AB018…","""Kiko Navarro""","""O Samba Da Vid…"
"""TRYYYJO128F426…","""SOXQYIQ12A8C13…","""Kuldeep Manak""","""Jago Chhadeo"""
"""TRYYYMG128F426…","""SOHODZI12A8C13…","""Gabriel Le Mar…","""Novemba"""
"""TRYYYDJ128F931…","""SOLXGOR12A81C2…","""Elude""","""Faraday"""


In [21]:
genre_df

track_id,majority_genre
str,str
"""TRAAAAK128F931…","""Rock"""
"""TRAAAAW128F429…","""Rap"""
"""TRAAABD128F429…","""Rock"""
"""TRAAADJ128F428…","""Rock"""
"""TRAAADZ128F934…","""Latin"""
…,…
"""TRZZZRJ128F428…","""Rock"""
"""TRZZZUK128F92E…","""Folk"""
"""TRZZZYV128F92E…","""New Age"""
"""TRZZZZD128F423…","""Rock"""


In [22]:
o = song_play_counts.join(unique_tracks_df, on='song_id')
o

song_id,play_count,track_id,artist,title
str,i64,str,str,str
"""SOQMMHC12AB018…",8,"""TRMMMYQ128F932…","""Faster Pussy c…","""Silent Night"""
"""SOBNYVR12A8C13…",3,"""TRMMMCH128F425…","""Yerba Brava""","""Si Vos Querés"""
"""SOYGNWH12AB018…",15,"""TRMMMNS128F935…","""3 Gars Su'l So…","""L'antarctique"""
"""SOGPCJI12A8C13…",12,"""TRMMMXI128F428…","""Waldemar Basto…","""N Gana"""
"""SOSDCFG12AB018…",2,"""TRMMMKI128F931…","""Lena Philipsso…","""006"""
…,…,…,…,…
"""SOIDAMG12AB018…",28,"""TRYYYHG128F934…","""Jazz Addixx""","""Chill"""
"""SOFAXZU12AB018…",45,"""TRYYYYS12903D0…","""Taylor Hawkins…","""Way Down"""
"""SOBODSE12A8C13…",67,"""TRYYYZM128F428…","""SKYCLAD""","""Inequality Str…"
"""SOWCNSN12AB018…",3,"""TRYYYON128F932…","""Loose Shus""","""Taurus (Keenho…"


In [23]:
o = o.join(genre_df, on='track_id')
o

song_id,play_count,track_id,artist,title,majority_genre
str,i64,str,str,str,str
"""SOGPCJI12A8C13…",12,"""TRMMMXI128F428…","""Waldemar Basto…","""N Gana""","""World"""
"""SOSDCFG12AB018…",2,"""TRMMMKI128F931…","""Lena Philipsso…","""006""","""Pop"""
"""SOKOVRQ12A8C14…",86,"""TRMMMQY128F92F…","""Dying Fetus""","""Ethos of Coerc…","""Rock"""
"""SOIMMJJ12AF72A…",332,"""TRMMMTK128F424…","""Emery""","""Rock-N-Rule""","""Rock"""
"""SOGNNYL12A6D4F…",269,"""TRMMMQN128F423…","""Prince & The R…","""Raspberry Bere…","""RnB"""
…,…,…,…,…,…
"""SOYKHSL12AF72A…",1657,"""TRYYYWX128F92E…","""The Game""","""Wouldn't Get F…","""Rap"""
"""SOIDAMG12AB018…",28,"""TRYYYHG128F934…","""Jazz Addixx""","""Chill""","""Rap"""
"""SOFAXZU12AB018…",45,"""TRYYYYS12903D0…","""Taylor Hawkins…","""Way Down""","""Rock"""
"""SOBODSE12A8C13…",67,"""TRYYYZM128F428…","""SKYCLAD""","""Inequality Str…","""Rock"""


In [24]:

# Assuming triplet_df, unique_tracks_df, and genre_df are already defined Polars DataFrames
# Merge triplet_df with unique_tracks_df
songs_df = triplet_df.join(
    unique_tracks_df,
    on="song_id",
    how="left"
)
songs_df

user_id,song_id,play_count,track_id,artist,title
str,str,i64,str,str,str
"""b80344d063b5cc…","""SOAPDEY12A81C2…",1,"""TRIRLYL128F425…","""Billy Preston""","""Nothing from N…"
"""b80344d063b5cc…","""SOBBMDR12A8C13…",2,"""TRMHBXZ128F423…","""Paco De Lucia""","""Entre Dos Agua…"
"""b80344d063b5cc…","""SOBFNSP12AF72A…",1,"""TRYQMNI128F147…","""Josh Rouse""","""Under Cold Blu…"
"""b80344d063b5cc…","""SOBFOVM12A58A7…",1,"""TRAHZNE128F934…","""The Dead 60s""","""Riot Radio (So…"
"""b80344d063b5cc…","""SOBNZDC12A6D4F…",1,"""TRJPXGD128F92F…","""Amset""","""Sin límites (I…"
…,…,…,…,…,…
"""b7815dbb206eb2…","""SOUHHHH12AF729…",2,"""TRKUAEO128F933…","""Eminem / Obie …","""We're Back"""
"""b7815dbb206eb2…","""SOUJVIT12A8C14…",1,"""TRRNFHH128F92D…","""Rise Against""","""Savior"""
"""b7815dbb206eb2…","""SOUSMXX12AB018…",1,"""TRSLDDC12903CC…","""Usher featurin…","""OMG"""
"""b7815dbb206eb2…","""SOWYSKH12AF72A…",3,"""TRNJQAM128F145…","""matchbox twent…","""Downfall (Albu…"


In [25]:
# Merge songs_df with genre_df
full_songs_df = songs_df.join(
    genre_df,
    on="track_id",
    how="left"
)
full_songs_df

user_id,song_id,play_count,track_id,artist,title,majority_genre
str,str,i64,str,str,str,str
"""b80344d063b5cc…","""SOAPDEY12A81C2…",1,"""TRIRLYL128F425…","""Billy Preston""","""Nothing from N…","""RnB"""
"""b80344d063b5cc…","""SOBBMDR12A8C13…",2,"""TRMHBXZ128F423…","""Paco De Lucia""","""Entre Dos Agua…","""Electronic"""
"""b80344d063b5cc…","""SOBFNSP12AF72A…",1,"""TRYQMNI128F147…","""Josh Rouse""","""Under Cold Blu…","""Rock"""
"""b80344d063b5cc…","""SOBFOVM12A58A7…",1,"""TRAHZNE128F934…","""The Dead 60s""","""Riot Radio (So…",
"""b80344d063b5cc…","""SOBNZDC12A6D4F…",1,"""TRJPXGD128F92F…","""Amset""","""Sin límites (I…",
…,…,…,…,…,…,…
"""b7815dbb206eb2…","""SOUHHHH12AF729…",2,"""TRKUAEO128F933…","""Eminem / Obie …","""We're Back""",
"""b7815dbb206eb2…","""SOUJVIT12A8C14…",1,"""TRRNFHH128F92D…","""Rise Against""","""Savior""","""Rock"""
"""b7815dbb206eb2…","""SOUSMXX12AB018…",1,"""TRSLDDC12903CC…","""Usher featurin…","""OMG""",
"""b7815dbb206eb2…","""SOWYSKH12AF72A…",3,"""TRNJQAM128F145…","""matchbox twent…","""Downfall (Albu…","""Rock"""


In [26]:
full_songs_df.write_csv(f"{global_path}/full_songs.csv")

In [27]:
def read_lyrics_file(file_path):
    lyrics_dataset = []
    with open(file_path, 'r', encoding='utf-8') as file:

        for line in file:
            if line.startswith("#") or not line.strip():
                continue
            elif line.startswith('%'):
                    word_list = line[1:].strip().split(',')
            else:
                parts = line.split(",")
                track_id = parts[0]
                word_counts = parts[2:]

                lyrics = []
                for wc in word_counts:
                    idx, count = map(int, wc.split(":"))
                    lyrics.extend([word_list[idx - 1]])  # Word index is 1-based

                lyrics_text = ' '.join(lyrics)
                lyrics_dataset.append((track_id, lyrics_text))

    return pl.DataFrame(lyrics_dataset, schema=['track_id', 'lyrics'])

# Usage example:
file_path = '../data/mxm_dataset_train.txt'
lyrics_df = read_lyrics_file(file_path)

In [28]:
lyrics_df

track_id,lyrics
str,str
"""TRAAAAV128F421…","""i the you to a…"
"""TRAAABD128F429…","""i you to and a…"
"""TRAAAED128E078…","""i the you to a…"
"""TRAAAEF128F427…","""i the you to a…"
"""TRAAAEW128F429…","""i to and a me …"
…,…
"""TRZZZWS128F429…","""a no que de y …"
"""TRZZZXA128F428…","""i the you to a…"
"""TRZZZXV128F428…","""i the you to a…"
"""TRZZZYV128F92E…","""i the you and …"


In [29]:
full_songs_df = pl.read_csv(f"{global_path}/full_songs.csv")

In [30]:
full_songs_df = full_songs_df.join(
    lyrics_df,
    on="track_id",
    how="left"
)

full_songs_df

user_id,song_id,play_count,track_id,artist,title,majority_genre,lyrics
str,str,i64,str,str,str,str,str
"""b80344d063b5cc…","""SOAPDEY12A81C2…",1,"""TRIRLYL128F425…","""Billy Preston""","""Nothing from N…","""RnB""",
"""b80344d063b5cc…","""SOBBMDR12A8C13…",2,"""TRMHBXZ128F423…","""Paco De Lucia""","""Entre Dos Agua…","""Electronic""",
"""b80344d063b5cc…","""SOBFNSP12AF72A…",1,"""TRYQMNI128F147…","""Josh Rouse""","""Under Cold Blu…","""Rock""","""the you to and…"
"""b80344d063b5cc…","""SOBFOVM12A58A7…",1,"""TRAHZNE128F934…","""The Dead 60s""","""Riot Radio (So…",,
"""b80344d063b5cc…","""SOBNZDC12A6D4F…",1,"""TRJPXGD128F92F…","""Amset""","""Sin límites (I…",,
…,…,…,…,…,…,…,…
"""b7815dbb206eb2…","""SOUHHHH12AF729…",2,"""TRKUAEO128F933…","""Eminem / Obie …","""We're Back""",,
"""b7815dbb206eb2…","""SOUJVIT12A8C14…",1,"""TRRNFHH128F92D…","""Rise Against""","""Savior""","""Rock""","""i the you to a…"
"""b7815dbb206eb2…","""SOUSMXX12AB018…",1,"""TRSLDDC12903CC…","""Usher featurin…","""OMG""",,
"""b7815dbb206eb2…","""SOWYSKH12AF72A…",3,"""TRNJQAM128F145…","""matchbox twent…","""Downfall (Albu…","""Rock""",


In [31]:
o = o.join(lyrics_df, on='track_id')

In [32]:
o.write_csv(f"{global_path}/llm_RecSys_dataset.csv")

In [33]:
o = pd.read_csv(f"{global_path}/llm_RecSys_dataset.csv")
o

Unnamed: 0,song_id,play_count,track_id,artist,title,majority_genre,lyrics
0,SOCIWDW12A8C13D406,72,TRAAABD128F429CF47,The Box Tops,Soul Deep,Rock,i you to and a me it not in my is your that do...
1,SOXZYWX12A6310ED0C,315,TRAAAED128E0783FAB,Jamie Cullum,It's About Time,Jazz,i the you to and a me it not in my is of that ...
2,SOFSOCN12A8C143F5D,70,TRAAAFD128F92F423A,Gob,Face the Ashes,Punk,i the to and a me it not in my is of do on are...
3,SOAPERH12A58A787DC,114,TRAAAHZ128E0799171,Snoop Dogg,The One And Only (Edited),Rap,i the you to and a me it not in my is of your ...
4,SOSRVUJ12AB018731E,410,TRAABIG128F9356C56,Poe,Walk the Walk,Rock,i the you to and a it not in my is of your tha...
...,...,...,...,...,...,...,...
76010,SOYKMGN12A8C1396F6,32,TRZZZCL128F428BB80,I Am Ghost,The Ship of Pills and Needed Things,Rock,i the you to and a it in my is of your on are ...
76011,SOCSNCA12AB0181E38,281,TRZZZHL128F9329CFB,Ayreon,Day five: Voices,Rock,i the you to and a me it not in is of your tha...
76012,SOULKJA12A8C140620,1212,TRZZZRJ128F42819AF,Belle & Sebastian,Lord Anthony,Rock,the you to and a it not in is of your do on ar...
76013,SOSULQJ12A8C144B79,20,TRZZZUK128F92E3C60,Judee Sill,Lady-O,Folk,i the you to and a it not in my is of your tha...


In [34]:
#Combine title, synopsis, and Genre
# o = o.to_pandas()
o['combined_info'] = o.apply(lambda row: f"Song ID: {row['song_id']}\n Artist : {row['artist']}\n Title : {row['title']}\n Lyrics: {row['lyrics']}.\n Genres: {row['majority_genre']}", axis=1)
print(o['combined_info'][0])

Song ID: SOCIWDW12A8C13D406
 Artist : The Box Tops
 Title : Soul Deep
 Lyrics: i you to and a me it not in my is your that do are for no have so know but what when time can there la get got never feel want would day away yeah heart could need our look where thing think live onli tri keep dream wanna find hear believ everyth someth insid chang soul new start pleas true while deep beat after fade wast trust alreadi style asleep wors goal.
 Genres: Rock


In [35]:
o[['combined_info']].to_csv(f"{global_path}/llm_RecSys_dataset_updated.csv", index=False)

In [37]:
pd.read_csv(f"{global_path}/llm_RecSys_dataset_updated.csv")

Unnamed: 0,combined_info
0,Song ID: SOCIWDW12A8C13D406\n Artist : The Box...
1,Song ID: SOXZYWX12A6310ED0C\n Artist : Jamie C...
2,Song ID: SOFSOCN12A8C143F5D\n Artist : Gob\n T...
3,Song ID: SOAPERH12A58A787DC\n Artist : Snoop D...
4,Song ID: SOSRVUJ12AB018731E\n Artist : Poe\n T...
...,...
76010,Song ID: SOYKMGN12A8C1396F6\n Artist : I Am Gh...
76011,Song ID: SOCSNCA12AB0181E38\n Artist : Ayreon\...
76012,Song ID: SOULKJA12A8C140620\n Artist : Belle &...
76013,Song ID: SOSULQJ12A8C144B79\n Artist : Judee S...


## Build the LLM recommender.

In [1]:
import os
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import CSVLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import OpenAI
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

In [6]:
api_key = os.getenv('OPENAI_API_KEY')

In [9]:
loader = CSVLoader(file_path=f"{global_path}/llm_RecSys_dataset_updated.csv")
data = loader.load()

#data transformers
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [10]:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

#Vector DB
docsearch = Chroma.from_documents(texts, embeddings)

[32m2024-05-29 16:38:04,149 - INFO - Use pytorch device_name: mps[0m
[32m2024-05-29 16:38:04,149 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2[0m
[32m2024-05-29 16:38:08,566 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.[0m


In [11]:
query = "I'm looking for a song similar by rapper like Eminem, 50 Cent and Snopp Dog. What could you suggest to me?"
docs = docsearch.similarity_search(query, k=1)

In [12]:
llm = ChatOpenAI(model="gpt-4o", openai_api_key=api_key)
qa = RetrievalQA.from_chain_type(llm,
                                 chain_type="stuff", 
                                 retriever=docsearch.as_retriever(), 
                                 return_source_documents=True)

In [13]:
result = qa.invoke({"query": query})
result['result']

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[32m2024-05-29 16:43:41,830 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m


'The songs listed in the context provided are from rock genres and not from the rap or hip-hop genres. However, if you\'re looking for songs by rappers similar to Eminem, 50 Cent, and Snoop Dogg, you might enjoy tracks by artists like Dr. Dre, Jay-Z, Nas, or The Game. Here are a few suggestions:\n\n1. Dr. Dre - "Still D.R.E." (featuring Snoop Dogg)\n2. 50 Cent - "In Da Club"\n3. Eminem - "Lose Yourself"\n4. Snoop Dogg - "Gin and Juice"\n5. Jay-Z - "99 Problems"\n6. Nas - "N.Y. State of Mind"\n7. The Game - "Hate It or Love It" (featuring 50 Cent)\n\nThese tracks are iconic in the rap genre and share a similar style to the artists you mentioned.'

In [14]:
print(result['result'])

The songs listed in the context provided are from rock genres and not from the rap or hip-hop genres. However, if you're looking for songs by rappers similar to Eminem, 50 Cent, and Snoop Dogg, you might enjoy tracks by artists like Dr. Dre, Jay-Z, Nas, or The Game. Here are a few suggestions:

1. Dr. Dre - "Still D.R.E." (featuring Snoop Dogg)
2. 50 Cent - "In Da Club"
3. Eminem - "Lose Yourself"
4. Snoop Dogg - "Gin and Juice"
5. Jay-Z - "99 Problems"
6. Nas - "N.Y. State of Mind"
7. The Game - "Hate It or Love It" (featuring 50 Cent)

These tracks are iconic in the rap genre and share a similar style to the artists you mentioned.


In [22]:
from langchain.prompts import PromptTemplate

template_prefix = """You are a music recommender system that helps users find songs that match their preferences.
Use the following pieces of context to answer the question at the end.
For each question, suggest three songs, with a short description of the song's genre, mood, and the reason why the user might like it.
For each question, take into account the context and the personal information provided by the user.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}"""

user_info = """This is what we know about the user, and you can use this information to better tune your research:
Age: {age}
Gender: {gender}"""

template_suffix= """Question: {question}
Your response:"""

user_info = user_info.format(age = 18, gender = 'female')

COMBINED_PROMPT = template_prefix +'\n'+ user_info +'\n'+ template_suffix
print(COMBINED_PROMPT)

You are a music recommender system that helps users find songs that match their preferences.
Use the following pieces of context to answer the question at the end.
For each question, suggest three songs, with a short description of the song's genre, mood, and the reason why the user might like it.
For each question, take into account the context and the personal information provided by the user.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}
This is what we know about the user, and you can use this information to better tune your research:
Age: 18
Gender: female
Question: {question}
Your response:


In [23]:
PROMPT = PromptTemplate(
    template=COMBINED_PROMPT, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)
query = "I'm looking for a song similar by rapper like Eminem, 50 Cent. What could you suggest to me?"
result = qa({'query':query})
print(result['result'])

[32m2024-05-29 16:44:44,598 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m


Based on your preference for rappers like Eminem and 50 Cent, I recommend the following songs:

1. Song: "Lose Yourself" by Eminem
   Genre: Rap
   Mood: Energetic, motivational
   Reason: This song by Eminem is a classic rap anthem with powerful lyrics and a captivating beat, similar to the style of Eminem that you enjoy.

2. Song: "In Da Club" by 50 Cent
   Genre: Rap
   Mood: Upbeat, party vibe
   Reason: "In Da Club" is a popular hit by 50 Cent known for its catchy chorus and infectious energy, perfect for fans of 50 Cent's music.

3. Song: "Stan" by Eminem (feat. Dido)
   Genre: Rap
   Mood: Intense, storytelling
   Reason: "Stan" is a gripping narrative-driven song by Eminem that showcases his storytelling abilities and emotional depth, making it a great choice for fans of Eminem's music.


In [24]:
result.get('source_documents')

[Document(page_content='combined_info: Song ID: SOTLYUI12AF72A15AC\n Artist : Eminem\n Title : Em Calls Paul (skit)\n Lyrics: i the you to and a me it not in is your that do we am will all no be have love so know this but with what when like time oh they get if got he feel want make way take ca at back by how look thing die tell who man or well wo wanna about call alway were end yo these new him stop show should realli hope watch fuck rememb doe black bad enough becaus touch hate till em beat matter white shit hit lot gun sorri anyway mirror somethin messag thrill idea excus jean plastic billi click michael goddamn paul aye fucker video jackson.\n Genres: Rap', metadata={'row': 12053, 'source': '../data/llm_RecSys_dataset_updated.csv'}),
 Document(page_content='combined_info: Song ID: SOVJWSS12A6701FBC3\n Artist : Eminem\n Title : Public Service Announcement\n Lyrics: i the you to and a me it not in my is your on have know this with what just like can go up they if as yeah by tell thin

In [25]:
from langchain.prompts import PromptTemplate

template = """You are a music recommender system that helps users find songs that match their preferences.
Use the following pieces of context to answer the question at the end.
For each question, suggest three songs, with a short description of the song's genre, mood, and the reason why the user might like it.
For each question, take into account the context and the personal information provided by the user.
If you don't know the answer, just say that you don't know, don't try to make up an answer.


{context}

Question: {question}
Your response:"""


PROMPT = PromptTemplate(
    template=template, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}

llm=ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0, openai_api_key=api_key) 

qa = RetrievalQA.from_chain_type(llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

query = "I'm looking for a song similar to pink floyd style. What could you suggest to me?"
result = qa.invoke({'query':query})
print(result['result'])

[32m2024-05-29 16:44:48,167 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m


   Genre: Rock
   Mood: Dark, intense
   Reason: This song has a deep, introspective feel similar to Pink Floyd's style, with haunting lyrics and a brooding atmosphere.

   Genre: Metal
   Mood: Epic, atmospheric
   Reason: This song combines intricate instrumentals with thought-provoking lyrics, reminiscent of Pink Floyd's progressive rock sound.

3. Song: Stork & Owl by TV On The Radio
   Genre: Rock
   Mood: Mysterious, introspective
   Reason: This song has a unique blend of experimental rock elements and introspective lyrics, capturing a similar vibe to Pink Floyd's music.


In [26]:
from langchain.prompts import PromptTemplate

template_prefix = """You are a music recommender system that helps users find songs that match their preferences.
Use the following pieces of context to answer the question at the end.
For each question, suggest three songs, with a short description of the song's genre, mood, and the reason why the user might like it.
For each question, take into account the context and the personal information provided by the user.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}"""

user_info = """This is what we know about the user, and you can use this information to better tune your research:
Age: {age}
Gender: {gender}"""

template_suffix= """Question: {question}
Your response:"""

user_info = user_info.format(age = 18, gender = 'female')

COMBINED_PROMPT = template_prefix +'\n'+ user_info +'\n'+ template_suffix
print(COMBINED_PROMPT)

You are a music recommender system that helps users find songs that match their preferences.
Use the following pieces of context to answer the question at the end.
For each question, suggest three songs, with a short description of the song's genre, mood, and the reason why the user might like it.
For each question, take into account the context and the personal information provided by the user.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}
This is what we know about the user, and you can use this information to better tune your research:
Age: 18
Gender: female
Question: {question}
Your response:


In [27]:
PROMPT = PromptTemplate(template=COMBINED_PROMPT, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs
)
PROMPT

PromptTemplate(input_variables=['context', 'question'], template="You are a music recommender system that helps users find songs that match their preferences.\nUse the following pieces of context to answer the question at the end.\nFor each question, suggest three songs, with a short description of the song's genre, mood, and the reason why the user might like it.\nFor each question, take into account the context and the personal information provided by the user.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\nThis is what we know about the user, and you can use this information to better tune your research:\nAge: 18\nGender: female\nQuestion: {question}\nYour response:")

In [28]:
query = "I'm looking for rap songs, artists like eminem and 50cent. What could you suggest to me?"
result = qa.invoke({'query':query})
print(result['result'])
print(result['source_documents'])

[32m2024-05-29 16:44:52,413 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m


Based on your preference for rap artists like Eminem and 50 Cent, I recommend the following songs for you:

1. Song: "P.I.M.P." by 50 Cent
   Genre: Rap
   Mood: Energetic, confident
   Reason: This song by 50 Cent has a catchy beat and bold lyrics, similar to the style of Eminem and 50 Cent. It's a classic rap track that you might enjoy.

2. Song: "What Up Gangsta" by 50 Cent
   Genre: Rap
   Mood: Aggressive, intense
   Reason: "What Up Gangsta" by 50 Cent is a gritty and hard-hitting rap song that showcases his raw talent and powerful delivery, similar to the style of Eminem and 50 Cent.

3. Song: "Radio Freq" by Dead Prez
   Genre: Rap
   Mood: Conscious, thought-provoking
   Reason: Dead Prez's "Radio Freq" is a socially conscious rap track with insightful lyrics and a strong message, similar to the storytelling style often found in Eminem and 50 Cent's music.
[Document(page_content='combined_info: Song ID: SOXZTLV12A6701FFDD\n Artist : 50 Cent\n Title : P.I.M.P.\n Lyrics: i the y