In [13]:
import pandas as pd
# load difficulty data
df = pd.read_csv('../data/billboard_lyrics_1960-2024_difficulty.csv')
df

Unnamed: 0,year,rank,title,artist,url,lyrics,lyrics_ori,lyrics_tokens,lyrics_cleaned,readability,lexical_density,sentence_complexity,lexical_diversity,word_difficulty,lyrics_difficulty,lyrics_difficulty_class
0,1960,1,Theme from A Summer Place,Percy Faith,https://genius.com/Percy-faith-theme-from-a-su...,theres a summer place\nwhere it may rain or st...,There's a summer place\nWhere it may rain or s...,"['there', 'summer', 'place', 'may', 'rain', 's...",there summer place may rain storm yet im safe ...,-0.060007,-2.143354,-0.058676,0.152713,0.268202,-0.848522,easy
1,1960,2,He'll Have to Go,Jim Reeves,https://genius.com/Jim-reeves-hell-have-to-go-...,\nput your sweet lips a little closer to the p...,[Verse 1]\nPut your sweet lips a little closer...,"['put', 'sweet', 'lip', 'little', 'closer', 'p...",put sweet lip little closer phone let pretend ...,-0.035813,-0.575882,-0.034457,-0.323078,-0.960524,-0.902244,easy
2,1960,3,Cathy's Clown,The Everly Brothers,https://genius.com/The-everly-brothers-cathys-...,\ndont want your love anymore\ndont want your ...,[Chorus]\nDon't want your love anymore\nDon't ...,"['dont', 'want', 'love', 'anymore', 'dont', 'w...",dont want love anymore dont want kiss thats su...,-0.039823,1.082909,-0.037989,-1.223520,0.462769,0.134404,medium
3,1960,4,Running Bear,Johnny Preston,https://genius.com/Johnny-preston-running-bear...,\nvocalizations\n\non the bank of the river\ns...,[Intro]\n*vocalizations*\n[Verse 1]\nOn the ba...,"['vocalization', 'bank', 'river', 'stood', 'ru...",vocalization bank river stood running bear you...,-0.048492,0.679859,-0.050098,-0.361580,0.861752,0.538662,hard
4,1960,5,Teen Angel,Mark Dinning,https://genius.com/Mark-dinning-teen-angel-lyrics,\nteen angel\nteen angel\nteen angel ooh\n\nth...,"[Intro]\nTeen Angel\nTeen Angel\nTeen Angel, o...","['teen', 'angel', 'teen', 'angel', 'teen', 'an...",teen angel teen angel teen angel ooh fateful n...,-0.057031,-0.343701,-0.057162,-0.063980,1.204597,0.352218,hard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5918,2024,96,Bulletproof,Nate Smith,https://genius.com/Nate-smith-bulletproof-lyrics,\nusually these ol neon lights\nare pretty goo...,"[Verse 1]\nUsually, these ol' neon lights\nAre...","['usually', 'ol', 'neon', 'light', 'pretty', '...",usually ol neon light pretty good bad goodbye ...,-0.044869,0.289946,-0.045052,1.116625,-0.973568,0.184768,medium
5919,2024,97,Fe!n,Travis Scott featuring Playboi Carti,https://genius.com/Thehills79-2024-thehills79-...,january 3 2024 wednesday\nrockstar\n by stray ...,"January 3, 2024 Wednesday\nROCK-STAR\n by Stra...","['january', '3', '2024', 'wednesday', 'rocksta...",january 3 2024 wednesday rockstar stray kid ja...,0.300321,0.997593,0.295527,0.480326,-0.203566,0.749614,hard
5920,2024,98,The Painter,Cody Johnson,https://genius.com/Cody-johnson-the-painter-ly...,\nshe talks about the future like shes flippin...,[Verse 1]\nShe talks about the future like she...,"['talk', 'future', 'like', 'shes', 'flippin', ...",talk future like shes flippin magazine find be...,-0.007607,-0.129075,-0.009228,-0.174202,0.895179,0.278149,medium
5921,2024,99,Down Bad,Taylor Swift,https://genius.com/Taylor-swift-down-bad-lyrics,\ndid you really beam me up\nin a cloud of spa...,[Verse 1]\nDid you really beam me up\nIn a clo...,"['really', 'beam', 'cloud', 'sparkling', 'dust...",really beam cloud sparkling dust experiment te...,-0.048492,-0.541447,-0.047071,0.762688,2.465411,1.255880,hard


# Semantic Similarity

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['lyrics_cleaned'])

In [15]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(sentences=df['lyrics_tokens'], vector_size=100, window=5, min_count=1, workers=4)

df['w2v_vector'] = df['lyrics_tokens'].apply(lambda x: sum([w2v_model.wv[i] for i in x if i in w2v_model.wv])/len(x))

In [16]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

df['bert_vector'] = df['lyrics_cleaned'].apply(lambda x: model.encode(x))

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

similarity_matrix = cosine_similarity(tfidf_matrix)

def recommend(song_title, df, similarity_matrix, top_n=5):
    song_idx = df[df['title']==song_title].index[0]
    similar_songs = np.argsort(similarity_matrix[song_idx])[::-1][1:top_n+1]
    return df.iloc[similar_songs][['title', 'artist', 'lyrics_difficulty_class']]

In [23]:
print(recommend('Shape of You', df, similarity_matrix))

                               title                    artist  \
1018        Somebody's Been Sleeping  100 Proof (Aged in Soul)   
289                I Will Follow Him        Little Peggy March   
5629  Montero (Call Me by Your Name)                 Lil Nas X   
5874            What Was I Made For?             Billie Eilish   
609                   The Pied Piper       Crispian St. Peters   

     lyrics_difficulty_class  
1018                  medium  
289                     easy  
5629                    hard  
5874                  medium  
609                     easy  


In [24]:
# save vector and similarity matrix
df.to_csv('../data/billboard_lyrics_1960-2024_vector.csv', index=False)
similarity_df = pd.DataFrame(similarity_matrix, columns=df['title'], index=df['title'])
similarity_df.to_csv('../data/billboard_lyrics_1960-2024_similarity.csv')

In [30]:
def recommend_by_difficulty(user_level, df, top_n=5):
    return df[df['lyrics_difficulty_class']==user_level].sample(top_n)[['title', 'artist', 'lyrics_difficulty_class', 'lyrics']]

print(recommend_by_difficulty('easy', df))

                        title       artist lyrics_difficulty_class  \
3945                    Jaded    Aerosmith                    easy   
1812                 Sad Eyes  Robert John                    easy   
2839            Armageddon It  Def Leppard                    easy   
342   "Painted, Tainted Rose"   Al Martino                    easy   
554   Reach Out I'll Be There    Four Tops                    easy   

                                                 lyrics  
3945  \nhey jjjaded\nyou got your mamas style\nbut y...  
1812  \nlooks like its over\nyou knew i couldnt stay...  
2839  \nyou better come inside when youre ready to\n...  
342   \nshe was a wild and lovely rose\noh how i lov...  
554   1 \nrespect\n  \naretha franklin\n 1967\n2 \nf...  


In [28]:
def hybrid_recommendation(song_title, user_level, df, similarity_matrix, top_n=5):
    song_idx = df[df['title']==song_title].index[0]
    similar_songs = np.argsort(similarity_matrix[song_idx])[::-1][1:]
    if user_level == 'easy':
        similar_songs = df[df['lyrics_difficulty_class']=='easy'].index
    elif user_level == 'medium':
        similar_songs = df[df['lyrics_difficulty_class']=='medium'].index
    elif user_level == 'hard':
        similar_songs = df[df['lyrics_difficulty_class']=='hard'].index
    final_songs = [i for i in similar_songs if i in similar_songs][:top_n]
    return df.iloc[final_songs][['title', 'artist', 'lyrics_difficulty_class', 'lyrics_ori']]

In [29]:
print(hybrid_recommendation('Shape of You', 'medium', df, similarity_matrix))

                         title               artist lyrics_difficulty_class  \
2                Cathy's Clown  The Everly Brothers                  medium   
7                    Handy Man          Jimmy Jones                  medium   
8                 Stuck on You        Elvis Presley                  medium   
11                 Greenfields    The Brothers Four                  medium   
23  Save the Last Dance for Me         The Drifters                  medium   

                                           lyrics_ori  
2   [Chorus]\nDon't want your love anymore\nDon't ...  
7   Comma, comma, comma, comma, come, come, comma\...  
8   [Verse 1]\nYou can shake an apple off an apple...  
11  Once there were greenfields kissed by the sun\...  
23  [Verse 1]\nYou can dance\nEvery dance with the...  
