In [83]:
import pandas as pd
import numpy as np

First, we will build a recommender based on the following song features:
acousticness, danceability, energy, instrumentalness, liveness, loudness, speechiness, tempo, valence, popularity

In [84]:
# Load the Spotify track table; the first CSV column is used as the row index.
df = pd.read_csv("taylor_swift_spotify.csv",index_col=0)
# Preview the first five rows.
df.head()

Unnamed: 0,name,album,release_date,track_number,id,uri,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,duration_ms
0,Welcome To New York (Taylor's Version),1989 (Taylor's Version) [Deluxe],2023-10-27,1,4WUepByoeqcedHoYhSNHRt,spotify:track:4WUepByoeqcedHoYhSNHRt,0.00942,0.757,0.61,3.7e-05,0.367,-4.84,0.0327,116.998,0.685,70,212600
1,Blank Space (Taylor's Version),1989 (Taylor's Version) [Deluxe],2023-10-27,2,0108kcWLnn2HlH2kedi1gn,spotify:track:0108kcWLnn2HlH2kedi1gn,0.0885,0.733,0.733,0.0,0.168,-5.376,0.067,96.057,0.701,71,231833
2,Style (Taylor's Version),1989 (Taylor's Version) [Deluxe],2023-10-27,3,3Vpk1hfMAQme8VJ0SNRSkd,spotify:track:3Vpk1hfMAQme8VJ0SNRSkd,0.000421,0.511,0.822,0.0197,0.0899,-4.785,0.0397,94.868,0.305,72,231000
3,Out Of The Woods (Taylor's Version),1989 (Taylor's Version) [Deluxe],2023-10-27,4,1OcSfkeCg9hRC2sFKB4IMJ,spotify:track:1OcSfkeCg9hRC2sFKB4IMJ,0.000537,0.545,0.885,5.6e-05,0.385,-5.968,0.0447,92.021,0.206,71,235800
4,All You Had To Do Was Stay (Taylor's Version),1989 (Taylor's Version) [Deluxe],2023-10-27,5,2k0ZEeAqzvYMcx9Qt5aClQ,spotify:track:2k0ZEeAqzvYMcx9Qt5aClQ,0.000656,0.588,0.721,0.0,0.131,-5.579,0.0317,96.997,0.52,70,193289


In [85]:
# Number of rows loaded into df.
len(df)

530

In [86]:
def preprocess_data(data):
    """Min-max scale every numeric column of ``data`` into the range [0, 1].

    Non-numeric columns are passed through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding any mix of numeric and non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index. The input frame is left
        untouched, so re-running a cell that calls this function is safe.
    """
    # Work on a copy: scaling the caller's frame in place would silently
    # change every earlier view of it and break re-execution of the notebook.
    normalized = data.copy()
    numeric = normalized.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0:
        return normalized

    col_min = numeric.min()
    col_range = numeric.max() - col_min
    # A constant column has range 0, which would yield 0/0 = NaN. Dividing by
    # 1 instead maps every value of such a column to 0.0.
    safe_range = col_range.replace(0, 1)

    normalized[numeric.columns] = (numeric - col_min) / safe_range
    return normalized

# calculate cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two equal-length 1-D vectors.

    Parameters
    ----------
    vec1, vec2 : array-like of numbers
        Lists, NumPy arrays or pandas Series; values are matched by position.

    Returns
    -------
    float
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either vector
        has zero length (its direction is undefined, and dividing would give NaN).
    """
    # Cast up front so object-dtype input (e.g. a row taken from a mixed-type
    # DataFrame) is handled as plain floats.
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [87]:
def recommender1(user_input, data, top_k=5):
    """Rank the songs in ``data`` by cosine similarity to ``user_input``.

    Parameters
    ----------
    user_input : array-like of numbers
        Feature vector of the query song, in the same column order as the
        feature columns of ``data``.
    data : pandas.DataFrame
        Song table. Every column except 'name', 'album', 'release_date', 'id',
        'uri' and 'duration_ms' is treated as a numeric feature.
    top_k : int, default 5
        Maximum number of results.

    Returns
    -------
    list[tuple[str, float]]
        ``(song name, similarity)`` pairs, most similar first. Names are
        unique: when several rows share a name, the last row's score is kept.
    """
    non_feature_columns = ['name', 'album', 'release_date', 'id', 'uri', 'duration_ms']
    feature_matrix = data.drop(columns=non_feature_columns).to_numpy(dtype=float)
    query = np.asarray(user_input, dtype=float)

    # Score every row in one pass instead of looping over rows. The maths is
    # done here so the function does not depend on which `cosine_similarity`
    # is currently bound in the notebook namespace.
    dot_products = feature_matrix @ query
    norm_products = np.linalg.norm(feature_matrix, axis=1) * np.linalg.norm(query)
    # Zero-length vectors have no direction: score them 0.0 rather than NaN,
    # which would make the sort below unreliable.
    scores = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products != 0,
    )

    # dict() keeps the last score seen for a repeated song name.
    similarities = dict(zip(data['name'], scores.tolist()))
    # sort and give similar top k
    return sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:top_k]

In [88]:
# prompt user for a Taylor Swift song
user_song = input("Enter a Taylor Swift song: ").strip()
print("Input:",user_song)

if user_song in df['name'].values:
    top_k = 5
    non_feature_columns = ['name', 'album', 'release_date', 'id', 'uri', 'duration_ms']

    # track_number is an album position, not one of the audio features this
    # recommender is meant to use, so leave it out of the comparison.
    # preprocess_data rescales the numeric columns so large-valued features
    # (tempo, popularity) do not dominate the cosine similarity.
    feature_df = preprocess_data(df.drop(columns=['track_number'], errors='ignore'))

    # extract features of the user's input song
    user_input = feature_df[feature_df['name'] == user_song].drop(non_feature_columns, axis=1).iloc[0]

    # recommend top k songs; ask for one extra because the query song always
    # matches itself best and is removed from the list
    candidates = recommender1(user_input, feature_df, top_k=top_k + 1)
    recommendations = [(song, score) for song, score in candidates if song != user_song][:top_k]

    if recommendations:
        print("\nTop 5 recommended songs:")
        for song, _ in recommendations:
            print(song)

        # look up each album once and reuse it for both sizing and printing
        albums = {song: str(df.loc[df['name'] == song, 'album'].iloc[0]) for song, _ in recommendations}

        max_song_length = max(len(song) for song, _ in recommendations) + 2
        max_album_length = max(len(album) for album in albums.values()) + 2
        max_similarity_length = max(len(str(similarity)) for _, similarity in recommendations) + 2

        # print song names, album names, and similarity scores
        print()
        print("{:<{song_length}} {:<{album_length}} {:<{similarity_length}}".format('Song', 'Album', 'Similarity Score', song_length=max_song_length, album_length=max_album_length, similarity_length=max_similarity_length))
        for song, similarity in recommendations:
            album_name = albums[song]
            print("{:<{song_length}} {:<{album_length}} {:<{similarity_length}.2f}".format(song, album_name, similarity, song_length=max_song_length, album_length=max_album_length, similarity_length=max_similarity_length))
    else:
        print("No other songs available to recommend.")
else:
    print("Song not found in the dataset.")

Input: 
Song not found in the dataset.


Recommender 2

Next, we will build a recommender based on song lyrics.

In [115]:
# Load the discography table; its song_lyrics column is the input to the lyric-based recommender below.
df2 = pd.read_csv("ts_discography_released.csv")
# Preview the first five rows.
df2.head()

Unnamed: 0,album_title,album_url,category,album_track_number,song_title,song_url,song_artists,song_release_date,song_page_views,song_lyrics,song_writers,song_producers,song_tags
0,Taylor Swift,https://genius.com/albums/Taylor-Swift/Taylor-...,Taylor Swift,1,Tim McGraw,https://genius.com/Taylor-swift-tim-mcgraw-lyrics,['Taylor Swift'],2006-06-19,241400,He said the way my blue eyes shined Put those ...,"['Liz Rose', 'Taylor Swift']",['Nathan Chapman'],"['Country', 'English', 'USA', 'Country Rock', ..."
1,Taylor Swift,https://genius.com/albums/Taylor-Swift/Taylor-...,Taylor Swift,2,Picture to Burn,https://genius.com/Taylor-swift-picture-to-bur...,['Taylor Swift'],2006-10-24,258600,"State the obvious, I didn't get my perfect fan...","['Liz Rose', 'Taylor Swift']",['Nathan Chapman'],"['Pop', 'Rock', 'Country', 'English', 'USA', '..."
2,Taylor Swift,https://genius.com/albums/Taylor-Swift/Taylor-...,Taylor Swift,3,Teardrops On My Guitar,https://genius.com/Taylor-swift-teardrops-on-m...,['Taylor Swift'],2006-10-24,227300,Drew looks at me I fake a smile so he won't se...,"['Liz Rose', 'Taylor Swift']",['Nathan Chapman'],"['Country', 'English', 'USA', 'Adult Contempor..."
3,Taylor Swift,https://genius.com/albums/Taylor-Swift/Taylor-...,Taylor Swift,4,A Place In This World,https://genius.com/Taylor-swift-a-place-in-thi...,['Taylor Swift'],2006-10-24,75800,"I don't know what I want, so don't ask me 'Cau...","['Angelo Petraglia', 'Robert Ellis Orrall', 'T...",['Nathan Chapman'],"['Country', 'Pop', 'English', 'USA', 'Teen Pop..."
4,Taylor Swift,https://genius.com/albums/Taylor-Swift/Taylor-...,Taylor Swift,5,Cold as You,https://genius.com/Taylor-swift-cold-as-you-ly...,['Taylor Swift'],2006-10-24,127000,You have a way of coming easily to me And when...,"['Liz Rose', 'Taylor Swift']",['Nathan Chapman'],"['Country', 'Pop', 'English', 'USA', 'Ballad',..."


In [116]:
# add column if taylor's version exists for that song
import re

def clean_song_title(title):
    """Remove a "(ft. <artist>)" credit from a title."""
    return re.sub(r'\(ft\. [^\)]+\)', '', title)

def clean_song_title2(title):
    """Drop any text that follows a "(taylor's version)" marker."""
    return re.sub(r'(?<=\(taylor\'s version\)) .*', '', title)

def remove_trailing_spaces(title):
    """Remove whitespace that directly follows an ellipsis ("... x" -> "...x")."""
    return re.sub(r'\.\.\.\s+', '...', title)

# Normalised title used only for matching: lower-cased, trimmed, featured-artist
# credits and post-"(taylor's version)" suffixes removed.
df2['song_title_lower'] = (
    df2['song_title'].str.lower().str.strip()
    .apply(clean_song_title)
    .apply(clean_song_title2)
    .apply(remove_trailing_spaces)
    .str.lower().str.strip()
)

# A song has a Taylor's Version when "<title> (taylor's version)" is itself a
# normalised title in the table. isin() checks every row at once instead of
# scanning the whole column once per row.
df2['taylors_version_exists'] = (
    df2['song_title_lower'] + " (taylor's version)"
).isin(df2['song_title_lower'])

In [118]:
import re
import nltk
# Fetch every NLTK data package this notebook uses, not only WordNet:
# stopwords.words() needs 'stopwords' and word_tokenize() needs the Punkt
# tokenizer models ('punkt'; newer NLTK releases look for 'punkt_tab').
# Packages that are already present are reported as up to date and skipped.
for _nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance used by preprocess_text.
lemmatizer = WordNetLemmatizer()

_STOP_WORDS_CACHE = {}

def _english_stop_words():
    """Return NLTK's English stop words as a set, reading the corpus only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']

def preprocess_text(text):
    """Normalise raw lyrics into a space-separated string of lemmatized tokens.

    Steps: lower-case, strip every character that is not a word character or
    whitespace, tokenize, drop English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw lyrics. Any non-string value (e.g. NaN for a song with no lyrics)
        is treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' for missing lyrics.
    """
    # Missing lyrics are read by pandas as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # NOTE(review): punctuation is removed before stop-word filtering, so
    # contractions lose their apostrophe ("don't" -> "dont") and may no longer
    # match apostrophe-bearing stop-word entries -- confirm this is intended.
    text = re.sub(r'[^\w\s]', '', text)
    stop_words = _english_stop_words()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(tokens)

df2['processed_lyrics'] = df2['song_lyrics'].apply(preprocess_text)

[nltk_data] Downloading package wordnet to /Users/megha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [122]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: this import rebinds the name `cosine_similarity`. Any cell run after
# this one gets scikit-learn's pairwise function instead of the two-vector
# helper defined earlier in this notebook.
from sklearn.metrics.pairwise import cosine_similarity

# Turn each song's preprocessed lyrics into one TF-IDF row vector.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df2['processed_lyrics'])
# Similarity of every song's lyrics against every other song's; recommender2
# reads one row of this matrix per query.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def recommender2(title, top_k=5):
    """Recommend songs whose lyrics are most similar to those of ``title``.

    Reads the module-level ``df2`` (song table) and ``cosine_sim`` (pairwise
    lyric similarity matrix, one row/column per row of ``df2``).

    Parameters
    ----------
    title : str
        Song title. An exact match against ``df2['song_title']`` is tried
        first; if there is none, the match is retried ignoring case and
        surrounding whitespace.
    top_k : int, default 5
        Number of recommendations to return.

    Returns
    -------
    list[tuple]
        ``(song_title, album_title, similarity)`` tuples, most similar first.
        The query song itself and any song flagged ``taylors_version_exists``
        are skipped. Returns ``[]`` (after printing an error) when the title
        is not found.
    """
    titles = df2['song_title']
    # Row *positions* are used throughout because cosine_sim is a plain array
    # addressed by position, not by DataFrame index label.
    matches = np.flatnonzero((titles == title).to_numpy())
    if len(matches) == 0:
        wanted = str(title).strip().lower()
        normalized = titles.astype(str).str.strip().str.lower()
        matches = np.flatnonzero((normalized == wanted).to_numpy())
    if len(matches) == 0:
        print("Error: Song not found")
        return []

    idx = matches[0]
    sim_scores = cosine_sim[idx]
    ranked_positions = sim_scores.argsort()[::-1]

    recommendations = []
    for i in ranked_positions:
        if len(recommendations) == top_k:  # stop at k recommendations
            break
        # Exclude the input song explicitly: it is not guaranteed to rank
        # first when another song has an identical similarity of 1.0.
        if i == idx:
            continue
        candidate = df2.iloc[i]
        if candidate['taylors_version_exists']:  # skip if taylor's version exists
            continue
        recommendations.append(
            (candidate['song_title'], candidate['album_title'], sim_scores[i])
        )
    return recommendations


# Demo run of the lyric recommender with a fixed title.
input_song_title = "cardigan"
recommended_titles = recommender2(input_song_title)
print("Recommended Song Titles based on \"", input_song_title, "\": ", sep="")
for title, album, similarity in recommended_titles:
    # One labelled line per field, then a blank separator line.
    labelled_fields = (
        ("Song:", title),
        ("Album:", album),
        ("Similarity Score:", similarity),
    )
    for label, value in labelled_fields:
        print(label, value)
    print()

Recommended Song Titles based on "cardigan": 
Song: If This Was a Movie (Taylor's Version)
Album: The More Fearless (Taylor's Version) Chapter
Similarity Score: 0.23731181144076954

Song: Only The Young
Album: nan
Similarity Score: 0.21049275854280006

Song: The Moment I Knew (Taylor's Version)
Album: Red (Taylor's Version)
Similarity Score: 0.19048198264818975

Song: hoax
Album: folklore (deluxe version)
Similarity Score: 0.13972464371792961

Song: Mary's Song (Oh My My My)
Album: Taylor Swift
Similarity Score: 0.12791086736695823



In [124]:
input_song_title = input("Enter a Taylor Swift song: ").strip()
recommended_titles = recommender2(input_song_title)

# recommender2 returns an empty list when the title is unknown; skip the
# report in that case (max() on an empty sequence would raise ValueError).
if recommended_titles:
    print("Recommended Song Titles based on \"", input_song_title, "\": ", sep="")
    print()
    for title, album, similarity in recommended_titles:
        print(title)

    max_song_length = max(len(song) for song, _, _ in recommended_titles) + 2
    max_album_length = max(len(str(album)) for _, album, _ in recommended_titles) + 2
    max_similarity_length = max(len(str(similarity)) for _, _, similarity in recommended_titles) + 2

    # print song names, album names, and similarity scores
    print()
    print("{:<{song_length}} {:<{album_length}} {:<{similarity_length}}".format('Song', 'Album', 'Similarity Score', song_length=max_song_length, album_length=max_album_length, similarity_length=max_similarity_length))
    for title, album, similarity in recommended_titles:
        # str(album): a missing album (NaN/None) is printed as text, matching
        # how the column width was measured above.
        print("{:<{song_length}} {:<{album_length}} {:<{similarity_length}.2f}".format(title, str(album), similarity, song_length=max_song_length, album_length=max_album_length, similarity_length=max_similarity_length))
else:
    print("No recommendations to display.")

Recommended Song Titles based on "cardigan": 

If This Was a Movie (Taylor's Version)
Only The Young
The Moment I Knew (Taylor's Version)
hoax
Mary's Song (Oh My My My)

Song                                     Album                                          Similarity Score     
If This Was a Movie (Taylor's Version)   The More Fearless (Taylor's Version) Chapter   0.24                 
Only The Young                           nan                                            0.21                 
The Moment I Knew (Taylor's Version)     Red (Taylor's Version)                         0.19                 
hoax                                     folklore (deluxe version)                      0.14                 
Mary's Song (Oh My My My)                Taylor Swift                                   0.13                 


Recommender 3

Our third recommender will be based on the following features: song tags (micro-genres, etc.), release date, writers, producers