In [None]:
import os

import pandas as pd

# Both raw CSV exports are expected to sit on the current user's Desktop.
desktop_path = os.path.expanduser("~/Desktop")
playlist_file, song_attr_file = (
    os.path.join(desktop_path, csv_name)
    for csv_name in ("spotify_dataset.csv", "Spotify_Song_Attributes.csv")
)

# Rows of the playlist export that cannot be parsed are skipped instead of
# aborting the load; the attributes file is read with pandas' defaults.
playlist_df = pd.read_csv(filepath_or_buffer=playlist_file, on_bad_lines='skip')
song_attr_df = pd.read_csv(filepath_or_buffer=song_attr_file)

# Summary statistics of the raw playlist frame (rendered as the cell output).
playlist_df.describe()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
count,12891680,12858108,12891592,12890434
unique,15918,289820,2032043,157504
top,4398de6902abde3351347b048fcdc287,Daft Punk,Intro,Starred
freq,295275,36086,6676,1337085


In [None]:
# Normalise the playlist header: drop the stray double quotes, trim the
# surrounding whitespace and lowercase every column name.
playlist_df.columns = [
    raw_name.replace('"', '').strip().lower()
    for raw_name in playlist_df.columns
]
print(playlist_df.columns.tolist())


['user_id', 'artistname', 'trackname', 'playlistname']


In [None]:
# Clean the playlist data in one pass:
#   1. drop rows that have no artist or no track name,
#   2. trim whitespace around both names,
#   3. keep the first row of every (artist, track) pair,
#   4. renumber the rows from 0.
new_playlist_df = (
    playlist_df
    .dropna(subset=['artistname', 'trackname'])
    .assign(
        artistname=lambda frame: frame['artistname'].str.strip(),
        trackname=lambda frame: frame['trackname'].str.strip(),
    )
    .drop_duplicates(subset=['artistname', 'trackname'])
    .reset_index(drop=True)
)

print(new_playlist_df.describe())

                                 user_id       artistname trackname  \
count                            2790680          2790680   2790680   
unique                             14062           289778   2005143   
top     4398de6902abde3351347b048fcdc287  Various Artists     Intro   
freq                              138074             6780      1589   

       playlistname  
count       2789536  
unique        92871  
top         Starred  
freq         194353  


In [None]:
# (rows, columns) of the cleaned playlist frame, shown as the cell output.
new_playlist_df.shape

(2790680, 4)

In [None]:
# Write the cleaned playlist frame to the Desktop, without the row index.
desktop_path = os.path.expanduser("~/Desktop")
filename = "cleaned_spotify_playlists.csv"

cleaned_playlist_path = os.path.join(desktop_path, filename)
new_playlist_df.to_csv(path_or_buf=cleaned_playlist_path, index=False)

In [None]:
#cleaning dataset 2
from sklearn.preprocessing import MinMaxScaler

# Normalise the header FIRST so that every later step refers to the lowercase
# column names. The required-column list below then still matches when the
# cell is executed a second time on the already-renamed frame.
song_attr_df.columns = [col.strip().lower().replace(" ", "_") for col in song_attr_df.columns]

# Rows missing any of these values are dropped. 'trackname' is included because
# it is a merge key further down; a row without it can never be matched.
required_cols = [
    'trackname', 'artistname', 'genre', 'msplayed', 'danceability', 'energy',
    'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri',
    'track_href', 'analysis_url', 'duration_ms', 'time_signature',
]
song_attr_df = (
    song_attr_df
    .dropna(subset=required_cols)
    .drop_duplicates()
    .copy()
)

# Trim every text column, but lowercase only the descriptive ones: Spotify
# identifiers and the URIs/URLs built from them are case-sensitive, so
# lowercasing them would make them point at nothing.
# No astype(str) here: it would turn missing values into the literal text 'nan'.
case_sensitive_cols = {'id', 'uri', 'track_href', 'analysis_url'}
for col in song_attr_df.select_dtypes(include=['object']).columns:
    cleaned_text = song_attr_df[col].str.strip()
    if col not in case_sensitive_cols:
        cleaned_text = cleaned_text.str.lower()
    song_attr_df[col] = cleaned_text

# Rescale every numeric column to the [0, 1] range.
scaler = MinMaxScaler()
numeric_cols = song_attr_df.select_dtypes(include=['number']).columns
song_attr_df[numeric_cols] = scaler.fit_transform(song_attr_df[numeric_cols])

song_attr_df.to_csv("cleaned_dataset.csv", index=False)

print("Dataset cleaned and saved as 'cleaned_dataset.csv'")

# Show a small sample rather than dumping the whole frame.
song_attr_df.head()

                                              trackname  \
1     "in the hall of the mountain king" from peer g...   
2                                    #brooklynbloodpop!   
3                                                   $10   
4                            (i just) died in your arms   
5                                         (l)only child   
...                                                 ...   
5034                                            younger   
5035                                 younger with time.   
5037                your love is my drug (8 bit slowed)   
5038                                         your power   
5039                            your voice / bethel, ny   

                     artistname  msplayed              genre  danceability  \
1     london symphony orchestra  0.011405  british orchestra      0.486680   
2                          syko  0.000919         glitchcore      0.707992   
3                  good morning  0.000158   experimental 

In [None]:
def merge_datasets(new_playlist_df, song_attr_df):
    """
    Merge the playlist and song-attribute datasets on track name AND artist name.

    Matching on the track name alone pairs unrelated songs that merely share a
    title, so both keys are used. The comparison ignores case and surrounding
    whitespace.

    Parameters
    ----------
    new_playlist_df : pandas.DataFrame
        Playlist rows. Must contain the track and artist columns, spelled
        either 'trackname' / 'artistname' or 'track_name' / 'artist_name'.
    song_attr_df : pandas.DataFrame
        Song attributes, with the same key columns in either spelling.

    Returns
    -------
    pandas.DataFrame
        Inner join of the two frames. The key columns are named 'track_name'
        and 'artist_name' and hold the lowercased, trimmed values. Neither
        input frame is modified.

    Raises
    ------
    KeyError
        If either frame lacks a track or artist column in both spellings.
    """
    # The cleaning cells produce 'trackname' / 'artistname', while the later
    # cells read 'track_name' / 'artist_name'; accept the former and emit the
    # latter.
    column_aliases = {'trackname': 'track_name', 'artistname': 'artist_name'}
    merge_keys = ['track_name', 'artist_name']

    def _with_normalised_keys(frame, frame_label):
        """Return a copy of `frame` with canonical, lowercased, trimmed key columns."""
        renamed = frame.rename(columns=column_aliases)
        missing = [key for key in merge_keys if key not in renamed.columns]
        if missing:
            raise KeyError(f"{frame_label} is missing merge column(s): {missing}")
        # pandas treats NaN keys as equal to each other when merging, so rows
        # without a key must go before the join.
        renamed = renamed.dropna(subset=merge_keys)
        return renamed.assign(**{
            key: renamed[key].astype(str).str.strip().str.lower()
            for key in merge_keys
        })

    merged_df = pd.merge(
        _with_normalised_keys(new_playlist_df, 'new_playlist_df'),
        _with_normalised_keys(song_attr_df, 'song_attr_df'),
        on=merge_keys,
        how='inner'  # keep only songs present in both datasets
    )

    return merged_df
# Bind the result: the recommendation cell further down reads `merged_df`,
# which was otherwise never assigned. The bare name on the last line keeps the
# frame as this cell's displayed output.
merged_df = merge_datasets(new_playlist_df, song_attr_df)
merged_df

Unnamed: 0,user_id,artist_name_x,track_name,playlistname,artist_name_y,msplayed,genre,danceability,energy,key,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,944c80d26922ae634d6ce445b1fdff7f,Passenger,27,HATK,loving caliber,0.018050,scandipop,0.627049,0.361672,0.000000,...,0.091684,0.465517,0.592910,audio_features,3r7w1kne3d3ke1jpnyifwm,spotify:track:3r7w1kne3d3ke1jpnyifwm,https://api.spotify.com/v1/tracks/3r7w1kne3d3k...,https://api.spotify.com/v1/audio-analysis/3r7w...,0.109588,0.8
1,944c80d26922ae634d6ce445b1fdff7f,Lily Allen,22,Starred,taylor swift,0.003757,pop,0.657787,0.695366,0.636364,...,0.053668,0.651116,0.440245,audio_features,3yii7uwglf6k5zw3xad3mp,spotify:track:3yii7uwglf6k5zw3xad3mp,https://api.spotify.com/v1/tracks/3yii7uwglf6k...,https://api.spotify.com/v1/audio-analysis/3yii...,0.120256,0.8
2,c50566d83fba17b20697039d5824db78,Hurula,22,Everything at once,taylor swift,0.003757,pop,0.657787,0.695366,0.636364,...,0.053668,0.651116,0.440245,audio_features,3yii7uwglf6k5zw3xad3mp,spotify:track:3yii7uwglf6k5zw3xad3mp,https://api.spotify.com/v1/tracks/3yii7uwglf6k...,https://api.spotify.com/v1/audio-analysis/3yii...,0.120256,0.8
3,c50566d83fba17b20697039d5824db78,Molotov Jive,29,Everything at once,chris james,0.000873,singer-songwriter pop,0.796107,0.597162,0.363636,...,0.087424,0.912779,0.566707,audio_features,0ahovvrk5frfzvf8fp7pzp,spotify:track:0ahovvrk5frfzvf8fp7pzp,https://api.spotify.com/v1/tracks/0ahovvrk5frf...,https://api.spotify.com/v1/audio-analysis/0aho...,0.069810,0.8
4,c50566d83fba17b20697039d5824db78,Molotov Jive,29,Everything at once,run river north,0.000229,stomp and holler,0.449795,0.903800,0.636364,...,0.238633,0.278905,0.664592,audio_features,0y30gl6j3y3xdxk3scoctp,spotify:track:0y30gl6j3y3xdxk3scoctp,https://api.spotify.com/v1/tracks/0y30gl6j3y3x...,https://api.spotify.com/v1/audio-analysis/0y30...,0.107280,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
920,edea424cdf88985d97bd74c9bc3674fd,Alison Crockett,alive,Astralizzare,audrey mika,0.002483,modern indie pop,0.605533,0.334616,0.818182,...,0.077415,0.297160,0.366001,audio_features,39lmkwlmeetbr80uy7hivu,spotify:track:39lmkwlmeetbr80uy7hivu,https://api.spotify.com/v1/tracks/39lmkwlmeetb...,https://api.spotify.com/v1/audio-analysis/39lm...,0.101893,1.0
921,edea424cdf88985d97bd74c9bc3674fd,Alison Crockett,alive,Astralizzare,empire of the sun,0.000571,australian dance,0.536885,0.885762,0.454545,...,0.048877,0.400609,0.507858,audio_features,5wbnkcespwsvwhuzmljs3s,spotify:track:5wbnkcespwsvwhuzmljs3s,https://api.spotify.com/v1/tracks/5wbnkcespwsv...,https://api.spotify.com/v1/audio-analysis/5wbn...,0.105903,0.8
922,edea424cdf88985d97bd74c9bc3674fd,Alison Crockett,alive,Astralizzare,khalid,0.001120,pop,0.395492,0.405764,1.000000,...,0.155574,0.207911,0.332842,audio_features,12vqhhz4wvvcnedsivjleq,spotify:track:12vqhhz4wvvcnedsivjleq,https://api.spotify.com/v1/tracks/12vqhhz4wvvc...,https://api.spotify.com/v1/audio-analysis/12vq...,0.091110,0.8
923,edea424cdf88985d97bd74c9bc3674fd,Alison Crockett,alive,Astralizzare,willis,0.002938,alabama indie,0.630123,0.372695,0.545455,...,0.169418,0.487830,0.372445,audio_features,7jzhp7ixsvgtm6zdwugwip,spotify:track:7jzhp7ixsvgtm6zdwugwip,https://api.spotify.com/v1/tracks/7jzhp7ixsvgt...,https://api.spotify.com/v1/audio-analysis/7jzh...,0.078985,0.8


In [None]:

from gensim.models import Word2Vec

# `df` was never defined anywhere in the notebook, so build it here from the
# cleaned playlist frame, using the column spelling this cell expects.
# NOTE(review): new_playlist_df keeps only ONE row per (artist, track) pair, so
# each song can appear in a single playlist only. That removes most of the
# co-occurrence signal Word2Vec learns from -- confirm this is the intended
# training frame.
df = (
    new_playlist_df
    .rename(columns={'trackname': 'track_name', 'artistname': 'artist_name'})
    .dropna(subset=['track_name', 'artist_name'])
)

# Create unique song identifier
df = df.assign(
    song_id=df['track_name'].str.lower() + ' - ' + df['artist_name'].str.lower()
)

# Group songs by playlist. A playlist is identified by its owner AND its name:
# grouping by name alone would fuse every user's "Starred" playlist (the most
# frequent name in the data) into one enormous, meaningless "sentence".
playlists = (
    df.groupby(['user_id', 'playlistname'])['song_id']
    .apply(list)
    .tolist()
)

# Train Word2Vec model (skip-gram). The seed fixes the initial vectors; with
# more than one worker thread the result can still vary slightly between runs.
model = Word2Vec(
    sentences=playlists,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    workers=4,
    epochs=20,
    seed=42
)

# Save model
model.save("song_word2vec.model")

# Example: Find similar songs
song_query = 'blinding lights - the weeknd'
if song_query in model.wv:
    similar_songs = model.wv.most_similar(song_query, topn=10)
    for song, score in similar_songs:
        print(f"{song} (score: {score:.3f})")
else:
    print("Song not in vocabulary.")


In [None]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Example user input
user_preferences = {
    'genre': 'pop',
    'mood': 'happy',  # high valence
    'energy': 0.7,
    'danceability': 0.6,
    'favorite_song': 'blinding lights',  # example seed
    'favorite_artist': 'the weeknd'      # needed to build the model's song id
}

# Map mood to valence threshold
mood_valence_map = {
    'happy': 0.6,
    'sad': 0.3,
    'energetic': 0.7,
    'calm': 0.4
}
valence_threshold = mood_valence_map.get(user_preferences['mood'], 0.5)

# `w2v_model` was never defined in the notebook; load the model that the
# training cell saved to disk.
w2v_model = Word2Vec.load("song_word2vec.model")

# The model's vocabulary entries have the form "<track> - <artist>" (all
# lowercase), so the lookup key has to be built the same way. A bare track
# name can never be found in that vocabulary.
merged_df['song_key'] = (
    merged_df['track_name'].str.lower().str.strip()
    + ' - '
    + merged_df['artist_name'].str.lower().str.strip()
)

# Check if the favorite song is in the model
seed_song = (
    f"{user_preferences['favorite_song'].lower().strip()} - "
    f"{user_preferences['favorite_artist'].lower().strip()}"
)
if seed_song not in w2v_model.wv:
    print(f"'{seed_song}' not in vocabulary. Please choose another song.")
else:
    # Get most similar songs using Word2Vec; most_similar already returns the
    # similarity score of every neighbour, so keep it for ranking.
    similar_songs = w2v_model.wv.most_similar(seed_song, topn=500)
    similarity_by_song = dict(similar_songs)

    # Keep only the neighbours that also satisfy the metadata filters.
    # NOTE(review): if merged_df derives from the min-max scaled attribute
    # frame, these thresholds are compared against scaled values -- confirm.
    is_neighbour = merged_df['song_key'].isin(list(similarity_by_song))
    matches_preferences = (
        (merged_df['genre'].str.lower() == user_preferences['genre'].lower()) &
        (merged_df['valence'] >= valence_threshold) &
        (merged_df['energy'] >= user_preferences['energy']) &
        (merged_df['danceability'] >= user_preferences['danceability'])
    )

    # .assign builds a new frame, so no column is ever set on a filtered slice.
    filtered_df = (
        merged_df.loc[is_neighbour & matches_preferences]
        .assign(similarity=lambda frame: frame['song_key'].map(similarity_by_song))
    )

    # Sort by similarity; list each song once even if it has several rows.
    ranked_df = (
        filtered_df
        .drop_duplicates(subset='song_key')
        .sort_values(by='similarity', ascending=False)
    )

    # Show top 10 recommendations
    recommendations = ranked_df[['track_name', 'artist_name', 'genre', 'valence', 'similarity']].head(10)
    print("Top Recommendations:")
    print(recommendations)
