In [11]:
import numpy as np
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import psycopg2

In [8]:
df1 = pd.read_csv("/Users/monishakrothapalli/Documents/GitHub/playlistGen/spotify_data.csv")

In [9]:
df1.columns

Index(['index', 'artist_name', 'track_name', 'track_id', 'popularity', 'year',
       'genre', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature'],
      dtype='object')

In [4]:
df1.info()
df1.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159764 entries, 0 to 1159763
Data columns (total 20 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Unnamed: 0        1159764 non-null  int64  
 1   artist_name       1159749 non-null  object 
 2   track_name        1159763 non-null  object 
 3   track_id          1159764 non-null  object 
 4   popularity        1159764 non-null  int64  
 5   year              1159764 non-null  int64  
 6   genre             1159764 non-null  object 
 7   danceability      1159764 non-null  float64
 8   energy            1159764 non-null  float64
 9   key               1159764 non-null  int64  
 10  loudness          1159764 non-null  float64
 11  mode              1159764 non-null  int64  
 12  speechiness       1159764 non-null  float64
 13  acousticness      1159764 non-null  float64
 14  instrumentalness  1159764 non-null  float64
 15  liveness          1159764 non-null  float64
 16  

Unnamed: 0           0
artist_name         15
track_name           1
track_id             0
popularity           0
year                 0
genre                0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
duration_ms          0
time_signature       0
dtype: int64

In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
def section_data(genre):
    sampled = df1[df1['genre'] == genre]
    features = ['valence', 'energy', 'danceability', 'loudness']
    df2 = sampled[features]
    scaler = StandardScaler()
    df2  = scaler.fit_transform(df2)
    sim_matrix = cosine_similarity(df2)
    return sim_matrix, sampled 

In [27]:
def get_indices(genre):
    _, sampled = section_data(genre)
    indices = {song: i for i, song in enumerate(sampled['track_name'])}
    return indices; 

In [28]:
def get_genre(song, artist):
    genre = df1.loc[(df1['track_name'] == song) & (df1['artist_name'] == artist), 'genre'].values[0]
    return genre 

In [29]:
def check_in_database(song, artist):
    return not df1[(df1['track_name'] == song) & (df1['artist_name'] == artist)].empty

In [30]:
def get_recs(song, artist, length, new_song_features=None):
    global df1
    if new_song_features:
        if (not(check_in_database(song, artist))):
            new_song_data = {
                'track_name': song,
                'artist_name': artist,
                'genre': new_song_features.pop('genre')
            }
            new_song_data.update(new_song_features)
            new_song_df = pd.DataFrame([new_song_data])
            df1 = pd.concat([df1, new_song_df], ignore_index=True)
    
    genre = get_genre(song, artist)

    cosine_sim, sampled = section_data(genre)
    indices = get_indices(genre)

    if song not in indices:
        raise ValueError("Song '{song}' not found.")
    
    index = indices[song]

    sim_scores = list(enumerate(cosine_sim[index]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[0:length]

    song_index = [i[0] for i in sim_scores]

    return sampled['artist_name'].iloc[song_index],sampled['track_name'].iloc[song_index] 

In [31]:
artists, tracks = get_recs('Kill Bill', 'SZA', 20)
print(tracks)
print(artists)

569184                                        Kill Bill
462138                                Gimme What I Want
1101682                                     Complicated
1007772                           Si No Te Hubieras Ido
406534                                    Peer Pressure
93593                                         Right Now
40634      Someday (feat. Max Schneider) - Film Version
913993                                    Dance With Me
1007761                                   Cold Shoulder
197553                                          Perfect
293755                                  Can't Be Erased
1101644                                       Ay Haiti!
515742                                           OHMAMI
960323            En el Amor No Se Manda (with Yuridia)
93648                                        Make a Mil
40469                       Dirty Diana - 2012 Remaster
146830                               Stockholm Syndrome
40577                              Blue Jeans - 

In [34]:
new_song_features = {
        'valence': 0.75, 
        'energy': 0.80, 
        'danceability': 0.60, 
        'loudness': -5.0, 
        'speechiness': 0.05, 
        'acousticness': 0.15, 
        'instrumentalness': 0.0,
        'genre': 'pop'
    }
print("\nRecommendations for a manually entered song:")
tracks = get_recs('Unknown Song', 'Unknown Artist', 20, new_song_features)
for song in tracks:
    print(song)


Recommendations for a manually entered song:
1159764                Unknown Artist
825387                   Diego Torres
914022                   Toño Rosario
913911                            RBD
515711                     Conan Gray
825278               Bowling For Soup
40612                     Austin Moon
40480                   One Direction
782764                  Michael Bublé
1147622           Marco Antonio Solís
696832                    Luis Miguel
960231                       Maroon 5
197664                          BØRNS
825292                           Lamp
1054858               Marianas Trench
696662                      blink-182
914080                   Sister Hazel
93697      Michael Franti & Spearhead
40499                 Matchbox Twenty
782761                           Maná
Name: artist_name, dtype: object
1159764                                         Unknown Song
825387         Sueños (with Julieta Venegas) - MTV Unplugged
914022                                 