In [4]:
import pandas as pd 
import requests
import json 
from sklearn.preprocessing import OneHotEncoder, normalize
from fuzzywuzzy import process
# Endpoint for the Spotify songs data; it is fetched with requests.get and
# parsed as JSON further down.
# Replace this with your API's deployed URL
api_url = "https://our-service-442302-c3.wl.r.appspot.com/get_data/spotify_songs"


In [5]:
# Fetch the songs payload from the API.
# Reset `response` first: if this cell is re-run and the request fails before a
# response object exists, a stale `response` from an earlier execution must not
# survive in the kernel for later cells to consume silently.
response = None
try:
    # requests applies no timeout by default, so an unresponsive server would
    # block this cell indefinitely; bound the wait (seconds).
    response = requests.get(api_url, timeout=30)
    response.raise_for_status()  # Raise an error for bad status codes (4xx or 5xx)
except requests.exceptions.RequestException as e:
    # Covers connection errors, timeouts and the HTTPError raised just above.
    print(f"Error fetching data: {e}")


In [6]:
# Anything other than HTTP 200 is reported; only a 200 body is parsed and stored.
if response.status_code != 200:
    print(f"Failed to fetch data. Status Code: {response.status_code}")
else:
    # Decode the JSON body and build the DataFrame from it
    data = response.json()
    df = pd.DataFrame(data)

    # Write a CSV copy to disk for future use
    df.to_csv("spotify_songs_local.csv", index=False)
    print("Data saved locally as 'spotify_songs_local.csv'")

    # Print the leading rows as a quick look at the data
    print(df.head())


Data saved locally as 'spotify_songs_local.csv'
   ACOUSTICNESS  DANCEABILITY  DURATION_MS  ENERGY  INSTRUMENTALNESS  KEY  \
0        0.1020         0.748       194754   0.916          0.000000    6   
1        0.0724         0.726       162600   0.815          0.004210   11   
2        0.0794         0.675       176616   0.931          0.000023    1   
3        0.0287         0.718       169093   0.930          0.000009    7   
4        0.0803         0.650       189052   0.833          0.000000    1   

   LIVENESS  LOUDNESS  MODE PLAYLIST_GENRE  ... SPEECHINESS    TEMPO  \
0    0.0653    -2.634     1            pop  ...      0.0583  122.036   
1    0.3570    -4.969     1            pop  ...      0.0373   99.972   
2    0.1100    -3.432     0            pop  ...      0.0742  124.008   
3    0.2040    -3.778     1            pop  ...      0.1020  121.956   
4    0.0833    -4.672     1            pop  ...      0.0359  123.976   

           TRACK_ALBUM_ID                               

In [7]:
# Confirm the CSV written earlier can be read back from disk,
# then print its first five rows.
df = pd.read_csv(filepath_or_buffer="spotify_songs_local.csv")
print(df.head(n=5))



   ACOUSTICNESS  DANCEABILITY  DURATION_MS  ENERGY  INSTRUMENTALNESS  KEY  \
0        0.1020         0.748       194754   0.916          0.000000    6   
1        0.0724         0.726       162600   0.815          0.004210   11   
2        0.0794         0.675       176616   0.931          0.000023    1   
3        0.0287         0.718       169093   0.930          0.000009    7   
4        0.0803         0.650       189052   0.833          0.000000    1   

   LIVENESS  LOUDNESS  MODE PLAYLIST_GENRE  ... SPEECHINESS    TEMPO  \
0    0.0653    -2.634     1            pop  ...      0.0583  122.036   
1    0.3570    -4.969     1            pop  ...      0.0373   99.972   
2    0.1100    -3.432     0            pop  ...      0.0742  124.008   
3    0.2040    -3.778     1            pop  ...      0.1020  121.956   
4    0.0833    -4.672     1            pop  ...      0.0359  123.976   

           TRACK_ALBUM_ID                                   TRACK_ALBUM_NAME  \
0  2oCs0DGTsRO98Gh5ZSl2C

In [8]:
# Dimensions of the frame read back from the CSV, as a (rows, columns) tuple
df.shape

(32833, 23)

In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Drop duplicates and handle missing values, then renumber rows 0..n-1 so that
# row positions in df line up with the rows of the encoded and similarity
# arrays built below.
df = df.drop_duplicates().dropna().reset_index(drop=True)

# Numeric columns used for similarity, defined once so the scaling step and the
# feature selection cannot drift apart.
numeric_features = ['TEMPO', 'LOUDNESS', 'LIVENESS', 'TRACK_POPULARITY', 'DANCEABILITY', 'ENERGY', 'INSTRUMENTALNESS']

# Normalize numerical features (e.g., tempo, loudness, popularity)
scaler = MinMaxScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# One-hot encode the categorical feature (PLAYLIST_GENRE)
encoder = OneHotEncoder()
genre_encoded = encoder.fit_transform(df[['PLAYLIST_GENRE']]).toarray()
genre_encoded_df = pd.DataFrame(genre_encoded, columns=encoder.get_feature_names_out(['PLAYLIST_GENRE']))

# Concatenate the one-hot encoded genres back to the dataframe.
# Any genre columns left over from a previous run of this cell are dropped
# first; otherwise a re-run would append a second copy and df[features] would
# select duplicated columns.
df = pd.concat(
    [df.drop(columns=genre_encoded_df.columns, errors='ignore'), genre_encoded_df],
    axis=1,
)

# Select features for recommendation
features = numeric_features + list(genre_encoded_df.columns)

# Compute feature vectors
feature_vectors = df[features].values

# Calculate similarity matrix.
# NOTE: this is a dense rows x rows array; with the ~32k rows reported by
# df.shape above it is presumably several GB in memory. Confirm the machine
# has headroom before re-running.
similarity_matrix = cosine_similarity(feature_vectors)

# Function to get recommendations
def get_recommendations(song_name, n=10):
    """Return up to ``n`` track names most similar to ``song_name``.

    Similarity is read from the module-level ``similarity_matrix`` (pairwise
    cosine similarity with one row/column per row of ``df``). When several rows
    of ``df`` carry the same title, the first one is used as the query.

    Args:
        song_name: Exact ``TRACK_NAME`` value to look up (case-sensitive).
        n: Maximum number of recommendations to return. Defaults to 10.

    Returns:
        A list of distinct track names ordered from most to least similar and
        never containing ``song_name`` itself; an empty list when ``n <= 0``.
        If the song is not present in ``df``, a "not found" message string is
        returned instead of a list.
    """
    # Work with row *positions*, not index labels: the similarity matrix is
    # positional, so this stays correct even when df's index is not 0..n-1.
    track_names = df['TRACK_NAME'].tolist()
    try:
        position = track_names.index(song_name)
    except ValueError:
        return f"Song '{song_name}' not found in the dataset."

    # Nothing requested: return early instead of letting the loop append one
    # song before its length check fires.
    if n <= 0:
        return []

    # Rank all rows by descending similarity. A stable sort on the negated
    # scores keeps tied rows in their original order, matching what
    # sorted(..., reverse=True) produced.
    ranked_positions = (-similarity_matrix[position]).argsort(kind="stable")

    # Collect the top n titles, skipping the query title and repeated titles.
    recommended_songs = []
    seen = {song_name}
    for pos in ranked_positions:
        name = track_names[pos]
        if name in seen:
            continue
        seen.add(name)
        recommended_songs.append(name)
        if len(recommended_songs) >= n:
            break

    return recommended_songs

# Demo call: the title must match a TRACK_NAME in the dataset exactly
song = "Shape of You"
print("Recommended songs:", get_recommendations(song_name=song))


Recommended songs: ["I Don't Care (with Justin Bieber)", 'SOS (feat. Aloe Blacc)', 'Attention', 'Ayy Macarena', 'ZEZE', 'Morado', 'You Need To Calm Down', 'Blanco', 'Loco Contigo (with J. Balvin & Ozuna feat. Nicky Jam, Natti Natasha, Darell & Sech) - REMIX', 'Look Back at It']
