In [None]:
import glob
import os
import time

import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv

In [None]:
# Collect the input CSV files from data/.
# The last cell of this notebook writes data/final_data.csv, which also matches
# the pattern; it is excluded so that a re-run does not read the previous output
# back in as input. The list is sorted because glob returns paths in arbitrary,
# filesystem-dependent order.
csv_files = sorted(
    path
    for path in glob.glob("data/*.csv")
    if os.path.basename(path) != "final_data.csv"
)
csv_files

In [None]:
# Read each CSV into its own frame, discarding the first column
# (the "Unnamed: 0" index column that was saved with the file).
dataframes = []
for csv_path in csv_files:
    season_df = pd.read_csv(csv_path)
    dataframes.append(season_df.iloc[:, 1:])
dataframes

In [None]:
# Stack all seasons of every TV series into one frame; ignore_index gives the
# result a fresh 0..n-1 index instead of repeating each file's own row numbers.
combined_df = pd.concat(objs=dataframes, axis=0, ignore_index=True)
combined_df.head(5)

In [None]:
# (rows, columns) of the combined frame, before any duplicate handling.
combined_df.shape

In [None]:
# Number of distinct track ids (missing values counted as one value);
# a result below the row count means there are duplicate songs.
combined_df['track_id'].nunique(dropna=False)

In [None]:
# Rows whose track_id already appeared earlier in the frame
# (the first occurrence of each track is not part of this selection).
is_repeat = combined_df['track_id'].duplicated()
duplicated_songs = combined_df.loc[is_repeat]
duplicated_songs

In [None]:
# Resolve duplicated tracks in a single grouped pass (instead of re-scanning the
# whole frame and dropping in place once per duplicated row):
#   * a song that appears more than once in the SAME TV SERIES keeps its first row
#   * a song that appears in MORE THAN ONE TV SERIES is removed completely
repeated_ids = duplicated_songs['track_id'].unique()
repeated_rows = combined_df[combined_df['track_id'].isin(repeated_ids)]

rows_to_drop = []
for _, same_track in repeated_rows.groupby('track_id', sort=False):
    if same_track['series_name'].nunique(dropna=False) == 1:
        rows_to_drop.extend(same_track.index[1:])  # keep the first occurrence only
    else:
        rows_to_drop.extend(same_track.index)  # shared between series: drop all rows

# One drop at the end. Re-running this cell removes nothing further, because at
# most one row per repeated track is left in combined_df afterwards.
combined_df = combined_df.drop(index=rows_to_drop)
        

In [None]:
# Row count next to the distinct-track count; equal numbers mean every song is unique now.
len(combined_df), combined_df['track_id'].nunique(dropna=False)

In [None]:
# Number of remaining songs per TV series. If the counts are uneven, some
# balancing may be needed when training the model.
combined_df.groupby("series_name",as_index=False)['series_name'].value_counts()

In [None]:
# Create one placeholder column per Spotify audio feature, filled with NaN.
# Feature explanations => https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features
audio_feature_columns = [
    'danceability',      # suitability for dancing, from tempo, rhythm stability, beat strength and overall regularity
    'energy',            # perceptual measure of intensity and activity
    'key',               # key of the track in Pitch Class notation (0 = C, 1 = C♯/D♭, 2 = D, ...); -1 if no key was detected
    'loudness',          # overall loudness of the track in decibels (dB)
    'mode',              # modality of the track: major is 1, minor is 0
    'speechiness',       # presence of spoken words; closer to 1.0 for speech-like recordings (talk show, audio book, poetry)
    'acousticness',      # whether the track is acoustic
    'instrumentalness',  # predicts whether the track contains no vocals
    'liveness',          # higher values mean a higher probability that the track was performed live
    'valence',           # 0.0 to 1.0, the musical positiveness conveyed by the track
    'tempo',             # overall estimated tempo in beats per minute (BPM)
]

for feature_name in audio_feature_columns:
    combined_df[feature_name] = np.nan

combined_df.head()

<h3> Spotify API Requests </h3>

In [None]:
# Must run before the lookup below so that values from the .env file are available.
load_dotenv() # load the environment variables (presumably from a local .env file into os.environ; confirm against python-dotenv)
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"] # Spotify API access token; raises KeyError if ACCESS_TOKEN is not set

In [None]:
end_point = "https://api.spotify.com/v1/audio-features/" # audio features end point; a track id is appended per request

# Request headers carrying the bearer token. The scheme and the token are
# separated by exactly one space (the original string contained two).
headers = {"Authorization" : f"Bearer {ACCESS_TOKEN}"}

In [None]:
# Fetch the Spotify audio features of every track and store them in the track's row.
# Rows whose request ultimately fails keep their NaN placeholders.
feature_names = (
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
)
max_attempts = 5       # tries per track while the API keeps answering 429
request_timeout = 10   # seconds; without a timeout a stalled connection blocks the loop forever
fallback_wait = 5      # seconds to wait after a 429 that has no usable Retry-After header

# One session for all requests, so the connection and the headers are reused.
with requests.Session() as session:
    session.headers.update(headers)

    for idx, track_id in combined_df['track_id'].items():
        request_url = end_point + track_id # request url

        for attempt in range(1, max_attempts + 1):
            response = session.get(request_url, timeout=request_timeout) # make a get request
            if response.status_code != 429 or attempt == max_attempts:
                break

            # Rate limited: wait for the period the API asks for, then retry the
            # same track instead of skipping it and hitting the API again at once.
            retry_after = response.headers.get("Retry-After", "")
            wait_seconds = int(retry_after) if retry_after.isdigit() else fallback_wait
            print('API is overwhelmed!', f"Retrying {track_id} in {wait_seconds}s (attempt {attempt}/{max_attempts}).")
            time.sleep(wait_seconds)

        if response.status_code == 200:
            data = response.json()

            for feature in feature_names:
                combined_df.at[idx, feature] = data[feature]

            print(track_id,": audio features fetched successfully.")
        elif response.status_code == 429:
            print('API is overwhelmed!', f"Giving up on {track_id} after {max_attempts} attempts.")
        else:
            # Name the track and the status so the failure can be investigated.
            print("Something went wrong!", f"{track_id}: HTTP {response.status_code}")

In [None]:
# Preview the first rows to check the audio feature columns after the API requests.
combined_df.head()

In [None]:
# Persist the enriched frame as CSV, without writing the index as a column.
output_path = "data/final_data.csv"
combined_df.to_csv(output_path, index=False)