# Analysis of Musical and Lyrical Trends at Intelligent Interactive Systems (MIIS)

#### Oktay Ozan Güner   -  ID : OZAN_ID
#### Juan Miguel Alfonso Habana   -  ID : MIGUEL_ID

# Introduction

Since 1990, the way people interact with music has evolved significantly. This period marks a transition from the tangible, physical media of CDs and vinyl to the intangible, yet infinitely accessible world of digital music. 
* How have digitalization and streaming influenced listeners?
* How have listening habits changed over time?

We'll examine how our music listening habits have changed, from the duration of the songs to the sentiment of their lyrics.


In [1]:
# import packages

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from tqdm.notebook import tqdm
import time
import os
from dotenv import load_dotenv
import fastparquet as fp

load_dotenv()

False

## Extract Track Information

In [5]:
# Read the Billboard chart CSV that sits one directory above the notebook
df = pd.read_csv('../Billboard_Lists_1960-01-01_2024-02-23.csv')

# Parse the 'Week' column into datetimes so its year can be compared numerically
df['Week'] = pd.to_datetime(df['Week'])

# Keep only the chart weeks from 1990 onwards
filtered_df = df.loc[df['Week'].dt.year.ge(1990)]

In [6]:
# Build one "Artist|Song" search query per distinct (artist, song) pair.
# Iterating the group index yields the pairs in the same sorted order as the grouped
# frame, avoids a slow row-wise DataFrame.apply, and also works when no rows match.
artist_song_pairs = filtered_df.groupby(['Artist_Name', 'Song']).size().index
query_list = [artist + "|" + song for artist, song in artist_song_pairs]

In [7]:
# function to iteratively extract the track information from the query list
def extract_track_information(query_list:list, client_id:str, client_secret:str, file_name:str, batch_size:int=500):
    """Search Spotify for every query and checkpoint the results to a parquet file.

    For each query the first search hit (if any) is kept; queries without a hit
    produce a record whose track fields are all None.

    Args:
        query_list: Search strings passed one by one to the Spotify search endpoint.
        client_id: Spotify API client id.
        client_secret: Spotify API client secret.
        file_name: Name of the parquet file written under ``./data/``.
        batch_size: Number of records accumulated between two writes to disk.

    Returns:
        A DataFrame with one row per query and the columns ``query``, ``id``,
        ``album_release_date``, ``artist``, ``artist_ids``, ``title`` and
        ``popularity``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    sp = spotipy.Spotify(
        auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret),
        requests_timeout=10,
        retries=10,
    )
    output_path = f'./data/{file_name}'
    empty_fields = ('id', 'album_release_date', 'artist', 'artist_ids', 'title', 'popularity')

    output = []
    flushed = 0  # number of leading records in `output` that are already on disk

    def flush_pending():
        """Write the records collected since the last flush, each exactly once."""
        nonlocal flushed
        pending = output[flushed:]
        if not pending:
            return
        # The first write creates the file; every later write appends to it.
        fp.write(output_path, pd.DataFrame(pending), file_scheme='simple', write_index=False,
                 times='int96', compression='SNAPPY', has_nulls=True, append=flushed > 0)
        flushed = len(output)

    try:
        for query in tqdm(query_list):
            results = sp.search(q=query, limit=20)
            items = results['tracks']['items']

            track = {'query': query}
            if items:
                # only the top-ranked hit is kept
                result = items[0]

                # basic track features
                track['id'] = result['uri'].split(':')[2]
                track['album_release_date'] = result['album']['release_date']
                track['artist'] = [r['name'].lower() for r in result['artists']]
                track['artist_ids'] = [r['id'] for r in result['artists']]
                track['title'] = result['name']
                track['popularity'] = result['popularity']
            else:
                track.update(dict.fromkeys(empty_fields))

            output.append(track)

            if len(output) - flushed >= batch_size:
                flush_pending()
    finally:
        # Persist the trailing partial batch, and whatever was collected so far
        # if a request failed part-way through the run.
        flush_pending()

    return pd.DataFrame(output)



In [None]:
# Spotify API credentials are read from environment variables
client_id = os.environ.get("SPOTIFY_CLIENT_ID")
client_secret = os.environ.get("SPOTIFY_CLIENT_SECRET")

# Look up every query on Spotify; results are also written to ./data/track_info_full_p3.parquet
song_info_df = extract_track_information(
    query_list=query_list,
    client_id=client_id,
    client_secret=client_secret,
    file_name='track_info_full_p3.parquet',
)

In [None]:
# Merge the resulting track files into one dataframe
# Had to run the extraction script multiple times due to request timeouts and rate limits
track_info = pd.read_parquet('./data/track_info.parquet')
track_info2 = pd.read_parquet('./data/track_info_p2.parquet')
track_info3 = pd.read_parquet('./data/track_info_p3.parquet')

# A single concat with ignore_index=True renumbers the rows 0..n-1 without turning the
# old per-file indices into extra 'index' / 'level_0' columns (which chained
# reset_index() calls did, and which then ended up in the exported data).
track_df = pd.concat([track_info, track_info2, track_info3], ignore_index=True)

In [None]:
# Export the merged track table (row index included) as a CSV in the working directory
track_df.to_csv(path_or_buf='track_info.csv')

In [None]:
# Convert the track IDs into a list for the audio-features lookup.
# Queries without a Spotify match have id=None and several queries can resolve to the
# same track, so missing and repeated IDs are removed before they are sent to the API.
track_ids = track_df['id'].dropna().drop_duplicates().tolist()

In [None]:
def extract_audio_features(track_ids:list, client_id:str, client_secret:str, file_name:str):
    """Fetch Spotify audio features in batches of 100 ids and store them as parquet.

    The numeric ``key`` and ``mode`` fields are replaced by note names and
    'minor'/'major' labels. Each batch is written (first batch) or appended
    (later batches) to ``./data/<file_name>``.

    Args:
        track_ids: Spotify track ids; entries that are not non-empty strings
            (e.g. None for unmatched tracks) are skipped.
        client_id: Spotify API client id.
        client_secret: Spotify API client secret.
        file_name: Name of the parquet file written under ``./data/``.

    Returns:
        A DataFrame with the audio features of all fetched tracks; empty when
        nothing could be fetched.
    """
    sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret), requests_timeout=10, retries=10)

    # Lookup tables are loop-invariant, so they are built once up front.
    key_conversion = { -1:None, 0:'C', 1:'C#', 2:'D', 3:'D#', 4:'E', 5:'F', 6:'F#', 7:'G', 8:'G#', 9:'A', 10:'A#', 11:'B' }
    mode_conversion = { 0: 'minor', 1: 'major' }

    # Missing ids cannot be sent to the API, so they are dropped first.
    valid_ids = [track_id for track_id in track_ids if isinstance(track_id, str) and track_id]
    batch_ids = [valid_ids[i:i+100] for i in range(0, len(valid_ids), 100)]

    output_path = f'./data/{file_name}'
    frames = []
    for batch_id in tqdm(batch_ids):
        # Entries without audio features come back as None and are skipped.
        audio_features = [feature for feature in (sp.audio_features(batch_id) or []) if feature is not None]
        if not audio_features:
            continue
        af_df = pd.DataFrame(audio_features)

        # .get maps unexpected values to None instead of raising KeyError
        af_df['key'] = af_df['key'].map(key_conversion.get)
        af_df['mode'] = af_df['mode'].map(mode_conversion.get)

        # The first batch that has data creates the file; later ones append to it.
        fp.write(output_path, af_df, file_scheme='simple', write_index=False, times='int96',
                 compression='SNAPPY', has_nulls=True, append=bool(frames))
        frames.append(af_df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)



In [None]:
# Fetch the audio features for the collected track ids; they are written to ./data/audio_features.parquet
audio_features_df = extract_audio_features(
    track_ids=track_ids,
    client_id=client_id,
    client_secret=client_secret,
    file_name='audio_features.parquet',
)