<div style = 'background-color:#1DB954;'>
    <br>
    <h3 style="color:black; margin-left:20px; margin-top:5px"> CLASSICAL MUSIC RECOMMENDATIONS </h3>
    <p style="color:black; margin-left:20px; margin-top:5px"> Data Collection </p>
    <br>
</div>

In [9]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path
import os 
import time 
import requests
import sys 
import importlib

import librosa
import spotipy 
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth 
from spotipy.oauth2 import SpotifyClientCredentials 

In [13]:
# Make the helper modules in ../scripts importable. The path is relative, so it
# is resolved against the current working directory of the kernel.
sys.path.append('../scripts')

# Project helper module; this notebook calls GSA.authenticate,
# GSA.getInformation and GSA.downloadTracks.
import GSA

# Spotify API credentials (myClientID / myClientSecret) from the spotifyConstants script
import spotifyConstants


In [15]:
# Establishing Spotify connection

# Client-credentials manager built from the keys stored in spotifyConstants
# (linked to my Spotify Developer account)
auth = SpotifyClientCredentials(
    client_id=spotifyConstants.myClientID,
    client_secret=spotifyConstants.myClientSecret,
)

# Spotipy client that authenticates through the manager above
sp = spotipy.Spotify(auth_manager=auth)

# Authenticate to access the Spotify API through Spotipy
# NOTE(review): presumably this sets up the session the other GSA helpers use —
# confirm against the GSA script
GSA.authenticate()

In [None]:
# Building initial dataset

# Importing composers dataset
composers = pd.read_excel('Composers.xlsx')

# Compiling information from all playlists

# Column layout of the combined dataset. An empty frame with these columns is
# kept as the first piece so the result has this column order, and still has
# these columns when no playlist contributes any rows.
data_columns = ['Composer', 'playlistID', 'TrackName', 'TrackID', 'SampleURL', 'ReleaseYear',
                'Genres', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                'instrumentalness', 'liveness', 'valence', 'tempo', 'key', 'mode', 'duration_ms',
                'Popularity']
playlist_frames = [pd.DataFrame(columns=data_columns)]

for i in range(len(composers)):

    # Track progress
    print(i)

    # Composer and playlist URI are read by position (first and third column)
    composer = composers.iloc[i, 0]
    uri = composers.iloc[i, 2]

    # Obtain playlist information
    playlist_info = GSA.getInformation(uri)

    # Turn into dataframe
    # NOTE(review): read_pickle can execute arbitrary code; acceptable only
    # because the pickle is presumably written locally by GSA.getInformation.
    playlist_info_df = pd.read_pickle(playlist_info)

    # Tag every track with its composer. Inserting a scalar column does not
    # depend on the frame's index, unlike a side-by-side concat, which would
    # misalign rows if the pickled frame did not carry a 0..n-1 index.
    playlist_info_df.insert(0, 'Composer', composer)

    playlist_frames.append(playlist_info_df)

# Concatenate once at the end instead of re-copying the growing frame on
# every iteration
data = pd.concat(playlist_frames, ignore_index=True)


In [None]:
# Dropping tracks without a preview sample

# Count the rows with no preview URL and report it explicitly: a bare
# expression in the middle of a cell is evaluated but never displayed.
# (The original run had 188 missing samples.)
n_missing_samples = int(data['SampleURL'].isna().sum())
print(f"Tracks without a preview sample: {n_missing_samples}")

# Those rows are removed and the index is renumbered from 0
data = data.dropna(subset=["SampleURL"]).reset_index(drop=True)

In [None]:
# Obtaining mp3 samples (~30 minutes)

# One [SampleURL, TrackName, TrackID, playlistID] list per track
toDownload = data[['SampleURL', 'TrackName', 'TrackID', 'playlistID']].values.tolist()

# Result of GSA.downloadTracks for every track, in download order, to keep
# track of which were successfully downloaded
downloaded = []

# Number of tracks processed so far
counter = 0

# Download the preview MP3s one by one
for counter, track in enumerate(toDownload, start=1):
    downloaded.append(GSA.downloadTracks(track))

    # Report progress every 10 tracks
    if counter % 10 == 0:
        print(counter)

In [None]:
# Trimming wav files

from scipy.io import wavfile

# Make sure the output folder exists before anything is written to it
os.makedirs('wav_red', exist_ok=True)

file_names = os.listdir('wav')

# Start and end of the excerpt to keep (in seconds)
timestamp_start = 5
timestamp_end = 15

for filename in file_names:

    # Skip the macOS folder-metadata file
    if filename == '.DS_Store':
        continue

    # Read the sample rate and the audio frames.
    # The frames are stored in `samples`, NOT `data`: `data` is the tracks
    # DataFrame that later cells still merge into, and must not be overwritten.
    rate, samples = wavfile.read('wav/' + filename)

    # Frame indices of the excerpt boundaries
    split_start = rate * timestamp_start
    split_end = rate * timestamp_end

    # Slices already exclude the end index, so no "-1" is needed to get
    # exactly (timestamp_end - timestamp_start) seconds of audio
    sample = samples[split_start:split_end]

    # Save the excerpt under the same file name
    wavfile.write('wav_red/' + filename, rate, sample)

In [None]:
# Extracting audio features

# Number of MFCC coefficients; passed to librosa explicitly so it always
# matches the mfcc1..mfccN column names below
N_MFCC = 20

# Dataframe column names
colnames = ['filename', 'chroma_stft', 'spectral_centroid', 'spectral_bandwidth', 'rolloff', 'zero_crossing_rate']
colnames += ['mfcc' + str(i) for i in range(1, N_MFCC + 1)]

# One list of values per audio file; the dataframe is built once at the end
# instead of being grown one row at a time
feature_rows = []

# Getting filenames in the trimmed-wav folder
file_names = os.listdir('wav_red')

counter = 0

# Looping through filenames
for filename in file_names:

    # Skip the macOS folder-metadata file
    if filename == '.DS_Store':
        continue

    # to keep track of progress
    counter += 1
    if counter % 10 == 0:
        print(counter)

    y, sr = librosa.load('wav_red/' + filename)

    # Obtaining audio features
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)

    # Each feature is summarised by its mean over the clip; the MFCC matrix
    # contributes one mean per coefficient row
    values = [filename, np.mean(chroma_stft), np.mean(spec_cent), np.mean(spec_bw), np.mean(rolloff), np.mean(zcr)]
    values.extend(np.mean(coefficient) for coefficient in mfcc)

    feature_rows.append(values)

# Creating the dataframe (empty, but with all columns, if no files were found)
audio_features = pd.DataFrame(feature_rows, columns=colnames)

In [None]:
# Combining with data df
# NOTE: `data` must be the tracks DataFrame at this point

# Keep an untouched copy of the audio features before adding columns
audio_features_bu = audio_features.copy()


def _track_id_from_filename(name):
    """Return the text between the first "ID_" and the first ".wav" in `name`."""
    start = name.find("ID_") + 3
    stop = name.find(".wav")
    return name[start:stop]


# Extracting TrackIDs from filenames
audio_features['TrackID'] = audio_features['filename'].apply(_track_id_from_filename)

# Left merge: every row of `data` is kept, audio features are attached by TrackID
data = data.merge(audio_features, how='left', on='TrackID')

In [None]:
# Finalizing Dataframe

# Attach the 'Born' column from the composers sheet, matching 'Composer'
# against the sheet's 'Last Name' column (left merge keeps every track)
data = pd.merge(data, composers[['Last Name', 'Born']], 'left', left_on='Composer', right_on='Last Name')

# Removing irrelevant columns ('Last Name' duplicates 'Composer' after the merge)
data = data.drop(columns=['playlistID', 'SampleURL', 'ReleaseYear',
                          'speechiness', 'liveness', 'Genres', 'filename', 'Last Name'])

# Removing Albinoni because only 4 pieces came through.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cell do not raise SettingWithCopyWarning.
data = data[data['Composer'] != 'Albinoni'].copy()

In [None]:
# Creating some additional features

# One 0/1 indicator column per musical form: 1 when the form's name appears
# (case-sensitively) in the track title. Columns are added in this order.
for keyword in ("Symphony", "Concerto", "Quartet", "Trio", "Sonata"):
    # `keyword=keyword` binds the current form to the lambda
    data[keyword] = data["TrackName"].map(
        lambda title, keyword=keyword: 1 if keyword in title else 0
    )

# Saving as an excel file (without the index column)
data.to_excel("Data_Final.xlsx", index=False)