# Preprocessing

In [3]:
import ast
import datetime
import json
import multiprocessing as mp
import os
import random
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spotipy
import spotipy.oauth2 as oauth2
import yaml
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
from tqdm import tqdm

## Read data

In [4]:
def read_data(data_dir='data'):
    """Load the Spotify artist and track CSV files into module-level frames.

    Parameters
    ----------
    data_dir : str or os.PathLike, default 'data'
        Directory containing ``spotify_artists.csv`` and
        ``spotify_tracks.csv``. The default keeps the original behaviour of
        reading from ``data/`` relative to the current working directory.

    Returns
    -------
    None
        The frames are published as the globals ``df_artists`` and
        ``df_tracks``; nothing is returned, so calling this as the last
        statement of a cell produces no output.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when either file is missing.
    """
    global df_artists, df_tracks
    df_artists = pd.read_csv(os.path.join(data_dir, 'spotify_artists.csv'), sep=',')
    df_tracks = pd.read_csv(os.path.join(data_dir, 'spotify_tracks.csv'), sep=',')

# Populate the module-level df_artists and df_tracks frames from the CSV files.
read_data()
     

## Merging all dataframes

In [5]:
def transform_data():
    """Normalise the id columns, rename clashing columns and join tracks to artists.

    Works on the module-level ``df_tracks`` and ``df_artists`` frames, which
    are modified in place:

    * ``df_tracks['artists_id']`` and ``df_artists['id']`` have surrounding
      square brackets and all single quotes removed.
    * ``df_tracks`` columns ``id``/``name`` become ``tracks_id``/``tracks_name``.
    * ``df_artists`` column ``name`` becomes ``artists_name``.

    The inner join of the two frames (``artists_id`` against ``id``) is stored
    in the global ``df_artists_tracks``. Returns ``None``.

    NOTE(review): an ``artists_id`` value that lists several ids is reduced to
    a comma-separated string, which cannot equal any single artist ``id``, so
    such tracks would be dropped by the inner join - confirm this is intended.
    """
    global df_artists_tracks

    def _unwrap(id_column):
        # "['abc']" -> "abc": trim the brackets, then remove the quote characters.
        return id_column.str.strip("[]").str.replace("'","")

    # Track side: clean the join key and give id/name track-specific labels.
    df_tracks['artists_id'] = _unwrap(df_tracks['artists_id'])
    df_tracks.rename(columns={'id': 'tracks_id', 'name': 'tracks_name'}, inplace=True)

    # Artist side: clean the join key and give name an artist-specific label.
    df_artists['id'] = _unwrap(df_artists['id'])
    df_artists.rename(columns={'name': 'artists_name'}, inplace=True)

    # Keep only tracks whose cleaned artists_id matches an artist id.
    df_artists_tracks = df_tracks.merge(
        df_artists,
        how='inner',
        left_on='artists_id',
        right_on='id',
    )

# Clean the id columns in df_tracks/df_artists and build the merged df_artists_tracks frame.
transform_data()


In [6]:
# Lift pandas' column display limit so every column of the merged frame is shown.
pd.options.display.max_columns = None
# First five rows (head() defaults to n=5).
df_artists_tracks.head()

Unnamed: 0,Unnamed: 0_x,acousticness,album_id,analysis_url,artists_id,available_markets,country,danceability,disc_number,duration_ms,energy,href,tracks_id,instrumentalness,key,liveness,loudness,lyrics,mode,tracks_name,playlist,popularity,preview_url,speechiness,tempo,time_signature,track_href,track_name_prev_x,track_number,uri,valence,type_x,Unnamed: 0_y,artist_popularity,followers,genres,id,artists_name,track_id,track_name_prev_y,type_y
0,0,0.294,0D3QufeCudpQANOR7luqdr,https://api.spotify.com/v1/audio-analysis/5qlj...,3mxJuHRn2ZWD5OofvJtDZY,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",BE,0.698,1.0,235584.0,0.606,https://api.spotify.com/v1/tracks/5qljLQuKnNJf...,5qljLQuKnNJf4F4vfxQB0V,3e-06,10.0,0.151,-7.447,\r\n\r\nPerhaps I am bound to be restless\r\nA...,0.0,Blood,Hipsteribrunssi,28.0,https://p.scdn.co/mp3-preview/1b05a902da3a251d...,0.0262,115.018,4.0,https://api.spotify.com/v1/tracks/5qljLQuKnNJf...,track_14,1.0,spotify:track:5qljLQuKnNJf4F4vfxQB0V,0.622,track,45499,28,425,['finnish indie'],3mxJuHRn2ZWD5OofvJtDZY,Jesse Markin,3THTkAwJOsmxgizvYkBNRI,track_19,artist
1,84077,0.166,3wIjGVauUxR4c3NvnQZ0Jo,https://api.spotify.com/v1/audio-analysis/3THT...,3mxJuHRn2ZWD5OofvJtDZY,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",AR,0.543,1.0,233998.0,0.612,https://api.spotify.com/v1/tracks/3THTkAwJOsmx...,3THTkAwJOsmxgizvYkBNRI,0.0,9.0,0.129,-7.685,\r\n\r\nMuch better in my day\r\nMuch better i...,0.0,Jericho,Sideways 2019,37.0,https://p.scdn.co/mp3-preview/8af517b8202114d6...,0.0487,139.876,3.0,https://api.spotify.com/v1/tracks/3THTkAwJOsmx...,track_18,1.0,spotify:track:3THTkAwJOsmxgizvYkBNRI,0.463,track,45499,28,425,['finnish indie'],3mxJuHRn2ZWD5OofvJtDZY,Jesse Markin,3THTkAwJOsmxgizvYkBNRI,track_19,artist
2,1,0.863,1bcqsH5UyTBzmh9YizdsBE,https://api.spotify.com/v1/audio-analysis/3VAX...,4xWMewm6CYMstu0sPgd9jJ,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",BE,0.719,1.0,656960.0,0.308,https://api.spotify.com/v1/tracks/3VAX2MJdmdqA...,3VAX2MJdmdqARLSU5hPMpm,0.0,6.0,0.253,-10.34,\r\nYour Gods and my Gods-do you or I know whi...,1.0,The Ugly Duckling,Animal Stories,31.0,https://p.scdn.co/mp3-preview/d8140736a6131cb5...,0.922,115.075,3.0,https://api.spotify.com/v1/tracks/3VAX2MJdmdqA...,track_3,3.0,spotify:track:3VAX2MJdmdqARLSU5hPMpm,0.589,track,8219,36,2965,[],4xWMewm6CYMstu0sPgd9jJ,Favorite Kids Stories,3VAX2MJdmdqARLSU5hPMpm,track_5,artist
3,6,0.824,51g5viCaYjOW5XO4qX1RCD,https://api.spotify.com/v1/audio-analysis/1WJz...,4xWMewm6CYMstu0sPgd9jJ,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",BE,0.688,1.0,29240.0,0.304,https://api.spotify.com/v1/tracks/1WJzRtI1ABzV...,1WJzRtI1ABzV3TPIeJZVvi,0.0,10.0,0.142,-9.96,\r\nYour Gods and my Gods-do you or I know whi...,1.0,Three Blind Mice,Animal Stories,0.0,https://p.scdn.co/mp3-preview/54031f6d3ab4784a...,0.531,77.056,3.0,https://api.spotify.com/v1/tracks/1WJzRtI1ABzV...,track_9,2.0,spotify:track:1WJzRtI1ABzV3TPIeJZVvi,0.414,track,8219,36,2965,[],4xWMewm6CYMstu0sPgd9jJ,Favorite Kids Stories,3VAX2MJdmdqARLSU5hPMpm,track_5,artist
4,41,0.85,7EtGJVYZbb2Krgwizn2flB,https://api.spotify.com/v1/audio-analysis/65Mo...,4xWMewm6CYMstu0sPgd9jJ,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH...",BE,0.632,1.0,497493.0,0.372,https://api.spotify.com/v1/tracks/65MopnhsXOgY...,65MopnhsXOgYw4RuiA2pGU,0.0,7.0,0.11,-8.76,\r\nYour Gods and my Gods-do you or I know whi...,1.0,The Three Billy Goats Gruff,Animal Stories,30.0,https://p.scdn.co/mp3-preview/32c98e55e89da792...,0.935,84.486,4.0,https://api.spotify.com/v1/tracks/65MopnhsXOgY...,track_35,8.0,spotify:track:65MopnhsXOgYw4RuiA2pGU,0.611,track,8219,36,2965,[],4xWMewm6CYMstu0sPgd9jJ,Favorite Kids Stories,3VAX2MJdmdqARLSU5hPMpm,track_5,artist


## Selecting Columns

In [7]:
# Columns kept from the merged artist/track frame (all others are discarded):
#   track numeric fields : acousticness, danceability, duration_ms, energy,
#                          instrumentalness, key, liveness, loudness, mode,
#                          popularity, speechiness, tempo, time_signature, valence
#   artist fields        : artist_popularity, followers, genres
#   identifiers / text   : tracks_id, tracks_name, artists_id, artists_name,
#                          album_id, uri, preview_url, country, lyrics

def drop_columns():
    """Rebind the global ``df_artists_tracks`` to a fixed subset of its columns.

    The resulting frame holds exactly the columns listed below, in that
    order. A ``KeyError`` from pandas is raised if any of them is missing.
    Takes no arguments and returns ``None``.
    """
    global df_artists_tracks

    track_numeric = [
        'acousticness', 'danceability', 'duration_ms', 'energy',
        'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
        'popularity', 'speechiness', 'tempo', 'time_signature', 'valence',
    ]
    artist_fields = ['artist_popularity', 'followers', 'genres']
    identifiers_and_text = [
        'tracks_id', 'tracks_name', 'artists_id', 'artists_name',
        'album_id', 'uri', 'preview_url', 'country', 'lyrics',
    ]

    selected = track_numeric + artist_fields + identifiers_and_text
    df_artists_tracks = df_artists_tracks[selected]

# Replace df_artists_tracks with the column subset chosen in drop_columns().
drop_columns()

In [8]:
# Make sure no columns are elided in the preview of the reduced frame.
pd.options.display.max_columns = None
# First five rows (head() defaults to n=5).
df_artists_tracks.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,time_signature,valence,artist_popularity,followers,genres,tracks_id,tracks_name,artists_id,artists_name,album_id,uri,preview_url,country,lyrics
0,0.294,0.698,235584.0,0.606,3e-06,10.0,0.151,-7.447,0.0,28.0,0.0262,115.018,4.0,0.622,28,425,['finnish indie'],5qljLQuKnNJf4F4vfxQB0V,Blood,3mxJuHRn2ZWD5OofvJtDZY,Jesse Markin,0D3QufeCudpQANOR7luqdr,spotify:track:5qljLQuKnNJf4F4vfxQB0V,https://p.scdn.co/mp3-preview/1b05a902da3a251d...,BE,\r\n\r\nPerhaps I am bound to be restless\r\nA...
1,0.166,0.543,233998.0,0.612,0.0,9.0,0.129,-7.685,0.0,37.0,0.0487,139.876,3.0,0.463,28,425,['finnish indie'],3THTkAwJOsmxgizvYkBNRI,Jericho,3mxJuHRn2ZWD5OofvJtDZY,Jesse Markin,3wIjGVauUxR4c3NvnQZ0Jo,spotify:track:3THTkAwJOsmxgizvYkBNRI,https://p.scdn.co/mp3-preview/8af517b8202114d6...,AR,\r\n\r\nMuch better in my day\r\nMuch better i...
2,0.863,0.719,656960.0,0.308,0.0,6.0,0.253,-10.34,1.0,31.0,0.922,115.075,3.0,0.589,36,2965,[],3VAX2MJdmdqARLSU5hPMpm,The Ugly Duckling,4xWMewm6CYMstu0sPgd9jJ,Favorite Kids Stories,1bcqsH5UyTBzmh9YizdsBE,spotify:track:3VAX2MJdmdqARLSU5hPMpm,https://p.scdn.co/mp3-preview/d8140736a6131cb5...,BE,\r\nYour Gods and my Gods-do you or I know whi...
3,0.824,0.688,29240.0,0.304,0.0,10.0,0.142,-9.96,1.0,0.0,0.531,77.056,3.0,0.414,36,2965,[],1WJzRtI1ABzV3TPIeJZVvi,Three Blind Mice,4xWMewm6CYMstu0sPgd9jJ,Favorite Kids Stories,51g5viCaYjOW5XO4qX1RCD,spotify:track:1WJzRtI1ABzV3TPIeJZVvi,https://p.scdn.co/mp3-preview/54031f6d3ab4784a...,BE,\r\nYour Gods and my Gods-do you or I know whi...
4,0.85,0.632,497493.0,0.372,0.0,7.0,0.11,-8.76,1.0,30.0,0.935,84.486,4.0,0.611,36,2965,[],65MopnhsXOgYw4RuiA2pGU,The Three Billy Goats Gruff,4xWMewm6CYMstu0sPgd9jJ,Favorite Kids Stories,7EtGJVYZbb2Krgwizn2flB,spotify:track:65MopnhsXOgYw4RuiA2pGU,https://p.scdn.co/mp3-preview/32c98e55e89da792...,BE,\r\nYour Gods and my Gods-do you or I know whi...


## Data Preprocessing

In [9]:
# Count missing (NaN/None) entries per column.
# NOTE: isna() only detects real missing values. The genres column stores
# lists as text, so an empty genre list is the string '[]' and is not
# counted as missing here.
df_artists_tracks.isna().sum()

acousticness         0
danceability         0
duration_ms          0
energy               0
instrumentalness     0
key                  0
liveness             0
loudness             0
mode                 0
popularity           0
speechiness          0
tempo                0
time_signature       0
valence              0
artist_popularity    0
followers            0
genres               0
tracks_id            0
tracks_name          0
artists_id           0
artists_name         0
album_id             0
uri                  0
preview_url          0
country              0
lyrics               0
dtype: int64

### Processing Genre

In [11]:
# Distinct raw values of the genres column; each value is a whole genre list rendered as a string.
df_artists_tracks['genres'].unique()

array(["['finnish indie']", '[]', "['british comedy', 'comedy']", ...,
       "['electronic', 'nu skool breaks', 'progressive house', 'progressive trance', 'progressive trance house', 'trance', 'uplifting trance']",
       "['icelandic hip hop']",
       "['australian alternative rock', 'australian indie', 'australian pop', 'australian rock', 'perth indie']"],
      dtype=object)

In [10]:
# Number of distinct raw genre strings. Each distinct combination of genres
# counts once, because the column holds the whole list as a single string.
df_artists_tracks['genres'].nunique()

10728

In [20]:
# Display the genres column as a Series (one genre-list string per row).
df_artists_tracks['genres']

0                                        ['finnish indie']
1                                        ['finnish indie']
2                                                       []
3                                                       []
4                                                       []
                               ...                        
77840                                     ['chilean rock']
77841                                 ['scandinavian r&b']
77842                                ['icelandic hip hop']
77843    ['australian alternative rock', 'australian in...
77844    ['australian alternative rock', 'australian in...
Name: genres, Length: 77845, dtype: object

In [12]:
# Compare how many rows carry an empty genre list ('[]') with how many do not.
genre_strings = df_artists_tracks['genres']
has_no_genre = genre_strings == '[]'

# Printed: rows whose genre list is empty.
print(genre_strings[has_no_genre].count())
# Cell output: rows with at least one genre.
genre_strings[~has_no_genre].count()

15142


62703

In [13]:
# Count how many rows mention each individual genre.
# The genres column holds Python list literals stored as strings, so each one
# is parsed with ast.literal_eval, which accepts only literals and therefore
# cannot execute code embedded in the CSV (unlike the built-in eval).
# explode() then yields one row per genre; rows with an empty list become NaN
# and are left out by value_counts().
genre_counts = (
    df_artists_tracks['genres']
    .apply(ast.literal_eval)
    .explode()
    .value_counts()
)
print(genre_counts)


pop                       3553
dance pop                 2587
rock                      2572
folk-pop                  2149
latin                     2116
                          ... 
house argentino              1
unblack metal                1
musica folk asturiana        1
early modern classical       1
icelandic hip hop            1
Name: genres, Length: 2260, dtype: int64


In [25]:
# Genres that occur in more than 500 rows.
# (Alternative view: genre_counts[:10] shows the ten most frequent genres.)
genre_counts.loc[genre_counts.gt(500)]

pop                 3553
dance pop           2587
rock                2572
folk-pop            2149
latin               2116
                    ... 
israeli pop          508
indie cafe pop       504
kleine hoerspiel     503
spanish pop          501
anthem worship       501
Name: genres, Length: 98, dtype: int64