<a href="https://colab.research.google.com/github/pjmaguire/find_me_music/blob/main/Find_Me_Music.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries & Functions

In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import json
from ast import literal_eval

In [23]:
def data_report(data, figures=True):
  """
  Reports summary statistics and figures for a data frame.

  Inputs: data - a dataframe of uniform or mixed variables
          figures - if True, draws a histogram for every numeric column and a
                    horizontal bar chart of value counts for every string
                    column that contains no URL; if False, only text is printed
  Outputs: None (the report is printed; figures are shown with plt.show())
  """
  #Libraries
  from pandas.api.types import is_string_dtype
  from pandas.api.types import is_numeric_dtype

  for col in data.columns:
    column = data[col]
    #Column labels are not guaranteed to be strings, so the title is built via str()
    title = str(col).capitalize()
    print("{}\n".format(col))

    #Numerical Values
    if is_numeric_dtype(column):
      print("Type:\tNumeric")
      clean = column.dropna()
      if len(clean) != 0:
        #Vectorized min/max instead of walking the Series with the builtins
        print("Range:\t{} - {}".format(clean.min(), clean.max()))

        if figures:
          clean.plot(kind='hist', title=title)
          plt.show()

    #String Values
    elif is_string_dtype(column):
      print("Type:\tString")
      print("Values:")
      print(column.unique())

      #Checks For URL Data
      #Note: Assumes that any URL means that the variable should not have a figure generated
      #na=False counts missing entries as "no URL"; left as NaN they are truthy,
      #which made every column with a missing value look like URL data
      if figures and not column.str.contains("http", na=False).any():
        column.value_counts().plot(kind='barh', title=title)
        plt.show()

    #Unknown
    else:
      print("Type:\tUnknown")


    print("NAs:\t{}".format(column.isna().sum()))

    print("\n_____________________\n")

# Data Loading

In [24]:
#Loads In Artist And Track Info
#Note: Exactly one source is read; the local copy wins when both flags are True
local = False
git = True

#Locations That Hold artists.csv And tracks.csv
local_data_dir = "/Users/pmaguire/Dropbox/Stanford/Courses/CS68/Data/Spotify_Data/"
git_data_url = "https://media.githubusercontent.com/media/pjmaguire/find_me_music/main/Data/"

if local:
  data_source = local_data_dir
elif git:
  data_source = git_data_url
else:
  #Without this branch the cell finished silently and the first later use of
  #artists / tracks failed with a NameError
  raise ValueError("No data source selected: set `local` or `git` to True")

artists = pd.read_csv(data_source + "artists.csv")
tracks = pd.read_csv(data_source + "tracks.csv")

#Loads In Related Artist Data
#Note: Currently disabled, so related_artists is not defined anywhere below
#with open("/Users/pmaguire/Dropbox/Stanford/Courses/CS68/Data/Spotify_Data/dict_artists.json") as file:
#  data = json.load(file)
#  related_artists = pd.DataFrame.from_dict(data, orient='index').T.set_index('index')

# Variable Summary

**Artists**

In [12]:
#Previews The First 25 Rows Of The Artist Table
artists.head(25)

Unnamed: 0,id,followers,genres,name,popularity
0,0DheY5irMjBUeLybbCUEZ2,0.0,[],Armid & Amir Zare Pashai feat. Sara Rouzbehani,0
1,0DlhY15l3wsrnlfGio2bjU,5.0,[],ปูนา ภาวิณี,0
2,0DmRESX2JknGPQyO15yxg7,0.0,[],Sadaa,0
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,[],Tra'gruda,0
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,[],Ioannis Panoutsopoulos,0
5,0DotfDlYMGqkbzfBhcA5r6,7.0,[],Astral Affect,0
6,0DqP3bOCiC48L8SM9gK4W8,1.0,[],Yung Seed,0
7,0Drs3maQb99iRglyTuxizI,0.0,[],Wi'Ma,0
8,0DsPeAi1gxPPnYjgpiEGSR,0.0,[],lentboy,0
9,0DtvnTxgZ9K5YaPS5jdlQW,20.0,[],addworks,0


In [13]:
#Prints The Per-Column Summary Of The Artist Table Without Drawing Figures
data_report(artists, figures=False)

id

Type:	String
Values:
['0DheY5irMjBUeLybbCUEZ2' '0DlhY15l3wsrnlfGio2bjU'
 '0DmRESX2JknGPQyO15yxg7' ... '2vnT9YhKIvjVo9LnVjWmr2'
 '3ID0E5XCvnJIYZEq043ZoB' '5m0Y4WSYyai2BU752lCASy']
NAs:	0

_____________________

followers

Type:	Numeric
Range:	0.0 - 78900234.0
NAs:	13

_____________________

genres

Type:	String
Values:
['[]' "['carnaval cadiz']" "['classical harp', 'harp']" ...
 "['alternative dance', 'dance-punk', 'electrofox', 'new rave', 'swedish electropop']"
 "['emoviolence', 'metallic hardcore']"
 "['australian post-hardcore', 'metallic hardcore']"]
NAs:	0

_____________________

name

Type:	String
Values:
['Armid & Amir Zare Pashai feat. Sara Rouzbehani' 'ปูนา ภาวิณี' 'Sadaa'
 ... 'Jesse Giddings' 'The Boy Band Project' '10 Reasons']
NAs:	0

_____________________

popularity

Type:	Numeric
Range:	0 - 100
NAs:	0

_____________________



**Tracks**

In [14]:
#Previews The First 25 Rows Of The Track Table
tracks.head(25)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4
5,0BRXJHRNGQ3W4v9frnSfhu,Ave Maria,0,178933,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.227,0.261,5,-12.343,1,0.0382,0.994,0.247,0.0977,0.0539,118.891,4
6,0Dd9ImXtAtGwsmsAD69KZT,La Butte Rouge,0,134467,0,['Francis Marty'],['2nuMRGzeJ5jJEKlfS7rZ0W'],1922,0.51,0.355,4,-12.833,1,0.124,0.965,0.0,0.155,0.727,85.754,5
7,0IA0Hju8CAgYfV1hwhidBH,La Java,0,161427,0,['Mistinguett'],['4AxgXfD7ISvJSTObqm4aIE'],1922,0.563,0.184,4,-13.757,1,0.0512,0.993,1.6e-05,0.325,0.654,133.088,3
8,0IgI1UCz84pYeVetnl1lGP,Old Fashioned Girl,0,310073,0,['Greg Fieler'],['5nWlsH5RDgFuRAiDeOFVmf'],1922,0.488,0.475,0,-16.222,0,0.0399,0.62,0.00645,0.107,0.544,139.952,4
9,0JV4iqw2lSKJaHBQZ0e5zK,Martín Fierro - Remasterizado,0,181173,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-29,0.548,0.0391,6,-23.228,1,0.153,0.996,0.933,0.148,0.612,75.595,3


In [None]:
#Prints The Per-Column Summary Of The Track Table Without Drawing Figures
data_report(tracks, figures=False)

id

Type:	String
Values:
['35iwgR4jXetI318WEWsa1Q' '021ht4sdgPcrDgSk7JTbKY'
 '07A5yehtSnoedViJAZkNnc' ... '27Y1N4Q4U3EfDU5Ubw8ws2'
 '45XJsGpFTyzbzeWK8VzR8S' '5Ocn6dZ3BJFPWh4ylwFXtn']
NAs:	0

_____________________

name

Type:	String
Values:
['Carve' 'Capítulo 2.16 - Banquero Anarquista'
 'Vivo para Quererte - Remasterizado' ... 'blind'
 "What They'll Say About Us" 'A Day At A Time']
NAs:	71

_____________________

popularity

Type:	Numeric
Range:	0 - 100
NAs:	0

_____________________

duration_ms

Type:	Numeric
Range:	3344 - 5621218
NAs:	0

_____________________

explicit

Type:	Numeric
Range:	0 - 1
NAs:	0

_____________________

artists

Type:	String
Values:
["['Uli']" "['Fernando Pessoa']" "['Ignacio Corsini']" ... "['阿YueYue']"
 "['ROLE MODEL']" "['Gentle Bones', 'Clara Benin']"]
NAs:	0

_____________________

id_artists

Type:	String
Values:
["['45tIt06XoI0Iio4LBEVpls']" "['14jtPCOoNZwquk5wd9DxrY']"
 "['5LiOoJbxVSAMkBS2fUm3X2']" ... "['1QLBXKM5GCpyQQSVMNZqrZ']"
 "['1dy5WNgIKQU6ezkp

**Related Artists**

In [None]:
#Note: Left disabled - related_artists and data are only created by the
#commented-out loader in the Data Loading cell, so neither name exists here
#related_artists.head(25)
#print(data.keys())

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Variable Assessment

In [None]:
#Column To Inspect For Missing Values
col = "name"

#na_idx = tracks.index[tracks[col].isna()]

#Displays A Boolean Series That Is True Wherever The Column Is Missing
tracks[col].isna()

#print(tracks.iloc[na_idx])

**release_date**

Note: `release_date` should be converted into separate year and month variables.

In [None]:
#Displays The Raw Release Date Strings
#Note: release_date is a column of tracks, not a variable of its own; the bare name raised a NameError
tracks["release_date"]

# Pre-Processing

**Track Names**

In [25]:
#Replaces Missing Track Names With An Empty String
missing_name = tracks["name"].isna()
tracks.loc[missing_name, "name"] = ""

**Artist Followers**

In [26]:
#Treats A Missing Follower Count As Zero Followers
missing_followers = artists["followers"].isna()
artists.loc[missing_followers, "followers"] = 0

**Artist & ID_Artist**

Converts string values to list values

In [27]:
#Parses The Stringified Lists (e.g. "['Uli']") Into Real Python Lists
#Note: Values that are no longer strings are passed through unchanged so the cell
#can be re-run; literal_eval raises a ValueError when given an already-parsed list
def _parse_list_literal(value):
  """Returns the object encoded by a string literal; non-string input is returned as-is."""
  return literal_eval(value) if isinstance(value, str) else value

for list_col in ("artists", "id_artists"):
  tracks[list_col] = tracks[list_col].apply(_parse_list_literal)

**Number Of Artists**

Creates a variable for the number of artists on a track

In [28]:
#Counts Number Of Artists
#Note: Expects tracks["artists"] to already hold lists (see the conversion cell above)
tracks["num_artists"] = tracks["artists"].apply(len)

#Sets Maximum Number Of Counted Artists To 6
#Note: clip builds the capped column in one step; the earlier chained assignment
#(tracks["num_artists_bin"].loc[...] = 6) triggered a SettingWithCopyWarning and
#is not guaranteed to write back into the frame
max_artists_bin = 6
tracks["num_artists_bin"] = tracks["num_artists"].clip(upper=max_artists_bin)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


**Release Date**

Extracts various elements from the release date and builds several new categories. If no day or month is present, a NaN value is reported for all day- and month-related variables.

In [29]:
#Fixes "Los Pincheira del Sur", Which Incorrectly Has A Stated Release Date Of January 1st 1900
#Note: The row is addressed by position. One .iloc call on the frame is used because
#chained indexing (tracks["release_date"].iloc[...] = ...) can write to a temporary copy.
bad_date_row = 478627
tracks.iloc[bad_date_row, tracks.columns.get_loc("release_date")] = "2019-05-17"

#Identifies How Much Of Each Date Is Present ("YYYY", "YYYY-MM" Or "YYYY-MM-DD")
date_parts = tracks["release_date"].str.count("-") + 1
has_month = date_parts >= 2
has_day = date_parts >= 3

#Pads Partial Dates With "-01" So That Every Value Has The Same Shape
#Note: Mixed-precision strings are not parsed reliably by a single to_datetime call
#(newer pandas infers one format from the first value), so one explicit format is used.
#Assumes months and days are zero-padded in the source data - TODO confirm.
padded_date = tracks["release_date"].where(has_month, tracks["release_date"] + "-01")
padded_date = padded_date.where(has_day, padded_date + "-01")

#Converts String Date To Datetime
tracks["release_date_time"] = pd.to_datetime(padded_date, format="%Y-%m-%d")

#Creates Categories Of Interest
#Note: Year and decade only need the year, so they are always filled in
#(the decade used to be blanked for year-only dates even though it was known)
release = tracks["release_date_time"].dt
tracks["release_year"] = release.year
tracks["release_decade"] = release.year // 10 * 10

#Month And Day Features Are Kept Only Where That Part Of The Date Was Given
#Everything else becomes NaN, because the padded "-01" parts are placeholders, not data
tracks["release_month"] = release.month_name().where(has_month)
tracks["release_quarter"] = release.quarter.where(has_month)
tracks["release_dayofweek"] = release.day_name().where(has_day)
tracks["release_endofmonth"] = release.is_month_end.astype(float).where(has_day)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


**Final Data Shape**

In [30]:
#Previews The Track Table With The Engineered Columns Added
tracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,num_artists,num_artists_bin,release_date_time,release_year,release_month,release_dayofweek,release_quarter,release_endofmonth
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,[Uli],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,1,1,1922-02-22,1922,February,Wednesday,1.0,0.0
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,[Fernando Pessoa],[14jtPCOoNZwquk5wd9DxrY],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1,1,1,1922-06-01,1922,June,Thursday,2.0,0.0
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,[Ignacio Corsini],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,1,1,1922-03-21,1922,March,Tuesday,1.0,0.0
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,[Ignacio Corsini],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3,1,1,1922-03-21,1922,March,Tuesday,1.0,0.0
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,[Dick Haymes],[3BiJGZsyX9sJchTqcSA7Su],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4,1,1,1922-01-01,1922,,,,


# Build Input Variables

In [None]:
#Text-Valued Categorical Features
categorical_cols = ["release_month", "release_dayofweek"]

categorical_cols

In [None]:
#Number-Coded Categorical Features (Treated As Categories, Not Magnitudes)
categorical_numerical_columns = [
  "key", "mode", "time_signature",
  "release_year", "release_decade", "release_quarter", "release_endofmonth",
]

categorical_numerical_columns

In [None]:
#Continuous Audio Features
continuous_numerical_columns = [
  "danceability", "energy", "loudness",
  "speechiness", "acousticness", "tempo",
  "valence", "liveness", "instrumentalness",
]

continuous_numerical_columns

In [None]:
#Track Columns That Are Not Part Of Any Feature Group Above
#Note: The bare names were evaluated as variables and raised a NameError;
#they are kept as a list so the cell runs (presumably still to be categorized - confirm)
remaining_columns = [
  "name",
  "popularity",
  "duration_ms",
  "explicit",
  "artists",
]

remaining_columns

Creates Collection Of Categories Of Interest

In [None]:
#Collects The Feature Columns From The Track Table
#Note: The frame was previously referenced as lc_data and the selection included
#continuous_numerical_columns_standardized; neither name is defined in this notebook.
#The continuous columns are scaled in place later, so they are selected once.
feature_columns = categorical_cols + categorical_numerical_columns + continuous_numerical_columns

#.copy() gives song_data its own data so later assignments do not touch tracks
song_data = tracks[feature_columns].copy()
song_data.head()

Rescales Continuous Numerical Ranges Using Min-Max Scaling

In [None]:
#Rescales Each Continuous Column With A Min-Max Scaler
#NOTE(review): MinMaxScaler is not imported in any visible cell; it presumably comes
#from sklearn.preprocessing and must be imported before this cell can run
#NOTE(review): the scaler is fitted on all rows, i.e. before the train/validation/test split
#Columns are selected by label; .iloc only takes integer positions and rejects column names
scaler = MinMaxScaler()
scaler.fit(song_data[continuous_numerical_columns])
song_data[continuous_numerical_columns] = scaler.transform(song_data[continuous_numerical_columns])
song_data[continuous_numerical_columns]

Creates Dummy Variables

In [None]:
#One-Hot Encodes Every Categorical Feature, Keeping All Levels (drop_first=False)
dummy_columns = categorical_numerical_columns + categorical_cols
song_data = pd.get_dummies(song_data, columns=dummy_columns, drop_first=False)
song_data

Creates Final Processed Dataset

In [None]:
#Builds The Feature Matrix With A Fresh 0..n-1 Index
#Note: drop=True discards the old row labels; a plain reset_index() would insert
#them as an extra "index" column that would then be treated as a feature
X = song_data.reset_index(drop=True)

# Splitting Data

Creates Training, Validation, And Test Sets

In [None]:
#Splits The Data Into Training (70%), Validation (~20%) And Test (~10%) Sets
#The second call carves the test set out of the 30% hold-out (0.33 * 0.30 is about 0.10)
#NOTE(review): train_test_split and y are not defined in any visible cell - presumably
#train_test_split comes from sklearn.model_selection and y is the target column; confirm
#A fixed random_state makes both splits identical on every run
split_seed = 42
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.3, random_state = split_seed)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size = 0.33, random_state = split_seed)

#Reports The Share Of Rows That Ended Up In Each Set
total_rows = X.shape[0]
print('Train fraction: {}'.format(X_train.shape[0] / total_rows))
print('Validation fraction: {}'.format(X_valid.shape[0] / total_rows))
print('Test fraction: {}'.format(X_test.shape[0] / total_rows))

Training Set Snapshot

In [None]:
#Previews The First Rows Of The Training Features
X_train.head()

# Training