## Data Cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Spotify artists CSV and discard the "Unnamed: 0" and "artist_uri" columns
artists = pd.read_csv('Data/spotify_artists.csv').drop(
    columns=["Unnamed: 0", "artist_uri"]
)

In [3]:
# Use artist_id as the row index (the column is moved out of the regular columns)
artists = artists.set_index(keys='artist_id', drop=True)

In [4]:
# Drop the first and last character of each artist_genres string
# (the enclosing brackets of the stringified list)
bracketed_genres = artists['artist_genres']
artists['artist_genres'] = bracketed_genres.str[1:-1]

In [5]:
# Remove every single-quote character from the artist_genres strings
quoted_genres = artists['artist_genres']
artists['artist_genres'] = quoted_genres.str.replace("'", "")

In [6]:
# Remove every space from the artist_genres strings
spaced_genres = artists['artist_genres']
artists['artist_genres'] = spaced_genres.str.replace(" ", "")

In [7]:
# Remove every dash from the artist_genres strings
dashed_genres = artists['artist_genres']
artists['artist_genres'] = dashed_genres.str.replace("-", "")

In [8]:
# Turn empty genre strings into NaN; every other value is kept as-is
def _blank_to_nan(genre_text):
    """Return NaN for an empty string, otherwise return the value unchanged."""
    if genre_text == '':
        return np.nan
    return genre_text


artists['artist_genres'] = artists['artist_genres'].apply(_blank_to_nan)

In [9]:
# Split each comma-separated artist_genres string into a list of genre names
comma_separated_genres = artists['artist_genres']
artists['artist_genres'] = comma_separated_genres.str.split(pat=",")

In [10]:
# Keep only the rows whose artist_genres value is present (not NaN)
artists = artists.dropna(axis=0, how='any', subset=['artist_genres'])

In [11]:
# Keep only the rows whose artist_name value is present (not NaN)
artists = artists.dropna(axis=0, how='any', subset=['artist_name'])

In [12]:
# Keep only the rows whose artist_followers value is present (not NaN)
artists = artists.dropna(axis=0, how='any', subset=['artist_followers'])

In [13]:
# DataFrame.info() writes its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
artists.info()
# Show the first ten cleaned rows as the cell's rich output
artists.head(10)

In [14]:
# Flatten all artists' genre lists into one list (repeated genres are kept)
genres_list = [
    genre_name
    for artist_genre_list in artists.artist_genres
    for genre_name in artist_genre_list
]

len(genres_list)

In [15]:
# One row per genre occurrence: column "genre" plus a constant "count" of 1
genres_list_df = pd.DataFrame(genres_list).rename(columns={0: "genre"})
genres_list_df['count'] = 1
genres_list_df.head()

In [16]:
# Create list of unique genres.
# Sorted so the list has the same order on every kernel run: iterating a set of
# strings has no guaranteed order, which made every list derived from this one
# come out in a different order after a restart.
unique_genres_list = sorted(set(genres_list))
len(unique_genres_list)

In [17]:
# Count the rows per genre, then order genres from most to least frequent
unique_genre_df = (
    genres_list_df
    .groupby(['genre'])
    .count()
    .sort_values(by='count', ascending=False)
)
unique_genre_df.head(10)

In [18]:
# Parent (top-level) genres used to group the Spotify genre names.
# Referencing: https://www.blisshq.com/music-library-management-blog/2011/01/25/fundamental-music-genre-list/
parent_genres = [
    'blues',
    'children',
    'classical',
    'country',
    'edm',
    'electronic',
    'folk',
    'funk',
    'hiphop',
    'indie',
    'jazz',
    'latin',
    'pop',
    'r&b',
    'rap',
    'reggae',
    'rock',
    'soul',
    'soundtrack',
]

len(parent_genres)

In [19]:
# Search substring for each parent genre; a genre name containing the substring
# is treated as a subgenre of that parent.
blues_substr      = "blues"
children_substr   = "children"
classical_substr  = "classical"
country_substr    = "country"
edm_substr        = "edm"
electronic_substr = "electronic"
folk_substr       = "folk"
funk_substr       = "funk"
# NOTE(review): 'hop' (not 'hiphop') matches any genre name containing "hop" --
# confirm the broader match is intended.
hiphop_substr     = "hop"
indie_substr      = "indie"
jazz_substr       = "jazz"
latin_substr      = "latin"
pop_substr        = "pop"
rnb_substr        = "r&b"
rap_substr        = "rap"
reggae_substr     = "reggae"
rock_substr       = "rock"
soul_substr       = "soul"
soundtrack_substr = "soundtrack"

In [20]:
# Build {parent genre: [unique genres whose name contains that parent's substring]}.
# The parent -> substring pairs are listed once, then a single comprehension does
# the substring search for each of them (keys keep this insertion order).
parent_search_terms = {
    'blues': blues_substr,
    'children': children_substr,
    'classical': classical_substr,
    'country': country_substr,
    'edm': edm_substr,
    'electronic': electronic_substr,
    'folk': folk_substr,
    'funk': funk_substr,
    'hiphop': hiphop_substr,
    'indie': indie_substr,
    'jazz': jazz_substr,
    'latin': latin_substr,
    'pop': pop_substr,
    'r&b': rnb_substr,
    'rap': rap_substr,
    'reggae': reggae_substr,
    'rock': rock_substr,
    'soul': soul_substr,
    'soundtrack': soundtrack_substr,
}
subgenre_dict = {
    parent_name: [name for name in unique_genres_list if search_term in name]
    for parent_name, search_term in parent_search_terms.items()
}

In [21]:
#MIGHT NOT NEED
# One list per parent genre holding the unique genres that contain its substring
def _genres_containing(search_term):
    """Return the entries of unique_genres_list that contain search_term, in list order."""
    return [name for name in unique_genres_list if search_term in name]


blues_subgenres = _genres_containing(blues_substr)
children_subgenres = _genres_containing(children_substr)
classical_subgenres = _genres_containing(classical_substr)
country_subgenres = _genres_containing(country_substr)
edm_subgenres = _genres_containing(edm_substr)
electronic_subgenres = _genres_containing(electronic_substr)
folk_subgenres = _genres_containing(folk_substr)
funk_subgenres = _genres_containing(funk_substr)
hiphop_subgenres = _genres_containing(hiphop_substr)
indie_subgenres = _genres_containing(indie_substr)
jazz_subgenres = _genres_containing(jazz_substr)
latin_subgenres = _genres_containing(latin_substr)
pop_subgenres = _genres_containing(pop_substr)
rnb_subgenres = _genres_containing(rnb_substr)
rap_subgenres = _genres_containing(rap_substr)
reggae_subgenres = _genres_containing(reggae_substr)
rock_subgenres = _genres_containing(rock_substr)
soul_subgenres = _genres_containing(soul_substr)
soundtrack_subgenres = _genres_containing(soundtrack_substr)

In [22]:
#MIGHT NOT NEED
# All per-parent subgenre lists gathered into one list, in the same order as parent_genres
subgenre_lists = [
    blues_subgenres, children_subgenres, classical_subgenres, country_subgenres,
    edm_subgenres, electronic_subgenres, folk_subgenres, funk_subgenres,
    hiphop_subgenres, indie_subgenres, jazz_subgenres, latin_subgenres,
    pop_subgenres, rnb_subgenres, rap_subgenres, reggae_subgenres,
    rock_subgenres, soul_subgenres, soundtrack_subgenres,
]
len(subgenre_lists)

In [23]:
# Add one column per parent genre, filled with NaN as a placeholder
placeholder_columns = dict.fromkeys(parent_genres, np.nan)
for column_name, placeholder in placeholder_columns.items():
    artists[column_name] = placeholder

In [24]:
# Assigning Boolean values to the parent genre columns:
# True when at least one of the artist's genres is a subgenre of that parent
# (as listed in subgenre_dict), False otherwise.
# The previous test, `genre in x`, only matched when the parent genre's own name
# appeared verbatim in the artist's list, so artists tagged solely with
# subgenres were flagged False.

for genre in parent_genres:
    # A set makes the per-artist membership checks cheap
    subgenres = set(subgenre_dict[genre])
    artists[genre] = artists.artist_genres.apply(
        lambda artist_genre_list: any(name in subgenres for name in artist_genre_list)
    )

In [25]:
# Print the column summary to standard output (buf=None is the default target)
artists.info(buf=None)

In [26]:
# Display rows at positions 100 through 199
artists.iloc[100:200]

In [27]:
# Show the Python type of the artist_genres column object
type(artists['artist_genres'])

In [28]:



# Flag artists that have at least one pop subgenre.
# This cell previously held scratch notes that did not parse (`for ... in ...`
# lines without a colon or body), and its last loop overwrote artists['pop'] with
# the exact-name test of every parent genre in turn.
# The column is addressed as artists['pop'] because the attribute `artists.pop`
# is the DataFrame.pop method, not the column.
pop_subgenre_set = set(pop_subgenres)
artists['pop'] = artists.artist_genres.apply(
    lambda artist_genre_list: any(name in pop_subgenre_set for name in artist_genre_list)
)


In [None]:
# Set each parent genre column to True when the artist has at least one genre
# listed under that parent in subgenre_dict. The previous `genre in x` test only
# matched the parent genre's own name and missed artists tagged with subgenres.
for genre in parent_genres:
    subgenres = set(subgenre_dict[genre])
    artists[genre] = artists.artist_genres.apply(
        lambda artist_genre_list: any(name in subgenres for name in artist_genre_list)
    )

In [None]:
# Flag artists that have at least one pop subgenre.
# Fixes in this cell:
#   - `pop_genres` is not defined in the cells shown; the list is `pop_subgenres`.
#   - `artists.pop` is the DataFrame.pop method, so the column is artists['pop'].
#   - `sub_genre in artists.artist_genres` tested the Series index labels, not the
#     genre lists stored in the column.
pop_subgenre_set = set(pop_subgenres)
artists['pop'] = artists.artist_genres.apply(
    lambda artist_genre_list: any(name in pop_subgenre_set for name in artist_genre_list)
)

In [None]:
# For every parent genre, flag the artists having at least one of its subgenres.
# Fixes in this cell:
#   - `pop_genres` is not defined in the cells shown; each parent's subgenres are
#     taken from subgenre_dict instead.
#   - `artists.pop` / `artists.genre` were attribute assignments (`pop` is a
#     DataFrame method, `genre` a literal attribute name), so no genre column was set.
#   - `sub_genre in artists.artist_genres` tested index labels, not the genre lists.
for genre in parent_genres:
    subgenres = set(subgenre_dict[genre])
    artists[genre] = artists.artist_genres.apply(
        lambda artist_genre_list: any(name in subgenres for name in artist_genre_list)
    )

In [None]:

# Scratch note kept as a comment; the original fragment `if x == i in a_list`
# had no colon or body, so the cell raised a SyntaxError on Run All.
# Membership of a value in a list is tested with `value in a_list`.

In [None]:
# For every parent genre, flag the artists having at least one of its subgenres.
# The previous inner loop used `pop_genres` (not defined in the cells shown) and
# reassigned artists[genre] once per subgenre, so only the last subgenre's result
# would have survived, for every parent genre alike.
for genre in parent_genres:
    subgenres = set(subgenre_dict[genre])
    artists[genre] = artists.artist_genres.apply(
        lambda artist_genre_list: any(name in subgenres for name in artist_genre_list)
    )
            
    

In [None]:
# Flag artists that have at least one pop subgenre.
# The previous loop used `pop_genres` (not defined in the cells shown; the list is
# `pop_subgenres`), and assigned to the attribute `artists.pop`, which shadows the
# DataFrame.pop method instead of setting the 'pop' column. It also reassigned on
# every iteration, keeping only the last subgenre's result.
pop_subgenre_set = set(pop_subgenres)
artists['pop'] = artists.artist_genres.apply(
    lambda artist_genre_list: any(name in pop_subgenre_set for name in artist_genre_list)
)

In [None]:
# Exporting DataFrame
# artist_id was moved into the index by set_index('artist_id') earlier in this
# notebook, so the index has to be written out; index=False dropped the artist
# identifier from the exported file.
artists.to_csv('Data/spotify_artists_cleaned.csv', index=True)

In [None]:
# pop2 = ['acousticpop',
#  'afropop',
#  'alternativepop',
#  'alternativepoprock',
#  'antiviralpop',
#  'baroquepop',
#  'bedroompop',
#  'bubblegumpop',
#  'chamberpop',
#  'channelpop',
#  'christianpop',
#  'countrypop',
#  'cpop',
#  'dancepop',
#  'desipop',
#  'dreampop',
#  'electropop',
#  'europop',
#  'experimentalpop',
#  'folkpop',
#  'futurepop',
#  'garagepop',
#  'grungepop',
#  'hyperpop',
#  'indiepop',
#  'jpop',
#  'kpop',
#  'latinpop',
#  'neosynthpop',
#  'newwavepop',
#  'noisepop',
#  'pop',
#  'popedm',
#  'popemo',
#  'popfolk',
#  'pophouse',
#  'poppunk',
#  'poprap',
#  'popreggaeton',
#  'poprock',
#  'postteenpop',
#  'powerpop',
#  'powerpoppunk',
#  'spaceagepop',
#  'synthpop',
#  'teenpop',
#  'undergroundpowerpop',
#  'viralpop',
#  'vpop',]