# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw Spotify artist export into `artists` and discard the two
# columns that are dropped up front: "Unnamed: 0" (presumably an index column
# written by an earlier export — TODO confirm) and "artist_uri".
artists = pd.read_csv('Data/spotify_artists.csv').drop(
    columns=["Unnamed: 0", "artist_uri"]
)

In [3]:
# Drop the first and last character of every artist_genres string; these are
# expected to be the enclosing list brackets.
artists['artist_genres'] = artists['artist_genres'].str.slice(start=1, stop=-1)

In [4]:
# Remove every single-quote character from the artist_genres strings.
# regex=False makes this a literal replacement regardless of the default
# value of `regex` in the installed pandas version.
# NOTE(review): double-quote characters are left in place — confirm none
# occur in the raw genre strings.
artists['artist_genres'] = artists['artist_genres'].str.replace("'", "", regex=False)

In [5]:
# Remove every space character from the artist_genres strings, so each genre
# name becomes a single run of characters.
# regex=False makes this a literal replacement regardless of the default
# value of `regex` in the installed pandas version.
artists['artist_genres'] = artists['artist_genres'].str.replace(" ", "", regex=False)

In [6]:
# Remove every dash character from the artist_genres strings.
# regex=False makes this a literal replacement regardless of the default
# value of `regex` in the installed pandas version.
artists['artist_genres'] = artists['artist_genres'].str.replace("-", "", regex=False)

In [7]:
# Turn empty genre strings into NaN so they can be removed with dropna().
# Series.replace does an exact-value match here (only '' is replaced) and is
# vectorised, avoiding a Python-level lambda call per row.
artists['artist_genres'] = artists['artist_genres'].replace('', np.nan)

In [8]:
# Discard every artist whose artist_genres value is missing (NaN).
artists = artists.dropna(axis=0, how='any', subset=['artist_genres'])

In [9]:
# Discard every artist whose artist_name value is missing (NaN).
artists = artists.dropna(axis=0, how='any', subset=['artist_name'])

In [10]:
# Discard every artist whose artist_followers value is missing (NaN).
artists = artists.dropna(axis=0, how='any', subset=['artist_followers'])

In [11]:
# Summarise the cleaned frame, then preview its first rows.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
artists.info()
artists.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130960 entries, 0 to 314797
Data columns (total 5 columns):
artist_id            130960 non-null object
artist_name          130960 non-null object
artist_genres        130960 non-null object
artist_followers     130960 non-null float64
artist_popularity    130960 non-null int64
dtypes: float64(1), int64(1), object(3)
memory usage: 6.0+ MB
None


Unnamed: 0,artist_id,artist_name,artist_genres,artist_followers,artist_popularity
0,4tN3rZ7cChj4Wns2Wt2Nj6,2 AM,"antiviralpop,kpop,minecraft",2075.0,41
2,3LsQKoRgMc8VEkQn66jfAQ,2:54,metropopolis,10158.0,19
12,6bMul6rmRS03x38tWKYifO,883,"classicitalianpop,europop,italianarenapop,ital...",389768.0,59
13,0Qh5UGy4Y1hrZlClkIP7oG,3-2,"deepgfunk,dirtytexasrap",62.0,19
15,51kPCmCJ7rXClxKDc2r4RA,July 7,trapsoul,3119.0,34
18,733SPLVnEyXs6GxUEnJ7bx,11/5,"gfunk,hyphy",14489.0,29
19,6soPpJHlCtN6SY8pWlfbC6,-M-,"chanson,frenchindiepop,frenchrock",189142.0,60
21,4EbEUPsgjjmNsKjigP3R4u,-ness,hopebeat,936.0,15
23,6bdOuFTmkGliFvCkdpH49Z,:Of The Wand And The Moon:,"martialindustrial,medievalfolk,neoclassical,ne...",12728.0,31
24,3TTbkikkCAbUg2j5i9UrTV,:Wumpscut:,"aggrotech,darkwave,ebm,electroindustrial,indus...",29931.0,36


In [12]:
# Parent (top-level) genres used to categorise artists.
# Names are written without spaces or dashes (e.g. 'hiphop').
# Reference for the list:
# https://www.blisshq.com/music-library-management-blog/2011/01/25/fundamental-music-genre-list/
parent_genres = [
    'blues',
    'children',
    'classical',
    'country',
    'edm',
    'electronic',
    'folk',
    'funk',
    'hiphop',
    'indie',
    'jazz',
    'latin',
    'pop',
    'r&b',
    'rap',
    'reggae',
    'rock',
    'soul',
    'soundtrack',
]

# Show how many parent genres are defined.
len(parent_genres)

19

In [13]:
# Add one column per parent genre, initialised to NaN as a placeholder.
for parent in parent_genres:
    artists[parent] = np.nan

In [14]:
# Fill each parent-genre column with a boolean flag: True when the parent
# genre name occurs anywhere inside the artist's artist_genres string.
# str.contains is vectorised, and regex=False makes it a literal substring
# test rather than a regular-expression search.
# Assumes artist_genres has no NaN values left (those rows are dropped in an
# earlier cell).
# NOTE(review): this is plain substring matching, so 'pop' also matches
# 'metropopolis' and 'rap' matches 'trapsoul' (both visible in the output
# below) — confirm this is intended.
for genre in parent_genres:
    artists[genre] = artists['artist_genres'].str.contains(genre, regex=False)

In [15]:
# Summarise the frame after the parent-genre flags were added, then preview
# its first rows.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
artists.info()
artists.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130960 entries, 0 to 314797
Data columns (total 24 columns):
artist_id            130960 non-null object
artist_name          130960 non-null object
artist_genres        130960 non-null object
artist_followers     130960 non-null float64
artist_popularity    130960 non-null int64
blues                130960 non-null bool
children             130960 non-null bool
classical            130960 non-null bool
country              130960 non-null bool
edm                  130960 non-null bool
electronic           130960 non-null bool
folk                 130960 non-null bool
funk                 130960 non-null bool
hiphop               130960 non-null bool
indie                130960 non-null bool
jazz                 130960 non-null bool
latin                130960 non-null bool
pop                  130960 non-null bool
r&b                  130960 non-null bool
rap                  130960 non-null bool
reggae               130960 non-null bo

Unnamed: 0,artist_id,artist_name,artist_genres,artist_followers,artist_popularity,blues,children,classical,country,edm,...,indie,jazz,latin,pop,r&b,rap,reggae,rock,soul,soundtrack
0,4tN3rZ7cChj4Wns2Wt2Nj6,2 AM,"antiviralpop,kpop,minecraft",2075.0,41,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2,3LsQKoRgMc8VEkQn66jfAQ,2:54,metropopolis,10158.0,19,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
12,6bMul6rmRS03x38tWKYifO,883,"classicitalianpop,europop,italianarenapop,ital...",389768.0,59,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
13,0Qh5UGy4Y1hrZlClkIP7oG,3-2,"deepgfunk,dirtytexasrap",62.0,19,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
15,51kPCmCJ7rXClxKDc2r4RA,July 7,trapsoul,3119.0,34,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
18,733SPLVnEyXs6GxUEnJ7bx,11/5,"gfunk,hyphy",14489.0,29,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
19,6soPpJHlCtN6SY8pWlfbC6,-M-,"chanson,frenchindiepop,frenchrock",189142.0,60,False,False,False,False,False,...,True,False,False,True,False,False,False,True,False,False
21,4EbEUPsgjjmNsKjigP3R4u,-ness,hopebeat,936.0,15,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
23,6bdOuFTmkGliFvCkdpH49Z,:Of The Wand And The Moon:,"martialindustrial,medievalfolk,neoclassical,ne...",12728.0,31,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
24,3TTbkikkCAbUg2j5i9UrTV,:Wumpscut:,"aggrotech,darkwave,ebm,electroindustrial,indus...",29931.0,36,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [16]:
# Count, per artist, how many parent-genre flags are True.
# The flag columns are selected by name (parent_genres) instead of by a
# hard-coded positional slice, so the count does not depend on column order
# or on how many columns precede the flags. Summing the boolean columns
# row-wise counts the True values (True counts as 1) without a Python-level
# call per row.
artists['genre_count'] = artists[parent_genres].sum(axis=1)

In [17]:
# Keep only the artists matched to exactly one parent genre; artists with no
# match at all, or with several matches, are excluded.
filtered_artists = artists.loc[artists['genre_count'].eq(1)]

In [18]:
# Summarise the filtered frame, then preview its first rows.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
filtered_artists.info()
filtered_artists.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53244 entries, 0 to 314797
Data columns (total 25 columns):
artist_id            53244 non-null object
artist_name          53244 non-null object
artist_genres        53244 non-null object
artist_followers     53244 non-null float64
artist_popularity    53244 non-null int64
blues                53244 non-null bool
children             53244 non-null bool
classical            53244 non-null bool
country              53244 non-null bool
edm                  53244 non-null bool
electronic           53244 non-null bool
folk                 53244 non-null bool
funk                 53244 non-null bool
hiphop               53244 non-null bool
indie                53244 non-null bool
jazz                 53244 non-null bool
latin                53244 non-null bool
pop                  53244 non-null bool
r&b                  53244 non-null bool
rap                  53244 non-null bool
reggae               53244 non-null bool
rock               

Unnamed: 0,artist_id,artist_name,artist_genres,artist_followers,artist_popularity,blues,children,classical,country,edm,...,jazz,latin,pop,r&b,rap,reggae,rock,soul,soundtrack,genre_count
0,4tN3rZ7cChj4Wns2Wt2Nj6,2 AM,"antiviralpop,kpop,minecraft",2075.0,41,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,1
2,3LsQKoRgMc8VEkQn66jfAQ,2:54,metropopolis,10158.0,19,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,1
18,733SPLVnEyXs6GxUEnJ7bx,11/5,"gfunk,hyphy",14489.0,29,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
24,3TTbkikkCAbUg2j5i9UrTV,:Wumpscut:,"aggrotech,darkwave,ebm,electroindustrial,indus...",29931.0,36,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,1
30,1h2OVSKRdJHnVqyCKqCLkf,?uestlove,phillyrap,16178.0,34,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,1
40,1bDWGdIC2hardyt55nlQgG,"""Weird Al"" Yankovic","comedyrock,comic",423423.0,60,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,1
49,2McJOiKebPtBGqyEYyZQFN,[:SITD:],"aggrotech,darkwave,ebm,electroindustrial,futur...",13580.0,36,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,1
56,16LvTqpD4wlfFb4EBcW0x3,[dunkelbunt],"balkanbrass,electroswing,nujazz",16406.0,43,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,1
64,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,"boyband,dancepop,europop,pop",1076991.0,71,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,1
65,0PEqo7RkX5wkaYwlT6hINn,*repeat repeat,nashvilleindie,5270.0,34,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1


In [19]:
# Write the filtered artists to CSV; index=False leaves the row index out of
# the file.
filtered_artists.to_csv(path_or_buf='Data/spotify_artists_cleaned.csv', index=False)