### Imports

In [1]:
import pandas as pd
import string

### Read and Filter Artists

In [2]:
# Load the artist table and keep only the rows whose Genres value is a string.
# Non-string entries (e.g. the float NaN pandas uses for an empty CSV field)
# cannot be split into individual genre names.
df_artists = pd.read_csv('artists-data.csv')

# Boolean mask instead of a row-by-row iterrows() loop; astype(bool) keeps the
# mask a proper boolean indexer even when the frame is empty.
has_genres = df_artists['Genres'].map(lambda value: isinstance(value, str)).astype(bool)
df_artists = df_artists.loc[has_genres]
df_artists.head(10)

Unnamed: 0,Artist,Genres,Songs,Popularity,Link
0,Ivete Sangalo,Pop; Axé; Romântico,313.0,4.4,/ivete-sangalo/
1,Chiclete com Banana,Axé,268.0,3.8,/chiclete-com-banana/
2,Banda Eva,Axé; Romântico; Reggae,215.0,2.3,/banda-eva/
3,É O Tchan,Axé,129.0,1.6,/e-o-tchan/
4,Claudia Leitte,Pop; Axé; Romântico,167.0,1.5,/claudia-leitte/
5,Harmonia do Samba,Axé; Samba; Pagode,237.0,0.9,/harmonia-do-samba/
6,Ara Ketu,Axé; Pop,139.0,1.5,/ara-ketu/
7,Daniela Mercury,MPB; Axé,230.0,1.4,/daniela-mercury/
8,Olodum,Axé,74.0,1.3,/olodum/
9,Netinho,Axé,204.0,2.0,/netinho/


### Read and Filter Lyrics

In [3]:
# Load every lyric, then keep only the rows whose language tag is 'en'.
df_lyrics = pd.read_csv('lyrics-data.csv')
is_english = df_lyrics['language'] == 'en'
df_lyrics = df_lyrics[is_english]
df_lyrics.head(10)

Unnamed: 0,ALink,SName,SLink,Lyric,language
69,/ivete-sangalo/,Careless Whisper,/ivete-sangalo/careless-whisper.html,I feel so unsure\nAs I take your hand and lead...,en
86,/ivete-sangalo/,Could You Be Loved / Citação Musical do Rap: S...,/ivete-sangalo/could-you-be-loved-citacao-musi...,"Don't let them fool, ya\nOr even try to school...",en
88,/ivete-sangalo/,Cruisin' (Part. Saulo),/ivete-sangalo/cruisin-part-saulo.html,"Baby, let's cruise, away from here\nDon't be c...",en
111,/ivete-sangalo/,Easy,/ivete-sangalo/easy.html,"Know it sounds funny\nBut, I just can't stand ...",en
140,/ivete-sangalo/,For Your Babies (The Voice cover),/ivete-sangalo/for-your-babies-the-voice-cover...,You've got that look again\nThe one I hoped I ...,en
147,/ivete-sangalo/,Human Nature,/ivete-sangalo/human-nature.html,Looking out\nAcross the night time\nThe city w...,en
159,/ivete-sangalo/,Losing Control (Miss Cady feat. Ivete Sangalo),/ivete-sangalo/losing-control-miss-cady-feat-i...,"Uh, yeah.\nGo, go, go.\nUh, yeah.\nUh, Uh, Uhh...",en
168,/ivete-sangalo/,Master Blaster (Jammin'),/ivete-sangalo/master-blaster-jammin.html,Everyone's feeling pretty\nIt's hotter than Ju...,en
187,/ivete-sangalo/,More Than Words,/ivete-sangalo/more-than-words.html,Saying 'I Love you'\nIs not the words I want t...,en
207,/ivete-sangalo/,Natural Collie,/ivete-sangalo/natural-collie.html,Been down in the valley\nSmoking natural colli...,en


### Merge Data

In [4]:
# Pair each lyric with its artist row by matching the lyric's ALink against the
# artist's Link. The inner join discards rows that have no match on either side.
unused_columns = ['ALink', 'Link', 'SName', 'SLink', 'language', 'Artist', 'Songs', 'Popularity']

df_merged = df_lyrics.merge(df_artists, how='inner', left_on='ALink', right_on='Link')
# Drop the join keys and metadata that are listed above.
df_merged = df_merged.drop(columns=unused_columns)
df_merged.head(10)

Unnamed: 0,Lyric,Genres
0,I feel so unsure\nAs I take your hand and lead...,Pop; Axé; Romântico
1,"Don't let them fool, ya\nOr even try to school...",Pop; Axé; Romântico
2,"Baby, let's cruise, away from here\nDon't be c...",Pop; Axé; Romântico
3,"Know it sounds funny\nBut, I just can't stand ...",Pop; Axé; Romântico
4,You've got that look again\nThe one I hoped I ...,Pop; Axé; Romântico
5,Looking out\nAcross the night time\nThe city w...,Pop; Axé; Romântico
6,"Uh, yeah.\nGo, go, go.\nUh, yeah.\nUh, Uh, Uhh...",Pop; Axé; Romântico
7,Everyone's feeling pretty\nIt's hotter than Ju...,Pop; Axé; Romântico
8,Saying 'I Love you'\nIs not the words I want t...,Pop; Axé; Romântico
9,Been down in the valley\nSmoking natural colli...,Pop; Axé; Romântico


### Keep Only the Five Most Frequent Genres

In [5]:
# Number of most frequent genres to keep.
N_TOP_GENRES = 5

# Each Genres cell is a "; "-separated list of genre names; split it once and
# reuse the resulting lists for both counting and filtering.
genre_lists = df_merged['Genres'].str.split("; ")

# Count how many lyrics carry each genre (a lyric with several genres counts
# once for each of them).
genre_counts = {}
for genres in genre_lists:
    for genre in genres:
        genre_counts[genre] = genre_counts.get(genre, 0) + 1

# Sort by descending count; the sort is stable, so ties keep first-seen order.
top_genres = sorted(genre_counts.items(), key=lambda item: -item[1])[:N_TOP_GENRES]
print(top_genres)
kept_genres = {name for name, _ in top_genres}

# Strip every genre outside the top set from each lyric, then discard lyrics
# left with no genre at all. Rows are selected with a boolean mask aligned on
# the index, so the result does not depend on row positions matching labels.
kept_lists = genre_lists.map(lambda genres: [g for g in genres if g in kept_genres])
has_genre = kept_lists.map(len) > 0
df_merged = df_merged.loc[has_genre].assign(
    Genres=kept_lists[has_genre].map("; ".join)
)

[('Rock', 63717), ('Pop', 33571), ('Pop/Rock', 26360), ('Heavy Metal', 21759), ('Hip Hop', 20902)]


### Remove Punctuation, Flatten Whitespace and Lowercase

In [6]:
# Normalise every lyric in one vectorized pass instead of a per-row iloc loop:
#   * newlines and tabs become single spaces,
#   * every character in string.punctuation is deleted,
#   * the text is lowercased.
# NOTE: string.punctuation is ASCII-only, so typographic characters such as
# the curly apostrophe (’) are left in place.
lyric_table = str.maketrans("\n\t", "  ", string.punctuation)

df_merged = df_merged.assign(
    Lyric=df_merged['Lyric'].str.translate(lyric_table).str.lower()
)
df_merged.head(10)

Unnamed: 0,Lyric,Genres
0,i feel so unsure as i take your hand and lead ...,Pop
1,dont let them fool ya or even try to school ya...,Pop
2,baby lets cruise away from here dont be confus...,Pop
3,know it sounds funny but i just cant stand the...,Pop
4,youve got that look again the one i hoped i ha...,Pop
5,looking out across the night time the city win...,Pop
6,uh yeah go go go uh yeah uh uh uhh hey hey i ...,Pop
7,everyones feeling pretty its hotter than july ...,Pop
8,saying i love you is not the words i want to h...,Pop
9,been down in the valley smoking natural collie...,Pop


### Save shuffled data

In [7]:
# Fixed seed so the shuffled file is identical on every Restart & Run All.
SHUFFLE_SEED = 42

# sample(frac=1) returns all rows in random order; write them without the
# index column, then read the file back to check what was saved.
df_merged.sample(frac=1, random_state=SHUFFLE_SEED).to_csv('cleaned-data.csv', index=False)
df = pd.read_csv('cleaned-data.csv')
df.head(10)

Unnamed: 0,Lyric,Genres
0,element of the surprise such a spectacular thi...,Hip Hop
1,i never said i’d lie and wait forever if i die...,Rock
2,come on noon you gotta get on board come on n...,Rock; Heavy Metal
3,second nature hoo ooh ooh ooh hey hey hey hey ...,Hip Hop
4,tell me how many real muthafuckas feel me i sm...,Hip Hop
5,this winters night stars shine like silver bel...,Pop
6,i bought a flat diminished responsibility your...,Pop/Rock
7,tears tears from saying goodbye tears im sorry...,Rock
8,we are all on drugs when your out with your f...,Rock
9,im trapped and my backs up against the wall i ...,Rock


### Get the Average Number of Words per Lyric

In [8]:
# Mean number of whitespace-separated words per lyric (first column of
# df_merged), computed as total words divided by number of rows.
total_words = sum(len(lyric.split()) for lyric in df_merged.iloc[:, 0])
lyric_count = len(df_merged)

total_words / lyric_count

265.78424203930194