In [1]:
import pandas as pd
import spacy 
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
import texthero as hero
import string

In [2]:
# Load the spaCy English pipeline once; `spacy_en` is called on lyric text further down.
spacy_en = spacy.load('en_core_web_md')

In [3]:
# Read the scraped lyrics and discard the leftover 'Unnamed: 0' column in one chain.
lyrics_df = (
    pd.read_csv('lyrics_artists.csv')
      .drop(columns='Unnamed: 0')
)
lyrics_df.tail()

Unnamed: 0,songlink,lyrics,artist
185,Caught Somewhere in Time,If you had the time to lose\nAn open mind and ...,Iron Maiden
186,Number of the Beast (Original Version),"""Woe to you, oh Earth and sea, for the Devil s...",Iron Maiden
187,Speed Of Light,Another time another place\nA hollow universe ...,Iron Maiden
188,Heaven Can Wait,"Can't understand what is happening to me,\nThi...",Iron Maiden
189,Can I Play with Madness,Can I play with madness?\nGive me the sense to...,Iron Maiden


In [4]:
# Integer code for each artist.
encode = {'Metallica':0, 'Iron Maiden':1}
artist_codes = lyrics_df['artist'].map(encode)

# Series.map yields NaN for any value missing from the dict (which also turns the
# column float), so stop here instead of carrying uncoded rows forward.
unmapped = lyrics_df.loc[artist_codes.isna(), 'artist'].unique().tolist()
assert not unmapped, f"artists missing from `encode`: {unmapped}"

lyrics_df['artist_code'] = artist_codes

In [18]:
# Re-inspect the last rows now that `artist_code` has been added.
lyrics_df.tail()

Unnamed: 0,songlink,lyrics,artist,artist_code
185,Caught Somewhere in Time,If you had the time to lose\nAn open mind and ...,Iron Maiden,1
186,Number of the Beast (Original Version),"""Woe to you, oh Earth and sea, for the Devil s...",Iron Maiden,1
187,Speed Of Light,Another time another place\nA hollow universe ...,Iron Maiden,1
188,Heaven Can Wait,"Can't understand what is happening to me,\nThi...",Iron Maiden,1
189,Can I Play with Madness,Can I play with madness?\nGive me the sense to...,Iron Maiden,1


In [29]:
# Similarity between two songs, one per artist, using spaCy's Doc.similarity.

first_idx, second_idx = 0, 187  # Fade to Black (MT), Speed of Light (IM)
song1 = spacy_en(lyrics_df.loc[first_idx, 'lyrics'])
song2 = spacy_en(lyrics_df.loc[second_idx, 'lyrics'])
score = round(song1.similarity(song2), 2)
print("The similarity between", lyrics_df.loc[first_idx, 'songlink'], "and", lyrics_df.loc[second_idx, 'songlink'], "is", score)

The similarity between Fade To Black and Speed Of Light is 0.97


## Text cleaning
- **Stemming:** reduce words to their root, by removing pre-/suffixes (e.g. *walking --> walk*)
- **Lemmatization:** reduce words to their dictionary form (lemma), using vocabulary and morphology rather than just stripping pre-/suffixes (e.g. *better --> good*)
- **Stopwords:** remove very common words that carry little meaning on their own (e.g. *the*, *is*, *and*).
- **POS-Tagging:** identify the part-of-speech of the words.

In [5]:
# Swap each newline in the raw lyrics for a space; the result goes in a new column
# so the original `lyrics` text is left untouched.

newline_free = lyrics_df['lyrics'].replace(to_replace=r'\n', value=' ', regex=True)
lyrics_df['lyrics_clean'] = newline_free
lyrics_df.head()

Unnamed: 0,songlink,lyrics,artist,artist_code,lyrics_clean
0,Fade To Black,Life it seems to fade away\nDrifting further e...,Metallica,0,Life it seems to fade away Drifting further ev...
1,Ride The Lightning,"Guilty as charged\nBut damn it, it ain't right...",Metallica,0,"Guilty as charged But damn it, it ain't right ..."
2,Fight Fire With Fire,Do unto others as they have done unto you\nBut...,Metallica,0,Do unto others as they have done unto you But ...
3,Iced Honey,You can't put a butterfly in a jar\nIf the eff...,Metallica,0,You can't put a butterfly in a jar If the effo...
4,Battery,"Lashing out the action, returning the reaction...",Metallica,0,"Lashing out the action, returning the reaction..."


In [6]:
# texthero's clean(): lowercases the text and strips punctuation and stopwords.
# The cleaned text overwrites the newline-free `lyrics_clean` column.

cleaned = hero.clean(lyrics_df['lyrics_clean'])
lyrics_df['lyrics_clean'] = cleaned
lyrics_df.head()

Unnamed: 0,songlink,lyrics,artist,artist_code,lyrics_clean
0,Fade To Black,Life it seems to fade away\nDrifting further e...,Metallica,0,life seems fade away drifting everyday getting...
1,Ride The Lightning,"Guilty as charged\nBut damn it, it ain't right...",Metallica,0,guilty charged damn right someone else control...
2,Fight Fire With Fire,Do unto others as they have done unto you\nBut...,Metallica,0,unto others done unto hell world coming blow u...
3,Iced Honey,You can't put a butterfly in a jar\nIf the eff...,Metallica,0,put butterfly jar effort high matter catch moo...
4,Battery,"Lashing out the action, returning the reaction...",Metallica,0,lashing action returning reaction weak ripped ...


In [9]:
# Keep only the artist, its numeric code and the cleaned lyrics, then write them to a new CSV.
# NOTE: to_csv is left at its default index=True, so the row index is written as an
# unnamed first column.

export_columns = ['artist','artist_code','lyrics_clean']
lyrics_df_clean = lyrics_df[export_columns]
lyrics_df_clean.to_csv('lyrics_artists_clean.csv')