In [1]:
import pandas as pd
import re
import json
import itertools
import string
data = pd.read_pickle("darklyrics/lyrics.pkl")

In [2]:
data.info()
data.head()
data.keys()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 238046 entries, 0 to 941
Data columns (total 5 columns):
album     238046 non-null object
artist    238046 non-null object
lyrics    238046 non-null object
name      238046 non-null object
year      238046 non-null int64
dtypes: int64(1), object(4)
memory usage: 10.9+ MB


Index([u'album', u'artist', u'lyrics', u'name', u'year'], dtype='object')

In [3]:
# Convert lyrics to lowercase
data['lyrics'] = data.lyrics.apply(lambda x: [y.lower() for y in x])

# Remove text between brackets
brackets_regex = re.compile('\[.*?\]')
data['lyrics'] = data.lyrics.apply(lambda x: [brackets_regex.sub("", y) for y in x])

# Remove apostrophes
apostrophe_regex = re.compile("[']")
data['lyrics'] = data.lyrics.apply(lambda x: [apostrophe_regex.sub("", y) for y in x])

# Remove punctuation
punctuation_regex = re.compile('[^0-9a-zA-Z ]+')
data['lyrics'] = data.lyrics.apply(lambda x: [punctuation_regex.sub(" ", y) for y in x])

# Remove double spaces
double_space_regex = re.compile('\s+')
data['lyrics'] = data.lyrics.apply(lambda x: [double_space_regex.sub(" ", y) for y in x])

# Print cleaned lyrics
data.iloc[0,2]
data.iloc[10,2]

[u'dea atra',
 u'redivivus',
 u'dea atra',
 u'gloriosa',
 u'abyss i call',
 u'you open the portals',
 u'in the great vastness of the night',
 u'let',
 u'your obedient',
 u'servant enter',
 u'the shrine and join them',
 u'in a timeless rite',
 u'',
 u'youre an elder divinity ',
 u'the pristine darkness',
 u'guiding nightmares and howling ghosts',
 u'a grim muse of a black mass',
 u'you are a thousand faced mother',
 u'of all gloom',
 u'look with great favor from above',
 u'on our hecatomb',
 u'dea atra',
 u'redivivus',
 u'dea atra',
 u'furiosa',
 u'my frantic heart',
 u'violently beats',
 u'biting into the sharp bars of the chest',
 u'all',
 u'the wounds',
 u'i ve ever had are',
 u'green once more at her ruthless behest',
 u'for this throne',
 u'you paid with many',
 u'guiltless preys',
 u'their dead eyes watch you ',
 u'but you can hold the gaze',
 u'the frail voices curse but',
 u'even cannot sore',
 u'disperse remains of these lost souls',
 u'forevermore',
 u'i am entranced',
 u'by m

In [4]:
# Create new column that places the lyrics a single string
data['lyrics_string'] = data.lyrics.apply(lambda x: " ".join(x))

In [5]:
# Remove songs with no lyrics
data = data[data.lyrics_string != ""]
data.iloc[0,2]

[u'hopeless time everything falls ideas died one by one when the wounds are',
 u'crying and bats are drawn in blood they go to the moon the clouds come and',
 u'draw the smiles of the moon in the marsh of lilac clouds reach from a',
 u'still silence of fog and we forget the night by its dawn the regret of',
 u'broken hearts you trundle in fucking pain you couldnt see you couldnt',
 u'feel like a blinded fold you bloody whore so shut up and die and fuck your',
 u'god ']

In [21]:
# Code taken from http://h6o6.com/2012/12/detecting-language-with-python-and-the-natural-language-toolkit-nltk/
# Used to detect the language on lyrics

from nltk.corpus import stopwords   # stopwords to detect language
from nltk import wordpunct_tokenize # function to split up our words
from sys import stdin               # how else should we get our input :)
 
def get_language_likelihood(input_text):
    """Return a dictionary of languages and their likelihood of being the 
    natural language of the input text
    """
 
    input_text = input_text.lower()
    input_words = wordpunct_tokenize(input_text)
 
    language_likelihood = {}
    total_matches = 0
    for language in stopwords._fileids:
        language_likelihood[language] = len(set(input_words) &
                set(stopwords.words(language)))
 
    return language_likelihood
 
def get_language(input_text):
    """Return the most likely language of the given text
    """
 
    likelihoods = get_language_likelihood(input_text)
    sorted_likelihoods = sorted(likelihoods, key=likelihoods.get, reverse=True)
    first = sorted_likelihoods[0]
    second = sorted_likelihoods[1]
    if likelihoods[second] >= 10 and first == 'english':
        return ''
    else:
        return first

In [22]:
# Remove songs that are not in english
data = data[data.apply(lambda x: get_language(x['lyrics_string']) == 'english', axis=1)]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186627 entries, 3 to 940
Data columns (total 6 columns):
album            186627 non-null object
artist           186627 non-null object
lyrics           186627 non-null object
name             186627 non-null object
year             186627 non-null int64
lyrics_string    186627 non-null object
dtypes: int64(1), object(5)
memory usage: 10.0+ MB


In [23]:
# Remove outlier years
data = data[data.year > 1913]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186609 entries, 3 to 940
Data columns (total 6 columns):
album            186609 non-null object
artist           186609 non-null object
lyrics           186609 non-null object
name             186609 non-null object
year             186609 non-null int64
lyrics_string    186609 non-null object
dtypes: int64(1), object(5)
memory usage: 10.0+ MB


In [24]:
# General statistics
print len(data['artist'].unique()) # 7451 artists
print len(data['album'].unique()) # 22296 albums
print data['year'].max() # 2016
print data['year'].min() # 1968

7433
22179
2016
1968


In [None]:
# Save cleaned dataset in pickle file
data.to_pickle("darklyrics/cleaned_lyrics.pkl")

In [None]:
data = pd.read_pickle("darklyrics/cleaned_lyrics.pkl")

## Stemming and stopword removal

In [25]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

stemmer = SnowballStemmer("english")
cachedStopWords = stopwords.words("english")

def stem_text(text, stemmer):
    return ' '.join([stemmer.stem(word) for word in text.split()])

def remove_stopwords(text, cachedStopWords):
    return ' '.join([word for word in text.split() if word not in cachedStopWords])

## Remove stopwords

In [26]:
data_normal = data.copy()
data_normal['lyrics_string'] = data_normal.lyrics_string.apply(lambda x: remove_stopwords(x, cachedStopWords))

data_stemmed = data.copy()
data_stemmed['lyrics_string'] = data_stemmed.lyrics_string.apply(lambda x: remove_stopwords(x, cachedStopWords))
data_stemmed['lyrics_string'] = data_stemmed.lyrics_string.apply(lambda x: stem_text(x, stemmer))

In [27]:
print data_stemmed['lyrics_string'].iloc[5]
print data_normal['lyrics_string'].iloc[5]

curs night rememb mad rip root curs night rememb branch hatr surround throat curs night rememb shadow disgust curs night rememb color white broken light dead light dead lagoon dark die scar cri repeat isol truth dirti passag human destroy trundl blood midst dirt human rip bodi hope beauti everyth stain everyth fall human like pig butcher everyth dark begin everyth misfortun curs night rememb everyth forgotten
cursed nights remember madness ripping rooted cursed nights remember branches hatred surrounded throat cursed nights remember shadow disgusted cursed nights remember color white broken lights dead lights dead lagoons darkness dying scars crying repeating isolation truth dirty passages humanity destroying trundle blood midst dirts humanity rip body hope beautiful everything stained everything falls human like pigs butcher everything dark beginning everything misfortune cursed nights remember everything forgotten


In [28]:
# Pickle both datasets
data_normal.to_pickle("darklyrics/data_normal.pkl")
data_stemmed.to_pickle("darklyrics/data_stemmed.pkl")

In [29]:
# Group lyrics by artist
artist_data_normal = data_normal.groupby('artist')['lyrics_string'].apply(lambda x: ' '.join(x))
artist_data_stemmed = data_stemmed.groupby('artist')['lyrics_string'].apply(lambda x: ' '.join(x))
artist_data_normal.head()


artist
'68                take heart next room make scene make sure aint...
(sic)monic         revelation resurrections human blood leak woun...
+\-                looked wide open sky window often nights augus...
...aaaarrghh...    hi lmeyi diledin mi ruhunun derinliklerinden g...
...and oceans      fanfare chants trumpets loud proclaim world ho...
Name: lyrics_string, dtype: object

In [30]:
# Save cleaned dataset in pickle file
artist_data_normal.to_pickle("darklyrics/artist_data_normal.pkl")
artist_data_stemmed.to_pickle("darklyrics/artist_data_stemmed.pkl")