In [1]:
import pandas as pd
import re
import json
import itertools
import string
# Load the scraped lyrics frame.
# NOTE: unpickling can execute arbitrary code, so only read pickle files that
# come from a trusted source.
RAW_LYRICS_PATH = "darklyrics/lyrics.pkl"
data = pd.read_pickle(RAW_LYRICS_PATH)

In [2]:
# Quick look at the raw frame. info() prints the schema itself; of the two
# remaining expressions only the last one (the column index) is rendered as
# the cell output, so head() is evaluated but not shown.
data.info()
data.head()
data.columns

<class 'pandas.core.frame.DataFrame'>
Int64Index: 238046 entries, 0 to 941
Data columns (total 5 columns):
album     238046 non-null object
artist    238046 non-null object
lyrics    238046 non-null object
name      238046 non-null object
year      238046 non-null int64
dtypes: int64(1), object(4)
memory usage: 10.9+ MB


Index([u'album', u'artist', u'lyrics', u'name', u'year'], dtype='object')

In [3]:
# Normalise every lyric line: lowercase it, drop bracketed annotations,
# strip punctuation and collapse runs of whitespace.
#
# Order matters: the bracket pattern has to run BEFORE the punctuation
# pattern. The punctuation pattern deletes '[' and ']' themselves, so if it
# runs first there is nothing left for the bracket pattern to match and the
# text between the brackets stays in the lyrics.
brackets_regex = re.compile(r'\[.*?\]')
punctuation_regex = re.compile(r'[^0-9a-zA-Z ]+')
double_space_regex = re.compile(r'\s+')


def clean_lyric_line(line):
    """Return one lyric line lowercased, without bracketed text or punctuation.

    The steps are applied in this order: lowercase, remove ``[...]`` spans,
    remove every character that is not an ASCII letter, digit or space, then
    collapse whitespace runs into a single space.
    """
    line = line.lower()
    line = brackets_regex.sub("", line)
    # NOTE(review): punctuation is deleted rather than replaced by a space,
    # so words separated only by punctuation are fused (a line such as
    # "see...you" becomes "seeyou"). Left unchanged so that apostrophes keep
    # collapsing ("couldn't" -> "couldnt"); confirm this is the intent.
    line = punctuation_regex.sub("", line)
    return double_space_regex.sub(" ", line)


# Single pass over the column instead of one pass per cleaning step.
data['lyrics'] = data.lyrics.apply(
    lambda lines: [clean_lyric_line(line) for line in lines])

# Show one cleaned song as a sanity check (column selected by name rather
# than by position so the cell does not depend on column order).
data['lyrics'].iloc[3]

[u'hopeless time everything falls ideas died one by one when the wounds are',
 u'crying and bats are drawn in blood they go to the moon the clouds come and',
 u'draw the smiles of the moon in the marsh of lilac clouds reach from a',
 u'still silence of fog and we forget the night by its dawn the regret of',
 u'broken hearts you trundle in fucking pain you couldnt seeyou couldnt',
 u'feel like a blinded fold you bloody whore so shut up and die and fuck your',
 u'god']

In [4]:
# Create a new column that holds each song's lyrics as a single string,
# with the individual lines separated by one space.
data['lyrics_string'] = data['lyrics'].apply(" ".join)

In [6]:
# Remove songs with no lyrics.
# A song whose lines are all empty still joins to a string of spaces
# (" ".join(["", ""]) == " "), so compare the stripped string: a plain
# != "" test would let those whitespace-only songs through.
data = data[data.lyrics_string.str.strip() != ""]

# Show the first remaining song (column selected by name, not position).
data['lyrics'].iloc[0]

[u'hopeless time everything falls ideas died one by one when the wounds are',
 u'crying and bats are drawn in blood they go to the moon the clouds come and',
 u'draw the smiles of the moon in the marsh of lilac clouds reach from a',
 u'still silence of fog and we forget the night by its dawn the regret of',
 u'broken hearts you trundle in fucking pain you couldnt seeyou couldnt',
 u'feel like a blinded fold you bloody whore so shut up and die and fuck your',
 u'god']

In [7]:
from nltk.corpus import stopwords   # stopwords to detect language
from nltk import wordpunct_tokenize # function to split up our words
from sys import stdin               # how else should we get our input :)
 
def get_language_likelihood(input_text):
    """Score every stopword language against the input text.

    The text is lowercased and tokenised with ``wordpunct_tokenize``. For each
    language in the NLTK ``stopwords`` corpus, the score is the number of
    distinct tokens that are also stopwords of that language.

    Parameters:
        input_text: the text to analyse.

    Returns:
        A dict mapping each stopword language name to its integer score.
        Every language gets an entry, with 0 when nothing matches.
    """
    # Tokenise and build the token set once; it does not change per language.
    input_words = set(wordpunct_tokenize(input_text.lower()))

    language_likelihood = {}
    # fileids() is the public accessor for the corpus' language list.
    for language in stopwords.fileids():
        language_likelihood[language] = len(
            input_words & set(stopwords.words(language)))

    return language_likelihood
 
def get_language(input_text):
    """Name the language whose stopwords best match ``input_text``.

    Scores are taken from ``get_language_likelihood``; the language with the
    highest score is returned.
    """
    scores = get_language_likelihood(input_text)
    # Rank (language, score) pairs from best to worst; the sort is stable, so
    # tied languages keep the dict's iteration order.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    best_language = ranked[0][0]
    return best_language

In [8]:
# Remove songs that are not in english.
# Language detection only needs the joined lyric text, so map over that one
# column instead of using a row-wise DataFrame.apply(axis=1), which builds a
# Series for every row just to read a single field.
is_english = data['lyrics_string'].apply(get_language) == 'english'
data = data[is_english]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 189141 entries, 3 to 940
Data columns (total 6 columns):
album            189141 non-null object
artist           189141 non-null object
lyrics           189141 non-null object
name             189141 non-null object
year             189141 non-null int64
lyrics_string    189141 non-null object
dtypes: int64(1), object(5)
memory usage: 10.1+ MB


In [22]:
# Drop songs whose year is an outlier: keep only rows with year after 1913.
has_plausible_year = data['year'] > 1913
data = data.loc[has_plausible_year]
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 189123 entries, 3 to 940
Data columns (total 6 columns):
album            189123 non-null object
artist           189123 non-null object
lyrics           189123 non-null object
name             189123 non-null object
year             189123 non-null int64
lyrics_string    189123 non-null object
dtypes: int64(1), object(5)
memory usage: 10.1+ MB


In [23]:
# General statistics. Only the final expression is rendered as the cell
# output; the trailing comments record the values seen for the others.
data.artist.unique().size  # 7451 artists
data.album.unique().size   # 22296 albums
data.year.max()  # 2016
data.year.min()  # 1968

1968

In [25]:
# Persist the cleaned frame so later steps can load it directly.
CLEANED_LYRICS_PATH = "darklyrics/cleaned_lyrics.pkl"
data.to_pickle(CLEANED_LYRICS_PATH)