In [None]:
import pandas as pd
import re
import json
import itertools
import string
# Load the raw lyrics table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; presumably this file was
# produced by this project's own scraper - confirm before loading a copy
# obtained from anywhere else.
data = pd.read_pickle("darklyrics/lyrics.pkl")

In [None]:
# Quick look at the raw table: schema report, a few sample rows, column index.
data.info()  # prints its report directly
# Only the last expression of a cell is rendered automatically, so the row
# preview has to be displayed explicitly or its output is silently discarded.
display(data.head())
data.keys()

In [None]:
# Clean every lyric line: lowercase, drop [bracketed] text, replace
# punctuation with spaces, then collapse runs of whitespace.
#
# Bracketed text must be removed BEFORE punctuation is stripped: once the
# punctuation pass has turned "[" and "]" into spaces, the bracket pattern can
# never match and the bracketed text would stay in the lyrics.
# Raw strings keep the regex escapes from being read as (invalid) string escapes.
brackets_regex = re.compile(r'\[.*?\]')
punctuation_regex = re.compile(r'[^0-9a-zA-Z ]+')
double_space_regex = re.compile(r'\s+')


def clean_lyric_line(line):
    """Return one lyric line normalised for text analysis.

    Args:
        line: A single line of lyrics as a string.

    Returns:
        The lowercased line with bracketed text removed, every run of
        characters other than letters, digits and spaces replaced by a space,
        and runs of whitespace collapsed to a single space.
    """
    line = line.lower()
    line = brackets_regex.sub("", line)
    line = punctuation_regex.sub(" ", line)
    return double_space_regex.sub(" ", line)


# Each cell of the 'lyrics' column is iterated as a collection of lines.
data['lyrics'] = data.lyrics.apply(lambda x: [clean_lyric_line(y) for y in x])

# Show two cleaned entries. Only the last expression of a cell is rendered
# automatically, so the first one is displayed explicitly.
display(data.iloc[0, 2])
data.iloc[3, 2]

In [None]:
# Flatten each song's list of lines into one space-separated string and keep
# it in a new 'lyrics_string' column.
data['lyrics_string'] = data['lyrics'].map(" ".join)

In [None]:
# Remove songs with no lyrics.
# A song whose lines are all empty does not join to "": the separator spaces
# remain (two empty lines join to " "). Compare the stripped text so those
# whitespace-only songs are dropped as well.
data = data[data['lyrics_string'].str.strip() != ""]
data.iloc[0, 2]

In [None]:
# Language detection for the lyrics, adapted from
# http://h6o6.com/2012/12/detecting-language-with-python-and-the-natural-language-toolkit-nltk/

from nltk.corpus import stopwords   # stopwords to detect language
from nltk import wordpunct_tokenize # function to split up our words
from sys import stdin               # how else should we get our input :)
 
_STOPWORD_SETS = {}  # language name -> frozenset of its stopwords, filled on first use


def _stopword_set(language):
    """Return the stopwords of `language` as a set, loading the word list only once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def get_language_likelihood(input_text):
    """Return a dictionary of languages and their likelihood of being the
    natural language of the input text.

    The score of a language is the number of distinct tokens of the
    lowercased text that occur in that language's stopword list.

    Args:
        input_text: The text to score, as a string.

    Returns:
        A dict mapping every language of the stopwords corpus to its integer
        score, in the order the corpus lists the languages.
    """
    # Tokenise once; the same token set is intersected with every language.
    input_words = set(wordpunct_tokenize(input_text.lower()))

    return {
        language: len(input_words & _stopword_set(language))
        for language in stopwords.fileids()
    }
 
def get_language(input_text):
    """Return the most likely language of the given text.

    Args:
        input_text: The text to classify, as a string.

    Returns:
        The language with the highest score from get_language_likelihood.
        When several languages share the top score, the one that comes first
        in that dictionary is returned.
    """
    likelihoods = get_language_likelihood(input_text)
    # max() returns the first top-scoring key - the same key a stable
    # descending sort puts at index 0 - without sorting every language.
    return max(likelihoods, key=likelihoods.get)

In [None]:
# Remove songs that are not in english.
# Only the lyrics text is needed, so the detector is mapped over that single
# column instead of building a row object per song with DataFrame.apply(axis=1).
# This also yields a proper boolean mask when the frame is empty.
is_english = data['lyrics_string'].apply(get_language) == 'english'
data = data[is_english]
data.info()

In [None]:
# Keep only songs dated after 1913; earlier years are treated as outliers.
data = data.loc[data['year'].gt(1913)]
data.info()

In [None]:
# General statistics.
# Only the last expression of a cell is rendered, so the four figures are
# gathered into one Series instead of three of them being computed and dropped.
# Values recorded in the original notebook: 7451 artists, 22296 albums,
# years 1968 to 2016.
pd.Series({
    # dropna=False counts a missing value as one entry, like len(unique()) did.
    'artists': data['artist'].nunique(dropna=False),
    'albums': data['album'].nunique(dropna=False),
    'latest year': data['year'].max(),
    'earliest year': data['year'].min(),
})

In [None]:
# Save the cleaned dataset (non-empty English lyrics, year after 1913) in a
# pickle file. The lyrics are not stemmed yet at this point.
data.to_pickle("darklyrics/cleaned_lyrics.pkl")

In [None]:
# Reload the cleaned dataset from the pickle written by the save cell above;
# presumably a checkpoint so the steps below can run without repeating the
# cleaning.
data = pd.read_pickle("darklyrics/cleaned_lyrics.pkl")

## Stemming

In [None]:
from nltk.stem.snowball import SnowballStemmer

# Snowball stemmer configured for English; passed to stem_text below.
stemmer = SnowballStemmer("english")

def stem_text(text, stemmer):
    """Stem every whitespace-separated word of `text`.

    Args:
        text: The string to process.
        stemmer: An object whose `stem(word)` method returns the stem of a word.

    Returns:
        The stems joined by single spaces, in the original word order.
    """
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

# Replace each song's lyrics string with its stemmed form.
data['lyrics_string'] = data['lyrics_string'].map(lambda lyric: stem_text(lyric, stemmer))

## Remove stopwords

In [None]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

# The lyrics were already stemmed in the previous section, so a stopword can
# show up either unchanged or in its stemmed form; filter on both.
english_stopwords = set(stopwords.words("english"))
stopword_stems = english_stopwords | {stemmer.stem(word) for word in english_stopwords}


def remove_stopwords(text, stop_words):
    """Drop every whitespace-separated word of `text` found in `stop_words`.

    Args:
        text: The string to filter.
        stop_words: A container of words to remove (membership is tested
            with `in`).

    Returns:
        The remaining words joined by single spaces, in their original order.
    """
    return ' '.join(word for word in text.split() if word not in stop_words)


data['lyrics_string'] = data.lyrics_string.apply(lambda x: remove_stopwords(x, stopword_stems))

In [None]:
# Inspect the first row's value in column position 5.
# NOTE(review): presumably the processed 'lyrics_string' column - confirm the
# column order of the loaded frame.
data.iat[0, 5]

In [None]:
# Group lyrics by artist: concatenate all of an artist's lyrics strings into
# one space-separated string per artist, and save the result.
artist_data = data.groupby('artist')['lyrics_string'].agg(' '.join)
artist_data.to_pickle("darklyrics/artist_data.pkl")
# The preview comes last: only a cell's final expression is rendered, so a
# head() call placed before the save would be computed and thrown away.
artist_data.head()

In [None]:
# Save the fully processed dataset (after the stemming / stopword steps above)
# in a pickle file.
# Note the file name: this is "clean_lyrics.pkl", not the "cleaned_lyrics.pkl"
# written earlier for the unstemmed data.
data.to_pickle("darklyrics/clean_lyrics.pkl")