# Pre-Processing Lyrics

This notebook pre-processes the lyrics. First, structural information (such as [Chorus]) and punctuation are removed from the lyrics. Then a spell check is applied and all capital letters are replaced by lowercase letters. After this first pre-processing step, the lyrics are lemmatized using the WordNet Lemmatizer. Finally, all stopwords are removed from the lyrics.

In [None]:
import numpy as np
import pandas as pd
from langdetect import detect
import re
from spellchecker import SpellChecker
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet

# Functions

In [None]:
def preprocess_lyrics(lyrics, spell):
    """Normalise raw lyrics into a lowercase, space-separated string.

    Steps, in order: flatten line breaks and lowercase, expand common English
    contractions, drop bracketed structural markers such as ``[Chorus]``,
    strip punctuation (apostrophes are kept for the spell check), spell-check
    every token, and finally remove the remaining apostrophes.

    Args:
        lyrics: Raw lyrics text. Anything that is not a string (for example
            the NaN pandas yields for an empty CSV cell) is treated as empty
            lyrics.
        spell: Object exposing ``correction(word)``, e.g. a
            ``spellchecker.SpellChecker`` instance.

    Returns:
        The cleaned lyrics as one string with tokens joined by single spaces.
    """
    if not isinstance(lyrics, str):
        return ''

    # Flatten lyrics; the padding spaces let the space-delimited patterns
    # below also match at the very start and end of the text.
    lyrics_flat = lyrics.replace('\r', '\n').replace('\n', ' ').lower()
    lyrics_flat = ' ' + lyrics_flat + ' '

    # Remove special cases (English). Order matters: specific forms such as
    # "he's" and "can't" must be handled before the generic "'s" and "n't".
    contractions = (
        ("’", "'"),
        ("'m ", " am "),
        ("'re ", " are "),
        ("'ve ", " have "),
        ("'d ", " would "),
        ("'ll ", " will "),
        (" he's ", " he is "),
        (" she's ", " she is "),
        (" it's ", " it is "),
        (" ain't ", " is not "),
        ("can't ", " cannot "),
        ("won't ", " will not "),
        ("'s ", " "),
        ("n't ", " not "),
        (" 'cause ", " because "),
        (" 'til ", " until "),
    )
    for short_form, long_form in contractions:
        lyrics_flat = lyrics_flat.replace(short_form, long_form)

    # Remove structural information (like [Chorus] etc.)
    lyrics_flat = re.sub(r"\[.*?\]", "", lyrics_flat)

    # Remove punctuation and weird signs except apostrophe (like "nothin'"
    # etc.), which is kept until after the spell check.
    punctuation = ',";:.?!(){}/\\_|-@#*[]'
    lyrics_flat = lyrics_flat.translate(str.maketrans('', '', punctuation))

    # Spell checking. ``correction`` may return None when no candidate is
    # found; keep the original token in that case.
    lyrics_tokens = [spell.correction(token) or token
                     for token in lyrics_flat.split()]

    # Remove all remaining apostrophes, then re-split so that tokens which
    # consisted only of apostrophes disappear.
    lyrics_flat = ' '.join(lyrics_tokens).replace("'", "")
    lyrics_tokens = lyrics_flat.split()

    # Join tokens to string
    return ' '.join(lyrics_tokens)

In [None]:
# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag selects adjective, verb or adverb. Every
    other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    first_letter = tagged[0][1][0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and all unmapped tags are treated as nouns
    return wordnet.NOUN

# Pre-process Lyrics

- Remove structural information (like [Chorus]) and punctuation
- Spell checking
- All lowercase


In [None]:
# Load raw lyrics data
data = pd.read_csv('/path/to/lyrics_data.csv',index_col=0)

# Initialising
lyrics_preprocessed = []
spell = SpellChecker()

# Pre-process lyrics
for row in data.itertuples():
    # ATTENTION:
    # The positional index of the lyrics field in row might differ
    # depending on the dataset used! (Position 0 is the DataFrame index.)
    lyrics = row[7]
    tokens = preprocess_lyrics(lyrics,spell)
    lyrics_preprocessed.append(tokens)

# Add to dataframe
data['lyrics_preprocessed'] = lyrics_preprocessed

# Save to csv file
data.to_csv(r'./path/to/lyrics_preprocessed.csv')

# Lemmatizing and removal of stopwords

In [None]:
# Initialising
lemmatizer = WordNetLemmatizer()
lyrics_lemma_no_sw = []
# Build the stopword set once: stopwords.words() re-reads the corpus on every
# call, and membership tests on a set are O(1). Called without arguments it
# returns the stopwords of all languages in the corpus.
stop_words = set(stopwords.words())

for row in data.itertuples():
    # ATTENTION:
    # The positional index of the pre-processed lyrics field in row might
    # differ depending on the dataset used! (Position 0 is the DataFrame index.)
    tokens = row[9].split()

    # Lemmatize tokens
    tokens_lemma = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokens]
    # Remove stopwords
    tokens_lemma = [word for word in tokens_lemma if word not in stop_words]
    # Join tokens to string
    lyrics_lemma = ' '.join(tokens_lemma)
    # Add to data
    lyrics_lemma_no_sw.append(lyrics_lemma)

# Add to data
data['lyrics_lemma_no_sw'] = lyrics_lemma_no_sw

data.to_csv(r'./path/to/lyrics_lemma_no_sw.csv')