In [1]:
from tqdm import tqdm # progress bar when long task
import pandas as pd
import numpy as np
import pickle
import re
import os

from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.corpus import stopwords
from nltk import Counter
import spacy
nlp = spacy.load("en")

## LyricsPreprocessing CLASS

In [2]:
class LyricsPreprocessing():
    
    
    def __init__(self, lyrics_df):
        
        self.lyrics_df = lyrics_df
        
        
    def preliminary_preprocessing(self, lyrics_df=None, replace_number_with="0", every_digit=True):
        
        if lyrics_df is None:
            lyrics_df = self.lyrics_df
        
        # lowercase
        print("Converting to lower case ...", flush=True)
        lyrics_df.loc[:, "lyrics"] = lyrics_df.lyrics.apply(lambda text: text.strip().lower())
        
        # replace numbers
        print(f"Replacing numbers with {replace_number_with} ...", flush=True)
        if every_digit:
            numbers = re.compile('[0-9]+')
        else:
            numbers = re.compile('[0-9]')
        
        lyrics_df.loc[:, "lyrics"] = lyrics_df.lyrics.apply(lambda text: re.sub(numbers, replace_number_with, text))
        
        return lyrics_df

    
    def tokenize(self, lyrics_df=None, save=False):
        
        if lyrics_df is None:
            lyrics_df = self.lyrics_df
        
        print("Tokenizing ...", flush=True)
        tokenized_corpus = [[token.text for token in nlp(lyrics)] for lyrics in tqdm(lyrics_df.loc[:, "lyrics"].to_list())]
        self.tokenized_corpus = tokenized_corpus
        
        if not save is False:
            self.save(tokenized_corpus, 'tokenized_corpus', how=save)
        
        return tokenized_corpus
    
    
    def core_preprocessing(self, lyrics_df, save=False):
        
        if lyrics_df is None:
            lyrics_df = self.lyrics_df
        
        lemmatized_corpus = []
        tagged_lemmatized_corpus = []
        trimmed_corpus = []
        tagged_trimmed_corpus = []
        for lyrics in tqdm(lyrics_df.loc[:, "lyrics"].to_list()):
            lemmas = []
            tagged_lemmas = []
            trimmed_lemmas = []
            tagged_trimmed_lemmas = []
            for token in nlp(lyrics):
                # remove all tokens with length <= 2
                if len(str(token)) > 2:
                    # lemmatize and get POS
                    lemma = token.lemma_
                    pos = token.pos_
                    lemmas.append(lemma)
                    tagged_lemmas.append(lemma + "_" + pos)
                    # keep only if POS in ['NOUN', 'VERB', 'ADJ', 'ADV','INTJ', 'X']
                    if pos in ['NOUN', 'VERB', 'ADJ', 'ADV', 'INTJ', 'X']:
                        trimmed_lemmas.append(lemma)
                        tagged_trimmed_lemmas.append(lemma + "_" + pos)
            lemmatized_corpus.append(lemmas)
            tagged_lemmatized_corpus.append(tagged_lemmas)
            trimmed_corpus.append(trimmed_lemmas)
            tagged_trimmed_corpus.append(tagged_trimmed_lemmas)
        
        self.lemmatized_corpus = lemmatized_corpus
        self.tagged_lemmatized_corpus = tagged_lemmatized_corpus
        self.trimmed_corpus = trimmed_corpus
        self.tagged_corpus = tagged_trimmed_corpus
        
        if not save is False:
            self.save(lemmatized_corpus, 'lemmatized_corpus', how=save)
            self.save(tagged_lemmatized_corpus, 'tagged_lemmatized_corpus', how=save)
            self.save(trimmed_corpus, 'trimmed_corpus', how=save)
            self.save(tagged_trimmed_corpus, 'tagged_trimmed_corpus', how=save)
        
           
    def join_collocations(self, preprocessed_corpus, most_common=20):
        
        # separate each lyrics with a space in order to avoid collocations between different songs
        words = []
        for idx, lyrics in enumerate(preprocessed_corpus):
            # not for the first
            if idx > 0:
                words.append(' ')
            for token in lyrics:
                words.append(token)
            
        finder = BigramCollocationFinder.from_words(words)
        bgm = BigramAssocMeasures()
        score = bgm.mi_like
        collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}
        
        return Counter(collocations).most_common(most_common)
    
    
    def save(self, obj, filename, path='/Users/lucamasserano/Desktop/BOCCONI/nlp/final_project/lyrics_project/data/', how='pickle'):
        
        if how == 'pickle':
            with open(os.path.join(path, filename + '.pickle'), 'wb') as pickled_obj:
                pickle.dump(obj, pickled_object)
               

### Analysis

In [3]:
lyrics_df = pd.read_csv('/Users/lucamasserano/Desktop/BOCCONI/nlp/final_project/lyrics_project/data/lyrics_cleaned.csv')
lyrics_df.shape

(43844, 7)

In [4]:
lyrics_preprocess = LyricsPreprocessing(lyrics_df=lyrics_df)

In [5]:
lyrics_cleaned = lyrics_preprocess.preliminary_preprocessing()
lyrics_cleaned.iloc[4758, :]

Converting to lower case ...
Replacing numbers with 0 ...


artist                                          Bruce Springsteen
song                                         Racing In The Street
album                                Darkness on the Edge of Town
release_date                                           1978-06-02
genre                                                        Rock
lyrics          i got a sixty-nine chevy with a 0 fuelie heads...
year                                                         1978
Name: 4758, dtype: object

In [6]:
lyrics_preprocess.core_preprocessing(lyrics_df, save='pickle')

100%|██████████| 43844/43844 [57:34<00:00, 13.05it/s]  


NameError: name 'pickled_object' is not defined

In [None]:
# tokenization took some time to finish. Let's pickle it to make it easily reusable
with open('./data/trimmed_corpus.pickle', 'rb') as pickled_object:
    trimmed_corpus = pickle.load(pickled_object)

In [None]:
genre_idxs = lyrics_df.loc[lyrics_df.genre == "Hip-Hop", :].index

In [None]:
lyrics_preprocess.join_collocations(np.array(trimmed_corpus)[genre_idxs].tolist(), most_common=100)

# STATISTICS 

## Tokens 

## Types

## Collocations