# TEXT PREPROCESSING

In [1]:
from tqdm import tqdm # progress bar when long task
import pandas as pd
import numpy as np
import spacy
import pickle
import re

In [2]:
# Load the cleaned lyrics table from the project's data folder. The path is relative,
# like the pickle paths used further down, so the notebook is not tied to one machine.
# NOTE(review): assumes the notebook is run from the project root (the folder that
# contains ./data) — confirm.
lyrics_df = pd.read_csv("./data/lyrics_cleaned.csv")
lyrics_df.shape

(43844, 7)

In [3]:
# Load spaCy's English pipeline through the "en" shortcut name; `nlp` is reused by the
# tokenization and lemmatization cells below.
# NOTE(review): shortcut names such as "en" are not accepted by spaCy v3+ — confirm the
# spaCy version this notebook targets, or load the model by its full package name.
nlp = spacy.load("en")

Example: "Racing in the Street" by Bruce Springsteen (row 4758 of the lyrics table)

In [4]:
# Show the lyrics of the example song (row label 4758) as loaded from the CSV.
lyrics_df.loc[4758, "lyrics"]

" I got a sixty-nine Chevy with a 396 Fuelie heads and a Hurst on the floor She’s waiting tonight down in the parking lot Outside the Seven-Eleven store Me and my partner Sonny built her straight out of scratch And he rides with me from town to town We only run for the money got no strings attached We shut ’em up and then we shut ’em down   Tonight, tonight the strip’s just right I wanna blow 'em off in my first heat Summer’s here and the time is right For racin’ in the street   We take all the action we can meet And we cover all the northeast states When the strip shuts down we run ’em in the street From the fire roads to the interstate Now some guys they just give up living And start dying little by little, piece by piece Some guys come home from work and wash up Then go racin’ in the street   Tonight, tonight the strip’s just right I wanna blow ’em all out of their seats We're calling out around the world We’re going racin’ in the street   I met her on the strip three years ago In a

## Common preprocessing

### 1. Lowercase 

In [14]:
# Trim surrounding whitespace and lowercase every lyric with pandas' vectorized string
# methods; unlike a per-row lambda, missing values pass through instead of raising.
lyrics_df.loc[:, "lyrics"] = lyrics_df["lyrics"].str.strip().str.lower()

In [15]:
# Peek at the first 100 characters of the example song to check the lowercasing.
# NOTE(review): the saved output already shows "000" where the raw text had "396", so the
# number-replacement cell had evidently run before this one — re-run the notebook top to bottom.
lyrics_df.loc[4758, "lyrics"][:100]

'i got a sixty-nine chevy with a 000 fuelie heads and a hurst on the floor she’s waiting tonight down'

### 2. Replace numbers

In [10]:
# Mask digits: each individual digit character is replaced by "0", so "396" becomes "000".
numbers = re.compile('[0-9]')
lyrics_df.loc[:, "lyrics"] = lyrics_df["lyrics"].apply(
    lambda lyric: numbers.sub("0", lyric)
)

In [18]:
# Re-check the first 100 characters of the example song: every digit should now read "0".
lyrics_df.loc[4758, "lyrics"][:100]

'i got a sixty-nine chevy with a 000 fuelie heads and a hurst on the floor she’s waiting tonight down'

## Tokenization

In [None]:
# Only the token texts are kept here, so run just the tokenizer (`nlp.make_doc`) instead
# of the whole pipeline: the statistical components' annotations would be computed and
# then thrown away.
# Assumes no pipeline component merges or splits tokens (true for the stock English
# model) — TODO confirm if a custom component is ever added.
tokenized_corpus = [
    [token.text for token in nlp.make_doc(lyrics)]
    for lyrics in tqdm(lyrics_df.loc[:, "lyrics"].to_list())
]

In [None]:
# Tokenizing the whole corpus is slow, so persist the result for reuse in later sessions.
with open('./data/tokenized_corpus.pickle', 'wb') as out_file:
    pickle.dump(tokenized_corpus, out_file)

In [None]:
# Restore the previously pickled tokenized corpus instead of re-tokenizing.
with open('./data/tokenized_corpus.pickle', "rb") as in_file:
    tokenized_corpus = pickle.load(in_file)

In [None]:
# Inspect the token list of one song.
# NOTE(review): this looks at list position 5001, whereas the example song used elsewhere
# in the notebook is row 4758 — confirm which song was meant.
tokenized_corpus[5001]

## Lemmatization, Stopwords and Punctuation

In [None]:
# Lemmatize every song once and build two parallel corpora (one entry per song):
#   lemmatized_corpus         - the lemma of every token
#   trimmed_lemmatized_corpus - the same lemmas without stop words and punctuation
# `nlp.pipe` streams the texts through the pipeline in batches, which is spaCy's
# recommended way to process many documents; it yields one Doc per text, in order.
# NOTE(review): whitespace tokens (e.g. ' ') appear to be neither stop words nor
# punctuation, so they stay in the trimmed corpus — add `token.is_space` to the filter
# if that is unintended.
corpus_texts = lyrics_df.loc[:, "lyrics"].to_list()
lemmatized_corpus = []
trimmed_lemmatized_corpus = []
for doc in tqdm(nlp.pipe(corpus_texts), total=len(corpus_texts)):
    lemmas = [token.lemma_ for token in doc]
    # Logical `or`/`not` instead of bitwise `&` on plain Python booleans.
    trimmed_lemmas = [
        token.lemma_ for token in doc if not (token.is_stop or token.is_punct)
    ]
    lemmatized_corpus.append(lemmas)
    trimmed_lemmatized_corpus.append(trimmed_lemmas)

In [None]:
# Lemmatizing the whole corpus is slow, so persist both corpora for reuse in later sessions.
with open('./data/lemmatized_corpus.pickle', 'wb') as out_file:
    pickle.dump(lemmatized_corpus, out_file)

with open('./data/trimmed_lemmatized_corpus.pickle', 'wb') as out_file:
    pickle.dump(trimmed_lemmatized_corpus, out_file)

In [12]:
# Restore the full and the trimmed lemmatized corpora from their pickle files.
with open('./data/lemmatized_corpus.pickle', "rb") as in_file:
    lemmatized_corpus = pickle.load(in_file)

with open('./data/trimmed_lemmatized_corpus.pickle', "rb") as in_file:
    trimmed_lemmatized_corpus = pickle.load(in_file)

In [13]:
# Inspect the lemmas of the example song (list position 4758).
# NOTE(review): the saved output still contains '396' and a leading ' ' token, so the
# pickle appears to predate the strip / digit-replacement steps — consider regenerating it.
lemmatized_corpus[4758]

[' ',
 'i',
 'get',
 'a',
 'sixty',
 '-',
 'nine',
 'chevy',
 'with',
 'a',
 '396',
 'fuelie',
 'head',
 'and',
 'a',
 'hurst',
 'on',
 'the',
 'floor',
 '-PRON-',
 '’',
 'wait',
 'tonight',
 'down',
 'in',
 'the',
 'parking',
 'lot',
 'outside',
 'the',
 'seven',
 '-',
 'eleven',
 'store',
 '-PRON-',
 'and',
 '-PRON-',
 'partner',
 'sonny',
 'build',
 '-PRON-',
 'straight',
 'out',
 'of',
 'scratch',
 'and',
 '-PRON-',
 'ride',
 'with',
 '-PRON-',
 'from',
 'town',
 'to',
 'town',
 '-PRON-',
 'only',
 'run',
 'for',
 'the',
 'money',
 'get',
 'no',
 'string',
 'attach',
 '-PRON-',
 'shut',
 '-PRON-',
 'up',
 'and',
 'then',
 '-PRON-',
 'shut',
 '-PRON-',
 'down',
 '  ',
 'tonight',
 ',',
 'tonight',
 'the',
 'strip',
 '’',
 'just',
 'right',
 'i',
 'wanna',
 'blow',
 '-PRON-',
 'off',
 'in',
 '-PRON-',
 'first',
 'heat',
 'summer',
 '’s',
 'here',
 'and',
 'the',
 'time',
 'be',
 'right',
 'for',
 'racin',
 "'",
 'in',
 'the',
 'street',
 '  ',
 '-PRON-',
 'take',
 'all',
 'the',
 'ac

In [None]:
# Inspect the stop-word- and punctuation-free lemmas of one song.
# NOTE(review): position 5001 differs from the example song (4758) inspected for
# lemmatized_corpus — confirm which song was meant.
trimmed_lemmatized_corpus[5001]

# STATISTICS 

## Tokens 

## Types

## Collocations

# ANALYSIS-SPECIFIC PREPROCESSING  

## 1. TF-IDF

In [23]:
def tfidf_preprocess(lyrics_df, nlp=None, progress=True):
    """Turn each song's lyrics into a string of content-word lemmas for TF-IDF.

    Every lyric is stripped, lowercased, has each digit replaced by "0", and is
    then reduced to the space-joined lemmas of the tokens whose part of speech is
    NOUN, VERB, ADJ, ADV, INTJ or X.

    Unlike the previous version, ``lyrics_df`` is NOT modified: the "lyrics"
    column keeps its original text, so later preprocessing can start from it.

    Parameters
    ----------
    lyrics_df : pandas.DataFrame
        Table with a "lyrics" column of strings.
    nlp : callable, optional
        spaCy pipeline (any callable mapping a string to tokens exposing
        ``lemma_`` and ``pos_``). When omitted, the "en" model is loaded, which
        is slow — pass an already-loaded pipeline to avoid reloading per call.
    progress : bool, default True
        Show a tqdm progress bar over the songs while they are lemmatized.

    Returns
    -------
    list of str
        One preprocessed string per row, in row order.
    """
    if nlp is None:
        # Imported here so callers that inject their own pipeline do not need spaCy.
        import spacy
        nlp = spacy.load("en")

    content_pos = {'NOUN', 'VERB', 'ADJ', 'ADV', 'INTJ', 'X'}
    numbers = re.compile('[0-9]')

    # Cheap normalisation first: strip + lowercase, then mask every digit with "0".
    normalized = [numbers.sub("0", text.strip().lower()) for text in lyrics_df["lyrics"]]

    # Wrap the iterable that drives the slow spaCy calls, so the bar tracks real work
    # (wrapping the finished result, as before, only advanced after everything was done).
    songs = tqdm(normalized) if progress else normalized

    return [
        ' '.join(token.lemma_ for token in nlp(text) if token.pos_ in content_pos)
        for text in songs
    ]