In [1]:
import os
import re

import numpy as np
import pandas as pd
import spacy

In [2]:
## Load the previously scraped lyrics of both bands (the scraping code lives in a different notebook).
## The first CSV column is used as the dataframe index.
queen_df, rlst_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('Data/Queen_lyrics.csv', 'Data/RollingStones_lyrics.csv')
)

In [3]:
## Discard every row that contains a missing value, in both dataframes
queen_df = queen_df.dropna()
rlst_df = rlst_df.dropna()

### Standard string cleaning with Python's built-in `str.replace` and regular expressions (`re`)

In [4]:
## Clean up lyrics

def str_cleaner(df):
    """Return the lyrics in ``df['Lyrics']`` as a list of normalised strings.

    Each lyric is lowercased, apostrophes are dropped (``don't`` -> ``dont``),
    anything inside square brackets (e.g. ``[chorus]``) is removed, and only
    letters, digits, ``-``, ``_`` and single spaces are kept.

    Parameters
    ----------
    df : mapping-like with a ``'Lyrics'`` entry (e.g. a pandas DataFrame)
        ``df['Lyrics']`` must be an iterable of strings.

    Returns
    -------
    list of str
        One cleaned string per lyric, in the original order, without leading,
        trailing or repeated spaces.
    """
    cleaned = []

    for raw in df['Lyrics']:
        ## Every run of whitespace (\r\n, \n, tabs, ...) separates words, so turn it into
        ## one space first; otherwise the character filter below would fuse the words.
        text = re.sub(r'\s+', ' ', raw)

        ## Drop apostrophes and make it lowercase
        text = text.replace('\'', '').lower()

        ## Remove anything within square brackets, e.g., [chorus]
        text = re.sub(r'\[[^]]*\]', '', text)

        ## Remove every remaining special character (dots and commas included)
        text = re.sub(r'[^-_A-Za-z0-9 ]+', '', text)

        ## Collapse spaces only now: the removals above can leave gaps such as "a  b",
        ## and a removed leading/trailing tag leaves a space at the edge.
        text = re.sub(r' {2,}', ' ', text).strip()

        cleaned.append(text)

    return cleaned

In [5]:
## Run the same string cleaning over the lyrics of both bands
queen_clean, rlst_clean = (str_cleaner(frame) for frame in (queen_df, rlst_df))

In [6]:
## Balance out the two datasets to contain same amount of lyrics.
## The larger collection is downsampled WITHOUT replacement, so no song ends up duplicated
## and every song (including the last one in the list) can be drawn.
## The fixed seed keeps the subsample identical across kernel restarts.

balance_rng = np.random.RandomState(42)

if len(queen_clean) < len(rlst_clean):
    rand = list(balance_rng.choice(len(rlst_clean), size=len(queen_clean), replace=False))
    rlst_clean = [rlst_clean[i] for i in rand]
else:
    ## When both collections already have the same size this merely reorders the Queen lyrics
    rand = list(balance_rng.choice(len(queen_clean), size=len(rlst_clean), replace=False))
    queen_clean = [queen_clean[i] for i in rand]

In [7]:
## Sanity check: after balancing, both collections should hold the same number of lyrics

len(rlst_clean) == len(queen_clean)

True

---

### Tokenize with spaCy for lemmatisation (and remove stop words, punctuation and numbers)

In [8]:
## Load spaCy's 'en_core_web_md' English pipeline. `nlp` is called on every song below to
## produce spaCy documents whose tokens are later filtered by their stop-word, punctuation
## and part-of-speech flags and replaced by their lemma.
## NOTE: spacy is already imported in the first cell, so the import here is redundant but harmless.
import spacy
nlp = spacy.load('en_core_web_md')

In [9]:
## Function to tokenize (just to get it in a spacy format for further analysis)

def tokenize(lyrics_vect):
    """Run each song through the spaCy pipeline ``nlp``.

    Parameters
    ----------
    lyrics_vect : iterable of str
        The lyrics to process, one string per song.

    Returns
    -------
    list
        The objects returned by ``nlp(song)``, in the same order as the input.
    """
    return [nlp(lyric) for lyric in lyrics_vect]

In [10]:
## Turn the cleaned lyrics of each band into spaCy documents
tokenized_queen, tokenized_rolling = (tokenize(lyrics) for lyrics in (queen_clean, rlst_clean))

In [11]:
## Function to do the spacy stuff

def spacy_cleaner(song):
    """Reduce one tokenized song to a string of lemmas.

    Tokens flagged as stop words or punctuation, and tokens whose part-of-speech
    tag is ``'NUM'``, are skipped; the lemmas of all remaining tokens are joined
    with single spaces.

    Parameters
    ----------
    song : iterable of tokens
        Each token must expose ``is_stop``, ``is_punct``, ``pos_`` and ``lemma_``.

    Returns
    -------
    str
        The space-separated lemmas (empty string if nothing is kept).
    """
    kept_lemmas = (
        token.lemma_
        for token in song
        if not (token.is_stop or token.is_punct or token.pos_ == 'NUM')
    )
    return ' '.join(kept_lemmas)

In [12]:
## Lemmatise and filter every document (lyric) in the Queen collection with spacy_cleaner

spacy_queen = [spacy_cleaner(doc) for doc in tokenized_queen]

In [13]:
## Lemmatise and filter every document (lyric) in the Rolling Stones collection with spacy_cleaner

spacy_rlst = [spacy_cleaner(doc) for doc in tokenized_rolling]

---

### Combine all cleaned lyrics into one corpus (to be X) and create the y variable (band labels)

In [14]:
## Queen lyrics first, followed by the Rolling Stones lyrics
corpus = [*spacy_queen, *spacy_rlst]

In [15]:
## Sanity check: the corpus holds every lyric of both bands; also show the individual sizes
(
    len(corpus) == len(spacy_queen) + len(spacy_rlst),
    len(spacy_queen),
    len(spacy_rlst),
    len(corpus),
)

(True, 304, 304, 608)

In [16]:
## One label per lyric, in the same order as the corpus (Queen first, then Rolling Stones)
y = ['Queen' for _ in spacy_queen] + ['Rolling Stones' for _ in spacy_rlst]

In [30]:
## Pair every band label with its cleaned lyric in a single dataframe
corpus_df = pd.DataFrame(
    data={'Label': y, 'Corpus': corpus},
)

In [33]:
## Export to csv so that a separate machine learning modeling pipeline can work directly with that.
## to_csv does not create missing folders, so make sure the output directory exists first;
## otherwise this cell fails on a fresh checkout.
os.makedirs('Output', exist_ok=True)
corpus_df.to_csv('Output/corpus.csv')