In [2]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import spacy

In [8]:
## Load the spaCy pipeline that is later used to tokenise and lemmatise the lyrics
spacy_model_name = 'en_core_web_md'
nlp = spacy.load(spacy_model_name)

In [27]:
## Load data: one lyrics csv per artist, using the first csv column as the index
lyrics_files = (
    'Data/Queen_lyrics.csv',
    'Data/RollingStones_lyrics.csv',
    'Data/MileyCyrus_lyrics.csv',
    'Data/RedHotChiliPeppers_lyrics.csv',
)
queen_df, rlst_df, miley_df, rhcp_df = [
    pd.read_csv(csv_path, index_col=0) for csv_path in lyrics_files
]

In [48]:
## Balance the datasets by keeping at most the first 100 songs per artist,
## then stack the four frames on top of each other
balanced_frames = [
    artist_df[:100] for artist_df in (queen_df, rlst_df, miley_df, rhcp_df)
]
corpus = pd.concat(balanced_frames, axis=0)

In [51]:
## Label every row with its artist. Each label is repeated once per song actually
## taken from that artist (same order as the concat above), so the column still
## lines up with the rows when an artist has fewer than 100 songs.
songs_per_artist = {
    'Queen': len(queen_df[:100]),
    'Rolling Stones': len(rlst_df[:100]),
    'Miley Cyrus': len(miley_df[:100]),
    'Red Hot Chili Peppers': len(rhcp_df[:100]),
}
corpus['Artist'] = [
    artist
    for artist, n_songs in songs_per_artist.items()
    for _ in range(n_songs)
]

In [53]:
def _normalise_lyrics(text):
    """Lower-case one song's lyrics and strip markup, punctuation and surplus whitespace."""
    # Turn every whitespace run (\r\n, \n, tabs, lone \r, ...) into a single space
    # first, so words on neighbouring lines are separated instead of glued together.
    text = re.sub(r'\s+', ' ', text)
    # Apostrophes are dropped outright (don't -> dont) before lower-casing.
    text = text.replace('\'', '').lower()
    # Remove anything within square brackets, e.g., [chorus]
    text = re.sub(r'\[[^]]*\]', '', text)
    # Keep only letters, digits, hyphens, underscores and spaces.
    text = re.sub(r'[^-_A-Za-z0-9 ]+', '', text)
    # Deleting characters (e.g. the "&" in "rock & roll") leaves gaps behind;
    # collapse them and trim the ends so the tokenizer gets single-spaced text.
    return re.sub(r' {2,}', ' ', text).strip()


def lyrics_cleaner(df, nlp_model=None):
    """
    Clean the text in the Lyrics column of a dataframe and return a sanitised corpus.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Lyrics' column holding one song's lyrics per row.
        NOTE: rows containing NA values (in any column) are dropped from `df`
        IN PLACE, so the caller's dataframe ends up with one row per returned song.
    nlp_model : callable, optional
        Callable that turns a string into an iterable of tokens exposing
        `is_stop`, `is_punct`, `pos_` and `lemma_` (e.g. a loaded spaCy pipeline).
        Defaults to the notebook-level `nlp` object.

    Returns
    -------
    list of str
        One string per remaining row: the space-joined lemmas of all tokens that
        are not stop words, punctuation or numbers.

    Raises
    ------
    KeyError
        If `df` has no 'Lyrics' column.
    """
    if nlp_model is None:
        nlp_model = nlp  # spaCy pipeline loaded at the top of the notebook

    ## Firstly, we need to drop all rows with potential NA values
    ## (kept in place on purpose: the label column is read from the same dataframe afterwards)
    df.dropna(inplace=True)

    clean_corpus = []
    for raw_lyrics in df['Lyrics']:
        ## Tokenize the normalised lyrics for string cleaning (lemmatisation, etc.)
        document = nlp_model(_normalise_lyrics(raw_lyrics))

        ## Keep the lemma of every token that is not a stop word, punctuation or a number
        lemmas = [
            word.lemma_
            for word in document
            if not (word.is_stop or word.is_punct or word.pos_ == 'NUM')
        ]
        clean_corpus.append(' '.join(lemmas))

    return clean_corpus

### Clean the combined lyrics into one corpus (X) and create the label variable (y)

In [62]:
## Cleaned lyrics, one string per song (lyrics_cleaner also drops NA rows from `corpus` in place)
X = lyrics_cleaner(df=corpus)

In [57]:
## Labels for the rows that survive cleaning. lyrics_cleaner drops NA rows from
## `corpus` in place, so the same filter is applied here: y stays aligned with X
## even if this cell is executed before the cleaning cell (dropna is a no-op
## once those rows are already gone).
y = corpus.dropna()['Artist']

In [63]:
## Pair every artist label with its cleaned lyrics
labelled_columns = {'Label': y, 'Corpus': X}
corpus_df = pd.DataFrame(labelled_columns)

In [64]:
## Export to csv so that a separate machine learning modeling pipeline can work directly with that.
## The output folder is created first because to_csv fails if the target directory is missing.
output_path = Path('Output/corpus.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
corpus_df.to_csv(output_path)