# Project Pipeline

Lucovica Schaerf, Antònio Mendes, Jaël Kortekaas

Large part of our code is used from: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

## Introduction

This file contains the the preprocessing pipeline to our project. 
As a first step we are importing the data and filtering out all the songs that
we don't need for our analysis. Secondly, we will implement the 'standard' 
pipeline and, once we obtain the most common words per each album, author, year
(...) we will move to another file to do the clustering and topic analysis.

## Import

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

plot_dir = Path("./figures")
data_dir = Path("./data")

In [3]:
songs = []

with open('./data/lyrics.csv', 'r', encoding="utf-8") as infile:
    songs = pd.read_csv(infile)
    
print(songs.columns,"\n")

#print(len(songs.values))



Index(['index', 'song', 'year', 'artist', 'genre', 'lyrics'], dtype='object') 



In [20]:

#print('elliott-smith' in songs["artist"].tolist())
#print('dio' in songs["artist"].tolist())
print('judas-priest' in songs["artist"].tolist())

artist_set = set(songs["artist"].tolist())
print(len(artist_set))
print(sorted(artist_set))

False
18231


In [70]:
print(songs["artist"].shape)

artist_search = songs[u'artist'] == 'elliott-smith'

#print('angel-olsen' in songs["artist"].tolist())

#print('arctic-monkeys' in songs["artist"].tolist())

#print('ariana-grande' in songs["artist"].tolist())

#print('audioslave' in songs["artist"].tolist())

#print('at-the-drive-in' in songs["artist"].tolist())

#print('beatles' in songs["artist"].tolist())

#print('the-cure' in songs["artist"].tolist())

#print('bob-dylan' in songs["artist"].tolist())

#print('beastie-boys' in songs["artist"].tolist())

#print('the-clash' in songs["artist"].tolist())

#print('beach-boys' in songs["artist"].tolist())

#print('can' in songs["artist"].tolist())

#print('the-doors' in songs["artist"].tolist())

#print('godspeed-you-black-emperor' in songs["artist"].tolist())

#print('fleetwood-mac' in songs["artist"].tolist())

#print('genesis' in songs["artist"].tolist())

#print('american-football' in songs["artist"].tolist())

#print('dismemberment-plan' in songs["artist"].tolist())

#print('built-to-spill' in songs["artist"].tolist())

#print('creedence-clearwater-revival' in songs["artist"].tolist())

#print('elliott-smith' in songs["artist"].tolist())

#print('deep-purple' in songs["artist"].tolist())

#print('funkadelic' in songs["artist"].tolist())

#print('curtis-mayfield' in songs["artist"].tolist())

#print('dio' in songs["artist"].tolist())

#print('bruce-springsteen' in songs["artist"].tolist())

#print('dead-kennedys' in songs["artist"].tolist())

#print('glenn-branca' in songs["artist"].tolist())

#print('burial' in songs["artist"].tolist())

#print('converge' in songs["artist"].tolist())

#print('animal-collective' in songs["artist"].tolist())

#print('coil' in songs["artist"].tolist())

#print('amy-winehouse' in songs["artist"].tolist())

#print('deerhunter' in songs["artist"].tolist())

#print('deafheaven' in songs["artist"].tolist())

(362237,)
False


In [4]:
artists = ['joy-division', 'metallica', 'black-sabbath', 'pink-floyd', 'david-bowie']

david_bowie = songs[songs[u'artist'] == 'david-bowie']
black_sabbath = songs[songs[u'artist'] == 'black-sabbath']

## Processing Pipeline

In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['oh', 'yeah', 'hey', 'doo', 'oo', 'uh', 'la', 'verse', 'chorus', 'bridge']) # filter out common meaningless words/sounds and words describing song structure

In [17]:
import re
import string

lyrics = songs['lyrics'].tolist()
lyrics = [re.sub('\-', '', str(lyric)) for lyric in lyrics] # take out all hyphens that often connect meaningless words/sounds to these stopwords can be filtered out later
lyrics = [re.sub('[\.\,\?,\!,\(,\),\:]', '', str(lyric)) for lyric in lyrics] # take out all punctuation

In [18]:
# sentence splitting in songs is quite hard because ends of sentences are not indicated with periods, we decided to split on newlines instead as this is the closest indicator of a sentece boundary

sent_split_lyrics = []

for lyric in lyrics[0:10]: #has to be changed to set of all songs of selected artists!
    sent_split_lyric = lyric.split('\n')
    sent_split_lyrics.append(sent_split_lyric)

In [25]:
from nltk.tokenize import word_tokenize

def simple_preprocess(lyrics):
    lyrics_words = []
    sentence_words = []

    for lyric in lyrics:
        for sentence in lyric:
            sentence_words.append(word_tokenize((sentence.lower())))
        lyrics_words.append(sentence_words)
    return(lyrics_words)

In [22]:
import gensim

bigram = gensim.models.Phrases(lyrics_words[0], min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[lyrics_words[0]], threshold=100) 

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

print(trigram_mod[bigram_mod[lyrics_words[0][0]]])

['oh', 'baby', 'how', 'you', 'doing']


In [23]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [32]:
import spacy

# Remove Stop Words
data_words_nostops = remove_stopwords(sent_split_lyrics)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

OSError: [E050] Can't find model 'en'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.