# Project Pipeline

Lucovica Schaerf, Antònio Mendes, Jaël Kortekaas

Large part of our code is used from: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

## Introduction

This file contains the the preprocessing pipeline to our project. 
As a first step we are importing the data and filtering out all the songs that
we don't need for our analysis. Secondly, we will implement the 'standard' 
pipeline and, once we obtain the most common words per each album, author, year
(...) we will move to another file to do the clustering and topic analysis.

## Import

In [8]:
from pathlib import Path
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.collocations import *
from nltk.collocations import BigramAssocMeasures
import nltk
import re
import string
import gensim
import spacy
import itertools

from gensim import corpora, models
from nltk.corpus import wordnet as wn
from operator import itemgetter

import matplotlib.pyplot as plt

plot_dir = Path("./figures")
data_dir = Path("./data")

In [2]:
songs = []

with open('./data/lyrics.csv', 'r', encoding="utf-8") as infile:
    songs = pd.read_csv(infile)
    
print(songs.columns)

Index(['index', 'song', 'year', 'artist', 'genre', 'lyrics'], dtype='object')


In [3]:
artists = ['bruce-springsteen', 'elliot-smith', 'black-sabbath', 'deep-purple', 'david-bowie']

david_bowie = songs[songs[u'artist'] == 'david-bowie']
black_sabbath = songs[songs[u'artist'] == 'black-sabbath']
bruce_springsteen = songs[songs[u'artist'] == 'bruce-springsteen']
elliot_smith = songs[songs[u'artist'] == 'elliot-smith']
deep_purple = songs[songs[u'artist'] == 'deep-purple']

lyrics = pd.concat([david_bowie, black_sabbath, bruce_springsteen, elliot_smith, deep_purple], axis=0)
lyrics = lyrics.dropna()
lyrics

Unnamed: 0,index,song,year,artist,genre,lyrics
116793,116793,if-i-m-dreaming-my-life,2009,david-bowie,Rock,VERSE (there)\nWas she never there/here?\nWas ...
116794,116794,seven,2009,david-bowie,Rock,I forgot what my father said\nI forgot what he...
116795,116795,i-can-t-read,2009,david-bowie,Rock,I can't read and I can't write down\nI don't k...
116796,116796,thursday-s-child,2009,david-bowie,Rock,All of my life I've tried so hard\nDoing my be...
116797,116797,survive,2009,david-bowie,Rock,"Oh, my\nNaked eyes\nI should have kept you\nI ..."
...,...,...,...,...,...,...
67509,67509,child-in-time,1972,deep-purple,Rock,Sweet child in time\nYou'll see the line\nThe ...
67510,67510,deep-purple-overture,2015,deep-purple,Rock,"Good golly, said little miss molly\nWhen she w..."
67517,67517,paint-it-black,2015,deep-purple,Rock,"I see a red door and I want it painted black,\..."
67523,67523,anyone-s-daughter,2014,deep-purple,Rock,"Well, I stood under your bedroom window, throw..."


## Processing Pipeline

In [103]:
stop_words = stopwords.words('english')
stop_words.extend(['oh', 'yeah', 'hey', 'doo', 'oo', 'uh', 'la', 'verse', 'chorus', 'bridge', 'x2', "'m", 'da', 'ooh', 'aaaahh', 'ooo', 'duh', 'whop', 'u', 'ah', 'na', 'whoa', 'ai', "n't", 'wa', 'gon', "'ll", 'gon', "'d", "'re", "'ve", "'em", "'", 'ca', 'ha', 'wo', 'wir', 'wan', 'doe', 'well', 'sha', 'ya', 'ta', "'cause", "`"]) # filter out common meaningless words/sounds and words describing song structure

The preprocessing pipeline we decided to implement includes the following steps:
- filtering out stopwords, puntuation, sounds typical from songs
- lemmatizing
- adding the most common bigrams

In [155]:
wnl = WordNetLemmatizer()
bigram_measures = BigramAssocMeasures()
un2wn_mapping = {"VERB" : wn.VERB, "NOUN" : wn.NOUN, "ADJ" : wn.ADJ, "ADV" : wn.ADV}

def convertTuple(tup): 
    str =  '_'.join(tup) 
    return str
  
def simple_preprocess(lyrics, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    lyricslist = [re.sub('\-', '', str(lyric)) for lyric in lyrics['lyrics'].tolist()] # take out all hyphens that often connect meaningless words/sounds to these stopwords can be filtered out later
    lyricslist = [re.sub('[\.\,\?,\!,\(,\),\:,\",\[,\]]', '', str(lyric)) for lyric in lyricslist] # take out all punctuation
    lyricslist = [word_tokenize(lyric.lower()) for lyric in lyricslist]
    finder = []
    for i,lyric in enumerate(lyricslist):
        for j,tup in enumerate(nltk.pos_tag(lyric, tagset="universal")):
            token = "-".join([tup[0], tup[1]])
            if token.split("-")[-1] in un2wn_mapping.keys():
                lyricslist[i][j] = "-".join([wnl.lemmatize(str(token.split("-")[0]), pos = un2wn_mapping[token.split("-")[-1]]), token.split("-")[-1]])
            else:
                lyricslist[i][j] = "-".join([wnl.lemmatize(str(token.split("-")[0])),token.split("-")[-1]]) 
        lyricslist[i] = [word for word in lyric if word.split("-")[0] not in stop_words]
        finder.append(BigramCollocationFinder.from_words(lyric))
        finder[i] = finder[i].nbest(bigram_measures.pmi, 20)
        finder[i] = [convertTuple(x) for x in finder[i]] #need to append the two words back together
        lyricslist[i] = [word for word in lyric if word.split("-")[-1] == "NOUN"]
        lyricslist[i] = lyricslist[i] + finder[i]
    lyrics['bag_of_words'] = lyricslist


simple_preprocess(lyrics)
example = lyrics['bag_of_words'][116793]
example

['verse-NOUN',
 'air-NOUN',
 'time-NOUN',
 'flower-NOUN',
 'gallery-NOUN',
 'hymn-NOUN',
 'night-NOUN',
 'chorus-NOUN',
 'time-NOUN',
 'day-NOUN',
 'light-NOUN',
 'life-NOUN',
 'time-NOUN',
 'chance-NOUN',
 'mother-NOUN',
 'father-NOUN',
 'step-NOUN',
 'time-NOUN',
 'verse-NOUN',
 'chorus-NOUN',
 'life-NOUN',
 'life-NOUN',
 'life-NOUN',
 'dreaming-NOUN',
 'life-NOUN',
 'life-NOUN',
 'repeat-NOUN',
 'life-NOUN',
 'dreaming-NOUN',
 'life-NOUN',
 'come-VERB_to-PRT',
 'fade-VERB_now-ADV',
 'father-NOUN_step-NOUN',
 'flower-NOUN_so-ADV',
 'gallery-NOUN_with-ADP',
 'hymn-NOUN_of-ADP',
 'it-PRON_air-NOUN',
 'just-ADV_one-NUM',
 'live-VERB_chance-NOUN',
 'mother-NOUN_sigh-VERB',
 'never-ADV_there/here-DET',
 'night-NOUN_sing-VERB',
 'of-ADP_night-NOUN',
 'one-NUM_live-VERB',
 'sing-VERB_come-VERB',
 'so-ADV_from-ADP',
 'step-NOUN_aside-ADV',
 'to-PRT_me-PRON',
 'x2-ADP_second-ADJ',
 'chance-NOUN_when-ADV']

In [156]:
#count how many times a word appears --> give a document
#lyrics['bag_of_words']

list_BOWlyrics = [[]]
for lyric in lyrics['bag_of_words']:
    list_BOWlyrics += [lyric]

lyrics_dictionary = corpora.Dictionary(list_BOWlyrics)
print('Number of unique tokens:', len(lyrics_dictionary))

Number of unique tokens: 27851


In [157]:
print(dict(itertools.islice(lyrics_dictionary.token2id.items(), 12)))
print("word with id 8:", lyrics_dictionary[8])
print("frequency of token 8:", lyrics_dictionary.dfs[8])

{'air-NOUN': 0, 'chance-NOUN': 1, 'chance-NOUN_when-ADV': 2, 'chorus-NOUN': 3, 'come-VERB_to-PRT': 4, 'day-NOUN': 5, 'dreaming-NOUN': 6, 'fade-VERB_now-ADV': 7, 'father-NOUN': 8, 'father-NOUN_step-NOUN': 9, 'flower-NOUN': 10, 'flower-NOUN_so-ADV': 11}
word with id 8: father-NOUN
frequency of token 8: 43


In [158]:
# Filter out words that occur in less than 5 documents, or more than 70% of the documents.
lyrics_dictionary.filter_extremes(no_below=2, no_above=0.3)
print('Number of unique tokens:', len(lyrics_dictionary))

Number of unique tokens: 8343


In [159]:
# Bag-of-words representation of the documents
lyrics_bow_corpus = [lyrics_dictionary.doc2bow(d) for d in list_BOWlyrics]

In [160]:
# the BOW representation of the first document
print(lyrics_bow_corpus[1][:50])

[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 8), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2)]


In [161]:
# which words (and how often) appear in the first document?
for i, freq in sorted(lyrics_bow_corpus[1], key=itemgetter(1), reverse=True)[:15]:
    print(lyrics_dictionary[i], "-->", freq)
print("...")

life-NOUN --> 8
chorus-NOUN --> 2
dreaming-NOUN --> 2
verse-NOUN --> 2
air-NOUN --> 1
chance-NOUN --> 1
come-VERB_to-PRT --> 1
day-NOUN --> 1
father-NOUN --> 1
flower-NOUN --> 1
gallery-NOUN --> 1
hymn-NOUN --> 1
just-ADV_one-NUM --> 1
light-NOUN --> 1
mother-NOUN --> 1
...


In [162]:
lyrics_ldamodel = models.ldamodel.LdaModel(lyrics_bow_corpus, num_topics=10, id2word = lyrics_dictionary, passes=25)

In [163]:
# the setting formatted=False allows you to get rid of the word*probability format when retrieveing topics
lyrics_ldamodel.show_topics(formatted=False, num_words=15)

[(0,
  [('da-NOUN', 0.038209487),
   ('baby-NOUN', 0.023037024),
   ('somebody-NOUN', 0.018633816),
   ('love-NOUN', 0.015927922),
   ('whoa-NOUN', 0.015855681),
   ('yeah-NOUN', 0.0112876445),
   ('man-NOUN', 0.011008566),
   ('way-NOUN', 0.009823215),
   ('eye-NOUN', 0.0093977805),
   ('boot-NOUN', 0.0052265404),
   ('mind-NOUN', 0.0051820143),
   ('style-NOUN', 0.0051124296),
   ('hey-NOUN', 0.0050349906),
   ('fool-NOUN', 0.0050342097),
   ('world-NOUN', 0.004933279)]),
 (1,
  [('love-NOUN', 0.055522867),
   ('thing-NOUN', 0.025577333),
   ('way-NOUN', 0.0125366775),
   ('fire-NOUN', 0.01136626),
   ('life-NOUN', 0.011214286),
   ('baby-NOUN', 0.009950187),
   ('ground-NOUN', 0.009311527),
   ('day-NOUN', 0.008648619),
   ('eye-NOUN', 0.007957869),
   ('home-NOUN', 0.007954745),
   ('heart-NOUN', 0.007518441),
   ('man-NOUN', 0.0073767006),
   ('sound-NOUN', 0.0073655765),
   ('boy-NOUN', 0.00577361),
   ('world-NOUN', 0.0056224666)]),
 (2,
  [('train-NOUN', 0.0198593),
   ('home-N

In [None]:
from collections import defaultdict

import nltk.corpus
import numpy as np
import matplotlib.pyplot as plt
import itertools

from gensim import corpora, models
from nltk.corpus import wordnet as wn
from operator import itemgetter

import sklearn
from sklearn import metrics, manifold
import scipy
from scipy import cluster
import matplotlib as mpl 
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
