In [1]:
import re
import pickle
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.labelsize'] = 14
plt.rcParams['figure.titlesize'] = 20
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['axes.grid'] = False
plt.rcParams['axes.axisbelow'] = True
plt.rcParams['legend.fontsize'] = 14
%matplotlib inline

from nltk.corpus import words as nltk_words
from nltk.corpus import stopwords

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

In [2]:
# stopwords_dir = 'C:Users/philippe/AppData/Roaming/nltk_data/corpora/stopwords/'
stopwords_dir = '/home/philippe/nltk_data/corpora/stopwords'
files = os.listdir(stopwords_dir)
languages = [file for file in files if file != 'README']
stop_words = set()
for language in languages:
    stop_words.update(set(stopwords.words(language)))

In [3]:
df = pd.read_hdf('combined-data/song-lyrics-genres-reviews.h5', key='df', mode='r')

In [4]:
bands = sorted(set(df.band))
df_bands = pd.DataFrame(index=np.arange(len(bands)), columns=['band', 'lyrics_raw', 'lyrics', 'words', 'genres', 'reviews'])
for i, band in enumerate(bands):
    lyrics_raw = []
    df_band = df[df.band == band]
    genres = df_band.genres.iloc[0]
    songs = df_band.lyrics.values
    reviews = df_band.reviews.values[0]
    for song in songs:
        lyrics_raw += song
    lyrics_processed = ' '.join([re.sub(r"[^\w\d'\s]+", '', word) for line in lyrics_raw for word in line.split()])
    words = lyrics_processed.split()
    df_bands.iloc[i] = dict(band=band, lyrics_raw=lyrics_raw, lyrics=lyrics_processed, words=words, genres=genres, reviews=reviews)

english_words = set(nltk_words.words())
def pct_english(x):
    return len(set(x).intersection(english_words)) / len(set(x))
num_english_words = df_bands.words.apply(pct_english)
df_bands = df_bands[num_english_words > 0.5]
df_bands['words'] = df_bands.words.apply(lambda x: [word for word in x if word.lower() in english_words])

In [58]:
pipeline = Pipeline(
    [
        ('vectorizer', CountVectorizer(max_df=0.5, min_df=0.05)),
        ('lda', LDA(n_components=5, n_jobs=-1))
    ]
)
pipeline.fit(df_bands.lyrics.values)
vectorizer = dict(pipeline.steps)['vectorizer']
lda = dict(pipeline.steps)['lda']
words = vectorizer.get_feature_names()
n_top_words = 10
for topic_idx, topic in enumerate(lda.components_):
    print("\nTopic #%d:" % topic_idx)
    print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))


Topic #0:
thy thee satan thou behold abyss wisdom wrath mighty sword

Topic #1:
gonna wanna yeah ain hey bad gotta baby fuck money

Topic #2:
afraid im trying blame tried wonder road anymore dont else

Topic #3:
rock yeah gonna metal wanna roll thunder wild sword fighting

Topic #4:
fucking satan fuck torture metal corpse rot violence rotting victims


In [44]:
lda.transform(vectorizer.transform(df_bands[df_bands.band == 'archspire'].lyrics.values)).argsort()

array([[1, 3, 4, 6, 2, 5, 8, 9, 0, 7]])

In [46]:
texts = vectorizer.transform(df_bands.lyrics.values)

In [50]:
from nltk.tokenize import word_tokenize

In [54]:
tokens = [word_tokenize(doc) for doc in df_bands.lyrics.values]

In [56]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

id2word = Dictionary(tokens)

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, texts=tokens, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

ValueError: This topic model is not currently supported. Supported topic models should implement the `get_topics` method.