# In this file I apply LDA topic modeling and create WordClouds for the topics

In [None]:
import os
import re

import gensim
import gensim.corpora as corpora
import nltk
import pandas as pd
import spacy
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet
from gensim.utils import simple_preprocess
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

nltk.download('stopwords') #once

### Get Song Lyrics from enriched dataset

In [None]:
# Lyrics dataset enriched with metadata; it is produced by 'enriched_metadata.ipynb'.
songs = pd.read_csv(
    "songs_enriched.csv",
    sep=",",
    engine="python",
    encoding='utf-8',
)
# One entry per song, taken from the `a_lyrics` column.
data = songs["a_lyrics"].values.tolist()

### Stopword Removal and Lemmatization

In [None]:
# NLTK's English stop-word list plus three extra tokens specific to this corpus.
stop_words = stopwords.words('english') + ['gon', 'na', 'wan']

#tokenize
def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Args:
        sentences: Iterable of documents. Every item is passed through
            ``str()`` first, so non-string entries are tokenized by their
            string form instead of raising.

    Yields:
        The token list for one document, in input order.
    """
    for sentence in sentences:
        # Call the helper through its direct import, the same way
        # remove_stopwords does. deacc=True strips accent marks from tokens.
        yield simple_preprocess(str(sentence), deacc=True)

# Materialize the generator so the token lists can be traversed more than once.
data_words = [*sent_to_words(data)]

# Preview the tokens of the first document.
print(data_words[:1])

In [None]:
# Phrase models: `bigram` is trained on the raw token lists, `trigram` on the
# bigram-transformed token lists. A higher threshold produces fewer phrases.
# Each model is also wrapped in a Phraser, the faster form used when applying
# it to documents.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
bigram_mod = gensim.models.phrases.Phraser(bigram)

trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize every document and drop tokens listed in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is converted with ``str()`` and
            run through ``simple_preprocess`` before filtering.

    Returns:
        A list with one filtered token list per input document.
    """
    # Snapshot the module-level list as a set once per call, so each membership
    # test is a hash lookup instead of a scan of the whole stop-word list.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Return each document transformed by the module-level ``bigram_mod``.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list holding ``bigram_mod[doc]`` for every document, in input order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Return each document passed through ``bigram_mod`` and then ``trigram_mod``.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list holding ``trigram_mod[bigram_mod[doc]]`` for every document.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens whose POS tag is allowed.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of token lists. Each list is joined with single spaces
            and parsed by the module-level spaCy pipeline ``nlp``.
        allowed_postags: Collection of ``token.pos_`` values to keep. The
            default is a tuple so it cannot be mutated between calls.

    Returns:
        A list with one list of lemmas per input document.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Remove stop words from the tokenized documents.
data_words_nostops = remove_stopwords(data_words)

# Apply the bigram model to the stop-word-free documents; the result is the
# input to lemmatization further down.
data_words_bigrams = make_bigrams(data_words_nostops)

In [None]:
# One-time setup: download the small English spaCy pipeline.
spacy.cli.download("en_core_web_sm")

# Load the pipeline that was just downloaded. Only en_core_web_sm is fetched
# above, so no other model is loaded here: a different model would fail on a
# clean environment.
import en_core_web_sm
nlp = en_core_web_sm.load()

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

## LDA Model Training
### Download http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip, unpack the zip file, and update `MALLET_HOME` and `mallet_path` below to point to the unpacked folder

In [None]:
# Create Dictionary: maps every lemma to an integer id.
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency: one bag-of-words vector per document.
corpus = [id2word.doc2bow(text) for text in texts]


os.environ.update({'MALLET_HOME':r'C:/mallet/mallet-2.0.8/mallet-2.0.8/'}) # update this path
mallet_path = 'C:/mallet/mallet-2.0.8/mallet-2.0.8/bin/mallet' # update this path 

# Use all CPU cores but one. os.cpu_count() may return None, in which case
# this falls back to a single worker.
mallet_workers = max(1, (os.cpu_count() or 2) - 1)

# LdaMallet is imported directly at the top of the file.
model = LdaMallet(mallet_path, corpus=corpus, num_topics=12, id2word=id2word, workers=mallet_workers)

# Topic coherence score to measure how good the topic model is
coherence_model_lda = CoherenceModel(model=model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### Get dominant Topic for each Song

In [None]:
# Texts and genre labels must come from the same frame, in the same row order,
# as the lyrics the corpus was built from; otherwise they do not line up with
# the per-document topic rows. That frame is `songs`, and its lyrics column is
# `a_lyrics`.
data = songs.a_lyrics.values.tolist()
# NOTE(review): assumes `songs` has a `genre` column — confirm against the CSV.
genres = songs.genre.values.tolist()


def format_topics_sentences(ldamodel=model, corpus=corpus, texts=data):
    """Build a per-document table of dominant topic, its weight, text and genre.

    Args:
        ldamodel: Topic model. Indexing it with ``corpus`` must yield, for each
            document, an iterable of ``(topic_id, proportion)`` pairs.
        corpus: Corpus to run through the model, one entry per document.
        texts: Original document texts, in the same order as ``corpus``.

    Returns:
        pandas.DataFrame: Columns ``Dominant_Topic`` and ``Perc_Contribution``
        followed by two unnamed columns holding ``texts`` and the module-level
        ``genres`` list. Rows are aligned by position.
    """
    nan = float('nan')
    rows = []
    for doc_topics in ldamodel[corpus]:
        doc_topics = list(doc_topics)
        if not doc_topics:
            # Keep a placeholder row so later documents stay aligned with
            # their text and genre.
            rows.append((nan, nan))
            continue
        # Dominant topic = the pair with the largest proportion. max() returns
        # the first such pair on ties, as a stable descending sort would.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        rows.append((int(topic_num), round(prop_topic, 4)))

    # Build the frame once from the collected rows; DataFrame.append is gone
    # in pandas 2.0 and copied the whole frame on every call.
    # dtype=float keeps Dominant_Topic a float column (e.g. 3.0), which
    # downstream code may rely on.
    sent_topics_df = pd.DataFrame(
        rows,
        columns=['Dominant_Topic', 'Perc_Contribution'],
        dtype=float,
    )

    # Add original text and genre to the end of the output
    contents = pd.Series(texts)
    g = pd.Series(genres)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    sent_topics_df = pd.concat([sent_topics_df, g], axis=1)
    return sent_topics_df

# Compute the dominant topic of every document in the corpus; `data` supplies
# the text column of the result.
df_topic_sents_keywords = format_topics_sentences(ldamodel=model, corpus=corpus, texts=data)

# Format: reset_index() turns the row index into a leading column, after which
# all five columns are given readable names.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Perc_Contribution', 'Text', 'Genre']

# Show the first ten rows.
df_dominant_topic.head(10)

## Create WordClouds for each Topic

In [None]:
stop_words = stopwords.words('english')
stop_words.extend(['gon', 'na', 'wan', 'gonna', 'wanna']) # additional common stopwords

# NOTE(review): the wordcloud docs say `stopwords` is ignored by
# generate_from_frequencies, which is the only generator used below — confirm
# whether passing it here has any effect.
cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=1920,
                  height=1080,
                  max_words=20,
                  prefer_horizontal=1.0)

# With formatted=False each entry is (topic_id, [(word, weight), ...]).
topics = model.show_topics(formatted=False, num_words=15, num_topics=12)

fig, axes = plt.subplots(2, 6, figsize=(20,20), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    # Only draw when a topic exists for this panel; indexing `topics` blindly
    # would raise IndexError if the model returned fewer topics than panels.
    if i < len(topics):
        topic_words = dict(topics[i][1])
        cloud.generate_from_frequencies(topic_words, max_font_size=300)
        # Draw on the panel directly rather than via the "current axes".
        ax.imshow(cloud)
        # Titles are 1-based labels; `i` is the position within `topics`.
        ax.set_title('Topic ' + str(i+1), fontdict=dict(size=16))
    # Hide ticks and frame on every panel, including any left empty.
    ax.axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()