# Topic model hyperparameter tuning

In [None]:
!pip install pyLDAvis
!pip3 install pickle5

In [None]:
import pandas as pd
import numpy as np
import sqlite3
from tqdm import tqdm
import matplotlib.pyplot as plt
import re
from collections import Counter
from tqdm import tqdm
import spacy
nlp = spacy.load('en_core_web_sm')
from itertools import product
import pickle5 as pickle

#NLTK
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#For plotting
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Mount Google Drive at /content/drive; the cells below read their pickled
# inputs from (and write results to) /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the pickled paragraph data from Drive and coerce it to a DataFrame.
paragraph_pickle_path = '/content/drive/MyDrive/paragraph_df.pkl'
with open(paragraph_pickle_path, "rb") as paragraph_file:
    paragraph_df = pd.DataFrame(pickle.load(paragraph_file))

In [None]:
# Load the pickled report-details data from Drive and coerce it to a DataFrame.
report_details_pickle_path = '/content/drive/MyDrive/report_details_df.pkl'
with open(report_details_pickle_path, "rb") as report_details_file:
    report_details_df = pd.DataFrame(pickle.load(report_details_file))

In [None]:
# Keep only the paragraphs whose 'Supply_Chain' flag is 'Yes'.
# The explicit .copy() makes the subset an independent DataFrame: a column is
# assigned to it further down, and doing that on a boolean-mask slice of
# paragraph_df raises pandas' SettingWithCopyWarning (the write may land on a
# temporary object instead of the frame we keep using).
is_supply_chain = paragraph_df['Supply_Chain'] == 'Yes'
sc_only_para_df = paragraph_df[is_supply_chain].copy()
print(len(sc_only_para_df))

## Model & variable setup

In [None]:
# Corpus-specific boilerplate terms that should never end up in a topic.
additional_stop_words = [
    'report', 'annualreport', 'esg', 'sustainability', 'sustainable',
    'also', 'business', 'group', 'company', 'year',
]

# Supply-chain vocabulary, also treated as stop words (presumably because the
# paragraphs analysed here were all selected as supply-chain related, so these
# terms would not discriminate between topics -- confirm with the author).
sc_keywords = [
    'supplier', 'suppliers', 'supply', 'chain', 'chains',
    'procurement', 'vendor', 'vendors', 'sourcing',
]

# Final stop-word list: NLTK's English list, then the two custom lists.
stop_words = stopwords.words('english')
stop_words.extend(additional_stop_words)
stop_words.extend(sc_keywords)

# Shared WordNet lemmatizer instance used by the preprocessing functions.
lemmatizer = WordNetLemmatizer()

In [None]:
# NOTE: the restriction to nouns, adjectives and adverbs is implemented in
# pos_and_location_filter below (open question from the author: keep it?).
def paragraphs_to_words(paragraphs):
    """Lazily tokenize paragraphs with Gensim's ``simple_preprocess``.

    Parameters:
    ----------
    paragraphs : iterable of paragraphs; each item is passed through ``str()``

    Yields:
    -------
    list of str, the tokens of one paragraph (one list per input item)
    """
    for raw_paragraph in paragraphs:
        # deacc=True strips accent marks from the tokens; punctuation is
        # already discarded by the tokenizer itself.
        tokens = gensim.utils.simple_preprocess(str(raw_paragraph), deacc=True)
        yield tokens

def pos_and_location_filter(text):
    """Keep the nouns, adjectives and adverbs of *text* that are not part of a
    place or organisation name.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is dropped when its coarse POS tag is not NOUN/ADJ/ADV, or when it belongs
    to a named entity labelled GPE, LOC or ORG.

    Parameters:
    ----------
    text : str, the text to analyse

    Returns:
    -------
    list of str, the surviving token texts in document order
    """
    included_tags = {"NOUN", "ADJ", "ADV"}
    excluded_entity_labels = {'GPE', 'LOC', 'ORG'}

    doc = nlp(text)

    # Full surface text of every excluded entity. Matching token text against
    # this set removes every occurrence of a single-word entity name, even an
    # occurrence the NER model did not tag.
    entity_texts = {
        ent.text for ent in doc.ents if ent.label_ in excluded_entity_labels
    }

    # The text match alone can never remove multi-word entities ("new york"),
    # because no single token equals the whole entity string. Checking each
    # token's own entity label removes those component tokens as well.
    return [
        token.text
        for token in doc
        if token.pos_ in included_tags
        and token.ent_type_ not in excluded_entity_labels
        and token.text not in entity_texts
    ]
        
def text_preprocessing(text):
    """Normalise one paragraph into a space-separated string of lemmas.

    Steps, in order:
      1. delete ``, . ! ?`` and digit characters (nothing is inserted in their
         place, so "1,000" vanishes and "u.s." becomes "us");
      2. replace every remaining non-ASCII-letter character with a space;
      3. lowercase;
      4. keep only the tokens returned by ``pos_and_location_filter``;
      5. drop stop words, lemmatize, and drop stop words again.

    Parameters:
    ----------
    text : any object; it is converted with ``str()`` first

    Returns:
    -------
    str, the remaining lemmas joined by single spaces ('' if none remain)
    """
    text = str(text)
    # Raw-string patterns: the previous '[,\.!?]' relied on an invalid escape
    # sequence in a normal string literal, which newer Pythons warn about.
    text = re.sub(r'[,.!?]', '', text)  # remove punctuation
    text = ''.join(ch for ch in text if not ch.isdigit())  # remove numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # remaining non-letters -> space
    text = text.lower()

    # Build the lookup set once per call: O(1) membership instead of scanning
    # the stop-word list for every token.
    blocked = set(stop_words)

    lemmas = []
    for word in pos_and_location_filter(text):
        if word in blocked:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Filter again after lemmatizing: inflected forms such as "companies"
        # or "reports" pass the first check but lemmatize to stop words
        # ("company", "report") and would otherwise leak into the output.
        if lemma in blocked:
            continue
        lemmas.append(lemma)
    return ' '.join(lemmas)

## Processing

In [None]:
# Clean every paragraph (one text_preprocessing call per row) and store the
# resulting space-joined string in a new 'Processed_Text' column.
processed_paragraphs = sc_only_para_df['Paragraph'].map(text_preprocessing)
sc_only_para_df['Processed_Text'] = processed_paragraphs

In [None]:
def create_dict_texts_corpus(list_of_paragraphs):
    """Tokenize paragraphs, merge detected phrases and build the Gensim inputs.

    Parameters:
    ----------
    list_of_paragraphs : iterable of paragraphs (tokenized by paragraphs_to_words)

    Returns:
    -------
    tuple of
        bigram_mod       : Phraser built from the bigram Phrases model
        trigram_mod      : Phraser built from the trigram Phrases model
        texts_w_trigrams : token lists after applying both phrasers
        id2word          : Gensim Dictionary built from texts_w_trigrams
        corpus           : bag-of-words representation of texts_w_trigrams
        texts            : the plain token lists, before phrase merging
    """
    plain_docs = list(paragraphs_to_words(list_of_paragraphs))

    # Phrase detection: first on the raw tokens, then on the bigram-merged
    # stream so that a bigram can be extended by another token or phrase.
    bigram_phrases = gensim.models.Phrases(plain_docs, min_count=5, threshold=100)
    trigram_phrases = gensim.models.Phrases(bigram_phrases[plain_docs], threshold=100)

    # Phraser wrappers: the faster way to apply trained phrase models to a doc.
    bigram_mod = gensim.models.phrases.Phraser(bigram_phrases)
    trigram_mod = gensim.models.phrases.Phraser(trigram_phrases)

    def merge_phrases(tokens):
        # Bigrams first, then the trigram model on top of the merged tokens.
        return trigram_mod[bigram_mod[tokens]]

    phrase_docs = [merge_phrases(tokens) for tokens in plain_docs]

    id2word = corpora.Dictionary(phrase_docs)
    bow_corpus = [id2word.doc2bow(tokens) for tokens in phrase_docs]

    return bigram_mod, trigram_mod, phrase_docs, id2word, bow_corpus, plain_docs

In [None]:
# Build phrase models, dictionary and bag-of-words corpus over all processed
# supply-chain paragraphs.
total_list_of_paragraphs = sc_only_para_df['Processed_Text'].values.tolist()
(
    total_bigram_mod,
    total_trigram_mod,
    total_texts_w_trigrams,
    total_id2word,
    total_corpus,
    total_texts,
) = create_dict_texts_corpus(total_list_of_paragraphs)

## Pre-topic model analysis

In [None]:
# Frequency tables for single tokens, detected bigrams and detected trigrams.
# Each most_common() entry is a (token, count) tuple.
unigram_counter = Counter(word for doc in total_texts for word in doc)
top_30_uni = unigram_counter.most_common(30)

# Bigrams: apply only the bigram phraser and keep the merged ("a_b") tokens.
bigram_counter = Counter(
    word for doc in total_texts for word in total_bigram_mod[doc] if '_' in word
)
top_30_bi = bigram_counter.most_common(30)

# Trigrams: tokens with exactly two underscores after both phrasers ran.
trigram_counter = Counter(
    word for doc in total_texts_w_trigrams for word in doc if word.count('_') == 2
)
top_30_tri = trigram_counter.most_common(30)

# most_common(30) returns fewer than 30 entries when the corpus has fewer
# distinct bigrams/trigrams, and a dict of unequal-length lists makes
# pd.DataFrame raise ValueError. Wrapping each list in a Series lets pandas
# align on the index and pad the shorter columns with NaN instead.
top_words_df = pd.DataFrame({
    'Unigrams': pd.Series(top_30_uni, dtype=object),
    'Bigrams': pd.Series(top_30_bi, dtype=object),
    'Trigrams': pd.Series(top_30_tri, dtype=object),
})
display(top_words_df)

## Get best topic model

In [None]:
def topic_model_grid_search(dictionary, corpus, texts, num_topics_range, *,
                            random_state=100, chunksize=1000, passes=10,
                            coherence='c_v'):
    """
    Train one LDA model per topic count and record its quality metrics.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    num_topics_range : List of topic counts to test
    random_state : Seed passed to every LdaModel (keyword-only, default 100)
    chunksize : Documents per training chunk (keyword-only, default 1000)
    passes : Training passes over the corpus (keyword-only, default 10)
    coherence : Coherence measure for CoherenceModel (keyword-only, default 'c_v')

    Returns:
    -------
    dict mapping each topic count to {'perplexity': ..., 'coherence': ...}.

    NOTE: 'perplexity' holds the value returned by LdaModel.log_perplexity,
    i.e. the per-word likelihood bound on the training corpus (log scale,
    higher is better), not the perplexity itself (2 ** -bound). The key name
    is kept so previously saved results stay comparable.
    """

    results = {}

    for num_topics in tqdm(num_topics_range):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=random_state,
                                                update_every=1,
                                                chunksize=chunksize,
                                                passes=passes)
        # Evaluated on the same corpus the model was trained on.
        log_perplexity_bound = model.log_perplexity(corpus)
        coherence_model = CoherenceModel(model=model, texts=texts,
                                         dictionary=dictionary,
                                         coherence=coherence)
        results[num_topics] = {
            'perplexity': log_perplexity_bound,
            'coherence': coherence_model.get_coherence(),
        }

    return results

# Topic counts to try: 2-20 in steps of 2, then 40, then 80-200 in steps of 20.
num_topics_range = [*range(2, 21, 2), 40, *range(80, 201, 20)]

# Run the grid search on the phrase-merged texts and their dictionary/corpus.
results = topic_model_grid_search(
    dictionary=total_id2word,
    corpus=total_corpus,
    texts=total_texts_w_trigrams,
    num_topics_range=num_topics_range,
)

In [None]:
# Plot the coherence score of each model against its number of topics.
topics = num_topics_range
coherences = [results[topic_count]['coherence'] for topic_count in topics]

plt.xlabel("Number of Topics")
plt.ylabel("Score")
plt.plot(topics, coherences, label='Coherence')
plt.legend()
plt.show()

In [None]:
# Persist the grid-search metrics to Drive.
# NOTE: despite the .txt extension this file is a binary pickle; read it back
# with pickle.load() on a file opened in "rb" mode.
results_path = "/content/drive/MyDrive/topic_model_results.txt"
with open(results_path, "wb") as results_file:
    pickle.dump(results, results_file)