# Topic modelling

## Set up

In [None]:
import pandas as pd
import numpy as np
import sqlite3
from tqdm import tqdm
import matplotlib.pyplot as plt
import re
from collections import Counter
from tqdm import tqdm
import spacy
nlp = spacy.load('en_core_web_sm')
from itertools import product
import pickle

#NLTK
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#For plotting
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Load the pickled paragraph-level DataFrame; later cells read its
# 'Supply_Chain' and 'Paragraph' columns.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
paragraph_df = pd.read_pickle('../Data/paragraph_df.pkl')

In [None]:
# Keep only the paragraphs flagged as supply-chain related.
# .copy() makes this an independent frame rather than a slice of paragraph_df:
# a 'Processed_Text' column is assigned to it further down, which on a slice
# triggers pandas' SettingWithCopyWarning.
sc_only_para_df = paragraph_df[paragraph_df['Supply_Chain'] == 'Yes'].copy()
print(len(sc_only_para_df))

## Functions and variables

In [None]:
# Extra stop words that are removed in addition to NLTK's English list.
additional_stop_words = [
    'report', 'annualreport', 'esg', 'sustainability', 'sustainable',
    'also', 'business', 'group', 'company', 'year',
]

# Supply-chain vocabulary, also treated as stop words
# (presumably because every analysed paragraph is already supply-chain related).
sc_keywords = [
    'supplier', 'suppliers', 'supply', 'chain', 'chains',
    'procurement', 'vendor', 'vendors', 'sourcing',
]

# Complete stop-word list: NLTK English words followed by the two custom lists.
stop_words = stopwords.words('english') + additional_stop_words + sc_keywords

# WordNet lemmatizer shared by the preprocessing code
lemmatizer = WordNetLemmatizer()

In [None]:
# NOTE: pos_and_location_filter keeps only nouns, adjectives, and adverbs (TODO: decide whether to keep this restriction)
def paragraphs_to_words(paragraphs):
    """Lazily tokenize each paragraph into a list of word tokens.

    Every item of *paragraphs* is coerced with ``str`` and handed to gensim's
    ``simple_preprocess``; ``deacc=True`` additionally strips accent marks
    from the tokens.

    Yields:
        One token list per input paragraph, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_paragraph in paragraphs:
        yield tokenize(str(raw_paragraph), deacc=True)

def pos_and_location_filter(text):
    """Return the noun, adjective and adverb tokens of *text*.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is dropped when its text equals the text of a GPE, LOC or ORG entity
    found in the same document.

    Args:
        text: The string to analyse.

    Returns:
        A list of token texts, in document order.
    """
    doc = nlp(text)
    # Sets give constant-time membership tests inside the per-token loop.
    included_tags = {"NOUN", "ADJ", "ADV"}
    excluded_labels = {'GPE', 'LOC', 'ORG'}
    excluded_ents = {ent.text for ent in doc.ents if ent.label_ in excluded_labels}
    # NOTE(review): tokens are compared against the *whole* entity text, so a
    # multi-token entity is only removed if some token has exactly that text.
    # Confirm whether a per-token check (token.ent_type_) was intended.
    return [
        token.text
        for token in doc
        if token.pos_ in included_tags and token.text not in excluded_ents
    ]
        
def text_preprocessing(text):
    """Clean one paragraph and return it as a space-joined string of lemmas.

    Steps, in order:
      1. coerce to ``str`` and delete the characters ``, . ! ?``
         (nothing is inserted in their place);
      2. delete digit characters;
      3. replace every remaining character outside a-z / A-Z with a space;
      4. lowercase the text;
      5. keep the tokens returned by ``pos_and_location_filter``;
      6. drop tokens that appear in ``stop_words``;
      7. lemmatize the remaining tokens with ``lemmatizer``.

    Args:
        text: The raw paragraph (any object; it is converted with ``str``).

    Returns:
        The processed tokens joined by single spaces.
    """
    text = str(text)
    # Raw strings keep the patterns free of invalid string escapes such as
    # '\.', which newer Python versions warn about; inside a character class
    # the dot needs no escaping.
    text = re.sub(r'[,.!?]', '', text)  # remove punctuation
    text = ''.join(ch for ch in text if not ch.isdigit())  # remove numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # remove non-letter characters
    text = text.lower()

    tokens = pos_and_location_filter(text)
    # NOTE(review): stop words are removed *before* lemmatization, so an
    # inflected form that only becomes a stop word once lemmatized is kept.
    # Confirm this is intended.
    lemmas = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]
    return ' '.join(lemmas)

In [None]:
def create_dict_texts_corpus(list_of_paragraphs):
    """Tokenize paragraphs, merge frequent phrases and build the LDA inputs.

    Args:
        list_of_paragraphs: Iterable of paragraph strings.

    Returns:
        A 6-tuple ``(bigram_mod, trigram_mod, texts_w_trigrams, id2word,
        corpus, texts)``: the two phrase models, the token lists after phrase
        merging, the gensim dictionary built from them, the bag-of-words
        corpus, and the token lists before phrase merging.
    """
    texts = list(paragraphs_to_words(list_of_paragraphs))

    # Phrase detection: bigrams first, then a second pass over the
    # bigram-merged texts to pick up trigrams.
    bigram_phrases = gensim.models.Phrases(texts, min_count=5, threshold=100)
    trigram_phrases = gensim.models.Phrases(bigram_phrases[texts], threshold=100)

    # Phraser wrappers used to apply the detected phrases to each document
    bigram_mod = gensim.models.phrases.Phraser(bigram_phrases)
    trigram_mod = gensim.models.phrases.Phraser(trigram_phrases)

    # Apply the bigram model, then the trigram model, to every document
    texts_w_trigrams = [trigram_mod[bigram_mod[tokens]] for tokens in texts]

    # Token <-> id dictionary and the bag-of-words form of each document
    id2word = corpora.Dictionary(texts_w_trigrams)
    corpus = list(map(id2word.doc2bow, texts_w_trigrams))

    return bigram_mod, trigram_mod, texts_w_trigrams, id2word, corpus, texts

In [None]:
#get the topics per doc
def get_predicted_topics(ldamodel, corpus, texts):
    """Return the dominant topic of every document in *corpus*.

    Args:
        ldamodel: Topic model; ``ldamodel[corpus]`` must yield, per document,
            a list of ``(topic_id, proportion)`` pairs, and
            ``ldamodel.show_topic(topic_id)`` a list of ``(word, weight)``.
        corpus: The bag-of-words corpus to score.
        texts: One entry per document; appended unchanged as the last column.

    Returns:
        A DataFrame with the columns ``Dominant_Topic``,
        ``Perc_Contribution`` (rounded to 4 decimals) and ``Topic_Keywords``
        (comma-separated), followed by an unnamed column (label ``0``)
        holding *texts*. A document for which the model returns no topics
        gets a row of missing values, so rows stay aligned with *texts*.
    """
    keywords_by_topic = {}  # show_topic() is the same for every document
    records = []
    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # Placeholder keeps this row aligned with its entry in `texts`.
            records.append((None, None, None))
            continue
        # max() returns the first pair with the highest proportion, i.e. the
        # same pair a stable descending sort would put first.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append(
            (int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num])
        )

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copies the whole frame on every call.
    sent_topics_df = pd.DataFrame(
        records,
        columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'],
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

## Process text

In [None]:
# Run text_preprocessing on every paragraph and store the cleaned,
# space-joined result in a new 'Processed_Text' column.
sc_only_para_df['Processed_Text'] = sc_only_para_df['Paragraph'].map(text_preprocessing)

In [None]:
# All docs: build the phrase models, dictionary and bag-of-words corpus
# from every processed supply-chain paragraph.
total_list_of_paragraphs = sc_only_para_df['Processed_Text'].tolist()
(
    total_bigram_mod,
    total_trigram_mod,
    total_texts_w_trigrams,
    total_id2word,
    total_corpus,
    total_texts,
) = create_dict_texts_corpus(total_list_of_paragraphs)

## Build topic model

In [None]:
# Fit a 20-topic LDA model on the full corpus (fixed random_state).
model20 = gensim.models.LdaModel(
    corpus=total_corpus,
    id2word=total_id2word,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=1000,
    passes=10,
)

# Interactive pyLDAvis view of the fitted model, displayed as the cell output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(model20, total_corpus, total_id2word)
vis

In [None]:
# Print the 15 top words of each of the 20 topics.
words_per_topic = model20.show_topics(num_topics=20, num_words=15)

for i, topic_words in words_per_topic:
    # The topic arrives as one string of terms joined by " + "; keep only the
    # alphabetic characters of each term (this also drops any underscores).
    formatted_topic_words = [
        ''.join(filter(str.isalpha, term)) for term in topic_words.split(" + ")
    ]
    print(f"Topic {i}: {formatted_topic_words}")
    print()

In [None]:
# Dominant topic, its share and its keywords for every paragraph.
df_topic_sents_keywords = get_predicted_topics(
    ldamodel=model20,
    corpus=total_corpus,
    texts=total_texts_w_trigrams,
)

In [None]:
# Human-readable label for each topic id; the list position is the topic id.
# NOTE(review): presumably assigned by inspecting model20's topics -- the
# labels need re-checking whenever the model is refitted.
topic_names = dict(enumerate([
    'Management systems',             # 0
    'Deforestation',                  # 1
    'Human rights',                   # 2
    'Employee health & safety',       # 3
    'Resource usage',                 # 4
    'Certifications & training',      # 5
    'Collaborations & partnerships',  # 6
    'Plans and progress',             # 7
    'Governance & stakeholders',      # 8
    'Policies',                       # 9
    'Product quality',                # 10
    'Diversity & inclusion',          # 11
    'Junk',                           # 12
    'Agriculture',                    # 13
    'Risk assessments',               # 14
    'Chemicals',                      # 15
    'Transportation & logistics',     # 16
    'Society',                        # 17
    'Store operations',               # 18
    'Materials & packaging',          # 19
]))

In [None]:
# Translate each dominant topic id into its label; round() turns a
# float-valued id into the integer key used by topic_names.
df_topic_sents_keywords['Dominant_Topic_Named'] = [
    topic_names[round(topic_id)]
    for topic_id in df_topic_sents_keywords['Dominant_Topic']
]

In [None]:
# Put the topic columns next to the paragraph rows. The concat is column-wise,
# so rows are matched by index label: reset_index() gives the paragraph frame
# a fresh 0..n-1 index (the old index is kept as an 'index' column).
merged_sc_df = pd.concat([sc_only_para_df.reset_index(),df_topic_sents_keywords], axis=1)
# Attach report-level details, matched on 'Filename' (default inner join).
# NOTE(review): report_details_df is not defined in any cell visible here --
# it must be loaded before this cell runs; confirm its source.
merged_sc_df = merged_sc_df.merge(report_details_df[['Company Name_x','Year','Main industry','Filename']],on='Filename')
merged_sc_df.head(3)

In [None]:
# Coarse group for every topic label; 'Junk' maps to None (no group).
topic_group_mapping = {
    'Management systems': 'Actions',
    'Deforestation': 'Environment',
    'Human rights': 'Social',
    'Employee health & safety': 'Social',
    'Resource usage': 'Environment',
    'Certifications & training': 'Actions',
    'Collaborations & partnerships': 'Actions',
    'Plans and progress': 'Actions',
    'Governance & stakeholders': 'Actions',
    'Policies': 'Actions',
    'Product quality': 'Social',
    'Diversity & inclusion': 'Social',
    'Junk': None,
    'Agriculture': 'Environment',
    'Risk assessments': 'Actions',
    'Chemicals': 'Environment',
    'Transportation & logistics': 'Environment',
    'Society': 'Social',
    'Store operations': 'Social',
    'Materials & packaging': 'Environment',
}

# Look up the group of each paragraph's dominant topic, then show how many
# paragraphs fall into each group.
group_per_paragraph = merged_sc_df["Dominant_Topic_Named"].map(topic_group_mapping)
merged_sc_df['Groups'] = group_per_paragraph
merged_sc_df['Groups'].value_counts()

In [None]:
# Save the paragraphs together with their topic and group columns as a pickle.
merged_sc_df.to_pickle('../Data/paragraphs_w_topics.pkl')