### Import packages

In [1]:
# NLP modules
import spacy

from gensim.corpora import Dictionary
from gensim.models import LdaModel

import pyLDAvis
import pyLDAvis.gensim_models

# Custom packages
from utils.utils import f_read_pdf
from utils.utils import f_textual_cleaning



### Set global parameters

In [36]:
# PDF = 'data\\2022 Annual Report based on US GAAP jder53.pdf'
PDF = 'data\\Patagonia-2021-BCorp-Report-Updated-2-15-22.pdf'

### Extract data from PDF

In [41]:
# pagenumber = 1 # set pagenumber to extract (OPTIONAL)
# pdf_text = f_read_pdf(PDF, pagenumber)
pdf_text = f_read_pdf(PDF)

### Textual data cleaning

In [43]:
# Read corpus of text
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 1500000 # increase maximum length of text
doc = nlp(pdf_text)

In [44]:
# Remove stop words, punctuation and perform lemmatisation
text = f_textual_cleaning(doc)

### Topic modeling

In [45]:
# Cleaning and Setting Up the Corpus
dictionary = Dictionary(text)
corpus = [dictionary.doc2bow(text) for text in text]

In [48]:
# Latent Dirichlet Allocation (LDA)
lda_model = LdaModel(corpus=corpus, num_topics=3, id2word=dictionary)
lda_model.show_topics()

[(0,
  '0.172*" " + 0.021*" \n" + 0.009*"Corporation" + 0.009*"Benefit" + 0.008*"company" + 0.007*"Report" + 0.007*"Annual" + 0.007*"operation" + 0.007*"environmental" + 0.006*"use"'),
 (1,
  '0.110*" " + 0.074*" \n" + 0.029*" \n \n" + 0.016*"Patagonia" + 0.015*"number" + 0.012*"child" + 0.011*"employee" + 0.010*"care" + 0.009*"\n \n" + 0.006*"program"'),
 (2,
  '0.263*" " + 0.069*" \n" + 0.016*"Patagonia" + 0.015*" \n \n" + 0.010*"product" + 0.008*"employee" + 0.006*"environmental" + 0.006*"work" + 0.006*"\n \n" + 0.006*"year"')]

In [49]:
# Visualizing Topics with pyLDAvis
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)