### Import packages

In [1]:
# NLP modules
import spacy

from gensim.corpora import Dictionary
from gensim.models import LdaModel

import pyLDAvis
import pyLDAvis.gensim_models

# Custom packages
from utils.utils import f_read_pdf
from utils.utils import f_textual_cleaning



### Set global parameters

In [2]:
# Path to the annual-report PDF, relative to the notebook's working directory.
# Forward slashes are used because they are accepted on Windows as well as on
# Linux/macOS, whereas a backslash is only a separator on Windows.
PDF = 'data/2022 Annual Report based on US GAAP jder53.pdf'

### Extract data from PDF

In [3]:
# Extract the content of a single page of the PDF through the project helper
# f_read_pdf (utils.utils).
# NOTE(review): whether f_read_pdf counts pages from 0 or from 1 is not visible
# here — confirm against the helper before changing the value.
pagenumber = 1 # page of the PDF to extract
pdf_text = f_read_pdf(PDF, pagenumber)

### Textual data cleaning

In [4]:
# Read corpus of text
# Load spaCy's 'en_core_web_sm' pipeline (the model package must be installed
# in the kernel's environment) and run it over the extracted PDF content.
# The processed result `doc` is the input of the cleaning step that follows.
# NOTE(review): assumes pdf_text is a single string — confirm against f_read_pdf.
nlp = spacy.load('en_core_web_sm')
doc = nlp(pdf_text)

In [5]:
# Remove stop words, punctuation and perform lemmatisation
# f_textual_cleaning is a project helper (utils.utils); its return structure is
# not visible here. Presumably it yields a list of token lists (one per
# document/sentence), since `text` is later passed to gensim's Dictionary —
# TODO confirm.
text = f_textual_cleaning(doc)

### Topic modeling

In [11]:
# Cleaning and Setting Up the Corpus
# 1) Build the vocabulary: every distinct token in `text` gets an integer id.
# 2) Encode each tokenised document as a bag-of-words via doc2bow.
dictionary = Dictionary(text)
corpus = [dictionary.doc2bow(tokens) for tokens in text]

In [15]:
# Latent Dirichlet Allocation (LDA)
# LDA training is stochastic: without a fixed random state the topics (and
# their word weights) change on every run. Pinning `random_state` makes the
# result reproducible under "Restart Kernel -> Run All".
lda_model = LdaModel(
    corpus=corpus,          # bag-of-words documents built from `text`
    num_topics=3,           # number of latent topics to extract
    id2word=dictionary,     # id -> token mapping so topics print as words
    random_state=42,        # fixed seed for reproducible topics
)
# Display the most significant words of each topic as the cell output.
lda_model.show_topics()

[(0,
  '0.034*"technology" + 0.032*"innovation" + 0.032*"enable" + 0.032*"help" + 0.032*"energy" + 0.032*"ground" + 0.032*"humanity" + 0.032*"break" + 0.032*"healthcare" + 0.032*"transition"'),
 (1,
  '0.037*"challenge" + 0.037*"facilitate" + 0.037*"seize" + 0.037*"mobility" + 0.037*"living" + 0.037*"opportunity" + 0.037*"smart" + 0.037*"innovation" + 0.037*"manage" + 0.037*"customer"'),
 (2,
  '0.039*"pattern" + 0.038*"work" + 0.038*"small" + 0.038*"big" + 0.022*"help" + 0.022*"life" + 0.022*"strongly" + 0.022*"day" + 0.022*"shrink" + 0.022*"embed"')]

In [16]:
# Visualizing Topics with pyLDAvis
# Switch pyLDAvis to inline notebook rendering, build the visualisation data
# from the trained model, and leave it as the last expression so the notebook
# displays the interactive view.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
lda_vis