### Topic Modeling

In [1]:
!pip install nltk
!pip install pandas
!pip install matplotlib
!pip install networkx  
!pip install pdfplumber
!pip install python-docx



In [2]:
!pip install gensim



In [3]:
!pip install pyLDAvis



In [2]:
import os
import pdfplumber
import docx

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to open with ``pdfplumber``.

    Returns:
        The text of all pages that yielded any, joined with a newline.
        An empty string when no page produced text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may hand back a falsy value (e.g. None) for a page;
        # such pages are skipped instead of contributing an empty chunk.
        page_texts = [page.extract_text() for page in pdf.pages]
    # Join with a newline: concatenating pages back-to-back would glue the last
    # word of one page onto the first word of the next and corrupt both tokens.
    return "\n".join(chunk for chunk in page_texts if chunk)

def extract_text_from_docx(docx_path):
    """Return the paragraph text of a Word document as one string.

    Only ``Document.paragraphs`` is read; each paragraph's ``text`` is
    joined to the next with a single space.

    Args:
        docx_path: Path of the .docx file to open with ``docx.Document``.

    Returns:
        All paragraph texts separated by single spaces.
    """
    document = docx.Document(docx_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return " ".join(paragraph_texts)


def load_documents(directory):
    """Extract the text of every PDF and Word document in a directory.

    Files are visited in sorted name order so the returned list (and any
    seeded model trained on it) is the same from run to run; ``os.listdir``
    on its own returns entries in arbitrary order. Extensions are matched
    case-insensitively, so ``REPORT.PDF`` is picked up as well.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        A list with one extracted-text string per ``.pdf`` / ``.docx`` file.
        Files with any other extension are ignored.
    """
    docs = []
    for filename in sorted(os.listdir(directory)):
        # Microsoft Office drops "~$name.docx" lock files next to open
        # documents; they are not real documents and cannot be parsed.
        if filename.startswith('~$'):
            continue
        file_path = os.path.join(directory, filename)
        extension = os.path.splitext(filename)[1].lower()
        if extension == '.pdf':
            docs.append(extract_text_from_pdf(file_path))
        elif extension == '.docx':
            docs.append(extract_text_from_docx(file_path))
    return docs

# Folder holding the policy documents. Set the POLICY_DOCS_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original author's local path is used.
directory = os.environ.get(
    "POLICY_DOCS_DIR",
    r"C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-Sing",
)

# Raw extracted text, one string per PDF/DOCX file found in `directory`.
docs = load_documents(directory)



In [3]:
import re
import spacy
import nltk

# Fetch the NLTK 'punkt' and 'stopwords' resources.
# NOTE(review): no visible cell calls into nltk after these downloads;
# stop-word removal below relies on spaCy's `token.is_stop` — confirm whether
# these downloads are still needed.
nltk.download('punkt')
nltk.download('stopwords')

# Load spaCy's 'en_core_web_sm' pipeline; preprocess_text() reads it as the global `nlp`.
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text, min_token_len=1):
    """Normalise raw document text into a space-separated string of lemmas.

    The text is lower-cased, digit runs are removed, and the result is run
    through the global spaCy pipeline ``nlp``. Only tokens that are purely
    alphabetic and are not stop words are kept, each replaced by its lemma.

    Args:
        text: Raw text of one document.
        min_token_len: Minimum lemma length to keep. The default of 1 keeps
            every lemma (the original behaviour). Passing 2 drops
            single-letter fragments such as the "e", "t" and "n" that show
            up among the top words in the topic output.

    Returns:
        The kept lemmas joined by single spaces, with no leading or
        trailing whitespace. An empty string if nothing survives.
    """
    # Lower-case first, then strip digit runs before tokenising.
    digit_free = re.sub(r'\d+', '', text.lower())
    lemmas = [
        token.lemma_
        for token in nlp(digit_free)
        if token.is_alpha
        and not token.is_stop
        and len(token.lemma_) >= min_token_len
    ]
    # Collapse any whitespace run so the result can be split on spaces later.
    return re.sub(r'\s+', ' ', " ".join(lemmas)).strip()

# Clean every loaded document; the result lines up index-for-index with `docs`.
processed_docs = list(map(preprocess_text, docs))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\u\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\u\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
import pyLDAvis
from gensim import corpora, models

# pyLDAvis 3.3 renamed its gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`. Try the new name first and fall back to the old
# one so the cell runs on either side of the rename.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis


# Each pre-processed document is a space-separated string of lemmas.
tokenized_docs = [doc.split() for doc in processed_docs]

# Map every distinct lemma to an integer id, then encode each document as a
# bag of (id, count) pairs.
dictionary = corpora.Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Fixed random_state so repeated runs on the same corpus give the same topics.
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10, random_state=100)

# Show the ten highest-weighted words of every topic.
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")


pyLDAvis.enable_notebook()

vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

# Write the standalone HTML first (file name typo "Singapre" corrected) ...
pyLDAvis.save_html(vis_data, 'Singapore_lda_visualization.html')

# ... and keep display() as the LAST expression of the cell: it returns the
# object to render, so calling it mid-cell discarded it and showed nothing.
pyLDAvis.display(vis_data)



Topic 0: 0.042*"solar" + 0.032*"energy" + 0.012*"reset" + 0.010*"system" + 0.010*"green" + 0.009*"singapore" + 0.007*"power" + 0.007*"sustainable" + 0.007*"deployment" + 0.007*"source"
Topic 1: 0.038*"agreement" + 0.037*"article" + 0.021*"party" + 0.018*"shall" + 0.018*"paragraph" + 0.018*"footnote" + 0.017*"include" + 0.014*"force" + 0.014*"annex" + 0.010*"year"
Topic 2: 0.034*"hydrogen" + 0.016*"carbon" + 0.014*"singapore" + 0.013*"fuel" + 0.013*"low" + 0.010*"e" + 0.010*"t" + 0.009*"n" + 0.008*"power" + 0.008*"sector"
Topic 3: 0.020*"energy" + 0.019*"singapore" + 0.016*"carbon" + 0.013*"hydrogen" + 0.012*"green" + 0.011*"target" + 0.011*"new" + 0.008*"sustainable" + 0.008*"low" + 0.007*"reduce"
Topic 4: 0.039*"singapore" + 0.039*"energy" + 0.012*"solar" + 0.011*"alternative" + 0.009*"approach" + 0.008*"power" + 0.008*"carbon" + 0.007*"technology" + 0.007*"climate" + 0.007*"future"


### Word Count

In [35]:
pip install wordcloud matplotlib


Note: you may need to restart the kernel to use updated packages.


In [6]:
from collections import Counter

# Pool every pre-processed document into one string, then tally how often
# each lemma occurs across the whole corpus.
all_text = " ".join(processed_docs)
word_counts = Counter()
word_counts.update(all_text.split())

In [10]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.graph_objs as go
from PIL import Image
import numpy as np
from collections import Counter


# Recompute the corpus-wide lemma frequencies here so this cell does not rely
# on the previous cell having been executed.
all_text = ' '.join(processed_docs)
word_counts = Counter(all_text.split())

# Single source of truth for the output file, so the confirmation message
# printed at the end always names the file that was actually written.
output_html = 'Singapore_wordcloud.html'

# Render at most the 200 most frequent lemmas on a white 800x400 canvas.
wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=200).generate_from_frequencies(word_counts)

# WordCloud -> PIL image -> pixel array, which is what the plotly Image
# trace below is given as `z`.
wordcloud_image = wordcloud.to_image()
wordcloud_array = np.array(wordcloud_image)

trace = go.Image(z=wordcloud_array)

# Hide grid lines, tick labels and margins so only the picture is shown.
layout = go.Layout(
    xaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
    yaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
    margin=dict(l=0, r=0, b=0, t=0)
)

fig = go.Figure(data=[trace], layout=layout)

fig.write_html(output_html)

print(f"Word cloud saved as '{output_html}'.")


Word cloud saved as 'wordcloud.html'.
