### Topic Modeling

In [1]:
!pip install nltk
!pip install pandas
!pip install matplotlib
!pip install networkx  
!pip install pdfplumber
!pip install python-docx



In [2]:
!pip install gensim



In [3]:
!pip install pyLDAvis



In [10]:
import os
import pdfplumber
import docx

def extract_text_from_pdf(pdf_path):
    text = ''
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() or ''
    return text

def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    return " ".join([para.text for para in doc.paragraphs])


def load_documents(directory):
    docs = []
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.endswith('.pdf'):
            docs.append(extract_text_from_pdf(file_path))
        elif filename.endswith('.docx'):
            docs.append(extract_text_from_docx(file_path))
    return docs

directory = r"C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-AUS"

docs = load_documents(directory)



In [13]:
import re
import spacy
import nltk

nltk.download('punkt')
nltk.download('stopwords')

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    
    lower_text = re.sub(r'\d+', '', text.lower())
    doc = nlp(lower_text)
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    clean_text = " ".join(tokens)
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    return clean_text

processed_docs = [preprocess_text(doc) for doc in docs]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\u\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\u\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
import pyLDAvis
import pyLDAvis.gensim as gensimvis
from gensim import corpora, models

tokenized_docs = [doc.split() for doc in processed_docs]

dictionary = corpora.Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10, random_state=100)

topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")


pyLDAvis.enable_notebook()

vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)
pyLDAvis.save_html(vis_data, 'Aus_lda.html')


Topic 0: 0.028*"australia" + 0.021*"bill" + 0.019*"future" + 0.009*"australian" + 0.009*"arena" + 0.009*"government" + 0.009*"act" + 0.008*"energy" + 0.008*"minister" + 0.007*"omnibus"
Topic 1: 0.027*"licence" + 0.023*"subclause" + 0.017*"provide" + 0.016*"clause" + 0.014*"offshore" + 0.013*"person" + 0.011*"infrastructure" + 0.011*"provision" + 0.010*"area" + 0.009*"bill"
Topic 2: 0.024*"energy" + 0.024*"cefc" + 0.014*"technology" + 0.012*"investment" + 0.011*"ccs" + 0.011*"bill" + 0.010*"australia" + 0.009*"op" + 0.009*"clean" + 0.009*"submission"
Topic 3: 0.031*"wind" + 0.017*"farm" + 0.012*"committee" + 0.011*"energy" + 0.009*"mr" + 0.009*"turbine" + 0.008*"health" + 0.007*"submission" + 0.007*"state" + 0.006*"noise"
Topic 4: 0.024*"emission" + 0.020*"australia" + 0.017*"technology" + 0.015*"energy" + 0.011*"project" + 0.010*"low" + 0.009*"australian" + 0.008*"cost" + 0.008*"government" + 0.008*"hydrogen"


### Word Count

In [35]:
pip install wordcloud matplotlib


Note: you may need to restart the kernel to use updated packages.


In [15]:
from collections import Counter

all_text = ' '.join(processed_docs)  
word_counts = Counter(all_text.split())  

In [17]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.graph_objs as go
from PIL import Image
import numpy as np
from collections import Counter


wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=150).generate_from_frequencies(word_counts)
wordcloud_image = wordcloud.to_image()
wordcloud_array = np.array(wordcloud_image)
if wordcloud_array.dtype != np.uint8:
    wordcloud_array = wordcloud_array.astype(np.uint8)

trace = go.Image(z=wordcloud_array)
layout = go.Layout(
    xaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
    yaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
    margin=dict(l=0, r=0, b=0, t=0)
)

fig = go.Figure(data=[trace], layout=layout)

fig.write_html('Aus_wordcloud.html')

print("Word cloud saved as 'Aus_wordcloud.html'.")


Word cloud saved as 'Aus_wordcloud.html'.
