In [26]:
import os
import pdfplumber
import docx


def extract_text_from_pdf(pdf_path):
    text = ''
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() or ''
    return text


def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    return " ".join([para.text for para in doc.paragraphs])


def load_documents(directory):
    docs = []
    files_in_directory = os.listdir(directory)
    
    for filename in files_in_directory:
        file_path = os.path.join(directory, filename)
        if filename.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
            if text:
                docs.append(text)
        elif filename.endswith('.docx'):
            text = extract_text_from_docx(file_path)
            if text:
                docs.append(text)
    return docs


directories = {
    'AUS': r"C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-AUS",
    'CHN': r"C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-China",
    'IND': r"C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-Ind",
    'SING': r"C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-Sing"
}


country_docs = {}


for country, directory in directories.items():
    print(f"Loading documents for {country} from {directory}")
    docs = load_documents(directory)  
    if docs:
        country_docs[country] = docs  
        print(f"Loaded {len(docs)} documents for {country}\n")
    else:
        print(f"No documents loaded for {country}\n")


print(f"Total countries loaded: {len(country_docs)}")


Loading documents for AUS from C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-AUS
Loaded 18 documents for AUS

Loading documents for CHN from C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-China
Loaded 20 documents for CHN

Loading documents for IND from C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-Ind
Loaded 15 documents for IND

Loading documents for SING from C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-Sing
Loaded 8 documents for SING

Total countries loaded: 4


In [28]:
import re
import spacy


nlp = spacy.load('en_core_web_sm')


def preprocess_text(text):
   
    lower_text = re.sub(r'\d+', '', text.lower())  
    doc = nlp(lower_text)
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]  
    clean_text = " ".join(tokens)
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()  
    return clean_text


def clean_country_documents(country_docs):
    cleaned_docs_by_country = {}  
    
    for country, docs in country_docs.items():
        print(f"Cleaning documents for {country}...")
        cleaned_docs = [preprocess_text(doc) for doc in docs]  
        cleaned_docs_by_country[country] = cleaned_docs
        print(f"Finished cleaning {len(docs)} documents for {country}\n")
    
    return cleaned_docs_by_country


cleaned_docs_by_country = clean_country_documents(country_docs)


Cleaning documents for AUS...
Finished cleaning 18 documents for AUS

Cleaning documents for CHN...
Finished cleaning 20 documents for CHN

Cleaning documents for IND...
Finished cleaning 15 documents for IND

Cleaning documents for SING...
Finished cleaning 8 documents for SING



In [29]:
import pyLDAvis
import pyLDAvis.gensim as gensimvis
from gensim import corpora, models


tokenized_docs = [doc.split() for doc in combined_cleaned_docs]
dictionary = corpora.Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10, random_state=100)
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

pyLDAvis.enable_notebook()

vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

pyLDAvis.display(vis_data)

pyLDAvis.save_html(vis_data, 'combined_lda.html')
print("LDA visualization saved as 'combined_lda.html'")


Topic 0: 0.031*"wind" + 0.016*"farm" + 0.012*"committee" + 0.011*"energy" + 0.010*"mr" + 0.009*"turbine" + 0.008*"health" + 0.008*"submission" + 0.007*"state" + 0.006*"noise"
Topic 1: 0.027*"licence" + 0.023*"subclause" + 0.017*"provide" + 0.016*"clause" + 0.014*"offshore" + 0.013*"person" + 0.011*"infrastructure" + 0.011*"provision" + 0.010*"area" + 0.010*"bill"
Topic 2: 0.017*"solar" + 0.014*"power" + 0.009*"plant" + 0.009*"project" + 0.009*"date" + 0.007*"consumer" + 0.007*"electricity" + 0.006*"state" + 0.006*"rs" + 0.006*"include"
Topic 3: 0.020*"energy" + 0.015*"cefc" + 0.015*"project" + 0.010*"grant" + 0.009*"investment" + 0.009*"bill" + 0.009*"clean" + 0.008*"fund" + 0.007*"technology" + 0.007*"grid"
Topic 4: 0.018*"australia" + 0.017*"energy" + 0.016*"emission" + 0.013*"technology" + 0.010*"hydrogen" + 0.008*"low" + 0.007*"carbon" + 0.007*"project" + 0.007*"government" + 0.006*"australian"
LDA visualization saved as 'combined_lda.html'


In [30]:
from collections import Counter

all_text = ' '.join(combined_cleaned_docs)

word_counts = Counter(all_text.split())


In [31]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.graph_objs as go
from PIL import Image
import numpy as np
from collections import Counter


all_text = ' '.join(combined_cleaned_docs)  
word_counts = Counter(all_text.split())
wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=200).generate_from_frequencies(word_counts)
wordcloud_image = wordcloud.to_image()
wordcloud_array = np.array(wordcloud_image)
trace = go.Image(z=wordcloud_array)


layout = go.Layout(
    xaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
    yaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
    margin=dict(l=0, r=0, b=0, t=0)
)


fig = go.Figure(data=[trace], layout=layout)
fig.write_html('combined_wordcloud.html')

print("Word cloud saved as 'combined_wordcloud.html'.")


Word cloud saved as 'combined_wordcloud.html'.
