### Topic Modeling

In [1]:
!pip install nltk
!pip install pandas
!pip install matplotlib
!pip install networkx  
!pip install pdfplumber
!pip install python-docx



In [2]:
!pip install gensim



In [3]:
!pip install pyLDAvis



In [1]:
import os
import pdfplumber
import docx

def extract_text_from_pdf(pdf_path):
    text = ''
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or ''
    except Exception as e:
        print(f"Failed to process PDF {pdf_path}: {str(e)}")
    return text

def extract_text_from_docx(docx_path):
    text = ''
    try:
        doc = docx.Document(docx_path)
        text = " ".join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"Failed to process DOCX {docx_path}: {str(e)}")
    return text

def load_documents(directory):
    docs = []
    if not os.path.exists(directory):
        print(f"Directory not found: {directory}")
        return docs
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        print(f"Processing file: {file_path}")  # Debug: Print the path being processed
        if filename.endswith('.pdf'):
            docs.append(extract_text_from_pdf(file_path))
        elif filename.endswith('.docx'):
            docs.append(extract_text_from_docx(file_path))
    return docs

directory = r"C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-China"

docs = load_documents(directory)
print(f"Loaded {len(docs)} documents.")


Processing file: C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-China\Central Budget and Final Accounts Public Platform.pdf
Processing file: C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-China\China's National Hydrogen Development Plan - Energy Iceberg.pdf
Processing file: C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-China\Letter from the General Office of the National Energy Administration soliciting opinions on the Notice on the Implementation of the Renewable Energy Power Quota System.pdf
Processing file: C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-China\Notice on Issuing the 13th Five-Year Plan for Geothermal Energy Development and Utilization---National Energy Administration.pdf
Processing file: C:\Users\u\OneDrive - Swinburne University\Policy Documents\Policy docs-China\The Ministry of Finance issued a supplementary notice on the _Interim Measures for the Administration of S

In [2]:
import re
import spacy
import nltk

nltk.download('punkt')
nltk.download('stopwords')

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    
    lower_text = re.sub(r'\d+', '', text.lower())
    doc = nlp(lower_text)
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    clean_text = " ".join(tokens)
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    return clean_text

processed_docs = [preprocess_text(doc) for doc in docs]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\u\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\u\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import pyLDAvis
import pyLDAvis.gensim as gensimvis
from gensim import corpora, models


tokenized_docs = [doc.split() for doc in processed_docs]


dictionary = corpora.Dictionary(tokenized_docs)

corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10, random_state=100)


topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")


pyLDAvis.enable_notebook()  
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)
pyLDAvis.save_html(vis_data, 'China_lda.html')


Topic 0: 0.001*"energy" + 0.001*"power" + 0.001*"china" + 0.001*"development" + 0.001*"project" + 0.001*"green" + 0.001*"hydrogen" + 0.001*"national" + 0.001*"new" + 0.001*"reform"
Topic 1: 0.027*"carbon" + 0.016*"development" + 0.013*"china" + 0.011*"emission" + 0.010*"ndrc" + 0.008*"energy" + 0.008*"say" + 0.008*"peak" + 0.007*"green" + 0.007*"plan"
Topic 2: 0.033*"energy" + 0.028*"power" + 0.024*"project" + 0.020*"photovoltaic" + 0.019*"hydrogen" + 0.014*"development" + 0.014*"national" + 0.014*"generation" + 0.011*"green" + 0.010*"scale"
Topic 3: 0.021*"development" + 0.020*"china" + 0.014*"power" + 0.014*"energy" + 0.011*"national" + 0.010*"hydrogen" + 0.008*"policy" + 0.008*"reform" + 0.008*"solar" + 0.008*"project"
Topic 4: 0.023*"energy" + 0.018*"subsidy" + 0.018*"fund" + 0.015*"renewable" + 0.015*"power" + 0.012*"finance" + 0.011*"ministry" + 0.010*"project" + 0.010*"year" + 0.010*"gas"


### Word Count

In [11]:
pip install wordcloud matplotlib




In [6]:
pip install plotly wordcloud


Note: you may need to restart the kernel to use updated packages.


In [15]:
from collections import Counter

all_text = ' '.join(processed_docs)  
word_counts = Counter(all_text.split()) 

In [19]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.graph_objs as go
from PIL import Image
import numpy as np
from collections import Counter


all_text = ' '.join(processed_docs)  
word_counts = Counter(all_text.split())

wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=300).generate_from_frequencies(word_counts)

wordcloud_image = wordcloud.to_image()

wordcloud_array = np.array(wordcloud_image)

trace = go.Image(z=wordcloud_array)

layout = go.Layout(
    xaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
    yaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
    margin=dict(l=0, r=0, b=0, t=0)
)


fig = go.Figure(data=[trace], layout=layout)

fig.write_html('China_wordcloud.html')

print("Word cloud saved as 'China_wordcloud.html'.")


Word cloud saved as 'China_wordcloud.html'.
