<a href="https://colab.research.google.com/github/jeremychia/singapore-parliament-speeches/blob/main/Parliamentary_Data_Topic_Modelling_(LDA).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install bigquery client library
!pip install google-cloud-bigquery --quiet

In [2]:
# authenticate to GCP
from google.colab import auth
auth.authenticate_user()

In [3]:
# set up project id and dataset
project_id = "singapore-parliament-speeches"  # Replace with your GCP project ID
dataset_id = "singapore-parliament-speeches.prod_mart.mart_speeches"

In [4]:
# create a bigquery client
from google.cloud import bigquery
client = bigquery.Client(project=project_id)

In [5]:
from datetime import datetime

today = datetime.today()

date_yyyymmdd = today.strftime('%Y%m%d')
date_yyyymmdd = str(date_yyyymmdd)

In [6]:
# define query
query = f"""
SELECT
  topic_id,
  topic_title,
  speech_text
FROM {dataset_id}
WHERE
  NOT (
    topic_type = 'BI'
    OR (
        topic_type = 'OS'
        AND (
            LOWER(topic_title) LIKE ANY(
                '%adjournment%',
                'extension%',
                'exempted business%',
                '%estimates of expenditure%',
                '%estimate of expenditure%',
                '%estimates for the%',
                "%president's address%",
                'business motion',
                '%time limit%',
                '%visit%',
                'committee of supply reporting progress',
                'rearrangement of business',
                'suspension of standing%',
                'election of speaker',
                'administration of oath',
                ' '
            )
        )
    )
  )
ORDER BY date DESC, speech_id
"""

In [7]:
# run query
import pandas as pd
query_job = client.query(query)
df = query_job.to_dataframe()

In [8]:
# compbine text by topic

def combine_text_by_topic(df):

  combined_text = {}
  for index, row in df.iterrows():
    topic_id = row['topic_id']
    text = row['speech_text']
    if topic_id not in combined_text:
      combined_text[topic_id] = row['topic_title']
    combined_text[topic_id] += ' ' + text

  result_df = pd.DataFrame.from_dict(combined_text, orient='index', columns=['speech_text'])
  result_df.reset_index(inplace=True)
  result_df.rename(columns={'index': 'topic_id'}, inplace=True)

  return result_df

# execute
ct_df = combine_text_by_topic(df)

In [9]:
ct_df.head()

Unnamed: 0,topic_id,speech_text
0,2024-03-07-T-001,"Point of Order Order. Sir, can I ask you to ex..."
1,2024-03-07-T-002,Ensuring Mental Well-being of National Service...
2,2024-03-07-T-003,Capacity Increase at IMH and Alexandra Hospita...
3,2024-03-07-T-004,Selection of Organisations to Operate Active A...
4,2024-03-07-T-005,Review of HDB Flat Size Eligibility Criterion ...


In [10]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# text preprocessing functions
def lowercase_text(text):
  return text.lower()

def remove_punctuation(text):
  punctuation = string.punctuation
  return "".join([char for char in text if char not in punctuation])

def remove_stopwords(text, custom_stopwords=[]):
  stopword_list = stopwords.words('english')
  stopword_list.extend(custom_stopwords)

  words = [word for word in text.split() if word not in stopword_list]
  return " ".join(words)

def lemmatize_text(text):
  lemmatizer = WordNetLemmatizer()
  words = [lemmatizer.lemmatize(word) for word in text.split()]
  return " ".join(words)

def clean_text(text):
  cleaned_text = re.sub(r"[^\x00-\x7F]+", "", text)  # Remove non-ASCII characters
  cleaned_text = re.sub(r"\d+", "", cleaned_text)  # Remove numbers
  cleaned_text = re.sub(r"\s+", " ", cleaned_text)  # Replace multiple spaces with a single space
  cleaned_text = re.sub(r"\n|\t|\r|\f", "", cleaned_text)  # Remove newlines, tabs, carriage returns, and form feeds
  return cleaned_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


The following stop words are removed.

Stopwords, comprising common words like "and," "the," and "is," are typically removed in text analysis tasks for their high frequency and low semantic value. This removal reduces data noise, lowers dimensionality, and improves model performance by focusing on more meaningful terms. Moreover, excluding stopwords enhances interpretability and normalization of text data, aiding in more efficient and effective text analysis and modeling processes.

| Category                     | Stopwords                                         |
|------------------------------|---------------------------------------------------|
| Government and Political     | parliament, ministry, minister, parliamentary,   |
|                              | mr, speaker, member, deputy, indranee, rajah,    |
|                              | zaqy, mohamad, yien, hai, fu, grace, leader,    |
|                              | house                                             |
| Terms                        |                                                   |
| General Stopwords            | also, year, time, need, new, would, one, may,   |
|                              | many, like, whether, u, make, public, take,      |
|                              | well, even, example, text, sitting, act,         |
|                              | edition, read, printed, adjourned, adjourn,      |
|                              | resolved, order, assent, proceeding, chapter,     |
|                              | revised, amend, presented, second, available,    |
|                              | amendment, consequential, state, debate,        |
|                              | tomorrow, resumption, day, beg, fixed, stand,    |
|                              | date, today, accordingly, sit, exempted,         |
|                              | provision, present, general, paper, item,        |
|                              | today, allotted, supply, committee,              |
|                              | consideration, th, nd, rd, pursuant, minute,     |
|                              | pm, rising, speech, respect, discussion,         |
|                              | agreed, january, february, march, april,        |
|                              | may, june, july, august, september, october,    |
|                              | november, december, fy, leave, progress,        |
|                              | chair, head, said                                |
| Specific Terms and Names     | bill, first, question, continue, must, ensure,   |
|                              | proc, amendment, consequential, debate,         |
|                              | according, thursday, friday, saturday, sunday,  |
|                              | kim, gan, desmond, lee, yong, hon, provision,   |
|                              | mdm                                               |

In [11]:
custom_stopwords = ['parliament', 'ministry', 'minister',
                    'parliamentary', 'mr', 'speaker', 'asked', 'sir', 'thank',
                    'also', 'year', 'time', 'need', 'new', 'government',
                    'would', 'one', 'may', 'many', 'year', 'member',
                    'like', 'whether', 'u', 'make', 'public', 'take',
                    'bill', 'first', 'question', 'well', 'continue', 'must',
                    'ensure', 'even', 'example', 'proc', 'text', 'sitting',
                    'act', 'edition', 'read', 'printed', 'adjourned', 'adjourn',
                    'resolved', 'order', 'assent', 'standing', 'consent', 'proceeding',
                    'chapter', 'revised', 'amend', 'presented', 'second', 'available',
                    'amendment', 'consequential', 'state', 'debate', 'tomorrow',
                    'resumption', 'day', 'deputy',  'beg', 'indranee', 'rajah',
                    'zaqy', 'mohamad', 'yien', 'hai', 'fu', 'grace', 'fixed',
                    'stand', 'date', 'today', 'accordingly', 'sit', 'exempted',
                    'member', 'provision', 'present', 'general', 'paper', 'item',
                    'today', 'allotted', 'supply', 'committee', 'consideration',
                    'th', 'nd', 'rd', 'pursuant', 'minute', 'pm',
                    'hen', 'eng', 'ng', 'dr', 'monday', 'tuesday', 'wednesday',
                    'thursday', 'friday', 'saturday', 'sunday', 'rising',
                    'kim', 'gan', 'desmond', 'lee', 'yong', 'member', 'hon',
                    'speech', 'proceeding', 'respect', 'discussion', 'leader',
                    'provision', 'agreed', 'house', 'january', 'february', 'march',
                    'april', 'may', 'june', 'july', 'august', 'september',
                    'october', 'november', 'december', 'fy', 'leave', 'progress',
                    'chair', 'head', 'mdm', 'said', 'interruption', 'minute',
                    'propose', 'extend', 'moment', 'day', 'period', 'completion',
                    'business', 'days', 'today', 'facilitate', 'singapore',
                    'singaporean', 'ma', 'rgb', 'stylecolor', 'span', 'u']

ct_df['speech_text'] = ct_df['speech_text'].apply(lowercase_text)
ct_df['speech_text'] = ct_df['speech_text'].apply(remove_punctuation)
ct_df['speech_text'] = ct_df['speech_text'].apply(remove_stopwords, custom_stopwords=custom_stopwords)
ct_df['speech_text'] = ct_df['speech_text'].apply(lemmatize_text)
ct_df['speech_text'] = ct_df['speech_text'].apply(clean_text)

In [12]:
ct_df.iloc[4,1]

'review hdb flat size eligibility criterion silver support scheme manpower view senior live mature estate whose hdb flat shorter remaining lease lower resale value consider reassessing use hdb flat type eligibility criterion silver support scheme especially regard ownership room larger hdb flat disqualifies senior receiving silver support silver support scheme targeted senior lower income working year little family support resource retirement property ownership indicative seniors resource remains relevant ensuring silver support scheme targeted senior particular senior room larger housing development board hdb flat likely resource access additional retirement fund hence eligible silver support nonetheless senior face unique circumstance including room larger hdb flat short remaining lease low annual value write central provident fund cpf board review eligibility silver support consider merit appeal casebycase basis strengthen support retirement adequacy announced enhancement silver sup

In [13]:
import time
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import matplotlib.pyplot as plt

texts = ct_df['speech_text']

# Preprocess data
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(texts)

# Create a Gensim Dictionary
dictionary = Dictionary(texts.apply(str.split))

# Convert sparse TF-IDF matrix to Gensim corpus
corpus = [dictionary.doc2bow(doc.split()) for doc in texts]

# Define a number of topics
num_topics = 26

In [15]:
lda_model = LdaModel(corpus=corpus, num_topics=num_topics,
                         id2word=dictionary, random_state=42,
                     passes=10)

In [16]:
for i in range(num_topics):
  print(f"Topic {i}")
  print(lda_model.print_topic(i, topn=20))

Topic 0
0.024*"police" + 0.020*"officer" + 0.017*"offence" + 0.016*"case" + 0.016*"enforcement" + 0.013*"penalty" + 0.008*"action" + 0.008*"security" + 0.008*"home" + 0.007*"crime" + 0.007*"person" + 0.007*"agency" + 0.007*"investigation" + 0.007*"illegal" + 0.007*"victim" + 0.007*"fine" + 0.006*"criminal" + 0.006*"offender" + 0.005*"number" + 0.005*"breach"
Topic 1
0.015*"court" + 0.013*"law" + 0.011*"amendment" + 0.011*"clause" + 0.010*"case" + 0.010*"section" + 0.009*"legal" + 0.007*"power" + 0.007*"person" + 0.007*"proposed" + 0.006*"party" + 0.006*"pension" + 0.005*"dispute" + 0.005*"process" + 0.004*"claim" + 0.004*"member" + 0.004*"service" + 0.004*"made" + 0.004*"provision" + 0.004*"right"
Topic 2
0.016*"sector" + 0.014*"company" + 0.014*"industry" + 0.010*"job" + 0.010*"support" + 0.009*"growth" + 0.009*"smes" + 0.009*"productivity" + 0.009*"economy" + 0.008*"help" + 0.007*"programme" + 0.007*"opportunity" + 0.007*"business" + 0.006*"technology" + 0.006*"local" + 0.006*"skill"

In [17]:
from gensim.utils import simple_preprocess

def get_topic_distribution(text):

    tokens = text.split()
    bow_vector = dictionary.doc2bow(tokens)
    topic_distribution = lda_model.get_document_topics(bow_vector)

    return topic_distribution

# Apply the function to each row of the DataFrame
ct_df['topic_distribution'] = ct_df['speech_text'].apply(get_topic_distribution)

In [18]:
ct_df

Unnamed: 0,topic_id,speech_text,topic_distribution
0,2024-03-07-T-001,point ask exercise power reinstate half hour q...,"[(1, 0.02571733), (18, 0.90088326), (21, 0.067..."
1,2024-03-07-T-002,ensuring mental wellbeing national serviceman ...,"[(3, 0.5434681), (4, 0.34441826), (11, 0.06563..."
2,2024-03-07-T-003,capacity increase imh alexandra hospital patie...,"[(3, 0.7616762), (8, 0.042608332), (10, 0.0304..."
3,2024-03-07-T-004,selection organisation operate active ageing c...,"[(3, 0.39930356), (5, 0.013476493), (6, 0.1193..."
4,2024-03-07-T-005,review hdb flat size eligibility criterion sil...,"[(3, 0.12190527), (10, 0.17841686), (13, 0.449..."
...,...,...,...
16083,2012-09-10-T-067,collaboration overseas university education to...,"[(1, 0.060407948), (2, 0.2707337), (15, 0.0367..."
16084,2012-09-10-T-068,allocation void deck childcare centre acting c...,"[(3, 0.68192214), (6, 0.038166948), (8, 0.0264..."
16085,2012-09-10-T-069,proper accommodation foreign worker acting man...,"[(0, 0.43784615), (6, 0.014063312), (16, 0.500..."
16086,2012-09-10-T-070,impact reduction foreign worker smes acting ma...,"[(2, 0.43806773), (5, 0.023604097), (14, 0.011..."


In [19]:
ct_df.iloc[2,1]

'capacity increase imh alexandra hospital patient seeking mental health specialist care health planned capacity increase patient seeking mental health specialist care imh ii alexandra hospital b step taken support necessary resource required alexandra hospital provide outpatient inpatient mental health specialist service including acute bed psychiatric care rehabilitation institute mental health imh recently refurbished acute psychiatric ward enhance inpatient care expanding capacity meet future demand support increase demand mental health service planning increase number psychiatrist increase number healthcare worker training psychology expand mental health service primary community setting support national mental health wellbeing strategy m nadia quick followup supplementary senior sharing training place learning opportunity think student interested pursue route could senior give little bit detail training opportunity trickle learning institution understand m nadia samdins training o

In [20]:
ct_df.iloc[2,2]

[(3, 0.7616762),
 (8, 0.042608332),
 (10, 0.030451072),
 (18, 0.015064715),
 (21, 0.1457664)]

In [21]:
def extract_topic_info(row, num_topics):
    topic_info = dict(row['topic_distribution'])
    for i in range(num_topics):
        if i in topic_info:
            row[f'topic_{i}_distribution'] = topic_info[i]
        else:
            row[f'topic_{i}_distribution'] = 0
    return row

In [22]:
def extract_highest_topic_info(topic_distribution):
    if not topic_distribution:
        return None, None
    topic_info = dict(topic_distribution)
    max_topic = max(topic_info, key=topic_info.get)
    return int(max_topic), topic_info.get(max_topic, 0)

In [23]:
ct_df = ct_df.apply(lambda row: extract_topic_info(row, num_topics), axis=1)

In [24]:
ct_df['highest_topic'], ct_df['highest_topic_distribution'] = zip(*ct_df['topic_distribution'].apply(extract_highest_topic_info))

In [25]:
ct_df.head()

Unnamed: 0,topic_id,speech_text,topic_distribution,topic_0_distribution,topic_1_distribution,topic_2_distribution,topic_3_distribution,topic_4_distribution,topic_5_distribution,topic_6_distribution,...,topic_18_distribution,topic_19_distribution,topic_20_distribution,topic_21_distribution,topic_22_distribution,topic_23_distribution,topic_24_distribution,topic_25_distribution,highest_topic,highest_topic_distribution
0,2024-03-07-T-001,point ask exercise power reinstate half hour q...,"[(1, 0.02571733), (18, 0.90088326), (21, 0.067...",0.0,0.025717,0.0,0.0,0.0,0.0,0.0,...,0.900883,0.0,0.0,0.06712,0.0,0.0,0.0,0.0,18,0.900883
1,2024-03-07-T-002,ensuring mental wellbeing national serviceman ...,"[(3, 0.5434681), (4, 0.34441826), (11, 0.06563...",0.0,0.0,0.0,0.543468,0.344418,0.0,0.0,...,0.044372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0.543468
2,2024-03-07-T-003,capacity increase imh alexandra hospital patie...,"[(3, 0.7616762), (8, 0.042608332), (10, 0.0304...",0.0,0.0,0.0,0.761676,0.0,0.0,0.0,...,0.015065,0.0,0.0,0.145766,0.0,0.0,0.0,0.0,3,0.761676
3,2024-03-07-T-004,selection organisation operate active ageing c...,"[(3, 0.39930356), (5, 0.013476493), (6, 0.1193...",0.0,0.0,0.0,0.399304,0.0,0.013476,0.119387,...,0.295047,0.0,0.0,0.0,0.0,0.0,0.0,0.026615,3,0.399304
4,2024-03-07-T-005,review hdb flat size eligibility criterion sil...,"[(3, 0.12190527), (10, 0.17841686), (13, 0.449...",0.0,0.0,0.0,0.121905,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13,0.449571


In [26]:
pivoted_distributions = ct_df[["topic_id"]+[f"topic_{i}_distribution" for i in range(num_topics)]]
unpivoted = pivoted_distributions.melt(id_vars = ['topic_id'],
                                       var_name = 'topic',
                                       value_name = 'distribution')
unpivoted['topic'] = unpivoted['topic'].str.extract('(\d+)').astype(int)
unpivoted = unpivoted[unpivoted['distribution'] != 0]
unpivoted = unpivoted.sort_values(by=['topic_id', 'topic'])
unpivoted = unpivoted.reset_index(drop=True)
unpivoted['date'] = unpivoted['topic_id'].apply(lambda x: pd.to_datetime(x[:10]))

In [27]:
unpivoted

Unnamed: 0,topic_id,topic,distribution,date
0,2012-09-10-T-001,1,0.109790,2012-09-10
1,2012-09-10-T-001,5,0.054308,2012-09-10
2,2012-09-10-T-001,18,0.752902,2012-09-10
3,2012-09-10-T-001,25,0.047722,2012-09-10
4,2012-09-10-T-002,0,0.041304,2012-09-10
...,...,...,...,...
89866,2024-03-07-T-041,11,0.051949,2024-03-07
89867,2024-03-07-T-041,16,0.486436,2024-03-07
89868,2024-03-07-T-041,23,0.182859,2024-03-07
89869,2024-03-07-T-042,8,0.569613,2024-03-07


In [28]:
dataset_id = "topic_modelling"
table_id = f"topic_distribution_{num_topics}_lda_{date_yyyymmdd}"

unpivoted.to_gbq(destination_table=f"{dataset_id}.{table_id}",
          project_id=project_id,
          if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 2437.13it/s]


In [29]:
highest_topic = ct_df[["topic_id", "highest_topic", "highest_topic_distribution"]]
highest_topic['date'] = highest_topic['topic_id'].apply(lambda x: pd.to_datetime(x[:10]))
highest_topic = highest_topic[~highest_topic['highest_topic'].isna()]
highest_topic['highest_topic'] = highest_topic['highest_topic'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highest_topic['date'] = highest_topic['topic_id'].apply(lambda x: pd.to_datetime(x[:10]))


In [30]:
highest_topic

Unnamed: 0,topic_id,highest_topic,highest_topic_distribution,date
0,2024-03-07-T-001,18,0.900883,2024-03-07
1,2024-03-07-T-002,3,0.543468,2024-03-07
2,2024-03-07-T-003,3,0.761676,2024-03-07
3,2024-03-07-T-004,3,0.399304,2024-03-07
4,2024-03-07-T-005,13,0.449571,2024-03-07
...,...,...,...,...
16083,2012-09-10-T-067,21,0.578329,2012-09-10
16084,2012-09-10-T-068,3,0.681922,2012-09-10
16085,2012-09-10-T-069,16,0.500938,2012-09-10
16086,2012-09-10-T-070,16,0.494803,2012-09-10


In [31]:
dataset_id = "topic_modelling"
table_id = f"highest_topic_{num_topics}_lda_{date_yyyymmdd}"

highest_topic.to_gbq(destination_table=f"{dataset_id}.{table_id}",
          project_id=project_id,
          if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 1731.75it/s]


In [32]:
!pip install openai --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/262.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/262.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.9/262.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [33]:
from openai import OpenAI
from google.colab import userdata

client = OpenAI(
    api_key = userdata.get('OPENAI_API_KEY')
)

In [34]:
# Function to get summarization from ChatGPT
def get_summarization(topic_top_n_words):
    response = client.chat.completions.create(
      model="gpt-3.5-turbo-0125",
      messages=[
        {"role": "system", "content": "You're a political text analyst focusing on Singapore Parliament Speeches. Given top words, provide a 1-2 word summary of the topic."},
        {"role": "user", "content": f"The words are: {topic_top_n_words}. What's a summary in 1-2 words?"}
      ]
    )
    return response.choices[0].message.content

def extract_topic_words(topic_repr):
    # Split the representation by "+" to get individual terms
    terms = topic_repr.split('+')
    topic_words = []
    for term in terms:
        # Extract the word (text between quotes) using regular expression
        word = re.findall(r'"([^"]*)"', term)
        # Append the extracted word to the list of topic words
        if word:
            topic_words.append(word[0])
    return topic_words

In [35]:
topic_words_list = []

for i in range(num_topics):
    topic_repr = lda_model.print_topic(i, topn=15)
    topic_words = extract_topic_words(topic_repr)
    topic_words_list.append(topic_words)

In [36]:
for i in range(num_topics):
  print(' '.join(topic_words_list[i]))

police officer offence case enforcement penalty action security home crime person agency investigation illegal victim
court law amendment clause case section legal power person proposed party pension dispute process claim
sector company industry job support growth smes productivity economy help programme opportunity business technology local
health care healthcare service patient centre community support medical childcare mental operator hospital senior programme
defence saf national air security training water n serviceman threat mindef pub level plastic force
company consumer market industry product contract business trade price retailer good service sale retail competition
hdb town resident council area development estate construction work project lift site building park space
drug community religious society people prison race group threat inmate law youth rehabilitation social religion
community art sport national youth support programme work artist culture organisation fund group

In [37]:
topic_summaries = []
for topic_words in topic_words_list:
    topic_top_n_words = ' '.join(topic_words)
    summary = get_summarization(topic_top_n_words)
    topic_summaries.append(summary)

In [38]:
topic_summaries = [topic.strip('"').title() for topic in topic_summaries]

In [39]:
for i, summary in enumerate(topic_summaries):
    print(f"Topic {i}: {summary}")

Topic 0: Law Enforcement
Topic 1: Legal Reform
Topic 2: Economic Development
Topic 3: Healthcare Services
Topic 4: Military Defense.
Topic 5: Commerce & Retail
Topic 6: Urban Development
Topic 7: Social Welfare
Topic 8: Cultural Development
Topic 9: Family Support
Topic 10: Public Support
Topic 11: Fire Safety
Topic 12: Public Health
Topic 13: Housing Policy.
Topic 14: Digital Services
Topic 15: Retirement Savings.
Topic 16: Labour Market
Topic 17: Finance Industry
Topic 18: Debates.
Topic 19: Hawker Centers
Topic 20: Sustainable Development
Topic 21: Education Policy
Topic 22: Public Transport.
Topic 23: Economic Policies
Topic 24: International Relations
Topic 25: Economic Growth


In [40]:
topic_names = pd.DataFrame({
    'topic': [i for i in range(num_topics)],
    'topic_summary': topic_summaries,
    'top_n_words': [' '.join(topic_words) for topic_words in topic_words_list]
})

In [41]:
dataset_id = "topic_modelling"
table_id = f"topic_names_{num_topics}_lda_{date_yyyymmdd}"

topic_names.to_gbq(destination_table=f"{dataset_id}.{table_id}",
          project_id=project_id,
          if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 7653.84it/s]


In [42]:
# Save LDA Model

from google.colab import drive
import os
drive.mount('/content/drive')

# Define the directory path
directory = "/content/drive/My Drive/singapore-parliament-speeches/"

# Create the directory if it doesn't exist
if not os.path.exists(directory):
    os.makedirs(directory)

lda_model.save(f"{directory}model_{num_topics}_lda_{date_yyyymmdd}")


# Load the LDA model from the 'singapore-parliament-speeches' folder in Google Drive
# lda_model_path = "f/content/drive/My Drive/singapore-parliament-speeches/lda_model_{num_topics}_topics"
# loaded_lda_model = LdaModel.load(lda_model_path)

Mounted at /content/drive
