<a href="https://colab.research.google.com/github/jeremychia/singapore-parliament-speeches/blob/main/Parliamentary_Data_Topic_Modelling_(NMF).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [127]:
# install bigquery client library
!pip install google-cloud-bigquery --quiet

In [128]:
# authenticate to GCP
from google.colab import auth
auth.authenticate_user()

In [129]:
# set up project id and dataset
project_id = "singapore-parliament-speeches"  # Replace with your GCP project ID
dataset_id = "singapore-parliament-speeches.prod_mart.mart_speeches"

In [130]:
# create a bigquery client
from google.cloud import bigquery
client = bigquery.Client(project=project_id)

In [131]:
from datetime import datetime

today = datetime.today()

date_yyyymmdd = today.strftime('%Y%m%d')
date_yyyymmdd = str(date_yyyymmdd)

In [132]:
# define query
query = f"""
SELECT
  topic_id,
  topic_title,
  speech_text
FROM {dataset_id}
WHERE
  NOT (
    topic_type = 'BI'
    OR (
        topic_type = 'OS'
        AND (
            LOWER(topic_title) LIKE ANY(
                '%adjournment%',
                'extension%',
                'exempted business%',
                '%estimates of expenditure%',
                '%estimate of expenditure%',
                '%estimates for the%',
                "%president's address%",
                'business motion',
                '%time limit%',
                '%visit%',
                'committee of supply reporting progress',
                'rearrangement of business',
                'suspension of standing%',
                'election of speaker',
                'administration of oath',
                ' '
            )
        )
    )
  )
ORDER BY date DESC, speech_id
"""

In [133]:
# run query
import pandas as pd
query_job = client.query(query)
df = query_job.to_dataframe()

In [134]:
# compbine text by topic

def combine_text_by_topic(df):

  combined_text = {}
  for index, row in df.iterrows():
    topic_id = row['topic_id']
    text = row['speech_text']
    if topic_id not in combined_text:
      combined_text[topic_id] = row['topic_title']
    combined_text[topic_id] += ' ' + text

  result_df = pd.DataFrame.from_dict(combined_text, orient='index', columns=['speech_text'])
  result_df.reset_index(inplace=True)
  result_df.rename(columns={'index': 'topic_id'}, inplace=True)

  return result_df

# execute
ct_df = combine_text_by_topic(df)

In [135]:
ct_df.head()

Unnamed: 0,topic_id,speech_text
0,2024-03-07-T-001,"Point of Order Order. Sir, can I ask you to ex..."
1,2024-03-07-T-002,Ensuring Mental Well-being of National Service...
2,2024-03-07-T-003,Capacity Increase at IMH and Alexandra Hospita...
3,2024-03-07-T-004,Selection of Organisations to Operate Active A...
4,2024-03-07-T-005,Review of HDB Flat Size Eligibility Criterion ...


In [136]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# text preprocessing functions
def lowercase_text(text):
  return text.lower()

def remove_punctuation(text):
  punctuation = string.punctuation
  return "".join([char for char in text if char not in punctuation])

def remove_stopwords(text, custom_stopwords=[]):
  stopword_list = stopwords.words('english')
  stopword_list.extend(custom_stopwords)

  words = [word for word in text.split() if word not in stopword_list]
  return " ".join(words)

def lemmatize_text(text):
  lemmatizer = WordNetLemmatizer()
  words = [lemmatizer.lemmatize(word) for word in text.split()]
  return " ".join(words)

def clean_text(text):
  cleaned_text = re.sub(r"[^\x00-\x7F]+", "", text)  # Remove non-ASCII characters
  cleaned_text = re.sub(r"\d+", "", cleaned_text)  # Remove numbers
  cleaned_text = re.sub(r"\s+", " ", cleaned_text)  # Replace multiple spaces with a single space
  cleaned_text = re.sub(r"\n|\t|\r|\f", "", cleaned_text)  # Remove newlines, tabs, carriage returns, and form feeds
  return cleaned_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


The following stop words are removed.

Stopwords, comprising common words like "and," "the," and "is," are typically removed in text analysis tasks for their high frequency and low semantic value. This removal reduces data noise, lowers dimensionality, and improves model performance by focusing on more meaningful terms. Moreover, excluding stopwords enhances interpretability and normalization of text data, aiding in more efficient and effective text analysis and modeling processes.

| Category                     | Stopwords                                         |
|------------------------------|---------------------------------------------------|
| Government and Political     | parliament, ministry, minister, parliamentary,   |
|                              | mr, speaker, member, deputy, indranee, rajah,    |
|                              | zaqy, mohamad, yien, hai, fu, grace, leader,    |
|                              | house                                             |
| Terms                        |                                                   |
| General Stopwords            | also, year, time, need, new, would, one, may,   |
|                              | many, like, whether, u, make, public, take,      |
|                              | well, even, example, text, sitting, act,         |
|                              | edition, read, printed, adjourned, adjourn,      |
|                              | resolved, order, assent, proceeding, chapter,     |
|                              | revised, amend, presented, second, available,    |
|                              | amendment, consequential, state, debate,        |
|                              | tomorrow, resumption, day, beg, fixed, stand,    |
|                              | date, today, accordingly, sit, exempted,         |
|                              | provision, present, general, paper, item,        |
|                              | today, allotted, supply, committee,              |
|                              | consideration, th, nd, rd, pursuant, minute,     |
|                              | pm, rising, speech, respect, discussion,         |
|                              | agreed, january, february, march, april,        |
|                              | may, june, july, august, september, october,    |
|                              | november, december, fy, leave, progress,        |
|                              | chair, head, said                                |
| Specific Terms and Names     | bill, first, question, continue, must, ensure,   |
|                              | proc, amendment, consequential, debate,         |
|                              | according, thursday, friday, saturday, sunday,  |
|                              | kim, gan, desmond, lee, yong, hon, provision,   |
|                              | mdm                                               |

In [137]:
custom_stopwords = ['parliament', 'ministry', 'minister',
                    'parliamentary', 'mr', 'speaker', 'asked', 'sir', 'thank',
                    'also', 'year', 'time', 'need', 'new', 'government',
                    'would', 'one', 'may', 'many', 'year', 'member',
                    'like', 'whether', 'u', 'make', 'public', 'take',
                    'bill', 'first', 'question', 'well', 'continue', 'must',
                    'ensure', 'even', 'example', 'proc', 'text', 'sitting',
                    'act', 'edition', 'read', 'printed', 'adjourned', 'adjourn',
                    'resolved', 'order', 'assent', 'standing', 'consent', 'proceeding',
                    'chapter', 'revised', 'amend', 'presented', 'second', 'available',
                    'amendment', 'consequential', 'state', 'debate', 'tomorrow',
                    'resumption', 'day', 'deputy',  'beg', 'indranee', 'rajah',
                    'zaqy', 'mohamad', 'yien', 'hai', 'fu', 'grace', 'fixed',
                    'stand', 'date', 'today', 'accordingly', 'sit', 'exempted',
                    'member', 'provision', 'present', 'general', 'paper', 'item',
                    'today', 'allotted', 'supply', 'committee', 'consideration',
                    'th', 'nd', 'rd', 'pursuant', 'minute', 'pm',
                    'hen', 'eng', 'ng', 'dr', 'monday', 'tuesday', 'wednesday',
                    'thursday', 'friday', 'saturday', 'sunday', 'rising',
                    'kim', 'gan', 'desmond', 'lee', 'yong', 'member', 'hon',
                    'speech', 'proceeding', 'respect', 'discussion', 'leader',
                    'provision', 'agreed', 'house', 'january', 'february', 'march',
                    'april', 'may', 'june', 'july', 'august', 'september',
                    'october', 'november', 'december', 'fy', 'leave', 'progress',
                    'chair', 'head', 'mdm', 'said', 'interruption', 'minute',
                    'propose', 'extend', 'moment', 'day', 'period', 'completion',
                    'business', 'days', 'today', 'facilitate', 'singapore',
                    'singaporean', 'ma', 'rgb', 'stylecolor', 'span', 'u']

ct_df['speech_text'] = ct_df['speech_text'].apply(lowercase_text)
ct_df['speech_text'] = ct_df['speech_text'].apply(remove_punctuation)
ct_df['speech_text'] = ct_df['speech_text'].apply(remove_stopwords, custom_stopwords=custom_stopwords)
ct_df['speech_text'] = ct_df['speech_text'].apply(lemmatize_text)
ct_df['speech_text'] = ct_df['speech_text'].apply(clean_text)

In [138]:
ct_df.iloc[4,1]

'review hdb flat size eligibility criterion silver support scheme manpower view senior live mature estate whose hdb flat shorter remaining lease lower resale value consider reassessing use hdb flat type eligibility criterion silver support scheme especially regard ownership room larger hdb flat disqualifies senior receiving silver support silver support scheme targeted senior lower income working year little family support resource retirement property ownership indicative seniors resource remains relevant ensuring silver support scheme targeted senior particular senior room larger housing development board hdb flat likely resource access additional retirement fund hence eligible silver support nonetheless senior face unique circumstance including room larger hdb flat short remaining lease low annual value write central provident fund cpf board review eligibility silver support consider merit appeal casebycase basis strengthen support retirement adequacy announced enhancement silver sup

In [139]:
import time
from sklearn.feature_extraction.text import TfidfVectorizer

In [140]:
from gensim.corpora import Dictionary
from gensim.models import Nmf
import matplotlib.pyplot as plt

texts = ct_df['speech_text']

# Preprocess data
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(texts)

# Create a Gensim Dictionary
dictionary = Dictionary(texts.apply(str.split))

# Convert sparse TF-IDF matrix to Gensim corpus
corpus = [dictionary.doc2bow(doc.split()) for doc in texts]

# Define a number of topics
num_topics = 25

In [141]:
nmf_model = Nmf(corpus=corpus, num_topics=num_topics, id2word=dictionary, random_state=42)

In [142]:
for i in range(num_topics):
  print(f"Topic {i}")
  print(nmf_model.print_topic(i, topn=20))

Topic 0
0.026*"court" + 0.015*"law" + 0.014*"case" + 0.009*"legal" + 0.008*"section" + 0.008*"person" + 0.007*"clause" + 0.006*"justice" + 0.006*"appeal" + 0.005*"offence" + 0.005*"amendment" + 0.005*"criminal" + 0.005*"information" + 0.005*"judge" + 0.005*"power" + 0.004*"lawyer" + 0.004*"process" + 0.004*"victim" + 0.004*"interest" + 0.004*"protection"
Topic 1
0.022*"budget" + 0.017*"singaporean" + 0.014*"tax" + 0.012*"gst" + 0.010*"year" + 0.010*"price" + 0.010*"income" + 0.010*"cost" + 0.009*"increase" + 0.008*"generation" + 0.008*"reserve" + 0.007*"future" + 0.007*"land" + 0.007*"revenue" + 0.006*"billion" + 0.006*"policy" + 0.006*"people" + 0.006*"national" + 0.005*"spending" + 0.005*"value"
Topic 2
0.016*"company" + 0.010*"industry" + 0.009*"carbon" + 0.009*"business" + 0.009*"sector" + 0.009*"economy" + 0.008*"support" + 0.008*"energy" + 0.008*"green" + 0.008*"tax" + 0.008*"smes" + 0.007*"global" + 0.007*"enterprise" + 0.007*"change" + 0.006*"job" + 0.006*"growth" + 0.006*"oppo

In [143]:
from gensim.utils import simple_preprocess

def get_topic_distribution(text):

    tokens = text.split()
    bow_vector = dictionary.doc2bow(tokens)
    topic_distribution = nmf_model.get_document_topics(bow_vector)

    return topic_distribution

# Apply the function to each row of the DataFrame
ct_df['topic_distribution'] = ct_df['speech_text'].apply(get_topic_distribution)

In [144]:
ct_df

Unnamed: 0,topic_id,speech_text,topic_distribution
0,2024-03-07-T-001,point ask exercise power reinstate half hour q...,"[(1, 0.02957727311468095), (7, 0.0246200762183..."
1,2024-03-07-T-002,ensuring mental wellbeing national serviceman ...,"[(4, 0.07760100265951475), (5, 0.1339319563386..."
2,2024-03-07-T-003,capacity increase imh alexandra hospital patie...,"[(4, 0.0817768864123097), (8, 0.09125843073928..."
3,2024-03-07-T-004,selection organisation operate active ageing c...,"[(0, 0.03749121232291699), (3, 0.0692197482622..."
4,2024-03-07-T-005,review hdb flat size eligibility criterion sil...,"[(3, 0.2788692180579809), (4, 0.42784410698273..."
...,...,...,...
16083,2012-09-10-T-067,collaboration overseas university education to...,"[(0, 0.04997353487989181), (1, 0.0575973225866..."
16084,2012-09-10-T-068,allocation void deck childcare centre acting c...,"[(0, 0.011560167267903392), (3, 0.130586006640..."
16085,2012-09-10-T-069,proper accommodation foreign worker acting man...,"[(0, 0.010637917539358664), (4, 0.069189041645..."
16086,2012-09-10-T-070,impact reduction foreign worker smes acting ma...,"[(0, 0.013428427394958642), (2, 0.374353282126..."


In [145]:
ct_df.iloc[2,1]

'capacity increase imh alexandra hospital patient seeking mental health specialist care health planned capacity increase patient seeking mental health specialist care imh ii alexandra hospital b step taken support necessary resource required alexandra hospital provide outpatient inpatient mental health specialist service including acute bed psychiatric care rehabilitation institute mental health imh recently refurbished acute psychiatric ward enhance inpatient care expanding capacity meet future demand support increase demand mental health service planning increase number psychiatrist increase number healthcare worker training psychology expand mental health service primary community setting support national mental health wellbeing strategy m nadia quick followup supplementary senior sharing training place learning opportunity think student interested pursue route could senior give little bit detail training opportunity trickle learning institution understand m nadia samdins training o

In [146]:
ct_df.iloc[2,2]

[(4, 0.0817768864123097),
 (8, 0.0912584307392834),
 (10, 0.10067469825822083),
 (15, 0.3431220508236999),
 (17, 0.20573234010572727),
 (19, 0.06105573422354683),
 (22, 0.10502111119966671)]

In [147]:
def extract_topic_info(row, num_topics):
    topic_info = dict(row['topic_distribution'])
    for i in range(num_topics):
        if i in topic_info:
            row[f'topic_{i}_distribution'] = topic_info[i]
        else:
            row[f'topic_{i}_distribution'] = 0
    return row

In [148]:
def extract_highest_topic_info(topic_distribution):
    if not topic_distribution:
        return None, None
    topic_info = dict(topic_distribution)
    max_topic = max(topic_info, key=topic_info.get)
    return int(max_topic), topic_info.get(max_topic, 0)

In [149]:
ct_df = ct_df.apply(lambda row: extract_topic_info(row, num_topics), axis=1)

In [150]:
ct_df['highest_topic'], ct_df['highest_topic_distribution'] = zip(*ct_df['topic_distribution'].apply(extract_highest_topic_info))

In [151]:
ct_df.head()

Unnamed: 0,topic_id,speech_text,topic_distribution,topic_0_distribution,topic_1_distribution,topic_2_distribution,topic_3_distribution,topic_4_distribution,topic_5_distribution,topic_6_distribution,...,topic_17_distribution,topic_18_distribution,topic_19_distribution,topic_20_distribution,topic_21_distribution,topic_22_distribution,topic_23_distribution,topic_24_distribution,highest_topic,highest_topic_distribution
0,2024-03-07-T-001,point ask exercise power reinstate half hour q...,"[(1, 0.02957727311468095), (7, 0.0246200762183...",0.0,0.029577,0.0,0.0,0.0,0.0,0.0,...,0.0,0.02844,0.054018,0.0,0.045165,0.0,0.01911,0.0,14,0.4877
1,2024-03-07-T-002,ensuring mental wellbeing national serviceman ...,"[(4, 0.07760100265951475), (5, 0.1339319563386...",0.0,0.0,0.0,0.0,0.077601,0.133932,0.0,...,0.071636,0.021895,0.0,0.0,0.035379,0.0,0.0,0.0,15,0.585603
2,2024-03-07-T-003,capacity increase imh alexandra hospital patie...,"[(4, 0.0817768864123097), (8, 0.09125843073928...",0.0,0.0,0.0,0.0,0.081777,0.0,0.0,...,0.205732,0.0,0.061056,0.0,0.0,0.105021,0.0,0.0,15,0.343122
3,2024-03-07-T-004,selection organisation operate active ageing c...,"[(0, 0.03749121232291699), (3, 0.0692197482622...",0.037491,0.0,0.0,0.06922,0.059321,0.0,0.0,...,0.128156,0.0,0.0,0.111769,0.0,0.0,0.0,0.0,7,0.19966
4,2024-03-07-T-005,review hdb flat size eligibility criterion sil...,"[(3, 0.2788692180579809), (4, 0.42784410698273...",0.0,0.0,0.0,0.278869,0.427844,0.0,0.0,...,0.0,0.0,0.0,0.0,0.115276,0.0,0.0,0.0,4,0.427844


In [152]:
pivoted_distributions = ct_df[["topic_id"]+[f"topic_{i}_distribution" for i in range(num_topics)]]
unpivoted = pivoted_distributions.melt(id_vars = ['topic_id'],
                                       var_name = 'topic',
                                       value_name = 'distribution')
unpivoted['topic'] = unpivoted['topic'].str.extract('(\d+)').astype(int)
unpivoted = unpivoted[unpivoted['distribution'] != 0]
unpivoted = unpivoted.sort_values(by=['topic_id', 'topic'])
unpivoted = unpivoted.reset_index(drop=True)
unpivoted['date'] = unpivoted['topic_id'].apply(lambda x: pd.to_datetime(x[:10]))

In [153]:
unpivoted

Unnamed: 0,topic_id,topic,distribution,date
0,2012-09-10-T-001,0,0.030391,2012-09-10
1,2012-09-10-T-001,5,0.017484,2012-09-10
2,2012-09-10-T-001,6,0.043733,2012-09-10
3,2012-09-10-T-001,10,0.019100,2012-09-10
4,2012-09-10-T-001,14,0.331654,2012-09-10
...,...,...,...,...
117870,2024-03-07-T-042,0,0.035224,2024-03-07
117871,2024-03-07-T-042,12,0.898117,2024-03-07
117872,2024-03-07-T-042,13,0.016908,2024-03-07
117873,2024-03-07-T-042,15,0.019758,2024-03-07


In [154]:
dataset_id = "topic_modelling"
table_id = f"topic_distribution_{num_topics}_nmf_{date_yyyymmdd}"

unpivoted.to_gbq(destination_table=f"{dataset_id}.{table_id}",
          project_id=project_id,
          if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 773.14it/s]


In [155]:
highest_topic = ct_df[["topic_id", "highest_topic", "highest_topic_distribution"]]
highest_topic['date'] = highest_topic['topic_id'].apply(lambda x: pd.to_datetime(x[:10]))
highest_topic = highest_topic[~highest_topic['highest_topic'].isna()]
highest_topic['highest_topic'] = highest_topic['highest_topic'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highest_topic['date'] = highest_topic['topic_id'].apply(lambda x: pd.to_datetime(x[:10]))


In [156]:
highest_topic

Unnamed: 0,topic_id,highest_topic,highest_topic_distribution,date
0,2024-03-07-T-001,14,0.487700,2024-03-07
1,2024-03-07-T-002,15,0.585603,2024-03-07
2,2024-03-07-T-003,15,0.343122,2024-03-07
3,2024-03-07-T-004,7,0.199660,2024-03-07
4,2024-03-07-T-005,4,0.427844,2024-03-07
...,...,...,...,...
16083,2012-09-10-T-067,19,0.414493,2012-09-10
16084,2012-09-10-T-068,12,0.481961,2012-09-10
16085,2012-09-10-T-069,6,0.413802,2012-09-10
16086,2012-09-10-T-070,2,0.374353,2012-09-10


In [157]:
dataset_id = "topic_modelling"
table_id = f"highest_topic_{num_topics}_nmf_{date_yyyymmdd}"

highest_topic.to_gbq(destination_table=f"{dataset_id}.{table_id}",
          project_id=project_id,
          if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 2796.20it/s]


In [158]:
!pip install openai --quiet

In [159]:
from openai import OpenAI
from google.colab import userdata

client = OpenAI(
    api_key = userdata.get('OPENAI_API_KEY')
)

In [160]:
# Function to get summarization from ChatGPT
def get_summarization(topic_top_n_words):
    response = client.chat.completions.create(
      model="gpt-3.5-turbo-0125",
      messages=[
        {"role": "system", "content": "You're a political text analyst focusing on Singapore Parliament Speeches. Given top words, provide a 1-2 word summary of the topic."},
        {"role": "user", "content": f"The words are: {topic_top_n_words}. What's a summary in 1-2 words?"}
      ]
    )
    return response.choices[0].message.content

def extract_topic_words(topic_repr):
    # Split the representation by "+" to get individual terms
    terms = topic_repr.split('+')
    topic_words = []
    for term in terms:
        # Extract the word (text between quotes) using regular expression
        word = re.findall(r'"([^"]*)"', term)
        # Append the extracted word to the list of topic words
        if word:
            topic_words.append(word[0])
    return topic_words

In [177]:
topic_words_list = []

for i in range(num_topics):
    topic_repr = nmf_model.print_topic(i, topn=15)
    topic_words = extract_topic_words(topic_repr)
    topic_words_list.append(topic_words)

In [178]:
for i in range(num_topics):
  print(' '.join(topic_words_list[i]))

court law case legal section person clause justice appeal offence amendment criminal information judge power
budget singaporean tax gst year price income cost increase generation reserve future land revenue billion
company industry carbon business sector economy support energy green tax smes global enterprise change job
flat hdb housing home resident rental scheme estate building price bto resale room senior owner
support work worker senior help employer provide sector working need programme better effort caregiver scheme
security officer woman police threat defence national saf digital power attack section technology incident force
job employer worker employment company work cpf singaporean employee foreign woman retirement workforce mom workplace
transport bus road vehicle lta system driver safety taxi operator commuter car user ev food
service digital data sector technology programme agency industry chairman help company work access use medium
foreign country u singaporean world ase

In [179]:
topic_summaries = []
for topic_words in topic_words_list:
    topic_top_n_words = ' '.join(topic_words)
    summary = get_summarization(topic_top_n_words)
    topic_summaries.append(summary)

In [180]:
topic_summaries = [topic.strip('"').title() for topic in topic_summaries]

In [181]:
for i, summary in enumerate(topic_summaries):
    print(f"Topic {i}: {summary}")

Topic 0: Legal System.
Topic 1: Economic Policy.
Topic 2: Green Economy
Topic 3: Housing Policy
Topic 4: Workforce Support
Topic 5: National Security
Topic 6: Labor Market
Topic 7: Transportation Governance
Topic 8: Digital Services
Topic 9: Foreign Relations
Topic 10: Healthcare System
Topic 11: Social Support
Topic 12: Community And Youth
Topic 13: Community Development
Topic 14: Societal Perspectives
Topic 15: Mental Health
Topic 16: Police Investigation
Topic 17: Healthcare Reform
Topic 18: Crime Prevention
Topic 19: Education Policy
Topic 20: Financial Regulation
Topic 21: Economic Policy
Topic 22: Labor Market
Topic 23: Family Law
Topic 24: Family Support


In [182]:
topic_names = pd.DataFrame({
    'topic': [i for i in range(num_topics)],
    'topic_summary': topic_summaries,
    'top_n_words': [' '.join(topic_words) for topic_words in topic_words_list]
})

In [183]:
dataset_id = "topic_modelling"
table_id = f"topic_names_{num_topics}_nmf_{date_yyyymmdd}"

topic_names.to_gbq(destination_table=f"{dataset_id}.{table_id}",
          project_id=project_id,
          if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 1560.96it/s]


In [184]:
# Save NMF Model

from google.colab import drive
import os
drive.mount('/content/drive')

# Define the directory path
directory = "/content/drive/My Drive/singapore-parliament-speeches/"

# Create the directory if it doesn't exist
if not os.path.exists(directory):
    os.makedirs(directory)

nmf_model.save(f"{directory}model_{num_topics}_nmf_{date_yyyymmdd}")


# Load the NMF model from the 'singapore-parliament-speeches' folder in Google Drive
# nmf_model_path = "f/content/drive/My Drive/singapore-parliament-speeches/nmf_model_{num_topics}_topics"
# loaded_nmf_model = Nmf.load(nmf_model_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
