<a href="https://colab.research.google.com/github/mswae/first-mayoral-hackathon/blob/main/peopulse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Peopulse**: Silence-Aware Citizen Feedback System

## Import Libraries

In [1]:
!pip install -q transformers datasets evaluate

import re #regular expression; for text manipulation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import evaluate #for evaluation metrics

from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


## Load Dataset (to revise)

In [None]:
# Colab-only step: mount Google Drive so files under /content/drive become readable.
from google.colab import drive
drive.mount("/content/drive")

# Read the dataset CSV into `df`; the label-encoding / train-test split cell below works on this frame.
ds_name = "/content/drive/MyDrive/Datasets/peopulse-dataset-2.csv"
df = pd.read_csv(ds_name)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Model from Hugging Face (to revise)

In [None]:
#loads RoBERTa-Tagalog
model_name = "jcblaise/roberta-tagalog-base"

# Number of output classes for the classification head. It must equal the number of
# distinct label values in the training data; the recorded stratified split in this
# notebook lists nine classes (ids 0-8).
NUM_LABELS = 9

tokenizer = AutoTokenizer.from_pretrained(model_name)

# AutoModel loads only the bare encoder (no classification head), so it produces neither
# `logits` nor a training loss. The sequence-classification class adds the head that the
# Trainer and the `outputs.logits` lookups in this notebook rely on.
model_rtb = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Helper Functions (to revise)

In [None]:
# --- preprocessing helper functions ---

def clean_data(text):
  """Normalize one raw text string before tokenization.

  Steps, in order: lower-case; drop URLs; drop @mentions; drop u/ mentions;
  strip the '#' from hashtags while keeping the word; replace every character
  that is not a word character, whitespace, or basic punctuation with a space;
  collapse runs of whitespace and trim both ends.

  Args:
    text: the string to clean. Must be a `str` (the function calls `.lower()`).

  Returns:
    The cleaned string (possibly empty).
  """

  text = text.lower()
  text = re.sub(r"http\S+|www\S+", "", text)  #remove URLs
  text = re.sub(r"@\w+", "", text)  #remove @mentions
  # \b anchors the match at the start of a word, so only a standalone "u/name"
  # is removed; without it, "you/them" lost its tail and became "yo".
  text = re.sub(r"\bu/\w+", "", text)  #remove u/mentions
  text = re.sub(r"#(\w+)", r"\1", text)  #remove hashtag symbols but keep the word
  text = re.sub(r"[^\w\s!?.,'’“”-]", " ", text)  #keep only basic punctuation
  text = re.sub(r"\s+", " ", text).strip()

  return text

def tokenize_fn(batch, tokenizer, text_col="text", label_col="label", max_length=128):
  """Tokenize a batch of examples and attach their labels.

  Args:
    batch: mapping of column name -> list of values; must contain `text_col`
      and `label_col`.
    tokenizer: callable invoked as
      `tokenizer(texts, truncation=True, padding="max_length", max_length=...)`;
      its return value must support item assignment.
    text_col: name of the column holding the texts.
    label_col: name of the column holding the labels.
    max_length: length every sequence is truncated/padded to (default 128,
      the value that used to be hard-coded).

  Returns:
    The tokenizer output with an extra "labels" entry copied from
    `batch[label_col]`.
  """

  tokens = tokenizer(
      batch[text_col],
      truncation=True,
      padding="max_length",
      max_length=max_length,
  )
  tokens["labels"] = batch[label_col]

  return tokens

# --- evaluation helper function ---

# Load the three `evaluate` metrics once at module level (same order as before:
# f1, precision, recall) so compute_metrics can reuse them on every evaluation.
f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_id) for metric_id in ("f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Score one evaluation pass with weighted F1, precision and recall.

    Args:
        eval_pred: a pair `(logits, labels)`. The logits are arg-maxed over
            their last axis to obtain the predicted class ids.

    Returns:
        dict with the keys "f1", "precision" and "recall", each computed with
        `average="weighted"` by the module-level metric objects.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    # All three metrics receive the same predictions, references and averaging mode.
    shared_kwargs = {
        "predictions": predicted_ids,
        "references": labels,
        "average": "weighted",
    }

    return {
        "f1": f1_metric.compute(**shared_kwargs)["f1"],
        "precision": precision_metric.compute(**shared_kwargs)["precision"],
        "recall": recall_metric.compute(**shared_kwargs)["recall"],
    }

## Stratify Data for Training and Testing (80-20) (to revise)

In [None]:
label_col = "label" #replace with the actual name of your label column

# Encode the raw label values as integer class ids.
# NOTE: this overwrites df[label_col] in place, so re-running the cell re-fits the
# encoder on the already-encoded ids and `le.classes_` then holds integers rather
# than the original label values. Reload `df` before re-running.
le = LabelEncoder()
df[label_col] = le.fit_transform(df[label_col])

X = df.drop(columns=[label_col])
y = df[label_col]

#perform an 80-20 stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# These print per-class proportions (value_counts(normalize=True)), which should be
# near-identical across the three sets when stratification worked.
print(f"Original dataset shape: {y.value_counts(normalize=True)}")
print(f"Train set shape: {y_train.value_counts(normalize=True)}")
print(f"Test set shape: {y_test.value_counts(normalize=True)}")

# The label column is named after `label_col` so that it is the same column
# tokenize_fn is told to read below. Previously the frames used "category" while
# tokenize_fn looked up its default "label", which raised a KeyError during .map().
train_df = pd.DataFrame({
    "text": X_train["text"].apply(clean_data),
    label_col: y_train
})

test_df = pd.DataFrame({
    "text": X_test["text"].apply(clean_data),
    label_col: y_test
})

#convert to HF dataset
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

#tokenization
train_dataset = train_ds.map(
    lambda batch: tokenize_fn(batch, tokenizer, label_col=label_col),
    batched=True,
    remove_columns=train_ds.column_names #removes columns after tokenization
)

test_dataset = test_ds.map(
    lambda batch: tokenize_fn(batch, tokenizer, label_col=label_col),
    batched=True,
    remove_columns=test_ds.column_names #removes columns after tokenization
)

Original dataset shape: category
8    0.111111
2    0.111111
7    0.111111
4    0.111111
5    0.111111
1    0.111111
0    0.111111
3    0.111111
6    0.111111
Name: proportion, dtype: float64
Train set shape: category
8    0.111111
4    0.111111
3    0.111111
5    0.111111
7    0.111111
1    0.111111
2    0.111111
6    0.111111
0    0.111111
Name: proportion, dtype: float64
Test set shape: category
8    0.111111
7    0.111111
6    0.111111
5    0.111111
3    0.111111
1    0.111111
4    0.111111
0    0.111111
2    0.111111
Name: proportion, dtype: float64


Map:   0%|          | 0/2880 [00:00<?, ? examples/s]

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

## Model Training

In [None]:
# args = TrainingArguments(
#     output_dir=f"./results/roberta-tagalog-peopulse-v2",
#     num_train_epochs=1,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     eval_strategy="steps",
#     eval_steps=50,
#     save_strategy="steps",
#     save_steps=50,
#     logging_steps=50,
#     logging_dir=f"./logs/roberta-tagalog-peopulse-v2",
#     load_best_model_at_end=True,
#     learning_rate=2e-5, #default
#     report_to="none"
# )

# trainer = Trainer(
#     model=model_rtb,
#     args=args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
# )

# trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss,F1,Precision,Recall
50,1.8983,0.972418,0.703109,0.778315,0.719444
100,0.5459,0.108364,0.976247,0.976536,0.976389
150,0.1,0.015107,0.998611,0.998628,0.998611




KeyboardInterrupt: 

In [None]:
# Displays the checkpoint path recorded on the trainer state.
# NOTE: `trainer` is only created by the training cell above, which is currently
# commented out, so on a fresh kernel this line raises NameError until that cell
# is re-enabled and run.
trainer.state.best_model_checkpoint

'./results/RoBERTa-Tagalog/checkpoint-150'

## Save Model

In [None]:
# save_path = f"./saved_models/RoBERTa-Tagalog"

# trainer.save_model(save_path)  #saves model weights, config, tokenizer, etc.
# tokenizer.save_pretrained(save_path)

# print(f"Model saved to {save_path}")

Model saved to ./saved_models/RoBERTa-Tagalog


In [None]:
# # pushing the model to HF
# from huggingface_hub import login, HfApi
# login()

# HF_USERNAME = "sdbrgo"
# # SETTING = "POC" #VAL/TEST/POC
# MODEL = "RoBERTa-Tagalog"

# api = HfApi()

# api.upload_folder(
#     folder_path=f"./saved_models/{MODEL}",
#     repo_id=f"{HF_USERNAME}/roberta-tagalog-sentiment-classifier",
#     repo_type="model"
# )

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/sdbrgo/roberta-tagalog-public-concern-classifier/commit/4b5fa981fe0dadb3df0dc806c22029e5f60bff05', commit_message='Upload folder using huggingface_hub', commit_description='', oid='4b5fa981fe0dadb3df0dc806c22029e5f60bff05', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sdbrgo/roberta-tagalog-public-concern-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='sdbrgo/roberta-tagalog-public-concern-classifier'), pr_revision=None, pr_num=None)

# Peopulse Ingestion Pipeline

### Define Helper Functions (for Peopulse v1)

In [2]:
# ===== ingestion.py =====
# --- load dataset ---
def load_raw_data(file):
  """Read the raw CSV and keep only rows with both a text and a valid timestamp.

  Args:
    file: anything `pd.read_csv` accepts (path or file-like object). The CSV
      must have "text" and "timestamp" columns.

  Returns:
    DataFrame whose "timestamp" column has been parsed with `pd.to_datetime`
    (unparseable values become NaT) and from which rows with a missing "text"
    or "timestamp" have been dropped. The original row index is preserved.
  """
  required_cols = ["text", "timestamp"]
  return (
      pd.read_csv(file)
      .assign(timestamp=lambda raw: pd.to_datetime(raw["timestamp"], errors="coerce"))
      .dropna(subset=required_cols)
  )

# --- window selector ---
def filter_current_window(df, lookback_days=30):
  now = pd.Timestamp.now(tz="UTC")
  start = now - pd.Timedelta(days=lookback_days)

  window_df = df[
      (df["timestamp"] >= start) &
      (df["timestamp"] < now)
  ].copy()

  return window_df

# --- cleans "text" column only ---
def clean_text_column(df, text_col="text"):
  """Return a copy of `df` with `text_col` run through `clean_data`.

  Every value is first converted with `astype(str)`, then cleaned. The input
  frame is left untouched.

  Args:
    df: source DataFrame.
    text_col: name of the column to clean.

  Returns:
    A new DataFrame with the cleaned text column.
  """
  cleaned = df.copy()
  as_strings = cleaned[text_col].astype(str)
  cleaned[text_col] = as_strings.apply(clean_data)
  return cleaned

# --- fills out rows with missing category ---
def classify_missing_categories(df, model, tokenizer, text_col="text", cat_col="category",
                                batch_size=16, max_length=128):
  """Fill rows that have no category with the model's predicted class id.

  A row counts as missing when `cat_col` is NaN/None or the empty string.
  Predictions are the arg-max index over `outputs.logits` and are written into
  `cat_col` as-is (integer ids, not label names).

  Args:
    df: DataFrame with `text_col` and `cat_col` columns; not modified.
    model: callable model exposing `.eval()` whose output has a `.logits` tensor.
    tokenizer: tokenizer called with `return_tensors="pt"`.
    text_col: name of the text column.
    cat_col: name of the category column.
    batch_size: number of texts sent through the model per forward pass.
    max_length: truncation length passed to the tokenizer.

  Returns:
    A copy of `df` with missing categories filled and a "category_source"
    column set to "predicted" for filled rows and "original" for the rest.
    The column is present even when nothing had to be filled.
  """
  df = df.copy()

  mask = df[cat_col].isna() | (df[cat_col] == "")

  # Set provenance up front so the output schema does not depend on the data:
  # the early return below used to skip creating this column.
  df["category_source"] = "original"
  if not mask.any():
      return df  # nothing to do

  texts = df.loc[mask, text_col].tolist()

  # Run inference in batches so a large window is not tokenized and pushed
  # through the model in a single forward pass.
  # NOTE(review): the inputs are not moved to any device here, so this assumes
  # the model lives on the same device the tokenizer output is created on.
  model.eval()
  batch_preds = []
  for start in range(0, len(texts), batch_size):
      inputs = tokenizer(
          texts[start:start + batch_size],
          truncation=True,
          padding=True,
          max_length=max_length,
          return_tensors="pt"
      )
      with torch.no_grad():
          outputs = model(**inputs)
      batch_preds.append(outputs.logits.argmax(dim=-1).cpu().numpy())
  preds = np.concatenate(batch_preds)

  df.loc[mask, cat_col] = preds
  df.loc[mask, "category_source"] = "predicted"

  return df

# ===== embedding.py =====
# NOTE: this definition must sit at module level. It used to be indented under
# classify_missing_categories (after its `return`), so it was never defined and
# calling embed_texts raised NameError.
def embed_texts(
    df,
    model,
    tokenizer,
    text_col="text",
    batch_size=16,
    max_length=128,
    pooling="mean"  # "cls" or "mean"
):
    """Embed every text in `df[text_col]` with the model's last hidden layer.

    Args:
        df: DataFrame holding the texts.
        model: callable model exposing `.eval()`; it is called with
            `output_hidden_states=True` and the last entry of
            `outputs.hidden_states` is pooled.
        tokenizer: tokenizer called with `return_tensors="pt"`; its output must
            contain "attention_mask" when mean pooling is used.
        text_col: name of the text column.
        batch_size: number of texts per forward pass.
        max_length: truncation length passed to the tokenizer.
        pooling: "cls" takes the vector at position 0; any other value uses
            attention-mask-weighted mean pooling.

    Returns:
        2-D numpy array with one row per text, in input order. For an empty
        frame the result has zero rows (and `model.config.hidden_size` columns
        when that attribute exists, otherwise zero columns).
    """
    model.eval()
    all_embeddings = []

    texts = df[text_col].tolist()

    # np.vstack raises on an empty list, so handle "no rows in the window" explicitly.
    if not texts:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        inputs = tokenizer(
            batch_texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt"
        )

        with torch.no_grad():
            outputs = model(
                **inputs,
                output_hidden_states=True,
                return_dict=True
            )

        last_hidden = outputs.hidden_states[-1]

        if pooling == "cls":
            embeddings = last_hidden[:, 0, :]
        else:
            # mean pooling with attention mask: padded positions contribute zero
            mask = inputs["attention_mask"].unsqueeze(-1)
            summed = (last_hidden * mask).sum(dim=1)
            # clamp guards against division by zero for a row with no attended tokens
            counts = mask.sum(dim=1).clamp(min=1)
            embeddings = summed / counts

        all_embeddings.append(embeddings.cpu().numpy())

    return np.vstack(all_embeddings)

## Main

In [None]:
LOOKBACK_DAYS = 30
TEXT_COL = "text" # match actual feature
PATH = "/content/drive/MyDrive/Datasets/peopulse_dataset_timestamped.csv"

# Hugging Face repo id of the fine-tuned classifier.
peopulse_model = "sdbrgo/roberta-tagalog-public-concern-classifier"
tokenizer = AutoTokenizer.from_pretrained(peopulse_model)

# The helpers below call model.eval() and model(**inputs), so they need the loaded
# model object. Passing the repo id string (as before) fails on the first call.
peopulse_classifier = AutoModelForSequenceClassification.from_pretrained(peopulse_model)

# 1) load rows that have both text and a valid timestamp
df = load_raw_data(PATH)

# 2) keep only the most recent LOOKBACK_DAYS days
window_df = filter_current_window(df, LOOKBACK_DAYS)

# 3) normalize the text column
window_df = clean_text_column(window_df, text_col=TEXT_COL)

# 4) predict a category for rows that have none
window_df = classify_missing_categories(
    window_df,
    model=peopulse_classifier,
    tokenizer=tokenizer,
    text_col=TEXT_COL
)

# 5) embed every text in the window (mean pooling is the helper's default)
embeddings = embed_texts(
    window_df,
    model=peopulse_classifier,
    tokenizer=tokenizer,
    text_col=TEXT_COL
    # pooling="mean"
)