<a href="https://colab.research.google.com/github/mswae/first-mayoral-hackathon/blob/main/peopulse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Peopulse**: Public Concern Early Warning System and Decision Support

## Import Libraries

In [None]:
!pip install -q transformers datasets evaluate

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re #regular expression; for text manipulation
import evaluate #for evaluation metrics

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

from sklearn.model_selection import train_test_split
import hdbscan

## Load Dataset

In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive.
from google.colab import drive

drive.mount('/content/drive')

# NOTE(review): the file name reads "populse" while the project is titled
# "Peopulse" — confirm it matches the actual file name on Drive.
ds_name = "/content/drive/MyDrive/Datasets/populse_dataset.csv"

# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=ds_name)

## Load Model from Hugging Face

In [None]:
# Hugging Face Hub checkpoint id for RoBERTa-Tagalog (base size).
model_name = "jcblaise/roberta-tagalog-base"

# NOTE(review): no `num_labels` is passed here, so the classification head
# takes whatever size the checkpoint's config specifies — confirm that this
# matches the number of distinct values in the label column before training.
model_rtb = AutoModelForSequenceClassification.from_pretrained(model_name)

# Tokenizer paired with the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Define Helper Functions

In [None]:
# --- preprocessing helper functions ---

def clean_data(text):
    """Normalize one raw post/comment string before tokenization.

    Steps, in order: lowercase; drop URLs, ``@mentions`` and ``u/mentions``;
    strip the ``#`` from hashtags while keeping the word; replace every
    character that is not a word character, whitespace or basic punctuation
    with a space; collapse whitespace runs and trim both ends.

    Args:
        text: The raw text. Any non-``str`` value (e.g. ``None`` or a NaN
            from a missing cell) is treated as empty instead of raising.

    Returns:
        The cleaned, lowercased string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)  # remove @mentions
    # \b requires the "u" to start a word, so ordinary words that end in "u"
    # right before a slash (e.g. "menu/options") are not treated as mentions.
    text = re.sub(r"\bu/\w+", "", text)  # remove u/mentions
    text = re.sub(r"#(\w+)", r"\1", text)  # remove hashtag symbols but keep the word
    text = re.sub(r"[^\w\s!?.,'’“”-]", " ", text)  # keep only basic punctuation
    text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace, trim ends

    return text

def tokenize_fn(batch, tokenizer, text_col="text", label_col="category", max_length=128):
    """Tokenize a batch of texts and attach its labels under ``"labels"``.

    Args:
        batch: Mapping of column name to a list of values; it is indexed with
            ``text_col`` and ``label_col``.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding="max_length", max_length=max_length)``. Its return value
            must support item assignment.
        text_col: Key in ``batch`` that holds the input texts.
        label_col: Key in ``batch`` that holds the labels.
        max_length: Length every sequence is truncated/padded to. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        The tokenizer's output with ``batch[label_col]`` stored under the
        ``"labels"`` key.
    """
    tokens = tokenizer(
        batch[text_col],
        truncation=True,
        padding="max_length",  # pad every sequence to exactly max_length
        max_length=max_length,
    )
    tokens["labels"] = batch[label_col]

    return tokens

# --- evaluation helper function ---

# Metric objects from the `evaluate` library, loaded once at module level;
# compute_metrics reads them as globals.
f1_metric, precision_metric, recall_metric = [
    evaluate.load(metric_id) for metric_id in ("f1", "precision", "recall")
]

def compute_metrics(eval_pred):
    """Score predictions with weighted-average F1, precision and recall.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            row is the argmax of ``logits`` over its last axis.

    Returns:
        A dict with the keys ``"f1"``, ``"precision"`` and ``"recall"``, each
        computed with ``average="weighted"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    # Each metric's result dict is keyed by the metric's own name, so the same
    # string selects the score and names it in the output.
    scorers = {
        "f1": f1_metric,
        "precision": precision_metric,
        "recall": recall_metric,
    }

    return {
        name: scorer.compute(
            predictions=predicted, references=labels, average="weighted"
        )[name]
        for name, scorer in scorers.items()
    }

## Stratify Data for Training and Testing (80-20)

In [None]:
label_col = 'category'  # name of the label column in df

# Features are every column except the label; the label is the target.
X = df.drop(columns=[label_col])
y = df[label_col]

# 80-20 split. stratify=y asks for the class proportions of y to be preserved
# in both splits; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# value_counts(normalize=True) reports per-class proportions (not shapes), so
# the three printouts can be compared to check the stratification.
print(f"Original class distribution:\n{y.value_counts(normalize=True)}")
print(f"Train class distribution:\n{y_train.value_counts(normalize=True)}")
print(f"Test class distribution:\n{y_test.value_counts(normalize=True)}")

## **Model Training**