## **Imports**

In [1]:
import pandas as pd
import random
from datasets import Dataset, DatasetDict, load_dataset
from typing import List
import re
import string

## **Load Dataset and rename columns**

In [2]:
BASE_DATA_PATH = "../../data/nlp/Sentiment-analysis-for-mental-health.csv"
AUGMENTED_ANXIETY_CLASS_DATA_PATH = "../../data/nlp/anxiety_class_augmented.csv"
AUGMENTED_TAGLISH_ANXIETY_CLASS_DATA_PATH = "../../data/nlp/class_sampled_taglish_augmented.csv"
DATA_OUPUT_PATH = "../../data/nlp/class_sampled.csv"
CLEAN_DATASET = "../../data/nlp/clean-sentiment-analysis-for-mental-health.csv"
CLEAN_SAMPLED_ANXIETY_DATASET = "../../data/nlp/clean-anxiety_class_augmented.csv"

In [3]:

# Load two datasets
dataset = load_dataset("csv", data_files=BASE_DATA_PATH)
dataset_2 = load_dataset("csv", data_files = AUGMENTED_ANXIETY_CLASS_DATA_PATH)

# Define functions to rename datasets
def rename_data_columns(data: Dataset, column_names_to_change: List[str], new_column_names: List[str]) -> Dataset:
    if len(column_names_to_change) != len(new_column_names):
        raise ValueError("Both lists must have the same length.")
    
    dataset = data
    for old_name, new_name in zip(column_names_to_change, new_column_names):
        dataset = dataset.rename_column(old_name, new_name)

    print(dataset)
    
    return dataset

# Remove unwanted column
dataset = dataset.remove_columns(["Unnamed: 0"])

# Rename
print("\nDataset: tweet_emotions")

dataset = rename_data_columns(
    data=dataset,
    column_names_to_change=['status', 'statement'],
    new_column_names=['labels', 'text']
  )



Dataset: tweet_emotions
DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 53043
    })
})


## **Clean text and remove special characters**

In [4]:
def clean_text(example):
    text = example.get("text", "")

    if text is None or not isinstance(text, str) or text.strip() == "":
        return {"text": ""}   # keep it empty; will be dropped later

    # Lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove newlines and other non-word characters except spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return {"text": text}

dataset = dataset.map(clean_text)
dataset_2 = dataset_2.map(clean_text)

dataset["train"].to_pandas().head()
dataset_2["train"].to_pandas().head()

Unnamed: 0,text,labels
0,i made huge embarrassing baka mistake at work ...,Anxiety
1,i had naman an syempre worries attack during a...,Anxiety
2,do i have a personality i dont feel like i hav...,Anxiety
3,possible sobra health anxiety parang maybe ser...,Anxiety
4,venting a little baka info new to baka the com...,Anxiety


## **Check if there are any null values**

In [5]:
def check_for_null_blank(dataset, dataset_name="Dataset"):
    import pandas as pd

    # If it's a DatasetDict, loop over splits
    if hasattr(dataset, "keys"):  # DatasetDict
        for split in dataset.keys():
            print(f"Checking split: {split} in {dataset_name}\n")
            df = dataset[split].to_pandas()
            
            # Nulls
            print("Nulls in each column:")
            print(df[["text", "labels"]].isnull().sum())
            print("\n")
            
            # Rows with nulls
            print("Rows where text or labels are null/blank:")
            print(df[df["text"].isnull() | df["labels"].isnull()])
            print("\n")
            
            # Blank strings
            print("Check for blank and empty string values:")
            print("Blank texts:", (df["text"].str.strip() == "").sum())
            print("Blank labels:", (df["labels"].astype(str).str.strip() == "").sum())
            print("\n")
    else:  # Single Dataset
        df = dataset.to_pandas()
        print(f"Checking {dataset_name}\n")
        print("Nulls in each column:")
        print(df[["text", "labels"]].isnull().sum())
        print("\n")
        print("Rows where text or labels are null/blank:")
        print(df[df["text"].isnull() | df["labels"].isnull()])
        print("\n")
        print("Check for blank and empty string values:")
        print("Blank texts:", (df["text"].str.strip() == "").sum())
        print("Blank labels:", (df["labels"].astype(str).str.strip() == "").sum())
        print("\n")

check_for_null_blank(dataset=dataset, dataset_name="sentiment analysis for mental health")
check_for_null_blank(dataset=dataset_2, dataset_name="anxiety class sampled")

Checking split: train in sentiment analysis for mental health

Nulls in each column:
text      0
labels    0
dtype: int64


Rows where text or labels are null/blank:
Empty DataFrame
Columns: [text, labels]
Index: []


Check for blank and empty string values:
Blank texts: 369
Blank labels: 0


Checking split: train in anxiety class sampled

Nulls in each column:
text      0
labels    0
dtype: int64


Rows where text or labels are null/blank:
Empty DataFrame
Columns: [text, labels]
Index: []


Check for blank and empty string values:
Blank texts: 0
Blank labels: 0




## **Drop null values**

In [6]:
def drop_nulls_from_dataset(dataset, columns=["text", "labels"]):
    """
    Drops null and blank values from a Dataset or DatasetDict.
    Works for both cases.
    """
    if isinstance(dataset, DatasetDict):  # Case 1: DatasetDict with train/test
        cleaned_splits = {}
        for split_name, ds_split in dataset.items():
            df = ds_split.to_pandas()
            df = df.dropna(subset=columns)
            if "text" in columns:
                df = df[df["text"].str.strip() != ""]
            cleaned_splits[split_name] = Dataset.from_pandas(df)
            print(f"✅ Cleaned {split_name}: {len(ds_split)} → {len(df)} rows")
        return DatasetDict(cleaned_splits)

    elif isinstance(dataset, Dataset):  # Case 2: Single Dataset
        df = dataset.to_pandas()
        df = df.dropna(subset=columns)
        if "text" in columns:
            df = df[df["text"].str.strip() != ""]
        cleaned_dataset = Dataset.from_pandas(df)
        print(f"✅ Cleaned dataset: {len(dataset)} → {len(df)} rows")
        return cleaned_dataset

    else:
        raise TypeError("Expected Dataset or DatasetDict")

dataset = drop_nulls_from_dataset(dataset=dataset)
dataset_2 = drop_nulls_from_dataset(dataset=dataset_2)

# Check counts again
check_for_null_blank(dataset);
check_for_null_blank(dataset_2);

✅ Cleaned train: 53043 → 52674 rows
✅ Cleaned train: 2000 → 2000 rows
Checking split: train in Dataset

Nulls in each column:
text      0
labels    0
dtype: int64


Rows where text or labels are null/blank:
Empty DataFrame
Columns: [text, labels, __index_level_0__]
Index: []


Check for blank and empty string values:
Blank texts: 0
Blank labels: 0


Checking split: train in Dataset

Nulls in each column:
text      0
labels    0
dtype: int64


Rows where text or labels are null/blank:
Empty DataFrame
Columns: [text, labels]
Index: []


Check for blank and empty string values:
Blank texts: 0
Blank labels: 0




## **Check if there are duplicate rows**

In [7]:
def check_for_duplicates(dataset, dataset_name="Dataset", text_column="text", label_column="labels"):
    """
    Checks for duplicate rows in a Hugging Face Dataset or DatasetDict.
    Detects duplicates based on text only, or text+label if specified.
    """

    if isinstance(dataset, DatasetDict):  # Case 1: DatasetDict
        for split_name, ds_split in dataset.items():
            df = ds_split.to_pandas()
            print(f"🔍 Checking duplicates in split: {split_name} ({dataset_name})\n")
            
            # Duplicates by text only
            dup_text = df[df.duplicated(subset=[text_column], keep=False)]
            print(f"Duplicates by `{text_column}` only: {len(dup_text)} rows")
            if not dup_text.empty:
                print(dup_text.head())
            print("\n")
            
            # Duplicates by text + label (exact row duplicates)
            dup_both = df[df.duplicated(subset=[text_column, label_column], keep=False)]
            print(f"Duplicates by `{text_column}` + `{label_column}`: {len(dup_both)} rows")
            if not dup_both.empty:
                print(dup_both.head())
            print("\n")

    elif isinstance(dataset, Dataset):  # Case 2: Single Dataset
        df = dataset.to_pandas()
        print(f"🔍 Checking duplicates in {dataset_name}\n")
        
        # Duplicates by text only
        dup_text = df[df.duplicated(subset=[text_column], keep=False)]
        print(f"Duplicates by `{text_column}` only: {len(dup_text)} rows")
        if not dup_text.empty:
            print(dup_text.head())
        print("\n")
        
        # Duplicates by text + label (exact row duplicates)
        dup_both = df[df.duplicated(subset=[text_column, label_column], keep=False)]
        print(f"Duplicates by `{text_column}` + `{label_column}`: {len(dup_both)} rows")
        if not dup_both.empty:
            print(dup_both.head())
        print("\n")
    else:
        raise TypeError("Expected Dataset or DatasetDict")
    
check_for_duplicates(dataset, dataset_name="Sentiment analysis for mental health")
check_for_duplicates(dataset_2, dataset_name="Anxiety class sampled")


🔍 Checking duplicates in split: train (Sentiment analysis for mental health)

Duplicates by `text` only: 3205 rows
                                                 text   labels  \
18  no regrets or grudgesangry at things that have...  Anxiety   
39  but my heart is still restless even though my ...  Anxiety   
53                              restless and restless  Anxiety   
56                                   why am i nervous  Anxiety   
78                              restless and agitated  Anxiety   

    __index_level_0__  
18                 18  
39                 39  
53                 53  
56                 56  
78                 78  


Duplicates by `text` + `labels`: 3167 rows
                                                 text   labels  \
18  no regrets or grudgesangry at things that have...  Anxiety   
39  but my heart is still restless even though my ...  Anxiety   
53                              restless and restless  Anxiety   
56                                 

## **Drop Duplicated Rows**

In [8]:
def drop_duplicates_from_dataset(dataset, text_column="text", by_text_only=True):
    """
    Drops duplicate rows from a Hugging Face Dataset or DatasetDict.

    Args:
        dataset: Dataset or DatasetDict
        text_column: column to check for duplicates
        by_text_only: 
            - True  = drop rows where the same text appears more than once (remove ALL copies)
            - False = drop rows where the same text+label pair appears more than once (remove ALL copies)
    
    Returns:
        Cleaned Dataset or DatasetDict
    """
    from datasets import Dataset, DatasetDict

    def _clean(df):
        # Decide grouping
        subset_cols = [text_column] if by_text_only else [text_column, "labels"]

        # Find duplicates → mark all occurrences
        dup_mask = df.duplicated(subset=subset_cols, keep=False)

        # Keep only unique rows
        df_cleaned = df[~dup_mask].reset_index(drop=True)

        removed = len(df) - len(df_cleaned)
        return df_cleaned, removed

    if isinstance(dataset, DatasetDict):  # Case 1: DatasetDict
        cleaned_splits = {}
        for split_name, ds_split in dataset.items():
            df = ds_split.to_pandas()
            df_cleaned, removed = _clean(df)
            print(f"✅ Cleaned {split_name}: {len(df)} → {len(df_cleaned)} rows (removed {removed})")
            cleaned_splits[split_name] = Dataset.from_pandas(df_cleaned, preserve_index=False)
        return DatasetDict(cleaned_splits)

    elif isinstance(dataset, Dataset):  # Case 2: Single Dataset
        df = dataset.to_pandas()
        df_cleaned, removed = _clean(df)
        print(f"✅ Cleaned dataset: {len(df)} → {len(df_cleaned)} rows (removed {removed})")
        return Dataset.from_pandas(df_cleaned, preserve_index=False)

    else:
        raise TypeError("Expected Dataset or DatasetDict")

# Remove ALL rows where text is duplicated (even with different labels)
dataset = drop_duplicates_from_dataset(dataset, by_text_only=True)
dataset_2 = drop_duplicates_from_dataset(dataset_2, by_text_only=True)

# Check again
check_for_duplicates(dataset)
check_for_duplicates(dataset_2)

✅ Cleaned train: 52674 → 49469 rows (removed 3205)
✅ Cleaned train: 2000 → 1959 rows (removed 41)
🔍 Checking duplicates in split: train (Dataset)

Duplicates by `text` only: 0 rows


Duplicates by `text` + `labels`: 0 rows


🔍 Checking duplicates in split: train (Dataset)

Duplicates by `text` only: 0 rows


Duplicates by `text` + `labels`: 0 rows




In [37]:
df = dataset["train"].to_pandas()

# Drop the unwanted index column if it exists
if "__index_level_0__" in df.columns:
    df = df.drop(columns=["__index_level_0__"])

# Save clean CSV
df.to_csv(CLEAN_DATASET, index=False, encoding="utf-8-sig")

df_2 = dataset_2["train"].to_pandas()

# Drop the unwanted index column if it exists
if "__index_level_0__" in df.columns:
    df_2 = df_2.drop(columns=["__index_level_0__"])

# Save clean sampled anxiety class csv
df_2.to_csv(CLEAN_SAMPLED_ANXIETY_DATASET, index=False, encoding="utf-8-sig")


In [4]:
# df_base_cleaned = pd.read_csv(CLEAN_DATASET, encoding="utf-8-sig")
# df_anxiety_augmented = pd.read_csv(CLEAN_SAMPLED_ANXIETY_DATASET, encoding="utf-8-sig")
# df_anxiety_augmented_taglish = pd.read_csv(AUGMENTED_TAGLISH_ANXIETY_CLASS_DATA_PATH, encoding="utf-8-sig")
# df_stress_aug = pd.read_csv('../../data/nlp/stress_class_sampled_augmented.csv', encoding="utf-8-sig")
# df_stress_aug_v2 = pd.read_csv('../../data/nlp/stress_class_sampled_augmented_v2.csv', encoding="utf-8-sig")

df_past_combined = pd.read_csv('../../data/nlp/combined_cleaned_dataset_v2.csv', encoding="utf-8-sig")
augmented_suicidal = pd.read_csv('../../data/nlp/augmented_suicidal_depression_confused_cases_paraphrased.csv', encoding="utf-8-sig")

# Drop unwanted columns
augmented_suicidal = augmented_suicidal.drop(columns=['text', 'pred'], errors='ignore')

# Rename columns
augmented_suicidal = augmented_suicidal.rename(columns={
    'true': 'labels',
    'paraphrase': 'text'
})

# Combine all
df_combined = pd.concat([df_past_combined, augmented_suicidal], ignore_index=True)

# Save back
df_combined.to_csv("combined_cleaned_dataset.csv", index=False, encoding="utf-8-sig")

In [5]:
print(augmented_suicidal.head())

       labels                                               text
0    Suicidal  I really want to die and I might carry it out ...
1  Depression  I'm overwhelmed with sadness and can't seem to...
2  Depression  I've been feeling numb and utterly hopeless ab...
3  Depression  Lately i've been feeling very down and empty, ...
4  Depression  Lately i've been feeling very down and empty, ...


In [9]:
def sample_class_to_csv(dataset, label_name, num_rows, output_path, label_column="labels", seed=42):
    """
    Samples rows from a given class (by label name, e.g. "Anxiety")
    and saves them to CSV.

    Args:
        dataset (Dataset or DatasetDict): Hugging Face dataset.
        label_name (str): The target class to filter, e.g. "Anxiety".
        num_rows (int): Number of rows to sample.
        output_path (str): Path to save CSV.
        label_column (str): Column name for labels (should contain string labels).
        seed (int): Random seed for reproducibility.
    """
    # Convert dataset to single DataFrame
    if isinstance(dataset, DatasetDict):
        if "train" not in dataset:
            raise ValueError("DatasetDict must contain a 'train' split or specify which split to use.")
        df = dataset["train"].to_pandas()
    elif isinstance(dataset, Dataset):
        df = dataset.to_pandas()
    else:
        raise TypeError("Expected Dataset or DatasetDict")

    # Filter rows for the specific class
    class_rows = df[df[label_column] == label_name]

    if class_rows.empty:
        raise ValueError(f"No rows found for label '{label_name}'")

    # Sample randomly
    sampled = class_rows.sample(
        n=min(num_rows, len(class_rows)),
        random_state=seed
    )

    # Save to CSV
    sampled.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f"✅ Saved {len(sampled)} rows for class '{label_name}' to {output_path}")

    return sampled


In [10]:
# Suppose dataset is your HF dataset
sampled_df = sample_class_to_csv(
    dataset=dataset,
    label_name="Stress",                 # class to sample (e.g., minority class)
    num_rows=2000,               # number of rows to take
    output_path=DATA_OUPUT_PATH
)


✅ Saved 2000 rows for class 'Stress' to ../../data/nlp/class_sampled.csv
