# Dataset Balancing: SelfMA + SBIC

Create a balanced binary classification dataset by combining:
- **Label 1 (Microaggressive)**: SelfMA dataset samples
- **Label 0 (Non-offensive)**: Social Bias Frames (SBIC) non-offensive samples

## Pipeline:
1. Setup and dependencies
2. Profanity obfuscation setup (helper functions)
3. Load SelfMA dataset (microaggressive texts)
4. Load SBIC dataset (filter non-offensive samples)
5. Balance and combine datasets
6. Save final balanced dataset
7. Summary

## 1. Setup

In [None]:
# Install dependencies
# - gdown: command-line downloader used further down to fetch the SelfMA file from Google Drive
# - profanity-obfuscation: provides the `Prof` class imported in the next cell
!pip install -U -q gdown
!pip install -q git+https://github.com/dnozza/profanity-obfuscation.git

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for profanity_obfuscation (setup.py) ... [?25l[?25hdone


In [None]:
import os
import pandas as pd
import requests
import shutil
import urllib.request
import tarfile
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_dataset
from profanity_obfuscation import Prof

# Configuration
# Google Drive folder the final balanced dataset is written to (Drive must be mounted first).
BASE_PATH = '/content/drive/MyDrive/266_project/'
# Seed passed to every train/test split and sampling call in this notebook.
RANDOM_SEED = 42

In [None]:
# Mount Google Drive
# Exposes Drive under /content/drive; BASE_PATH points inside this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 2. Profanity Obfuscation Setup

In [None]:
# Download profanity table and setup obfuscator
# The table is cached next to the notebook, so the download only happens once per runtime.
local_profanity_table_path = 'prof_table.tsv'
if not os.path.exists(local_profanity_table_path):
    profanity_table_url = 'https://raw.githubusercontent.com/dnozza/profanity-obfuscation/main/resources/prof_table.tsv'
    # requests has no default timeout; without one a stalled connection would
    # block this cell forever.
    response = requests.get(profanity_table_url, timeout=30)
    # Fail loudly on HTTP errors instead of saving an error page as the table.
    response.raise_for_status()
    with open(local_profanity_table_path, 'wb') as f:
        f.write(response.content)

class CustomProfanityObfuscator(Prof):
    """`Prof` subclass that loads its profanity table from a caller-supplied TSV file.

    The parent initializer is not invoked; only `prof_table` is populated here.
    """

    def __init__(self, profanity_table_path):
        """Read the tab-separated file at `profanity_table_path` into `self.prof_table`."""
        table = pd.read_csv(profanity_table_path, sep="\t")
        self.prof_table = table

# Single shared obfuscator instance backed by the locally cached profanity table.
obfuscator = CustomProfanityObfuscator(local_profanity_table_path)

def process_text(text):
    """Apply profanity obfuscation to standardize text.

    Args:
        text: The raw string to obfuscate. Missing values (``None``, ``NaN``,
            ``pd.NA``) are treated as the empty string.

    Returns:
        Whatever ``obfuscator.obfuscate_string`` returns for the (possibly
        defaulted) input.
    """
    # pandas represents missing cells as NaN / pd.NA rather than None, so a
    # plain `is None` check lets them through to the obfuscator as non-strings.
    is_missing = text is None or (
        not isinstance(text, str)
        and pd.api.types.is_scalar(text)
        and pd.isna(text)
    )
    if is_missing:
        text = ""
    return obfuscator.obfuscate_string(text)

print("Profanity obfuscator ready.")

Profanity obfuscator ready.


## 3. Load SelfMA Dataset

SelfMA contains microaggressive text samples. Only entries with at least one tag and a non-empty quote are kept; every kept sample is labeled **1 (microaggressive)**.

In [None]:
# Download SelfMA dataset
# file_id is the Google Drive ID of the file; file_name is where gdown writes it locally.
file_id = '138fisv1BB7pEDlc0bJQzoAV4Wgewi9bV'
file_name = 'self_MA.json'
# IPython substitutes {file_id} and {file_name} with the Python variables above.
!gdown --id {file_id} -O {file_name}

# lines=True: the file is read as one JSON record per line.
self_MA = pd.read_json(file_name, lines=True)
print(f"SelfMA loaded: {len(self_MA)} samples")
print(f"Columns: {list(self_MA.columns)}")

Downloading...
From: https://drive.google.com/uc?id=138fisv1BB7pEDlc0bJQzoAV4Wgewi9bV
To: /content/self_MA.json
100% 1.96M/1.96M [00:00<00:00, 144MB/s]
SelfMA loaded: 3240 samples
Columns: ['id', 'type', 'quote', 'text', 'transcript', 'media_url', 'permalink', 'tags', 'n_hearts', 'n_comments', 'time_ago']


In [None]:
# Process SelfMA: obfuscate profanity and reduce to the (text, label) pairs used for ML
unique_tags = {
    tag
    for tags in self_MA["tags"]
    if isinstance(tags, list)
    for tag in tags
}
print(f"Unique tags found: {len(unique_tags)}")


def _has_tags(value):
    """True when `value` is a non-empty list of tags."""
    return isinstance(value, list) and len(value) > 0


# Only tagged entries are kept (these are treated as microaggressions)
has_tags = self_MA["tags"].apply(_has_tags)
self_MA_filtered = self_MA[has_tags].copy()

# 'text' is rebuilt from 'quote': obfuscate first, then drop surrounding double quotes.
# Every retained row gets label 1 (microaggressive).
self_MA_filtered = self_MA_filtered.assign(
    text=self_MA_filtered['quote'].apply(process_text).str.strip('"'),
    label=1,
)

# Reduce to the two modelling columns and discard rows whose text is blank
text_and_label = self_MA_filtered[['text', 'label']]
is_non_blank = text_and_label['text'].str.strip() != ''
selfma_df = text_and_label[is_non_blank].reset_index(drop=True)

print(f"SelfMA processed: {len(selfma_df)} samples (all label=1)")
display(selfma_df.head())

Unique tags found: 65
SelfMA processed: 1300 samples (all label=1)


Unnamed: 0,text,label
0,"Yeah, but you're not that kind of Native.",1
1,CAN YOU HEAR ME?,1
2,All immigrants should go back to their own cou...,1
3,Whenever black people come to the beach I star...,1
4,I think black girls with short hair are ugly.,1


In [None]:
# Split SelfMA 80/10/10: carve off 20% first, then halve it into validation and test
train_df, temp_df = train_test_split(selfma_df, test_size=0.2, random_state=RANDOM_SEED)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=RANDOM_SEED)

# Map split name -> frame so the DatasetDict can be built in one pass
selfma_frames = {
    "train": train_df,
    "validation": valid_df,
    "test": test_df,
}
selfMA_ds = DatasetDict({
    name: Dataset.from_pandas(frame.reset_index(drop=True))
    for name, frame in selfma_frames.items()
})

print("SelfMA splits created:")
for split_name, split_ds in selfMA_ds.items():
    print(f"  {split_name}: {len(split_ds)} samples")

SelfMA splits created:
  train: 1040 samples
  validation: 130 samples
  test: 130 samples


## 4. Load Social Bias Frames (SBIC) Dataset

SBIC contains posts labeled for offensiveness. We extract **non-offensive samples (label=0)**
to balance against SelfMA's microaggressive samples.

In [None]:
# Download and extract SBIC dataset
local_csv_dir = "./SBIC_data"
data_url = "https://homes.cs.washington.edu/~msap/social-bias-frames/SBIC.v2.tgz"
local_tgz_file = "./SBIC.v2.tgz"


def _csv_files_in(directory):
    """Return the names of the CSV files directly inside `directory` ([] if it is missing)."""
    if not os.path.isdir(directory):
        return []
    return [f for f in os.listdir(directory) if f.endswith('.csv')]


def _download_to_file(url, destination, chunk_size=8192):
    """Stream `url` to `destination`, showing a tqdm progress bar.

    Chunks are written straight to disk instead of being concatenated in
    memory. The data goes to a temporary ``.part`` file that is renamed only
    after the download finishes, so an interrupted download is never mistaken
    for a complete archive on the next run.
    """
    partial_path = destination + ".part"
    # Both the HTTP response and the output file are closed by the `with`.
    with urllib.request.urlopen(url, timeout=60) as response, open(partial_path, 'wb') as out_file:
        total_size = int(response.headers.get('Content-Length', 0))
        with tqdm(total=total_size, unit='B', unit_scale=True, desc="Downloading") as pbar:
            while True:
                chunk = response.read(chunk_size)
                if not chunk:
                    break
                out_file.write(chunk)
                pbar.update(len(chunk))
    os.replace(partial_path, destination)


def _extract_csvs(archive_path, target_dir):
    """Extract the gzip tar at `archive_path` and flatten its CSV files into `target_dir`."""
    with tarfile.open(archive_path, 'r:gz') as tar:
        if hasattr(tarfile, 'data_filter'):
            # The 'data' filter rejects members that would escape target_dir
            # and silences the extractall() DeprecationWarning on Python 3.12+.
            tar.extractall(target_dir, filter='data')
        else:
            # Older Python without extraction filters: previous behavior.
            tar.extractall(target_dir)

    # Move CSV files from any extracted sub-directory up to target_dir
    for root, _dirs, files in os.walk(target_dir):
        if root == target_dir:
            continue
        for file in files:
            if file.endswith('.csv'):
                shutil.move(os.path.join(root, file), os.path.join(target_dir, file))


# Skip all work when a previous run already left CSVs in place
if not _csv_files_in(local_csv_dir):
    os.makedirs(local_csv_dir, exist_ok=True)

    if not os.path.exists(local_tgz_file):
        print("Downloading SBIC dataset...")
        _download_to_file(data_url, local_tgz_file)

    print("Extracting archive...")
    _extract_csvs(local_tgz_file, local_csv_dir)

print(f"SBIC data ready in {local_csv_dir}")
print(f"Files: {_csv_files_in(local_csv_dir)}")

Downloading SBIC dataset...


Downloading:   0%|          | 0.00/9.46M [00:00<?, ?B/s]

Extracting archive...


  tar.extractall(local_csv_dir)


SBIC data ready in ./SBIC_data
Files: ['SBIC.v2.tst.csv', 'SBIC.v2.trn.csv', 'SBIC.v2.agg.tst.csv', 'SBIC.v2.agg.dev.csv', 'SBIC.v2.dev.csv', 'SBIC.v2.agg.trn.csv']


In [None]:
# Load the three SBIC CSVs (train / dev / test) from the extraction directory
train_df_sbic, valid_df_sbic, test_df_sbic = (
    pd.read_csv(os.path.join(local_csv_dir, csv_name))
    for csv_name in ('SBIC.v2.trn.csv', 'SBIC.v2.dev.csv', 'SBIC.v2.tst.csv')
)

# Report the row count of each split
print("SBIC loaded:")
for title, frame in (
    ("Train", train_df_sbic),
    ("Validation", valid_df_sbic),
    ("Test", test_df_sbic),
):
    print(f"  {title}: {len(frame)} samples")

SBIC loaded:
  Train: 112900 samples
  Validation: 16738 samples
  Test: 17501 samples


In [None]:
def prepare_sbic_split(df):
    """Prepare an SBIC split: keep unanimously non-offensive posts, obfuscate, label 0.

    The non-aggregated SBIC CSVs hold one row per (post, annotator) annotation,
    so the same post can appear several times with different `offensiveYN`
    ratings. Filtering rows on `offensiveYN == 0.0` alone would therefore
    (a) put duplicate texts into the label-0 pool and (b) keep posts that
    another annotator rated offensive or ambiguous. This function keeps a post
    only when every one of its annotations is exactly 0.0, and keeps it once.

    Args:
        df: Raw SBIC frame with at least the columns `post` and `offensiveYN`.

    Returns:
        A new DataFrame with columns `text` (obfuscated post) and `label`
        (always 0), one row per retained post, with a fresh 0..n-1 index.
        `df` itself is not modified.
    """
    # Rows without a post cannot yield a text sample.
    annotations = df.dropna(subset=['post'])

    # Any rating other than exactly 0.0 (1.0, ambiguous 0.5, or missing)
    # disqualifies the whole post.
    disputed_posts = set(annotations.loc[annotations['offensiveYN'] != 0.0, 'post'])
    is_unanimous = ~annotations['post'].isin(disputed_posts)

    # One row per post; filter before obfuscating so process_text only runs on
    # the rows that are actually kept.
    prepared = (
        annotations.loc[is_unanimous, ['post']]
        .drop_duplicates(subset='post')
        .copy()
    )
    prepared['text'] = prepared['post'].apply(process_text)
    prepared['label'] = 0

    # Clean up: keep the modelling columns and drop blank texts
    prepared = prepared[['text', 'label']]
    prepared = prepared[prepared['text'].str.strip() != ''].reset_index(drop=True)

    return prepared

# Run the same preparation over the raw train / validation / test frames
sbic_train, sbic_valid, sbic_test = (
    prepare_sbic_split(raw_split)
    for raw_split in (train_df_sbic, valid_df_sbic, test_df_sbic)
)

# Wrap the prepared frames in a DatasetDict keyed by split name
sbic_frames = {
    "train": sbic_train,
    "validation": sbic_valid,
    "test": sbic_test,
}
sbic_ds = DatasetDict({
    name: Dataset.from_pandas(frame) for name, frame in sbic_frames.items()
})

print("SBIC non-offensive samples extracted:")
for split_name, split_ds in sbic_ds.items():
    print(f"  {split_name}: {len(split_ds)} samples (all label=0)")

SBIC non-offensive samples extracted:
  train: 46673 samples (all label=0)
  validation: 5736 samples (all label=0)
  test: 5500 samples (all label=0)


## 5. Balance and Combine Datasets

For each split:
1. Take all SelfMA samples (label=1)
2. Sample equal number from SBIC (label=0)
3. Combine and shuffle

In [None]:
def balance_and_combine(selfma_split, sbic_split, split_name):
    """Build a 1:1 mix of SelfMA (label=1) and SBIC (label=0) rows for one split.

    Both inputs are converted with `.to_pandas()`. The same number of rows,
    the size of the smaller input, is sampled from each using RANDOM_SEED;
    the two samples are concatenated and shuffled. A one-line summary tagged
    with `split_name` is printed, and the shuffled frame is returned.
    """
    positives = selfma_split.to_pandas()
    negatives = sbic_split.to_pandas()

    # The smaller source caps how many rows each class contributes
    per_class = min(len(positives), len(negatives))

    # Draw the same number of rows from each source, SelfMA first
    halves = [
        frame.sample(n=per_class, random_state=RANDOM_SEED).reset_index(drop=True)
        for frame in (positives, negatives)
    ]

    # Stack the two halves, then shuffle so the labels are interleaved
    shuffled = (
        pd.concat(halves, ignore_index=True)
        .sample(frac=1, random_state=RANDOM_SEED)
        .reset_index(drop=True)
    )

    print(f"{split_name}: {per_class} label=1 + {per_class} label=0 = {len(shuffled)} total")
    return shuffled

print("Balancing datasets...")
# Balance each split in order; the second element is the title used in the printed summary
balanced_train, balanced_valid, balanced_test = (
    balance_and_combine(selfMA_ds[split_key], sbic_ds[split_key], title)
    for split_key, title in (
        ('train', 'Train'),
        ('validation', 'Validation'),
        ('test', 'Test'),
    )
)

Balancing datasets...
Train: 1040 label=1 + 1040 label=0 = 2080 total
Validation: 130 label=1 + 130 label=0 = 260 total
Test: 130 label=1 + 130 label=0 = 260 total


In [None]:
# Assemble the final balanced DatasetDict from the three balanced frames
balanced_frames = {
    "train": balanced_train,
    "validation": balanced_valid,
    "test": balanced_test,
}
balanced_selfMA_SBIC_ds = DatasetDict({
    name: Dataset.from_pandas(frame) for name, frame in balanced_frames.items()
})

# Report size and per-label counts of every split
print("\nFinal balanced dataset:")
for split_name, split_ds in balanced_selfMA_SBIC_ds.items():
    label_counts = split_ds.to_pandas()['label'].value_counts()
    print(f"  {split_name}: {len(split_ds)} samples")
    print(f"    Label distribution: {dict(label_counts)}")


Final balanced dataset:
  train: 2080 samples
    Label distribution: {0: np.int64(1040), 1: np.int64(1040)}
  validation: 260 samples
    Label distribution: {1: np.int64(130), 0: np.int64(130)}
  test: 260 samples
    Label distribution: {1: np.int64(130), 0: np.int64(130)}


## 6. Save Dataset

In [None]:
# Save to Google Drive
# os.path.join adds the separator only when needed, so the path stays correct
# whether or not BASE_PATH ends with a slash.
output_path = os.path.join(BASE_PATH, 'balanced_selfMA_SBIC_ds')
balanced_selfMA_SBIC_ds.save_to_disk(output_path)
print(f"Dataset saved to: {output_path}")

Saving the dataset (0/1 shards):   0%|          | 0/2080 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/260 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/260 [00:00<?, ? examples/s]

Dataset saved to: /content/drive/MyDrive/266_project/balanced_selfMA_SBIC_ds


## 7. Summary

In [None]:
# Banner around the completion message
banner = "=" * 60
print(banner)
print("DATASET CREATION COMPLETE: SelfMA + SBIC")
print(banner)

# Look up each split size once; the total is the sum over the three splits
split_sizes = {
    name: len(balanced_selfMA_SBIC_ds[name])
    for name in ['train', 'validation', 'test']
}
total = sum(split_sizes.values())
print(f"""
Dataset Structure:
------------------
- Label 0: Non-offensive text (from SBIC)
- Label 1: Microaggressive text (from SelfMA)

Split Sizes:
------------
- Train:      {split_sizes['train']:,} samples
- Validation: {split_sizes['validation']:,} samples
- Test:       {split_sizes['test']:,} samples
- Total:      {total:,} samples

Class Balance: 1:1 (perfectly balanced)

Output: {output_path}
""")

# Preview the first ten training rows
print("Sample data:")
train_preview = balanced_selfMA_SBIC_ds['train'].to_pandas().head(10)
display(train_preview)

DATASET CREATION COMPLETE: SelfMA + SBIC

Dataset Structure:
------------------
- Label 0: Non-offensive text (from SBIC)
- Label 1: Microaggressive text (from SelfMA)

Split Sizes:
------------
- Train:      2,080 samples
- Validation: 260 samples
- Test:       260 samples
- Total:      2,600 samples

Class Balance: 1:1 (perfectly balanced)

Output: /content/drive/MyDrive/266_project/balanced_selfMA_SBIC_ds

Sample data:


Unnamed: 0,text,label
0,Dad:: You saw Zero Dark Thirty right? Who's in...,0
1,I didn't watch the halftime show because she d...,1
2,"I have lots of sympathy for women, but I have ...",1
3,Are you sure you are alright with driving a st...,1
4,"f you don’t build your dream, someone else wil...",0
5,Evil seeks to maintain power by suppressing th...,0
6,I don't get those people. Why can't they just ...,1
7,But you're not REALLY disabled. You look so pr...,1
8,"If Rihanna likes the way it hurts, why did she...",1
9,"We'll give you this one, trust me Andre is not...",1
