# Install

In [None]:
!pip install -U -q gdown
!pip install git+https://github.com/dnozza/profanity-obfuscation.git

Collecting git+https://github.com/dnozza/profanity-obfuscation.git
  Cloning https://github.com/dnozza/profanity-obfuscation.git to /tmp/pip-req-build-3hfjy_q7
  Running command git clone --filter=blob:none --quiet https://github.com/dnozza/profanity-obfuscation.git /tmp/pip-req-build-3hfjy_q7
  Resolved https://github.com/dnozza/profanity-obfuscation.git to commit 48238b96463138c418325d0636c1dd16e8e54566
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: profanity_obfuscation
  Building wheel for profanity_obfuscation (setup.py) ... [?25l[?25hdone
  Created wheel for profanity_obfuscation: filename=profanity_obfuscation-0.1.0-py2.py3-none-any.whl size=4283 sha256=6940b2caba63546e80682efec1307433919426f48b80e425aafc45295951f433
  Stored in directory: /tmp/pip-ephem-wheel-cache-fjvtfwfl/wheels/46/37/a0/b5532a9e76d6d4f7cb7e513adbfd9cb39d6c0e16862f556d6c
Successfully built profanity_obfuscation
Installing collected packages: profanity_obfuscatio

# Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from profanity_obfuscation import Prof
import os
import requests
import shutil
import urllib.request
import tarfile
from tqdm.auto import tqdm # Use tqdm.auto for better Colab integration

# Helper functions

In [None]:
# Profanity obfuscation check
# The lookup table is cached next to the notebook; it is only downloaded when the file is absent.
local_profanity_table_path = 'prof_table.tsv'
if not os.path.exists(local_profanity_table_path):
    profanity_table_url = 'https://raw.githubusercontent.com/dnozza/profanity-obfuscation/main/resources/prof_table.tsv'
    # A timeout keeps the cell from hanging indefinitely if the server stops responding.
    response = requests.get(profanity_table_url, timeout=30)
    response.raise_for_status() # Raise an exception for bad status codes
    with open(local_profanity_table_path, 'wb') as f:
        f.write(response.content)


class CustomProfanityObfuscator(Prof):
    """`Prof` subclass whose lookup table is read from a caller-supplied TSV file.

    NOTE(review): `Prof.__init__` is not invoked here; this assumes `prof_table`
    is the only attribute the inherited methods rely on — confirm against the
    installed `profanity_obfuscation` package.
    """

    def __init__(self, profanity_table_path):
        # The table file is tab-separated.
        table = pd.read_csv(profanity_table_path, sep="\t")
        self.prof_table = table


# Single shared instance used by the text-processing helper in this notebook.
obfuscator = CustomProfanityObfuscator(local_profanity_table_path)


def process_text(text):
    """Obfuscate profanity in a single piece of text.

    Args:
        text: The raw text. ``None`` and pandas missing values (``NaN`` / ``pd.NA``,
            which is what ``Series.apply`` passes for empty cells) are treated as an
            empty string; any other non-string value is converted with ``str``.

    Returns:
        The result of ``obfuscator.obfuscate_string`` applied to the cleaned text.
    """
    if not isinstance(text, str):
        # `is None` alone misses NaN, which pandas uses for missing cells.
        is_missing = text is None or (pd.api.types.is_scalar(text) and pd.isna(text))
        text = "" if is_missing else str(text)
    # Assuming obfuscate_string handles standardization internally
    return obfuscator.obfuscate_string(text)

# Social Bias Dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import os
import requests
import urllib.request
import tarfile
from tqdm.auto import tqdm # Use tqdm.auto for better Colab integration
import shutil

print("Loading Social Bias Frames dataset...")

local_csv_dir = "./SBIC_data"

# Download and extract if needed (skipped when the target directory already holds CSV files)
if not os.path.exists(local_csv_dir) or len([f for f in os.listdir(local_csv_dir) if f.endswith('.csv')]) == 0:
    data_url = "https://homes.cs.washington.edu/~msap/social-bias-frames/SBIC.v2.tgz"
    local_tgz_file = "./SBIC.v2.tgz"

    os.makedirs(local_csv_dir, exist_ok=True)

    if not os.path.exists(local_tgz_file):
        print(f"Downloading Social Bias Frames archive from: {data_url}")
        print("Note: This file is large - download may take a few minutes...")

        # Stream straight to disk instead of growing a bytes object chunk by chunk.
        # The data goes to a ".part" file that is renamed only on success, so an
        # interrupted download is never mistaken for a complete archive on re-run.
        partial_tgz_file = local_tgz_file + ".part"
        block_size = 8192
        with urllib.request.urlopen(data_url) as response, open(partial_tgz_file, 'wb') as f:
            total_size = int(response.headers.get('Content-Length', 0))

            if total_size > 0:
                with tqdm(total=total_size, unit='B', unit_scale=True, desc="Downloading") as pbar:
                    while True:
                        chunk = response.read(block_size)
                        if not chunk:
                            break
                        f.write(chunk)
                        pbar.update(len(chunk))
            else:
                # Unknown length: no progress bar, just copy the stream.
                shutil.copyfileobj(response, f, block_size)

        os.replace(partial_tgz_file, local_tgz_file)
        print(f"Downloaded and saved to: {local_tgz_file}")
    else:
        print(f"Found local archive: {local_tgz_file}")

    print("Extracting archive...")
    temp_extract_dir = os.path.join(local_csv_dir, 'temp_extract')
    os.makedirs(temp_extract_dir, exist_ok=True)

    try:
        with tarfile.open(local_tgz_file, 'r:gz') as tar:
            csv_members = [m for m in tar.getmembers() if m.name.endswith('.csv')]
            print(f"Found {len(csv_members)} CSV files in archive:")
            for member in csv_members:
                print(f"  - {member.name}")

            # Only the CSV members are needed. The 'data' filter rejects unsafe
            # members (absolute paths, "..", links outside the target) and avoids
            # the extractall() deprecation warning on newer Python versions.
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(temp_extract_dir, members=csv_members, filter='data')
            else:
                # Older Python without extraction filters: no sanitisation available here.
                tar.extractall(temp_extract_dir, members=csv_members)

        # Flatten: move every extracted CSV directly into local_csv_dir.
        for root, dirs, files in os.walk(temp_extract_dir):
            for file in files:
                if file.endswith('.csv'):
                    src = os.path.join(root, file)
                    dst = os.path.join(local_csv_dir, file)
                    shutil.move(src, dst)
                    print(f"  Moved {file} to {local_csv_dir}")
    finally:
        # Remove the scratch directory even if extraction or moving failed.
        shutil.rmtree(temp_extract_dir, ignore_errors=True)
else:
    print(f"Found extracted CSV files in: {local_csv_dir}")
    csv_files_found = [f for f in os.listdir(local_csv_dir) if f.endswith('.csv')]
    print(f"  CSV files: {csv_files_found}")


# Define paths to the original split files
train_filepath = os.path.join(local_csv_dir, 'SBIC.v2.trn.csv')
val_filepath = os.path.join(local_csv_dir, 'SBIC.v2.dev.csv')
test_filepath = os.path.join(local_csv_dir, 'SBIC.v2.tst.csv')

# A split counts as found when its expected file is already in place, so the
# fallback search below only fills in the paths that are actually missing.
found_train = os.path.exists(train_filepath)
found_val = os.path.exists(val_filepath)
found_test = os.path.exists(test_filepath)

# Check if original split files exist
if not (found_train and found_val and found_test):
    print("Warning: Original split files not found in the expected location after download/extraction.")
    print("Attempting to locate files in the extracted directory...")

    # NOTE(review): matching is by substring and os.walk order is not guaranteed,
    # so an aggregated file such as 'SBIC.v2.agg.trn.csv' can be picked up here.
    for root, dirs, files in os.walk(local_csv_dir):
        for file in files:
            if not file.endswith('.csv'):
                continue
            file_path = os.path.join(root, file)
            file_name_lower = file.lower()
            # `and` binds tighter than `or`, so the alternatives are parenthesised;
            # otherwise the "not found yet" guard only applies to the last one.
            if not found_train and 'trn' in file_name_lower:
                train_filepath = file_path
                found_train = True
            elif not found_val and ('dev' in file_name_lower or 'val' in file_name_lower):
                val_filepath = file_path
                found_val = True
            elif not found_test and ('tst' in file_name_lower or 'test' in file_name_lower):
                test_filepath = file_path
                found_test = True

    if not (found_train and found_val and found_test):
        raise FileNotFoundError("Could not find all original split files (train, dev, test) after download and extraction.")


print("\nLoading data from original splits:")

# Read the three split CSVs; any failure is re-raised as a single IOError.
try:
    train_df_sbf, valid_df_sbf, test_df_sbf = (
        pd.read_csv(split_path)
        for split_path in (train_filepath, val_filepath, test_filepath)
    )

    for split_label, split_frame in (
        ("Train", train_df_sbf),
        ("Validation", valid_df_sbf),
        ("Test", test_df_sbf),
    ):
        print(f"  {split_label}: {len(split_frame)} samples")

except Exception as e:
    raise IOError(f"Error loading original split files: {e}")

# Add a 'processed_<column>' copy of each text column that exists,
# produced by running every cell through process_text.
for split_frame in (train_df_sbf, valid_df_sbf, test_df_sbf):
    for raw_column in ('post', 'context'):
        if raw_column in split_frame.columns:
            split_frame[f'processed_{raw_column}'] = split_frame[raw_column].apply(process_text)


# --- Data Preparation for ML ---
def prepare_sbf_data_split(df, text_mode='post_only', label_mode='binary'):
    """
    Prepares a specific split of the Social Bias Frames data for ML.

    Args:
        df (pd.DataFrame): The input DataFrame for a split (e.g., train_df_sbf).
            The 'processed_post' / 'processed_context' columns are preferred when
            present, otherwise 'post' / 'context' are used.
        text_mode (str): 'post_only' or 'post_plus_context'. In the latter mode the
            non-empty parts are joined with ' [SEP] '; if no context column exists
            a warning is printed and only the post is used.
        label_mode (str): 'binary' or 'offensiveYN'.
            'binary' uses, in order of availability: 'offensiveYN' (rows equal to
            0.5 are dropped, 1.0 -> 1, anything else -> 0), 'targetStereotype'
            (1 when not null), or a constant 1 with a printed warning.
            'offensiveYN' keeps the original column values.

    Returns:
        pd.DataFrame: DataFrame with 'text' and 'label' columns.

    Raises:
        ValueError: If no post column is found, or if text_mode / label_mode is
            not supported (including 'offensiveYN' without that column).
    """
    prepared_df = df.copy()

    # Determine the text column to use based on processed columns
    text_col = next((c for c in ('processed_post', 'post') if c in prepared_df.columns), None)
    context_col = next((c for c in ('processed_context', 'context') if c in prepared_df.columns), None)

    if text_col is None:
        raise ValueError("Could not find a suitable text column ('processed_post' or 'post')")

    if text_mode not in ('post_only', 'post_plus_context'):
        raise ValueError("text_mode must be 'post_only' or 'post_plus_context'")

    # fillna must run BEFORE astype(str); the other order turns missing values
    # into the literal strings 'nan' / 'None'.
    post_text = prepared_df[text_col].fillna('').astype(str)

    # Select text input mode
    if text_mode == 'post_only' or not context_col:
        if text_mode == 'post_plus_context':
            print(f"Warning: text_mode set to 'post_plus_context' but no context column found. Using '{text_col}_only'.")
        prepared_df['text'] = post_text
    else:
        context_text = prepared_df[context_col].fillna('').astype(str)
        # Using [SEP] token for BERT compatibility. The separator is only inserted
        # between two non-empty parts, so rows with both parts keep it and rows
        # with one empty part do not get a dangling '[SEP]'.
        prepared_df['text'] = [
            ' [SEP] '.join(part for part in (post.strip(), context.strip()) if part)
            for post, context in zip(post_text, context_text)
        ]

    # Select label mode
    if label_mode == 'binary':
        # Filter out samples with offensiveYN of 0.5 and map 1.0 to 1, 0.0 to 0
        initial_count = len(prepared_df)
        if 'offensiveYN' in prepared_df.columns:
            prepared_df = prepared_df[prepared_df['offensiveYN'] != 0.5].copy()
            filtered_count = len(prepared_df)
            if initial_count > filtered_count:
                print(f"Note: Filtered out {initial_count - filtered_count} samples with offensiveYN = 0.5 for binary labeling.")
            # NOTE(review): rows with a missing offensiveYN are kept and labelled 0
            # here — confirm that is intended.
            prepared_df['label'] = (prepared_df['offensiveYN'] == 1.0).astype(int)
        elif 'targetStereotype' in prepared_df.columns:
            # If has stereotype targeting, mark as implicit-offensive (1), otherwise 0
            prepared_df['label'] = prepared_df['targetStereotype'].notna().astype(int)
        else:
            # Default: assume all are offensive/stereotyping if no clear label column
            print("Warning: No offensiveYN or targetStereotype column found for binary labeling. Defaulting to all label 1.")
            prepared_df['label'] = 1
    elif label_mode == 'offensiveYN' and 'offensiveYN' in prepared_df.columns:
        prepared_df['label'] = prepared_df['offensiveYN'] # Keep original offensiveYN
    else:
        raise ValueError("label_mode must be 'binary' or 'offensiveYN' (if column exists)")

    return prepared_df[['text', 'label']]

# Build the model-ready frame for every split: post text only, binary labels,
# and rows whose text is blank after stripping removed.
_sbf_prepared_splits = {}
for _split_name, _split_frame in (
    ("train", train_df_sbf),
    ("validation", valid_df_sbf),
    ("test", test_df_sbf),
):
    _ready = prepare_sbf_data_split(_split_frame, text_mode='post_only', label_mode='binary')
    _has_text = _ready['text'].str.strip() != ''
    _sbf_prepared_splits[_split_name] = _ready[_has_text].copy()

train_df_sbf_prepared = _sbf_prepared_splits["train"]
valid_df_sbf_prepared = _sbf_prepared_splits["validation"]
test_df_sbf_prepared = _sbf_prepared_splits["test"]


# Convert to DatasetDict (indices are reset so no stale index column is carried over)
sbf_ds = DatasetDict({
    _split_name: Dataset.from_pandas(_prepared.reset_index(drop=True))
    for _split_name, _prepared in _sbf_prepared_splits.items()
})

print("\nSocial Bias Frames DatasetDict created with original splits.")
display(sbf_ds)

Loading Social Bias Frames dataset...
Downloading Social Bias Frames archive from: https://homes.cs.washington.edu/~msap/social-bias-frames/SBIC.v2.tgz
Note: This file is large - download may take a few minutes...


Downloading:   0%|          | 0.00/9.46M [00:00<?, ?B/s]

Downloaded and saved to: ./SBIC.v2.tgz
Extracting archive...
Found 6 CSV files in archive:
  - SBIC.v2.agg.dev.csv
  - SBIC.v2.agg.trn.csv
  - SBIC.v2.agg.tst.csv
  - SBIC.v2.dev.csv
  - SBIC.v2.trn.csv
  - SBIC.v2.tst.csv


  tar.extractall(temp_extract_dir)


  Moved SBIC.v2.tst.csv to ./SBIC_data
  Moved SBIC.v2.dev.csv to ./SBIC_data
  Moved SBIC.v2.agg.dev.csv to ./SBIC_data
  Moved SBIC.v2.trn.csv to ./SBIC_data
  Moved SBIC.v2.agg.trn.csv to ./SBIC_data
  Moved SBIC.v2.agg.tst.csv to ./SBIC_data

Loading data from original splits:
  Train: 112900 samples
  Validation: 16738 samples
  Test: 17501 samples
Note: Filtered out 9992 samples with offensiveYN = 0.5 for binary labeling.
Note: Filtered out 1307 samples with offensiveYN = 0.5 for binary labeling.
Note: Filtered out 1230 samples with offensiveYN = 0.5 for binary labeling.

Social Bias Frames DatasetDict created with original splits.


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 102908
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 15431
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 16271
    })
})

In [None]:
# Display the whole train split as a pandas DataFrame (the notebook renders a truncated view)
display(sbf_ds["train"].to_pandas())

Unnamed: 0,text,label
0,RT @_LexC__: I'm convinced that some of y'all ...,1
1,RT @iBeZo: Stupid f*cking n*gger LeBron. You f...,1
2,RT @iBeZo: Stupid f*cking n*gger LeBron. You f...,1
3,RT @iBeZo: Stupid f*cking n*gger LeBron. You f...,1
4,RT @iBeZo: Stupid f*cking n*gger LeBron. You f...,1
...,...,...
102903,Thought you fellas might like to explain to so...,0
102904,Thought you fellas might like to explain to so...,1
102905,Thought you fellas might like to explain to so...,0
102906,"If female voters were discounted, Labour would...",0


# SelfMA Dataset

In [None]:
file_id = '138fisv1BB7pEDlc0bJQzoAV4Wgewi9bV'
file_name = 'self_MA.json'
!gdown --id {file_id} -O {file_name}
self_MA = pd.read_json(file_name, lines=True)

# Get all unique tags from the column
unique_tags = set(tag for tags in self_MA["tags"] if isinstance(tags, list) for tag in tags)

# Filter if necessary
include = unique_tags # {"Skin Tone", "skin tone", "racism", "orientalism", "race", "ethnicity", "eurocentrism", "eurocentricism"}
mask = self_MA["tags"].apply(lambda xs: any(t in include for t in xs))
self_MA_filtered = self_MA[mask].copy()

# Profanity obfuscation check (Keeping this for now as it was in the original code)
self_MA_filtered['processed_quote'] = self_MA_filtered['quote'].apply(process_text)
self_MA_filtered['processed_text'] = self_MA_filtered['text'].apply(process_text)


# --- Modified Data Preparation ---
def prepare_data(df, text_mode='quote_only', label_mode='binary'):
    """
    Prepares the data for ML by selecting text input and label type.

    Args:
        df (pd.DataFrame): The input DataFrame (e.g., self_MA_filtered). Must
            contain 'processed_quote' and 'tags', plus 'processed_text' when
            text_mode is 'quote_plus_text'.
        text_mode (str): 'quote_only' or 'quote_plus_text'. In the latter mode the
            non-empty parts are joined with ' [SEP] '.
        label_mode (str): 'binary' (1 if has tags, 0 otherwise) or 'tags'.

    Returns:
        pd.DataFrame: DataFrame with 'text' and 'label' columns.

    Raises:
        ValueError: If text_mode or label_mode is not one of the supported values.
    """
    def _has_tags(tags):
        # Anything without a length (None, NaN, ...) counts as "no tags".
        try:
            return 1 if len(tags) > 0 else 0
        except TypeError:
            return 0

    prepared_df = df.copy()

    # Select text input mode
    if text_mode == 'quote_only':
        prepared_df['text'] = prepared_df['processed_quote'].fillna('')
    elif text_mode == 'quote_plus_text':
        quote_part = prepared_df['processed_quote'].fillna('').astype(str)
        text_part = prepared_df['processed_text'].fillna('').astype(str)
        # Using [SEP] token for BERT compatibility. The separator is only inserted
        # between two non-empty parts, so rows with both parts keep it and rows
        # with one empty part do not get a dangling '[SEP]'.
        prepared_df['text'] = [
            ' [SEP] '.join(part for part in (quote.strip(), text.strip()) if part)
            for quote, text in zip(quote_part, text_part)
        ]
    else:
        raise ValueError("text_mode must be 'quote_only' or 'quote_plus_text'")

    # Select label mode
    if label_mode == 'binary':
        prepared_df['label'] = prepared_df['tags'].apply(_has_tags)
    elif label_mode == 'tags':
        prepared_df['label'] = prepared_df['tags'] # Keep the list of tags
    else:
        raise ValueError("label_mode must be 'binary' or 'tags'")

    return prepared_df[['text', 'label']]

# Example usage:
# To get data with only quotes and binary labels:
data_for_split_binary_quote = prepare_data(self_MA_filtered, text_mode='quote_only', label_mode='binary')

# To get data with combined quote+text and binary labels:
#data_for_split_binary_quotetext = prepare_data(self_MA_filtered, text_mode='quote_plus_text', label_mode='binary')

# To get data with only quotes and original tags as labels:
# data_for_split_tags_quote = prepare_data(self_MA_filtered, text_mode='quote_only', label_mode='tags')

# To get data with combined quote+text and original tags as labels:
# data_for_split_tags_quotetext = prepare_data(self_MA_filtered, text_mode='quote_plus_text', label_mode='tags')


# Use the prepared data for splitting
data_for_split = data_for_split_binary_quote # Quote-only text with binary labels is what gets split below

# Drop rows with empty text
data_for_split = data_for_split[data_for_split['text'].str.strip() != ''].copy()


# Split into training (80%) and temporary (20%) sets
# No stratify argument is passed: there is only one class after filtering
train_df, temp_df = train_test_split(data_for_split, test_size=0.2, random_state=42)

# Split the temporary set in half: validation (10% of the data) and test (10% of the data)
# No stratify argument is passed: there is only one class after filtering
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Reset the index of all splits so each starts at 0
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)


print("Shape of training data:", train_df.shape)
print("Shape of validation data:", valid_df.shape)
print("Shape of test data:", test_df.shape)

# Convert to DatasetDict
selfMA_ds = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(valid_df),
    "test": Dataset.from_pandas(test_df),
})

Downloading...
From: https://drive.google.com/uc?id=138fisv1BB7pEDlc0bJQzoAV4Wgewi9bV
To: /content/self_MA.json
100% 1.96M/1.96M [00:00<00:00, 128MB/s]
Shape of training data: (1040, 2)
Shape of validation data: (130, 2)
Shape of test data: (130, 2)


In [None]:
# Show the SelfMA train split as a pandas DataFrame
display(selfMA_ds["train"].to_pandas())

Unnamed: 0,text,label
0,"""If I can't do the accent how can I say it the...",1
1,"""I'm not a misogynist! I don't hate woman! It'...",1
2,"""Oh... So I'm just going to put that you aren'...",1
3,"""My friend Kelly; she's black, but she's reall...",1
4,"""God, they should just shut up and be happy fo...",1
...,...,...
1035,"""Are you on your period?""",1
1036,"""If you keep getting fatter, men won't want to...",1
1037,"""Either that guy doesn't really care about his...",1
1038,"""It's okay that you're gay, just please stick ...",1


# Balanced SelfMA

In [None]:
# Assuming selfMA_ds (SelfMA DatasetDict) and sbf_ds (Social Bias Frames DatasetDict) are available

def _balance_split(split_name, selfma_split, sbf_split, seed=42):
    """Build one balanced, shuffled split from SelfMA rows and label-0 SBF rows.

    Args:
        split_name (str): Name used in the progress message (e.g. 'train').
        selfma_split: SelfMA dataset split; every row is used as a label-1 candidate.
        sbf_split: Social Bias Frames dataset split; only rows with label 0 are used.
        seed (int): random_state used for both sub-sampling steps and the final shuffle.

    Returns:
        pd.DataFrame: An equal number of rows from each source, shuffled, with a fresh index.
    """
    # NOTE(review): the SelfMA rows are taken as-is and described as label 1 in the
    # message below; this assumes the split contains no label-0 rows — confirm.
    selfma_label_1_df = selfma_split.to_pandas()
    sbf_label_0_df = sbf_split.filter(lambda example: example['label'] == 0).to_pandas()

    # Down-sample the larger side so both sources contribute the same number of rows.
    num_samples = min(len(selfma_label_1_df), len(sbf_label_0_df))
    print(f"Balancing {split_name} split with {num_samples} samples of label 1 (SelfMA) and {num_samples} samples of label 0 (Social Bias Frames).")
    balanced_selfma = selfma_label_1_df.sample(n=num_samples, random_state=seed).reset_index(drop=True)
    balanced_sbf = sbf_label_0_df.sample(n=num_samples, random_state=seed).reset_index(drop=True)

    # Concatenate, then shuffle so the two sources are interleaved.
    balanced_df = pd.concat([balanced_selfma, balanced_sbf], ignore_index=True)
    return balanced_df.sample(frac=1, random_state=seed).reset_index(drop=True)


print("Creating balanced dataset splits...")

# The same balancing procedure is applied to every split.
balanced_train_df = _balance_split("train", selfMA_ds["train"], sbf_ds["train"])
balanced_valid_df = _balance_split("validation", selfMA_ds["validation"], sbf_ds["validation"])
balanced_test_df = _balance_split("test", selfMA_ds["test"], sbf_ds["test"])


# Create a new DatasetDict with the balanced splits
balanced_selfMA_ds_quoteonly = DatasetDict({
    "train": Dataset.from_pandas(balanced_train_df),
    "validation": Dataset.from_pandas(balanced_valid_df),
    "test": Dataset.from_pandas(balanced_test_df)
})

print("\nBalanced SelfMA and Social Bias Frames DatasetDict created.")
display(balanced_selfMA_ds_quoteonly)

Creating balanced dataset splits...


Filter:   0%|          | 0/102908 [00:00<?, ? examples/s]

Balancing train split with 1040 samples of label 1 (SelfMA) and 1040 samples of label 0 (Social Bias Frames).


Filter:   0%|          | 0/15431 [00:00<?, ? examples/s]

Balancing validation split with 130 samples of label 1 (SelfMA) and 130 samples of label 0 (Social Bias Frames).


Filter:   0%|          | 0/16271 [00:00<?, ? examples/s]

Balancing test split with 130 samples of label 1 (SelfMA) and 130 samples of label 0 (Social Bias Frames).

Balanced SelfMA and Social Bias Frames DatasetDict created.


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2080
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 260
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 260
    })
})

In [None]:
# Show the balanced train split as a pandas DataFrame
display(balanced_selfMA_ds_quoteonly["train"].to_pandas())

Unnamed: 0,text,label
0,Most comedians say Michael J. Fox is an easy t...,0
1,"""I didn't watch the halftime show because she ...",1
2,"""I have lots of sympathy for women, but I have...",1
3,"""Are you sure you are alright with driving a s...",1
4,always remind myself nothing product about ins...,0
...,...,...
2075,.@SwiftOnSecurity HTTP going over WiFi has end...,0
2076,"Kalispell, Montana - YouTube Elliott shooting ...",0
2077,"I won Wyoming on a scratch card, so what?",0
2078,What are some movies and TV shows which realis...,0


In [None]:
# Save in Datasets native format (best for reloading)
from datasets import Dataset, DatasetDict


# The dataset is written to Google Drive rather than the local runtime disk.
# `google.colab` is imported here, so this cell only runs inside Colab.
from google.colab import drive
drive.mount('/content/drive')
# Destination is a fixed folder under the mounted drive.
balanced_selfMA_ds_quoteonly.save_to_disk("/content/drive/MyDrive/266_project/balanced_selfMA_ds_quoteonly")


Mounted at /content/drive


Saving the dataset (0/1 shards):   0%|          | 0/2080 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/260 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/260 [00:00<?, ? examples/s]