In [None]:
# Install necessary libraries
!pip install -q datasets transformers
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import torch
from transformers import BertForSequenceClassification, AutoTokenizer, get_scheduler
from torch.optim import AdamW
from torch.utils.data import DataLoader
from datasets import Dataset
from sklearn.metrics import accuracy_score

# Load the dataset
# Annotated RedditBias comment files, one CSV per target group.
url = "https://raw.githubusercontent.com/nataliecclaire/RedditBias/master/data/orientation/reddit_comments_orientation_lgbtq_processed_phrase_annotated.csv"
gender_url = "https://raw.githubusercontent.com/nataliecclaire/RedditBias/master/data/gender/reddit_comments_gender_female_processed_phrase_annotated.csv"
religion1_url = "https://raw.githubusercontent.com/nataliecclaire/RedditBias/master/data/religion1/reddit_comments_religion1_jews_processed_phrase_annotated.csv"
religion2_url = "https://raw.githubusercontent.com/nataliecclaire/RedditBias/master/data/religion2/reddit_comments_religion2_muslims_processed_phrase_annotated.csv"
race_url = "https://raw.githubusercontent.com/nataliecclaire/RedditBias/master/data/race/reddit_comments_race_black_processed_phrase_annotated.csv"

# Load and combine datasets
# NOTE(review): the orientation file is read with pandas' default encoding
# while the other four are read as latin1 -- confirm the difference is intended.
orientation_data = pd.read_csv(url)
gender_data = pd.read_csv(gender_url, encoding="latin1")
religion1_data = pd.read_csv(religion1_url, encoding="latin1")
religion2_data = pd.read_csv(religion2_url, encoding="latin1")
race_data = pd.read_csv(race_url, encoding="latin1")

# Combine all datasets into one frame with a fresh 0..n-1 index
all_data = pd.concat(
    [orientation_data, gender_data, religion1_data, religion2_data, race_data],
    ignore_index=True,
)

# Data preprocessing
# Keep only rows that have both an annotation and a comment. The explicit
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the concatenated data (SettingWithCopyWarning).
all_data = all_data.dropna(subset=['bias_sent', 'comment']).copy()
# Fold the "1 - context needed" annotation into the positive class.
all_data['bias_sent'] = all_data['bias_sent'].replace('1 - context needed', 1)
# Free-text annotations that are not usable as a binary label.
values_to_remove = [np.nan, 're-state', 'biased?', 'toxic-unrelated', 'fact?', 'question']
mask = all_data['bias_sent'].isin(values_to_remove) | all_data['bias_sent'].isna()
all_data = all_data[~mask].copy()

# Convert data types
all_data['comment'] = all_data['comment'].astype(str)
# Cast the remaining annotations to int and clamp them into the {0, 1} range.
all_data['bias_sent'] = all_data['bias_sent'].astype(int).clip(0, 1)

# Prepare input and target
X = all_data['comment']
y = all_data['bias_sent']

# Tokenizer
# Must come from the same checkpoint as the model that is fine-tuned
# ("bert-base-uncased"). A cased vocabulary would produce token ids that do
# not correspond to the uncased model's embedding table.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the ``comment`` field of a batch of examples.

    Intended for use with ``Dataset.map(..., batched=True)``.

    Args:
        examples: Mapping with a ``"comment"`` entry holding the text(s)
            to encode.

    Returns:
        The tokenizer output for the comments, padded with
        ``padding="max_length"`` and truncated with ``truncation=True``.
    """
    return tokenizer(examples["comment"], padding="max_length", truncation=True)

# K-Fold Cross-Validation
# Stratified folds keep the label ratio of `y` in every train/validation split.
k_folds = 10
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
fold_accuracies = []

# Training and evaluation loop
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
num_epochs = 10
gradient_accumulation_steps = 4  # micro-batches accumulated per optimizer update
max_grad_norm = 1.0  # gradient-norm clipping threshold

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n===== Fold {fold + 1}/{k_folds} =====")

    # Split data into training and validation sets
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Convert to HuggingFace Dataset
    train_df = pd.DataFrame({'comment': X_train, 'label': y_train})
    val_df = pd.DataFrame({'comment': X_val, 'label': y_val})
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    # The model's forward pass takes the targets under the keyword "labels".
    train_dataset = train_dataset.rename_column("label", "labels")
    val_dataset = val_dataset.rename_column("label", "labels")

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # Expose only the tensors the model consumes; other columns stay hidden.
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
    val_dataloader = DataLoader(val_dataset, batch_size=8)

    # Initialize model (fresh pretrained weights for every fold)
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model.to(device)

    # Optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    # The scheduler is stepped once per optimizer update, not once per
    # micro-batch, so its horizon must be counted in updates. Counting
    # micro-batches would stretch the warmup and leave the decay unfinished.
    updates_per_epoch = (
        len(train_dataloader) + gradient_accumulation_steps - 1
    ) // gradient_accumulation_steps
    num_training_steps = updates_per_epoch * num_epochs
    num_warmup_steps = int(0.1 * num_training_steps)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )

    # Training loop
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # Scale so the accumulated gradient is the mean over the group.
            # A trailing partial group is scaled by the same factor.
            loss = outputs.loss / gradient_accumulation_steps
            loss.backward()

            # Update after every full group and after the epoch's last batch.
            if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_dataloader):
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            if step % 50 == 0:
                # Report the unscaled batch loss, not the value divided for accumulation.
                print(f"Step {step}/{len(train_dataloader)}, Loss: {outputs.loss.item():.4f}")

    # Validation loop
    model.eval()
    all_predictions, all_labels = [], []
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(batch['labels'].cpu().numpy())

    # Calculate accuracy
    fold_accuracy = accuracy_score(all_labels, all_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold + 1}: {fold_accuracy:.4f}")

# Final cross-validation accuracy
print("\n===== Cross-Validation Results =====")
print(f"Average Accuracy: {np.mean(fold_accuracies):.4f}")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]


===== Fold 1/10 =====


Map:   0%|          | 0/10365 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Step 0/1296, Loss: 0.1764
Step 50/1296, Loss: 0.1715
Step 100/1296, Loss: 0.1468
Step 150/1296, Loss: 0.1717
Step 200/1296, Loss: 0.2063
Step 250/1296, Loss: 0.1596
Step 300/1296, Loss: 0.1830
Step 350/1296, Loss: 0.1881
Step 400/1296, Loss: 0.1718
Step 450/1296, Loss: 0.1447
Step 500/1296, Loss: 0.1788
Step 550/1296, Loss: 0.1658
Step 600/1296, Loss: 0.1780
Step 650/1296, Loss: 0.1680
Step 700/1296, Loss: 0.1720
Step 750/1296, Loss: 0.1639
Step 800/1296, Loss: 0.1747
Step 850/1296, Loss: 0.1809
Step 900/1296, Loss: 0.1743
Step 950/1296, Loss: 0.2023
Step 1000/1296, Loss: 0.2093
Step 1050/1296, Loss: 0.1809
Step 1100/1296, Loss: 0.1737
Step 1150/1296, Loss: 0.1416
Step 1200/1296, Loss: 0.1555
Step 1250/1296, Loss: 0.1684
Epoch 2/10
Step 0/1296, Loss: 0.1368
Step 50/1296, Loss: 0.1619
Step 100/1296, Loss: 0.1532
Step 150/1296, Loss: 0.1686
Step 200/1296, Loss: 0.1510
Step 250/1296, Loss: 0.1392
Step 300/1296, Loss: 0.1778
Step 350/1296, Loss: 0.1048
Step 400/1296, Loss: 0.154

Map:   0%|          | 0/10365 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Step 0/1296, Loss: 0.1868
Step 50/1296, Loss: 0.1603
Step 100/1296, Loss: 0.1788
Step 150/1296, Loss: 0.1763
Step 200/1296, Loss: 0.1705
Step 250/1296, Loss: 0.1885
Step 300/1296, Loss: 0.1552
Step 350/1296, Loss: 0.1753
Step 400/1296, Loss: 0.1449
Step 450/1296, Loss: 0.1480
Step 500/1296, Loss: 0.1486
Step 550/1296, Loss: 0.1740
Step 600/1296, Loss: 0.1537
Step 650/1296, Loss: 0.1632
Step 700/1296, Loss: 0.1785
Step 750/1296, Loss: 0.1627
Step 800/1296, Loss: 0.1782
Step 850/1296, Loss: 0.1515
Step 900/1296, Loss: 0.1529
Step 950/1296, Loss: 0.1461
Step 1000/1296, Loss: 0.1586
Step 1050/1296, Loss: 0.1287
Step 1100/1296, Loss: 0.1630
Step 1150/1296, Loss: 0.1610
Step 1200/1296, Loss: 0.1669
Step 1250/1296, Loss: 0.1288
Epoch 2/10
Step 0/1296, Loss: 0.2019
Step 50/1296, Loss: 0.1677
Step 100/1296, Loss: 0.1647
Step 150/1296, Loss: 0.1804
Step 200/1296, Loss: 0.1649
Step 250/1296, Loss: 0.1395
Step 300/1296, Loss: 0.1552
Step 350/1296, Loss: 0.1728
Step 400/1296, Loss: 0.130