In [1]:
import re
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from torch.utils.data import TensorDataset, DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [2]:
dataset = load_dataset("mahmed31/revised_Toraman22_hate_speech_v2")
df = pd.DataFrame(dataset['train'])

Downloading readme:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [2]:
# Selecting 5 samples for each class
class_0_indices = df[df['label'] == 0].index[:1615]
class_1_indices = df[df['label'] == 1].index[:1615]
class_2_indices = df[df['label'] == 2].index[:1615]

# Combining indices of all classes
selected_indices = np.concatenate([class_0_indices, class_1_indices, class_2_indices])

X = df.iloc[selected_indices]['text'].values
y = df.iloc[selected_indices]['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

X_train = X_train.tolist()
y_train = y_train.tolist()

  df = pd.read_csv('datasets/Toraman22_hate_speech_v2.tsv', delimiter='\t', encoding='utf-8').dropna()


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

inputs = tokenizer(X_train, padding=True, truncation=True, return_tensors="pt")
labels = torch.tensor(y_train).long()

# Assuming CUDA is available, use GPU; otherwise, fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Custom dataset
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Stratified K-Fold Cross-Validation
n_splits = 4
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

total_accuracy = []

model_save_path = 'saved_models'  # Define the directory to save your models
os.makedirs(model_save_path, exist_ok=True)  # Create the directory if it doesn't exist

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {fold+1}/{n_splits}")

    # Split the data
    X_train_fold, X_val_fold = X[train_idx], X[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]

    # Tokenize
    train_encodings = tokenizer(X_train_fold.tolist(), padding=True, truncation=True, return_tensors="pt")
    val_encodings = tokenizer(X_val_fold.tolist(), padding=True, truncation=True, return_tensors="pt")

    train_labels = torch.tensor(y_train_fold).long()
    val_labels = torch.tensor(y_val_fold).long()

    # Prepare datasets and dataloaders
    train_dataset = TextDataset(train_encodings, train_labels)
    val_dataset = TextDataset(val_encodings, val_labels)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Model, optimizer, and scheduler
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3).to(device)
    optimizer = AdamW(model.parameters(), lr=1e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 3)

    # Training loop
    model.train()
    for epoch in range(3):  # Number of epochs
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

    # Validation
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=-1).tolist())
            true_labels.extend(labels.tolist())

    accuracy = accuracy_score(true_labels, predictions)
    total_accuracy.append(accuracy)
    print(f'Fold {fold+1} Accuracy: {accuracy}')

    # After training and validation, save the model for this fold
    model_save_filename = f'model_fold_{fold+1}.pt'  # Naming the model file
    model_save_full_path = os.path.join(model_save_path, model_save_filename)  # Full path

    # Save the model's state_dict
    torch.save(model.state_dict(), model_save_full_path)
    print(f'Model saved to {model_save_full_path}')

    
# Display overall performance
print(f'Mean Accuracy: {np.mean(total_accuracy)}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1/4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 1, Loss: 0.9045141182103
Epoch 2, Loss: 0.5322670045789781
Epoch 3, Loss: 0.378957388820229
Fold 1 Accuracy: 0.847265221878225
Model saved to saved_models\model_fold_1.pt
Fold 2/4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 1, Loss: 0.8777584223956852
Epoch 2, Loss: 0.48622975588499845
Epoch 3, Loss: 0.3534513322206644
Fold 2 Accuracy: 0.8503611971104231
Model saved to saved_models\model_fold_2.pt
Fold 3/4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 1, Loss: 0.8952074568350237
Epoch 2, Loss: 0.5628359429128877
Epoch 3, Loss: 0.41947576305368445
Fold 3 Accuracy: 0.846233230134159
Model saved to saved_models\model_fold_3.pt
Fold 4/4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 1, Loss: 0.9337272133146014
Epoch 2, Loss: 0.6518117195957309
Epoch 3, Loss: 0.49578255578711794
Fold 4 Accuracy: 0.8173374613003096
Model saved to saved_models\model_fold_4.pt
Mean Accuracy: 0.8402992776057792
