In [1]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='tensorflow_io')
import logging
import time
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, TensorDataset
import spacy
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split # Removed ParameterGrid for grid search
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from torch.optim import AdamW, RAdam
from torch.optim.lr_scheduler import OneCycleLR, ReduceLROnPlateau
from torch.cuda.amp import GradScaler, autocast
from transformers import BartTokenizer, BartConfig, BartForSequenceClassification, logging as hf_logging

from datasets import load_dataset
from sklearn.utils.class_weight import compute_class_weight
import nltk
from nltk.corpus import stopwords
from collections import Counter



In [2]:
# Configure logging level to suppress warnings from transformers library
logging.basicConfig(level=logging.ERROR)
hf_logging.set_verbosity_error()

# Load the small BERT tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
max_length = 512

# Load the Davidson and ethos dataset
df_davidson = pd.read_csv('https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv')
ethos_data = load_dataset('ethos', 'binary')
ethos_df = ethos_data['train'].to_pandas()

# Load the HateXplain dataset from Hugging Face
hatexplain_data = load_dataset('hatexplain')
hatexplain_df = hatexplain_data['train'].to_pandas()
print("Davidson dataset:")
print(df_davidson.head())
print("Ethos dataset:")
print(ethos_df.head())
print("Hatexplain dataset:")
print(hatexplain_df.head())
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

# Load the OLID dataset
olid_train_df = pd.read_csv('https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/offensive/train_text.txt', sep='\t', header=None, names=['tweet'])
olid_train_labels = pd.read_csv('https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/offensive/train_labels.txt', sep='\t', header=None, names=['class'])
olid_train_df = pd.concat([olid_train_df, olid_train_labels], axis=1)

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """
    Preprocess text data by removing URLs, mentions, hashtags,
    and punctuation, and applying lemmatization.
    """
    # Lowercase the text
    text = text.lower()

    # Remove URLs, mentions, and hashtags
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)

    # Remove punctuation
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)

    # Apply lemmatization
    doc = nlp(text)
    words = [token.lemma_ for token in doc]

    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    # Join the words back into a string
    text = ' '.join(words)

    return text

# Combine the 'Hate Speech' and 'Offensive Language' classes in Davidson dataset by taking majority vote among annotators for each post.
label_map = {0: 0, 1: 0, 2: 1}
df_davidson['class'] = df_davidson['class'].map(label_map)

# Preprocess the text data using the preprocess_text function in Davidson dataset.
df_davidson['tweet'] = df_davidson['tweet'].apply(preprocess_text)

# Preprocess the text data using the preprocess_text function in Ethos dataset.
ethos_df['text'] = ethos_df['text'].apply(preprocess_text)

# Preprocess the HateXplain dataset using the preprocess_text function in HateXplain dataset.
hatexplain_df['post_tokens'] = hatexplain_df['post_tokens'].apply(lambda x: ' '.join(x)).apply(preprocess_text)

# Preprocess the text data using the preprocess_text function in OLID dataset.
olid_train_df['tweet'] = olid_train_df['tweet'].apply(preprocess_text)

# Combine hate speech and offensive speech labels in HateXplain dataset by taking majority vote among annotators for each post.
hatexplain_df['label'] = hatexplain_df['annotators'].apply(lambda x: int(sum(x['label']) / len(x['label']) >= 1))

# Rename columns in Ethos and HateXplain datasets
ethos_df.rename(columns={'text': 'tweet', 'label': 'class'}, inplace=True)
hatexplain_df.rename(columns={'post_tokens': 'tweet', 'label': 'class'}, inplace=True)

# Combine the Davidson, Ethos and HateXplain datasets.
df_combined = pd.concat([
    df_davidson[['tweet', 'class']],
    ethos_df[['tweet', 'class']],
    hatexplain_df[['tweet', 'class']]
], axis=0)

# Combine the OLID dataset with the existing training data
df_combined = pd.concat([df_combined, olid_train_df], axis=0)

# Drop rows with missing values.
df_combined.dropna(subset=['class'], inplace=True)

# Print the shape of the combined dataset
print(f'Combined dataset shape: {df_combined.shape}')

# Print the number of samples in each class
print('Number of samples in each class:')
print(df_combined['class'].value_counts())

# Print the first few rows of the combined dataset
print('First few rows of the combined dataset:')
print(df_combined.head())

# Print the number of missing values in each column
print('Number of missing values in each column:')
print(df_combined.isnull().sum())

# Print the length of the longest text in the combined dataset
print('Length of the longest text:')
print(df_combined['tweet'].str.len().max())

# Print the average length of the text in the combined dataset
print('Average length of the text:')
print(df_combined['tweet'].str.len().mean())

# Split the combined dataset into train and validation sets.
train_df, val_df = train_test_split(df_combined, test_size=0.2, random_state=42, stratify=df_combined['class'])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class_weights = compute_class_weight('balanced', classes=np.unique(train_df['class']), y=train_df['class'])
class_weights = torch.tensor(class_weights).to(device)

# Tokenize and pad the text data using the BERT tokenizer.
train_inputs = tokenizer.batch_encode_plus(
    train_df['tweet'].tolist(),
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
    batch_size=32
)
train_labels = pd.get_dummies(train_df['class']).values

# Apply SMOTE to balance the class distribution in the training data
smote = SMOTE(random_state=42)
train_inputs_smote, train_labels_smote = smote.fit_resample(train_inputs['input_ids'], train_labels)

# Convert the training data to tensors
train_inputs_smote = torch.tensor(train_inputs_smote)
train_labels_smote = torch.tensor(train_labels_smote)

val_inputs = tokenizer.batch_encode_plus(
    val_df['tweet'].tolist(),
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
    batch_size=32
)
val_labels = torch.tensor(pd.get_dummies(val_df['class']).values)
# Convert train_labels and val_labels from numpy arrays to PyTorch tensors
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Define the BERT configuration with dropout
config = BartConfig.from_pretrained('facebook/bart-base', num_labels=2, hidden_dropout=0.5, attention_dropout=0.5)

# Define the BERT model for sequence classification with dropout
model = BartForSequenceClassification.from_pretrained('facebook/bart-base', config=config)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/940 [00:00<?, ?B/s]

Downloading and preparing dataset ethos/binary (download: 121.01 KiB, generated: 121.90 KiB, post-processed: Unknown size, total: 242.91 KiB) to /root/.cache/huggingface/datasets/ethos/binary/1.0.0/898d3d005459ee3ff80dbeec2f169c6b7ea13de31a08458193e27dec3dd9ae38...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/50.3k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/998 [00:00<?, ? examples/s]

Dataset ethos downloaded and prepared to /root/.cache/huggingface/datasets/ethos/binary/1.0.0/898d3d005459ee3ff80dbeec2f169c6b7ea13de31a08458193e27dec3dd9ae38. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading and preparing dataset hatexplain/plain_text (download: 12.25 MiB, generated: 8.47 MiB, post-processed: Unknown size, total: 20.73 MiB) to /root/.cache/huggingface/datasets/hatexplain/plain_text/1.0.0/df474d8d8667d89ef30649bf66e9c856ad8305bef4bc147e8e31cbdf1b8e0249...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/145k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/15383 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1922 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1924 [00:00<?, ? examples/s]

Dataset hatexplain downloaded and prepared to /root/.cache/huggingface/datasets/hatexplain/plain_text/1.0.0/df474d8d8667d89ef30649bf66e9c856ad8305bef4bc147e8e31cbdf1b8e0249. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Davidson dataset:
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  
Ethos dataset:
                                                text  label
0          You should know women's sports are a joke      1
1    You look like Sloth with deeper Down’s syndrome      1
2  You look like R

  val_labels = torch.tensor(val_labels)


Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [3]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [4]:
# Check if GPU is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
train_inputs.to(device)
train_labels.to(device)
val_inputs.to(device)
val_labels.to(device)

num_epochs = 5 # Added number of epochs hyperparameter

import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import FixedSampler

batch_size = 16

def objective(trial):
    # Define the hyperparameters to tune
    fixed_params = {'learning_rate': [1e-5, 1e-3]}
       

    # Define the optimizer with learning rate # Modified for grid search
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5, verbose=True)
    scaler = GradScaler()

    # Define the training hyperparameters
    accumulation_steps = 4
    max_grad_norm = 1.0
    val_loss = torch.tensor(0.0)

    # Define the early stopping hyperparameters
    patience = 2
    best_val_loss = float('inf')
    counter = 0

    # Compute class weights to use with weighted cross-entropy loss
    class_weights = compute_class_weight('balanced', classes=np.unique(train_df['class']), y=train_df['class'])
    class_weights = torch.tensor(class_weights).to(device)

    # Create an empty list to accumulate gradients
    grad_accumulator = []

    # Create a DataLoader for the training data with batch size # Modified for grid search
    train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        # Create a progress bar for the current epoch
        progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}')

        for step, batch in enumerate(progress_bar):
            # Forward pass
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device).float()

            with autocast():
                criterion = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights[1])
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)

            # Backward pass
            scaler.scale(loss).backward()

            # Accumulate gradients
            if (step + 1) % accumulation_steps == 0 or (step + 1) == len(train_dataloader):
                # Unscale the gradients
                scaler.unscale_(optimizer)

                # Clip the gradients to prevent exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # Accumulate the unscaled gradients
                grad_accumulator.append(loss.item())

                # Update the model parameters
                scaler.step(optimizer)

                # Clear the gradients
                scaler.update()
                model.zero_grad()

                # Print the average loss over accumulation steps
                avg_loss = sum(grad_accumulator) / len(grad_accumulator)
                progress_bar.set_postfix({'Loss': avg_loss})

                # Clear the gradient accumulator
                grad_accumulator = []
            else:
                # Accumulate the unscaled gradients
                grad_accumulator.append(loss.item())

            total_loss += loss.item()

        average_loss = total_loss / len(train_dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}")

        # Evaluate the model on the validation set
        model.eval()

        # Create a DataLoader for the validation data with batch size 
        val_dataset = TensorDataset(val_inputs['input_ids'], val_inputs['attention_mask'], val_labels)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

        val_predictions = []
        val_total_loss = 0.0

        for batch in val_dataloader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device).float()

            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                predictions = outputs.logits.argmax(dim=1).cpu().numpy()
                val_predictions.extend(predictions)
                val_total_loss += outputs.loss.item()

        val_predictions = np.array(val_predictions)
        val_accuracy = (val_predictions == val_df['class'].values).mean()
        val_classification_report = classification_report(val_df['class'].values, val_predictions)

        print(f'Validation Accuracy: {val_accuracy:.3f}')
        print('Classification Report:')
        print(val_classification_report)

    return val_total_loss / len(val_dataloader)

# Create an Optuna study with the TPESampler and MedianPruner
study = optuna.create_study(
    direction='minimize',
    sampler=FixedSampler(fixed_params),
    pruner=MedianPruner()
)

gc.collect()
torch.cuda.empty_cache()

# Optimize the objective function
study.optimize(objective, n_trials=len(fixed_params['learning_rate']))

# Print the best hyperparameters
print(study.best_params)