# Enhanced Fine Tuning

In [2]:
# 1. Data Preparation

import tensorflow as tf
import pandas as pd
import os
import re
import string
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification, create_optimizer
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [3]:
# 1.1 Load and Preprocess Data

def load_and_preprocess_data(filepath):
    df = pd.read_csv(filepath)
    print("Initial Data Sample:")
    print(df.head())

    # Check if 'Content' and 'Label' columns exist
    if 'Content' not in df.columns or 'Label' not in df.columns:
        raise ValueError("Dataset must contain 'Content' and 'Label' columns.")

    # Text preprocessing
    def preprocess_text(text):
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'@\w+|#\w+', '', text)  # Remove mentions and hashtags
        text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
        text = text.lower()  # Lowercase
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
        return text

    df['clean_text'] = df['Content'].apply(preprocess_text)
    return df

df = load_and_preprocess_data("HateSpeechDatasetBalanced_reduced.csv")

Initial Data Sample:
                                             Content  Label  \
0  he don't want my surrender to my new math book...      1   
1  pravi drzavnicki gov or sto god missile oh emu...      1   
2       i fucking hate going to my girlfriends house      0   
3  asked that pretend son to b a time the son wou...      1   
4  well myself am connected to an ethnically jewi...      1   

                                          clean_text  
0  he dont want my surrender to my new math book ...  
1  pravi drzavnicki gov or sto god missile oh emu...  
2       i fucking hate going to my girlfriends house  
3  asked that pretend son to b a time the son wou...  
4  well myself am connected to an ethnically jewi...  


In [4]:
# 1.2 Split the Data

def split_data(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    if train_size + val_size + test_size != 1.0:
        raise ValueError("Train, validation, and test sizes must sum to 1.")

    train_texts, temp_texts, train_labels, temp_labels = train_test_split(
        df['clean_text'].tolist(), df['Label'].tolist(), 
        test_size=(1 - train_size), random_state=random_state, stratify=df['Label']
    )

    val_ratio = val_size / (val_size + test_size)
    val_texts, test_texts, val_labels, test_labels = train_test_split(
        temp_texts, temp_labels, 
        test_size=(1 - val_ratio), random_state=random_state, stratify=temp_labels
    )

    print(f"Training samples: {len(train_texts)}")
    print(f"Validation samples: {len(val_texts)}")
    print(f"Testing samples: {len(test_texts)}")
    
    return train_texts, val_texts, test_texts, train_labels, val_labels, test_labels

train_texts, val_texts, test_texts, train_labels, val_labels, test_labels = split_data(df)

Training samples: 69999
Validation samples: 15000
Testing samples: 15001


In [5]:
# 2. Baseline Model Evaluation

# 2.1 Load the Pretrained Model and Tokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# 2.2 Prepare the Test Dataset

def tokenize_texts(texts, tokenizer, max_length=128):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='tf'
    )

test_encodings = tokenize_texts(test_texts, tokenizer)
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(32)  # Increased batch size for faster evaluation

# 2.3 Evaluate Baseline Model

def compile_and_evaluate(model, dataset):
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.metrics.SparseCategoricalAccuracy('accuracy')
    
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    loss_val, accuracy = model.evaluate(dataset)
    return loss_val, accuracy

baseline_loss, baseline_accuracy = compile_and_evaluate(model, test_dataset)
print(f"Baseline Test Accuracy: {baseline_accuracy * 100:.2f}%")

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Baseline Test Accuracy: 52.26%


In [7]:
# 3. Model Fine-Tuning

# 3.1 Prepare Training and Validation Datasets

train_encodings = tokenize_texts(train_texts, tokenizer)
val_encodings = tokenize_texts(val_texts, tokenizer)

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(10000).batch(32).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
)).batch(32).prefetch(tf.data.AUTOTUNE)

# 3.2 Fine-Tune the Model

# Re-initialize the model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Compile with more advanced optimizer settings
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.metrics.SparseCategoricalAccuracy('accuracy')]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Define callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-7),
    tf.keras.callbacks.ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)
]

# Train the model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5,  # Increased epochs with early stopping
    callbacks=callbacks
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
 384/2188 [====>.........................] - ETA: 2:08:04 - loss: 0.4429 - accuracy: 0.7943

KeyboardInterrupt: 