In [1]:
import pandas as pd

# Read the CSV file into a DataFrame
column_names = ["text", "is_hate"]

df = pd.read_csv('1.preprocessed_data.csv', on_bad_lines='skip', sep=",", encoding='iso-8859-1', header=0, names=column_names)
df['is_hate'] = df['is_hate'].astype(bool)
df['text'] = df['text'].astype('str')
df.head()

Unnamed: 0,text,is_hate
0,ponnayo danne kellek aduwa gaman laga inna kol...,True
1,ape harak samjeta eka honda adrshyak,False
2,tpita pisuda yako man htuwe atta kiyala aiyo,False
3,kimbak eduwoth ape untath amma thaththawath pe...,True
4,lisan nathawa yanna puluwan yako api dannawa o...,False


### Preprocess the Data:
Tokenize the text data to feed into DistilBert:

### Fine-Tuning DistilBert:
Train the DistilBert model:

In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from scipy.special import softmax
import numpy as np

# 1. Data Preparation
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def encode_text(df, tokenizer, max_length=256):
    return tokenizer(df["text"].tolist(), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

encoded_data = encode_text(df, tokenizer)
inputs, attention_masks, labels = encoded_data["input_ids"], encoded_data["attention_mask"], torch.tensor(df["is_hate"].tolist()).long()

train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    inputs, attention_masks, labels, test_size=0.2
)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

train_loader = DataLoader(train_data, shuffle=True, batch_size=8)
val_loader = DataLoader(val_data, batch_size=8)

# 2. Model, Loss, and Optimizer
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
loss_function = torch.nn.CrossEntropyLoss()

fpr = None
tpr = None
roc_auc = None
cm_percentage = None


# 3. Training Loop
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_loader):
        batch_input_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)[0]
        loss = loss_function(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch: {epoch + 1}, Training loss: {total_loss / len(train_loader)}")

    # Validation at the end of epoch
    model.eval()
    total_val_loss = 0
    correct_predictions = 0
    true_labels = []
    predicted_labels = []

    model_outputs = []
    with torch.no_grad():
        for batch in val_loader:
            batch_input_ids = batch[0].to(device)
            batch_attention_mask = batch[1].to(device)
            batch_labels = batch[2].to(device)

            outputs = model(batch_input_ids, attention_mask=batch_attention_mask)[0]
            loss = loss_function(outputs, batch_labels)
            total_val_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            correct_predictions += (preds == batch_labels).sum().item()

            true_labels.extend(batch_labels.cpu().numpy())
            predicted_labels.extend(preds.cpu().numpy())

            outputs_np = outputs.cpu().numpy()
            model_outputs.extend(outputs_np)


    val_accuracy = correct_predictions / len(val_labels)
    print(f"Epoch: {epoch + 1}, Validation loss: {total_val_loss / len(val_loader)}, Validation accuracy: {val_accuracy}")

    print("\nClassification Report:")
    report = classification_report(true_labels, predicted_labels)
    print(report)

    # values for confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels)
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # normalize the confusion matrix

    # values for ROC curve
    # Convert model output to probabilities and plot ROC curve
    model_outputs = np.array(model_outputs)
    probs = softmax(model_outputs, axis=1)[:, 1]  # Assume the second column is the probability for class "1"
    fpr, tpr, thresholds = roc_curve(true_labels, probs)
    roc_auc = auc(fpr, tpr)
    
    


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

Epoch: 1, Training loss: 0.6511623232343078
Epoch: 1, Validation loss: 0.6212485879659653, Validation accuracy: 0.655328798185941

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.81      0.72       248
           1       0.65      0.46      0.54       193

    accuracy                           0.66       441
   macro avg       0.65      0.63      0.63       441
weighted avg       0.65      0.66      0.64       441

Epoch: 2, Training loss: 0.5358924518478402
Epoch: 2, Validation loss: 0.5302783149693694, Validation accuracy: 0.7528344671201814

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.69      0.76       248
           1       0.68      0.83      0.75       193

    accuracy                           0.75       441
   macro avg       0.76      0.76      0.75       441
weighted avg       0.77      0.75      0.75       441

Epoch: 3, Training loss: 0.3749487867

### took 13mins

In [3]:
def predict(model, tokenizer, device, text):
    # Tokenize input string
    inputs = tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors="pt").to(device)
    
    # Make prediction
    with torch.no_grad():
        logits = model(**inputs)[0]
        predicted_class = torch.argmax(logits, dim=1).item()
    
    return predicted_class

# Example usage
input_string = "kalakanni deshapaluwo"
model.eval()  # Ensure model is in evaluation mode
prediction = predict(model, tokenizer, device, input_string)
print(f"Prediction for '{input_string}': {'Hate' if prediction == 1 else 'Not Hate'}")


Prediction for 'kalakanni deshapaluwo': Hate


In [7]:
import pickle

report =  classification_report(true_labels, predicted_labels, target_names=['True', 'False'])
# save the values to a file
with open('2.12 DistilBert.pkl', 'wb') as f:
    pickle.dump({
        'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc, 'cm_percentage': cm_percentage,'report': report
    }, f)
