In [1]:
import pandas as pd

# Load the preprocessed dataset into a DataFrame.
# header=0 marks the file's first row as a header so that `names` replaces
# whatever column labels the file carries.
column_names = ["text", "is_hate"]

df = pd.read_csv(
    '1.preprocessed_data.csv',
    on_bad_lines='skip',
    sep=",",
    encoding='iso-8859-1',
    header=0,
    names=column_names,
)

# Drop incomplete rows BEFORE casting: astype(bool) would silently turn a
# missing label (NaN) into True, and astype('str') would turn a missing text
# into the literal string "nan" -- both would end up as bogus training rows.
df = df.dropna(subset=column_names)

df['is_hate'] = df['is_hate'].astype(bool)
df['text'] = df['text'].astype('str')
df.head()

Unnamed: 0,text,is_hate
0,ponnayo danne kellek aduwa gaman laga inna kol...,True
1,ape harak samjeta eka honda adrshyak,False
2,tpita pisuda yako man htuwe atta kiyala aiyo,False
3,kimbak eduwoth ape untath amma thaththawath pe...,True
4,lisan nathawa yanna puluwan yako api dannawa o...,False


In [4]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from torch.nn.functional import softmax

# `df` is expected to be in the kernel already (built by the loading cell).
# To start from a file instead:
# df = pd.read_csv('your_file.csv')

# Tokenizer configuration
# Every text is padded/truncated to this many tokens.
# NOTE(review): 174 is presumably derived from the longest text in the
# dataset -- confirm and document where the number comes from.
max_length = 174
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(texts, tokenizer, max_length):
    """Encode `texts` with `tokenizer` into fixed-length PyTorch tensors.

    Args:
        texts: The text(s) to encode; passed to `tokenizer` unchanged.
        tokenizer: Callable tokenizer, invoked as `tokenizer(texts, **options)`.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever `tokenizer` returns for the given options.
    """
    encoding_options = {
        'padding': 'max_length',   # always pad up to max_length
        'truncation': True,        # cut sequences longer than max_length
        'max_length': max_length,
        'return_tensors': 'pt',    # request PyTorch tensors
    }
    return tokenizer(texts, **encoding_options)

# Seed PyTorch's global RNG so the freshly initialised classifier head and the
# shuffling order of the training DataLoader are repeatable across runs.
SEED = 42
torch.manual_seed(SEED)

# Tokenize every text up front
tokens = tokenize_data(df['text'].tolist(), tokenizer, max_length)

# Create a DataLoader
# Each dataset item is (input_ids, attention_mask, label); the boolean
# is_hate column becomes int64 class ids (False -> 0, True -> 1).
dataset = TensorDataset(tokens.input_ids, tokens.attention_mask, torch.tensor(df['is_hate'].values).long())
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# A dedicated seeded generator keeps the 90/10 split identical on every run,
# so validation metrics from different runs are comparable.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)

# Initialize the model
# num_labels=2 spells out the binary (not hateful / hateful) head explicitly.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training setup
# NOTE(review): transformers' own AdamW is deprecated in favour of
# torch.optim.AdamW -- consider switching when the imports are next touched.
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 4
# Linear decay with no warm-up; the scheduler is stepped once per batch,
# hence the total step count is batches-per-epoch * epochs.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)

# Fine-tuning: one full pass over train_loader per epoch
for epoch in range(epochs):
    model.train()  # switch the model to training mode
    batch_losses = []
    progress = tqdm(train_loader, desc=f"Epoch {epoch + 1}")

    for batch in progress:
        input_ids, attention_mask, labels = batch

        # Clear gradients accumulated by the previous step
        optimizer.zero_grad()

        # Supplying labels makes the model output carry the loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Apply the update, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

        batch_losses.append(loss.item())

    # Report the mean per-batch loss of this epoch
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss/len(train_loader)}")

# Evaluate on the held-out split
model.eval()

# Accumulators filled batch by batch:
#   all_preds  - predicted class id per sample
#   all_labels - ground-truth class id per sample
#   all_probs  - softmax probability of class 1 per sample (used for ROC later)
all_preds, all_labels, all_probs = [], [], []

for batch in tqdm(val_loader, desc="Validation"):
    input_ids, attention_mask, labels = batch

    # No gradients are needed for evaluation
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        probabilities = softmax(logits, dim=1)

    preds = logits.argmax(dim=1).tolist()

    all_probs.extend(probabilities[:, 1].numpy())
    all_preds.extend(preds)
    all_labels.extend(labels.tolist())

print("Validation Accuracy:", accuracy_score(all_labels, all_preds))
report = classification_report(all_labels, all_preds)
print(report)

# Single-text inference helper
def predict(text, model, tokenizer):
    """Classify one text and return the predicted class id.

    Args:
        text: The raw text to classify.
        model: Model called with `input_ids` and `attention_mask`; its output
            must expose `.logits`.
        tokenizer: Tokenizer forwarded to `tokenize_data`.

    Returns:
        The index of the highest logit as a Python int (the notebook treats
        1 as "Hateful").
    """
    model.eval()

    # Encode as a batch of one, using the notebook-level max_length
    encoded = tokenize_data([text], tokenizer, max_length)

    with torch.no_grad():
        outputs = model(encoded['input_ids'], attention_mask=encoded['attention_mask'])

    predicted_class = torch.argmax(outputs.logits, dim=1)
    return predicted_class.item()

# Smoke-test the classifier on a placeholder sentence
text = "Your sample text here"
result = predict(text, model, tokenizer)

# Class id 1 is the hateful class
if result == 1:
    print("Hateful")
else:
    print("Not hateful")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:  61%|██████▏   | 76/124 [02:53<01:48,  2.26s/it]

Running the cell above took 43 minutes.

In [4]:
# Classify a real example with the fine-tuned model
text = "kalakanni deshapaluwo"
result = predict(text, model, tokenizer)
print("Not hateful" if result != 1 else "Hateful")

Hateful


In [7]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import pickle
import numpy as np

# Class ids come from the boolean is_hate column: 0 == False, 1 == True.
# classification_report pairs target_names with the labels in the order given,
# so the names must be listed as ['False', 'True']; the previous
# ['True', 'False'] swapped the per-class rows of the report.
# Pinning `labels` keeps the report rows and the confusion matrix aligned.
class_ids = [0, 1]
report = classification_report(all_labels, all_preds, labels=class_ids, target_names=['False', 'True'])

# values for confusion matrix
cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
# Row-normalise: each row (true class) sums to 1, i.e. the entries are
# fractions in [0, 1] rather than percentages, despite the variable name.
cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # normalize the confusion matrix

# values for ROC curve
# all_probs holds the predicted probability of class 1 for each validation sample
fpr, tpr, thresholds = roc_curve(all_labels, all_probs)
roc_auc = auc(fpr, tpr)

# save the values to a file
with open('2.11 Bert.pkl', 'wb') as f:
    pickle.dump({
        'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc, 'cm_percentage': cm_percentage, 'report': report
    }, f)