# Load Datasets

In [1]:
import pandas as pd

# Location of the training CSV. The two commented-out lines are an earlier
# configuration kept for reference; presumably they selected a merged
# dataset — TODO confirm, then delete them.
# DATASET_PATH = "./OUTPUTS_data/ALL_merged.csv"
DATASET_PATH = "./Davidson-Original/train_ORIGINAL.csv"
# LANGUAGE = "ALL"
# Location of the test CSV, which is read in a later cell.
TEST_DATASET_PATH = "./Davidson-Original/test.csv"

# Read the training split into a DataFrame.
train_dataset = pd.read_csv(DATASET_PATH)

In [2]:
# Give the two columns the names the rest of the notebook looks up
# ('tweet' for the text, 'class' for the label), then show the result.
train_dataset = train_dataset.set_axis(['tweet', 'class'], axis=1)
train_dataset.columns

Index(['tweet', 'class'], dtype='object')

In [3]:
# Count how many training rows carry each value of the 'class' column.
train_dataset["class"].value_counts()

class
1    17285
2     3753
0     1266
Name: count, dtype: int64

In [4]:
# Read the test split and apply the same two column names used for training.
test_dataset = pd.read_csv(TEST_DATASET_PATH).set_axis(['tweet', 'class'], axis=1)

# Print the label distribution, then preview the first rows as the cell output.
print(test_dataset['class'].value_counts())
test_dataset.head()

class
1    1905
2     410
0     164
Name: count, dtype: int64


Unnamed: 0,tweet,class
0,934 8616\ni got a missed call from yo bitch,1
1,RT @KINGTUNCHI_: Fucking with a bad bitch you ...,1
2,RT @eanahS__: @1inkkofrosess lol my credit ain...,2
3,RT @Maxin_Betha Wipe the cum out of them faggo...,1
4,Niggas cheat on they bitch and don't expect no...,1


# Defining the custom dataset class

In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report

# Define a custom dataset class for loading tweet data
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per requested item.

    Args:
        tweets: Iterable of tweet texts (list, array or pandas Series).
        labels: Iterable of integer class ids, in the same order as ``tweets``.
        tokenizer: Callable tokenizer that accepts the keyword arguments used
            in ``__getitem__`` and returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Token length every tweet is padded or truncated to.

    Raises:
        ValueError: If ``tweets`` and ``labels`` differ in length.
    """

    def __init__(self, tweets, labels, tokenizer, max_length):
        # Store plain lists so that __getitem__ always indexes by position.
        # A pandas Series indexed with an integer looks the value up by label,
        # which fails or returns the wrong row once the index is not 0..n-1
        # (for example after filtering or shuffling the DataFrame).
        self.tweets = list(tweets)
        self.labels = list(labels)
        if len(self.tweets) != len(self.labels):
            raise ValueError(
                f'tweets and labels must have the same length, '
                f'got {len(self.tweets)} and {len(self.labels)}'
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Return the tokenized tweet at position ``idx``.

        Returns:
            dict with keys ``'tweet'`` (the text as ``str``), ``'input_ids'``
            and ``'attention_mask'`` (flattened tensors produced by the
            tokenizer) and ``'label'`` (scalar ``torch.long`` tensor).
        """
        tweet = str(self.tweets[idx])
        label = self.labels[idx]

        # Calling the tokenizer directly replaces the deprecated encode_plus;
        # the keyword arguments are unchanged.
        encoding = self.tokenizer(
            tweet,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'tweet': tweet,
            # flatten() drops the leading dimension added by return_tensors='pt'
            # so the DataLoader can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Initialize BERT tokenizer and model from the same pretrained checkpoint
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=3,  # one output per label value found in the 'class' column (0, 1, 2)
)

tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 13.2kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 2.33MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 672kB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 41.3kB/s]
model.safetensors: 100%|██████████| 440M/440M [00:15<00:00, 27.6MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Define training parameters
batch_size = 8  # examples per batch, used by both the train and the test DataLoader
max_length = 128  # every tweet is padded or truncated to this many tokens
num_epochs = 3  # number of full passes over train_loader in the fine-tuning loop
learning_rate = 2e-5  # learning rate handed to the AdamW optimizer


In [8]:
# Create DataLoader for train and test datasets.
# The TweetDataset wrappers get their own names instead of rebinding
# `train_dataset` / `test_dataset`: those names hold the DataFrames loaded
# earlier, and overwriting them made this cell fail when run a second time
# (the wrapper cannot be indexed with a column name).
# The columns are passed as plain lists so item lookup is positional and does
# not depend on the DataFrame index.
train_tweet_dataset = TweetDataset(
    train_dataset['tweet'].tolist(),
    train_dataset['class'].tolist(),
    tokenizer,
    max_length,
)
# Shuffle training batches each epoch; keep the test order fixed.
train_loader = DataLoader(train_tweet_dataset, batch_size=batch_size, shuffle=True)

test_tweet_dataset = TweetDataset(
    test_dataset['tweet'].tolist(),
    test_dataset['class'].tolist(),
    tokenizer,
    max_length,
)
test_loader = DataLoader(test_tweet_dataset, batch_size=batch_size, shuffle=False)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# NOTE(review): the fine-tuning loop in this notebook reads the loss from the
# model output (labels are passed to the model), so `criterion` is not used
# there; it is kept in case other code relies on the name.
criterion = torch.nn.CrossEntropyLoss()

In [9]:
# Fine-tune BERT model
# Train on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

for epoch in range(num_epochs):
    # Put the model in training mode and restart the loss accumulator.
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Move the three tensors the model consumes onto the target device.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'label')
        )

        optimizer.zero_grad()

        # Passing the labels makes the loss available on the model output.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses seen during this epoch.
    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}')

KeyboardInterrupt: 

In [None]:
# Evaluate the fine-tuned model on the test dataset
model.eval()
predictions = []
true_labels = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only collected for the report, so they stay on the host
        # instead of being copied to the device and back.
        labels = batch['label']

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # argmax gives the predicted class id per row directly and avoids
        # assigning to `_`, which IPython uses for the previous cell output.
        preds = logits.argmax(dim=1)

        predictions.extend(preds.cpu().tolist())
        true_labels.extend(labels.tolist())

# Print classification report
target_names = ['Hate', 'Offensive', 'Neither']
# Pin the label ids explicitly: without `labels`, the report raises when a
# class is missing from both the true and the predicted labels, because the
# number of names no longer matches the number of classes found.
print(classification_report(
    true_labels,
    predictions,
    labels=list(range(len(target_names))),
    target_names=target_names,
))