In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import Adam
from transformers import get_scheduler
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Load the dataset; later cells read its 'text' and 'label' columns.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/twibot-sc-text/Text5_balanced.csv')

In [3]:
# Integer encoding of the class names: "human" -> 0, "bot" -> 1
labels_dict = dict(human=0, bot=1)

In [4]:
# Encode the string labels as integers via labels_dict. Values that are already
# encoded are passed through unchanged, so re-running this cell on the same
# kernel no longer raises KeyError. Unknown labels still raise KeyError.
df['label'] = df['label'].map(
    lambda value: value if value in labels_dict.values() else labels_dict[value]
)

In [5]:
# Model input (raw text) and target (integer-encoded label)
X, y = df['text'], df['label']

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [8]:
def evaluate_classification(y_actual, y_pred, y_score=None):
    """Print binary-classification metrics for a set of predictions.

    Prints accuracy, precision, recall, F1, specificity and ROC AUC, one per
    line. The metric functions used here (``confusion_matrix``,
    ``accuracy_score``, ``precision_score``, ``recall_score``, ``f1_score``,
    ``roc_auc_score``) are looked up as notebook globals, so they must have
    been imported by the time this function is called.

    Args:
        y_actual: Ground-truth labels encoded as 0 (negative) / 1 (positive).
        y_pred: Predicted hard labels, same encoding and length as ``y_actual``.
        y_score: Optional scores or probabilities for the positive class. When
            given they are used for the AUC; when omitted the AUC is computed
            from ``y_pred``, which matches the previous behavior.

    Returns:
        None. The metrics are printed to stdout.
    """
    # Pin the label order so the matrix is always 2x2. Without it, inputs that
    # contain a single class yield a 1x1 matrix and the unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(y_actual, y_pred, labels=[0, 1]).ravel()

    # Metrics
    accuracy = accuracy_score(y_actual, y_pred)
    precision = precision_score(y_actual, y_pred)
    recall = recall_score(y_actual, y_pred)
    f1 = f1_score(y_actual, y_pred)

    # Specificity is undefined when there are no actual negatives
    negatives = tn + fp
    specificity = tn / negatives if negatives else float("nan")

    # AUC is only meaningful with both classes present; roc_auc_score signals
    # that case with ValueError, which is reported instead of raised.
    auc_input = y_pred if y_score is None else y_score
    try:
        auc = roc_auc_score(y_actual, auc_input)
    except ValueError:
        auc = "AUC requires probability scores or appropriate labels"

    # Print results
    print(f"Accuracy    : {accuracy:.4f}")
    print(f"Precision   : {precision:.4f}")
    print(f"Recall      : {recall:.4f}")
    print(f"F1 Score    : {f1:.4f}")
    print(f"Specificity : {specificity:.4f}")
    print(f"AUC         : {auc if isinstance(auc, str) else round(auc, 4)}")

## Bonus Model

In [9]:
# Keep only the first 10,000 training rows and the first 2,000 test rows,
# renumbering each Series from 0
X_train = X_train.head(10000).reset_index(drop=True)
y_train = y_train.head(10000).reset_index(drop=True)
X_test = X_test.head(2000).reset_index(drop=True)
y_test = y_test.head(2000).reset_index(drop=True)

In [10]:
# Display the subsampled training texts (the notebook renders the Series repr)
X_train

0       @DeniseMBrown3 @TaylorCalderone @advocatemomda...
1       The Future Of The Galaxy Is Female\nhttps://t....
2       It made me so happy to rejoin ⁦@SwingsetLife⁩ ...
3       Hey #NCTIES2022 thank you for an amazing time!...
4       This week's Weekend Film is Dear Evan Hansen! ...
                              ...                        
9995    RT @Anchorage: We are pleased to announce that...
9996    Proud to announce that our colleague Talisha, ...
9997    RT @YvetteCooperMP: My God. People are fleeing...
9998    @e_kitoh Take care and get well soon🙏👊 [1] @jo...
9999    RT @DlCountdown: ECML-PKDD 2022 (abstract): 20...
Name: text, Length: 10000, dtype: object

In [11]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, RobertaConfig
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss
import numpy as np

# Define a custom dataset class
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Sequence of texts (list, array or pandas Series). Each item is
            converted with ``str()`` before tokenization.
        labels: Sequence of integer class labels aligned with ``texts``.
        tokenizer: Callable tokenizer that accepts the keyword arguments used
            in ``__getitem__`` and returns a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Passed to the tokenizer as ``max_length`` together with
            ``padding='max_length'`` and ``truncation=True``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialize as lists so __getitem__ indexes by position. Indexing a
        # pandas Series with [] looks up by index label, which raises KeyError
        # or returns the wrong row when the Series has a non-default index.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded example at position ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``label`` tensor of dtype ``torch.long``.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Call the tokenizer directly rather than through the legacy
        # encode_plus method.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [12]:
# Parameters
MAX_LEN = 439
BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 2e-5
SEED = 42

# Seed torch before the model is built so that the randomly initialized
# classification head and the shuffling of training batches start from the
# same RNG state on every run
torch.manual_seed(SEED)

# Initialize tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config)

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Create datasets and dataloaders
train_dataset = TextDataset(X_train, y_train, tokenizer, MAX_LEN)
test_dataset = TextDataset(X_test, y_test, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Optimizer: torch.optim.AdamW is used instead of the deprecated AdamW class
# shipped with transformers. eps and weight_decay are set to that class's
# defaults to stay close to the previous configuration.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)

# Weight each class inversely to its frequency in the training labels
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

loss_fn = CrossEntropyLoss(weight=class_weights)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Fine-tune the model for EPOCHS passes over the training batches
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Move this batch's tensors onto the same device as the model
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'label')
        )

        optimizer.zero_grad()

        # Forward pass; the loss comes from the class-weighted loss_fn
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        total_loss += loss.item()

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

    # Mean loss per batch for this epoch
    avg_train_loss = total_loss / len(train_loader)
    print(
        f'Epoch {epoch + 1}/{EPOCHS}',
        f'Training loss: {avg_train_loss:.4f}',
        sep='\n',
    )

Epoch 1/5
Training loss: 0.6856
Epoch 2/5
Training loss: 0.6650
Epoch 3/5
Training loss: 0.6348
Epoch 4/5
Training loss: 0.5819
Epoch 5/5
Training loss: 0.5003


In [14]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score
)


In [15]:
# Evaluation function
def evaluate(model, dataloader):
    """Run the model over a dataloader, print metrics and return predictions.

    Relies on two notebook-level names: ``device`` (where batches are moved)
    and ``evaluate_classification`` (which prints the metrics).

    Args:
        model: Callable taking ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``logits`` attribute.
        dataloader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.

    Returns:
        list: Predicted class index for every example, in dataloader order.
    """
    model.eval()
    predictions = []
    true_labels = []

    # Inference only: no gradients are needed
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # The predicted class is the index of the largest logit
            logits = outputs.logits
            batch_preds = torch.argmax(logits, dim=1)

            predictions.extend(batch_preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # All metrics are computed and printed by evaluate_classification
    evaluate_classification(true_labels, predictions)
    return predictions

# Score the model on the held-out test batches and keep its predictions
test_predictions = evaluate(model=model, dataloader=test_loader)

Accuracy    : 0.5960
Precision   : 0.6117
Recall      : 0.5562
F1 Score    : 0.5826
Specificity : 0.6369
AUC         : 0.5966


In [16]:
# Per-class precision, recall and F1 for the test predictions
report = classification_report(y_test, test_predictions)
print(report)

              precision    recall  f1-score   support

           0       0.58      0.64      0.61       986
           1       0.61      0.56      0.58      1014

    accuracy                           0.60      2000
   macro avg       0.60      0.60      0.60      2000
weighted avg       0.60      0.60      0.60      2000

