In [None]:
import numpy as np
import pandas as pd
import torch
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import BertForSequenceClassification, BertTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load Data
class TextDataset(Dataset):
    """Map-style dataset that reads a CSV and tokenizes one review per access.

    The CSV must provide a ``review`` column (the text) and a ``label`` column.
    A label equal to the string ``'human'`` is encoded as 0; every other value
    is encoded as 1.
    """

    def __init__(self, tokenizer, filepath):
        """Read ``filepath`` and precompute the integer label tensor.

        Args:
            tokenizer: Callable applied to a single text in ``__getitem__``;
                its result must support ``['input_ids']``.
            filepath: Location of the CSV file, handed to ``pd.read_csv``.
        """
        frame = pd.read_csv(filepath)
        self.data = frame
        self.tokenizer = tokenizer
        self.texts = frame['review'].to_list()
        # Encode labels as integers: 'human' -> 0, anything else -> 1
        encoded_labels = [0 if value == 'human' else 1 for value in frame['label']]
        self.labels = torch.tensor(encoded_labels)

    def __len__(self):
        """Number of reviews loaded from the CSV."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the review at ``idx`` and return ``(input_ids, label)``.

        The text is truncated / padded to 512 tokens by the tokenizer call.
        Only ``input_ids`` is kept from the tokenizer output.
        """
        # NOTE(review): any attention mask produced by the tokenizer is dropped
        # here, so consumers cannot tell padding from real tokens -- confirm
        # this is intended.
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding='max_length',
        )
        # The tokenizer returns a leading batch dimension of size 1; strip it.
        token_ids = encoded['input_ids'].squeeze(0)
        return token_ids, self.labels[idx]

    
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load data
dataset = TextDataset(tokenizer, 'reviews.csv')

# Split the dataset.
# Only the row indices are split: passing the Dataset itself to train_test_split
# makes scikit-learn index it element by element, which tokenizes and pads every
# review up front and keeps all the resulting tensors in memory. Splitting the
# indices with the same test_size / random_state selects the same rows, and
# Subset keeps tokenization lazy (one item at a time inside the DataLoader).
train_indices, test_indices = train_test_split(
    list(range(len(dataset))), test_size=0.2, random_state=42
)
train = torch.utils.data.Subset(dataset, train_indices)
test = torch.utils.data.Subset(dataset, test_indices)

# Create data loaders (only the training data is shuffled)
train_loader = DataLoader(train, batch_size=16, shuffle=True)
test_loader = DataLoader(test, batch_size=16, shuffle=False)

## Train Adversarial Model

In [6]:
def adversarial_training(model, inputs, labels, epsilon=0.01):
    """Run one FGSM-style adversarial pass and leave the gradients on ``model``.

    Token ids are integers and cannot carry gradients, so the adversarial noise
    is added to the input embeddings instead of to ``inputs`` directly:

    1. Forward/backward on the clean embeddings to get the clean loss and the
       gradient of that loss with respect to the embeddings.
    2. Forward/backward on ``embeddings + epsilon * sign(gradient)``.

    Parameter gradients from both backward passes are accumulated, so a caller
    that invokes ``optimizer.step()`` afterwards trains on the sum of the clean
    and the adversarial loss. This function never steps an optimizer itself.

    Args:
        model: Module exposing ``get_input_embeddings()`` and accepting an
            ``inputs_embeds`` keyword; its output must have a ``.logits``
            attribute.
        inputs: Dict of forward keyword arguments. ``'input_ids'`` is required.
            A ``'labels'`` entry is ignored because the loss is computed here.
            Any other entry is forwarded unchanged to both passes. The dict is
            not modified.
        labels: Tensor of target class indices for ``cross_entropy``.
        epsilon: Step size of the sign-gradient perturbation.

    Returns:
        Tuple ``(clean_loss, adversarial_loss)`` of Python floats.
    """
    input_ids = inputs['input_ids']
    # Keyword arguments passed through untouched to both forward passes.
    passthrough = {
        key: value for key, value in inputs.items()
        if key not in ('input_ids', 'labels')
    }
    embedding_layer = model.get_input_embeddings()

    # Start from clean gradients; both backward passes below accumulate into them.
    model.zero_grad()

    # --- Clean pass -------------------------------------------------------
    clean_embeds = embedding_layer(input_ids)
    if clean_embeds.requires_grad:
        # Non-leaf tensor: ask autograd to keep its gradient after backward().
        clean_embeds.retain_grad()
    else:
        # Embedding weights are frozen; track the activations as a leaf instead.
        clean_embeds.requires_grad_(True)
    outputs = model(inputs_embeds=clean_embeds, **passthrough)
    loss = torch.nn.functional.cross_entropy(outputs.logits, labels)
    loss.backward()

    # Direction in embedding space that increases the loss fastest (L-inf ball).
    perturbation = (epsilon * clean_embeds.grad.sign()).detach()

    # --- Adversarial pass -------------------------------------------------
    # Embeddings are recomputed so this pass gets its own autograd graph.
    perturbed_embeds = embedding_layer(input_ids) + perturbation
    perturbed_outputs = model(inputs_embeds=perturbed_embeds, **passthrough)
    perturbed_loss = torch.nn.functional.cross_entropy(perturbed_outputs.logits, labels)
    perturbed_loss.backward()

    return loss.item(), perturbed_loss.item()

In [7]:
# Initialize model and optimizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the values the transformers implementation
# defaulted to, so the optimizer hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Set model to training mode (enables dropout)
model.train()

num_epochs = 4
for epoch in range(num_epochs):  # Loop over the dataset multiple times
    total_loss = 0
    total_adv_loss = 0
    for batch_inputs, batch_labels in train_loader:
        # NOTE(review): no attention_mask is supplied, so the model also attends
        # to padding positions -- confirm this is intended.
        inputs = {'input_ids': batch_inputs, 'labels': batch_labels}
        # adversarial_training resets the gradients itself before its backward pass.
        loss, adv_loss = adversarial_training(model, inputs, batch_labels, epsilon=0.01)
        optimizer.step()
        total_loss += loss
        total_adv_loss += adv_loss

    # Per-batch averages over the epoch
    avg_loss = total_loss / len(train_loader)
    avg_adv_loss = total_adv_loss / len(train_loader)
    print(f'Epoch {epoch+1}, Loss: {avg_loss}, Adversarial Loss: {avg_adv_loss}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [None]:
def evaluate_model(model, dataloader):
    """Score a binary sequence classifier on every batch of ``dataloader``.

    The model is switched to evaluation mode and left in that mode. Batches are
    moved to the device of the model's parameters before the forward pass, and
    predictions are moved back to the CPU before being handed to scikit-learn,
    so the function works for models on any device.

    Args:
        model: Module called as ``model(input_ids=...)`` whose output has a
            ``.logits`` attribute with one row per example.
        dataloader: Iterable yielding ``(input_ids, labels)`` tensor pairs.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, conf_matrix)``. Precision,
        recall and F1 use ``average='binary'``. ``conf_matrix`` is always 2x2
        with rows/columns ordered as labels ``[0, 1]``, even when one of the
        classes does not occur in the data.
    """
    model.eval()
    # Run inference on whichever device the model lives on (None -> leave as is).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    true_labels, predictions = [], []

    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            if device is not None:
                batch_inputs = batch_inputs.to(device)
            # NOTE(review): no attention_mask is supplied, so padding positions
            # are attended to as well -- confirm this is intended.
            outputs = model(input_ids=batch_inputs)
            predicted_labels = torch.argmax(outputs.logits, dim=1)
            # .cpu() is required before converting tensors that live on an accelerator.
            predictions.extend(predicted_labels.cpu().tolist())
            true_labels.extend(batch_labels.cpu().tolist())

    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
    # Fix the label set so the matrix shape does not depend on which classes appear.
    conf_matrix = confusion_matrix(true_labels, predictions, labels=[0, 1])

    return accuracy, precision, recall, f1, conf_matrix

In [None]:
accuracy, precision, recall, f1, conf_matrix = evaluate_model(model, test_loader)
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print('Confusion Matrix:')

# Display confusion matrix for the test set.
# TextDataset encodes 'human' as 0 and every other label as 1, and
# confusion_matrix orders rows/columns by ascending label value, so index 0 is
# the human class and index 1 is the AI class.
class_names = ['Human', 'AI']
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix for Test Set')
plt.show()