# In [None]:
import json
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW 

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

class EssayDataset(Dataset):
    """Map-style dataset over a JSONL file of essays.

    Every non-blank line of the file is parsed as one JSON object and kept in
    ``self.data``. Each record must have a ``'text'`` field; ``'label'`` is
    optional, so the same class serves labelled and unlabelled files.

    Args:
        file_path: Path to the JSONL file. The whole file is read into memory.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_len: Value passed to the tokenizer as ``max_length``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """

    def __init__(self, file_path, tokenizer, max_len):
        self.data = []
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                # Blank lines (e.g. a stray empty line between records) are
                # not records and would make json.loads raise.
                if not line:
                    continue
                self.data.append(json.loads(line))
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of records loaded from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its tensors.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'``; a ``'label'``
            entry (0-d ``torch.long`` tensor) is added only when the record
            has a non-null label.
        """
        record = self.data[idx]
        essay = record['text']
        label = record.get('label')

        encoding = self.tokenizer(
            essay,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        # squeeze(0) drops the leading size-1 dimension of the tokenizer
        # output (presumably the batch dimension added by return_tensors="pt").
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }

        if label is not None:
            item['label'] = torch.tensor(label, dtype=torch.long)

        return item

# Dataset file locations. Each file is read line by line as JSON by
# EssayDataset below.
train_file = '/kaggle/input/ds-for-fyp/new_train.jsonl'
dev_file = '/kaggle/input/ds-for-fyp/new_dev.jsonl'
test_file = '/kaggle/input/ds-for-fyp/devtest_text_id_only.jsonl'

# Initialize tokenizer (same checkpoint name as the model loaded further down)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Sizes shared by all datasets and dataloaders.
# max_len is passed to the tokenizer as max_length with padding='max_length'
# and truncation=True, so every essay is padded or truncated to this length.
max_len = 128
batch_size = 32  # Used for the train, dev and test loaders alike

def collate_fn(batch):
    """Combine per-example dicts into a single batch dict.

    ``'input_ids'`` and ``'attention_mask'`` tensors are stacked along a new
    leading dimension. A ``'label'`` tensor of dtype ``torch.long`` is included
    only when the first example carries a label.
    """
    collated = {
        field: torch.stack([example[field] for example in batch])
        for field in ('input_ids', 'attention_mask')
    }

    # Unlabelled batches get no 'label' key at all.
    if 'label' not in batch[0]:
        return collated

    collated['label'] = torch.tensor(
        [example['label'] for example in batch], dtype=torch.long
    )
    return collated

# One dataset per split; each EssayDataset reads its whole file into memory.
train_dataset = EssayDataset(train_file, tokenizer, max_len)
dev_dataset = EssayDataset(dev_file, tokenizer, max_len)
test_dataset = EssayDataset(test_file, tokenizer, max_len)

# Only the training loader shuffles; dev and test are iterated in dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize model: bert-base-uncased checkpoint with a 2-label classification
# head, moved to the selected device.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Learning rate scheduler: "linear" schedule with no warmup steps.
# NOTE: the factor 3 is the epoch count and must be kept in sync with the
# `epochs` argument passed to train_model at the bottom of the script.
# NOTE(review): the schedule only advances when lr_scheduler.step() is called.
num_training_steps = len(train_loader) * 3  # 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

def train_model(model, train_loader, dev_loader, epochs):
    """Fine-tune ``model`` for ``epochs`` passes over ``train_loader``.

    After each epoch the mean training loss is printed and the model is
    evaluated on ``dev_loader`` with ``evaluate_model``.

    Uses the module-level ``device``, ``optimizer`` and ``lr_scheduler``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes a ``loss`` attribute.
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        dev_loader: Labelled loader passed to ``evaluate_model``.
        epochs: Number of passes over ``train_loader``.
    """
    for epoch in range(epochs):
        # evaluate_model switches the model to eval mode, so training mode
        # has to be restored at the start of every epoch.
        model.train()
        total_loss = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            # Advance the learning-rate schedule once per optimizer update.
            # Without this call the scheduler never moves and the learning
            # rate stays at its initial value for the whole run.
            lr_scheduler.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

        evaluate_model(model, dev_loader)

def evaluate_model(model, data_loader):
    """Print accuracy, precision, recall and F1 of ``model`` on ``data_loader``.

    The model is put in eval mode and run without gradient tracking. The
    predicted class is the argmax over the logits; precision, recall and F1
    use ``average='binary'``. Every batch must contain a ``'label'`` tensor.
    Uses the module-level ``device``.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['label'].to(device)

            logits = model(ids, attention_mask=mask).logits
            batch_pred = torch.argmax(logits, dim=1)

            predicted.extend(batch_pred.cpu().numpy())
            expected.extend(gold.cpu().numpy())

    acc = accuracy_score(expected, predicted)
    precision, recall, f1 = (
        metric(expected, predicted, average='binary')
        for metric in (precision_score, recall_score, f1_score)
    )

    print(f"Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

def generate_predictions(model, data_loader, output_file):
    """Run inference over ``data_loader`` and write one JSON line per example.

    Each output line has the form ``{"id": ..., "label": "ai" | "human"}``;
    predicted class index 1 is written as ``"ai"``, anything else as
    ``"human"``. Uses the module-level ``device``.

    The id is taken from the ``'id'`` field of the matching source record when
    the loader's dataset exposes its records as a ``data`` list (as
    ``EssayDataset`` does) and that list lines up one-to-one with the
    predictions. Otherwise the 1-based position in the loader is used.
    This assumes the loader iterates in dataset order (``shuffle=False``).
    NOTE(review): presumes the input records carry an ``'id'`` key - confirm
    against the test file's schema.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``logits``.
        data_loader: Loader yielding ``'input_ids'`` and ``'attention_mask'``.
        output_file: Path of the JSONL file to (over)write.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # tolist() yields plain Python ints rather than numpy scalars.
            predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())

    # Look up the source records so real ids can be written instead of
    # fabricated sequence numbers; ignore them unless they align one-to-one.
    records = getattr(getattr(data_loader, 'dataset', None), 'data', None)
    if not isinstance(records, list) or len(records) != len(predictions):
        records = None

    with open(output_file, 'w', encoding='utf-8') as f:
        for i, pred in enumerate(predictions):
            record_id = i + 1  # fallback: 1-based position
            if records is not None and isinstance(records[i], dict):
                source_id = records[i].get('id')
                if source_id is not None:
                    record_id = source_id
            f.write(json.dumps({"id": record_id, "label": "ai" if pred == 1 else "human"}) + '\n')

# Train the model. Each epoch prints the mean training loss and the dev-set
# metrics. The epoch count here should match the factor used when computing
# num_training_steps for the scheduler above.
train_model(model, train_loader, dev_loader, epochs=3)

# Generate predictions on the test set and write them as JSON lines
output_predictions_file = '/kaggle/working/output_predictions.jsonl'
generate_predictions(model, test_loader, output_predictions_file)
print(f"Predictions saved to {output_predictions_file}")

# In [None]:
# Save only the model weights (state_dict) to "model.pth", a path relative to
# the current working directory.
torch.save(model.state_dict(), "model.pth")
print("Pth file saved!!!!!!")
