In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.utils.data import Dataset, DataLoader
from bert_score import score
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter 


In [None]:
# Section 2: Load and Preprocess Data

# Read the essay table and the prompt table from the working directory.
train_essays, train_prompts = (
    pd.read_csv(csv_name) for csv_name in ('train_essays.csv', 'train_prompts.csv')
)

# Attach each essay's prompt columns through the shared prompt_id key
# (inner join, which is the pandas default).
train_data = train_essays.merge(train_prompts, on='prompt_id')

# Hold out 20% of the merged rows for validation; the fixed seed keeps the
# split reproducible. Note that train_data is rebound to the training part.
train_data, val_data = train_test_split(
    train_data,
    random_state=42,
    test_size=0.2,
)

# Tokenizer used by every dataset built in the cells below.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

class EssayDataset(Dataset):
    """Map-style dataset that tokenizes one labelled essay row per lookup.

    Args:
        data: DataFrame-like object indexed positionally via ``.iloc``; each
            row must provide a ``text`` and a ``generated`` entry.
        tokenizer: Callable applied to the row's text with truncation,
            ``max_length`` padding and ``return_tensors='pt'``.
        max_length: Sequence length passed to the tokenizer.
    """

    def __init__(self, data, tokenizer, max_length=256):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the flattened token tensors and the long-typed label for row ``idx``."""
        row = self.data.iloc[idx]
        essay_text, target = row['text'], row['generated']

        encoded = self.tokenizer(
            essay_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Flatten each tokenizer tensor to 1-D so a DataLoader can stack items.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(target, dtype=torch.long)
        return item

# Pick the compute device once: CUDA when torch reports it available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Initialize the BERT model with a two-class sequence-classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Data augmentation: WordNet synonym replacement.
# NOTE(review): `naw` is not imported in any visible cell — presumably
# `import nlpaug.augmenter.word as naw`; confirm and add it to the import cell.
aug = naw.SynonymAug(aug_src='wordnet')


def _augment_text(text):
    """Return one augmented copy of ``text`` as a plain string.

    Depending on the augmenter's library version, ``augment`` may return a
    list of strings instead of a single string. The result is unwrapped here
    so the ``text`` column keeps holding strings; an empty result falls back
    to the original text.
    """
    augmented = aug.augment(text)
    if isinstance(augmented, (list, tuple)):
        return augmented[0] if augmented else text
    return augmented


# Apply data augmentation to a copy of the training data (validation rows are
# left untouched).
train_data_augmented = train_data.copy()
train_data_augmented['text'] = train_data_augmented['text'].apply(_augment_text)

# Combine the augmented data with the original training data.
# NOTE: this rebinds train_data, so re-running this cell without first
# re-running the split cell augments and appends the already-augmented rows.
train_data = pd.concat([train_data, train_data_augmented], ignore_index=True)

# Create the training and validation datasets
train_dataset = EssayDataset(train_data, tokenizer)
val_dataset = EssayDataset(val_data, tokenizer)

def _accuracy_metrics(eval_pred):
    """Compute accuracy under the key ``acc`` (reported by Trainer as ``eval_acc``)."""
    predicted = eval_pred.predictions.argmax(axis=-1)
    return {"acc": float((predicted == eval_pred.label_ids).mean())}


# Define the fitness function for PSO
def fitness_function(params):
    """Score one PSO candidate by fine-tuning BERT and measuring validation accuracy.

    Args:
        params: Two-element sequence ``(learning_rate, num_train_epochs)``.

    Returns:
        The negated validation accuracy, so that a minimizer prefers the
        candidate with the highest accuracy.
    """
    learning_rate, num_train_epochs = params
    learning_rate = float(learning_rate)
    # The search proposes continuous values; truncate the epoch count the same
    # way the final training cell does (int(best_params[1])) so the candidate
    # is scored under the setting that will actually be used.
    num_train_epochs = max(1, int(num_train_epochs))

    # Start every trial from the same pretrained weights. Reusing the global
    # `model` would carry weights over from earlier trials and make the scores
    # incomparable. Dropout is passed at construction time because changing
    # model.config afterwards does not alter the already-built dropout layers.
    trial_model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=2,
        attention_probs_dropout_prob=0.1,
        hidden_dropout_prob=0.1,
    )

    # Optimizer with weight decay, carrying the candidate learning rate
    optimizer = AdamW(trial_model.parameters(), lr=learning_rate, weight_decay=0.01, correct_bias=False)

    # Create the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy='steps',
        eval_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        save_total_limit=1
    )

    # Create the trainer. The optimizer is handed over explicitly; otherwise
    # the Trainer builds its own and the candidate learning rate is ignored.
    trainer = Trainer(
        model=trial_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=_accuracy_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.02)],
        optimizers=(optimizer, None),
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()

    # Display the evaluation results
    print("Evaluation Results:", eval_results)

    # Return the negated validation accuracy. The key is indexed directly:
    # a silent default of 0 would make every candidate look equally good.
    return -eval_results["eval_acc"]

In [None]:
# Search space for PSO, one entry per hyperparameter in the order
# (learning rate, num_train_epochs).
lower_bound = [1e-6, 1]
upper_bound = [1e-2, 5]

# Perform PSO optimization; only the best position is kept.
# NOTE(review): `pso` is not imported in any visible cell — presumably it
# comes from the pyswarm package; confirm and add it to the import cell.
pso_options = {'swarmsize': 5, 'maxiter': 10}
best_params, _ = pso(fitness_function, lower_bound, upper_bound, **pso_options)

best_learning_rate = best_params[0]
best_num_epochs = int(best_params[1])  # epochs are reported as a whole number
print("Best Hyperparameters found by PSO:")
print(f"Learning Rate: {best_learning_rate}")
print(f"Num Train Epochs: {best_num_epochs}")

In [None]:
# Optimizer for the final run, using the learning rate selected by PSO.
optimizer = AdamW(model.parameters(), lr=best_params[0], weight_decay=0.01, correct_bias=False)

# Training arguments for the final run. warmup_steps and the best-model metric
# mirror the settings used inside fitness_function, so the selected
# hyperparameters are applied under the same schedule they were scored with.
training_args = TrainingArguments(
    output_dir="./bert_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=1,
    num_train_epochs=int(best_params[1]),  # PSO returns a float; truncate to whole epochs
    save_steps=500,  # kept equal to eval_steps so the best checkpoint can be reloaded
    logging_steps=100,
    warmup_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    push_to_hub=False,
    report_to="tensorboard",
)

In [None]:
# TensorBoard writer that logs under ./logs.
# NOTE(review): no visible cell writes to or closes `writer`; confirm it is
# still needed.
writer = SummaryWriter("./logs")

In [None]:
# Build the pieces of the final run first, then assemble the Trainer.
final_train_dataset = EssayDataset(train_data, tokenizer)
final_val_dataset = EssayDataset(val_data, tokenizer)
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.02,
)

# data_collator and compute_metrics are left at their defaults (None).
# The optimizer is supplied explicitly; the scheduler slot is None so the
# Trainer creates its own.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_train_dataset,
    eval_dataset=final_val_dataset,
    tokenizer=tokenizer,
    callbacks=[early_stopping],
    optimizers=(optimizer, None),
)

# Train the model
trainer.train()

In [None]:
# Save the model
trainer.save_model("./bert_model")

from sklearn.metrics import accuracy_score, classification_report
# Calculate accuracy on the validation split: argmax over the class logits
# compared against the `generated` column in the same row order.
predictions = trainer.predict(EssayDataset(val_data, tokenizer))
predicted_labels = predictions.predictions.argmax(axis=1)
true_labels = val_data['generated'].values
accuracy = accuracy_score(true_labels, predicted_labels)

# BERTScore is intentionally not computed in this cell: `hyps` and `refs` are
# only built by the BERTScore cells further down, so calling score() here
# raised NameError when the notebook was run top to bottom on a fresh kernel.

# Display the evaluation results
print("***** Evaluation Results *****")
print(f"Validation Accuracy: {accuracy}")

In [None]:
# Evaluate the model using BERTScore
val_dataset = EssayDataset(val_data, tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# One reference per validation row, in row order, so refs[i] pairs with
# hyps[i]. (Grouping by 'id' would reorder the references by id and break the
# pairing with the unshuffled dataloader.)
refs = val_data['text'].tolist()
hyps = []

# Make sure the model lives on the same device the batches are moved to.
model.to(device)
model.eval()
with torch.no_grad():
    for batch in tqdm(val_dataloader, desc='Validation'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)

        # NOTE(review): the hypotheses are the decoded (truncated, lower-cased)
        # input tokens, not anything derived from `predictions` — confirm this
        # is the intended BERTScore comparison.
        hyps.extend([tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids])


In [None]:
# Calculate BERTScore for the hypothesis/reference pairs built in the previous cell
P, R, F1 = score(hyps, refs, lang="en", verbose=True)

# Reduce each score tensor to its mean before reporting.
bert_precision, bert_recall, bert_f1 = (metric.mean().item() for metric in (P, R, F1))

# Display the evaluation results
print(f"BERTScore Precision: {bert_precision:.4f}")
print(f"BERTScore Recall: {bert_recall:.4f}")
print(f"BERTScore F1: {bert_f1:.4f}")

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

# Load the test data
test_essays = pd.read_csv('test_essays.csv')

# Load the trained model and its tokenizer from the same saved directory
model_dir = './bert_model'  # Change the path accordingly
model = BertForSequenceClassification.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained(model_dir)

# Preprocess the test data
class EssayTestDataset(Dataset):
    """Map-style dataset that tokenizes one unlabelled essay row per lookup.

    Args:
        data: DataFrame-like object indexed positionally via ``.iloc``; each
            row must provide a ``text`` entry.
        tokenizer: Callable applied to the row's text with truncation,
            ``max_length`` padding and ``return_tensors='pt'``.
        max_length: Sequence length passed to the tokenizer.
    """

    def __init__(self, data, tokenizer, max_length=256):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the flattened ``input_ids`` and ``attention_mask`` for row ``idx``."""
        essay_text = self.data.iloc[idx]['text']

        encoded = self.tokenizer(
            essay_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Flatten each tokenizer tensor to 1-D so a DataLoader can stack items.
        return {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}

# Create a DataLoader for the test data (unshuffled, so predictions stay in
# the same order as the rows of test_essays)
test_dataset = EssayTestDataset(test_essays, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Run inference on the GPU when one is available, as the validation loop does;
# the model and every batch must live on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Make predictions
model.eval()
predictions = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc='Predicting'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit for each essay
        batch_predictions = torch.argmax(logits, dim=1)

        # Bring the results back to the CPU before collecting them
        predictions.extend(batch_predictions.cpu().tolist())

# Add predictions to the test data
test_essays['predicted'] = predictions

# Display the predictions
print(test_essays[['id', 'predicted']])