In [None]:
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
from transformers import (
    AutoModel,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
    BertTokenizer,
    BertModel,
    RobertaTokenizer,
    RobertaModel,
    BartTokenizer,
    BartModel
)

# Every figure produced below is written under 'plots/', so create the
# directory up front; exist_ok=True makes re-running the cell harmless.
os.makedirs('plots', exist_ok=True)

class TransformerAspectDataset(Dataset):
    """Pre-tokenized (sentence, aspect term) pairs with sentiment labels.

    The input is a JSON-lines file: one JSON object per line providing
    ``tokens`` (list of words), ``polarity`` and ``aspect_term`` (only its
    first element is used). Every record is encoded once, at construction
    time, and the resulting tensors are kept in memory.

    Labels: ``'positive'`` -> 0, ``'negative'`` -> 1, any other polarity
    value -> 2 (treated as neutral).

    Args:
        file_path: Path to the JSON-lines file.
        tokenizer: Callable used as ``tokenizer(text, aspect, padding=...,
            truncation=..., max_length=..., return_tensors='pt')``. Its result
            must offer ``.items()`` yielding tensors whose leading (batch)
            dimension is 1.
        max_length: Length every encoded pair is padded/truncated to.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``tokens``, ``polarity`` or
            ``aspect_term``.
    """

    # Polarities with a dedicated class id; anything else falls back to
    # _FALLBACK_LABEL.
    _POLARITY_LABELS = {'positive': 0, 'negative': 1}
    _FALLBACK_LABEL = 2

    def __init__(self, file_path, tokenizer, max_length=128):
        self.data = []
        self.tokenizer = tokenizer
        self.max_length = max_length

        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) are not records;
                    # feeding them to json.loads would raise.
                    continue

                item = json.loads(line)
                text = ' '.join(item['tokens'])
                # NOTE(review): assumes aspect_term is a list of strings;
                # a plain string here would yield only its first character.
                aspect_term = item['aspect_term'][0]
                label = self._POLARITY_LABELS.get(
                    item['polarity'], self._FALLBACK_LABEL
                )

                # Encode sentence and aspect as a pair so the tokenizer
                # inserts its own separator tokens between them.
                encoding = self.tokenizer(
                    text,
                    aspect_term,
                    padding='max_length',
                    truncation='longest_first',
                    max_length=self.max_length,
                    return_tensors='pt'
                )

                # return_tensors='pt' adds a batch dimension of 1; drop it so
                # the DataLoader can stack samples itself.
                sample = {k: v.squeeze(0) for k, v in encoding.items()}
                sample['label'] = torch.tensor(label)

                self.data.append(sample)

    def __len__(self):
        """Return the number of encoded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the dict of tensors (model inputs plus ``label``) at ``idx``."""
        return self.data[idx]

# Model classes for each transformer
class BERTClassifier(nn.Module):
    """``bert-base-uncased`` encoder followed by dropout and a linear head."""

    def __init__(self, num_classes=3):
        """Load the pretrained encoder and size the head for ``num_classes``."""
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return class scores computed from the encoder's ``pooler_output``."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        sentence_vector = self.dropout(encoded.pooler_output)
        logits = self.classifier(sentence_vector)
        return logits

class BARTClassifier(nn.Module):
    """``facebook/bart-base`` model followed by dropout and a linear head."""

    def __init__(self, num_classes=3):
        """Load the pretrained model and size the head for ``num_classes``."""
        super().__init__()
        self.bart = BartModel.from_pretrained('facebook/bart-base')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bart.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class scores computed from position 0 of ``last_hidden_state``."""
        hidden_states = self.bart(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # The first sequence position serves as the sentence summary.
        # NOTE(review): Hugging Face's BartForSequenceClassification pools the
        # final <eos> position rather than position 0 — confirm this choice
        # is intentional.
        first_token = hidden_states[:, 0, :]
        return self.classifier(self.dropout(first_token))

class RoBERTaClassifier(nn.Module):
    """``roberta-base`` encoder followed by dropout and a linear head."""

    def __init__(self, num_classes=3):
        """Load the pretrained encoder and size the head for ``num_classes``."""
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.roberta.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class scores computed from position 0 of ``last_hidden_state``."""
        hidden_states = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # The first sequence position serves as the sentence summary.
        first_token = hidden_states[:, 0, :]
        return self.classifier(self.dropout(first_token))

_SUPPORTED_MODELS = ('bert', 'bart', 'roberta')


def _build_model_and_tokenizer(model_name, device):
    """Return ``(tokenizer, model)`` for a supported name, with the model on ``device``."""
    if model_name == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BERTClassifier()
    elif model_name == 'bart':
        tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
        model = BARTClassifier()
    else:  # 'roberta' -- the caller has already validated model_name
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = RoBERTaClassifier()
    return tokenizer, model.to(device)


def _run_epoch(model, loader, criterion, device, model_name, desc,
               optimizer=None, scheduler=None):
    """Run one pass over ``loader`` and return ``(mean batch loss, accuracy)``.

    The pass is a training pass (gradients, optimizer and scheduler steps)
    when ``optimizer`` is given, and an evaluation pass otherwise.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(training):
        for batch in tqdm(loader, desc=desc):
            batch = {k: v.to(device) for k, v in batch.items()}

            inputs = {
                'input_ids': batch['input_ids'],
                'attention_mask': batch['attention_mask'],
            }
            # Only the BERT classifier accepts segment ids; the BART and
            # RoBERTa classifiers take input_ids and attention_mask only.
            if model_name == 'bert':
                inputs['token_type_ids'] = batch['token_type_ids']

            if training:
                optimizer.zero_grad()

            logits = model(**inputs)
            loss = criterion(logits, batch['label'])

            if training:
                loss.backward()
                optimizer.step()
                scheduler.step()  # the schedule advances once per batch

            loss_sum += loss.item()
            predicted = logits.argmax(dim=1)
            seen += batch['label'].size(0)
            correct += (predicted == batch['label']).sum().item()

    return loss_sum / len(loader), correct / seen


def _save_metric_curves(model_name, metric, train_values, val_values,
                        train_style, val_style):
    """Save a single-panel train/val curve for ``metric`` ('Loss' or 'Accuracy')."""
    plt.figure(figsize=(8, 5))
    plt.plot(train_values, train_style, label=f'Train {metric}')
    plt.plot(val_values, val_style, label=f'Val {metric}')
    plt.xlabel('Epoch')
    plt.ylabel(metric)
    plt.title(f'{model_name.upper()} - {metric} Curves')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(f'plots/{model_name}_{metric.lower()}_curves.png', dpi=300)
    plt.close()


def _plot_training_curves(model_name, best_val_acc, train_losses, val_losses,
                          train_accs, val_accs):
    """Write the combined and the per-metric training figures under ``plots/``."""
    # Do not rely on some other part of the file having created the folder.
    os.makedirs('plots', exist_ok=True)

    plt.figure(figsize=(12, 5))
    panels = (
        (1, train_losses, val_losses, 'Train Loss', 'Val Loss', 'Loss'),
        (2, train_accs, val_accs, 'Train Acc', 'Val Acc', 'Accuracy'),
    )
    for position, train_values, val_values, train_label, val_label, metric in panels:
        plt.subplot(1, 2, position)
        plt.plot(train_values, label=train_label)
        plt.plot(val_values, label=val_label)
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.legend()
        plt.title(f'{model_name.upper()} {metric}')
        plt.grid(linestyle='--', alpha=0.7)

    plt.suptitle(f'{model_name.upper()} - Best Val Acc: {best_val_acc:.4f}', fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.95])  # Make room for suptitle

    plot_filename = f'plots/{model_name}_training'
    plt.savefig(f'{plot_filename}.png', dpi=300)
    plt.savefig(f'{plot_filename}.pdf')
    plt.close()

    _save_metric_curves(model_name, 'Loss', train_losses, val_losses, 'b-', 'r-')
    _save_metric_curves(model_name, 'Accuracy', train_accs, val_accs, 'g-', 'm-')


def finetune_transformer(model_name, train_file, val_file, num_epochs=3, batch_size=16, learning_rate=2e-5):
    """Fine-tune one transformer classifier and record its training curves.

    Each epoch trains on ``train_file`` and evaluates on ``val_file``. The
    weights with the best validation accuracy so far are saved to
    ``'{model_name}_model_best.pt'`` and figures are written under ``plots/``.

    Args:
        model_name: One of ``'bert'``, ``'bart'`` or ``'roberta'``.
        train_file: JSON-lines training file read by TransformerAspectDataset.
        val_file: JSON-lines validation file read by TransformerAspectDataset.
        num_epochs: Number of passes over the training data.
        batch_size: Batch size for both data loaders.
        learning_rate: Peak learning rate reached after warm-up.

    Returns:
        Dict with ``model_name``, ``best_val_acc`` and the per-epoch lists
        ``train_losses``, ``val_losses``, ``train_accs`` and ``val_accs``.

    Raises:
        ValueError: If ``model_name`` is not supported, or if either dataset
            contains no records.
    """
    # Reject unknown names instead of silently training RoBERTa and saving it
    # under a misleading checkpoint name.
    if model_name not in _SUPPORTED_MODELS:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of {_SUPPORTED_MODELS}"
        )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    tokenizer, model = _build_model_and_tokenizer(model_name, device)

    # Create datasets and dataloaders
    train_dataset = TransformerAspectDataset(train_file, tokenizer)
    val_dataset = TransformerAspectDataset(val_file, tokenizer)
    if len(train_dataset) == 0 or len(val_dataset) == 0:
        # Fail before training rather than dividing by zero when averaging.
        raise ValueError("Training and validation files must each contain at least one record")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # NOTE(review): `AdamW` is the name imported from `transformers` at file
    # level; newer transformers releases deprecate it in favour of
    # `torch.optim.AdamW` — confirm against the installed version.
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Linear warm-up over the first 10% of all optimizer steps, then linear
    # decay; the warm-up length is a step count, hence the int().
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    # Track metrics
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    best_val_acc = 0

    for epoch in range(num_epochs):
        train_loss, train_acc = _run_epoch(
            model, train_loader, criterion, device, model_name,
            desc=f"Epoch {epoch+1}/{num_epochs} - Training",
            optimizer=optimizer, scheduler=scheduler,
        )
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        val_loss, val_acc = _run_epoch(
            model, val_loader, criterion, device, model_name,
            desc=f"Epoch {epoch+1}/{num_epochs} - Validation",
        )
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        print(f"Epoch {epoch+1}/{num_epochs}:")
        print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        print(f"  Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), f'{model_name}_model_best.pt')
            print(f"  Saved best model with val acc: {val_acc:.4f}")

    _plot_training_curves(
        model_name, best_val_acc, train_losses, val_losses, train_accs, val_accs
    )

    return {
        'model_name': model_name,
        'best_val_acc': best_val_acc,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'train_accs': train_accs,
        'val_accs': val_accs
    }

def train_all_transformers(train_file, val_file, num_epochs=3):
    """Fine-tune BERT, BART and RoBERTa in turn and plot how they compare.

    Each model is trained through ``finetune_transformer``. Afterwards a bar
    chart of best validation accuracies and two line charts (loss and
    accuracy per epoch for every model) are saved under ``plots/``.

    Args:
        train_file: Training file forwarded to ``finetune_transformer``.
        val_file: Validation file forwarded to ``finetune_transformer``.
        num_epochs: Number of epochs each model is trained for.

    Returns:
        List of the result dicts returned by ``finetune_transformer``, in
        training order.
    """
    separator = '=' * 50
    transformer_results = []

    for model_name in ('bert', 'bart', 'roberta'):
        print(f"\n\n{separator}")
        print(f"Fine-tuning {model_name.upper()} model")
        print(f"{separator}\n")

        transformer_results.append(
            finetune_transformer(
                model_name=model_name,
                train_file=train_file,
                val_file=val_file,
                num_epochs=num_epochs,
            )
        )

    # --- Bar chart: best validation accuracy per model ---
    plt.figure(figsize=(10, 6))
    display_names = []
    best_accuracies = []
    for outcome in transformer_results:
        display_names.append(outcome['model_name'].upper())
        best_accuracies.append(outcome['best_val_acc'])

    bar_colors = ['#3498db', '#2ecc71', '#e74c3c']  # Blue, Green, Red
    bars = plt.bar(display_names, best_accuracies, color=bar_colors)

    # Print each accuracy just above its bar.
    for rect, value in zip(bars, best_accuracies):
        label_x = rect.get_x() + rect.get_width()/2
        label_y = rect.get_height() + 0.01
        plt.text(label_x, label_y, f'{value:.4f}',
                 ha='center', va='bottom',
                 fontsize=12, fontweight='bold')

    plt.title('Transformer Models Validation Accuracy', fontsize=16)
    plt.xlabel('Model', fontsize=14)
    plt.ylabel('Accuracy', fontsize=14)
    plt.ylim(0, max(best_accuracies) + 0.1)  # Leave headroom for the labels
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    plt.savefig('plots/transformer_comparison.png', dpi=300)
    plt.savefig('plots/transformer_comparison.pdf')
    plt.close()

    # --- Line charts: one figure per metric, all models overlaid ---
    comparisons = (
        ('train_losses', 'val_losses', 'Loss',
         'Loss Comparison Across Transformer Models',
         'plots/all_transformers_loss.png'),
        ('train_accs', 'val_accs', 'Accuracy',
         'Accuracy Comparison Across Transformer Models',
         'plots/all_transformers_accuracy.png'),
    )
    for train_key, val_key, y_label, heading, out_path in comparisons:
        plt.figure(figsize=(12, 6))
        for outcome in transformer_results:
            tag = outcome['model_name'].upper()
            plt.plot(outcome[train_key], 'o-', label=f"{tag} Train", alpha=0.7)
            plt.plot(outcome[val_key], 's--', label=f"{tag} Val", alpha=0.7)

        plt.xlabel('Epoch', fontsize=12)
        plt.ylabel(y_label, fontsize=12)
        plt.title(heading, fontsize=14)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.legend()
        plt.tight_layout()
        plt.savefig(out_path, dpi=300)
        plt.close()

    return transformer_results

# Run the transformer fine-tuning
if __name__ == "__main__":
    # Train all three models on the task-2 split, three epochs each.
    transformer_results = train_all_transformers(
        train_file='train_task_2.json',
        val_file='val_task_2.json',
        num_epochs=3,
    )

    # Summarise the best validation accuracy reached by every model.
    print("\n===== TRANSFORMER RESULTS =====")
    for result in transformer_results:
        name = result['model_name'].upper()
        best = result['best_val_acc']
        print(f"{name}: {best:.4f}")



Fine-tuning BERT model

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/3 - Training: 100%|██████████| 186/186 [00:57<00:00,  3.21it/s]
Epoch 1/3 - Validation: 100%|██████████| 24/24 [00:02<00:00,  9.95it/s]


Epoch 1/3:
  Train Loss: 0.8476 | Train Acc: 0.5816
  Val Loss: 0.7211 | Val Acc: 0.6981
  Saved best model with val acc: 0.6981


Epoch 2/3 - Training: 100%|██████████| 186/186 [01:00<00:00,  3.09it/s]
Epoch 2/3 - Validation: 100%|██████████| 24/24 [00:02<00:00,  9.70it/s]


Epoch 2/3:
  Train Loss: 0.5234 | Train Acc: 0.7852
  Val Loss: 0.5941 | Val Acc: 0.7547
  Saved best model with val acc: 0.7547


Epoch 3/3 - Training: 100%|██████████| 186/186 [01:00<00:00,  3.10it/s]
Epoch 3/3 - Validation: 100%|██████████| 24/24 [00:02<00:00,  9.65it/s]


Epoch 3/3:
  Train Loss: 0.3785 | Train Acc: 0.8568
  Val Loss: 0.5822 | Val Acc: 0.7574
  Saved best model with val acc: 0.7574


Fine-tuning BART model

Using device: cuda


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Epoch 1/3 - Training: 100%|██████████| 186/186 [01:13<00:00,  2.54it/s]
Epoch 1/3 - Validation: 100%|██████████| 24/24 [00:02<00:00,  8.20it/s]


Epoch 1/3:
  Train Loss: 0.9847 | Train Acc: 0.5768
  Val Loss: 0.7713 | Val Acc: 0.6739
  Saved best model with val acc: 0.6739


Epoch 2/3 - Training: 100%|██████████| 186/186 [01:13<00:00,  2.54it/s]
Epoch 2/3 - Validation: 100%|██████████| 24/24 [00:02<00:00,  8.20it/s]


Epoch 2/3:
  Train Loss: 0.6847 | Train Acc: 0.7204
  Val Loss: 0.6872 | Val Acc: 0.7035
  Saved best model with val acc: 0.7035


Epoch 3/3 - Training: 100%|██████████| 186/186 [01:13<00:00,  2.54it/s]
Epoch 3/3 - Validation: 100%|██████████| 24/24 [00:02<00:00,  8.21it/s]


Epoch 3/3:
  Train Loss: 0.5979 | Train Acc: 0.7531
  Val Loss: 0.7015 | Val Acc: 0.7035


Fine-tuning ROBERTA model

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3 - Training: 100%|██████████| 186/186 [01:01<00:00,  3.04it/s]
Epoch 1/3 - Validation: 100%|██████████| 24/24 [00:02<00:00, 10.43it/s]


Epoch 1/3:
  Train Loss: 0.8303 | Train Acc: 0.6417
  Val Loss: 0.7004 | Val Acc: 0.6873
  Saved best model with val acc: 0.6873


Epoch 2/3 - Training: 100%|██████████| 186/186 [01:01<00:00,  3.04it/s]
Epoch 2/3 - Validation: 100%|██████████| 24/24 [00:02<00:00, 10.41it/s]


Epoch 2/3:
  Train Loss: 0.5341 | Train Acc: 0.7754
  Val Loss: 0.5383 | Val Acc: 0.7493
  Saved best model with val acc: 0.7493


Epoch 3/3 - Training: 100%|██████████| 186/186 [01:00<00:00,  3.05it/s]
Epoch 3/3 - Validation: 100%|██████████| 24/24 [00:02<00:00, 10.47it/s]


Epoch 3/3:
  Train Loss: 0.3582 | Train Acc: 0.8555
  Val Loss: 0.5683 | Val Acc: 0.7790
  Saved best model with val acc: 0.7790

===== TRANSFORMER RESULTS =====
BERT: 0.7574
BART: 0.7035
ROBERTA: 0.7790
