In [2]:
# One-off environment setup, left commented out; enable it if `datasets` is missing.
#!pip install datasets
from datasets import load_dataset
# Load the "lmqg/qg_annotation" dataset; later cells read its 'test' split via `dataset`.
dataset = load_dataset("lmqg/qg_annotation")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset qg_annotation (/home/ubuntu/.cache/huggingface/datasets/lmqg___qg_annotation/qg_annotation/0.0.3/5fd9d4573e46b0d58528a31764696d51ac8a3415afeed228dd12eb3d4499a30e)
100%|██████████| 1/1 [00:00<00:00, 75.21it/s]


In [6]:
# NOTE(review): this bare expression is not the last statement of the cell, so
# with Jupyter's default display settings it is evaluated but not rendered —
# confirm whether the record preview is still wanted here.
dataset['test'][0]
# Materialise the 'test' split as a pandas DataFrame for plotting.
df = dataset['test'].to_pandas()
import plotly.express as px

# Histogram of the 'correctness' column of the test split.
fig = px.histogram(df, x="correctness")
fig.show()

{'correctness': 1.7999999523162842,
 'grammaticality': 3.0,
 'understandability': 2.4000000953674316,
 'prediction': 'What trade did the Ming dynasty have a shortage of?',
 'Bleu_4': 0.4961682856082916,
 'METEOR': 0.3572683334350586,
 'ROUGE_L': 0.7272727489471436,
 'BERTScore': 0.9142221808433533,
 'MoverScore': 0.6782580614089966,
 'reference_raw': 'What important trade did the Ming Dynasty have with Tibet?',
 'answer_raw': 'horse trade',
 'paragraph_raw': "Some scholars note that Tibetan leaders during the Ming frequently engaged in civil war and conducted their own foreign diplomacy with neighboring states such as Nepal. Some scholars underscore the commercial aspect of the Ming-Tibetan relationship, noting the Ming dynasty's shortage of horses for warfare and thus the importance of the horse trade with Tibet. Others argue that the significant religious nature of the relationship of the Ming court with Tibetan lamas is underrepresented in modern scholarship. In hopes of reviving th

In [18]:
def extract_feature(item):
    """Project a record (or a batch of records) onto the regression fields.

    Only the 'grammaticality' and 'prediction' entries are read; they are
    returned unchanged under the keys 'score' and 'text'. The function does not
    care whether the values are scalars (single example) or lists (a column
    batch, as passed by ``Dataset.map(..., batched=True)``).

    Args:
        item: Mapping that contains at least the keys 'grammaticality' and
            'prediction'.

    Returns:
        dict: ``{'score': item['grammaticality'], 'text': item['prediction']}``.

    Raises:
        KeyError: If either required key is missing from ``item``.
    """
    return {'score': item['grammaticality'], 'text': item['prediction']}
from torch.utils.data import DataLoader

# Reduce the dataset to the two fields produced by extract_feature ('score' and
# 'text'); every original column is dropped so nothing else reaches the loader.
grammaticality_dataset = dataset.map(
    extract_feature, batched=True, remove_columns=dataset["test"].column_names
)
dataloader = DataLoader(grammaticality_dataset['test'], batch_size=32, num_workers=4)

# Sanity-check collation on the first batch only. Each batch is a dict holding a
# 'score' tensor and a 'text' list of strings; printing every batch floods the
# cell output without adding information.
for batch in dataloader:
    print(batch)
    break

100%|██████████| 3/3 [00:00<00:00, 295.91ba/s]


{'score': tensor([3.0000, 3.0000, 3.0000, 3.0000, 3.0000, 3.0000, 3.0000, 3.0000, 3.0000,
        3.0000, 3.0000, 3.0000, 3.0000, 3.0000, 3.0000, 3.0000, 3.0000, 3.0000,
        3.0000, 3.0000, 3.0000, 3.0000, 3.0000, 2.8000, 3.0000, 3.0000, 2.8000,
        3.0000, 3.0000, 2.8000, 3.0000, 3.0000], dtype=torch.float64), 'text': ['What trade did the Ming dynasty have a shortage of?', 'What team did the Notre Dame football team beat in 1928?', 'How many Tajik citizens were killed in World War II?', "What was the name of West's new album?", 'What French retailer was accused of donating funds to the Dalai Lama?', 'Where was the Jesuit Ratio Studiorum from?', "What was the name of Jay-Z's 2001 album?", 'How does Whitehead see God?', 'What did Whitehead believe religion necessitates the realization of?', 'In what season was the group round eliminated?', 'Along with social science, medicine and design, what is a course of study offered at a Fachhochschule?', 'Who declared the conflict in Darfu

In [None]:
import os
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from transformers import (
    AdamW,
    get_linear_schedule_with_warmup,
    RobertaModel,
    RobertaTokenizer,
)
import pytorch_lightning as pl

class RegressionModel(pl.LightningModule):
    """RoBERTa encoder with a single-output linear head for scalar regression.

    The module expects batches of ``(input_ids, attention_mask, labels)`` and is
    trained with mean-squared error between the head output and ``labels``.

    Args:
        hparams: Namespace-like object providing ``model_name_or_path``,
            ``learning_rate``, ``warmup_steps`` and ``total_steps``.
    """

    def __init__(self, hparams):
        super().__init__()
        # `hparams` is a read-only property on current LightningModule versions,
        # so direct assignment raises; save_hyperparameters() fills self.hparams
        # and also stores the values in checkpoints.
        self.save_hyperparameters(hparams)

        # Load the RoBERTa model and tokenizer
        self.model = RobertaModel.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = RobertaTokenizer.from_pretrained(hparams.model_name_or_path)

        # Define the regression head
        self.regression_head = nn.Linear(self.model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one regression value per example, shaped ``(batch, 1)``.

        Args:
            input_ids: Token ids, one row per example.
            attention_mask: Mask with 1 for real tokens and 0 for padding.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Index 0 is the last-layer hidden states for both tuple and ModelOutput
        # return types. Tuple-unpacking a ModelOutput would yield its *keys*, and
        # indexing the pooled output with [-1] would select the last example of
        # the batch instead of one embedding per example.
        last_hidden_state = outputs[0]

        # Use the embedding of the first token (<s>, RoBERTa's CLS equivalent)
        # as the sentence representation.
        cls_embedding = last_hidden_state[:, 0]

        return self.regression_head(cls_embedding)

    def training_step(self, batch, batch_idx):
        """Compute the MSE training loss for one batch."""
        input_ids, attention_mask, labels = batch
        output = self.forward(input_ids, attention_mask)
        loss = self.loss(output.squeeze(-1), labels)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute the MSE validation loss for one batch."""
        input_ids, attention_mask, labels = batch
        output = self.forward(input_ids, attention_mask)
        loss = self.loss(output.squeeze(-1), labels)
        # Logged so that callbacks (early stopping, checkpointing) can monitor
        # "val_loss"; Lightning averages it over the epoch.
        self.log("val_loss", loss, prog_bar=True)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses collected during an epoch.

        NOTE(review): this hook belongs to the Lightning 1.x API and was, to my
        knowledge, removed in 2.0 — confirm the targeted Lightning version.
        """
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        return {"avg_val_loss": avg_loss}

    def configure_optimizers(self):
        """Create AdamW plus a linear warmup/decay learning-rate schedule."""
        optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.hparams.total_steps
        )
        # warmup_steps / total_steps are counted in optimizer steps, so the
        # scheduler has to advance every step rather than Lightning's default of
        # once per epoch.
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

    def loss(self, output, labels):
        """Mean-squared error between predictions and targets.

        Labels are cast to the prediction dtype so float64 targets (e.g. values
        collated from Python floats or pandas columns) do not trigger a dtype
        mismatch.
        """
        return nn.functional.mse_loss(output, labels.to(output.dtype))

class RegressionDataset(data.Dataset):
    """Text-regression dataset read from ``text,label`` lines in a directory.

    Every regular file in ``data_dir`` is read line by line. The label is the
    part after the *last* comma, so the text itself may contain commas. Each
    item is tokenized on access and padded/truncated to ``max_length`` so that
    the default DataLoader collation can stack examples into a batch.

    Args:
        data_dir: Directory containing the data files.
        tokenizer: Callable following the Hugging Face tokenizer ``__call__``
            convention (returns a mapping with "input_ids" and
            "attention_mask"). Defaults to the "roberta-base" tokenizer.
        max_length: Fixed sequence length every example is padded/truncated to.

    Raises:
        ValueError: If a non-empty line has no comma or a non-numeric label.
    """

    def __init__(self, data_dir, tokenizer=None, max_length=128):
        if tokenizer is None:
            # The dataset previously read self.tokenizer without ever setting
            # it; fall back to the checkpoint used elsewhere in this notebook.
            tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.data = []
        # Sorted so the example order does not depend on filesystem ordering.
        for name in sorted(os.listdir(data_dir)):
            path = os.path.join(data_dir, name)
            if not os.path.isfile(path):
                continue
            with open(path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    # Split on the last comma only: the text may contain commas.
                    text, label = line.rsplit(",", 1)
                    self.data.append((text, float(label)))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` tensors for one example."""
        text, label = self.data[idx]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        input_ids = encoded["input_ids"].squeeze(0)
        # Use the tokenizer's own mask: RoBERTa's id 0 is the <s> token and its
        # pad id is 1, so deriving the mask from `input_ids != 0` is wrong.
        attention_mask = encoded["attention_mask"].squeeze(0)
        return input_ids, attention_mask, torch.tensor(label, dtype=torch.float32)

def train(hparams):
    """Fine-tune a RegressionModel on the data found in ``hparams.data_dir``.

    Args:
        hparams: Namespace-like object providing ``seed``, ``data_dir``,
            ``batch_size``, ``num_workers``, ``gpus`` and ``num_epochs``, plus
            whatever RegressionModel reads from it.
    """
    # Set random seeds for reproducibility
    random.seed(hparams.seed)
    torch.manual_seed(hparams.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(hparams.seed)

    # Create the dataset and dataloader. Training batches are shuffled; the
    # order stays reproducible because the seeds are fixed above.
    dataset = RegressionDataset(hparams.data_dir)
    dataloader = data.DataLoader(
        dataset,
        batch_size=hparams.batch_size,
        num_workers=hparams.num_workers,
        shuffle=True,
    )

    # Initialize the model and trainer
    model = RegressionModel(hparams)
    # Requesting GPUs on a CPU-only host makes the Trainer fail at construction,
    # so fall back to CPU when CUDA is unavailable.
    gpus = hparams.gpus if torch.cuda.is_available() else 0
    # `early_stop_callback=True` is intentionally not passed: it is not accepted
    # by Trainer from Lightning 1.0 onwards, and no validation data is supplied
    # here for an early-stopping monitor to watch.
    trainer = pl.Trainer(
        gpus=gpus,
        max_epochs=hparams.num_epochs,
        deterministic=True,
    )

    # Start training
    trainer.fit(model, dataloader)

from argparse import Namespace
if __name__ == "__main__":
    # Hyperparameters for the fine-tuning run, collected in a single mapping
    # and then exposed through attribute access.
    hparam_values = {
        "model_name_or_path": "roberta-base",
        "data_dir": "data/",
        "learning_rate": 2e-5,
        "warmup_steps": 0,
        "total_steps": 10000,
        "batch_size": 32,
        "num_workers": 4,
        "num_epochs": 10,
        "seed": 42,
        "gpus": 1,
    }
    hparams = Namespace(**hparam_values)

    # Kick off training with the configuration above
    train(hparams)