In [1]:
import torch
from torch import nn
import random
import pandas as pd
import numpy as np
import json
import math
import re
import pytorch_lightning as pl
from torch.utils.data import DataLoader, random_split, Dataset
from torch.nn import functional as F
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.optim.lr_scheduler import ReduceLROnPlateau
import os
import time
from string import punctuation
import string
import argparse
from argparse import ArgumentParser
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint



In [2]:
def data_clean_and_format(df):
    df_question = pd.DataFrame(df.question.str.split('.').str[-1])
    df_answer = pd.DataFrame(df.answer)
    df_context = pd.DataFrame(df.question.str.split('.').str[:-1])
    df_context['context'] = [','.join(map(str, l)) for l in df_context['question']]
    df_context = df_context.drop('question', axis=1)
    # Concatenate the dataframes horizontally (axis=1)
    final_df = pd.concat([df_context, df_question, df_answer], axis=1)
    
    return final_df

In [3]:
training_set = pd.read_json("train.jsonl", lines=True)
testing_set = pd.read_json("test.jsonl", lines=True)

In [4]:
train_data = data_clean_and_format(training_set)
test_data = data_clean_and_format(testing_set)

In [5]:
'''
# Define a function to remove values enclosed within << >>
def remove_values(text):
    return re.sub('<<.*?>>', '', text)

# Apply the function to the 'answer' column of the DataFrame
train_data['answer'] = train_data['answer'].apply(remove_values)
test_data['answer'] = test_data['answer'].apply(remove_values)
'''

"\n# Define a function to remove values enclosed within << >>\ndef remove_values(text):\n    return re.sub('<<.*?>>', '', text)\n\n# Apply the function to the 'answer' column of the DataFrame\ntrain_data['answer'] = train_data['answer'].apply(remove_values)\ntest_data['answer'] = test_data['answer'].apply(remove_values)\n"

In [6]:
# Set the random seed for reproducibility
random_seed = 42

# Define the proportion of data to be used for validation
validation_ratio = 0.20

# Randomly sample the DataFrame to create the validation set
val_data = train_data.sample(frac=validation_ratio, random_state=random_seed)

# Create the train set by excluding the validation set
train_data = train_data.drop(val_data.index)

# Print the shapes of train and validation sets
print("Train set shape:", train_data.shape)
print("Validation set shape:", val_data.shape)

Train set shape: (5978, 3)
Validation set shape: (1495, 3)


In [7]:
train_data.shape

(5978, 3)

In [8]:
print(train_data.shape, val_data.shape)

(5978, 3) (1495, 3)


In [5]:
pretrained_model_path = 'checkpoints/T5_span_loss_large_v2'
model = T5ForConditionalGeneration.from_pretrained(pretrained_model_path)
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_path)



In [10]:
#model.config

In [11]:
class GSMData(Dataset):
    def __init__(self, df, tokenizer, input_max_length=512, output_max_length=512):
        self.dataset = df
        self.tokenizer = tokenizer
        self.input_length = input_max_length
        self.output_length = output_max_length
    
    def __len__(self):
        return len(self.dataset)
    
    
    def __getitem__(self, index):
        
        data_row = self.dataset.iloc[index]
        source_encoding = tokenizer(
            data_row["question"],
            data_row["context"],
            max_length=self.input_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )
        
        target_encoding = tokenizer(
            data_row["answer"],
            max_length=self.output_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )
        
        labels = target_encoding["input_ids"]
        labels[labels == 0] = -100
        
        return dict(
            question=data_row["question"],
            context=data_row["context"],
            answer=data_row["answer"],
            input_ids=source_encoding["input_ids"].flatten(),
            attention_mask=source_encoding["attention_mask"].flatten(),
            labels=labels.flatten()
        )
            

In [12]:
class GSMDataModule(pl.LightningDataModule):
    def __init__(self, train_file, val_file, test_file, tokenizer_name_or_path, input_length=512, output_length=512, batch_size=2, num_workers=4):
        super().__init__()
        self.train_file = train_file
        self.val_file = val_file
        self.test_file = test_file
        self.tokenizer = T5Tokenizer.from_pretrained(tokenizer_name_or_path)
        self.input_length = input_length
        self.output_length = output_length
        self.batch_size = batch_size
        self.num_workers = num_workers
             
    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
            self.train_data = GSMData(self.train_file, self.tokenizer)
            self.val_data = GSMData(self.val_file, self.tokenizer)
        if stage == 'test' or stage is None:
            self.test_data = GSMData(self.test_file, self.tokenizer)
        
    def train_dataloader(self):
        return DataLoader(self.train_data, batch_size=self.batch_size, num_workers=self.num_workers)
     
    def val_dataloader(self):
        return DataLoader(self.val_data, batch_size=self.batch_size, num_workers=self.num_workers)
    
    def test_dataloader(self):
        return DataLoader(self.test_data, batch_size=self.batch_size, num_workers=self.num_workers)

In [13]:
data_module = GSMDataModule(train_data, val_data, test_data, pretrained_model_path)
data_module.setup()

In [6]:
class GSMModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(pretrained_model_path, return_dict = True)
        
    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids = input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits
    
    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=1e-5)

In [11]:
model = GSMModel()

In [16]:
checkpoint_callback = ModelCheckpoint(
    dirpath="Finetuned_checkpoint",
    filename='best_model_span_large',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min',
    save_weights_only=True,
)

In [17]:
# create a logger instance
logger = TensorBoardLogger(save_dir='logs/', name='GSM_Finetuning_Logs')

In [54]:
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs = 25,
    logger=logger
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [55]:
trainer.fit(model,data_module)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 783 M 
-----------------------------------------------------
783 M     Trainable params
0         Non-trainable params
783 M     Total params
3,132.600 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 2989: 'val_loss' reached 1.58178 (best 1.58178), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 5978: 'val_loss' reached 1.32776 (best 1.32776), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 2, global step 8967: 'val_loss' reached 1.16963 (best 1.16963), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 11956: 'val_loss' reached 1.03473 (best 1.03473), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 14945: 'val_loss' reached 0.97187 (best 0.97187), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 17934: 'val_loss' reached 0.94873 (best 0.94873), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 20923: 'val_loss' reached 0.92274 (best 0.92274), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 23912: 'val_loss' reached 0.90186 (best 0.90186), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 8, global step 26901: 'val_loss' reached 0.89086 (best 0.89086), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 9, global step 29890: 'val_loss' reached 0.87498 (best 0.87498), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 10, global step 32879: 'val_loss' reached 0.87007 (best 0.87007), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 11, global step 35868: 'val_loss' reached 0.86251 (best 0.86251), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 12, global step 38857: 'val_loss' reached 0.85703 (best 0.85703), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 13, global step 41846: 'val_loss' reached 0.85261 (best 0.85261), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 14, global step 44835: 'val_loss' reached 0.85228 (best 0.85228), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 15, global step 47824: 'val_loss' reached 0.85154 (best 0.85154), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 16, global step 50813: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 17, global step 53802: 'val_loss' reached 0.84827 (best 0.84827), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_span_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 18, global step 56791: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 19, global step 59780: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 20, global step 62769: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 21, global step 65758: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 22, global step 68747: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 23, global step 71736: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 24, global step 74725: 'val_loss' was not in top 1
`Trainer.fit` stopped: `max_epochs=25` reached.


In [56]:
trainer.test(model, datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.8776357769966125
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.8776357769966125}]

In [7]:
### predictions
finetuned_model = GSMModel.load_from_checkpoint("Finetuned_checkpoint/best_model_span_large.ckpt")
finetuned_model.freeze()
#finetuned_model.eval()

In [8]:
def generate_answer(question, return_sequences):
    source_encoding = tokenizer(
        question["question"],
        question["context"],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # Move the input tensors to the same device as the model
    input_ids = source_encoding["input_ids"].to(finetuned_model.device)
    attention_mask = source_encoding["attention_mask"].to(finetuned_model.device)

    generated_ids = finetuned_model.model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        num_beams=120,
        max_length=512,
        early_stopping=True,
        use_cache=True,
        repetition_penalty=3.0,
        #temperature=1,
        num_return_sequences = return_sequences
    )

    preds = [tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_space=True)
             for gen_id in generated_ids]

    return preds#" ".join(preds)


In [12]:
device = torch.device('cpu')
finetuned_model = finetuned_model.to(device)

In [None]:
pass_outcome = {}

# Generate sequences for the largest value in pass_list
max_return_seq = 80 #max(pass_list)
sequences = {}  # Dictionary to store the generated sequences for each question

for i in range(586, len(test_data)):
    question = test_data.iloc[i]
    sequences[i] = generate_answer(question, max_return_seq)

In [None]:
len(sequences)

In [None]:
pd.DataFrame(sequences).to_csv('predictions_t5_large_jaccard_586+.csv', index=False)

In [5]:
pred1 = pd.read_csv('predictions_t5_large_jaccard_586.csv')
pred2 = pd.read_csv('predictions_t5_large_jaccard_586+.csv')
pred1.shape, pred2.shape

((80, 586), (80, 733))

In [6]:
df_combined = pd.concat([pred1, pred2], axis=1)
df_combined.to_csv('predictions_t5_large_jaccard_temp1_penality3.csv', index=False)
df_combined.shape

(80, 1319)

In [7]:
# Convert the DataFrame to a dictionary
df_combined_dict = df_combined.to_dict(orient='list')

In [8]:
pass_outcome = {}
max_return_seq = 80

for return_seq in range(1,max_return_seq+1):
    correct_predictions = 0
    for i in range(len(test_data)):
        predictions = df_combined_dict[str(i)][:return_seq]  # Use the pre-generated sequences up to return_seq
        question = test_data.iloc[i]
        answer = question.answer
        ground_truth = question.answer.split('#### ')[-1]
        for generated_seq in predictions:
            # Extract the final answer from the generated sequence
            generated_answer = generated_seq.split('#### ')[-1]
            if generated_answer == ground_truth:
                correct_predictions += 1
                break
    pass_outcome[return_seq] = correct_predictions

    # Calculate the accuracy
    accuracy = correct_predictions / (return_seq * len(test_data))
    print(f'Absolute Accuracy for pass@:{return_seq} is {accuracy}')
    pass_accuracy = correct_predictions / len(test_data)
    print(f'Pass Accuracy for pass@:{return_seq} is {pass_accuracy}')

Absolute Accuracy for pass@:1 is 0.12736921910538287
Pass Accuracy for pass@:1 is 0.12736921910538287
Absolute Accuracy for pass@:2 is 0.07657316148597422
Pass Accuracy for pass@:2 is 0.15314632297194844
Absolute Accuracy for pass@:3 is 0.058630275461207984
Pass Accuracy for pass@:3 is 0.17589082638362397
Absolute Accuracy for pass@:4 is 0.047763457164518575
Pass Accuracy for pass@:4 is 0.1910538286580743
Absolute Accuracy for pass@:5 is 0.0401819560272934
Pass Accuracy for pass@:5 is 0.20090978013646701
Absolute Accuracy for pass@:6 is 0.03538033864038413
Pass Accuracy for pass@:6 is 0.21228203184230476
Absolute Accuracy for pass@:7 is 0.032275533412758586
Pass Accuracy for pass@:7 is 0.2259287338893101
Absolute Accuracy for pass@:8 is 0.029757391963608795
Pass Accuracy for pass@:8 is 0.23805913570887036
Absolute Accuracy for pass@:9 is 0.02720916519248589
Pass Accuracy for pass@:9 is 0.244882486732373
Absolute Accuracy for pass@:10 is 0.02532221379833207
Pass Accuracy for pass@:10 is

In [9]:
pass_outcome = {}

# Generate sequences for the largest value in pass_list
max_return_seq = 50 #max(pass_list)
sequences = {}  # Dictionary to store the generated sequences for each question

for i in range(len(test_data)):
    question = test_data.iloc[i]
    sequences[i] = generate_answer(question, max_return_seq)

print("sequence generation complete")
# Iterate over pass_list and calculate metrics using the generated sequences
for return_seq in range(1,max_return_seq+1):
    correct_predictions = 0
    for i in range(len(test_data)):

        predictions = sequences[i][:return_seq]  # Use the pre-generated sequences up to return_seq
        question = test_data.iloc[i]
        answer = question.answer
        ground_truth = question.answer.split('#### ')[-1]
        
        for generated_seq in predictions:
            # Extract the final answer from the generated sequence
            generated_answer = generated_seq.split('#### ')[-1]
            if generated_answer == ground_truth:
                correct_predictions += 1
                break
            
    pass_outcome["pass@"+str(return_seq)] = correct_predictions

    # Calculate the accuracy
    accuracy = correct_predictions / (return_seq * len(test_data))
    print(f'Absolute Accuracy for pass@:{return_seq} is {accuracy}')
    pass_accuracy = correct_predictions / len(test_data)
    print(f'Pass Accuracy for pass@:{return_seq} is {pass_accuracy}')

RuntimeError: CUDA out of memory. Tried to allocate 122.00 MiB (GPU 0; 47.54 GiB total capacity; 39.33 GiB already allocated; 9.31 MiB free; 46.57 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
pd.DataFrame(sequences).to_csv('predictions_t5_large_jaccard.csv', index=False)

In [11]:
len(sequences)

586

In [None]:
pd.DataFrame(sequences).to_csv('predictions_t5_large_jaccard.csv', index=False)  # Save the test_data with predictions as a CSV file

import csv
filename = 'pass_outcome_t5_large_jaccard.csv'

with open(filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Pass', 'Outcome'])
    for key, value in pass_outcome.items():
        writer.writerow([key, value])

In [9]:
import csv
filename = 'pass_outcome_t5_large_jaccard_temp1_penality3.csv'

with open(filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Pass', 'Outcome'])
    for key, value in pass_outcome.items():
        writer.writerow([key, value])

## Below is the pass sequence computation when we change the temperature

In [7]:
def generate_answer(question, return_sequences, temp):
    source_encoding = tokenizer(
        question["question"],
        question["context"],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # Move the input tensors to the same device as the model
    input_ids = source_encoding["input_ids"].to(finetuned_model.device)
    attention_mask = source_encoding["attention_mask"].to(finetuned_model.device)

    generated_ids = finetuned_model.model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        num_beams=80,
        max_length=512,
        early_stopping=True,
        use_cache=True,
        repetition_penalty=3.0,
        temperature=temp,
        num_return_sequences = return_sequences
    )

    preds = [tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_space=True)
             for gen_id in generated_ids]

    return preds#" ".join(preds)


In [None]:
pass_outcome = {}

# Generate sequences for the largest value in pass_list
max_return_seq = 50 
sequences = {}  # Dictionary to store the generated sequences for each question
temp = 1

for i in range(len(test_data)):
    question = test_data.iloc[i]
    sequences[i] = generate_answer(question, max_return_seq, temp)

print("sequence generation complete")
# Iterate over pass_list and calculate metrics using the generated sequences
for return_seq in range(1,max_return_seq+1):
    correct_predictions = 0
    for i in range(len(test_data)):
        predictions = sequences[i][:return_seq]  # Use the pre-generated sequences up to return_seq
        question = test_data.iloc[i]
        answer = question.answer
        ground_truth = question.answer.split('#### ')[-1]
        for generated_seq in predictions:
            # Extract the final answer from the generated sequence
            generated_answer = generated_seq.split('#### ')[-1]
            if generated_answer == ground_truth:
                correct_predictions += 1
                break
    pass_outcome[return_seq] = correct_predictions

    # Calculate the accuracy
    pass_accuracy = correct_predictions / len(test_data)
    print(f'Pass Accuracy for pass@:{return_seq} is {pass_accuracy}')

In [None]:
pd.DataFrame(sequences).to_csv('predictions_t5_large_jc_temp1_penality_3.csv', index=False)  # Save the test_data with predictions as a CSV file
import csv
filename = 'pass_outcome_t5_large_jc_temp1_penality_3.csv'

with open(filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Pass', 'Outcome'])
    for key, value in pass_outcome.items():
        writer.writerow([key, value])

In [None]:
pass_outcome = {}

# Generate sequences for the largest value in pass_list
max_return_seq = 50 #max(pass_list)
sequences = {}  # Dictionary to store the generated sequences for each question
temp = 0.9

for i in range(len(test_data)):
    question = test_data.iloc[i]
    sequences[i] = generate_answer(question, max_return_seq, temp)

print("sequence generation complete")
# Iterate over pass_list and calculate metrics using the generated sequences
for return_seq in range(1,max_return_seq+1):
    correct_predictions = 0
    for i in range(len(test_data)):
        predictions = sequences[i][:return_seq]  # Use the pre-generated sequences up to return_seq
        question = test_data.iloc[i]
        answer = question.answer
        ground_truth = question.answer.split('#### ')[-1]
        for generated_seq in predictions:
            # Extract the final answer from the generated sequence
            generated_answer = generated_seq.split('#### ')[-1]
            if generated_answer == ground_truth:
                correct_predictions += 1
                break
    pass_outcome[return_seq] = correct_predictions

    # Calculate the accuracy
    pass_accuracy = correct_predictions / len(test_data)
    print(f'Pass Accuracy for pass@:{return_seq} is {pass_accuracy}')

sequence generation complete
Pass Accuracy for pass@:1 is 0.12054586808188021
Pass Accuracy for pass@:2 is 0.14935557240333586
Pass Accuracy for pass@:3 is 0.16982562547384383
Pass Accuracy for pass@:4 is 0.19029567854435178
Pass Accuracy for pass@:5 is 0.20318423047763456
Pass Accuracy for pass@:6 is 0.2137983320697498
Pass Accuracy for pass@:7 is 0.22365428354814254
Pass Accuracy for pass@:8 is 0.2304776345716452
Pass Accuracy for pass@:9 is 0.24184988627748294
Pass Accuracy for pass@:10 is 0.2532221379833207
Pass Accuracy for pass@:11 is 0.2630780894617134
Pass Accuracy for pass@:12 is 0.2676269901440485
Pass Accuracy for pass@:13 is 0.2721758908263836
Pass Accuracy for pass@:14 is 0.27748294162244125
Pass Accuracy for pass@:15 is 0.27824109173616374
Pass Accuracy for pass@:16 is 0.2835481425322214
Pass Accuracy for pass@:17 is 0.29037149355572406
Pass Accuracy for pass@:18 is 0.2941622441243366
Pass Accuracy for pass@:19 is 0.2979529946929492
Pass Accuracy for pass@:20 is 0.2994692

In [None]:
pd.DataFrame(sequences).to_csv('predictions_t5_large_jc_temp90.csv', index=False)  # Save the test_data with predictions as a CSV file
import csv
filename = 'pass_outcome_t5_large_jc_temp90.csv'

with open(filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Pass', 'Outcome'])
    for key, value in pass_outcome.items():
        writer.writerow([key, value])

In [None]:
pass_outcome = {}

# Generate sequences for the largest value in pass_list
max_return_seq = 50 #max(pass_list)
sequences = {}  # Dictionary to store the generated sequences for each question
temp = 0.85

for i in range(len(test_data)):
    question = test_data.iloc[i]
    sequences[i] = generate_answer(question, max_return_seq, temp)

print("sequence generation complete")
# Iterate over pass_list and calculate metrics using the generated sequences
for return_seq in range(1,max_return_seq+1):
    correct_predictions = 0
    for i in range(len(test_data)):
        predictions = sequences[i][:return_seq]  # Use the pre-generated sequences up to return_seq
        question = test_data.iloc[i]
        answer = question.answer
        ground_truth = question.answer.split('#### ')[-1]
        for generated_seq in predictions:
            # Extract the final answer from the generated sequence
            generated_answer = generated_seq.split('#### ')[-1]
            if generated_answer == ground_truth:
                correct_predictions += 1
                break
    pass_outcome[return_seq] = correct_predictions

    # Calculate the accuracy
    accuracy = correct_predictions / (return_seq * len(test_data))
    print(f'Absolute Accuracy for pass@:{return_seq} is {accuracy}')
    pass_accuracy = correct_predictions / len(test_data)
    print(f'Pass Accuracy for pass@:{return_seq} is {pass_accuracy}')

sequence generation complete
Absolute Accuracy for pass@:1 is 0.12054586808188021
Pass Accuracy for pass@:1 is 0.12054586808188021
Absolute Accuracy for pass@:2 is 0.07467778620166793
Pass Accuracy for pass@:2 is 0.14935557240333586
Absolute Accuracy for pass@:3 is 0.056608541824614604
Pass Accuracy for pass@:3 is 0.16982562547384383
Absolute Accuracy for pass@:4 is 0.047573919636087945
Pass Accuracy for pass@:4 is 0.19029567854435178
Absolute Accuracy for pass@:5 is 0.04063684609552692
Pass Accuracy for pass@:5 is 0.20318423047763456
Absolute Accuracy for pass@:6 is 0.0356330553449583
Pass Accuracy for pass@:6 is 0.2137983320697498
Absolute Accuracy for pass@:7 is 0.03195061193544893
Pass Accuracy for pass@:7 is 0.22365428354814254
Absolute Accuracy for pass@:8 is 0.02880970432145565
Pass Accuracy for pass@:8 is 0.2304776345716452
Absolute Accuracy for pass@:9 is 0.026872209586386992
Pass Accuracy for pass@:9 is 0.24184988627748294
Absolute Accuracy for pass@:10 is 0.02532221379833207

In [None]:
pd.DataFrame(sequences).to_csv('predictions_t5_large_jc_temp85.csv', index=False)  # Save the test_data with predictions as a CSV file
import csv
filename = 'pass_outcome_t5_large_jc_temp85.csv'

with open(filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Pass', 'Outcome'])
    for key, value in pass_outcome.items():
        writer.writerow([key, value])

In [None]:
pass_outcome = {}

# Generate sequences for the largest value in pass_list
max_return_seq = 50 #max(pass_list)
sequences = {}  # Dictionary to store the generated sequences for each question
temp = 0.8

for i in range(len(test_data)):
    question = test_data.iloc[i]
    sequences[i] = generate_answer(question, max_return_seq, temp)

print("sequence generation complete")
# Iterate over pass_list and calculate metrics using the generated sequences
for return_seq in range(1,max_return_seq+1):
    correct_predictions = 0
    for i in range(len(test_data)):
        predictions = sequences[i][:return_seq]  # Use the pre-generated sequences up to return_seq
        question = test_data.iloc[i]
        answer = question.answer
        ground_truth = question.answer.split('#### ')[-1]
        for generated_seq in predictions:
            # Extract the final answer from the generated sequence
            generated_answer = generated_seq.split('#### ')[-1]
            if generated_answer == ground_truth:
                correct_predictions += 1
                break
    pass_outcome[return_seq] = correct_predictions

    # Calculate the accuracy
    accuracy = correct_predictions / (return_seq * len(test_data))
    print(f'Absolute Accuracy for pass@:{return_seq} is {accuracy}')
    pass_accuracy = correct_predictions / len(test_data)
    print(f'Pass Accuracy for pass@:{return_seq} is {pass_accuracy}')

sequence generation complete
Absolute Accuracy for pass@:1 is 0.12054586808188021
Pass Accuracy for pass@:1 is 0.12054586808188021
Absolute Accuracy for pass@:2 is 0.07467778620166793
Pass Accuracy for pass@:2 is 0.14935557240333586
Absolute Accuracy for pass@:3 is 0.056608541824614604
Pass Accuracy for pass@:3 is 0.16982562547384383
Absolute Accuracy for pass@:4 is 0.047573919636087945
Pass Accuracy for pass@:4 is 0.19029567854435178
Absolute Accuracy for pass@:5 is 0.04063684609552692
Pass Accuracy for pass@:5 is 0.20318423047763456
Absolute Accuracy for pass@:6 is 0.0356330553449583
Pass Accuracy for pass@:6 is 0.2137983320697498
Absolute Accuracy for pass@:7 is 0.03195061193544893
Pass Accuracy for pass@:7 is 0.22365428354814254
Absolute Accuracy for pass@:8 is 0.02880970432145565
Pass Accuracy for pass@:8 is 0.2304776345716452
Absolute Accuracy for pass@:9 is 0.026872209586386992
Pass Accuracy for pass@:9 is 0.24184988627748294
Absolute Accuracy for pass@:10 is 0.02532221379833207

In [None]:
pd.DataFrame(sequences).to_csv('predictions_t5_large_jc_temp80.csv', index=False)  # Save the test_data with predictions as a CSV file
import csv
filename = 'pass_outcome_t5_large_jc_temp80.csv'

with open(filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Pass', 'Outcome'])
    for key, value in pass_outcome.items():
        writer.writerow([key, value])

## Temperature iterations finished

In [17]:
question = test_data.iloc[586]
question

context     Hannah needs to drink 100 ml of water for ever...
question         How many ml of water does she need to drink?
answer      First find the total calories burned on aerobi...
Name: 586, dtype: object

In [18]:
question.answer

'First find the total calories burned on aerobics: 2 hours * 500 calories/hour = <<2*500=1000>>1000 calories\nThen add the calories burned running to find the total calories burned: 600 calories + 1000 calories = <<600+1000=1600>>1600 calories\nFinally, divide that number by the ratio of calories burned to water drunk to find how many water Hannah needs to drink: 1600 calories * 100 ml/200 calories = <<1600*100/200=800>>800 ml\n#### 800'

In [19]:
answer = generate_answer(question, 1)
answer

RuntimeError: CUDA out of memory. Tried to allocate 122.00 MiB (GPU 0; 47.54 GiB total capacity; 39.33 GiB already allocated; 9.31 MiB free; 46.57 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [25]:
correct_predictions = 0

for generated_seq in answer:
    # Extract the final answer from the generated sequence
    generated_answer = generated_seq.split('#### ')[-1]
    actual_answer = question.answer.split('#### ')[-1]
    # Compare the generated answer with the ground truth
    if generated_answer == actual_answer:
        correct_predictions += 1

# Calculate the accuracy
accuracy = correct_predictions / len(answer)
print(f'Accuracy: {accuracy}')

Accuracy: 0.0


In [51]:
len(test_data)

1319

In [57]:
correct_predictions = 0
predictions_list = []

for i in range(len(test_data)):
    question = test_data.iloc[i]
    answer = question.answer
    
    ground_truth = question.answer.split('#### ')[-1]
    predictions = generate_answer(question)
    predictions_list.append(predictions)
    for generated_seq in predictions:
        # Extract the final answer from the generated sequence
        generated_answer = generated_seq.split('#### ')[-1]
        if generated_answer == actual_answer:
            correct_predictions += 1

# Calculate the accuracy
accuracy = correct_predictions / (5 * len(test_data))
print(f'Accuracy: {accuracy}')

Accuracy: 0.02031842304776346


In [59]:
correct_predictions

134

In [None]:
### Finetuning on the Large T5 model to set the baseline
pretrained_model_path2 = 'google/t5-v1_1-large'

In [None]:
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_path2)
model = T5ForConditionalGeneration.from_pretrained(pretrained_model_path2)

In [22]:
class T5_large(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(pretrained_model_path2, return_dict = True)
        
    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids = input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits
    
    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=1e-5)


model2 = T5_large()

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/607 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
checkpoint_callback = ModelCheckpoint(
    dirpath="Finetuned_checkpoint",
    filename='best_model_large',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min',
    save_weights_only=True,
)

In [None]:
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs = 25,
    #logger=logger
)

In [70]:
trainer.fit(model2,data_module)

Missing logger folder: /home/work/GSM8K/CustomT5/lightning_logs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 783 M 
-----------------------------------------------------
783 M     Trainable params
0         Non-trainable params
783 M     Total params
3,132.600 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 2989: 'val_loss' reached 4.39661 (best 4.39661), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 5978: 'val_loss' reached 1.85001 (best 1.85001), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 2, global step 8967: 'val_loss' reached 1.46090 (best 1.46090), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 11956: 'val_loss' reached 1.28854 (best 1.28854), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 14945: 'val_loss' reached 1.18146 (best 1.18146), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 17934: 'val_loss' reached 1.10508 (best 1.10508), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 20923: 'val_loss' reached 1.05347 (best 1.05347), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 23912: 'val_loss' reached 1.01184 (best 1.01184), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 8, global step 26901: 'val_loss' reached 0.98198 (best 0.98198), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_large.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 9, global step 29890: 'val_loss' reached 0.95629 (best 0.95629), saving model to '/home/work/GSM8K/CustomT5/Finetuned_checkpoint/best_model_large.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


In [71]:
### predictions
finetuned_model2 = T5_large.load_from_checkpoint("Finetuned_checkpoint/best_model_large.ckpt")
finetuned_model2.freeze()

def generate_answer(question, return_sequences):
    source_encoding = tokenizer(
        question["question"],
        question["context"],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # Move the input tensors to the same device as the model
    input_ids = source_encoding["input_ids"].to(finetuned_model.device)
    attention_mask = source_encoding["attention_mask"].to(finetuned_model.device)

    generated_ids = finetuned_model2.model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        num_beams=20,
        max_length=512,
        early_stopping=True,
        use_cache=True,
        repetition_penalty=2.5,
        #temperature=1,
        num_return_sequences = return_sequences
    )

    preds = [tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_space=True)
             for gen_id in generated_ids]

    return preds#" ".join(preds)


In [None]:
correct_predictions = 0
predictions_list2 = []
pass_outcome = {}

# Generate sequences for the largest value in pass_list
max_return_seq = 1 #max(pass_list)
sequences = {}  # Dictionary to store the generated sequences for each question

for i in range(len(test_data)):
    question = test_data.iloc[i]
    answer = question.answer
    ground_truth = question.answer.split('#### ')[-1]
    sequences[i] = generate_answer(question, max_return_seq)

# Iterate over pass_list and calculate metrics using the generated sequences
for return_seq in range(1,51):
    correct_predictions = 0
    for i in range(len(test_data)):
        predictions = sequences[i][:return_seq]  # Use the pre-generated sequences up to return_seq
        for generated_seq in predictions:
            # Extract the final answer from the generated sequence
            generated_answer = generated_seq.split('#### ')[-1]
            if generated_answer == ground_truth:
                correct_predictions += 1

    pass_outcome[return_seq] = correct_predictions

    # Calculate the accuracy
    accuracy = correct_predictions / (return_seq * len(test_data))
    print(f'Absolute Accuracy for pass@:{return_seq} is {accuracy}')
    pass_accuracy = correct_predictions / len(test_data)
    print(f'Pass Accuracy for pass@:{return_seq} is {pass_accuracy}')

sequences.to_csv('predictions_t5_large.csv', index=False)  # Save the test_data with predictions as a CSV file
pass_outcome.to_csv('pass_outcomes_t5_large.csv', index=False)  # Save the pass_outcome as a CSV file