In [None]:
# Auto reload modules
%load_ext autoreload
%autoreload 2

# Set CUDA_VISIBLE_DEVICES
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# matplotlib setting
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8, 8)
plt.rcParams["font.size"] = 14
%matplotlib inline

In [6]:
import torch
import pickle
import os
import sys
import datasets

import numpy as np
import pytorch_lightning as pl

from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric, load_from_disk
from transformers import AutoModel, AutoModelForSeq2SeqLM, AutoConfig, AutoModelForMaskedLM, AutoModelForCausalLM, default_data_collator
from pytorch_lightning.callbacks import ModelCheckpoint
from plt_model import LitModel
from pytorch_lightning.loggers import TensorBoardLogger
from dataset import MyDataLoader, prepare_data, preprocess_datasets
from pytorch_lightning.callbacks import Callback


from train import main as train_func
from generate import main as generate_func
from evaluate import main as evaluate_func
from evaluate_utils import basic_stats, pvalue_calculator


In [7]:
def prepare_data(path='./data/babiqa'):
    """Return the bAbI QA (`babi_qa`, config `en-10k-qa1`) dataset without its "test" split.

    Note: this definition rebinds the `prepare_data` name that the import
    cell also pulls in from the project's `dataset` module.

    Args:
        path: Directory used as an on-disk cache. If it exists, the dataset is
            loaded from it; otherwise the dataset is downloaded and saved
            there. Pass ``None`` to download without caching to disk.

    Returns:
        The loaded dataset mapping with the "test" entry removed (if present).
    """
    if path is not None and os.path.isdir(path):
        print("Dataset already downloaded and processed")
        dataset = load_from_disk(path)
    else:
        print("Downloading and processing dataset...")
        dataset = load_dataset('babi_qa', 'en-10k-qa1')
        # With path=None there is nowhere to cache, so only keep it in memory.
        if path is not None:
            dataset.save_to_disk(path)
    # Tolerate a cached copy that has no "test" split instead of raising KeyError.
    dataset.pop("test", None)
    return dataset

In [8]:
# Load the dataset with prepare_data's default cache path; the returned
# object no longer contains the "test" split.
dataset = prepare_data()

Downloading and processing dataset...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|██████████| 41.3k/41.3k [00:00<00:00, 22.9MB/s]
Downloading metadata: 100%|██████████| 369k/369k [00:00<00:00, 5.54MB/s]
Downloading readme: 100%|██████████| 106k/106k [00:00<00:00, 1.66MB/s]
Downloading data: 100%|██████████| 15.7M/15.7M [00:01<00:00, 8.64MB/s]
Generating train split: 100%|██████████| 2000/2000 [00:00<00:00, 4273.51 examples/s]
Generating test split: 100%|██████████| 200/200 [00:00<00:00, 1916.60 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2000/2000 [00:00<00:00, 278709.81 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 200/200 [00:00<00:00, 43845.95 examples/s]


In [11]:
# Keep only the "train" split. The isinstance guard makes this cell safe to
# re-run: once `dataset` is already the train split, indexing it with "train"
# again would fail, so the rebinding is skipped.
if isinstance(dataset, datasets.DatasetDict):
    dataset = dataset["train"]

In [15]:
# Show which fields the 'story' entry of the first example provides.
print(dataset[0]['story'].keys())

dict_keys(['id', 'type', 'text', 'supporting_ids', 'answer'])


In [22]:
# Flatten each story into one QA example per question line.
# Lines ending in '?' are treated as questions; every other line is appended
# to the running context. A question's context is all preceding non-question
# lines of the same story followed by the question itself.
processed_data = []
for record in dataset:
    story = record['story']
    context = ""
    for ind, text in enumerate(story["text"]):
        # endswith() instead of text[-1] so an empty line cannot raise IndexError.
        if text.endswith('?'):
            # Question found: the answer sits at the same index as the question.
            processed_data.append({"context": context + text, "question": text, "answer": story["answer"][ind]})
        else:
            context += text + " "


In [23]:
# Number of QA examples produced by the flattening step.
len(processed_data)

10000

In [25]:
# Spot-check the first two QA examples (context / question / answer).
print(processed_data[0])
print(processed_data[1])

{'context': 'Mary moved to the bathroom. John went to the hallway. Where is Mary?', 'question': 'Where is Mary?', 'answer': 'bathroom'}
{'context': 'Mary moved to the bathroom. John went to the hallway. Daniel went back to the hallway. Sandra moved to the garden. Where is Daniel?', 'question': 'Where is Daniel?', 'answer': 'hallway'}


In [None]:
from evaluate_utils import basic_stats, exam_taker_babi

In [None]:
# --- Experiment settings -----------------------------------------------------
# Hugging Face model id; it is used below to pick the tokenizer.
tag_model = "meta-llama/Llama-3.2-1B"
# tag_model = "facebook/opt-125m"
evaluate_KT = "kt_dataset_gpt_mini.json"
experiment_stage = 0
total_fractions = 2
data_fractions = 1 / total_fractions  # share of the data per fraction
batch_size = 32
num_devices = 2
num_epochs = 3

# --- Model checkpoint --------------------------------------------------------
# NOTE(review): the recorded output of this cell is
# "IsADirectoryError: [Errno 21] Is a directory: '/scr'" - confirm the
# checkpoint path is reachable from the machine running the notebook.
checkpoint_path = "/scr/echollm/checkpoints/focal_2.0/frac6/meta-llama/Llama-3.2-1B/M0-llama1b-batch16-lr2e-05-gamma2.0-seed10/last.ckpt"
model = LitModel.load_from_checkpoint(checkpoint_path, map_location="cpu")

# --- Tokenizer ---------------------------------------------------------------
# Downloads are cached under ./model_cache_dir.
tokenizer = AutoTokenizer.from_pretrained(
    tag_model,
    cache_dir='./model_cache_dir',
    return_dict=True,
)

IsADirectoryError: [Errno 21] Is a directory: '/scr'

In [None]:
# Run the bAbI evaluation helper on the flattened QA examples, then summarise.
# NOTE(review): exam_taker_babi and basic_stats live in evaluate_utils and are
# not visible here; the unpacking assumes basic_stats returns exactly two
# values - confirm against that module.
results = exam_taker_babi(model, tokenizer, processed_data)
accuracy, confidence = basic_stats(results)

In [5]:
# Load the generated dataset for stage 5.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by a trusted pipeline.
# The context manager closes the file handle, which the bare open() call leaked.
with open("/scr/echollm/generated_datasets/llama1b/focal_2.0/frac_6/batch16-lr2e-5-seed10/stage5.pkl", 'rb') as pkl_file:
    gen_data = pickle.load(pkl_file)

In [10]:
# Decode every generated sample and concatenate the text (no separator).
# "".join builds the result in one pass instead of repeatedly extending a
# multi-megabyte string with +=.
gen_text = "".join(
    tokenizer.decode(gen_data[i]['input_ids']) for i in range(len(gen_data))
)

In [11]:
# Total number of characters in the concatenated decoded text.
len(gen_text)

10956869

In [12]:
# Dump the decoded text to disk for inspection.
# An explicit UTF-8 encoding keeps the output independent of the platform's
# default locale and avoids UnicodeEncodeError on non-ASCII decoded tokens.
with open("/scr/echollm/stage5.txt", "w", encoding="utf-8") as f:
    f.write(gen_text)

In [None]:
# Load the stage-0 / fraction-0 generated dataset for inspection.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by a trusted pipeline.
# The context manager closes the file handle, which the bare open() call leaked.
with open("/scr/echollm/generated_datasets/llama1b/frac2/batch16-lr2e-05-seed0/stage0_fraction0.pkl", 'rb') as pkl_file:
    gen_data_possibly_wrong = pickle.load(pkl_file)

In [None]:
# Print up to the first 200 decoded samples; min() keeps the loop from
# raising IndexError when fewer than 200 samples were loaded.
for i in range(min(200, len(gen_data_possibly_wrong))):
    print(tokenizer.decode(gen_data_possibly_wrong[i]['input_ids']))