In [1]:
import datasets
import os
import pandas as pd
from compgen import recogs_exact_match

# Relative directory holding the ReCOGS split files and vocab files used below.
SRC_DIRNAME = os.path.join("data", "recogs")

def load_split(filename):
    """Read one headerless, tab-separated split file into a DataFrame.

    The three columns are named 'input', 'output' and 'category', in
    that order.
    """
    column_names = ['input', 'output', 'category']
    split_df = pd.read_csv(filename, delimiter="\t", names=column_names)
    return split_df

# Maps split name ("train", "dev", "gen") to the DataFrame returned by
# `load_split` for the matching `<split>.tsv` file under SRC_DIRNAME.
dataset = {}

for splitname in ("train", "dev", "gen"):
    dataset[splitname] = load_split(f"{SRC_DIRNAME}/{splitname}.tsv")
    


In [2]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast


def get_tokenizer(vocab_filename):
    """Build a whitespace-splitting, word-level fast tokenizer.

    Parameters
    ----------
    vocab_filename : str
        Path to a vocabulary file with one token per line; a token's
        id is its zero-based line index.

    Returns
    -------
    PreTrainedTokenizerFast
        Wraps every single sequence as `[BOS] ... [EOS]`.
    """
    with open(vocab_filename) as vocab_file:
        tokens = vocab_file.read().splitlines()
    token_to_id = {token: idx for idx, token in enumerate(tokens)}

    core = Tokenizer(WordLevel(token_to_id, unk_token='[UNK]'))
    # The special tokens must be registered both here and again in the
    # `PreTrainedTokenizerFast` constructor below. The duplication is
    # deliberate; do not remove either registration.
    core.add_special_tokens(["[BOS]", "[UNK]", "[PAD]", "[EOS]"])
    core.pre_tokenizer = WhitespaceSplit()

    bos_id = core.token_to_id("[BOS]")
    eos_id = core.token_to_id("[EOS]")
    core.post_processor = TemplateProcessing(
        single="[BOS]:0 $A:0 [EOS]:0",
        special_tokens=[("[BOS]", bos_id), ("[EOS]", eos_id)])

    return PreTrainedTokenizerFast(
        tokenizer_object=core,
        bos_token="[BOS]",
        unk_token="[UNK]",
        pad_token="[PAD]",
        eos_token="[EOS]",
        # This is vital: without it, decoding strips the space before
        # periods, which is wrong for COGS/ReCOGS.
        clean_up_tokenization_spaces=False)

# enc_tokenizer = get_tokenizer(os.path.join(SRC_DIRNAME, "src_vocab.txt"))
# dec_tokenizer = get_tokenizer(os.path.join(SRC_DIRNAME, "tgt_vocab.txt"))


import torch

class RecogsDataset(torch.utils.data.Dataset):
    """Dataset of tokenized inputs and, optionally, tokenized targets.

    Each item is a 1-tuple `(input_ids,)` when no targets were given
    and a 2-tuple `(input_ids, target_ids)` otherwise.
    """

    def __init__(self, enc_tokenizer, dec_tokenizer, X, y=None):
        """Encode every string in `X` (and in `y`, if given) up front.

        `enc_tokenizer` and `dec_tokenizer` only need an `encode`
        method that maps a string to a list of ids.
        """
        self.X = [enc_tokenizer.encode(text) for text in X]
        self.y = None if y is None else [dec_tokenizer.encode(text) for text in y]

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        source_ids = self.X[idx]
        if self.y is not None:
            return (source_ids, self.y[idx])
        return (source_ids,)

    @staticmethod
    def collate_fn(batch):
        """Pad a list of items to a common length and return a dict.

        This is a static method, so no tokenizer is available here;
        padding (with id 0) and the attention masks are built by hand.

        Returns a dict with "input_ids" and "attention_mask", plus
        "labels" (the padded target ids) when the items carry targets.
        """
        def pad_to_longest(sequences):
            sizes = [len(seq) for seq in sequences]
            width = max(sizes)
            padded = [
                seq + [0] * (width - size)
                for seq, size in zip(sequences, sizes)]
            masks = [[1] * size + [0] * (width - size) for size in sizes]
            return torch.tensor(padded), torch.tensor(masks)

        columns = list(zip(*batch))
        input_ids, attention_mask = pad_to_longest(columns[0])
        if len(columns) == 1:
            return {"input_ids": input_ids, "attention_mask": attention_mask}
        # The target mask is computed but not part of the returned batch.
        label_ids, _unused_label_mask = pad_to_longest(columns[1])
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": label_ids}


from torch_model_base import TorchModelBase
import torch.nn as nn
from transformers import EncoderDecoderModel

class RecogsLoss(nn.Module):
    """Pass-through criterion: the loss is read off the model outputs
    instead of being recomputed here."""

    def __init__(self):
        super(RecogsLoss, self).__init__()
        # Plain attribute; nothing inside this class reads it.
        self.reduction = "mean"

    def forward(self, outputs, labels):
        """Return `outputs.loss`.

        `labels` is accepted but unused: it was already consumed when
        `outputs.loss` was computed.
        """
        precomputed_loss = outputs.loss
        return precomputed_loss
    
class RecogsModule(nn.Module):
    """Thin `nn.Module` wrapper around the pretrained
    "ReCOGS/ReCOGS-model" encoder-decoder."""

    def __init__(self):
        super().__init__()
        pretrained_name = "ReCOGS/ReCOGS-model"
        self.encdec = EncoderDecoderModel.from_pretrained(pretrained_name)

    def forward(self, X_pad, X_mask, y_pad, y_mask, labels=None):
        """Run the wrapped model and return its outputs unchanged.

        `y_pad` is passed as the model's `labels` and `y_mask` as its
        decoder attention mask. The `labels` argument of this method
        is accepted but not used.
        """
        return self.encdec(
            labels=y_pad,
            decoder_attention_mask=y_mask,
            attention_mask=X_mask,
            input_ids=X_pad)
    
    
class RecogsModel(TorchModelBase):
    """`TorchModelBase` subclass that pairs a `RecogsModule` graph with
    word-level encoder/decoder tokenizers, and adds generation-based
    `predict` and exact-match `score` methods.
    """
    def __init__(self, *args,
            initialize=True,
            enc_vocab_filename=f"{SRC_DIRNAME}/src_vocab.txt",
            dec_vocab_filename=f"{SRC_DIRNAME}/tgt_vocab.txt",
            **kwargs):
        """
        Parameters
        ----------
        *args, **kwargs
            Forwarded unchanged to `TorchModelBase.__init__`.
        initialize : bool
            If True, `self.initialize()` is called once construction
            is done (that method is defined on the base class, not
            here; presumably it builds the graph -- TODO confirm).
        enc_vocab_filename, dec_vocab_filename : str
            One-token-per-line vocabulary files passed to
            `get_tokenizer` for the input and output sides.
        """
        self.enc_vocab_filename = enc_vocab_filename
        self.dec_vocab_filename = dec_vocab_filename
        self.enc_tokenizer = get_tokenizer(self.enc_vocab_filename)
        self.dec_tokenizer = get_tokenizer(self.dec_vocab_filename)
        super().__init__(*args, **kwargs)
        self.loss = RecogsLoss()
        if initialize:
            self.initialize()

    def build_graph(self):
        """Return a fresh `RecogsModule`."""
        return RecogsModule()

    def build_dataset(self, X, y=None):
        """Tokenize `X` (and `y`, if given) into a `RecogsDataset`."""
        return RecogsDataset(
            self.enc_tokenizer, self.dec_tokenizer, X, y=y)

    def predict(self, X, device=None):
        """Generate one output string per input string in `X`.

        Parameters
        ----------
        X : iterable of str
        device : str or None
            Device to run on; defaults to `self.device`.

        Returns
        -------
        list of str
            Decoded generations with special tokens removed.
        """
        device = self.device if device is None else torch.device(device)
        dataset = self.build_dataset(X)
        dataloader = self._build_dataloader(dataset, shuffle=False)
        self.model.to(device)
        self.model.eval()
        preds = []
        with torch.no_grad():
            for batch in dataloader:
                # `RecogsDataset.collate_fn` returns a dict for
                # unlabeled batches; iterating over a dict yields its
                # string keys, so the two tensors must be looked up by
                # name. Tuple-style `(X_pad, X_mask)` batches are still
                # accepted for collate functions that produce them.
                if isinstance(batch, dict):
                    X_pad = batch["input_ids"].to(device)
                    X_mask = batch["attention_mask"].to(device)
                else:
                    X_pad, X_mask = [x.to(device) for x in batch]
                outputs = self.model.encdec.generate(
                    X_pad,
                    attention_mask=X_mask,
                    max_new_tokens=512,
                    eos_token_id=self.model.encdec.config.eos_token_id)
                results = self.dec_tokenizer.batch_decode(
                    outputs,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False)
                preds += results
        return preds

    def score(self, X, y, device=None):
        """Return the fraction of predictions for `X` that
        `recogs_exact_match` judges equal to the gold strings in `y`."""
        # An overall accuracy score:
        preds = self.predict(X, device=device)
        vals = [int(recogs_exact_match(gold, pred)) for gold, pred in zip(y, preds)]
        return sum(vals) / len(vals)
    


In [3]:
# further model training
# Continue training the pretrained ReCOGS model on the train split, with
# early stopping enabled (`n_iter_no_change=10`).

recogs_ff = RecogsModel(
    batch_size=512,
    gradient_accumulation_steps=1,
    max_iter=100, 
    early_stopping=True,
    n_iter_no_change=10,
    optimizer_class=torch.optim.Adam,
    eta=0.00001)

# The return value of `fit` is bound to `_` so the cell displays nothing.
_ = recogs_ff.fit(dataset['train'].input, dataset['train'].output)

  return self.fget.__get__(instance, owner)()
Stopping after epoch 20. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 15.865672990679741

In [4]:
def test(gen_df, model):
    """Return a copy of `gen_df` with `prediction` and `correct` columns.

    `prediction` holds `model.predict` applied to the `input` column;
    `correct` holds `recogs_exact_match(output, prediction)` per row.
    The frame passed in is not modified.
    """
    scored_df = gen_df.copy()
    scored_df["prediction"] = model.predict(scored_df.input)
    scored_df["correct"] = scored_df.apply(
        lambda row: recogs_exact_match(row.output, row.prediction),
        axis=1)
    return scored_df


# Exact-match accuracy of the fine-tuned ReCOGS model on the dev split.
result_df = test(dataset['dev'], recogs_ff)
result_df['correct'].sum() / result_df.shape[0]

0.9663333333333334

In [5]:
# Exact-match accuracy of the fine-tuned ReCOGS model on the gen
# (generalization) split. The cell previously re-evaluated the dev split,
# which does not match the recorded output: 0.5645 is the figure the
# results summary later in this notebook reports for gen.
result_df = test(dataset['gen'], recogs_ff)
result_df['correct'].sum() / result_df.shape[0]

0.5645238095238095

In [6]:
# Unlabeled bakeoff inputs; the first TSV column becomes the index.
bakeoff_df = pd.read_csv(
    os.path.join(SRC_DIRNAME, "cs224u-recogs-test-unlabeled.tsv"), 
    sep="\t", index_col=0)

In [7]:
# Add the fine-tuned ReCOGS model's predictions and write the entry file
# to the current working directory.
bakeoff_df["prediction"] = recogs_ff.predict(bakeoff_df.input)

bakeoff_df.to_csv("cs224u-recogs-bakeoff-entry[fine_tuned_recogs_model].tsv", sep="\t")

In [3]:
# T5 model

import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

class T5RecogsModule(nn.Module):
    """Thin `nn.Module` wrapper around the pretrained "t5-small"
    sequence-to-sequence model."""

    def __init__(self):
        super().__init__()
        checkpoint = "t5-small"
        self.encdec = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    def forward(self, X_pad, X_mask, y_pad, y_mask, labels=None):
        """Run the wrapped model and return its outputs unchanged.

        `y_pad` is passed as the model's `labels` and `y_mask` as its
        decoder attention mask. The `labels` argument of this method
        is accepted but not used.
        """
        return self.encdec(
            labels=y_pad,
            decoder_attention_mask=y_mask,
            attention_mask=X_mask,
            input_ids=X_pad)

class T5RecogsModel(RecogsModel):
    """`RecogsModel` variant whose graph is `T5RecogsModule` and whose
    encoder and decoder share the pretrained "t5-small" tokenizer."""

    def __init__(self, *args, initialize=True, **kwargs):
        """
        Parameters
        ----------
        *args, **kwargs
            Forwarded to `RecogsModel.__init__`.
        initialize : bool
            Forwarded to `RecogsModel.__init__`. It is a keyword-only
            parameter there, so it has to be passed explicitly;
            otherwise the parent's default of True is always used and
            `initialize=False` has no effect.
        """
        super().__init__(*args, initialize=initialize, **kwargs)
        # Replace the word-level tokenizers set up by the parent with
        # the T5 tokenizer, shared between the input and output sides.
        self.enc_tokenizer = AutoTokenizer.from_pretrained("t5-small")
        self.dec_tokenizer = self.enc_tokenizer

    def build_graph(self):
        """Return a fresh `T5RecogsModule`."""
        return T5RecogsModule()

In [9]:
# Sanity check: generate from a T5 model that has not been fit here,
# using the first two dev inputs.
t5mod = T5RecogsModel()
t5_exs = dataset['dev'].input[: 2]

# Only the last expression of a cell is displayed, so this bare name
# produces no output.
t5_exs

t5mod.predict(t5_exs)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

['Liam hoffte, dass eine Box von einer Frau in der Hand gelegt wird.',
 'Der Donkey lended den Cookie an eine Mutter .']

In [4]:
# T5 model to be fine-tuned. Compared with `recogs_ff` it uses a smaller
# batch size (64) and two gradient-accumulation steps; the other
# settings are the same.
t5model = T5RecogsModel(batch_size=64,
    gradient_accumulation_steps=2,
    max_iter=100, 
    early_stopping=True,
    n_iter_no_change=10,
    optimizer_class=torch.optim.Adam,
    eta=0.00001)

In [5]:
# Fine-tune on the train split. As the last expression of the cell, the
# return value of `fit` is displayed.
t5model.fit(dataset['train'].input, dataset['train'].output)

Stopping after epoch 18. Validation score did not improve by tol=1e-05 for more than 10 epochs. Final error is 81.4184251409024

T5RecogsModel(
	batch_size=64,
	max_iter=100,
	eta=1e-05,
	optimizer_class=<class 'torch.optim.adam.Adam'>,
	l2_strength=0,
	gradient_accumulation_steps=2,
	max_grad_norm=None,
	validation_fraction=0.1,
	early_stopping=True,
	n_iter_no_change=10,
	warm_start=False,
	tol=1e-05)

In [6]:
def test(gen_df, model):
    """Return a copy of `gen_df` with `prediction` and `correct` columns.

    `prediction` holds `model.predict` applied to the `input` column;
    `correct` holds `recogs_exact_match(output, prediction)` per row.
    The frame passed in is not modified.
    """
    scored_df = gen_df.copy()
    scored_df["prediction"] = model.predict(scored_df.input)
    scored_df["correct"] = scored_df.apply(
        lambda row: recogs_exact_match(row.output, row.prediction),
        axis=1)
    return scored_df


# Report exact-match accuracy of the fine-tuned T5 model on the dev
# split and then on the gen split.
for split_key, heading in (("dev", "dev test acc:"), ("gen", "gen test acc:")):
    result_df = test(dataset[split_key], t5model)
    print(heading)
    print(result_df['correct'].sum() / result_df.shape[0])

dev test acc:
0.154
gen test acc:
0.0850952380952381


In [None]:
# Reload the unlabeled bakeoff inputs (first TSV column as index), add
# the fine-tuned T5 model's predictions, and write the entry file.
bakeoff_df = pd.read_csv(
    os.path.join(SRC_DIRNAME, "cs224u-recogs-test-unlabeled.tsv"), 
    sep="\t", index_col=0)

bakeoff_df["prediction"] = t5model.predict(bakeoff_df.input)

bakeoff_df.to_csv("cs224u-recogs-bakeoff-entry[fine_tuned_T5_model].tsv", sep="\t")

In [None]:
# Baseline (default) model results (exact-match accuracy):
# fine-tuned model: dev: 0.9663333333333334 gen: 0.5645238095238095
# T5 model: dev: 0.154 gen: 0.0850952380952381
# T5 with PEFT model: dev: 0.944 gen: 0.7460476190476191

# Fine-tuned model results:

In [9]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# huggingface hub model id
model_id = "google-t5/t5-small"

# load model from the hub
# The model is requested with 8-bit weights and automatic device placement.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE(review): `prepare_model_for_int8_training` may not exist under this
# name in every `peft` release -- confirm against the installed version.
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
# Rank-16 adapters on the modules named "q" and "v", with no bias training.
lora_config = LoraConfig(
 r=16,
 lora_alpha=32,
 target_modules=["q", "v"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
# After this point `model` is the PEFT-wrapped model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 589,824 || all params: 61,096,448 || trainable%: 0.9653981848502878




In [10]:
# Same value as the definition in the first cell; repeated here so this
# cell does not depend on that cell having been run.
SRC_DIRNAME = os.path.join("data", "recogs")

def load_split(filename):
    """Read one headerless, tab-separated split file into a DataFrame
    whose columns are named 'input', 'output' and 'category'."""
    column_names = ['input', 'output', 'category']
    split_df = pd.read_csv(filename, delimiter="\t", names=column_names)
    return split_df

# Rebuild the split-name -> DataFrame mapping ("train", "dev", "gen").
dataset = {}

for splitname in ("train", "dev", "gen"):
    dataset[splitname] = load_split(f"{SRC_DIRNAME}/{splitname}.tsv")
    

# Column names and the fixed sequence length read by `preprocess_function`.
text_column = "input"
label_column = "output"
max_length = 512
from datasets import Dataset, concatenate_datasets
# Wrap the pandas train and dev frames as `datasets.Dataset` objects.
dataset_train = Dataset.from_pandas(dataset["train"], split="train")
dataset_eval = Dataset.from_pandas(dataset["dev"], split="dev")
# dataset = concatenate_datasets([dataset_train, dataset_eval])

def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    Reads the `text_column` and `label_column` entries of `examples`,
    tokenizes both to exactly `max_length` positions (padding and
    truncating), and returns the tokenized inputs with a "labels"
    entry in which every pad-token id is replaced by -100.
    """
    source_texts = examples[text_column]
    target_texts = examples[label_column]
    tokenize_options = dict(
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt")
    model_inputs = tokenizer(source_texts, **tokenize_options)
    label_ids = tokenizer(target_texts, **tokenize_options)["input_ids"]
    # Overwrite padding positions in the labels with -100.
    label_ids[label_ids == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = label_ids
    return model_inputs


# Options shared by both tokenization passes.
tokenize_map_options = dict(
    batched=True,
    num_proc=1,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

# Replace the raw columns of each split with the tokenized features.
dataset_train = dataset_train.map(
    preprocess_function,
    remove_columns=dataset_train.column_names,
    **tokenize_map_options,
)

dataset_eval = dataset_eval.map(
    preprocess_function,
    remove_columns=dataset_eval.column_names,
    **tokenize_map_options,
)

Running tokenizer on dataset:   0%|          | 0/135546 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [11]:
# Number of tokenized training examples.
print(len(dataset_train))

135546


In [12]:
# Number of tokenized dev examples.
print(len(dataset_eval))

3000


In [13]:
# train_dataset = RecogsDataset(
#     enc_tokenizer,
#     dec_tokenizer,
#     dataset['train'].input,
#     y=dataset['train'].output)

In [14]:
# dev_dataset = RecogsDataset(
#     enc_tokenizer,
#     dec_tokenizer,
#     dataset['dev'].input,
#     y=dataset['dev'].output)

In [15]:
# ex_dataloader = torch.utils.data.DataLoader(
#     dev_dataset,
#     batch_size=2,
#     shuffle=True,
#     pin_memory=True,
#     collate_fn=dev_dataset.collate_fn)

In [16]:
# ex_batch = iter(ex_dataloader)
# next(ex_batch)

In [17]:
# from transformers import DataCollatorForSeq2Seq

# # we want to ignore tokenizer pad token in the loss
# label_pad_token_id = -100
# # Data collator
# data_collator = DataCollatorForSeq2Seq(
#     enc_tokenizer,
#     model=model,
#     label_pad_token_id=label_pad_token_id,
#     pad_to_multiple_of=8
# )

In [18]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# model_id="t5-small"

# # Load tokenizer of FLAN-t5-XL
# tokenizer = AutoTokenizer.from_pretrained(model_id)

In [19]:
# tokenizer("hi i am")

In [20]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Directory passed to the trainer as its output location; logs go in a
# "logs" subdirectory of it.
output_dir = "lora-t5-small-2"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    learning_rate=1e-3,  # higher learning rate
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=500,
    logging_dir=f"{output_dir}/logs",
    report_to="none",
)

# Create Trainer instance
# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     data_collator=train_dataset.collate_fn,
#     train_dataset=train_dataset,
#     eval_dataset=dev_dataset
# )
# Train on the pre-tokenized train split and evaluate on the dev split.
# No `data_collator` is passed, so the trainer's default is used.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_eval
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


In [21]:
# Run the training loop; the returned `TrainOutput` is displayed below.
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.3295,0.287091
2,0.3041,0.286027
3,0.2967,0.285313
4,0.2939,0.285232
5,0.2916,0.285085




TrainOutput(global_step=10590, training_loss=0.3312184433761467, metrics={'train_runtime': 13705.5057, 'train_samples_per_second': 49.449, 'train_steps_per_second': 0.773, 'total_flos': 9.2953204752384e+16, 'train_loss': 0.3312184433761467, 'epoch': 5.0})

In [53]:
# model.eval()



# result_df = test(dataset['dev'], recogs_ff)
# result_df['correct'].sum() / result_df.shape[0]

# def evaluate_peft_model(dev_set,max_target_length=500):
#     # generate summary
#     model.eval()  
#     outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
#     print(outputs)
#     prediction = dec_tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
#     # decode eval sample
#     # Replace -100 in the labels as we can't decode them.

#     # Some simple post-processing
#     return prediction


# dev_dataloader = torch.utils.data.DataLoader(
#                         dev_dataset,
#                         batch_size=2,
#                         shuffle=True,
#                         pin_memory=True,
#                         collate_fn=dev_dataset.collate_fn)
    
# for batch in dev_dataloader:
#     preds = evaluate_peft_model(dev_dataset)
import torch
# Put the PEFT model in evaluation mode before generating.
model.eval()
def get_prediction(sample):
    """Generate and decode outputs for one batch of tokenized examples.

    `sample` must provide "input_ids" and "attention_mask" entries
    that `torch.tensor` can convert. Both are moved to CUDA and passed
    to the module-level `model`. Generation uses nucleus sampling
    (`do_sample=True`, `top_p=0.9`), so repeated calls can return
    different strings. Returns the list of decoded strings with
    special tokens removed.
    """
    input_ids = torch.tensor(sample["input_ids"]).cuda()
    attention_mask = torch.tensor(sample["attention_mask"]).cuda()
    generated = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        do_sample=True,
        top_p=0.9,
        max_new_tokens=512)
    generated_ids = generated.detach().cpu().numpy()
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)


from tqdm import tqdm

# Generate predictions for the tokenized dev split in slices of 64.
predictions = []
for i in tqdm(range(0, len(dataset_eval), 64)):
    sample = dataset_eval[i:i+64]
    pred = get_prediction(sample)
    predictions+=pred
    
    
# Score against the gold outputs; this relies on `predictions` being in
# the same row order as `dataset["dev"]`.
cat_df = dataset["dev"].copy() 
cat_df["prediction"] = predictions
cat_df["correct"] = cat_df.apply(lambda x: recogs_exact_match(x.output, x.prediction), axis=1)
print(cat_df['correct'].sum() / cat_df.shape[0])

100%|██████████| 47/47 [03:03<00:00,  3.91s/it]


0.944


In [57]:
def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training or evaluation.

    Reads the `text_column` and `label_column` entries of `examples`,
    tokenizes both to exactly `max_length` positions (padding and
    truncating), and returns the tokenized inputs with a "labels"
    entry in which every pad-token id is replaced by -100.
    """
    source_texts = examples[text_column]
    target_texts = examples[label_column]
    tokenize_options = dict(
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt")
    model_inputs = tokenizer(source_texts, **tokenize_options)
    label_ids = tokenizer(target_texts, **tokenize_options)["input_ids"]
    # Overwrite padding positions in the labels with -100.
    label_ids[label_ids == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = label_ids
    return model_inputs

# Wrap the gen split as a `datasets.Dataset` and replace its raw columns
# with the tokenized features.
dataset_gen = Dataset.from_pandas(dataset["gen"], split="gen")
dataset_gen = dataset_gen.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset_gen.column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/21000 [00:00<?, ? examples/s]

In [59]:
# Generate predictions for the tokenized gen split in slices of 64.
predictions = []
for i in tqdm(range(0, len(dataset_gen), 64)):
    sample = dataset_gen[i:i+64]
    pred = get_prediction(sample)
    predictions+=pred
    
    
# Score against the gold outputs; this relies on `predictions` being in
# the same row order as `dataset["gen"]`.
cat_df = dataset["gen"].copy() 
cat_df["prediction"] = predictions
cat_df["correct"] = cat_df.apply(lambda x: recogs_exact_match(x.output, x.prediction), axis=1)
print(cat_df['correct'].sum() / cat_df.shape[0])

100%|██████████| 329/329 [1:22:24<00:00, 15.03s/it]


0.7460476190476191


In [56]:
# Unlabeled bakeoff inputs; the first TSV column becomes the index.
bakeoff_df = pd.read_csv(
    os.path.join(SRC_DIRNAME, "cs224u-recogs-test-unlabeled.tsv"), 
    sep="\t", index_col=0)

def preprocess_function(examples):
    """Tokenize only the `text_column` entry of `examples` (padded and
    truncated to 512 positions). This input-only variant is for
    unlabeled data, so no "labels" entry is produced; the label path
    is left disabled in the comments below."""
    inputs = examples[text_column]
    # targets = examples[label_column]
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    # labels = tokenizer(targets, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    # labels = labels["input_ids"]
    # labels[labels == tokenizer.pad_token_id] = -100
    # model_inputs["labels"] = labels
    return model_inputs

# NOTE(review): the name `dataset_gen` is reused here for the bakeoff
# data; after this cell it no longer refers to the gen split.
dataset_gen = Dataset.from_pandas(bakeoff_df, split="bakeoff")
dataset_gen = dataset_gen.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset_gen.column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

# Generate predictions in slices of 64.
predictions = []
for i in tqdm(range(0, len(dataset_gen), 64)):
    sample = dataset_gen[i:i+64]
    pred = get_prediction(sample)
    predictions+=pred

# Relies on `predictions` being in the same row order as `bakeoff_df`.
bakeoff_df["prediction"] = predictions

bakeoff_df.to_csv("cs224u-recogs-bakeoff-entry[PEFT_T5_model].tsv", sep="\t")

Running tokenizer on dataset:   0%|          | 0/420 [00:00<?, ? examples/s]

100%|██████████| 7/7 [00:59<00:00,  8.52s/it]


In [47]:
# Debugging cell: inspect one collated batch and the raw generation for it.
# NOTE(review): `dev_dataset` is only constructed in a commented-out cell
# earlier in this notebook, so this cell fails on a fresh kernel unless
# that cell is restored.
dev_dataloader = torch.utils.data.DataLoader(
                        dev_dataset,
                        batch_size=1,
                        shuffle=True,
                        pin_memory=True,
                        collate_fn=dev_dataset.collate_fn)

for batch in dev_dataloader:
    print(batch)
    outputs = model.generate(input_ids=batch["input_ids"].cuda())
    print(outputs)
    # Only the first batch is inspected.
    break

{'input_ids': tensor([[  1,  53, 592, 689, 124, 134, 586, 124, 361,  17,   2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[  1, 103,   5,  52,   6,  67, 182,   5,  66,   6,  67, 381,   5,  46,
           6,  67, 588,   5,  18,   6,  68, 177,   5,  18,   8,  52,   6,  68,
         259,   5,  18,   8,  53,   6,  68, 579,   5,  53,   6,  68, 177,   5,
          53,   8,  66,   6,  68, 664,   5,  53,   8,  46,   6,   2]])}
tensor([[0, 1]], device='cuda:0')




In [44]:
# NOTE(review): `dec_tokenizer` is only assigned in a commented-out line
# earlier in this notebook; this cell depends on leftover kernel state.
dec_tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=False)

'[PAD] [BOS]'

In [None]:
# NOTE(review): leftover scratch cell. `enc_tokenizer` is only assigned in
# a commented-out line earlier in this notebook, and it is called here
# with no text argument.
enc_tokenizer()