<a href="https://colab.research.google.com/github/nikolas-rauscher/ARDI-Scholarly-QALD/blob/master/Flan_T5_fine_tune_ardi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip3 install --upgrade pip
!pip3 install --upgrade transformers
!pip3 install --upgrade accelerate
!pip3 install sentencepiece
!pip install pytesseract transformers datasets rouge-score nltk tensorboard py7zr --upgrade
!pip install ipywidgets
!pip install peft
!pip install bitsandbytes
!pip install evaluate



In [8]:
from datasets import load_dataset
import json
from random import randrange
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel, prepare_model_for_kbit_training, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import BitsAndBytesConfig

In [9]:


!unzip /content/splits.zip


Archive:  /content/splits.zip
  inflating: splits/test_0.json      
replace __MACOSX/splits/._test_0.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
  inflating: splits/train_2.json     
  inflating: splits/train_3.json     
  inflating: splits/test_1.json      
  inflating: splits/train_4.json     
  inflating: splits/test_4.json      
  inflating: splits/test_2.json      
  inflating: splits/train_0.json     
  inflating: splits/train_1.json     
  inflating: splits/test_3.json      


## Train a split

In [10]:
#change this for each split
dataset = load_dataset('json', data_files={'train': 'train_0.json','test': 'test_0.json'})


In [11]:
model_id="google/flan-t5-large"
# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)
model=AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")

In [12]:
from datasets import concatenate_datasets
import torch
# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["prompt"], truncation=True), batched=True, remove_columns=["prompt", "answer"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["answer"], truncation=True), batched=True, remove_columns=["prompt", "answer"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")


Map:   0%|          | 0/5972 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/5972 [00:00<?, ? examples/s]

Max target length: 55


In [13]:
def preprocess_function(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = [item for item in sample["prompt"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["answer"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["prompt", "answer"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")


Map:   0%|          | 0/4777 [00:00<?, ? examples/s]

Map:   0%|          | 0/1195 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['question', 'id', 'triples_number', 'contexts', 'input_ids', 'attention_mask', 'labels']


In [14]:
import evaluate
import nltk, torch
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")
from sklearn.metrics import precision_recall_fscore_support
def preprocess_logits_for_metrics(logits, labels):
  if isinstance(logits, tuple):
    logits = logits[0]

  return logits.argmax(dim=-1)
# helper function to postprocess text
def postprocess_text(labels, preds):
    preds = [pred.replace('\n','').split('Answer:')[-1].strip() for pred in preds]
    labels = [label.replace('\n','').split('Answer:')[-1].strip() for label in labels]
    #print(preds)
    #print(labels)
    return preds, labels

label  = []


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 in the preds as we can't decode them
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
        # Some simple post-processing

    grounds, preds = postprocess_text(decoded_labels,decoded_preds)
    p, r, f, s = precision_recall_fscore_support(grounds, preds, labels=decoded_labels, average='micro')

    decoded_preds = ["\n".join(pred.strip()) for pred in decoded_preds]

    decoded_labels = ["\n".join(label.strip()) for label in decoded_labels]
    # Compute ROUGscores
    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    result = {key: value * 100 for key, value in result.items()}
    result["gen_len"] = np.mean(prediction_lens)

    result['f1'] = f
    result['recall'] =r
    result['precision']=p

    return {k: round(v, 4) for k, v in result.items()}



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [15]:
from transformers import DataCollatorForSeq2Seq
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)



In [18]:
from huggingface_hub import HfFolder, login
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

login(token="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")

# Hugging Face repository id
dataset_id="KGQA"
# repository_id = f"{model_id.split('/')[1]}-{dataset_id}"+'_4'
repository_id = "rausch/KGAQ-2"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=1e-3,
    num_train_epochs=10,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=200,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_precision",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
    lr_scheduler_type = "cosine_with_restarts",
    lr_scheduler_kwargs = { "num_cycles": 2 },
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful




In [19]:
# Start training for split 0
trainer.train()
trainer.save_model(repository_id)
# merge adapter and pretrained weights

#save fine-tuned model
model.save_pretrained(repository_id)
tokenizer.save_pretrained(repository_id)

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,F1,Recall,Precision
1,3.5701,3.391384,14.1052,1.2078,13.0257,14.1332,3.0,0.0,0.0,0.0
2,3.0379,2.746814,12.4379,1.0435,11.3645,12.4814,3.0,0.0005,0.0008,0.0003
3,2.2773,2.496198,25.6591,2.6653,16.5422,25.687,6.0,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,F1,Recall,Precision
1,3.5701,3.391384,14.1052,1.2078,13.0257,14.1332,3.0,0.0,0.0,0.0
2,3.0379,2.746814,12.4379,1.0435,11.3645,12.4814,3.0,0.0005,0.0008,0.0003
3,2.2773,2.496198,25.6591,2.6653,16.5422,25.687,6.0,0.0,0.0,0.0
4,1.8845,2.437015,8.8131,0.2887,8.1866,8.8014,3.0,0.0005,0.0008,0.0003
5,1.7721,2.534215,8.2864,0.5105,7.6569,8.2655,3.0,0.0005,0.0008,0.0003
6,2.1007,2.502776,27.8343,3.8693,19.0586,27.8325,6.4795,0.0022,0.0036,0.0015
7,2.0255,2.554404,8.2864,0.5105,7.6569,8.2655,3.0,0.0005,0.0008,0.0003
8,1.9177,2.53563,22.6347,3.1887,14.2667,22.6751,7.0,0.0005,0.0008,0.0003
9,1.7165,2.549155,9.9002,0.817,9.31,9.8757,4.0,0.0005,0.0008,0.0003
10,1.645,2.671208,9.9002,0.817,9.31,9.8757,4.0,0.0005,0.0008,0.0003


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


events.out.tfevents.1722686306.3b678e7990d3.1967.0:   0%|          | 0.00/18.4k [00:00<?, ?B/s]

('rausch/KGAQ-2/tokenizer_config.json',
 'rausch/KGAQ-2/special_tokens_map.json',
 'rausch/KGAQ-2/spiece.model',
 'rausch/KGAQ-2/added_tokens.json',
 'rausch/KGAQ-2/tokenizer.json')

In [20]:
from transformers import AutoTokenizer
def load_tokenizer(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'right'
    return tokenizer

In [21]:
def read_json(path):
    with open(path, 'r', encoding="utf-8") as f:
        data = json.load(f)
    return data

def write_json(data, path):
    with open(path, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


In [22]:
def get_prediction(model,tokenizer, prompt, length=250,stype='greedy'):

    inputs = tokenizer(prompt, add_special_tokens=True, max_length=4096,return_tensors="pt").input_ids.to("cuda")

    outputs = model.generate(inputs, max_new_tokens=length)

    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return response


In [23]:
import json

input_path = "test_0.json"
data = read_json(input_path)
responses = []

In [24]:
for i, item in enumerate(data):
  prompt = item['prompt']
  response = get_prediction(model,tokenizer, prompt)
  print('test:', i)
  if len(response) == 0:
    print("No response")
    responses.append("")
  else:
    responses.append({"id":item['id'],"answer":response[0]})

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


test: 0
test: 1
test: 2
test: 3
test: 4
test: 5
test: 6
test: 7
test: 8
test: 9
test: 10
test: 11
test: 12
test: 13
test: 14
test: 15
test: 16
test: 17
test: 18
test: 19
test: 20
test: 21
test: 22
test: 23
test: 24
test: 25
test: 26
test: 27
test: 28
test: 29
test: 30
test: 31
test: 32
test: 33
test: 34
test: 35
test: 36
test: 37
test: 38
test: 39
test: 40
test: 41
test: 42
test: 43
test: 44
test: 45
test: 46
test: 47
test: 48
test: 49
test: 50
test: 51
test: 52
test: 53
test: 54
test: 55
test: 56
test: 57
test: 58
test: 59
test: 60
test: 61
test: 62
test: 63
test: 64
test: 65
test: 66
test: 67
test: 68
test: 69
test: 70
test: 71
test: 72
test: 73
test: 74
test: 75
test: 76
test: 77
test: 78
test: 79
test: 80
test: 81
test: 82
test: 83
test: 84
test: 85
test: 86
test: 87
test: 88
test: 89
test: 90
test: 91
test: 92
test: 93
test: 94
test: 95
test: 96
test: 97
test: 98
test: 99
test: 100
test: 101
test: 102
test: 103
test: 104
test: 105
test: 106
test: 107
test: 108
test: 109
test: 110


In [None]:
output_path = "test_results_large_model_0_zero.json"
write_json(responses,output_path)

In [26]:
!ls


__MACOSX     splits.zip   test_3.json				train_1.json
rausch	     test_0.json  test_4.json				train_2.json
sample_data  test_1.json  test_results_large_model_0_zero.json	train_3.json
splits	     test_2.json  train_0.json				train_4.json
__MACOSX     splits.zip   test_3.json				train_1.json
rausch	     test_0.json  test_4.json				train_2.json
sample_data  test_1.json  test_results_large_model_0_zero.json	train_3.json
splits	     test_2.json  train_0.json				train_4.json
