# LLM

## Imports

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_dataset, concatenate_datasets, DatasetDict
from datetime import datetime, timezone
from evaluate import load
import os
import nltk
import numpy as np
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paulopacitti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

In [4]:
device = torch.device('cuda')

## Model

In [5]:
model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(
    model_id, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

tokenizer.model_max_length

512

In [5]:
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[8774, 6, 48, 80, 7142, 55, 1], [100, 19, 430, 7142, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [36]:
text = """translate to SQL: Tell me what the notes are for South Australia """
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=64)
tokenizer.decode(outputs[0], skip_special_tokens=True)

'SELECT Notes FROM table WHERE Country = south australia'

In [7]:
print(model.config.task_specific_params) 

{'summarization': {'early_stopping': True, 'length_penalty': 2.0, 'max_length': 200, 'min_length': 30, 'no_repeat_ngram_size': 3, 'num_beams': 4, 'prefix': 'summarize: '}, 'translation_en_to_de': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to German: '}, 'translation_en_to_fr': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to French: '}, 'translation_en_to_ro': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to Romanian: '}}


## Train

In [8]:
train_data = load_dataset('wikisql', split='train+validation', trust_remote_code=True)
test_data = load_dataset('wikisql', split='test', trust_remote_code=True)

In [9]:
train_data[0]

{'phase': 1,
 'question': 'Tell me what the notes are for South Australia ',
 'table': {'header': ['State/territory',
   'Text/background colour',
   'Format',
   'Current slogan',
   'Current series',
   'Notes'],
  'page_title': '',
  'page_id': '',
  'types': ['text', 'text', 'text', 'text', 'text', 'text'],
  'id': '1-1000181-1',
  'section_title': '',
  'caption': '',
  'rows': [['Australian Capital Territory',
    'blue/white',
    'Yaa·nna',
    'ACT · CELEBRATION OF A CENTURY 2013',
    'YIL·00A',
    'Slogan screenprinted on plate'],
   ['New South Wales',
    'black/yellow',
    'aa·nn·aa',
    'NEW SOUTH WALES',
    'BX·99·HI',
    'No slogan on current series'],
   ['New South Wales',
    'black/white',
    'aaa·nna',
    'NSW',
    'CPX·12A',
    'Optional white slimline series'],
   ['Northern Territory',
    'ochre/white',
    'Ca·nn·aa',
    'NT · OUTBACK AUSTRALIA',
    'CB·06·ZZ',
    'New series began in June 2011'],
   ['Queensland',
    'maroon/white',
    'nnn·aaa

In [10]:
def format_dataset(example):
    return {'input': 'translate to SQL: ' + example['question'], 'target': example['sql']['human_readable']}

In [11]:
train_data = train_data.map(format_dataset,remove_columns=train_data.column_names)
test_data = test_data.map(format_dataset,remove_columns=test_data.column_names)

Map:   0%|          | 0/64776 [00:00<?, ? examples/s]

Map:   0%|          | 0/15878 [00:00<?, ? examples/s]

In [12]:
# map article and summary len to dict as well as if sample is longer than 512 tokens
def map_to_length(x):
    x["input_len"] = len(tokenizer(x["input"]).input_ids)
    x["input_longer_256"] = int(x["input_len"] > 256)
    x["input_longer_128"] = int(x["input_len"] > 128)
    x["input_longer_64"] = int(x["input_len"] > 64)
    x["out_len"] = len(tokenizer(x["target"]).input_ids)
    x["out_longer_256"] = int(x["out_len"] > 256)
    x["out_longer_128"] = int(x["out_len"] > 128)
    x["out_longer_64"] = int(x["out_len"] > 64)
    return x


sample_size = 10000
data_stats = train_data.select(range(sample_size)).map(map_to_length, num_proc=4)


def compute_and_print_stats(x):
    if len(x["input_len"]) == sample_size:
        print(
            "Input Mean: {}, %-Input > 256:{},  %-Input > 128:{}, %-Input > 64:{} Output Mean:{}, %-Output > 256:{}, %-Output > 128:{}, %-Output > 64:{}".format(
                sum(x["input_len"]) / sample_size,
                sum(x["input_longer_256"]) / sample_size,
                sum(x["input_longer_128"]) / sample_size,
                sum(x["input_longer_64"]) / sample_size,   
                sum(x["out_len"]) / sample_size,
                sum(x["out_longer_256"]) / sample_size,
                sum(x["out_longer_128"]) / sample_size,
                sum(x["out_longer_64"]) / sample_size,
            )
        )


output = data_stats.map(
  compute_and_print_stats, 
  batched=True,
  batch_size=-1,
)

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Input Mean: 19.8971, %-Input > 256:0.0,  %-Input > 128:0.0, %-Input > 64:0.0002 Output Mean:20.0403, %-Output > 256:0.0, %-Output > 128:0.0002, %-Output > 64:0.0005


In [13]:
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(example_batch['input'], pad_to_max_length=True, max_length=64)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target'], pad_to_max_length=True, max_length=64)


    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }


    return encodings

In [14]:
train_data = train_data.map(convert_to_features, batched=True, remove_columns=train_data.column_names)
test_data = test_data.map(convert_to_features, batched=True, remove_columns=test_data.column_names)


columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']


train_data.set_format(type='torch', columns=columns)
test_data.set_format(type='torch', columns=columns)

Map:   0%|          | 0/64776 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Map:   0%|          | 0/15878 [00:00<?, ? examples/s]

In [23]:
test_data["labels"].shape, train_data["labels"].shape

(torch.Size([15878, 64]), torch.Size([64776, 64]))

In [50]:
# def compute_metrics(eval_preds):
#     preds, labels = eval_preds
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     preds = np.where(preds != -100, preds, tokenizer.pad_token_id)

#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     result = rouge.compute(
#         predictions=decoded_preds,
#         references=decoded_labels,
#         use_stemmer=True,
#     )
#     print(f"petros: {result}")

#     return result

In [53]:
rouge = load("rouge")


def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions


    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True, )
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)


    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid


    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    print(f"petros: {result}")

    return result

In [54]:
batch_size=64
training_args = Seq2SeqTrainingArguments(
    "output",
    num_train_epochs=1,
    gradient_accumulation_steps=2,
    max_steps=50,
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_steps=10,                # when to print log
    push_to_hub=False,
    save_steps=200,  # Save model checkpoints less frequently
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=3,
    predict_with_generate=True,
    load_best_model_at_end=True,
    evaluation_strategy         =  "no",
    save_strategy               =  "no",
    generation_max_length = 64
 )

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
)
trainer.train()

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
10,0.0818
20,0.1067
30,0.1287
40,0.1446
50,0.1534


TrainOutput(global_step=50, training_loss=0.12305092215538024, metrics={'train_runtime': 31.064, 'train_samples_per_second': 206.026, 'train_steps_per_second': 1.61, 'total_flos': 148712403763200.0, 'train_loss': 0.12305092215538024, 'epoch': 0.09871668311944719})

In [55]:
text = """translate to SQL: Tell me what the notes are for South Australia """
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=64)
tokenizer.decode(outputs[0], skip_special_tokens=True)

'SELECT Notes FROM table WHERE Country = south australia'

In [56]:
metrics = trainer.evaluate(
        eval_dataset = test_data,
)
metrics

RuntimeError: output with shape [64, 6, 1, 1] doesn't match the broadcast shape [64, 6, 1, 64]

In [18]:
local_time = datetime.now(timezone.utc).astimezone()
local_time.isoformat()
trainer.save_model(f"models/{_name}-finetuned-{local_time}")