# LLM

## Imports

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_dataset, concatenate_datasets, DatasetDict
from datetime import datetime, timezone
from evaluate import load
import os
import nltk
import numpy as np
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paulopacitti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

In [3]:
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
device

device(type='mps')

## Model

In [4]:
model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(
    model_id, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

model.to(device)
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [5]:
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[8774, 6, 48, 80, 7142, 55, 1], [100, 19, 430, 7142, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [6]:
text = """The square root of x is the cube root of y. What is y to the power of 2, if x = 4?"""
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=512)
tokenizer.decode(outputs[0], skip_special_tokens=True)

'0'

In [7]:
print(model.config.task_specific_params) 

{'summarization': {'early_stopping': True, 'length_penalty': 2.0, 'max_length': 200, 'min_length': 30, 'no_repeat_ngram_size': 3, 'num_beams': 4, 'prefix': 'summarize: '}, 'translation_en_to_de': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to German: '}, 'translation_en_to_fr': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to French: '}, 'translation_en_to_ro': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to Romanian: '}}


## Train

In [8]:
dataset_id = "flytech/python-codes-25k"
raw_dataset = load_dataset(dataset_id)
metric = load("bleu") 
train_testvalid = raw_dataset['train'].train_test_split(seed=42, test_size=0.2)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
raw_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

In [9]:
raw_dataset["train"][0]

{'text': 'Develop an Python program to classify some spoken words Word list: [\'happy\', \'sad\', \'angry\'] Actioning your request now! Full speed ahead! ```python\nimport speech_recognition as sr\n\nr = sr.Recognizer()\n\nwords = [\'happy\', \'sad\', \'angry\']\n\nwith sr.Microphone() as source:\n    print("Say a word.")\n    audio = r.listen(source)\n\ntry:\n    word = r.recognize_google(audio).lower()\n    if word in words:\n        print("The emotion of the word is:", word)\n    else:\n        print("The word is unknown.")\nexcept:\n    print("Sorry could not recognize")\n```',
 'input': '',
 'instruction': "Develop an Python program to classify some spoken words Word list: ['happy', 'sad', 'angry']",
 'output': '```python\nimport speech_recognition as sr\n\nr = sr.Recognizer()\n\nwords = [\'happy\', \'sad\', \'angry\']\n\nwith sr.Microphone() as source:\n    print("Say a word.")\n    audio = r.listen(source)\n\ntry:\n    word = r.recognize_google(audio).lower()\n    if word in wo

In [10]:
metric

EvaluationModule(name: "bleu", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Computes BLEU score of translated segments against one or more references.
Args:
    predictions: list of translations to score.
    references: list of lists of or just a list of references for each translation.
    tokenizer : approach used for tokenizing `predictions` and `references`.
        The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT.
        This can be replaced by any function that takes a string as input and returns a list of tokens as output.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoot

In [11]:
# The maximum total input sequence length after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([raw_dataset["train"], raw_dataset["test"]]).map(lambda x: tokenizer(x["instruction"], truncation=True), batched=True, remove_columns=["input", "output"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([raw_dataset["train"], raw_dataset["test"]]).map(lambda x: tokenizer(x["output"], truncation=True), batched=True, remove_columns=["instruction", "input"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/44663 [00:00<?, ? examples/s]

Map: 100%|██████████| 44663/44663 [00:03<00:00, 13624.98 examples/s]


Max source length: 459


Map: 100%|██████████| 44663/44663 [00:08<00:00, 5490.75 examples/s]


Max target length: 512


In [12]:
prefix = ""
max_input_length = max_source_length
input_label = "instruction"
target_label = "output"
def preprocess_function(sample):
    inputs = [prefix + doc for doc in sample[input_label]]
    model_inputs = tokenizer(inputs, padding='max_length', max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=sample[target_label], padding='max_length', max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = raw_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 4963/4963 [00:01<00:00, 2776.64 examples/s]
Map: 100%|██████████| 4963/4963 [00:01<00:00, 2805.39 examples/s]


In [14]:
batch_size = 16
_name = model_id.split("/")[1]
args = Seq2SeqTrainingArguments(
    f"output/{_name}-finetuned-code-generation",
    evaluation_strategy = "steps",
    learning_rate=2e-4,
    eval_steps=200,  # You can reduce the frequency of evaluation
    save_steps=200,  # Save model checkpoints less frequently
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["bleu"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [16]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"].select(range(100)),
    eval_dataset=tokenized_dataset["validation"].select(range(100)),
    data_collator=data_collator,
)

trainer.train()

100%|██████████| 7/7 [04:26<00:00, 38.02s/it]

{'train_runtime': 266.1054, 'train_samples_per_second': 0.376, 'train_steps_per_second': 0.026, 'train_loss': 20.839915684291295, 'epoch': 1.0}





TrainOutput(global_step=7, training_loss=20.839915684291295, metrics={'train_runtime': 266.1054, 'train_samples_per_second': 0.376, 'train_steps_per_second': 0.026, 'total_flos': 16664793292800.0, 'train_loss': 20.839915684291295, 'epoch': 1.0})

In [None]:
trainer.evaluate()


[A
[A
[A
[A
[A
[A
100%|██████████| 7/7 [00:38<00:00,  5.53s/it]


{'eval_loss': 2.482707977294922,
 'eval_bleu': 0.0,
 'eval_gen_len': 14.91,
 'eval_runtime': 48.9076,
 'eval_samples_per_second': 2.045,
 'eval_steps_per_second': 0.143,
 'epoch': 1.0}

In [18]:
local_time = datetime.now(timezone.utc).astimezone()
local_time.isoformat()
trainer.save_model(f"models/{_name}-finetuned-{local_time}")

In [19]:
model_loaded = AutoModelForSeq2SeqLM.from_pretrained(f"models/{_name}-finetuned-{local_time}").to(device)
tokenizer_loaded = AutoTokenizer.from_pretrained(f"models/{_name}-finetuned-{local_time}")

OSError: Can't load tokenizer for 'models/flan-t5-small-finetuned-2024-06-02 17:13:56.612985-03:00'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'models/flan-t5-small-finetuned-2024-06-02 17:13:56.612985-03:00' is the correct path to a directory containing all relevant files for a T5TokenizerFast tokenizer.

In [17]:
text = """Calculate my carbon footprint based on my daily activities!"""
inputs = tokenizer_loaded(text, return_tensors="pt").to(device)
outputs = model_loaded.generate(**inputs)
tokenizer.decode(outputs[0], skip_special_tokens=True)

NameError: name 'tokenizer_loaded' is not defined

In [None]:
outputs = model.generate(**inputs)
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Using a calculator, you can calculate your carbon footprint by using a calculator.'