In [1]:
import wandb
import json
from datasets import Dataset, DatasetDict
import numpy as np

In [2]:
# Read the dataset
f = open(r"C:\Users\Leandro\Desktop\WandB_Projects\LLM_alpaca\alpaca_data_cleaned.json")
data = json.load(f)
f.close()

In [3]:
# Set the seed for reproducibility
np.random.seed(42)

# Define the number range (0 to 49999)
rango_numeros = np.arange(len(data))

# Generate valid_list with 1000 random numbers without repetition
lista_valid = np.random.choice(rango_numeros, size=1000, replace=False)

# Generate train_list with the remaining 49000 numbers
lista_train = np.setdiff1d(rango_numeros, lista_valid)

data_train = np.array(data)[lista_train]
data_valid = np.array(data)[lista_valid]

# Convert dictionary list to list dictionary
data_dict_train = {key: [item[key] for item in data_train] for key in data_train[0]}
data_dict_valid = {key: [item[key] for item in data_valid] for key in data_valid[0]}

# Create a Dataset object from the list of dictionaries
custom_dataset_train = Dataset.from_dict(data_dict_train)
custom_dataset_valid = Dataset.from_dict(data_dict_valid)

# Convert the Dataset to a DatasetDict
dataset_dict = DatasetDict({
    'train': custom_dataset_train,
    'valid': custom_dataset_valid})

In [4]:
# Set train and valid datasets
train_dataset = dataset_dict['train']
eval_dataset = dataset_dict['valid']

In [5]:
# Create the prompts
def prompt_no_input(row):
    return ("Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{instruction}\n\n### Response:\n{output}").format_map(row)

def prompt_input(row):
    return ("Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n{output}").format_map(row)

def create_prompt(row):
    return prompt_no_input(row) if row["input"] == "" else prompt_input(row)

In [6]:
# Set the models to use

model_id = 'openlm-research/open_llama_3b_v2'
#model_id = 'bigscience/bloom-3b'
#model_id = 'tiiuae/falcon-rw-1b'

In [7]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=64,  # the rank of the LoRA matrices
    lora_alpha=16, # the weight
    lora_dropout=0.1, # dropout to add to the LoRA layers
    bias="none", # add bias to the nn.Linear layers?
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj","v_proj","o_proj"], # the name of the layers to add LoRA (LLAMA 3B)
    #target_modules=['query_key_value'], # the name of the layers to add LoRA (BLOOM 3B)
    #target_modules=['query_key_value'], # the name of the layers to add LoRA (Falcon 1B)
)

In [8]:
import torch
from transformers import BitsAndBytesConfig

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [9]:
# Is GPU available?
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

cuda:0


In [10]:
# Model parameters
model_kwargs = dict(
    device_map=device,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    use_cache=False,
    quantization_config=bnb_config,
    # low_cpu_mem_usage=True,
    #use_flash_attention_2=True,
)

In [11]:
# Training parameters
batch_size = 1
gradient_accumulation_steps = 16
num_train_epochs = 10

In [15]:
from transformers import TrainingArguments

# Training parameters
output_dir = "./output/falcon-1b"
training_args = TrainingArguments(
    num_train_epochs=num_train_epochs,
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    gradient_accumulation_steps=gradient_accumulation_steps,
    #gradient_checkpointing=True,
    gradient_checkpointing_kwargs=dict(use_reentrant=False),
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="epoch",
    report_to="wandb",
)

In [None]:
from trl import SFTTrainer

# Training parameters
trainer = SFTTrainer(
    model=model_id,
    model_init_kwargs=model_kwargs,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    packing=True,
    max_seq_length=1024,
    args=training_args,
    formatting_func=create_prompt,
    peft_config=peft_config,
)

In [None]:
# Remove answers
def create_prompt_no_anwer(row):
    row["output"] = ""
    return {"text": create_prompt(row)}

test_dataset = eval_dataset.map(create_prompt_no_anwer)

In [19]:
from utils import LLMSampleCB

# Set the wandb callback
wandb_callback = LLMSampleCB(trainer, test_dataset, num_samples=30, max_new_tokens=256)
trainer.add_callback(wandb_callback)

In [None]:
# Start training!
with wandb.init(project='LLM_alpaca', name='falcon-1b'):
    trainer.train()

## INFERENCE

In [13]:
def limit_string(text, long_min=70, long_max=90):
    full_text = []
    chunks = text.split("\n")
    chunks = [chunk for chunk in chunks if chunk != ""]

    for chunk in chunks:
        i = long_min
        row = 1

        while i < len(chunk):
            if i < long_max * row and chunk[i] == ' ':
                # Inserta un carácter de nueva línea en esa posición
                chunk = chunk[:i] + '\n' + chunk[i+1:]
                i += long_min  # Salta al próximo bloque de caracteres
                row += 1
            else:
                i += 1

        full_text.append(chunk + "\n\n")

    return "".join(full_text)

In [27]:
#saved_model = "checkpoint-ynqkunya:v1" # llama 3b
#saved_model = "checkpoint-mf8m86ws:v0" # bloom 3b
saved_model = "checkpoint-n8itsr3a:v0" # falcon 1b

In [28]:
with wandb.init(project='LLM_alpaca'):
    artifact = wandb.use_artifact(saved_model)
    artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact checkpoint-n8itsr3a:v0, 147.35MB. 13 files... 
[34m[1mwandb[0m:   13 of 13 files downloaded.  
Done. 0:0:0.7


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [29]:
from transformers import AutoModelForCausalLM, AutoTokenizer

trained_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path = artifact_dir,
    return_dict = True,
    quantization_config = bnb_config,
    #trust_remote_code = True,
    device_map = device)

from peft import PeftModel

trained_model = PeftModel.from_pretrained(
    model = trained_model,
    model_id = artifact_dir)

trained_model_tokenizer = AutoTokenizer.from_pretrained(artifact_dir)
trained_model_tokenizer.pad_token = trained_model_tokenizer.eos_token

In [30]:
generation_config = trained_model.generation_config
generation_config.max_new_token = 1024 * 2
#generation_config.num_beams = 1
#generation_config.early_stopping = True
#generation_config.repetition_penalty = 0.5
generation_config.temperature = 0.7
#generation_config.top_p = 0.7
#generation_config.top_k = 50
generation_config.do_sample = True
generation_config.num_return_sequence = 1
generation_config.pad_token_id = trained_model_tokenizer.pad_token_id
generation_config.eos_token_id = trained_model_tokenizer.eos_token_id

In [31]:
questions = [
    'What is the capital city of Australia?',
    'Can you explain the theory of relativity in simple terms?',
    'How would you code a function in Python to sum two vectors?',
    'Who wrote the novel "To Kill a Mockingbird"?',
    'What is the chemical formula for table salt?',
    'How would you calculate the area of a circle given its radius?',
    'Who was the first person to walk on the moon?',
    'Can you name three common cybersecurity threats?',
    'What is the meaning of the phrase "a bird in the hand is worth two in the bush"?',
    'How would you explain the concept of sustainable development?',
    'Who painted the famous artwork "The Starry Night"?',
    'Can you describe the process of photosynthesis in plants?',
    'How would you solve the following equation: 3x + 5 = 17?',
    'How would you explain the concept of machine learning to someone with no technical background?',
    'What do you think about the future of the artificial intelligence?'
]

In [None]:
falcon_answers = []
for q in questions:
    instruction = q
    print("QUESTION:\n" + instruction + "\n")

    prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

    ### Instruction:
    {instruction}

    ### Response:
    """

    encoding = trained_model_tokenizer(
        prompt,
        padding = True,
        truncation = True,
        max_length = 1024 * 2,
        return_tensors = 'pt').to(device)

    with torch.inference_mode():
        outputs = trained_model.generate(
            input_ids = encoding.input_ids,
            attention_mask = encoding.attention_mask,
            generation_config = generation_config,
            max_new_tokens = 100 * 2
        )

    outputs = trained_model_tokenizer.decode(outputs[0], skip_special_tokens = True)

    key = '### Response:'
    pos = outputs.find(key) + len(key)
    outputs_ = outputs[pos:]
    print_outputs = limit_string(outputs_, long_min=70, long_max=90)
    print("ANSWER:\n" + print_outputs)
    falcon_answers.append(outputs_)

In [34]:
import pandas as pd

In [None]:
with wandb.init(project='LLM_alpaca', name='log_qanda'):
    data = []
    for i, question in enumerate(questions, start=1):
        data.append({"id": i, 
                     "question": question, 
                     "llama-3b": llama_answers_[i-1],
                     "bloom-3b": bloom_answers_[i-1],
                     "falcon-1b": falcon_answers_[i-1]})

    data = pd.DataFrame(data)
    table = wandb.Table(data=data, columns=["id", "question", "llama-3b", "bloom-3b", "falcon-1b"])
    wandb.log({"questions": table})

In [20]:
from tqdm.auto import tqdm

## PERPLEXITY

In [32]:
# Compute the negative log-likelihoods
nlls = []
for i in tqdm(range(len(test_dataset))):

    prompt = test_dataset["text"][i]

    # Prepare the test data
    encoding = trained_model_tokenizer(
        prompt,
        padding = True,
        truncation = True,
        max_length = 1024 * 2,
        return_tensors = 'pt').to(device)

    input_ids = encoding["input_ids"].to(device)

    with torch.inference_mode():
        output = trained_model(input_ids, labels=input_ids)
    nll = output.loss
    nlls.append(nll)

# Calculate the perplexity
perplexity = torch.exp(torch.stack(nlls).mean())
print(f"Perplexity: {perplexity}")

  0%|          | 0/1000 [00:00<?, ?it/s]

Perplexity: 2164.0


In [None]:
# LLAMA
# Perplexity: 2.082414150238037

# BLOOM
# Perplexity: 2.494140625

# FALCON
# Perplexity: 2164.0

In [34]:
from PIL import Image

with wandb.init(project='LLM_alpaca', name='log_img'):

    # Cargar tu imagen (reemplaza 'path_a_tu_imagen.jpg' con la ruta a tu imagen)
    image_path = r"C:\Users\Leandro\Desktop\Some projects\LLM\imgs\parse_analysis.png"
    image = Image.open(image_path)

    # Subir y registrar la imagen
    wandb.log({"image": wandb.Image(image)})

VBox(children=(Label(value='0.001 MB of 0.690 MB uploaded\r'), FloatProgress(value=0.0019756179163244935, max=…