In [1]:
from unsloth import FastLanguageModel
from trl import SFTTrainer
import torch
from transformers import TrainingArguments, TextStreamer
from datasets import DatasetDict, load_dataset
from rich import print
from rich.progress import track
import evaluate

2024-05-26 15:37:59.011418: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-26 15:37:59.036243: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Base

In [4]:
max_seq_length = 4096  # Supports RoPE Scaling interally, so choose any!
# get instruction dataset from ../data/instructions
dataset = DatasetDict.load_from_disk('../data/instructions')
dataset_train = dataset["train"]
dataset_test = dataset["test"]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="../models/vinallama-7b/checkpoint-2961",
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,
)

FastLanguageModel.for_inference(model)



==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 3070 Ti. Max memory: 8.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Unsloth: ../models/vinallama-7b/checkpoint-2961 has no tokenizer.model file.
Just informing you about this - this is not a critical error.


Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
def format_prompt_func(example):
    if isinstance(example['instruction'], str):
        return f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['response']}"
    
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Instruction:\n{example['instruction'][i]}\n\n### Response:\n{example['response'][i]}"
        output_texts.append(text)
    return output_texts

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    formatting_func=format_prompt_func,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args = TrainingArguments(
        output_dir="../models/vinallama-7b/eval",
        fp16_full_eval = True,
        per_device_eval_batch_size = 1,
        eval_accumulation_steps = 4,
        evaluation_strategy = "steps",
        eval_steps = 1,
    ),
)



In [3]:
benchmark_dataset = load_dataset("csv", data_files={"benchmark": "../data/VLSP2020_benchmark_sampled.csv"})
num_examples = len(benchmark_dataset["benchmark"])
predictions = []
references = benchmark_dataset["benchmark"]['english'][:num_examples]

Generating benchmark split: 0 examples [00:00, ? examples/s]

In [5]:
# text_streamer = TextStreamer(tokenizer)

def format_inference_prompt(example):
    return f"""
### Instruction:
Dịch câu sau từ tiếng Việt sang tiếng Anh: 
Tiếng Việt: {example}
Tiếng Anh:

### Response:
"""

def generate_predictions(model, tokenizer, dataset, num_examples, save_path):
    for example in track(dataset["benchmark"]['vietnamese'][:num_examples], total=num_examples):
        inputs = tokenizer(
            [
                format_inference_prompt(example),
            ],
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(
            **inputs, max_new_tokens=128
        )
        
        response = tokenizer.batch_decode(outputs)
        response_split = response[0].split("\n")
        
        response_token_index = response_split.index("### Response:")
        translation = response_split[response_token_index + 1]

        predictions.append(translation)

    with open(save_path, "w", encoding='utf8') as f:
        for prediction in predictions:
            f.write(prediction + "\n")    


In [6]:
path = "../data/predictions_vinallama_7b.txt"
generate_predictions(model, tokenizer, benchmark_dataset, num_examples, path)

Output()

# Chat

In [2]:
max_seq_length = 4096
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="../models/vinallama-7b-chat/checkpoint-2961",
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)



==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 3070 Ti. Max memory: 8.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Unsloth: ../models/vinallama-7b-chat/checkpoint-2961 has no tokenizer.model file.
Just informing you about this - this is not a critical error.


Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
text_streamer = TextStreamer(tokenizer)

def format_inference_prompt(example):
    return f"""Dịch câu sau từ tiếng Việt sang tiếng Anh: 
Tiếng Việt: {example}
Tiếng Anh:
"""

predictions = []

def generate_predictions_chat(model, tokenizer, dataset, num_examples, save_path):
    for example in track(dataset["benchmark"]['vietnamese'][:num_examples], total=num_examples):
        messages = [
            {"role": "system", "content": "Bạn là một trợ lí AI hữu ích trong việc dịch thuật tiếng Việt sang Tiếng Anh. Hãy trả lời người dùng một cách chính xác."},
            {"role": "user", "content": format_inference_prompt(example)},
        ]
        
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")
        

        outputs = model.generate(
            inputs, max_new_tokens=128
        )
        
        response = tokenizer.batch_decode(outputs[:, inputs.shape[1]:])[0]
        
        if "<|im_end|>" in response:
            translation = response.split("<|im_end|>")[0].strip()
        else:
            translation = response.strip()
            
        
        # response_token_index = response_split.index("### Response:")
        # translation = response_split[response_token_index + 1]

        predictions.append(translation)

    with open(save_path, "w", encoding='utf8') as f:
        for prediction in predictions:
            f.write(prediction + "\n")    


In [4]:
benchmark_dataset = load_dataset("csv", data_files={"benchmark": "../data/VLSP2020_benchmark_sampled.csv"})
num_examples = len(benchmark_dataset)
predictions = []
references = benchmark_dataset["benchmark"]['english'][:num_examples]

generate_predictions_chat(model, tokenizer, benchmark_dataset, num_examples, "../data/predictions_vinallama_7b_chat.txt")

Output()

# Evaluate

In [17]:
with open("../data/predictions_vinallama_7b.txt", "r", encoding='utf8') as f:
    predictions_vinallama_7b = f.readlines()

with open("../data/predictions_vinallama_7b_chat.txt", "r", encoding='utf8') as f:
    predictions_vinallama_7b_chat = f.readlines()

assert len(predictions_vinallama_7b) == len(predictions_vinallama_7b_chat)
num_examples = len(predictions_vinallama_7b)
benchmark_dataset = load_dataset("csv", data_files={"benchmark": "../data/VLSP2020_benchmark_sampled.csv"})
references = benchmark_dataset["benchmark"]['english']
sources = benchmark_dataset["benchmark"]['vietnamese']

## BLEU

In [5]:
bleu = evaluate.load("bleu")

results_vinallama_7b = bleu.compute(predictions=predictions_vinallama_7b, references=references)
results_vinallama_7b_chat = bleu.compute(predictions=predictions_vinallama_7b_chat, references=references)

for key, value in results_vinallama_7b.items():
    print(f"Vinallama 7B {key}: {value}")
    print(f"Vinallama 7B Chat {key}: {results_vinallama_7b_chat[key]}")

## BERTSCORE

In [15]:
bertscore = evaluate.load("bertscore")

results_vinallama_7b = bertscore.compute(predictions=predictions_vinallama_7b, references=references, lang="en")
results_vinallama_7b_chat = bertscore.compute(predictions=predictions_vinallama_7b_chat, references=references, lang="en")

for key, value in results_vinallama_7b.items():
    
    if isinstance(value, list):
        vinallama_7b_tensor = torch.tensor(value, dtype=torch.float32)
        vinallama_7b_chat_tensor = torch.tensor(results_vinallama_7b_chat[key], dtype=torch.float32)
        
        avg_vinallama_7b = torch.mean(vinallama_7b_tensor).item()
        avg_vinallama_7b_chat = torch.mean(vinallama_7b_chat_tensor).item()
        print(f"Vinallama 7B {key}: {avg_vinallama_7b}")
        print(f"Vinallama 7B Chat {key}: {avg_vinallama_7b_chat}")
    else:
        print(f"Vinallama 7B {key}: {value}")
        print(f"Vinallama 7B Chat {key}: {results_vinallama_7b_chat[key]}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## BLEURT

In [3]:
bleurt = evaluate.load("bleurt", module_type="metric", checkpoint="bleurt-large-512")

results_vinallama_7b = bleurt.compute(predictions=predictions_vinallama_7b, references=references)
results_vinallama_7b_chat = bleurt.compute(predictions=predictions_vinallama_7b_chat, references=references)

for key, value in results_vinallama_7b.items():
    
    vinallama_7b_tensor = torch.tensor(value, dtype=torch.float32)
    vinallama_7b_chat_tensor = torch.tensor(results_vinallama_7b_chat[key], dtype=torch.float32)
    
    avg_vinallama_7b = torch.mean(vinallama_7b_tensor).item()
    avg_vinallama_7b_chat = torch.mean(vinallama_7b_chat_tensor).item()
    
    print(f"Vinallama 7B {key}: {avg_vinallama_7b}")
    print(f"Vinallama 7B Chat {key}: {avg_vinallama_7b_chat}")

Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512').


INFO:tensorflow:Reading checkpoint /home/nguyen/.cache/huggingface/metrics/bleurt/default/downloads/extracted/49ae870ec41970df36d4cb594bb8991ef69b9fe584090a8c05e6b35a23eb644e/bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.


2024-05-26 15:38:13.304878: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-26 15:38:13.305546: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-26 15:38:13.305565: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-26 15:38:13.307250: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-26 15:38:13.307268: I external/local_xla/xla/stream_executor

INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


## MANUAL

In [22]:
num_examples = 20

# generate random indices
# create seed for reproducibility
torch.manual_seed(42)
indices = torch.randperm(len(predictions_vinallama_7b))[:num_examples].tolist()

predictions_vinallama_7b_sampled = [predictions_vinallama_7b[i] for i in indices]
predictions_vinallama_7b_chat_sampled = [predictions_vinallama_7b_chat[i] for i in indices]

references_sampled = [references[i] for i in indices]
sources_sampled = [sources[i] for i in indices]

for prediction_vinallama_7b, prediction_vinallama_7b_chat, reference, source in zip(predictions_vinallama_7b_sampled, predictions_vinallama_7b_chat_sampled, references_sampled, sources_sampled):
    print(f"Vietnamese: {source}")
    print(f"Reference: {reference}")
    print(f"Vinallama 7B: {prediction_vinallama_7b}")
    print(f"Vinallama 7B Chat: {prediction_vinallama_7b_chat}")
    print("\n")

In [5]:
bleurt = evaluate.load("bleurt", module_type="metric", checkpoint="bleurt-large-512")
results = bleurt.compute(predictions=predictions, references=references)
print(results)



INFO:tensorflow:Reading checkpoint /home/nguyen/.cache/huggingface/metrics/bleurt/default/downloads/extracted/49ae870ec41970df36d4cb594bb8991ef69b9fe584090a8c05e6b35a23eb644e/bleurt-base-128.


INFO:tensorflow:Reading checkpoint /home/nguyen/.cache/huggingface/metrics/bleurt/default/downloads/extracted/49ae870ec41970df36d4cb594bb8991ef69b9fe584090a8c05e6b35a23eb644e/bleurt-base-128.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.
