In [None]:
%%capture
!pip install dataset evaluate transformers[sentencepiece]
!pip install transformers trl==0.8.3 peft bitsandbytes
!pip install loguru rouge_score bert_score
!pip install --upgrade nltk
!pip install fire wandb

In [None]:
from datasets import Dataset, load_dataset
import pandas as pd
from tqdm import tqdm
from loguru import logger
from huggingface_hub import login
import os, torch, wandb, fire, evaluate
from trl import SFTTrainer, setup_chat_format
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, DataCollatorForLanguageModeling
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM


In [None]:
import os

from huggingface_hub import login

# Never hard-code the access token in the notebook. It is read from the
# HF_TOKEN environment variable; when that is unset or empty, `token=None`
# lets `login` fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# Source corpora from the Hugging Face Hub. The three Vietnamese_RAG
# configurations are capped through split slicing; UTEHY_QA is loaded with its
# train and test splits merged into a single dataset.
ds1 = load_dataset(path="sailor2/Vietnamese_RAG", name="BKAI_RAG", split="train[:1141]")
ds2 = load_dataset(path="sailor2/Vietnamese_RAG", name="LegalRAG", split="train[:3176]")
ds3 = load_dataset(path="sailor2/Vietnamese_RAG", name="expert", split="train[:1772]")
ds4 = load_dataset(path="luzox/UTEHY_QA", split="train+test")

In [None]:
# Hold out a test portion of every source. A fixed seed keeps the held-out rows
# identical across kernel restarts (the validation sample further down uses the
# same value, 42).
ds1 = ds1.train_test_split(test_size=0.04, seed=42)
ds2 = ds2.train_test_split(test_size=0.03, seed=42)
ds3 = ds3.train_test_split(test_size=0.03, seed=42)
ds4 = ds4.train_test_split(test_size=0.2, seed=42)

In [None]:
# One summary per source, each on its own line.
print(ds1, ds2, ds3, ds4, sep="\n")

In [None]:
# Column schemas of the four training splits, separated by a dashed rule.
print(
    ds1["train"].features,
    ds2["train"].features,
    ds3["train"].features,
    ds4["train"].features,
    sep="\n" + '--'*50 + "\n",
)


In [None]:
# Drop the columns that are not used downstream, then rename the remaining
# answer/context columns to the names the standardisation step reads.
ds3 = (
    ds3
    .remove_columns(["system", "field","spec_field", "question_type","question_type_symbol"])
    .rename_columns({"revised_answer": "answer", "revised_claims": "context"})
)

In [None]:
# ds5 = ds5.remove_columns(["groundedness_score", "groundedness_eval","relevance_score", "standalone_score","standalone_eval", "relevance_eval"])

In [None]:
# ds6 = ds6.remove_columns(["groundedness_score", "groundedness_eval","relevance_score", "standalone_score","standalone_eval", "relevance_eval"])

In [None]:
from datasets import concatenate_datasets
from datasets.features import Sequence as DatasetsSequence

def _standardize_example(example):
    """Build the question/answer/context fields for one record.

    A list-valued ``context`` is flattened into one space-separated string
    (each item passed through ``str``); any other value is kept unchanged.
    """
    context = example['context']
    if isinstance(context, list):
        context = ' '.join(str(item) for item in context)
    return {
        'question': example['question'],
        'answer': example['answer'],
        'context': context,
    }


def standardize_dataset(dataset_dict):
    """Apply the question/answer/context standardisation via ``.map``.

    Args:
        dataset_dict: Either a mapping of split name -> dataset (any ``dict``
            subclass) or a single dataset object exposing ``.map``.

    Returns:
        For a mapping: a plain ``dict`` with the same split names, each value
        being the result of ``.map`` on that split. For a single dataset: the
        result of ``.map`` on it.
    """
    if isinstance(dataset_dict, dict):
        # One shared transform for every split instead of redefining it per loop.
        return {
            split_name: ds.map(_standardize_example)
            for split_name, ds in dataset_dict.items()
        }
    return dataset_dict.map(_standardize_example)

# Bring every source into the shared question/answer/context layout.
standardized_ds1 = standardize_dataset(ds1)
standardized_ds2 = standardize_dataset(ds2)
standardized_ds3 = standardize_dataset(ds3)
standardized_ds4 = standardize_dataset(ds4)

from datasets import concatenate_datasets

# Merge the sources split by split. The split names of the first source drive
# the loop, so every other source must expose the same split names.
result = {}
for split in standardized_ds1:
    datasets_to_concat = [
        source[split]
        for source in (standardized_ds1, standardized_ds2, standardized_ds3, standardized_ds4)
    ]
    result[split] = concatenate_datasets(datasets_to_concat)

In [None]:
# Show the concatenated dataset held under each split name.
print(result)

In [None]:
# Convert both merged splits to pandas DataFrames.
df_train, df_test = (result[split_name].to_pandas() for split_name in ('train', 'test'))

In [None]:
# Display the training DataFrame as the cell output.
df_train

In [None]:
# Credentials must not live in the notebook: the API key is taken from the
# WANDB_API_KEY environment variable. When it is unset or empty, key=None
# leaves the credential lookup to wandb itself.
wandb.login(key=os.environ.get('WANDB_API_KEY') or None)

# Start the tracking run that the trainer reports to.
run = wandb.init(
    project='Instruction fine-tune Qwen2.5-7B A100-80GB-ex',
    job_type='training',
    name='v2',
    anonymous='allow')

In [None]:
def process_instruction_prompt(sample):
    """Render one record as a full chat-formatted training text.

    Args:
        sample: Mapping-like record providing 'context', 'question' and 'answer'.

    Returns:
        A single string with a system turn, a user turn embedding the context
        and question, and an assistant turn holding the answer, delimited by
        <|im_start|> / <|im_end|> markers.
    """
    context, question, answer = (sample[field] for field in ('context', 'question', 'answer'))
    return f"<|im_start|>system\nBạn là chuyên gia tư vấn, trả lời các câu hỏi bằng tiếng Việt.<|im_end|>\n<|im_start|>user\nDựa vào nội dung văn bản sau:\n{context}\nBạn hãy đưa ra câu trả lời cho câu hỏi:\n{question}<|im_end|>\n<|im_start|>assistant\n{answer}<|im_end|>"

# Build the training text row by row and store it in an 'instruction' column.
df_train['instruction'] = df_train.apply(process_instruction_prompt, axis='columns')
df_test['instruction'] = df_test.apply(process_instruction_prompt, axis='columns')

In [None]:
# Preview the first rendered prompt. Positional access is used so the preview
# still works if this cell is re-run after the row labelled 0 has been moved
# into the validation sample.
print(df_train['instruction'].iloc[0])

In [None]:
number_of_val_samples = 500

# Draw a fixed-seed validation sample from the training frame, then remove the
# sampled rows from the training frame itself.
df_val = df_train.sample(n=number_of_val_samples, random_state=42)
df_train.drop(df_val.index, inplace=True)

# Row counts of the three partitions, shown as the cell output.
tuple(len(frame) for frame in (df_train, df_val, df_test))

In [None]:
# Convert the three frames back into Dataset objects without keeping the
# pandas index as a column.
dataset_train, dataset_val, dataset_test = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (df_train, df_val, df_test)
)

In [None]:
# Identifier of the base checkpoint; it is passed to both the model and the
# tokenizer `from_pretrained` calls below.
model_name = 'Qwen/Qwen2.5-7B-Instruct'

In [None]:
%cd

In [None]:
cd projects/Tunning/

In [None]:
# Hyper-parameters forwarded to TrainingArguments below.
epoch = 2  # num_train_epochs
learning_rate = 2e-4
batch_size = 2  # used for both the per-device train and eval batch size

In [None]:
# Place the model on the currently selected CUDA device.
current_device = torch.cuda.current_device()
print(f"Loading model onto device: {current_device}")
device_map = {"": current_device}

# Load the base checkpoint in bfloat16.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    torch_dtype=torch.bfloat16,
)

# LoRA adapter settings: rank 16, alpha 32, 5% dropout, no bias terms,
# applied to the projection modules listed in `target_modules`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    target_modules=[
        'up_proj', 'down_proj', 'gate_proj',
        'k_proj', 'q_proj', 'v_proj', 'o_proj',
    ],
)

# Wrap the base model with the adapter configuration.
model = get_peft_model(model, peft_config=peft_config)

# Load the tokenizer from the same checkpoint identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the end-of-sequence token for padding when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token 

import inspect

# `eval_steps` alone does not switch evaluation on: the evaluation strategy
# defaults to "no", so the validation set passed to the trainer would never be
# scored. The argument is called `eval_strategy` in recent transformers
# releases and `evaluation_strategy` in older ones; use whichever this
# installation accepts (transformers is not version-pinned in this notebook).
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./Qwen2.5-7B/checkpoint_16bit',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    optim='adamw_torch',
    num_train_epochs=epoch,
    eval_steps=100,           # evaluate every 100 steps (enabled by the strategy below)
    save_steps=100,
    save_total_limit=1,       # keep only one checkpoint on disk
    logging_steps=10,
    warmup_steps=10,
    learning_rate=learning_rate,
    bf16=True,
    group_by_length=True,
    report_to='wandb',
    **{_eval_strategy_key: 'steps'},
)


# Causal-LM collator (masked-language-modelling disabled).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Supervised fine-tuning trainer reading the pre-rendered 'instruction' text,
# with packing enabled and a 2048-token sequence limit.
# NOTE(review): `model` is already wrapped by get_peft_model above and the same
# `peft_config` is passed again here — confirm this is intended.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=peft_config,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    dataset_text_field='instruction',
    max_seq_length=2048,
    packing=True,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop; the returned value is shown as the cell output.
trainer.train()

In [None]:
# Write the trainer's model and the tokenizer into the same directory.
output_dir = "./Qwen2.5-7B/model"
trainer.save_model(output_dir)
tokenizer.save_pretrained(save_directory=output_dir)

In [None]:
# Close the Weights & Biases run opened with wandb.init earlier in the notebook.
wandb.finish()
# Set the model's `use_cache` config flag to True.
# NOTE(review): the flag is never set to False before training in this notebook.
model.config.use_cache = True