In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install nltk rouge-score
!pip install rouge

In [None]:
# Standard library
import json
import math
import os
import random

# Unsloth stays ahead of the other ML libraries, as in the original cell order.
from unsloth import FastLanguageModel

# Third-party
import jieba
import matplotlib.pyplot as plt
import nltk
import torch
from datasets import Dataset, load_dataset
from nltk.corpus import wordnet
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge
from rouge_score import rouge_scorer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from transformers import TextStreamer, TrainingArguments
from trl import SFTTrainer


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Download the WordNet corpus into the local NLTK data directory
# (the imports cell pulls `wordnet` from nltk.corpus).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Model-loading configuration. These are notebook-level names;
# `max_seq_length` is reused when the SFTTrainer is built further down.
max_seq_length = 2048  # sequence-length limit handed to the loader and the trainer
dtype = None  # NOTE(review): presumably lets Unsloth choose the dtype automatically — confirm
load_in_4bit = True  # request 4-bit loading (the checkpoint name below is a bnb-4bit build)
# Load the model and its tokenizer in one call.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

In [None]:
# Alpaca-style prompt template with three positional `{}` slots. Callers in
# this notebook fill them, in order, with the instruction, the input and the
# response (an empty response when prompting the model for generation).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}
### Input:
{}
### Response:
{}"""

# Smoke test: build one prompt with an empty response slot and stream the
# model's continuation.
# NOTE(review): for_inference() is called here, ahead of the training cells,
# and the notebook never calls a matching for_training(); confirm that
# training is unaffected.
FastLanguageModel.for_inference(model)
# NOTE(review): the instruction literal below looks mis-encoded (UTF-8 bytes
# read as Mac Roman); it appears to be Chinese for "What are you doing!".
# Confirm against the original notebook before changing it.
inputs = tokenizer(
[
    alpaca_prompt.format(
        "‰Ω†Âú®Âπ≤‰ªÄ‰πàÂëÄÔºÅ", # instruction
        "", # input (no extra context)
        "", # output (left empty so the model generates it)
    )
], return_tensors = "pt").to("cuda")

# The streamer receives tokens from generate(); the return value is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
‰Ω†Âú®Âπ≤‰ªÄ‰πàÂëÄÔºÅ
### Input:

### Response:
I'm reading the book.
<|end_of_text|>


In [None]:
#4 Fine-tune dataset
# End-of-sequence marker appended to every formatted training text.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of records into Alpaca-formatted training strings.

    Args:
        examples: Mapping holding parallel sequences under the keys
            "instruction", "input" and "output".

    Returns:
        A dict with the single key "text": one string per record, made of the
        filled `alpaca_prompt` followed by `EOS_TOKEN`.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in rows
    ]
    return {"text": rendered}


In [None]:
# Data Augmentation Method
# Random Insertion
# def random_insertion(sentence, n):
#     words = list(jieba.cut(sentence))
#     new_words = words.copy()
#     for _ in range(n):
#         add_word(new_words)
#     return ''.join(new_words)

# def add_word(new_words):
#     random_word = new_words[random.randint(0, len(new_words) - 1)]
#     random_idx = random.randint(0, len(new_words) - 1)
#     new_words.insert(random_idx, random_word)

# Random Swapping
def random_swap(sentence, n):
    """Return `sentence` after `n` rounds of random token swapping.

    The sentence is segmented with `jieba.cut`. If segmentation yields fewer
    than two tokens the original sentence is returned as is; otherwise
    `swap_word` is applied `n` times and the tokens are joined back together
    with no separator.
    """
    tokens = list(jieba.cut(sentence))
    if len(tokens) >= 2:
        for _ in range(n):
            tokens = swap_word(tokens)
        return ''.join(tokens)
    return sentence

def swap_word(new_words):
    """Swap two distinct, randomly chosen positions of `new_words` in place.

    Args:
        new_words: Mutable list of tokens. A list with fewer than two
            elements is returned untouched.

    Returns:
        The same list object that was passed in, after the swap.
    """
    if len(new_words) < 2:
        return new_words
    # random.sample draws without replacement, so the two indices always
    # differ. The earlier retry loop could exhaust its attempts and return
    # without swapping anything, most often on very short token lists.
    first, second = random.sample(range(len(new_words)), 2)
    new_words[first], new_words[second] = new_words[second], new_words[first]
    return new_words

# # Random Deletion
# def random_deletion(sentence, p):
#     words = list(jieba.cut(sentence))
#     if len(words) == 1:
#         return sentence
#     new_words = []
#     for word in words:
#         r = random.uniform(0, 1)
#         if r > p:
#             new_words.append(word)
#     if len(new_words) == 0:
#         rand_int = random.randint(0, len(words) - 1)
#         return words[rand_int]
#     return ''.join(new_words)

# DA Appending
def augment_texts(texts, n=2, num_augmentations=2):
    """Expand `texts` with randomly swapped variants of every entry.

    Args:
        texts: Iterable of sentences.
        n: Number of swap rounds passed to `random_swap` for each variant.
        num_augmentations: Number of variants generated per sentence.

    Returns:
        A list in which each original sentence is immediately followed by its
        `num_augmentations` variants.
    """
    expanded = []
    for original in texts:
        expanded.append(original)
        expanded.extend(random_swap(original, n) for _ in range(num_augmentations))
    return expanded

In [None]:
# Load the "train" split of the uITimeCia/Zhenhuan dataset.
dataset = load_dataset("uITimeCia/Zhenhuan", split="train")


# instructions_train, instructions_eval, inputs_train, inputs_eval, outputs_train, outputs_eval = train_test_split(
#     dataset['instruction'],  dataset['input'], dataset['output'], test_size=0.2, random_state=42)

Downloading readme:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/955k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3729 [00:00<?, ? examples/s]

In [None]:
# Read the pre-augmented records from the local JSON file (UTF-8 encoded).
with open('augmented_huanhuan.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [None]:
# Sanity check: number of records loaded from the JSON file.
print(len(data))

11187


In [None]:
# Pull the three fields out of the JSON records as parallel lists, built in
# the order instruction, output, input.
augmented_instructions, augmented_outputs, augmented_inputs = (
    [record[field] for record in data]
    for field in ('instruction', 'output', 'input')
)

In [None]:
# augmented_instructions = augment_texts(dataset['instruction'])

# Rebuild the input and output columns from the loaded dataset, repeating
# every value three times back-to-back. This rebinds `augmented_inputs` and
# `augmented_outputs`.
# NOTE(review): presumably the factor 3 matches one original plus two
# augmented instructions per example — confirm against the JSON file.
input_field = dataset['input']
augmented_inputs = [value for value in input_field for _ in range(3)]

output_field = dataset['output']
augmented_outputs = [value for value in output_field for _ in range(3)]

# all_instructions = augmented_instructions + synonyms_replace_instruction
# all_inputs = augmented_inputs + synonyms_replace_inputs
# all_outputs = augmented_outputs + synonyms_replace_outputs



In [None]:
# Sanity check: number of entries in the dataset's "instruction" column.
print(len(dataset['instruction']))

1710


In [None]:
# Combine the instruction, input and output lists into one Dataset with
# those three columns.
augmented_dataset = Dataset.from_dict({"instruction": augmented_instructions, "input": augmented_inputs, "output": augmented_outputs})

In [None]:
# Compare row counts before and after augmentation.
print("Original dataset size:", len(dataset))
print("Augmented dataset size:", len(augmented_dataset))


Original dataset size: 3729
Augmented dataset size: 11187


In [None]:
# Split into train/eval frames without leaking augmented variants.
# The augmented data repeats every source example's input/output several
# times. A plain row-level split places copies of one example on both sides,
# so the eval loss would be measured on answers already seen in training.
# Splitting whole groups keeps all copies of an example together.
df = augmented_dataset.to_pandas()
# Rows sharing the same (input, output) pair form one group; dropna=False
# keeps rows whose input or output is missing in a group of their own.
group_ids = df.groupby(["input", "output"], sort=False, dropna=False).ngroup()
train_groups, eval_groups = train_test_split(
    group_ids.unique(), test_size=0.1, random_state=42
)
# Shuffle each side with a fixed seed so the frames are not left in source
# order (the row-level split used before also returned shuffled rows).
train_dataset = df[group_ids.isin(train_groups)].sample(frac=1, random_state=42)
eval_dataset = df[group_ids.isin(eval_groups)].sample(frac=1, random_state=42)

In [None]:
# train_dataset = augmented_dataset
# eval_dataset = Dataset.from_dict({"instruction": instructions_eval, "input": inputs_eval, "output": outputs_eval})

In [None]:
# Report the number of rows on each side of the split.
print("train:", len(train_dataset))
print("eval:", len(eval_dataset))

train: 10068
eval: 1119


In [None]:
# Shuffle the augmented dataset with a fixed seed, then run
# formatting_prompts_func over it in batches.
augmented_dataset = augmented_dataset.shuffle(seed=42).map(
    formatting_prompts_func, batched=True
)

Map:   0%|          | 0/11187 [00:00<?, ? examples/s]

In [None]:
# Convert the pandas train/eval frames back into Dataset objects.
# NOTE(review): the same names are rebound from DataFrame to Dataset, so this
# cell is presumably not safe to re-run without re-running the split cell first.
train_dataset = Dataset.from_pandas(train_dataset)
eval_dataset = Dataset.from_pandas(eval_dataset)

In [None]:
# Run formatting_prompts_func over both splits in batches; it returns the
# "text" field that the trainer is later pointed at.
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/10068 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [None]:
# Attach LoRA adapters to the loaded model through Unsloth.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # modules that receive adapters
    lora_alpha=16,  # LoRA scaling factor
    lora_dropout=0,  # no dropout on the adapter path
    bias="none",  # bias terms are not trained
    use_gradient_checkpointing="unsloth",  # NOTE(review): presumably Unsloth's own checkpointing mode — confirm
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)


# Training hyper-parameters.
# Per device, each optimizer step accumulates 2 * 4 = 8 sequences.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=500,
    learning_rate=2e-5,
    # Train in bf16 when the GPU supports it, otherwise fall back to fp16.
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=50,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    eval_strategy="steps",
    eval_steps=50,
    # load_best_model_at_end can only restore a checkpoint that was actually
    # saved. With the default save interval of 500 steps the single
    # checkpoint is the final one, so "best" always meant "last". Saving on
    # the evaluation schedule gives it real candidates to choose from.
    save_strategy="steps",
    save_steps=50,
    # Cap disk usage; the best checkpoint is retained alongside recent ones.
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Build the supervised fine-tuning trainer from the model, tokenizer, the two
# formatted splits and the training arguments defined above.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",  # field returned by formatting_prompts_func
    max_seq_length=max_seq_length,  # same limit used when loading the model
    dataset_num_proc=2,
    packing=False,
    args=training_args
)

Unsloth: Already have LoRA adapters! We shall skip this step.


Map (num_proc=2):   0%|          | 0/10068 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1119 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run fine-tuning; the value returned by train() is kept in `trainer_stats`.
trainer_stats = trainer.train()

In [None]:
#8 Save Lora Model
# Only `model` is saved here; the tokenizer is not written by this cell.
model.save_pretrained("lora_model") # Local saving
# model.push_to_hub("your_name/lora_model", token = "...") # Save online to Hugging Face; requires a token

In [None]:
#9 GGUF
#model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
#model.save_pretrained_merged("outputs", tokenizer, save_method = "merged_16bit",) # Merge the model and save it as 16-bit HF weights
# Read the Hugging Face token from the environment instead of embedding it in
# the notebook. The token that used to be hard-coded on this line has been
# exposed and should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face write token before pushing."
    )
# Convert to GGUF with q4_k_m quantization and upload to the Hugging Face Hub.
model.push_to_hub_gguf("uITimeCia/Paimon_With1000MaxSteps", tokenizer, quantization_method = "q4_k_m", token = hf_token)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 34.06 out of 50.99 RAM for saving.


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 32/32 [00:38<00:00,  1.21s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at uITimeCia/Paimon_With1000MaxSteps into f16 GGUF format.
The output location will be ./uITimeCia/Paimon_With1000MaxSteps/unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: Paimon_With1000MaxSteps
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'mode

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.F16.gguf:   0%|          | 0.00/16.1G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/uITimeCia/Paimon_With1000MaxSteps
Unsloth: Uploading GGUF to Huggingface Hub...


  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/uITimeCia/Paimon_With1000MaxSteps
