In [1]:
import os
import json
import random
from typing import Dict, List, Any

import torch
from datasets import Dataset, DatasetDict
from transformers import (
    TrainingArguments,
    Trainer,
)
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Your Flash Attention 2 installation seems to be broken?
A possible explanation is you have a new CUDA version which isn't
yet compatible with FA2? Please file a ticket to Unsloth or FA2.
We shall now use Xformers instead, which does not have any performance hits!
We found this negligible impact by benchmarking on 1x A100.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.bfloat16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
cache_dir = "./model_cache-2"
model_name = "unsloth/Llama-3.3-70B-Instruct"
train_dataset_path = "step2_model/llama3.3-70b/datasets/tp/tp_data_train.json"
val_dataset_path = "step2_model/llama3.3-70b/datasets/tp/tp_data_val.json"
test_dataset_path = "./datasets/test.json"
lora_save_name = "70b-4bit-lora-r128"
lora_dim = 32

In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    cache_dir=cache_dir,
    attn_implementation="flash_attention_2"
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_dim, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)



==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.48.0.
   \\   /|    GPU: NVIDIA H100 PCIe. Max memory: 79.097 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1. CUDA: 9.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 8/8 [00:32<00:00,  4.11s/it]
Unsloth 2025.1.5 patched 80 layers with 80 QKV layers, 80 O layers and 80 MLP layers.


In [4]:

import pprint

def load_json_data(json_path: str) -> List[Dict[str, Any]]:
    """
    Load narrative data from a JSON file.
    This function wraps multiple JSON objects into a list if they aren't already.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        content = f.read().strip()
        try:
            data = json.loads(content)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            data = []
    
    return data

def formatting_prompts_func(examples: List[Dict[str, Any]]) -> List[str]:
    """
    Format prompts by combining instruction, input, and output from each example.
    """
    output_texts = []
    for example in examples:
        text = (
            f"{example['instruction']}\n\n"
            f"### Input:\n{example['input']}\n" 
            f"### Response:\n{example['output']}"
        ) + tokenizer.eos_token  # Ensure `tokenizer` is defined
        output_texts.append(text)
    return output_texts

# Load and format training data
train_data = load_json_data(train_dataset_path)
formatted_train_texts = formatting_prompts_func(train_data)
train_dataset = Dataset.from_list([{"text": text} for text in formatted_train_texts])

# Load and format validation data
val_data = load_json_data(val_dataset_path)
formatted_val_texts = formatting_prompts_func(val_data)
val_dataset = Dataset.from_list([{"text": text} for text in formatted_val_texts])

# Print a sample from the training dataset
print("Sample from training dataset:")
print("-" * 50)
print(train_dataset[0]["text"])
print("-" * 50)


Sample from training dataset:
--------------------------------------------------
### INSTRUCTIONS
        You are a helpful assistant that identifies and explains turning points in a narrative. You are given:
        - The story, broken down into numbered sentences.
        - The definitions of each of the five turning points (Opportunity, Change of Plans, Point of No Return, Major Setback, Climax).
        - Ground truth turning point indices for this story.
        
        ### TURNING POINT DEFINITIONS
        - **Opportunity** – Introductory event that occurs after presenting the setting and background of the main characters.
        - **Change of Plans** – Event where the main goal of the story is defined, starting the main action.
        - **Point of No Return** – Event that pushes the main character(s) to fully commit to their goal.
        - **Major Setback** – Event where things fall apart temporarily or permanently.
        - **Climax** – Final event/resolution of the main s

In [5]:

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 50,
        learning_rate = 2e-4,
        dataloader_num_workers = 32,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "step2_model/llama3.3-70b/datasets/tp/outputs",
        report_to = "none", # Use this for WandB etc
        evaluation_strategy = "steps",
        eval_steps = 10,
    ),
)

Map (num_proc=2): 100%|██████████| 351/351 [00:03<00:00, 113.55 examples/s]
Map (num_proc=2): 100%|██████████| 88/88 [00:01<00:00, 48.32 examples/s]


In [6]:
from unsloth import unsloth_train
trainer_stats = unsloth_train(trainer)
model.save_pretrained(lora_save_name)  # Local saving
tokenizer.save_pretrained(lora_save_name)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 351 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 50
 "-____-"     Number of trainable parameters = 414,187,520


Step,Training Loss,Validation Loss


In [9]:
dataset[0]["text"].split("### Response:\n")[1]

NameError: name 'dataset' is not defined

In [25]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    dataset[0]["text"].split("### Response:\n")[0]
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 8192)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Identify the turning points and their sentence number in this narrative and explain your reasoning step-by-step. Then provide the final turning point locations clearly in a json format, like {"tp1": ##.#, "tp2": ##.#, "tp3": ##.#, "tp4": ##.#, "tp5": ##.#}.

    ### TURNING POINT DEFINITIONS
    1. **Opportunity** – Introductory event that occurs after presenting the setting and background of the main characters.
    2. **Change of Plans** – Event where the main goal of the story is defined, starting the main action.
    3. **Point of No Return** – Event that pushes the main character(s) to fully commit to their goal.
    4. **Major Setback** – Event where things fall apart temporarily or permanently.
    5. **Climax** – Final event/resolution of the main story (the "biggest spoiler").
    

### 

### Response:
### Review of the Story
The story revolves around Vivo, a kinkajou who plays music with his owner Andrés in Havana, Cuba. After Andrés' death, Vivo decides to fulfill Andrés' dream of confessing his love to Marta Sandoval through a song. Vivo embarks on a journey from Cuba to Miami, facing various challenges and making new friends along the way, including Gabi, who helps him in his quest.

### Summary of Each Segment and Character Development
- **Vivo**: Begins as a reluctant participant in Andrés' plans but becomes the main character driven by guilt and a desire to fulfill Andrés' last wish. He faces numerous challenges, makes new friends, and learns about perseverance and friendship.
- **Gabi**: Initially hesitant but then fully invests in helping Vivo. She learns valuable lessons about friendship, loyalty, and dealing with personal loss.
- **Marta**: Introduced as the object of Andrés' affections, she is later revealed to be grieving Andrés' death. Through Vivo's effor

In [29]:
if True:
    from unsloth import FastLanguageModel
    from transformers import AutoTokenizer
    
    # Load the base model first
    model = None
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/Llama-3.3-70B-Instruct",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
        cache_dir=cache_dir,
    )
    
    # Load the LoRA adapter weights
    model = FastLanguageModel.get_peft_model(model)
    model.load_adapter("lora_model")
    
    # Load tokenizer separately
    tokenizer = AutoTokenizer.from_pretrained("lora_model")
    
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    dataset[0]["text"].split("### Response:\n")[0]
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 8192)

==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.48.0.
   \\   /|    GPU: NVIDIA H100 PCIe. Max memory: 79.097 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1. CUDA: 9.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [2]:
import re

import json

from typing import Dict, List, Any, Optional
def extract_tp_from_text(text: str) -> Optional[Dict[str, Any]]:
    """
    Extract the turning points from the model's generated text if it follows
    the $\boxed{...}$ format containing a JSON structure.
    """
    pattern = r'\$\\boxed\{(.*?)\}\$'
    match = re.search(pattern, text)
    if match:
        json_str = match.group(1)
        # Convert the string to a standard JSON format
        json_str = '{' + json_str.replace('"', '"') + '}'
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return None
    return None


### Final Turning Point Locations

extract_tp_from_text("""Based on the analysis, the final turning point locations in JSON format are:

```json
{
  ""tp1"": 7,
  ""tp2"": 10,
  ""tp3"": 14,
  ""tp4"": 19,
  ""tp5"": 35
}
```""")