In [1]:
from unsloth import FastLanguageModel
from peft import PeftModel

max_seq_length = 12000
load_in_4bit = True

# Load base model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-4",
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
)

# Load QLoRa PEFT adapter on top
model = PeftModel.from_pretrained(model, "meftah416/EnergyPhi")

# Enable fast inference
FastLanguageModel.for_inference(model)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.4: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA L40. Num GPUs = 1. Max memory: 44.392 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.63it/s]


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(100352, 5120, padding_idx=100351)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=5120, out_features=5120, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [2]:
!nvidia-smi

Sat Sep 13 00:49:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L40                     On  |   00000000:61:00.0 Off |                    0 |
| N/A   35C    P0            100W /  300W |   10983MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
prompt = """
Generate an EnergyPlus IDF file that is 28.4300 meters long, 18.1100 meters wide, and 2.6600 meters high. 
The window-to-wall ratio is 0.2000. The occupancy rate is  0.09 m2/people, the lighting level is 9.25 W/m2, and the infiltration is 0.0004.
"""
messages = [
    {"role": "user", 
     "content": prompt},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids = inputs, streamer = text_streamer, max_new_tokens = 11000,
    use_cache = True, temperature = 1.5, min_p = 0.1
)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<think>The building has length L = 28.43 m, width W = 18.11 m, height H = 2.66 m, and a window-to-wall ratio WWR = 0.20. The volume of the zone will be L*W*H = 28.43*18.11*2.66 = 1370.97 m³. The window scale factor is sqrt(WWR) = 0.4472, which is used to compute window width and height while preserving wall aspect ratio. 

The floor lies at z=0 with coordinates (L,W,z)=(28.43,18.11,0), (L,0,z)=(28.43,0,0), (0,0,z)=(0,0,0) and (0,W,z)=(0,18.11,0). The roof lies at z=2.66 with coordinates (0,W,z)=(0,18.11,2.66), (0,0,z)=(0,0,2.66), (L,0,z)=(28.43,0,2.66), and (L,W,z)=(28.43,18.11,2.66). 

The north wall is at y=18.11 with corners at (28.43,18.11,2.66), (28.43,18.11,0), (0,18.11,0), (0,18.11,2.66). The east wall is at x=28.43 with corners (28.43,0,2.66), (28.43,0,0), (28.43,18.11,0), (28.43,18.11,2.66). The south wall is at y=0 with corners (0,0,2.66), (0,0,0), (28.43,0,0), (28.43,0,2.66). The west wall is at x=0 with corners (0,18.11,2.66), (0,18.11,0), (0,0,0), (0,0,2.66). 

Each wall h