In [None]:
from unsloth import FastLanguageModel
from peft import PeftModel

# ---------------------------------------------------------------------------
# Configuration: Hub identifiers and loader options.
# ---------------------------------------------------------------------------
BASE_MODEL_ID = "unsloth/Phi-4"       # base checkpoint handed to the Unsloth loader
ADAPTER_ID = "meftah416/EnergyPhi"    # QLoRA PEFT adapter applied on top of the base model

load_in_4bit = True       # forwarded to FastLanguageModel.from_pretrained
max_seq_length = 12000    # forwarded to FastLanguageModel.from_pretrained

# ---------------------------------------------------------------------------
# Step 1: load the base model together with its tokenizer.
# ---------------------------------------------------------------------------
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_ID,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
)

# ---------------------------------------------------------------------------
# Step 2: wrap the base model with the fine-tuned adapter weights.
# ---------------------------------------------------------------------------
model = PeftModel.from_pretrained(model, ADAPTER_ID)

# ---------------------------------------------------------------------------
# Step 3: switch the wrapped model into Unsloth's fast-inference mode.
# ---------------------------------------------------------------------------
FastLanguageModel.for_inference(model)

In [None]:
# Run the `nvidia-smi` shell command (IPython `!` escape) and print its report
# on the NVIDIA GPU(s) visible to this machine.
!nvidia-smi

In [None]:
from transformers import TextStreamer

# Natural-language request sent to the model as the single user turn.
# The text is kept exactly as written (including its spacing); presumably it
# mirrors the prompt format used for fine-tuning — TODO confirm before editing.
prompt = """
Generate an EnergyPlus IDF file that is 40 meters long, 30 meters wide, and 3 meters high. 
The window-to-wall ratio is 0.4. The occupancy rate is  0.091 m2/people, the lighting level is 9.25 W/m2, and the infiltration is 0.0004.
"""
messages = [
    {"role": "user", "content": prompt},
]

# Render the conversation with the tokenizer's chat template and tokenize it.
# `return_dict=True` also yields the attention mask, so `generate` does not
# have to infer it, and pins the return type regardless of library defaults.
encoded = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
    return_dict=True,
).to("cuda")
inputs = encoded["input_ids"]  # kept as the input-id tensor under its original name

# Decoding strategy.
# `temperature` and `min_p` only take effect when `do_sample=True`; passing
# them without it (as this cell previously did) leaves generation greedy and
# the values silently unused. Greedy decoding stays the default here so the
# output is reproducible across "Restart & Run All"; set DO_SAMPLE = True to
# actually sample with the temperature / min_p values below.
DO_SAMPLE = False
if DO_SAMPLE:
    decoding_kwargs = {"do_sample": True, "temperature": 1.5, "min_p": 0.1}
else:
    decoding_kwargs = {"do_sample": False}

# Stream decoded tokens to the cell output as they are produced, omitting the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=inputs,
    attention_mask=encoded["attention_mask"],
    streamer=text_streamer,
    max_new_tokens=11000,
    use_cache=True,
    **decoding_kwargs,
)