In [None]:
import torch
from peft import LoraConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, set_seed
from datasets import load_dataset
from trl import SFTTrainer

In [None]:
# Load the fine-tuned model and tokenizer from the directory written by the
# training cell's save_model / save_pretrained calls.
model_name = "./phi-3-mini-custom"  # Path to your fine-tuned model directory

# Pick the load target at runtime instead of hard-coding 'cuda', so this cell
# does not fail on a machine without a GPU. Half precision is kept on GPU; on
# CPU we load in float32 (assumption: fp16 kernels are slow or unavailable on
# some CPU builds of PyTorch -- switch back to float16 if yours handles it).
load_on_gpu = torch.cuda.is_available()
model_infer = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if load_on_gpu else torch.float32,
    trust_remote_code=True,
    device_map="cuda" if load_on_gpu else "cpu",
)
tokenizer_infer = AutoTokenizer.from_pretrained(model_name)

# Register the prompt-template markers as special tokens and make the embedding
# matrix match the tokenizer's vocabulary size.
# NOTE(review): if the saved tokenizer does not already contain these tokens,
# their embedding rows are newly created here rather than learned -- confirm
# they were added before fine-tuning.
special_tokens = {'additional_special_tokens': ['<|instruction|>', '<|input|>', '<|output|>', '<|end|>']}
tokenizer_infer.add_special_tokens(special_tokens)
model_infer.resize_token_embeddings(len(tokenizer_infer))

# Set the model to evaluation mode (disables dropout and similar train-only layers)
model_infer.eval()

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Place the model on the same device that generate_response sends its inputs to.
model_infer.to(device)

def apply_peft_chat_template_infer(instruction, input_text):
    """Build the inference prompt in the instruction/input/output template.

    The instruction and the input are each wrapped in their marker and closed
    with ``<|end|>``; the prompt finishes on an open ``<|output|>`` marker so
    the model's continuation is the answer.

    Args:
        instruction: Text placed after the ``<|instruction|>`` marker.
        input_text: Text placed after the ``<|input|>`` marker.

    Returns:
        The three template lines joined with newlines, with no trailing newline.
    """
    template_lines = [
        f"<|instruction|> {instruction} <|end|>",
        f"<|input|> {input_text} <|end|>",
        "<|output|>",
    ]
    return "\n".join(template_lines)

# Function to perform inference for a single prompt
def generate_response(instruction, input_text, max_length=100, temperature=0.7, top_p=0.9):
    """Sample one completion from the fine-tuned model for a single prompt.

    The prompt is built with ``apply_peft_chat_template_infer``, encoded and
    decoded with ``tokenizer_infer`` (the tokenizer loaded alongside
    ``model_infer``), and the answer is cut out of the decoded text.

    Args:
        instruction: Task description placed in the ``<|instruction|>`` slot.
        input_text: Extra context placed in the ``<|input|>`` slot.
        max_length: Forwarded to ``generate`` as ``max_length``. In Hugging Face
            generation this bounds prompt tokens plus generated tokens, so a
            long prompt leaves less room for the answer.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.

    Returns:
        The stripped text between ``<|output|>`` and the next ``<|end|>`` in the
        decoded sequence. If no ``<|end|>`` follows, everything after the
        marker is returned; if the ``<|output|>`` marker itself is absent, the
        search starts from the beginning of the decoded text.
    """
    # Format the prompt
    prompt = apply_peft_chat_template_infer(instruction, input_text)

    # Tokenize the prompt and move it to the device the model lives on
    input_ids = tokenizer_infer(prompt, return_tensors="pt").input_ids.to(device)

    # Generate the output; no_grad avoids building autograd state for inference
    with torch.no_grad():
        output_ids = model_infer.generate(
            input_ids=input_ids,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            # Use the inference tokenizer, not the training-time `tokenizer`
            # global, which may be undefined or a different object here.
            pad_token_id=tokenizer_infer.eos_token_id,
            do_sample=True,
            num_return_sequences=1,
        )

    # Decode with the same tokenizer that encoded the prompt. Special tokens are
    # kept because the extraction below relies on the template markers.
    generated_text = tokenizer_infer.decode(output_ids[0], skip_special_tokens=False)

    # Extract the output part. str.find returns -1 when the marker is missing;
    # adding len(marker) to that would silently skip the first characters.
    output_marker = "<|output|>"
    marker_pos = generated_text.find(output_marker)
    output_start = marker_pos + len(output_marker) if marker_pos != -1 else 0
    output_end = generated_text.find("<|end|>", output_start)
    if output_end == -1:
        # Generation stopped (e.g. hit max_length) before emitting <|end|>.
        return generated_text[output_start:].strip()
    return generated_text[output_start:output_end].strip()

In [None]:
# train
# NOTE(review): `trainer` and `tokenizer` are not defined in the cells shown
# here -- presumably created in an earlier setup cell; confirm before running.
trainer.train()

# Write the model and the tokenizer into one directory so both can be loaded
# later from the same path.
finetuned_dir = "./phi-3-mini-custom"
trainer.save_model(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

In [None]:
# test
# Example usage: push one instruction/input pair through generate_response.
# NOTE(review): "intergers" in the prompt below is misspelled; it is left
# as-is here because changing the prompt changes what the model is asked.
instruction = "Generate a python function to print intergers."
input_text = "The function should take a number and print till that."

# Sample a completion with the default generation settings and show it under a heading
response = generate_response(instruction=instruction, input_text=input_text)
print("Generated Response:", response, sep="\n")