In [1]:
#!pip install accelerate peft bitsandbytes transformers trl

# 

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably needed so the later from_pretrained calls can access
# the model repository — confirm whether model_id actually requires a login.
from huggingface_hub import notebook_login
notebook_login()

In [3]:
# Import dependencies
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os
import pandas as pd

from transformers import GenerationConfig
from time import perf_counter

In [4]:
# Identifier of the fine-tuned checkpoint; it is passed to
# get_model_and_tokenizer() and from there to both from_pretrained() calls.
model_id = "marcelolimagomes/llama3.18B-Fine-tuned_FIAP"
# Alternative checkpoint (an "Instruct" variant, judging by its name); uncomment to evaluate it instead.
#model_id = "marcelolimagomes/llama3.18B-Instruct-Fine-tuned_FIAP"

In [5]:
# train_prompt = \
# """<|im_start|>user
# Product name [{}]<|im_end|>
# <|im_start|>assistant 
# Review: {}
#"""

# Prompt template with a single positional `{}` placeholder for the book title.
# It is one user turn followed by an open assistant turn, delimited by
# <|im_start|>/<|im_end|> markers, so generation continues as the assistant.
# NOTE(review): presumably this must match the template used during fine-tuning — confirm.
train_prompt = (
    "<|im_start|>user\n"
    "Write a review of book [{}]<|im_end|>\n"
    "<|im_start|>assistant\n"
)

In [6]:
def get_model_and_tokenizer(model_id):
  """Load a 4-bit quantized causal LM and its tokenizer.

  Args:
    model_id: Identifier forwarded to both ``from_pretrained`` calls.

  Returns:
    A ``(model, tokenizer)`` tuple.
  """
  # Tokenizer first; the EOS token doubles as the padding token.
  tok = AutoTokenizer.from_pretrained(model_id)
  tok.pad_token = tok.eos_token

  # 4-bit NF4 quantization with double quantization and float16 compute.
  quantization_options = dict(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype="float16",
      bnb_4bit_use_double_quant=True,
  )
  quant_config = BitsAndBytesConfig(**quantization_options)

  llm = AutoModelForCausalLM.from_pretrained(
      model_id,
      device_map="auto",
      quantization_config=quant_config,
  )

  # NOTE(review): both settings look like carry-overs from a training script —
  # confirm they are wanted for an inference-only notebook.
  llm.config.use_cache = False
  llm.config.pretraining_tp = 1

  return llm, tok

In [7]:
def formatted_prompt(question) -> str:
  """Return the module-level ``train_prompt`` template filled in with ``question``."""
  filled_prompt = train_prompt.format(question)
  return filled_prompt

In [8]:
def generate_response(user_input, model, tokenizer):
  """Generate text for ``user_input``, print it with the elapsed time, and return it.

  Args:
    user_input: Value inserted into the prompt template by ``formatted_prompt``.
    model: Model whose ``generate`` method is called with the tokenized prompt.
    tokenizer: Tokenizer used to encode the prompt and decode the output.

  Returns:
    The decoded text of ``outputs[0]`` (special tokens skipped). The same
    text is also printed, followed by the time taken.
  """
  prompt = formatted_prompt(user_input)

  generation_config = GenerationConfig(
    # NOTE(review): penalty_alpha is a contrastive-search setting and is
    # presumably ignored while do_sample=True — confirm which strategy is intended.
    penalty_alpha=0.6,
    do_sample=True,
    top_k=5,
    temperature=0.1,
    repetition_penalty=1.2,
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id
  )

  start_time = perf_counter()
  # Tokenize once and place the tensors on the model's own device; fall back
  # to 'cuda' (the previous hard-coded value) if the model exposes no `.device`.
  target_device = getattr(model, "device", "cuda")
  inputs = tokenizer(prompt, return_tensors="pt").to(target_device)
  outputs = model.generate(**inputs, generation_config=generation_config)
  # Decode once and reuse the string for both printing and the return value.
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
  print(response)
  output_time = perf_counter() - start_time
  print(f"Time taken for inference: {round(output_time,2)} seconds")
  return response

In [None]:
# Load the quantized model and its tokenizer once; the evaluation loop reuses both.
model, tokenizer = get_model_and_tokenizer(model_id)

In [None]:
# Qualitative spot check: print the original review next to a generated one.
# NOTE(review): judging by the file name, these rows were also used for
# training, so this is not a held-out evaluation — confirm.
df = pd.read_csv('./data_used_to_train.csv', sep=';')
# Shuffle all rows and keep the first five. No random_state is set, so a
# different set of rows is picked on every run.
for _, row in df.sample(frac=1).head(5).iterrows():
  title = row['title']
  content = row['content']
  print(f"Product name [{title}]")
  print(f"Original review:\n {content}")
  print(f"Generated review:")
  print('---------------------------------------------------------------')
  # generate_response prints the generated text and the inference time itself.
  generate_response(title, model, tokenizer)
  print('---------------------------------------------------------------')