In [1]:
from dotenv import load_dotenv
import os

In [2]:
from huggingface_hub import HfApi, HfFolder

token = os.getenv("HF_API_TOKEN")  # Replace with your actual token
HfFolder.save_token(token)


In [3]:
# Installing More Dependencies
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

In [4]:
model_id="meta-llama/Meta-Llama-3.1-8B-Instruct"

In [5]:
def get_model_and_tokenizer(model_id):
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  tokenizer.pad_token = tokenizer.eos_token
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
  )
  model = AutoModelForCausalLM.from_pretrained(
      model_id, quantization_config=bnb_config, device_map="auto"
  )
  model.config.use_cache=False
  model.config.pretraining_tp=1
  return model, tokenizer

In [6]:
model, tokenizer = get_model_and_tokenizer(model_id)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
from transformers import GenerationConfig
from time import perf_counter
def generate_response(user_input):
  prompt = formatted_prompt(user_input)
  inputs = tokenizer([prompt], return_tensors="pt")
  generation_config = GenerationConfig(penalty_alpha=0.6,do_sample = True,
      top_k=5,temperature=0.5,repetition_penalty=1.2,
      max_new_tokens=60,pad_token_id=tokenizer.eos_token_id
  )
  start_time = perf_counter()
  inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
  outputs = model.generate(**inputs, generation_config=generation_config)
  theresponse = (tokenizer.decode(outputs[0], skip_special_tokens=True))
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  output_time = perf_counter() - start_time
  print(f"Time taken for inference: {round(output_time,2)} seconds")

In [8]:
def formatted_prompt(question)-> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"

In [9]:
generate_response(user_input='How do I prevent a phishing email?')

<|im_start|>user
How do I prevent a phishing email?<|im_end|>
<|im_start|>assistant: Preventing phishing emails involves being cautious and vigilant when receiving unsolicited messages, especially those with links or attachments. Here are some tips to help you:

1\. Verify the sender's identity:
   Check if the message is from a known source.
2\. Look for spelling mistakes and grammatical errors:

Time taken for inference: 3.69 seconds


In [None]:
from datasets import Dataset

# Your few-shot examples
dataset = [
    {
        "abstract": "This study investigates the transport of glucose by SGLT1 in human intestinal cells.",
        "question": "Does this abstract provide laboratory or experimental evidence that substrate X is transported by transporter protein Y?",
        "confidence_score": 3,
        "justification": "The abstract mentions investigation of glucose transport by SGLT1 but does not provide explicit experimental evidence or outcomes."
    },
    {
        "abstract": "The effects of various inhibitors on non-specific diffusion were analyzed.",
        "question": "Does this abstract provide laboratory or experimental evidence that substrate X is transported by transporter protein Y?",
        "confidence_score": 0,
        "justification": "The abstract focuses solely on non-specific diffusion and the effects of inhibitors, without mentioning substrate transport by a specific protein."
    },
    {
        "abstract": "ATP-sensitive potassium (K (ATP) ) channels are multimeric protein complexes...",
        "question": "Does this abstract provide laboratory or experimental evidence that substrate X is transported by transporter protein Y?",
        "confidence_score": 4,
        "justification": "The abstract identifies transporter proteins and their roles, but does not provide direct evidence of substrate transport."
    },
    {
        "abstract": "Membrane transporters that use energy stored in sodium gradients to drive nutrients...",
        "question": "Does this abstract provide laboratory or experimental evidence that substrate X is transported by transporter protein Y?",
        "confidence_score": 7,
        "justification": "The abstract discusses structural insights and galactose binding to vSGLT, strongly implying substrate transport, but lacks direct experimental transport evidence."
    },
]

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_list(dataset)


In [None]:
def preprocess_function(examples):
    inputs = [f"Abstract: {abstract}\nQuestion: {question}" for abstract, question in zip(examples["abstract"], examples["question"])]
    outputs = examples["justification"]
    return {"input_ids": tokenizer(inputs, truncation=True, padding="max_length", max_length=512)["input_ids"],
            "labels": tokenizer(outputs, truncation=True, padding="max_length", max_length=512)["input_ids"]}

tokenized_dataset = train_dataset.map(preprocess_function, batched=True)


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",  # No validation since you're evaluating manually
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=10,  # More epochs due to limited examples
    save_total_limit=2,
    logging_dir="./logs",
    fp16=True,
)


In [None]:
from transformers import Trainer, AutoModelForCausalLM, AutoTokenizer, DataCollatorForSeq2Seq, AutoProcessor

# Load model and tokenizer
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id, device_map={"": "cpu"})
checkpoint = "meta-llama/Meta-Llama-3.1-8B-Instruct"
processor = AutoProcessor.from_pretrained(checkpoint)

data_collator = DataCollatorForSeq2Seq(
   tokenizer=processor,
    model=model,
)



In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class = processor,
)

In [None]:
import os
os.environ["WANDB_API_KEY"] = os.getenv("WANDB_API_KEY")


In [None]:
trainer.train()
