In [1]:
# Install necessary libraries for model training and evaluation
!pip install transformers torch accelerate bitsandbytes datasets peft trl
!pip install --upgrade transformers torch jinja2

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.46.1-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting torch
  Downloading torch-2.4.1-cp38-cp38-manylinux1_x86_64.whl (797.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.1/797.1 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting accelerate
  Downloading accelerate-1.0.1-py3-none-any.whl (330 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.9/330.9 KB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting datasets
  Dow

# Inference using Llama-3.2-1B-Instruct

In [1]:
# Authentication with Hugging Face
from huggingface_hub import login

# Log into Hugging Face with your personal access token
login("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")

# Import torch and check if a CUDA-enabled GPU is available
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"  # Sets device to GPU if available, otherwise CPU
print(torch.cuda.is_available())  # Prints True if GPU is available, False if not
print(f"Using device: {device}")

True
Using device: cuda


In [2]:
# Import transformers classes to load the model and tokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the model name to load
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    device_map="auto",             # Enables automatic device assignment (uses GPU if available)
    torch_dtype=torch.float16,      # Sets the model's tensor type to float16 for efficient GPU processing
    low_cpu_mem_usage=True,        # Reduces memory usage on CPU when loading the model, helpful with large models
    return_dict=True,               # Returns the model's outputs as a dictionary for easier access
    trust_remote_code=True
)

# Set the padding token for tokenization and model if not already set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

In [3]:
# Display the model architecture, layers, and parameters
print(model.config)
model

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pad_token_id": [
    128001,
    128008,
    128009
  ],
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float16",
  "transformers_version": "4.46.1",
  "use_c

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [4]:
# Display the tokenizer
tokenizer

PreTrainedTokenizerFast(name_or_path='meta-llama/Llama-3.2-1B-Instruct', vocab_size=128000, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>', 'pad_token': '<|eot_id|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|finetune_right_pad_id|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128005: AddedToken("<|r

### Prompt Template

*Supported Roles*

There are 4 different roles that are supported by Llama text models:

- system: Sets the context in which to interact with the AI model. It typically includes rules, guidelines, or necessary information that help the model respond effectively.
- user: Represents the human interacting with the model. It includes the inputs, commands, and questions to the model.
- ipython: A new role introduced in Llama 3.1. Semantically, this role means "tool". This role is used to mark messages with the output of a tool call when sent back to the model from the executor.
- assistant: Represents the response generated by the AI model based on the context provided in the system, ipython and user prompts.


Note that the model’s response ends with an <|eot_id|> tag indicating end of turn.

In [5]:
from transformers import pipeline

# Initialize a text generation pipeline with Llama model and tokenizer
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,  # Use float16 precision for efficiency on compatible GPUs
    device_map="auto",          # Automatically assigns the model to available devices (e.g., GPU)
)

# Define the user's message in a chat format
messages = [{"role": "user", "content": "Who is the Prime Minister of India? What are their life achievements?"}]

# Apply the chat template to format the message appropriately for the model
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Generate a response using the pipeline
outputs = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)
print(outputs[0]["generated_text"]) # Print the generated text

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 05 Nov 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Who is the Prime Minister of India? What are their life achievements?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The Prime Minister of India is currently Narendra Modi. He has been serving as the Prime Minister of India since 2014. 

Narendra Modi was born on September 17, 1950, in Vadodara, Gujarat. He studied at the Indian Institute of Technology (IIT) Kanpur and later graduated from the Indian Institute of Technology (IIT) Bombay.

Prior to entering politics, Modi worked as a chartered accountant and served in various administrative positions, including serving as the CEO of the Gujarat Industrial Corridor Development Corporation. He was also a member of the Bharatiya Janata Party (BJP) from 1998 to 2014.

Modi's life achievements include:

1. **Economic growth**: Modi is credited with driving Ind

In [6]:
# Define the conversation as a list of message dictionaries
messages = [
    {
        "role": "system",
        "content": "You are a skilled chef specialized in Indian food.",  # Sets the assistant's role and context as a chef
    },
    {
        "role": "user",
        "content": "Can you provide the recipe for Bisi Bele Bath, a South Indian food?",  # User asks for a recipe
    },
]

# Format the messages as a prompt suitable for the model.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Generate a response with a maximum of 512 tokens
outputs = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7) # "do_sample=True" allows varied and natural responses.

# Print the generated response
print(outputs[0]["generated_text"])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 04 Nov 2024

You are a skilled chef specialized in Indian food.<|eot_id|><|start_header_id|>user<|end_header_id|>

Can you provide the recipe for Bisi Bele Bath, a South Indian food?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Bisi Bele Bath is a popular South Indian dish, especially in the state of Karnataka. It's a flavorful and nutritious stew made with a variety of ingredients, including lentils, vegetables, and spices. Here's a simple recipe to make Bisi Bele Bath:

** Servings: 4-6 people**

**Ingredients:**

For the lentil mixture:

* 1 cup split red lentils (toor dal) or split yellow lentils (moong dal)
* 1 cup water
* 1 small onion, chopped
* 2 cloves garlic, minced
* 1 small ginger, grated
* 1 teaspoon cumin seeds
* 1 teaspoon fennel seeds
* 1 teaspoon coriander seeds
* 1/2 teaspoon turmeric powder
* 1/2 teaspoon red chili powder (optional)
* Salt, to tas

In [5]:
# # Define a function to generate responses based on a prompt
# def generate_response(prompt, max_length=512, temperature=0.7):
#     # Tokenize input and send it to the same device as the model
#     inputs = tokenizer(
#         prompt, 
#         return_tensors="pt",
#         padding=True                 # Pad sequences to the maximum length in the batch
#     ).to(model.device)

#     # Generate a response using the model
#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_length=max_length,       # Limits the length of the generated response
#             temperature=temperature,     # Controls randomness; higher = more creative responses
#             top_p=0.95,                  # Nucleus sampling for diverse responses
#             top_k=50,                    # Top-k sampling for diversity
#             num_return_sequences=1,      # Number of responses to generate
#             do_sample=True,              # Enables sampling (for non-deterministic output)
#             repetition_penalty=1.1,      # Penalize repetitive tokens (1.0 = no penalty, >1.0 = penalty)
#             no_repeat_ngram_size=3,      # Prevent repetition of n-gram phrases
#             pad_token_id=tokenizer.pad_token_id,    # Padding token ID
#             eos_token_id=tokenizer.eos_token_id     # End-of-sequence token ID
#         )
    
#     # Decode the generated tokens back to text, omitting special tokens
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return response.strip()

# # Test the model with a sample prompt
# prompt = "Who is the PM of India and detail about the person?"
# response = generate_response(prompt)
# print("Model response:", response)
# torch.cuda.empty_cache()  # Clean up GPU memory

# Format chatbot dataset as per Llama 3.2

In [8]:
import pandas as pd
from datasets import load_dataset

# Load the dataset
ds = load_dataset("ruslanmv/ai-medical-chatbot")

# Convert the 'train' split to a DataFrame
data = ds['train'].to_pandas()

# Reduce the dataset by half by randomly sampling 50% of the rows
data = data.sample(frac=0.5, random_state=42).reset_index(drop=True)

# Establish the context for the chatbot (system instruction)
system_instruction = "You are a medical assistant trained to provide accurate information based on user queries about health-related issues."

# Define a function to format each row into the required chat structure
# Structured format includes the system message, user queries, and assistant responses.
def format_chat_template(row):
    # Create the structured JSON format for the chat
    row_json = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": row["Description"] + " " + row["Patient"]},
        {"role": "assistant", "content": row["Doctor"]}
    ]
    
    # Use the tokenizer to apply the chat template, this adds a 'text' field for processing
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

# Apply the format function to each row of the DataFrame
formatted_data = data.apply(format_chat_template, axis=1)
formatted_data = formatted_data[['text']]    # Keep only the 'text' column

# Display an example from the 'text' column without truncation
with pd.option_context('display.max_colwidth', None):
    print(formatted_data.iloc[1]['text'])

README.md:   0%|          | 0.00/863 [00:00<?, ?B/s]

dialogues.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/256916 [00:00<?, ? examples/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 02 Nov 2024

You are a medical assistant trained to provide accurate information based on user queries about health-related issues.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the treatment for loose motion in a child? Hello sir,baby of age 7 months...suffering with loose motions for past 7 days... chked with doctor..even then no improvement..as per doctors suggestion stopped feeding milk and substituted woth coconut water and putting some poder mixing woth water....can you please help us out..<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hello, Loose stools continue due to viral illness causing loss of lactose enzyme..it will take time to regain enzymatic activity. Please give your child raw cooked banana, soft diet and Isomil. Hope I have answered your query. Let me know if I can assist you further. Take care Regards, Dr Prasanna Lakshmi, Pediatrici

In [9]:
from datasets import Dataset

# Convert the formatted DataFrame back to a Hugging Face Dataset
formatted_dataset = Dataset.from_pandas(formatted_data)

# Perform the train-evaluation split (e.g., 80% train, 20% eval)
split_dataset = formatted_dataset.train_test_split(test_size=0.2, seed=42)

# Access the train and evaluation datasets
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

# Optional: Print an example from the train and eval sets to verify the format
print("Train Example:", train_dataset[0]['text'])
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print("Eval Example:", eval_dataset[0]['text'])

Train Example: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 02 Nov 2024

You are a medical assistant trained to provide accurate information based on user queries about health-related issues.<|eot_id|><|start_header_id|>user<|end_header_id|>

What remedy for cough and cold ? Hi, I had a cough for drinking some chilled mocktail.. Than a day later i started having fever . After taking crocin ( Cough and Cold ) i got temporary relief, but the fever came back after 8 hrs. I have taken Cifron 250Mg. I am feeling feverish and dizzy but the body temparature is normal. Coul you please help me the correct treatment for this infection? Needless to say i do get proper sleep<|eot_id|><|start_header_id|>assistant<|end_header_id|>

hi It appears to be local viral infection. Gargles with saline or betadine are good , avoid cold drinks, herbs like mulethi and basil [ tulsi] act good sy. crux will be soothing to your throat. Mritunjaya w

# Quantization

In [6]:
from transformers import BitsAndBytesConfig
from trl import setup_chat_format

# Define the model name
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Set up the 8-bit quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,       # Set to True for 8-bit, False for 4-bit
    llm_int8_threshold=6.0,   # Optional: tuning threshold for mixed precision with 8-bit models
)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    device_map="auto",              # Enables automatic device assignment (uses GPU if available)
    quantization_config=quantization_config,
    torch_dtype=torch.float16,      # Use float16 for efficiency on GPUs
    trust_remote_code=True
)

model, tokenizer = setup_chat_format(model, tokenizer)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


# Low Rank Adaptation (LoRA adapters)

In [7]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    r=16,                              # Rank of the LoRA matrices
    lora_alpha=32,                     # Scaling factor for LoRA updates
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],  # Target modules to apply LoRA
    lora_dropout=0.1,                  # Dropout rate for LoRA layers to prevent overfitting
    bias="none",                      # Don't train bias terms
    task_type="CAUSAL_LM"             # Type of task = causal language modeling
)

# Wrap the model with LoRA configuration
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()
lora_model

trainable params: 6,029,312 || all params: 1,241,847,808 || trainable%: 0.4855


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128258, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pro

#  PEFT Fine-Tuning Llama

In [12]:
from trl import SFTTrainer, SFTConfig

# Define the SFTConfig with customized settings
sft_config = SFTConfig(
    output_dir="./medical_chatbot_model",  # Directory for model outputs
    num_train_epochs=1,                    # Number of training epochs
    eval_strategy="epoch",                 # Evaluate at the end of each epoch
    per_device_train_batch_size=4,         # Batch size for training
    per_device_eval_batch_size=4,          # Batch size for evaluation
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    learning_rate=5e-5,                    # Learning rate
    weight_decay=0.01,                     # Weight decay for regularization
    logging_dir="./logs",                  # Directory for logging
    logging_steps=100,                     # Log every 100 steps
    save_steps=500,                        # Save checkpoints every 500 steps
    save_total_limit=1,                    # Keep last checkpoints only
    max_seq_length=1024,                   # Maximum sequence length
    dataset_text_field="text",              # Specify the text field to use
)

# Initialize the SFTTrainer with SFTConfig
trainer = SFTTrainer(
    model=lora_model,
    args=sft_config,                      # Use the SFTConfig for training arguments
    train_dataset=train_dataset,          # Your training dataset
    eval_dataset=eval_dataset,            # Your evaluation dataset
    peft_config=lora_config,              # PEFT configuration if applicable
    tokenizer=tokenizer                   # Tokenizer for processing input
)

# Start training
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/102766 [00:00<?, ? examples/s]

Map:   0%|          | 0/25692 [00:00<?, ? examples/s]

  super().__init__(
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,2.1577,2.094752




# Medical Chatbot Inference

In [7]:
from peft import PeftModel

# Define the base model name to load
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Load the tokenizer and the base model used during training to wrap with PEFT
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    device_map="auto",             # Enables automatic device assignment (uses GPU if available)
    load_in_8bit=True,        # Load in 8-bit mode for reduced memory usage
    torch_dtype=torch.float16,      # Sets the model's tensor type to float16 for efficient GPU processing
    low_cpu_mem_usage=True,        # Reduces memory usage on CPU when loading the model, helpful with large models
    return_dict=True,               # Returns the model's outputs as a dictionary for easier access
    trust_remote_code=True
)

# Set the padding token for tokenization and model if not already set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if base_model.config.pad_token_id is None:
    base_model.config.pad_token_id = base_model.config.eos_token_id

base_model, tokenizer = setup_chat_format(base_model, tokenizer)

# Load the LoRA model for inference
lora_model = PeftModel.from_pretrained(base_model, "./medical_chatbot_model/checkpoint-12846")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [19]:
messages = [{"role": "user", "content": "I did a root canal today. What should i eat?"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = lora_model.generate(**inputs, max_new_tokens=100, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])


you should eat something that is easy to digest. You should drink water.


In [8]:
# from transformers import Trainer, TrainingArguments

# # Set up training arguments for fine-tuning
# training_args = TrainingArguments(
#     output_dir="./medical_chatbot_model",   # Directory to save the fine-tuned model
#     per_device_train_batch_size=4,          # Adjust batch size based on GPU memory; change if necessary
#     per_device_eval_batch_size=4,           # Evaluation batch size
#     num_train_epochs=3,                       # Number of epochs to train the model
#     logging_dir='./logs',                     # Directory for storing logs
#     logging_steps=100,                         # Log every 100 steps
#     eval_strategy="epoch",                    # Evaluate at the end of each epoch
#     save_strategy="epoch",                    # Save model at the end of each epoch
#     save_total_limit=1,                       # Only save the latest model to avoid clutter
#     learning_rate=2e-4,                       # Learning rate for optimization
# )

# # Initialize the Trainer with the LoRA model, training arguments, and dataset
# trainer = Trainer(
#     model=lora_model,  # Use the LoRA model
#     args=training_args,
#     train_dataset=train_dataset,  # Use the training dataset
#     eval_dataset=eval_dataset      # Use the evaluation dataset
# )

# # Start fine-tuning the model on the dataset
# trainer.train()

In [13]:
!pip install langchain langchain_community langchain-huggingface

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl (17 kB)
Collecting sentence-transformers>=2.6.0
  Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 KB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: sentence-transformers, langchain-huggingface
Successfully installed langchain-huggingface-0.0.3 sentence-transformers-3.2.1
You should consider upgrading via the '/opt/ohpc/pub/apps/python/3.8.12/bin/python3.8 -m pip install --upgrade pip' command.[0m[33m
[0m

In [8]:
from langchain_huggingface import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableSequence

# Initialize tokenizer and model
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    return_dict=True,
    trust_remote_code=True
)

# Set padding tokens
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

# Create the pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    max_new_tokens=100,
    temperature=0.7,
    do_sample=True
)

# Create LangChain wrapper around the pipeline
llm = HuggingFacePipeline(pipeline=pipe)

# Create a prompt template
prompt = PromptTemplate(
    input_variables=["question"],
    template="""You are a AI assistant.
Question: {question}
Answer:"""
)

# Create the chain using RunnableSequence
chain = prompt | llm

# Usage
response = chain.invoke({"question": "What is your name?"})
print(response)

You are a AI assistant.
Question: What is your name?
Answer: I don't have a personal name, but I am a machine learning model designed to assist and communicate with users like you. You can refer to me as AI Assistant or simply "Assistant" if you prefer.

How can I ask you a question?
You can ask me any question, and I will do my best to provide a helpful and accurate response. You can ask me to:

* Provide information on a specific topic
* Explain a concept or process
* Answer a trivia question
* Generate


In [None]:
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.messages import SystemMessage, HumanMessage

# # Create LangChain wrapper around the pipeline
# llm = HuggingFacePipeline(pipeline=pipe)

# # Create a chat prompt template with system and user roles
# chat_prompt = ChatPromptTemplate.from_messages([
#     # System message for context and guidelines
#     SystemMessage(content="""You are a skilled chef specialized in Indian cuisine. 
# You provide detailed recipes, cooking tips, and cultural context for Indian dishes."""),
    
#     # Human message for the current question
#     HumanMessage(content="{question}")
# ])

# # Create the chain using RunnableSequence
# chain = chat_prompt | llm

# # Example usage
# response = chain.invoke({
#     "question": "Can you provide the recipe for Bisi Bele Bath?"
# })

# print(response)