In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from IPython.display import  clear_output
import time

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

DEFAULT_MODEL = "meta-llama/Llama-3.2-3B-Instruct"

# Configure 4-bit quantization
# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type="nf4",  # Normalized float 4-bit (recommended)
    bnb_4bit_compute_dtype=torch.float16,  
    bnb_4bit_use_double_quant=True  # Improves performance by applying second quantization
)

model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    quantization_config=bnb_config,
    # torch_dtype=torch.bfloat16,
    device_map=device,
)

tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
model.generation_config.pad_token_id = 128001

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:

conversation = [
    {"role": "system", "content": f'''You are a helpful assistant'''},
    {"role": "user", "content": f'''who is Modi'''},
]
# 
prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt").to(device)


with torch.no_grad():
    output = model.generate(
        **inputs,
        do_sample=False,
        max_new_tokens=256
    )


processed_text = tokenizer.decode(output[0][len(inputs.input_ids[0])+3:], skip_special_tokens=True)

print(processed_text)





"Modi" can refer to several individuals, but I'll provide information on the most well-known ones:

1. **Narendra Modi**: He is the current Prime Minister of India, serving since 2014. Born on September 17, 1950, in Vadnagar, Madhya Pradesh, India, Modi is a member of the Bharatiya Janata Party (BJP). He has been a prominent figure in Indian politics for over three decades, serving as the Chief Minister of Gujarat from 2001 to 2014.
2. **Suresh Modi**: An Indian politician from the Bharatiya Janata Party (BJP), Suresh Modi is the current Deputy Chief Minister of Uttar Pradesh, serving since 2017. He is the brother of Narendra Modi.
3. **Dilip Modi**: An Indian politician from the Bharatiya Janata Party (BJP), Dilip Modi is a Member of the Legislative Assembly (MLA) from the Gujarat Legislative Assembly, representing the Surat North constituency.

If you could provide more context or specify which Modi you are referring to, I'd be happy to provide more information.


In [21]:
def update_chat_history(ai_response, user_query, chat_history):
    chat_history.extend([{"role": "assistant", "content": ai_response},
                         {"role": "user", "content": user_query}])
    return chat_history

In [22]:
conversation2 = update_chat_history(ai_response=processed_text,user_query='who is Narendra Modi', chat_history=conversation)

In [27]:
prompt = tokenizer.apply_chat_template(conversation2, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt").to(device)

print(f'token length: {len(inputs.input_ids[0])}')


with torch.no_grad():
    output = model.generate(
        **inputs,
        do_sample=True,
        max_new_tokens=100
    )


processed_text = tokenizer.decode(output[0][len(inputs.input_ids[0])+3:], skip_special_tokens=True)

print(processed_text)

token length: 289


**Narendra Damodardas Modi** (born September 17, 1950) is the current Prime Minister of India, serving since 2014. He is a member of the Bharatiya Janata Party (BJP) and is widely known for his conservative and nationalist policies.

**Early Life and Education:**

Modi was born in Vadnagar, Madhya Pradesh, India, to a Hindu family. His father, Damodardas Modi,


In [None]:
proce

In [6]:
model.generate?

[0;31mSignature:[0m
[0mmodel[0m[0;34m.[0m[0mgenerate[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0minputs[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgeneration_config[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mgeneration[0m[0;34m.[0m[0mconfiguration_utils[0m[0;34m.[0m[0mGenerationConfig[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlogits_processor[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mgeneration[0m[0;34m.[0m[0mlogits_process[0m[0;34m.[0m[0mLogitsProcessorList[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstopping_criteria[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mgeneration[0m[0;34m.[0m[0mstopping_criteria[0m[0;34m.[0m[0mStoppingCriteria