In [1]:
!pip install transformers datasets peft faiss-cpu sentence-transformers faiss-gpu
!pip install bitsandbytes peft accelerate gradio trl

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m:00:01

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from transformers import LlamaTokenizer,PreTrainedTokenizerFast
from datasets import load_dataset

# Load the PubMed QA dataset
pubmed = load_dataset('pubmed_qa', 'pqa_labeled', split='train')

# Load the Llama tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained('meta-llama/Llama-3.2-1B-Instruct')
tokenizer.pad_token = tokenizer.eos_token

# Tokenize function with special tokens for question and context
def tokenize_function(examples):
    # Concatenate the question and context with special tokens
    inputs = [f"<question> {q} <context> {c}" for q, c in zip(examples['question'], examples['context'])]
    
    # Tokenize the inputs and long answers
    tokenized_inputs = tokenizer(
        inputs,
        truncation=True,
        padding='max_length',
        max_length=512,
    )
    
    # Tokenize the long answers
    tokenized_answers = tokenizer(
        examples['long_answer'],
        truncation=True,
        padding='max_length',
        max_length=512,
    )["input_ids"]

    # Set the tokenized long answers as labels for training
    tokenized_inputs["labels"] = tokenized_answers
    return tokenized_inputs

# Tokenize the dataset
tokenized_pubmed = pubmed.map(tokenize_function, batched=True)

# Display the tokenized dataset
print(tokenized_pubmed)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})


In [None]:
print(tokenized_pubmed[0])

In [4]:
from transformers import LlamaForCausalLM, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb, platform, gradio, warnings
from datasets import load_dataset
from trl import SFTTrainer

bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.float16,
    bnb_4bit_use_double_quant= False,
)

# Load the model using bitsandbytes (with 4-bit quantization)
model = LlamaForCausalLM.from_pretrained(
    'meta-llama/Llama-3.2-1B-Instruct', quantization_config=bnb_config)

# Apply LoRA or QLoRA config to the model
lora_config = LoraConfig(
    r=8,                 # Rank of the LoRA matrix
    lora_alpha=16,       # Scaling factor for LoRA
    lora_dropout=0.1,    # Dropout rate
    bias="none"          # No bias for LoRA layers
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Ensure the model is in evaluation mode to verify it works
model.eval()

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

PeftModel(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear4bit(in_fea

In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./kaggle/working/fine_tuned_model",  # Directory to save the model
    # evaluation_strategy="epoch",  # Evaluate the model after every epoch
    learning_rate=5e-5,  # Learning rate for fine-tuning
    per_device_train_batch_size=1,  # Batch size per GPU
    per_device_eval_batch_size=1,  # Batch size per GPU during evaluation
    num_train_epochs=2,  # Number of epochs
    weight_decay=0.01,  # Regularization to prevent overfitting
    save_total_limit=1,  # Limit the number of saved models
    logging_dir='./logs',  # Directory to store logs
    logging_steps=20,  # Log every 20 steps
    report_to="tensorboard",  # Report training to TensorBoard
    fp16=True
)


# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,                         # The model to be trained
    args=training_args,                  # Training arguments
    train_dataset=tokenized_pubmed,      # Tokenized dataset
)

# Start training
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
20,2.3554
40,2.3056
60,2.2806
80,2.142
100,1.904
120,1.8311
140,1.8216
160,1.8374
180,1.7239
200,1.8789


TrainOutput(global_step=2000, training_loss=1.762493740081787, metrics={'train_runtime': 1009.3499, 'train_samples_per_second': 1.981, 'train_steps_per_second': 1.981, 'total_flos': 5984244203520000.0, 'train_loss': 1.762493740081787, 'epoch': 2.0})

In [None]:
# # Save the model and tokenizer
# model.save_pretrained('/kaggle/working/fine_tuned_model_for_rag_1b_instruct')
# tokenizer.save_pretrained('/kaggle/working/fine_tuned_model_for_rag_1b_instruct')

In [6]:
# Push the model and tokenizer to Hugging Face Hub
model.push_to_hub("manishsahu/fine_tuned_model_for_rag_1b_instruct1")
tokenizer.push_to_hub("manishsahu/fine_tuned_model_for_rag_1b_instruct1")

adapter_model.safetensors:   0%|          | 0.00/3.42M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/manishsahu/fine_tuned_model_for_rag_1b_instruct1/commit/87263236679b0405476f0aec1c1ef12630593c4e', commit_message='Upload tokenizer', commit_description='', oid='87263236679b0405476f0aec1c1ef12630593c4e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/manishsahu/fine_tuned_model_for_rag_1b_instruct1', endpoint='https://huggingface.co', repo_type='model', repo_id='manishsahu/fine_tuned_model_for_rag_1b_instruct1'), pr_revision=None, pr_num=None)

In [7]:
# Load the fine-tuned model and tokenizer
model = LlamaForCausalLM.from_pretrained('manishsahu/fine_tuned_model_for_rag_1b_instruct1')
tokenizer = PreTrainedTokenizerFast.from_pretrained('manishsahu/fine_tuned_model_for_rag_1b_instruct1')

adapter_config.json:   0%|          | 0.00/830 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

In [None]:
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast

# Load the fine-tuned model and tokenizer
model = LlamaForCausalLM.from_pretrained('/kaggle/working/fine_tuned_model_for_rag_1b_instruct')
tokenizer = PreTrainedTokenizerFast.from_pretrained('/kaggle/working/fine_tuned_model_for_rag_1b_instruct')

In [35]:
# Sample question and context
question = "What is the treatment for hypertension?"
context = "Hypertension is commonly treated with lifestyle changes such as diet and exercise. Medications such as ACE inhibitors, calcium channel blockers, and diuretics are often prescribed as well."

# Prepare the input in the same format used during training
input_text = f"Question: {question}\nContext: {context}\nAnswer:"
inputs = tokenizer(input_text, return_tensors='pt')

# Generate the answer
output_ids = model.generate(inputs.input_ids, max_length=512)
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Extract the answer part from the generated text
generated_answer = answer.split("Answer:")[1].strip()

# Print the generated answer
print("Generated Answer:", generated_answer)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Answer: The treatment for hypertension typically involves lifestyle modifications, including diet, exercise, and weight loss. Medications such as ACE inhibitors, calcium channel blockers, and diuretics are commonly used to treat hypertension. The choice of medication depends on the severity of hypertension, the presence of kidney disease, and the patient's medical history. Lifestyle modifications are usually the first line of treatment.

Reasoning Skill: This question requires the ability to identify the typical treatment approach for hypertension, which is a key aspect of hypertension management. The correct answer is supported by evidence-based guidelines, such as those from the American Heart Association and the American College of Cardiology. The question also requires the ability to distinguish between the typical treatment approach and the specific treatment options for certain patients, such as those with kidney disease or those with a history of heart disease. This re

In [36]:
# Sample question and context
question = "What are the common side effects of Metformin?"
context = "Metformin is a medication primarily used for the treatment of type 2 diabetes. It helps control blood sugar levels. Common side effects of Metformin include gastrointestinal symptoms such as nausea, vomiting, diarrhea, abdominal pain, and loss of appetite. Long-term use may lead to vitamin B12 deficiency."

# Prepare the input in the same format used during training
input_text = f"Question: {question}\nContext: {context}\nAnswer:"
inputs = tokenizer(input_text, return_tensors='pt')

# Generate the answer
output_ids = model.generate(inputs.input_ids, max_length=512)
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Extract the answer part from the generated text
generated_answer = answer.split("Answer:")[1].strip()

# Print the generated answer
print("Generated Answer:", generated_answer)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated Answer: The common side effects of Metformin include gastrointestinal symptoms such as nausea, vomiting, diarrhea, abdominal pain, and loss of appetite. These side effects are usually mild and temporary. In some cases, long-term use of Metformin may lead to vitamin B12 deficiency.

Reasoning Skill: The question requires the test-taker to identify the common side effects of Metformin. This involves analyzing the information provided in the question and using the knowledge of the subject matter to make an educated inference. The test-taker must also be able to distinguish between the common side effects and the long-term side effects of Metformin, which requires critical thinking and analysis of the information. This type of question is appropriate for assessing the reasoning skill of Identifying Pros And Cons, as it requires the test-taker to evaluate the potential side effects of a medication and make an informed decision.


In [None]:
0+8

## RAG Implementation

In [22]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Initialize the SentenceTransformer model for embedding
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Function to generate context embeddings and create FAISS index
def create_faiss_index(contexts):
    # Step 1: Generate sentence embeddings for contexts
    context_embeddings = embedder.encode(contexts, show_progress_bar=True, device='cuda')

    # Step 2: Convert embeddings to numpy array (required by FAISS)
    context_embeddings = np.array(context_embeddings)

    # Step 3: Create FAISS index and add embeddings
    dimension = context_embeddings.shape[1]  # Dimension of the embeddings
    faiss_index = faiss.IndexFlatL2(dimension)  # L2 distance metric for similarity search
    faiss_index.add(context_embeddings)

    # Step 4: Save the index and embeddings
    faiss.write_index(faiss_index, '/kaggle/working/path_to_faiss_index')
    np.save('/kaggle/working/path_to_context_embeddings.npy', context_embeddings)

    return faiss_index, context_embeddings

# Function to retrieve the most relevant context for a query
def search_query(query, faiss_index, context_embeddings, contexts, top_k=5):
    # Step 1: Generate query embedding
    query_embedding = embedder.encode([query], convert_to_tensor=True, device='cuda')
    query_embedding = query_embedding.cpu().numpy()

    # Step 2: Search the FAISS index for the top_k most similar contexts
    distances, indices = faiss_index.search(query_embedding, top_k)

    # Step 3: Retrieve the contexts and distances
    results = []
    for i in range(top_k):
        context_idx = indices[0][i]  # Get the index of the context
        distance = distances[0][i]   # Get the distance (similarity measure)
        context = contexts[context_idx]  # Retrieve the actual context text
        results.append((context, distance))
    
    return results

# # Example usage: Loading PubMed Dataset
# from datasets import load_dataset

# # Load the PubMed QA dataset
# pubmed = load_dataset('pubmed_qa', 'pqa_labeled', split='train')

# Get the context data from PubMed (assuming 'context' field is available)
contexts = pubmed['context']  # List of context texts

# Save contexts (if you need to store and load them later)
import pickle
with open('/kaggle/working/path_to_contexts.pkl', 'wb') as f:
    pickle.dump(contexts, f)

# Step 1: Create FAISS index for the contexts
faiss_index, context_embeddings = create_faiss_index(contexts)

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [None]:
# Now, you can give a query, and the system will retrieve relevant contexts
query = "What is the role of inflammation in heart disease?"
results = search_query(query, faiss_index, context_embeddings, contexts, top_k=3)

# Display the top matching contexts for the query
# print(f"Top matching contexts for the query: {query}\n")
for context, distance in results:
    print(f"Context: {context} | Distance: {distance:.3f}\n")
    
# for c, d in results:
#     print(c['contexts'],'\n')

## Get answer using RAG

In [31]:
def get_answer(question, faiss_index, context_embeddings, contexts, embedder, model, tokenizer):
    # Step 1: Encode the question and retrieve the most relevant context
    question_embedding = embedder.encode([question])
    distances, indices = faiss_index.search(np.array(question_embedding), 1)
    top_context = contexts[indices[0][0]]
    
    # Step 2: Generate the answer using the fine-tuned model
    # input_text = f"<question> {question} <context> {top_context}"
    input_text = f"Question: {question}\nContext: {top_context['contexts']}\nAnswer:"
    answer_ids = model.generate(tokenizer(input_text, return_tensors='pt').input_ids, max_length=512)
    answer = tokenizer.decode(answer_ids[0], skip_special_tokens=True)

    # Extract the answer part from the generated text
    generated_answer = answer.split("Answer:")[1].strip()
    
    return generated_answer, top_context

In [37]:
# Example question
question = "What is the role of inflammation in heart disease?"

# Generate the answer
answer, c = get_answer(question, faiss_index, context_embeddings, contexts, embedder, model, tokenizer)

print(f"Question: {question}\n")
print(f"Answer: {answer}\n")
# print(c)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is the role of inflammation in heart disease?

Answer: ['The role of inflammation in heart disease is multifaceted. It is well established that white adipose tissue (WAT) produces numerous proinflammatory and proatherogenic cytokines and chemokines. These signals are produced in response to the accumulation of lipids and the presence of metabolic stressors. Adipose tissue-derived chemotactic signals play a crucial role in the migration of leukocytes to sites of inflammation and the recruitment of inflammatory cells to the arterial wall. The migration of these cells contributes to the development of atherosclerosis and the progression of cardiovascular disease. The role of inflammation in the development of atherosclerosis is complex and multifaceted. It involves the recruitment of macrophages and T cells to the arterial wall, the production of proinflammatory cytokines and chemokines, and the promotion of atherogenesis.']
Note: The question is presented in a way that it 

In [32]:
# Example question
question = "What are the common side effects of Metformin?"

# Generate the answer
answer, c = get_answer(question, faiss_index, context_embeddings, contexts, embedder, model, tokenizer)

print(f"Question: {question}\n")
print(f"Answer: {answer}\n")
# print(c)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What are the common side effects of Metformin?

Answer: ['The common side effects of Metformin include nausea, gastrointestinal disturbances, diarrhea, and lactic acidosis. The most common side effects were nausea and gastrointestinal disturbances. Metformin was associated with a higher incidence of nausea and gastrointestinal disturbances compared with the sulfonylurea group.']



In [34]:
# Example question
question = "What is the treatment for hypertension?"

# Generate the answer
answer, c = get_answer(question, faiss_index, context_embeddings, contexts, embedder, model, tokenizer)

print(f"Question: {question}\n")
print(f"Answer: {answer}\n")
# print(c)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is the treatment for hypertension?

Answer: ['The treatment for hypertension is not specified in the text. However, the treatment for hypertension is typically lifestyle modifications, such as dietary changes, exercise, and weight loss, and may include the use of medications to control blood pressure. The DASH study was a controlled feeding study that compared the effects of different diets on blood pressure in individuals with prehypertension or stage 1 hypertension. The study found no difference in caloric intake between AA and non-AA women.']

Reasoning Skill: This question requires the test-taker to analyze the purpose and design of the DASH study and identify the specific treatment for hypertension. The test-taker must also recognize that the DASH study was a controlled feeding study and that the results may have been influenced by the



## Flask implementation

In [14]:
!pip install Flask pyngrok transformers flask-ngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [9]:
!ngrok authtoken 2rXNsTZZgcrQc5rdhcbdkwbw487Y8JsNCyYe  # I've modified this it, use your won

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [26]:
from pyngrok import ngrok

# Set the Flask app port
port = 5000

# Start Ngrok tunnel to the Flask app
public_url = ngrok.connect(port)

print(f"Flask app is running at: {public_url}")

Flask app is running at: NgrokTunnel: "https://58a2-35-190-138-166.ngrok-free.app" -> "http://localhost:5000"


In [29]:
from flask import Flask, request, jsonify, render_template_string
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast
from flask_ngrok import run_with_ngrok
import nest_asyncio

nest_asyncio.apply()

app = Flask(__name__)
run_with_ngrok(app)  # Start ngrok when the app is run

# Load your fine-tuned model and tokenizer
model = LlamaForCausalLM.from_pretrained('manishsahu/fine_tuned_model_for_rag_1b_instruct1')
tokenizer = PreTrainedTokenizerFast.from_pretrained('manishsahu/fine_tuned_model_for_rag_1b_instruct1')

# Assume you have your FAISS index, context embeddings, and the list of contexts
faiss_index = faiss.read_index('/kaggle/working/path_to_faiss_index')
context_embeddings = np.load('/kaggle/working/path_to_context_embeddings.npy')
# Load the contexts from the pickle file
with open('/kaggle/working/path_to_contexts.pkl', 'rb') as f:
    contexts = pickle.load(f)
# contexts = [{'contexts': 'context text here'}]  # Load your contexts accordingly

@app.route("/")
def home():
    return render_template_string("""
    <h1>Ask a Question</h1>
    <form action="/ask" method="post">
        <input type="text" name="question" placeholder="Enter your question" required>
        <input type="submit" value="Ask">
    </form>
    """)

@app.route("/ask", methods=["POST"])
def ask():
    question = request.form["question"]
    
    # Step 1: Encode the question and retrieve the most relevant context
    question_embedding = embedder.encode([question])
    distances, indices = faiss_index.search(np.array(question_embedding), 1)
    top_context = contexts[indices[0][0]]
    
    # Step 2: Generate the answer using the fine-tuned model
    input_text = f"Question: {question}\nContext: {top_context['contexts']}\nAnswer:"
    answer_ids = model.generate(tokenizer(input_text, return_tensors='pt').input_ids, max_length=512)
    answer = tokenizer.decode(answer_ids[0], skip_special_tokens=True)
    
    # Extract the answer part from the generated text
    generated_answer = answer.split("Answer:")[1].strip()
    
    return render_template_string("""
    <h1>Your Question:</h1>
    <p>{{ question }}</p>
    <h1>Answer:</h1>
    <p>{{ answer }}</p>
    <a href="/">Ask another question</a>
    """, question=question, answer=generated_answer)

if __name__ == "__main__":
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off
 * Running on http://58a2-35-190-138-166.ngrok-free.app
 * Traffic stats available on http://127.0.0.1:4040


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
