In [None]:
!pip install transformers accelerate sentencepiece datasets

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Start with a small checkpoint so downloads and experiments stay fast.
model_name = "google/flan-t5-small"

# Load the tokenizer and the seq2seq model that belong to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (f

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Zero-Shot Prompting

In [None]:
# Zero-shot prompt: only the instruction and the quoted question, no examples.
question = "Where is Paris?"
prompt = 'Classify the question type: "{}"'.format(question)

In [None]:
# Show the fully rendered zero-shot prompt.
prompt

'Classify the question type: "Where is Paris?"'

In [None]:
# Encode the prompt as PyTorch tensors, let the base model generate at most
# 10 new tokens, then print the decoded text without special tokens.
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=10)

decoded_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_answer)

Paris


In [None]:
# Few-Shot Prompting

In [None]:
# Few-shot prompt: two worked Q/A examples (LOC, HUM) followed by the question
# to classify, ending on an open "A:" for the model to complete.
prompt = """Classify the question type:
Q: Where is New York?
A: LOC

Q: Who is the Prime Minister of India?
A: HUM

Q: How many grams is in kilograms?
A:
"""

In [None]:
# Same generation recipe as for the zero-shot prompt, now applied to the
# few-shot prompt: encode, generate up to 10 new tokens, decode and print.
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=10)

decoded_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_answer)

gram


In [None]:
# The output should be a question-type label (e.g. NUM, HUM, LOC). Clearly, zero-shot and few-shot prompting do not work with the base model.

In [None]:
from datasets import load_dataset

# The "trec" dataset is loaded through a custom loading script (trec.py).
# Opting in explicitly with trust_remote_code=True avoids the interactive
# "[y/N]" prompt, so the notebook can run unattended (Restart & Run All).
# NOTE: this executes code from the dataset repository; inspect it at
# https://hf.co/datasets/trec before trusting it.
dataset = load_dataset("trec", trust_remote_code=True)

# Keep handles on the two splits used below.
train_data = dataset["train"]
test_data = dataset["test"]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

trec.py:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

The repository for trec contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/trec.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5452 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
def convert_to_instruction_format(example):
    """Turn one raw example into an instruction-style input/output pair.

    Args:
        example: Mapping that provides a "text" entry (the question) and a
            "coarse_label" entry (its label).

    Returns:
        A dict with "input" (the question wrapped in the classification
        instruction and double quotes) and "output" (the coarse label,
        passed through unchanged).
    """
    question = example["text"]
    label = example["coarse_label"]
    return dict(input=f'Classify the question type: "{question}"', output=label)

# Add the "input"/"output" columns to both splits (the names are rebound to
# the mapped datasets; the existing columns are kept alongside the new ones).
train_data = train_data.map(convert_to_instruction_format)
test_data = test_data.map(convert_to_instruction_format)

Map:   0%|          | 0/5452 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Look up the class-label feature of the coarse labels and read its list of
# human-readable names.
coarse_label_feature = dataset["train"].features["coarse_label"]
label_names = coarse_label_feature.names
print(label_names)

['ABBR', 'ENTY', 'DESC', 'HUM', 'LOC', 'NUM']


In [None]:
# Integer label id -> label name, following the order of `label_names`.
coarse_label_map = dict(enumerate(label_names))
print(coarse_label_map)

{0: 'ABBR', 1: 'ENTY', 2: 'DESC', 3: 'HUM', 4: 'LOC', 5: 'NUM'}


In [None]:
# Peek at the first instruction-formatted training input.
train_data["input"][0]

'Classify the question type: "How did serfdom develop in and then leave Russia ?"'

In [None]:
# ...and at its target: the integer coarse label id, not the label name.
train_data["output"][0]

2

In [None]:
def preprocess(example):
    """Tokenize one instruction-formatted example for seq2seq fine-tuning.

    Args:
        example: Mapping with an "input" entry (the prompt) and an "output"
            entry (the target label). Both are converted with ``str`` before
            tokenization, so integer labels become their decimal text.

    Returns:
        The tokenizer output for the prompt (truncated/padded to 128 tokens)
        with an added "labels" entry holding the target token ids
        (truncated/padded to 2 tokens).
    """
    input_text = str(example["input"])
    output_text = str(example["output"])

    model_inputs = tokenizer(
        input_text, max_length=128, truncation=True, padding="max_length"
    )

    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=output_text, max_length=2, truncation=True, padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize both splits. batched=False means `preprocess` receives a single
# example dict per call (it calls str() on the individual fields).
tokenized_train = train_data.map(preprocess, batched=False)
tokenized_test = test_data.map(preprocess, batched=False)

Map:   0%|          | 0/5452 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Confirm the tokenizer columns (input_ids, attention_mask, labels) were added.
tokenized_train

Dataset({
    features: ['text', 'coarse_label', 'fine_label', 'input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5452
})

In [None]:
# Token ids of the first target label as produced by `preprocess`.
tokenized_train["labels"][0]

[204, 1]

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Rebind `tokenizer` and `model` to fresh objects loaded from the same
# checkpoint as at the top of the notebook, so fine-tuning starts from the
# pretrained weights rather than from whatever state the earlier objects are in.
model_checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Collator that assembles the tokenized examples into batches for the
# seq2seq trainer; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Hyper-parameters for fine-tuning.
# NOTE: `report_to` is left at its default, so installed logging integrations
# are used; in this notebook's run that was Weights & Biases, which asks for
# an API key on the first `trainer.train()`. Pass report_to="none" to opt out.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    # Without an explicit strategy the default is "no": `eval_steps` is then
    # ignored and no evaluation runs during training.
    eval_strategy="steps",
    eval_steps=400,  # You can tweak this number
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=4,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100,
)

In [None]:
# Wire model, data, collator and arguments into the trainer.
# `processing_class` is the current name for what used to be passed as
# `tokenizer=`; the old keyword is deprecated and emits a FutureWarning.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    data_collator=data_collator,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning with the configured arguments; the returned TrainOutput
# summary (global step, training loss, runtime metrics) is shown as the cell result.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33momkarwazulkar[0m ([33momkarwazulkar1998[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,0.7502
200,0.1761
300,0.1533
400,0.1342
500,0.1016
600,0.0979
700,0.079
800,0.0578
900,0.0609
1000,0.0574


TrainOutput(global_step=1364, training_loss=0.13491821953278482, metrics={'train_runtime': 293.7562, 'train_samples_per_second': 74.238, 'train_steps_per_second': 4.643, 'total_flos': 1013475031646208.0, 'train_loss': 0.13491821953278482, 'epoch': 4.0})

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side in
# "saved_model_" (relative to the current working directory).
model.save_pretrained("saved_model_")
tokenizer.save_pretrained("saved_model_")

('saved_model_/tokenizer_config.json',
 'saved_model_/special_tokens_map.json',
 'saved_model_/spiece.model',
 'saved_model_/added_tokens.json',
 'saved_model_/tokenizer.json')

In [None]:
# Reload from the directory written by save_pretrained("saved_model_").
# The same relative path is used for saving and loading, so this also works
# when the working directory is not Colab's /content.
tokenizer = AutoTokenizer.from_pretrained("saved_model_")
model = AutoModelForSeq2SeqLM.from_pretrained("saved_model_")

In [None]:
# Evaluate on the trainer's eval_dataset (tokenized_test).
# NOTE(review): the trainer was constructed before `model` was rebound to the
# reloaded copy, so this presumably evaluates the in-memory fine-tuned model
# held by the trainer, not the reloaded object — confirm if that matters.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.07216385006904602, 'eval_runtime': 4.545, 'eval_samples_per_second': 110.012, 'eval_steps_per_second': 7.041, 'epoch': 4.0}


In [None]:
# Pick one training example whose "input" prompt is reused in the next cell.
train_data[3]

{'text': 'What fowl grabs the spotlight after the Chinese Year of the Monkey ?',
 'coarse_label': 1,
 'fine_label': 2,
 'input': 'Classify the question type: "What fowl grabs the spotlight after the Chinese Year of the Monkey ?"',
 'output': 1}

In [None]:
# Sanity check: feed the "input" prompt of train_data[3] to the reloaded model
# and print the raw decoded generation.
sample_prompt = 'Classify the question type: "What fowl grabs the spotlight after the Chinese Year of the Monkey ?"'
inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)
decoded_prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_prediction)

1


In [None]:
# Classify a "who" question and translate the generated label id into its name.
sample_prompt = 'Classify the question type: "Who is the Prime Minister of India ?"'
inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)

# The decoded generation is parsed as the integer label id.
decoded_prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
predicted_label_index = int(decoded_prediction)
print(coarse_label_map.get(predicted_label_index, "Unknown Label"))

HUM


In [None]:
# Classify a "where" question and translate the generated label id into its name.
sample_prompt = 'Classify the question type: "Where is Paris ?"'
inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)

# The decoded generation is parsed as the integer label id.
decoded_prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
predicted_label_index = int(decoded_prediction)
print(coarse_label_map.get(predicted_label_index, "Unknown Label"))

LOC


In [None]:
# Classify a "how many" question and translate the generated label id into its name.
sample_prompt = 'Classify the question type: "How many grams is in kilograms?"'
inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)

# The decoded generation is parsed as the integer label id.
decoded_prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
predicted_label_index = int(decoded_prediction)
print(coarse_label_map.get(predicted_label_index, "Unknown Label"))

NUM


In [None]:
# `prompt` still holds the few-shot prompt defined earlier; show it again
# before feeding it to the fine-tuned model.
prompt

'Classify the question type: \nQ: Where is New York?\nA: LOC\n\nQ: Who is the Prime Minister of India?\nA: HUM\n\nQ: How many grams is in kilograms?\nA:\n'

In [None]:
# Run the fine-tuned model on the earlier few-shot prompt and translate the
# generated label id into its name.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)

# The decoded generation is parsed as the integer label id.
decoded_prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
predicted_label_index = int(decoded_prediction)
print(coarse_label_map.get(predicted_label_index, "Unknown Label"))

NUM


In [None]:
from tqdm import tqdm
import torch

# Exact-match accuracy of generated labels over the instruction-formatted test split.
model.eval()

correct = 0
total = 0

for example in tqdm(test_data):
    # Labels are integers in the dataset; compare them as text.
    true_label = str(example["output"]).strip()

    inputs = tokenizer(example["input"], return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=5)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Drop surrounding whitespace and an optional "Label:" prefix before comparing.
    pred = decoded.strip().replace("Label:", "").strip()

    correct += int(pred == true_label)
    total += 1

accuracy = correct / total
print(f"\n📊 Final Accuracy: {accuracy * 100:.2f}% on {total} test examples")

100%|██████████| 500/500 [00:23<00:00, 21.65it/s]


📊 Final Accuracy: 96.20% on 500 test examples





In [None]:
from fastapi import FastAPI, HTTPException # type: ignore
from pydantic import BaseModel # type: ignore
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch # type: ignore

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load tokenizer and model from the local directory and place the model on `device`.
# NOTE(review): this path differs from the "saved_model_" directory written
# earlier in this notebook — confirm which directory is intended.
model_path = "./saved_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

# FastAPI application that serves the model.
app = FastAPI(title="LLM Inference API")

# Request schema: the JSON body carries one string field, `prompt`, holding
# the text handed to the model.
class InputText(BaseModel):
    prompt: str

@app.post("/predict/")
def predict(input: InputText):
    # Generate a completion for `input.prompt` with the locally loaded model.
    # Returns the prompt together with the decoded prediction; any failure is
    # reported to the client as HTTP 500 carrying the error text.
    try:
        # Tokenize and move the tensors to the model's device.
        encoded = tokenizer(
            input.prompt, return_tensors="pt", truncation=True, padding=True
        ).to(device)

        # Generate at most 64 new tokens and decode the first sequence.
        generated = model.generate(**encoded, max_new_tokens=64)
        result = tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    else:
        return {"prompt": input.prompt, "prediction": result}

In [None]:
from sagemaker.huggingface import HuggingFaceModel # type: ignore
import sagemaker # type: ignore
import boto3 # type: ignore

# Update this to your AWS region
region = "eu-north-1"

# Bind the SageMaker session to `region` explicitly. Previously `region` was
# defined but never used, so the session silently fell back to the default
# region of the local AWS configuration.
sagemaker_session = sagemaker.Session(
    boto_session=boto3.Session(region_name=region)
)

# IAM role assumed by SageMaker (account id redacted; fill in before running).
role = "arn:aws:iam::..........:role/Sagemaker_S3_Role"

# Model location on S3 (bucket/prefix redacted; fill in before running).
model_location = "s3://............./model.tar.gz"

# Environment variables passed to the inference container.
env = {
    'HF_TASK': 'text2text-generation'
}

# Define the model: artifact location, role, and container versions.
huggingface_model = HuggingFaceModel(
    model_data=model_location,
    role=role,
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version="py39",
    env=env,
    sagemaker_session=sagemaker_session
)

# Deploy model to a single-instance real-time endpoint with a fixed name.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    endpoint_name="huggingface-pytorch-inference-2025-04-21-16-09-01-008"
)

print("Deployed to endpoint:", predictor.endpoint_name)

In [None]:
from fastapi import FastAPI, HTTPException # type: ignore
from pydantic import BaseModel # type: ignore
import boto3 # type: ignore
import json
import os

# AWS setup: region and name of the SageMaker endpoint to call.
# ENDPOINT_NAME is a redacted placeholder; fill in the real endpoint name.
REGION = "eu-north-1"
ENDPOINT_NAME = "huggingface-pytorch-inference-..........."

# Client for the SageMaker runtime API in REGION, used to invoke the endpoint.
runtime = boto3.client("sagemaker-runtime", region_name=REGION)

# FastAPI setup
app = FastAPI(title="SageMaker LLM Inference API")

# Request schema: the JSON body carries one string field, `prompt`, which is
# forwarded to the SageMaker endpoint.
class InputText(BaseModel):
    prompt: str

@app.post("/predict/")
def predict(input: InputText):
    # Forward `input.prompt` to the SageMaker endpoint and relay its answer.
    # A list response yields the first item's 'generated_text'; any other
    # response shape is returned as-is. Failures are printed and reported to
    # the client as HTTP 500 carrying the error text.
    try:
        response = runtime.invoke_endpoint(
            EndpointName=ENDPOINT_NAME,
            ContentType="application/json",
            Body=json.dumps({"inputs": input.prompt}),
        )

        # The response body is a stream of UTF-8 encoded JSON.
        raw_body = response["Body"].read()
        result = json.loads(raw_body.decode("utf-8"))

        if isinstance(result, list):
            prediction = result[0]['generated_text']
        else:
            prediction = result

        return {"prompt": input.prompt, "prediction": prediction}

    except Exception as e:
        print("🔥 Error occurred while calling SageMaker:", e)
        raise HTTPException(status_code=500, detail=str(e))

In [None]:
# curl -X POST http://127.0.0.1:8000/predict/ \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Classify the question type: How can I find a list of celebrities’ real names?"}'