## Installing Unsloth and Flash Attention

In [1]:
# installing unsloth

%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

# Install Flash Attention 2 for softcapping support
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

## Import Necessary Library

In [3]:
from unsloth import FastLanguageModel
import torch

# config
max_seq_length = 2048
dtype = None
load_in_4bit = True

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


## Load Fine Tuned Large Language Gemma Model from HuggingFace Hub

In [4]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "manojbaniya/roman-nepali-gemma-1500step",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "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", # for accessing gated gemma model from huggingface hub
)

==((====))==  Unsloth 2025.2.15: Fast Gemma2 patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/216M [00:00<?, ?B/s]

Unsloth 2025.2.15 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


## Inference Mode

In [5]:
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584, padding_idx=0)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

## Prompt Template

In [6]:
prompt_template_rag = """<start_of_turn>system
You are an AI assistant who responds to user instructions. Use the context information to answer if it is given, and respond in Roman Nepali.<end_of_turn>
<start_of_turn>user
{question}
Context: {context}<end_of_turn>
<start_of_turn>model
"""
prompt_template_instruction = """<start_of_turn>system
You are helpful AI Assistant follow the user instruction and respond in Roman Nepali language.<end_of_turn>
<start_of_turn>user
{question}<end_of_turn>
<start_of_turn>model
"""

## Response Generation

In [8]:
def generate_response(question, type="instruction", context=None):
  if type == "instruction":
    prompt_template = prompt_template_instruction
  elif type == "RAG":
    prompt_template = prompt_template_rag
  inputs = prompt_template.format(question=question, context=context)

  inputs = tokenizer([inputs], return_tensors="pt").to("cuda")
  outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True, top_k=10, do_sample=True, temperature=0.8)
  response = tokenizer.batch_decode(outputs)
  return response[0]

## Test

In [9]:
response = generate_response(
    question="Hi",
    type="instruction",
)
print(response)

<bos><start_of_turn>system
You are helpful AI Assistant follow the user instruction and respond in Roman Nepali language.<end_of_turn>
<start_of_turn>user
Hi<end_of_turn>
<start_of_turn>model
Namaste! Ma tapai ko sahayog ko lagi yaha chu. Aba ke madat garna sakchhu?<eos>


## Stream Token Generation (Streaming)

In [11]:
prompt_template_rag = """<start_of_turn>system
You are an AI assistant who responds to user instructions. Use the context information to answer if it is given, and respond in Roman Nepali.<end_of_turn>
<start_of_turn>user
{question}
Context: {context}<end_of_turn>
<start_of_turn>model
"""
prompt_template_instruction = """<start_of_turn>system
You are helpful AI Assistant follow the user instruction and respond in Roman Nepali language.<end_of_turn>
<start_of_turn>user
{question}<end_of_turn>
<start_of_turn>model
"""

In [16]:
def get_inputs(question, instruct_type="RAG", context=None):
  if instruct_type == "instruction":
    prompt = prompt_template_instruction
  elif instruct_type == "RAG":
    prompt = prompt_template_rag
  inputs = prompt.format(question=question, context=context, response=response)
  inputs = tokenizer(inputs, return_tensors="pt").to("cuda")
  return inputs

In [17]:
get_inputs("Hi", instruct_type="instruction")["input_ids"]

tensor([[     2,    106,   9020,    108,   2045,    708,  10055,  16481,  18145,
           1611,    573,   2425,  14239,    578,   8702,    575,   9783, 205184,
           5255, 235265,    107,    108,    106,   1645,    108,   2151,    107,
            108,    106,   2516,    108]], device='cuda:0')

In [33]:
def generate_tokens(question: str, context: str, instruct_type="RAG", max_new_tokens=100):
    inputs = get_inputs(question, instruct_type, context)
    is_done = False
    tokens = []

    while not is_done:
        with torch.no_grad():
            outputs = model(inputs["input_ids"])
            logits = outputs.logits[:, -1, :]
            preds = logits.softmax(dim=1)
            label = preds.argmax(dim=-1)

            # Decode and yield the token
            decoded_token = tokenizer.decode(label)
            print(decoded_token)
            # yield decoded_token  # Yield the decoded token one by one
            # Append token to the list
            tokens.append(label.item())

            # Update the input_ids for the next iteration
            inputs["input_ids"] = torch.cat([inputs["input_ids"], label.unsqueeze(-1)], dim=-1)

            # Check if generation is done
            if label == tokenizer.eos_token_id or len(tokens) >= max_new_tokens:
                is_done = True

In [34]:
generate_tokens("Hi", "instruction")

Namaste
!
 Aba
 ta
pai
 ko
 lagi
 ke
 mad
at
 gar
na
 sak
ch
hu
?
<eos>


In [63]:
def generate_tokens_sample(question: str, context: str, instruct_type="RAG", max_new_tokens=100, top_k=10):
    inputs = get_inputs(question, instruct_type, context)
    is_done = False
    tokens = []

    while not is_done:
        with torch.no_grad():
            outputs = model(inputs["input_ids"])
            logits = outputs.logits[:, -1, :]
            preds = logits.softmax(dim=-1)

            # Apply Top-k sampling
            top_k_probs, top_k_indices = torch.topk(preds, top_k, dim=-1)  # Get top 10 probabilities & indices
            sampled_index = torch.multinomial(top_k_probs, 1)  # Sample from top 10
            label = top_k_indices.gather(-1, sampled_index)  # Get actual token index

            # Decode and yield the token
            decoded_token = tokenizer.decode(label.item()) # Get the integer value before decoding
            # print(decoded_token)
            yield decoded_token  # Yield the decoded token one by one

            # Append token to the list
            tokens.append(label.item())

            # Update the input_ids for the next iteration
            # Ensure label has the correct dimensions before concatenating
            inputs["input_ids"] = torch.cat([inputs["input_ids"], label], dim=-1)

            # Check if generation is done
            if label.item() == tokenizer.eos_token_id or len(tokens) >= max_new_tokens:
                is_done = True

In [38]:
generate_tokens_sample("Hi", "instruction")

Namaste
!
 Ma
 ta
pai
 ko
 personal
 assistant
 hu
.
 Aba
 ta
pai
 ko
 sa
hay
og
 ma
 kas
ari
 mad
at
 gar
na
 sak
ch
hu
?
<eos>


In [39]:
generate_tokens_sample("Hi", "instruction")

Namaste
!
 Ma
 ta
pai
 ko
 e
-
commerce
 assistant
 ho
.
 Aba
 ta
pai
 ko
 lagi
 ke
 mad
ad
 gar
na
 sak
ch
hu
?
<eos>


## Serving

In [42]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [None]:
from pyngrok import ngrok

ngrok.set_auth_token("2riLb0UtZa6Vf4YtDV7fdJExpKu_5njCcgcsSrLJnKfw8gXP6")

# Start the Flask server
public_url = ngrok.connect(6000)
print("Public URL:", public_url)

## Implement RAG

In [None]:
!pip install flask_cors

In [55]:
ecommerce_context = "Product Details: name: Redmi note 9 pro price: 15000 RAM: 8GB Stock: True Storage: 64GB name: Samsung Galaxy price: 120000 RAM: 16GB storage: 64GB"

In [73]:
from flask import Flask, request, jsonify, Response
from flask_cors import CORS

app = Flask(__name__)
CORS(app)

@app.route('/')
def home():
    return "Hello, this is your server running on Google Colab!"

@app.route('/chat', methods=['POST'])
def chat():
    # Get the user's query from the request
    user_query = request.json.get('question', '')

    if not user_query:
        # Return a single JSON response if query is missing
        return jsonify({"response": "Please provide a valid query."})

    def generate_response():
        # Call the generator function and yield tokens as they are generated
        try:
          for token in generate_tokens_sample(user_query, context=ecommerce_context, instruct_type="RAG"):
            # print(f"Sent: {token}")
            yield f"data: {token}\n\n"  # Format for server-sent events (SSE)
        except Exception as e:
          print(f"Error: {e}")
          return jsonify({"response": f"Error: {e}"})

    # Return a streaming response with the correct content type for SSE
    return Response(generate_response(), content_type='text/event-stream')

In [None]:
app.run(host='localhost', port=6000)