## Installing Unsloth

In [1]:
# installing unsloth

%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

# Install Flash Attention 2 for softcapping support
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

## Downloading Model

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "manojbaniya/best_v5_2",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.1.8: Fast Gemma2 patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

Unsloth 2025.1.8 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


In [4]:
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

## Prompt Template

In [5]:
prompt_template = """
Below is an instruction that describes a task paired with an input that provides further context. Write a response that appriately complete the request.

### Instruction
{question}

### Input
{context}

### Response
"""

## Function to generate Response

In [11]:
def generate_response(question: str, context="", max_new_tokens=100, stream=True):
  # The prompt template is used to create the input for the model
  inputs = prompt_template.format(question=question, context=context)
  # Now, we tokenize the formatted input using the tokenizer from your model
  inputs = tokenizer(inputs, return_tensors="pt").to("cuda")
  is_done = False
  tokens = []
  if stream:
    while not is_done:
      with torch.no_grad():
        outputs = model(inputs["input_ids"])
        logits = outputs.logits[:, -1, :]
        preds = logits.softmax(dim=-1)
        label = preds.argmax(dim=-1)

        decoded_token = tokenizer.decode(label)
        yield decoded_token
        tokens.append(label.item())

        inputs["input_ids"] = torch.cat([inputs["input_ids"], label.unsqueeze(-1)], dim=-1)

        if label == tokenizer.eos_token_id or len(tokens) >= max_new_tokens:
          is_done = True

In [10]:
response_stream = generate_response("Nepal ko capital city kaha ho?")

for token in response_stream:
  print(response_stream)

<generator object generate_response at 0x79b35d3be980>
Nepal
<generator object generate_response at 0x79b35d3be980>
 ko
<generator object generate_response at 0x79b35d3be980>
 capital
<generator object generate_response at 0x79b35d3be980>
 city
<generator object generate_response at 0x79b35d3be980>
 Kathmandu
<generator object generate_response at 0x79b35d3be980>
 ho
<generator object generate_response at 0x79b35d3be980>
.
<generator object generate_response at 0x79b35d3be980>



<generator object generate_response at 0x79b35d3be980>
<eos>


## Making server

In [12]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [13]:
from pyngrok import ngrok

In [14]:
ngrok.set_auth_token("2riLb0UtZa6Vf4YtDV7fdJExpKu_5njCcgcsSrLJnKfw8gXP6")



In [15]:
from pyngrok import ngrok

# Start the Flask server
public_url = ngrok.connect(5000)
print("Public URL:", public_url)

Public URL: NgrokTunnel: "https://ea99-34-82-114-22.ngrok-free.app" -> "http://localhost:5000"


In [16]:
from flask import Flask, request, jsonify, Response

app = Flask(__name__)

@app.route('/')
def home():
    return "Hello, this is your server running on Google Colab!"

@app.route('/chat', methods=['POST'])
def chat():
    # Get the user's query from the request
    question = request.json.get('question', '')
    context = request.json.get("context", "")
    stream = request.json.get("stream", True)
    max_new_tokens = 100


    if not question:
        # Return a single JSON response if query is missing
        return jsonify({"response": "Please provide a valid query."})

    def generate_response_streaming():
        # Call the generator function and yield tokens as they are generated
        for token in generate_response(question, context, max_new_tokens, stream):
            yield f"data: {token}\n\n"  # Format for server-sent events (SSE)

    # Return a streaming response with the correct content type for SSE
    return Response(generate_response_streaming(), content_type='text/event-stream')

if __name__ == '__main__':
    app.run(host='localhost', port=5000)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://localhost:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [01/Feb/2025 08:04:28] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [01/Feb/2025 08:04:29] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [01/Feb/2025 08:04:49] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [01/Feb/2025 08:05:39] "POST /chat HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [01/Feb/2025 08:06:24] "POST /chat HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [01/Feb/2025 08:06:55] "POST /chat HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [01/Feb/2025 08:07:13] "POST /chat HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [01/Feb/2025 08:07:27] "POST /chat HTTP/1.1" 200 -
