In [None]:
# Unsloth Model API Deployment

# Install required packages
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth
!pip install fastapi uvicorn pyngrok

In [None]:
# Import libraries
import threading
from typing import Optional, List, Dict, Any

import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from pyngrok import ngrok
# unsloth stays ahead of transformers, as in the original import order
from unsloth import FastLanguageModel
from transformers import TextStreamer

In [None]:
# Set up the ngrok auth token (you'll need to sign up for ngrok and get a token).
# The token is read from the NGROK_AUTH_TOKEN environment variable so that it
# does not have to be saved inside the notebook. You can skip this if ngrok
# was already configured before.
import os
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "")  # fallback: paste a token here (do not commit it)
if NGROK_AUTH_TOKEN:
    # Register the token through pyngrok instead of shelling out to the ngrok CLI
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)
else:
    # Nothing to register: avoid invoking ngrok with an empty token argument
    print("NGROK_AUTH_TOKEN is not set; relying on any previously saved ngrok configuration.")

# Define your model parameters
max_seq_length = 2048
dtype = None  # None for auto detection (Float16 for Tesla T4, Bfloat16 for Ampere+)
load_in_4bit = True

# Replace with your actual model path on Hugging Face
HF_MODEL_PATH = "Hamza-Mubashir/marketing_rafam97_finetuned"  # Replace this!


In [None]:
# Initialize FastAPI app
app = FastAPI(title="VentureForce Multi-Agent API",
              description="API for generating text using a finetuned Llama model")

# Configure CORS: any origin, method and header may call the API.
# Credentials are disabled because wildcard origins must not be combined with
# credentialed requests, and the endpoints defined in this notebook do not
# read cookies or authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Define request and response models
class GenerationRequest(BaseModel):
    """Request body accepted by the POST /generate endpoint.

    Out-of-range sampling values are rejected during request validation
    instead of surfacing later as a generation error.
    """
    # Task description placed in the "### Instruction:" slot of the prompt
    instruction: str
    # Optional extra context placed in the "### Input:" slot of the prompt
    input_text: str = ""
    # Upper bound on the number of tokens to generate; must be positive
    max_new_tokens: int = Field(2048, gt=0)
    # Sampling temperature; must not be negative
    temperature: float = Field(0.7, ge=0.0)
    # Nucleus-sampling probability mass; must lie in (0, 1]
    top_p: float = Field(0.9, gt=0.0, le=1.0)

class GenerationResponse(BaseModel):
    """Response body returned by the POST /generate endpoint."""
    # Text produced by the model for the submitted prompt
    generated_text: str

# Module-level handles for the model and its tokenizer.
# Both stay None until load_model() has populated them.
model = tokenizer = None

# Load model function
def load_model():
    """Load the fine-tuned model and tokenizer into the module-level globals.

    The model is fetched from HF_MODEL_PATH using the max_seq_length, dtype and
    load_in_4bit settings, then switched to inference mode. The globals
    ``model`` and ``tokenizer`` are only assigned once every step succeeded,
    so a failed load never leaves a half-initialised model visible to the API.

    Raises:
        Exception: any error raised while loading is printed and re-raised.
    """
    global model, tokenizer
    try:
        print("Loading model from Hugging Face...")
        loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
            model_name=HF_MODEL_PATH,
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
            # token="hf_...",  # Uncomment and add your token if using a private model
        )
        FastLanguageModel.for_inference(loaded_model)  # Enable faster inference
        # Publish both objects together, only after inference mode was enabled
        model, tokenizer = loaded_model, loaded_tokenizer
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        # Bare raise keeps the original traceback intact
        raise

# Define the prompt template - use the same as during training.
# The three positional {} slots are filled, in order, with the instruction,
# the input text, and the response (passed as "" so the model writes it).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Define API endpoints
@app.get("/")
async def root():
    """Return a short status message pointing callers at the /generate endpoint."""
    status_message = "VentureForce Multi-Agent API is running. Send POST requests to /generate endpoint."
    return {"message": status_message}

@app.post("/generate", response_model=GenerationResponse)
async def generate_text(request: GenerationRequest):
    """Generate a completion for the given instruction and input text.

    The request is rendered into the Alpaca prompt template, run through the
    loaded model, and only the newly generated tokens are decoded and returned.

    Args:
        request: Instruction, optional input text and generation settings.

    Returns:
        A dict with the single key ``generated_text``.

    Raises:
        HTTPException: 503 if the model or tokenizer has not been loaded yet,
            500 if anything fails during tokenization, generation or decoding.
    """
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet. Please try again later.")

    try:
        # Format the prompt; the response slot is left blank for generation
        formatted_prompt = alpaca_prompt.format(
            request.instruction,
            request.input_text,
            ""
        )

        # Tokenize and move the tensors to whichever device holds the model
        inputs = tokenizer([formatted_prompt], return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        # temperature/top_p only apply when sampling, so enable it explicitly;
        # a temperature of 0 (or below) falls back to greedy decoding.
        use_sampling = request.temperature > 0
        generation_kwargs = {
            "max_new_tokens": request.max_new_tokens,
            "do_sample": use_sampling,
            "use_cache": True,
        }
        if use_sampling:
            generation_kwargs["temperature"] = request.temperature
            generation_kwargs["top_p"] = request.top_p

        # NOTE(review): generate() is a blocking call inside an async handler,
        # so other requests wait while one generation is running.
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        # Decode only the tokens that follow the prompt. Slicing by length is
        # robust even when the user text itself contains "### Response:", and
        # skip_special_tokens keeps BOS/EOS markers out of the returned text.
        new_tokens = outputs[0][prompt_length:]
        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        return {"generated_text": generated_text}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}") from e

# Function to prevent Colab from disconnecting (keeps sending requests to maintain connection)
def keep_alive():
    """Periodically trigger front-end activity so the Colab session stays connected.

    Every 60 seconds a small JavaScript snippet is evaluated in the notebook
    front end, the cell output is cleared and a status line is printed.
    The loop never returns on its own, so run it in a daemon thread.

    When IPython or google.colab cannot be imported (i.e. the notebook is not
    running on Colab) the function prints a notice and returns immediately.
    """
    import time

    try:
        import IPython.display
        from google.colab import output
    except ImportError:
        print("keep_alive: google.colab is not available; keep-alive loop not started.")
        return

    while True:
        time.sleep(60)
        try:
            output.eval_js("new Audio('https://dummy.mp3').play();")
        except Exception as e:
            # A single failed ping must not terminate the keep-alive thread
            ping_error = e
        else:
            ping_error = None
        IPython.display.clear_output(wait=True)
        if ping_error is not None:
            print(f"keep_alive: ping failed: {ping_error}")
        print("Server is still running.")

In [None]:
!pip install nest_asyncio

In [None]:
import nest_asyncio
# Allow nested event loops, since the notebook kernel already runs one
nest_asyncio.apply()
# Start the server with ngrok
print("Loading model and starting server...")
load_model()

# Test model with a sample generation before starting the server
print("\nTesting model with a sample prompt:")
test_instruction = "Venture Force - A Multi Agent Framework for Early Age Startups"
test_input = """Large Language Model (LLM) dialogue agents have unveiled unforeseen limitations in specific domains
due to their generalized training data with typical problems i.e poor contextual parsing, lack of domain
knowledge, factual inaccuracies, ethical dilemmas, bias propagation, and hallucinations."""

formatted_prompt = alpaca_prompt.format(test_instruction, test_input, "")
# Move the inputs to whichever device holds the model
inputs = tokenizer([formatted_prompt], return_tensors="pt").to(model.device)
# Streams the decoded tokens to stdout while generate() is running
text_streamer = TextStreamer(tokenizer)
print("Generated sample:")
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=100,
    do_sample=True,  # temperature/top_p only take effect when sampling is enabled
    temperature=0.7,
    top_p=0.9,
    use_cache=True
)

# Start ngrok; the tunnel and the uvicorn server must use the same port
PORT = 8000
ngrok_tunnel = ngrok.connect(PORT)
print(f"\nPublic URL: {ngrok_tunnel.public_url}")
print(f"API Endpoint: {ngrok_tunnel.public_url}/generate")
print("\nExample curl command:")
print(f'''curl -X 'POST' \\
  '{ngrok_tunnel.public_url}/generate' \\
  -H 'Content-Type: application/json' \\
  -d '{{
    "instruction": "Venture Force - A Multi Agent Framework for Early Age Startups",
    "input_text": "I want to create a startup that helps small businesses with AI-powered customer service",
    "max_new_tokens": 1024
  }}'
''')

# Keep the server alive in a separate thread (daemon, so it never blocks shutdown)
import threading
threading.Thread(target=keep_alive, daemon=True).start()

# Start uvicorn server (this will block until the server is stopped)
try:
    uvicorn.run(app, host="0.0.0.0", port=PORT)
finally:
    # Shut ngrok down once the server stops so that re-running this cell
    # does not leave a stale tunnel/agent behind.
    ngrok.kill()