# Serve Ollama Model via API with Localtunnel

This notebook guides you through serving a chosen Ollama model and exposing its API publicly using localtunnel.

**Steps:**
1. Install necessary packages.
2. Check and manage your Ollama installation and models.
3. Select or pull the Ollama model you want to serve.
4. Create a FastAPI server to provide an API for the model.
5. Test the local API.
6. Expose the local API to the internet using localtunnel.
7. Learn how to use and manage the exposed API.

## 1. Install Dependencies

We need `ollama` (the Python client), `fastapi` for the web server, `uvicorn` to run FastAPI, `pydantic` for data validation, `nest_asyncio` to allow `uvicorn` to run in a Jupyter environment, and `localtunnel` (via npm) to expose the server.

In [None]:
# Install Python packages
print("Installing Python packages: ollama, fastapi, uvicorn, pydantic, nest_asyncio")
!pip install ollama fastapi uvicorn pydantic nest_asyncio

# Install localtunnel globally using npm
print("Installing localtunnel globally using npm...")
!npm install -g localtunnel

print("Dependencies installed.")

# nest_asyncio is used to allow uvicorn to run in a Jupyter notebook environment
import nest_asyncio
nest_asyncio.apply()

## 2. Check Ollama Installation and Running Status

This step ensures Ollama is installed and the Ollama server is running on your system. If not, it will attempt to start the server.

In [None]:
import subprocess
import time
import os
import sys

def check_ollama():
    """Verify that the Ollama CLI is installed and that its server responds.

    The CLI is probed with ``ollama --version`` and the server with
    ``ollama list``. If the server does not answer, ``ollama serve`` is
    launched in the background and the server is polled until it responds
    or the startup timeout elapses.

    Raises:
        SystemExit: if the CLI cannot be run, or the server cannot be
            started/verified.
    """
    def _probe_server():
        # Return None when `ollama list` succeeds, otherwise the exception raised.
        # TimeoutExpired and CalledProcessError are both SubprocessError subclasses.
        try:
            subprocess.check_output(["ollama", "list"], text=True, timeout=10)
        except (subprocess.SubprocessError, FileNotFoundError) as probe_err:
            return probe_err
        return None

    print("Checking Ollama installation...")
    try:
        version_output = subprocess.check_output(["ollama", "--version"], text=True, stderr=subprocess.STDOUT)
    except (subprocess.SubprocessError, FileNotFoundError) as e:
        print(f"Ollama not found or error during version check: {e}")
        print("Please install Ollama from https://ollama.com/download and ensure it's in your PATH.")
        sys.exit("Ollama installation check failed.")
    print(f"Ollama is installed: {version_output.strip()}")

    print("\nChecking Ollama server status...")
    failure = _probe_server()
    if failure is None:
        print("Ollama server is running and responsive.")
        return

    print(f"Ollama server not responding or not found: {failure}")
    print("Attempting to start Ollama server...")
    try:
        if sys.platform == "win32":
            subprocess.Popen(["ollama", "serve"], creationflags=subprocess.CREATE_NEW_CONSOLE)
        else:
            # Start Ollama server in the background, detaching it
            subprocess.Popen(["ollama", "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
    except (subprocess.SubprocessError, FileNotFoundError) as start_err:
        failure = start_err
    else:
        # Poll rather than sleeping a fixed time: continue as soon as the
        # server answers, and give up once the deadline has passed.
        startup_timeout = 15
        print(f"Waiting for Ollama server to start (up to {startup_timeout} seconds)...")
        deadline = time.monotonic() + startup_timeout
        while failure is not None and time.monotonic() < deadline:
            time.sleep(1)
            failure = _probe_server()
        if failure is None:
            print("Ollama server started and is responsive.")
            return

    print(f"Failed to start or verify Ollama server: {failure}")
    print("Please ensure Ollama is installed correctly and can be run from the command line.")
    print("You might need to start 'ollama serve' manually in a separate terminal if issues persist.")
    sys.exit("Ollama server startup failed.")

# Run the installation/server check as soon as this cell executes.
check_ollama()

## 3. List Available Models and Select/Pull Model

Let's see which models you have locally. If the model you want isn't listed, you can pull it.

In [None]:
import ollama

def list_local_models():
    """Print the locally available Ollama models and return their names.

    Returns:
        list: the model names reported by ``ollama.list()``; an empty list
        when no models are installed or when the listing fails (the error
        is printed, not raised).
    """
    print("\nListing available local models...")
    try:
        models = ollama.list()['models']
        if not models:
            print("  No models found locally.")
            return []
        print("Available models:")
        names = []
        for position, entry in enumerate(models, start=1):
            # Older ollama clients key the identifier as 'name'; newer typed
            # responses expose it as 'model'. Accept either.
            name = entry.get('name') or entry.get('model')
            if not name:
                continue
            size_gb = (entry.get('size') or 0) / (1024**3)
            print(f"  {position}. {name} (Size: {size_gb:.2f} GB)")
            names.append(name)
        return names
    except Exception as e:
        # Notebook boundary: report the problem and let the user continue.
        print(f"Error listing models: {e}")
        print("Make sure the Ollama server is running (see previous step).")
        return []

# Names of the models available locally; read by the model selection cell.
local_model_names = list_local_models()

In [None]:
# Model Selection
# Asks for a model name, pulls it when it is not available locally, and
# publishes the choice as SELECTED_MODEL for the cells that follow.
print("\n--- Model Selection ---")
if local_model_names:
    print("You can choose from your local models or pull a new one.")
else:
    print("No local models found. You'll need to pull a model.")

model_to_serve = input("Enter the name of the Ollama model you want to serve (e.g., 'llama3:8b', 'mistral', 'codellama'): ").strip()

if not model_to_serve:
    print("No model name entered. Exiting.")
    sys.exit("Model name required.")

# Ollama resolves a name without a tag to '<name>:latest', so 'mistral' must
# match a local entry listed as 'mistral:latest'.
accepted_names = {model_to_serve}
if ":" not in model_to_serve:
    accepted_names.add(f"{model_to_serve}:latest")
model_is_local = any(name in accepted_names for name in local_model_names)

if not model_is_local:
    print(f"Model '{model_to_serve}' not found locally.")
    pull_choice = input(f"Do you want to pull '{model_to_serve}'? This can take some time and disk space. (yes/no): ").strip().lower()
    if pull_choice == 'yes':
        print(f"Pulling '{model_to_serve}'... Please be patient.")
        try:
            # Stream the pull progress
            current_digest = ""
            for progress in ollama.pull(model_to_serve, stream=True):
                status = progress.get('status')
                # The digest may be absent or None on status-only updates.
                digest = progress.get("digest") or ""
                if digest and digest != current_digest:
                    current_digest = digest
                    print(f"Pulling {digest} - {status}")
                total = progress.get("total")
                completed = progress.get("completed")
                # Skip the percentage while the size is unknown (missing, None or 0)
                # so a progress update can never abort the pull with a division error.
                if total and completed is not None:
                    percentage = (completed / total) * 100
                    print(f"Status: {status} - {percentage:.2f}% completed", end='\r')
            print("\nPull completed.")
            print(f"Successfully pulled '{model_to_serve}'.")
            local_model_names.append(model_to_serve) # Add to list for consistency
        except Exception as e:
            print(f"\nError pulling model '{model_to_serve}': {e}")
            print("Please check the model name (e.g., 'llama2', 'mistral:latest') and your internet connection.")
            sys.exit("Model pull failed.")
    else:
        print("Model not pulled. Exiting.")
        sys.exit("No model selected to serve.")
else:
    print(f"Using local model: '{model_to_serve}'")

SELECTED_MODEL = model_to_serve
print(f"Will attempt to serve the model: {SELECTED_MODEL}")

## 4. Create FastAPI Server

We'll create a simple API server using FastAPI to interact with the selected Ollama model.

In [None]:
from fastapi import FastAPI, HTTPException, Body
from pydantic import BaseModel, Field
import uvicorn
import threading
import ollama # Ensure ollama is imported here as well for the API functions
from typing import Optional, List, Dict, Any, Union
import asyncio # For uvicorn shutdown

# Define request/response models for FastAPI
class GenerateRequest(BaseModel):
    """Request body accepted by the POST /api/generate endpoint.

    Only ``prompt`` is required. The handler builds its ``ollama.generate``
    call from the fields that are not None, and falls back to the
    notebook-level SELECTED_MODEL when ``model`` is omitted.
    """
    prompt: str
    model: Optional[str] = None # If None, uses SELECTED_MODEL from notebook scope
    images: Optional[List[str]] = Field(default=None, description="A list of base64-encoded images for multimodal models")
    format: Optional[str] = Field(default=None, description="The format to return a response in. Currently the only accepted value is 'json'")
    options: Optional[Dict[str, Any]] = Field(default=None, description="Additional model parameters listed in the documentation for the Modelfile such as temperature")
    system: Optional[str] = Field(default=None, description="System message to (overrides what is defined in the Modelfile)")
    template: Optional[str] = Field(default=None, description="The full prompt or prompt template (overrides what is defined in the Modelfile)")
    context: Optional[List[int]] = Field(default=None, description="The context parameter returned from a previous request to /generate, used to keep a short conversational memory")
    stream: Optional[bool] = Field(default=False, description="If false the response will be returned as a single response object, rather than a stream of objects")
    keep_alive: Optional[Union[str, float, int]] = Field(default=None, description="Controls how long the model will stay loaded into memory following the request (default: 5m)")

class GenerateResponse(BaseModel):
    """Response schema of POST /api/generate (used as the route's response_model).

    The handler returns the object produced by ``ollama.generate``; FastAPI
    validates it against these fields. All statistics fields are optional.
    NOTE(review): the *_duration values are presumably nanoseconds as in the
    Ollama API; confirm against the Ollama documentation.
    """
    model: str
    created_at: str
    response: str
    done: bool
    context: Optional[List[int]] = None
    total_duration: Optional[int] = None
    load_duration: Optional[int] = None
    prompt_eval_count: Optional[int] = None
    prompt_eval_duration: Optional[int] = None
    eval_count: Optional[int] = None
    eval_duration: Optional[int] = None

# FastAPI application object; the route decorators below register handlers on it.
# The description is formatted once, with the SELECTED_MODEL value at creation time.
app = FastAPI(title="Ollama Model Server", description=f"Serving Ollama model: {SELECTED_MODEL}")

@app.get("/")
def read_root():
    """Landing route: name the served model and point to the generation endpoint."""
    greeting = f"Ollama API server for model '{SELECTED_MODEL}' is running. Use the /api/generate endpoint."
    return dict(message=greeting)

@app.post("/api/generate", response_model=GenerateResponse)
async def generate_text(request: GenerateRequest):
    """Generate a completion with Ollama and return it as one response object.

    Args:
        request: validated request body; fields left as None are not
            forwarded. ``request.model`` overrides SELECTED_MODEL when set.

    Returns:
        The object returned by ``ollama.generate`` (validated by FastAPI
        against ``GenerateResponse``).

    Raises:
        HTTPException: status 500 carrying the error text when anything
            fails while building or running the generation.
    """
    # Local import keeps this handler self-contained; fastapi is already a dependency.
    from fastapi.concurrency import run_in_threadpool

    try:
        model_name_to_use = request.model if request.model else SELECTED_MODEL

        # pydantic v2 renamed .dict() to .model_dump(); use whichever is available.
        dump = getattr(request, "model_dump", None) or request.dict
        payload = dump(exclude_none=True) # Create payload from request, excluding unset fields
        payload['model'] = model_name_to_use # Ensure model is in payload

        if payload.get('stream', False):
            # With stream=True the ollama client returns an iterator of partial
            # chunks, which cannot be returned through this single-object
            # response model, so the request is served without streaming.
            print("Warning: 'stream: True' requested, but this endpoint returns a single response object; streaming is disabled for this request.")
        payload['stream'] = False

        print(f"Received request for model '{model_name_to_use}' with prompt: '{request.prompt[:50]}...'")
        # ollama.generate blocks until the model has finished; run it in a worker
        # thread so the event loop can keep serving other requests meanwhile.
        response_data = await run_in_threadpool(ollama.generate, **payload)
        print(f"Generated response. Done: {response_data.get('done')}")
        return response_data
    except Exception as e:
        print(f"Error during generation: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/api/active-model")
def get_active_model():
    """Report the name of the model this server uses by default, with a fixed 'ready' status."""
    return dict(active_model=SELECTED_MODEL, status="ready")

# --- Server Start/Stop --- (Global state for server management)
PORT = 8008  # Local TCP port the FastAPI server listens on
uvicorn_server_instance = None  # uvicorn.Server object, set by start_fastapi_server()
server_thread = None  # Background thread running the uvicorn server, set by start_fastapi_server()

def start_fastapi_server():
    """Start the FastAPI app with uvicorn in a background daemon thread.

    Does nothing when a server thread is already alive. Otherwise creates a
    ``uvicorn.Server`` bound to 127.0.0.1:PORT, runs it in a daemon thread and
    waits until uvicorn reports that startup has completed, the thread dies,
    or a timeout elapses. The outcome is printed; nothing is returned.

    Side effects:
        Rebinds the module-level ``uvicorn_server_instance`` and
        ``server_thread``.
    """
    global uvicorn_server_instance, server_thread
    if server_thread is not None and server_thread.is_alive():
        print(f"FastAPI server is already running or starting on port {PORT}.")
        return

    config = uvicorn.Config(app, host="127.0.0.1", port=PORT, log_level="info")
    uvicorn_server_instance = uvicorn.Server(config)

    server_thread = threading.Thread(target=uvicorn_server_instance.run, daemon=True)
    server_thread.start()
    print(f"FastAPI server starting on http://127.0.0.1:{PORT}")

    def _startup_done():
        # uvicorn sets Server.started to True once it is accepting connections.
        return getattr(uvicorn_server_instance, "started", False)

    # A live thread alone does not prove the socket is bound, so wait for the
    # startup flag; stop early when the thread dies (e.g. port already in use).
    deadline = time.monotonic() + 10
    while server_thread.is_alive() and not _startup_done() and time.monotonic() < deadline:
        time.sleep(0.1)

    if not server_thread.is_alive():
        print("FastAPI server failed to start. Check console for errors.")
        uvicorn_server_instance = None # Clear instance if failed
    elif _startup_done():
        print(f"FastAPI server started successfully for model '{SELECTED_MODEL}'. Access it locally.")
    else:
        print("FastAPI server thread is running but startup has not completed yet. Check console for errors.")

# Automatically start the server when this cell is run.
# Re-running the cell is safe: the function returns early while a server thread is alive.
start_fastapi_server()

## 5. Test Local API Server

Before exposing it, let's test if the local API server is working.

In [None]:
import requests
import json

def test_local_api():
    """Smoke-test the locally running API and print the results.

    Calls ``GET /`` and ``GET /api/active-model``, then posts a short prompt
    to ``POST /api/generate``. Stops at the first failing GET. Errors are
    printed rather than raised; nothing is returned.
    """
    print("\n--- Testing Local API Server ---")
    if not (server_thread and server_thread.is_alive()):
        print("FastAPI server is not running. Please run the server cell (Cell 10) first.")
        return

    base_url = f"http://127.0.0.1:{PORT}"

    # Test the two lightweight GET endpoints first
    for path in ("/", "/api/active-model"):
        try:
            reply = requests.get(f"{base_url}{path}", timeout=10)
            reply.raise_for_status()
            print(f"GET {path}: {reply.json()}")
        except requests.RequestException as e:
            print(f"Error testing GET {path}: {e}")
            return

    # Test /api/generate endpoint
    api_generate_url = f"{base_url}/api/generate"
    test_payload = {
        "prompt": "Why is the sky blue? Explain briefly.",
        # "model": SELECTED_MODEL, # Not needed as API defaults to SELECTED_MODEL
        "options": {"temperature": 0.7}
    }
    try:
        print(f"POST /api/generate with prompt: '{test_payload['prompt']}'")
        response_generate = requests.post(api_generate_url, json=test_payload, timeout=120) # Longer timeout for generation
        response_generate.raise_for_status()

        response_data = response_generate.json()
        # A missing or null 'response' must not break the preview slice below.
        generated_text = response_data.get('response') or ""
        print("Local API /generate Test Response:")
        print(f"  Model: {response_data.get('model')}")
        print(f"  Response: {generated_text[:300]}...")
        if not generated_text:
             print("  Warning: Empty response from model. The model might be very slow or there could be an issue.")
    except json.JSONDecodeError:
        # Handled before RequestException: recent requests versions raise a decode
        # error that derives from both classes, which would otherwise hide this branch.
        print(f"Error decoding JSON from API response. Status: {response_generate.status_code}, Response text: {response_generate.text}")
    except requests.exceptions.RequestException as e:
        print(f"Error testing POST /api/generate: {e}")
        if hasattr(e, 'response') and e.response is not None:
            print(f"Response content: {e.response.text}")

# Run the test; results and errors are printed by the function itself.
test_local_api()

## 6. Expose API with Localtunnel

Now, let's make your local API server accessible from the internet using localtunnel. This will output a public URL.

In [None]:
import subprocess
import threading
import time
import re

# Mutable holders shared between this cell, the monitor thread and stop_services():
public_url_store = {"url": None} # Public URL parsed from localtunnel's output (None until captured)
lt_process_store = {"process": None} # subprocess.Popen object of the localtunnel process (None when not started)

def monitor_localtunnel_output(proc, url_store):
    """Echo localtunnel's stdout and capture the public URL it announces.

    Reads ``proc.stdout`` line by line until end of stream, printing each
    line. When a line contains ``your url is: <url>``, the URL is saved in
    ``url_store["url"]`` and usage hints are printed. The stream is closed
    once it is exhausted. Any exception is reported with a printed message.
    """
    announcement = re.compile(r'your url is: (https?://[^\s]+)')
    try:
        for raw_line in iter(proc.stdout.readline, ''):
            text = raw_line.strip()
            print(f"Localtunnel: {text}")
            found = announcement.search(text)
            if found is None:
                continue
            url_store["url"] = found.group(1)
            print(f"\n🎉 Public API URL: {url_store['url']}")
            print(f"You can now access your Ollama model '{SELECTED_MODEL}' API at this URL.")
            print(f"Example: POST to {url_store['url']}/api/generate")
            # Keep reading after the URL so later status/error lines are still shown
        proc.stdout.close()
    except Exception as e:
        print(f"Error reading localtunnel stdout: {e}")

def start_localtunnel(port_to_expose, url_store, process_store):
    """Launch localtunnel (``lt``) for a local port and wait for its public URL.

    Args:
        port_to_expose: local TCP port to expose.
        url_store: dict whose ``"url"`` entry receives the public URL
            (written by the stdout monitor thread).
        process_store: dict whose ``"process"`` entry holds the ``Popen``
            object of the running tunnel.

    Returns early when the stored process is still running. On failure to
    launch, or when ``lt`` exits before announcing a URL, both store entries
    are set to None. Errors are printed rather than raised.
    """
    import shutil  # local import: only needed to resolve the `lt` executable

    running = process_store["process"]
    if running and running.poll() is None:
        print("Localtunnel is already running or starting.")
        if url_store["url"]:
            print(f"Current Public URL: {url_store['url']}")
        else:
            print("Localtunnel is running, but URL not yet captured. Check output.")
        return

    print(f"\n--- Starting Localtunnel for port {port_to_expose} ---")
    # A URL left over from an earlier, now-dead tunnel must not be reported as current.
    url_store["url"] = None
    try:
        # Resolve through PATH (and PATHEXT on Windows, where npm installs `lt.cmd`).
        lt_executable = shutil.which("lt")
        if lt_executable is None:
            raise FileNotFoundError("lt")
        command = [lt_executable, "--port", str(port_to_expose)]
        # For subdomains, you can try: command.extend(["--subdomain", "my-ollama-server"])
        # However, custom subdomains might be rate-limited or require payment on public localtunnel instances.

        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)
        process_store["process"] = proc

        # Thread to monitor stdout for the URL without blocking
        stdout_thread = threading.Thread(target=monitor_localtunnel_output, args=(proc, url_store), daemon=True)
        stdout_thread.start()

        # Thread to monitor stderr for errors
        def monitor_stderr(tunnel_proc):
            for line in iter(tunnel_proc.stderr.readline, ''):
                print(f"Localtunnel ERR: {line.strip()}", file=sys.stderr)
            tunnel_proc.stderr.close()
        stderr_thread = threading.Thread(target=monitor_stderr, args=(proc,), daemon=True)
        stderr_thread.start()

        print("Localtunnel process started. Waiting for URL...")
        # Poll instead of sleeping blindly: stop as soon as the URL is captured
        # or the process has exited, and give up after 10 seconds.
        deadline = time.monotonic() + 10
        while not url_store["url"] and proc.poll() is None and time.monotonic() < deadline:
            time.sleep(0.25)

        if url_store["url"]:
            return
        if proc.poll() is not None:
            print(f"Localtunnel exited early with code {proc.returncode}. Check its output above.")
            process_store["process"] = None
            return
        print("Localtunnel URL not detected after 10s. Check its output above.")
        print("It might be slow, or there could be an issue with localtunnel service (e.g., rate limits, server down).")

    except FileNotFoundError:
        print("Error: 'lt' command not found. Make sure localtunnel is installed globally (npm install -g localtunnel) and in your PATH.")
        url_store["url"] = None
        process_store["process"] = None
    except Exception as e:
        print(f"An unexpected error occurred while starting localtunnel: {e}")
        url_store["url"] = None
        process_store["process"] = None

# Open the tunnel only when a live FastAPI server is there to receive the traffic
fastapi_is_up = bool(server_thread) and server_thread.is_alive()
if not fastapi_is_up:
    print("FastAPI server is not running. Please run the server cell (Cell 10) first to expose via localtunnel.")
else:
    start_localtunnel(PORT, public_url_store, lt_process_store)

# Summarise the outcome: URL known, tunnel still starting, or tunnel not started
if not public_url_store["url"]:
    if lt_process_store["process"]:
        print("\nLocaltunnel is attempting to start/running. Check its output above for the URL or any errors.")
    else:
        print("\nLocaltunnel did not start. Check for errors in the cell output.")
else:
    print(f"\nYour API should be accessible at: {public_url_store['url']}")
    print(f"Test with: curl -X POST {public_url_store['url']}/api/generate -H \"Content-Type: application/json\" -d '{{\"prompt\":\"Hi there!\", \"model\":\"{SELECTED_MODEL}\"}}'")


## 7. Using Your Exposed API

If localtunnel started successfully, your API is live at the URL it provided.

**Endpoint:** `POST {public_url}/api/generate`

**Request Body (JSON):**
See the `GenerateRequest` model in the FastAPI server cell (Section 4) for all supported parameters. A minimal request:
```json
{
  "prompt": "Your prompt here",
  "model": "optional_model_name_override" 
}
```

**Example using cURL:**
(Replace `YOUR_PUBLIC_URL` with the actual URL from the localtunnel output and `YOUR_MODEL_NAME` with the model to use; omit `"model"` to use the model selected in this notebook.)
```bash
curl -X POST YOUR_PUBLIC_URL/api/generate \
     -H "Content-Type: application/json" \
     -d '{"prompt": "Tell me a fun fact about Large Language Models.", "model": "YOUR_MODEL_NAME"}'
```

## 8. Stopping the Server and Tunnel

When you're done, you should stop the localtunnel and the FastAPI server.
- **Localtunnel**: Can be stopped by running the `stop_services()` function below or by interrupting/restarting the kernel.
- **FastAPI Server**: `stop_services()` also asks the Uvicorn server to shut down. Because the server runs in a daemon thread, it will in any case stop when the Jupyter kernel is shut down or restarted.

In [None]:
import signal

def stop_services():
    """Stop the localtunnel process and ask the uvicorn server to shut down.

    Localtunnel is terminated, and killed if it does not exit within five
    seconds. The tunnel's process and URL entries are cleared in every case.
    The uvicorn server is asked to exit through its ``should_exit`` flag and
    its thread is joined for up to five seconds. The module-level server
    handles are cleared only once the thread has really stopped, so a server
    that is still running stays reachable for another attempt.
    Progress is printed; nothing is returned.
    """
    global uvicorn_server_instance, server_thread

    print("\n--- Stopping Services ---")

    # Stop localtunnel
    lt_proc = lt_process_store.get("process")
    if lt_proc and lt_proc.poll() is None: # Check if process exists and is running
        print("Terminating localtunnel process...")
        lt_proc.terminate() # Try to terminate gracefully
        try:
            lt_proc.wait(timeout=5) # Wait for it to terminate
            print("Localtunnel process terminated.")
        except subprocess.TimeoutExpired:
            print("Localtunnel process did not terminate gracefully, killing.")
            lt_proc.kill()
            try:
                # Collect the exit status so no zombie process is left behind.
                lt_proc.wait(timeout=5)
                print("Localtunnel process killed.")
            except subprocess.TimeoutExpired:
                print("Localtunnel process could not be reaped after kill.")
        except Exception as e:
            print(f"Error during localtunnel termination: {e}")
    else:
        print("Localtunnel process not found or already stopped.")
    # Clear the entries even when the tunnel died on its own, so a stale URL
    # is never presented as live.
    lt_process_store["process"] = None
    public_url_store["url"] = None

    # Stop FastAPI/Uvicorn server
    # Setting `should_exit` asks uvicorn's serving loop to finish.
    if uvicorn_server_instance:
        print("Requesting FastAPI server (Uvicorn) to shut down...")
        uvicorn_server_instance.should_exit = True
        still_running = bool(server_thread) and server_thread.is_alive()
        if still_running:
            server_thread.join(timeout=5)
            still_running = server_thread.is_alive()
            if still_running:
                print("FastAPI server thread did not exit gracefully.")
            else:
                print("FastAPI server thread exited.")
        if not still_running:
            uvicorn_server_instance = None
            server_thread = None
    else:
        print("FastAPI server instance not found (might have failed to start or already stopped).")

    print("Services stop requested. If issues persist, restart the Jupyter kernel.")

# Stopping is opt-in: uncomment the call below (or run it in a new cell) to stop the services manually.
# stop_services()

print("Run the line 'stop_services()' in this cell (uncomment it) or a new cell to stop localtunnel and attempt to stop the FastAPI server.")
print("The most reliable way to stop all processes started by this notebook is to 'Restart Kernel' or 'Shut Down Kernel' from the Jupyter menu.")