In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Query FastAPI Prometheus Metrics\n",
    "\n",
    "This notebook demonstrates how to fetch and visualize metrics from your FastAPI app's Prometheus `/metrics` endpoint."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "import os\n",
    "import re\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import requests\n",
    "\n",
    "# URL of the FastAPI app's Prometheus /metrics endpoint.\n",
    "# The default targets localhost (notebook running in the same container as the app);\n",
    "# set the FASTAPI_METRICS_URL environment variable to point at the public endpoint instead.\n",
    "FASTAPI_METRICS_URL = os.environ.get(\"FASTAPI_METRICS_URL\", \"http://localhost:8080/metrics\")"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Fetch the raw Prometheus text exposition from the app.\n",
    "# The timeout (seconds) keeps this cell from hanging forever if the app is unreachable,\n",
    "# and raise_for_status() stops here on a 4xx/5xx reply so an error page is never\n",
    "# parsed as metrics by the cells below.\n",
    "resp = requests.get(FASTAPI_METRICS_URL, timeout=10)\n",
    "resp.raise_for_status()\n",
    "metrics_text = resp.text\n",
    "print(metrics_text[:500])  # Print first 500 chars for inspection"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parse Inference Latency Histogram Buckets"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Example Prometheus histogram line:\n",
    "# inference_latency_seconds_bucket{le=\"0.005\"} 0.0\n",
    "#\n",
    "# Sample lines have the form `name{labels} value [timestamp]`; for a bucket, `le` is the\n",
    "# upper bound and the value is the cumulative count of observations <= le.\n",
    "\n",
    "# Maps bucket upper bound (seconds) -> cumulative count.\n",
    "latency_buckets = {}\n",
    "for line in metrics_text.splitlines():\n",
    "    if not line.startswith('inference_latency_seconds_bucket{'):\n",
    "        continue\n",
    "    # Find `le` anywhere in the label set, then take the first token after the closing brace.\n",
    "    # \\S+ (rather than [0-9.]+) keeps exponent-form values such as 1e+06 intact.\n",
    "    m = re.search(r'[{,]le=\"([^\"]+)\".*\\}\\s+(\\S+)', line)\n",
    "    if not m:\n",
    "        continue\n",
    "    try:\n",
    "        le = float(m.group(1))\n",
    "        count = float(m.group(2))\n",
    "    except ValueError:\n",
    "        continue  # malformed sample: skip it rather than abort the whole parse\n",
    "    if le == float('inf'):\n",
    "        continue  # the +Inf bucket has no finite position on the latency axis\n",
    "    # Sum instead of overwrite so a histogram with extra labels (several series per `le`)\n",
    "    # reports the total across all series.\n",
    "    latency_buckets[le] = latency_buckets.get(le, 0.0) + count\n",
    "\n",
    "if not latency_buckets:\n",
    "    print(\"No latency buckets found. Have you made any /generate requests?\")\n",
    "else:\n",
    "    print(latency_buckets)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plot Latency Histogram"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Draw the cumulative bucket counts as a step curve; nothing is drawn when no buckets were parsed.\n",
    "if latency_buckets:\n",
    "    # Sort by upper bound, then split the (le, count) pairs into x and y sequences.\n",
    "    upper_bounds, cumulative_counts = zip(*sorted(latency_buckets.items()))\n",
    "    fig, ax = plt.subplots(figsize=(8, 4))\n",
    "    ax.step(upper_bounds, cumulative_counts, where='post')\n",
    "    ax.set_xlabel('Latency (seconds)')\n",
    "    ax.set_ylabel('Cumulative Requests')\n",
    "    ax.set_title('Inference Latency Histogram')\n",
    "    ax.grid(True)\n",
    "    plt.show()"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Number of Model Switches"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Extract the MODEL_SWITCHES counter, exposed as `model_switches_total`.\n",
    "# Sample lines have the form `name{labels} value [timestamp]`, so the value is the first\n",
    "# token after the name/labels -- the last token may be a timestamp.\n",
    "# Stays None when the metric is absent from the scrape.\n",
    "model_switches = None\n",
    "for line in metrics_text.splitlines():\n",
    "    # Requiring labels or whitespace right after the name avoids matching longer metric names.\n",
    "    m = re.match(r'model_switches_total(?:\\{.*\\})?\\s+(\\S+)', line)\n",
    "    if not m:\n",
    "        continue\n",
    "    try:\n",
    "        value = float(m.group(1))\n",
    "    except ValueError:\n",
    "        # Best effort: report the bad sample instead of silently dropping it.\n",
    "        print(f\"Skipping unparsable sample: {line!r}\")\n",
    "        continue\n",
    "    # Sum so a labelled counter reports its total across all label sets.\n",
    "    model_switches = value if model_switches is None else model_switches + value\n",
    "\n",
    "print(f\"Model switches: {model_switches}\")"
   ],
   "execution_count": null,
   "outputs": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": ""
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}