In [14]:
import time
import requests
import json

OLLAMA_URL = "http://host.docker.internal:11434/api/generate"

def measure_latency(prompt, model="llama3"):
    # --- Send request ---
    start = time.time()

    # streaming = True â†’ lets us measure first token time
    response = requests.post(
        OLLAMA_URL,
        json={"model": model, "prompt": prompt, "stream": True},
        stream=True
    )
    
    # --- First token latency ---
    first_token_time = None
    tokens = 0
    
    for line in response.iter_lines():
        if not line:
            continue
        data = json.loads(line.decode("utf-8"))
        
        if first_token_time is None:
            first_token_time = time.time() - start
        
        tokens += 1
        
        # End of stream?
        if data.get("done"):
            break

    total_time = time.time() - start
    tokens_per_sec = tokens / total_time if total_time > 0 else 0

    return {
        "prompt": prompt,
        "first_token_latency": first_token_time,
        "total_latency": total_time,
        "tokens": tokens,
        "tokens_per_sec": tokens_per_sec
    }

# ---- Test the function ----
result = measure_latency("Explain quantum entanglement in one sentence.")
result


{'prompt': 'Explain quantum entanglement in one sentence.',
 'first_token_latency': 0.345139741897583,
 'total_latency': 1.3408548831939697,
 'tokens': 55,
 'tokens_per_sec': 41.018607374563764}

In [None]:
import requests, json

response = requests.post(
    "http://host.docker.internal:11436/api/embeddings",
    json={"model": "nomic-embed-text", "prompt": "hello world"}
).json()

response




{'embedding': [-0.15567687153816223,
  -0.03041648119688034,
  -3.9096615314483643,
  0.19335557520389557,
  0.13164487481117249,
  1.5941569805145264,
  -0.004164695739746094,
  -0.9834780693054199,
  -0.33262476325035095,
  -1.2340929508209229,
  0.011161962524056435,
  0.8931363821029663,
  0.6323418021202087,
  1.8454142808914185,
  1.0351557731628418,
  -1.4351294040679932,
  0.23527556657791138,
  -0.6747883558273315,
  -0.9761874675750732,
  0.6759461164474487,
  -0.08489550650119781,
  -2.1519100666046143,
  -0.17436952888965607,
  0.8691720366477966,
  2.1018905639648438,
  -0.3264440596103668,
  -0.3433741331100464,
  1.4053444862365723,
  0.14877256751060486,
  -0.5022650361061096,
  -0.027482235804200172,
  -0.24973300099372864,
  -0.004718352109193802,
  0.3573193848133087,
  0.9005894660949707,
  0.06268630176782608,
  0.7435093522071838,
  0.3936997056007385,
  0.371711790561676,
  0.13436990976333618,
  -0.10684989392757416,
  -0.338253378868103,
  0.2759498059749603,
 

In [16]:
import requests, json

response = requests.post(
    "http://host.docker.internal:11434/api/generate",
    json={"model": "llama3.1:8b", "prompt": "Say hello.", "stream": False}
).json()

response


{'model': 'llama3.1:8b',
 'created_at': '2025-11-23T16:35:51.590735299Z',
 'response': 'Hello! How are you today? Is there something I can help you with or would you like to chat?',
 'done': True,
 'done_reason': 'stop',
 'context': [128006,
  882,
  128007,
  271,
  46864,
  24748,
  13,
  128009,
  128006,
  78191,
  128007,
  271,
  9906,
  0,
  2650,
  527,
  499,
  3432,
  30,
  2209,
  1070,
  2555,
  358,
  649,
  1520,
  499,
  449,
  477,
  1053,
  499,
  1093,
  311,
  6369,
  30],
 'total_duration': 7449671981,
 'load_duration': 6945194363,
 'prompt_eval_count': 13,
 'prompt_eval_duration': 54855879,
 'eval_count': 23,
 'eval_duration': 426709321}

In [11]:
prompts = ["test"] * 10
results = [measure_latency(p) for p in prompts]

import pandas as pd
pd.DataFrame(results)


Unnamed: 0,prompt,first_token_latency,total_latency,tokens,tokens_per_sec
0,test,0.730539,0.972218,15,15.428645
1,test,0.131935,0.670566,30,44.738304
2,test,0.133703,1.065426,51,47.868182
3,test,0.128864,0.279615,9,32.187096
4,test,0.126937,0.694224,31,44.654152
5,test,0.127798,0.768946,35,45.516854
6,test,0.125307,0.678498,31,45.689153
7,test,0.13151,0.760973,34,44.679642
8,test,0.134536,0.960644,45,46.843587
9,test,0.126108,0.839205,38,45.280949


In [None]:
from ollama import Client
client = Client(host='http://host.docker.internal:11434')

prompt = """
Extract all structured metadata from the following academic text. 
Return ONLY valid JSON with the fields:
title, authors (name, email, affiliation), publication (venue, date, review_url), 
abstract_summary, keywords.

Text:
<<<
Published in Transactions on Machine Learning Research (02/2023)
Diffusion-based Time Series Imputation and Forecasting
with Structured State Space Models
Juan Miguel Lopez Alcaraz
juan.lopez.alcaraz@uol.de
Division AI4Health
Oldenburg University
Nils Strodthoff
nils.strodthoff@uol.de
Division AI4Health
Oldenburg University
Reviewed on OpenReview: https: // openreview. net/ forum? id= hHiIbk7ApW
Abstract
The imputation of missing values represents a significant obstacle for many real-world data
analysis pipelines. Here, we focus on time series data and put forward SSSD, an imputation
model that relies on two emerging technologies, (conditional) diffusion models as state-of-
the-art generative models and structured state space models as internal model architecture,
which are particularly suited to capture long-term dependencies in time series data. We
demonstrate that SSSD matches or even exceeds state-of-the-art probabilistic imputation
and forecasting performance on a broad range of data sets and different missingness scenarios,
including the challenging blackout-missing scenarios, where prior approaches failed to provide
meaningful results.
1
Introduction
Missing input data is a common phenomenon in real-world machine learning applications, which can have many
different reasons, ranging from inadequate data entry over equipment failures to file losses. Handling missing
input data represents a major challenge for machine learning applications as most algorithms require data
without missing values to train. Unfortunately, the imputation quality has a critical impact on downstream
tasks, as demonstrated in prior work (Shadbahr et al., 2022), and poor imputations can even introduce bias
into the downstream analysis (Zhang et al., 2022), which can potentially call into question the validity of the
results achieved in them.
In this work, we focus on time series as a data modality, where missing data is particularly prevalent,
for example, due to faulty sensor equipment. We consider a range of different missingness scenarios, see
Figure 1 for a visual overview, where the example of faulty sensor equipment former example already suggests
that not-at-random missingness scenarios are significant for real-world scenarios. Time series forecasting
is naturally contained in this approach as special case of blackout missingness, where the location of the
imputation window is at the end of the sequence. We also stress that the most realistic scenario to address
imputation as an underspecified problem class is the use of probabilistic imputation methods, which do not
provide only a single imputation but instead allow samples of different plausible imputations.
There is a large body of literature on time series imputation, see (Osman et al., 2018) for a review, ranging
from statistical methods (Lin & Tsai, 2020) to autoregressive models (Atyabi et al., 2016; Bashir & Wei,
2018). Recently, deep generative models started to emerge as a promising paradigm to model time series
imputation of long sequences or time series forecasting problems at long horizons. However, many existing
models remain limited to the random missing scenario or show unstable behavior during training. In addition,
we demonstrate that state-of-the-art approaches even fail to deliver qualitatively meaningful imputations in
blackout missing scenarios on certain data sets.
1
>>>
"""




times = []
for i in range(10):
    start = time.time()
    response = client.generate(
        model="llama3",
        prompt=prompt,
        format="json",
        options={"temperature": 0}
    )
    elapsed = time.time() - start
    times.append(elapsed)
    print(f"Request {i+1}: {elapsed:.2f} seconds")

print(f"\nAverage time: {sum(times)/len(times):.2f} seconds")
print(f"Total time for 10 requests: {sum(times):.2f} seconds")


{"title": "Diffusion-based Time Series Imputation and Forecasting with Structured State Space Models",
"authors": [
{"name": "Juan Miguel Lopez Alcaraz", 
"email": "juan.lopez.alcaraz@uol.de", 
"affiliation": "Oldenburg University, Division AI4Health"},
{"name": "Nils Strodthoff", 
"email": "nils.strodthoff@uol.de", 
"affiliation": "Oldenburg University, Division AI4Health"}
],
"publication": {
"venue": "Transactions on Machine Learning Research",
"date": "02/2023",
"review_url": "https://openreview.net/forum?id=hHiIbk7ApW"
},
"abstract_summary": "The imputation of missing values represents a significant obstacle for many real-world data analysis pipelines. Here, we focus on time series data and put forward SSSD, an imputation model that relies on two emerging technologies, (conditional) diffusion models as state-of-the-art generative models and structured state space models as internal model architecture, which are particularly suited to capture long-term dependencies in time series d