# Demo: Локальный запуск скриптов для предсказания пространственно-временных рядов

Ноутбук показывает, как подготовить данные, проверить интерфейсы локальных скриптов и запустить предсказание без поднятия FastAPI. Полезно для отладки и быстрого прототипирования.

Цель: запускать модули и CLI-скрипты локально, генерировать тестовый датасет и проверять выходные файлы/метрики.

## 1) Установка зависимостей и проверка окружения

Выполните в терминале следующую проверку и установку зависимостей, если это ещё не сделано:

```bash
python --version
pip install -r requirements.txt
```

Проверьте, что виртуальное окружение активно и переменные окружения (`HF_MODEL`, `EMBED_MODEL`, `FAISS_PATH`) установлены при необходимости.

In [7]:
# 2) Import libraries and check that the local scripts are reachable
import sys
from pathlib import Path
import importlib

print('python:', sys.version.splitlines()[0])

# Repository root; assumes the notebook runs from a directory directly below it.
repo_root = Path('..').resolve()
print('repo_root (relative):', repo_root)

# Files we expect, relative to the repository root
# (adjust names if your project uses different filenames).
expected = ['scripts/build_vector_db.py', 'app/main.py', 'app/model_selector.py', 'app/model_runner.py']
for p in expected:
    # Resolve every entry against repo_root so all paths share one base
    # directory instead of mixing cwd-relative and '../'-prefixed paths.
    exists = (repo_root / p).exists()
    print(f"{p}: {exists}")

# Quick import checks (non-fatal): a failing import is reported, not raised.
for mod in ['app.model_selector', 'app.model_runner', 'app.vector_db']:
    try:
        importlib.import_module(mod)
        print(f"Imported {mod}")
    except Exception as e:
        print(f"Failed to import {mod}: {e}")


python: 3.12.3 (main, Aug 14 2025, 17:47:21) [GCC 13.3.0]
repo_root (relative): /home/sasha/llm-ida
scripts/build_vector_db.py: False
../app/main.py: True
../app/model_selector.py: True
../app/model_runner.py: True
Imported app.model_selector
Imported app.model_runner
Imported app.vector_db


## 3) Генерация синтетического пространственно‑временного датасета

Создадим небольшой синтетический датасет: X с формой (T, N, F). T=30 временных шагов, N=6 локаций, F=1 признак (например, интенсивность). Каждая локация — синусоида со своим фазовым сдвигом и постоянным смещением, растущим с номером локации (шум в этом примере не добавляется).

In [None]:

# Demo: build a synthetic input (demo_in.npz), let select_model pick a model,
# then execute it through run_model_from_choice.
import importlib, os
import sys
from pathlib import Path
import numpy as np

# Make the resolved current directory importable if it is not already.
repo_root = Path('.').resolve()
repo_entry = str(repo_root)
if repo_entry not in sys.path:
    sys.path.insert(0, repo_entry)

# Project modules: the model selector and the model runner.
m = importlib.import_module('app.llm')
mr = importlib.import_module('app.model_runner')

# Small synthetic (T, N, F) dataset: one sine per node, each with its own
# phase shift and constant offset.
T, N, F = 30, 6, 1
x = np.linspace(0, 2*np.pi, T)
X = np.zeros((T,N,F), dtype=np.float32)
for n in range(N):
    phase = n*0.2
    offset = 0.05*n
    X[:,n,0] = np.sin(x + phase) + offset
node_ids = list(range(N))
np.savez('demo_in.npz', X=X, meta={'nodes': node_ids})

# Ask the LLM (or its fallback) which library/model to use.
libraries = ['pysteps','sktime','tslearn','torch_geometric']
choice = m.select_model('short timeseries from sensor network', 'forecast', libraries)
print('select_model ->', choice)

# Run the selected model through the runner API on the saved file.
res = mr.run_model_from_choice(choice, 'demo_in.npz', horizon=3)
print('runner library:', res.get('library'))
print('runner output keys:', list(res.keys()))

In [6]:
# Demo: run select_model with HF_MODEL='fallback' for deterministic behaviour.
import importlib
import os
import sys
from pathlib import Path

# Make the resolved current directory importable
# (not required if the package is installed in editable mode).
repo_root = Path('.').resolve()
repo_entry = str(repo_root)
if repo_entry not in sys.path:
    sys.path.insert(0, repo_entry)

# 'fallback' keeps the demo from attempting to load a large HF model.
os.environ['HF_MODEL'] = 'fallback'

# Import app.llm and reload it after the environment change
# (presumably HF_MODEL is read at import time — TODO confirm).
m = importlib.import_module('app.llm')
importlib.reload(m)
print('HF_MODEL=', m.HF_MODEL)

libraries = ['pysteps','sktime','tslearn','torch_geometric']
res = m.select_model('short timeseries from sensor network', 'forecast', libraries)
print('select_model ->', res)

# Show the concrete model_name/model_args when the selector provides them.
print('model_name:', res.get('model_name'))
print('model_args:', res.get('model_args'))

HF_MODEL= fallback
select_model -> {'model_choice': 'Graph-based GNN (GCN/GAT)', 'library': 'torch_geometric', 'model_name': 'GAT', 'model_args': {'layers': 2}, 'rationale': 'Dataset appears to have graph structure or many sensors, GNNs model spatial relations well.', 'confidence': 0.7}
model_name: GAT
model_args: {'layers': 2}


In [2]:
# Smoke test (direct Python): import `app.llm` straight from the repo,
# set HF_MODEL to 'fallback', and call select_model once.
import importlib
import os
import sys
from pathlib import Path

# Make the resolved current directory importable
# (not required if the package is installed in editable mode).
repo_root = Path('.').resolve()
repo_entry = str(repo_root)
if repo_entry not in sys.path:
    sys.path.insert(0, repo_entry)

# Backend used for this test.
os.environ['HF_MODEL'] = 'fallback'

# Import the module, then reload it after the environment change.
m = importlib.import_module('app.llm')
importlib.reload(m)
print('HF_MODEL=', m.HF_MODEL)

libraries = ['pysteps','sktime','tslearn','torch_geometric']
res = m.select_model('short timeseries from sensor network', 'forecast', libraries)
print('select_model ->', res)

HF_MODEL= fallback
select_model -> {'model_choice': 'Graph-based GNN (GCN/GAT)', 'library': 'torch_geometric', 'rationale': 'Dataset appears to have graph structure or many sensors, GNNs model spatial relations well.', 'confidence': 0.7}


In [1]:
# Smoke test (direct Python) of `app.llm.select_model` with the 'fallback'
# backend; the module is imported directly from the repo.
import importlib
import os
import sys
from pathlib import Path

# Put the resolved current directory first on sys.path unless already listed
# (not required if the package is installed in editable mode).
repo_root = Path('.').resolve()
path_entry = str(repo_root)
if path_entry not in sys.path:
    sys.path.insert(0, path_entry)

# HF_MODEL value for this test.
os.environ['HF_MODEL'] = 'fallback'

# Import and reload app.llm, then report the HF_MODEL it exposes.
m = importlib.import_module('app.llm')
importlib.reload(m)
print('HF_MODEL=', m.HF_MODEL)

candidate_libs = ['pysteps','sktime','tslearn','torch_geometric']
res = m.select_model('short timeseries from sensor network', 'forecast', candidate_libs)
print('select_model ->', res)

HF_MODEL= fallback
select_model -> {'model_choice': 'Graph-based GNN (GCN/GAT)', 'library': 'torch_geometric', 'rationale': 'Dataset appears to have graph structure or many sensors, GNNs model spatial relations well.', 'confidence': 0.7}


In [2]:
# TinyLlama smoke test (direct Python): import `app.llm` straight from the
# repo, point HF_MODEL at TinyLlama, and call select_model once.
import importlib
import os
import sys
from pathlib import Path

# Make the resolved current directory importable
# (not required if the package is installed in editable mode).
repo_root = Path('.').resolve()
repo_entry = str(repo_root)
if repo_entry not in sys.path:
    sys.path.insert(0, repo_entry)

# Model identifier used for this test.
os.environ['HF_MODEL'] = 'TinyLlama/TinyLlama_v1.1'

# Import the module, then reload it after the environment change.
m = importlib.import_module('app.llm')
importlib.reload(m)
print('HF_MODEL=', m.HF_MODEL)

libraries = ['pysteps','sktime','tslearn','torch_geometric']
res = m.select_model('short timeseries from sensor network', 'forecast', libraries)
print('select_model ->', res)

HF_MODEL= TinyLlama/TinyLlama_v1.1


Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


select_model -> {'model_choice': 'pysteps', 'library': 'pysteps', 'rationale': 'pysteps is the most accurate model for this dataset', 'confidence': 1}


In [None]:
# 4) Demo: pass model_name and model_args explicitly to run_model_from_choice
import importlib
import numpy as np
from pathlib import Path
repo_root = Path('.').resolve()
import sys
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))
mr = importlib.import_module('app.model_runner')
m = importlib.import_module('app.llm')
# Load the demo input saved earlier; fall back to zeros if it cannot be read.
try:
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it as soon as the array has been read.
    # NOTE(review): allow_pickle=True unpickles object arrays — only use it
    # on files you trust.
    with np.load('demo_in.npz', allow_pickle=True) as d:
        X = d['X'] if 'X' in d else d[d.files[0]]
except Exception as e:
    print('Could not load demo_in.npz:', e)
    X = np.zeros((30,6,1), dtype=np.float32)
# Create explicit choice: request RandomForest reduction with small ensemble
choice = {'library':'sktime', 'model_name':'forest', 'model_args':{'n_estimators':10}}
print('choice ->', choice)
res = mr.run_model_from_choice(choice, X, horizon=3)
print('library:', res.get('library'))
print('y_pred.shape:', getattr(res.get('y_pred'), 'shape', None))
# 'csv' may be missing or None; both are shown as an empty preview.
csv_text = res.get('csv') or ''
print('csv preview:\n', '\n'.join(csv_text.splitlines()[:10]))
np.savez('demo_out.npz', y_pred=res.get('y_pred'), meta=res.get('meta'))
print('Saved demo_out.npz')

### Эксперимент: влияние `n_estimators` на предсказания и время выполнения
В этом эксперименте мы запустим `run_model_from_choice` с разными значениями `n_estimators` для режима `forest` и измерим время выполнения и простую статистику прогнозов (среднее по горизонту и узлам).

In [None]:
# Experiment: effect of n_estimators on runtime and on the mean prediction.
import time
import importlib
import numpy as np
from pathlib import Path
repo_root = Path('.').resolve()
import sys
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))
mr = importlib.import_module('app.model_runner')
# Load the demo input, or fall back to an all-zero array if it is missing.
try:
    # The context manager closes the NpzFile once the array has been read.
    # NOTE(review): allow_pickle=True unpickles object arrays — only use it
    # on files you trust.
    with np.load('demo_in.npz', allow_pickle=True) as d:
        X = d['X'] if 'X' in d else d[d.files[0]]
except Exception as e:
    # Report the fallback instead of switching to zeros silently.
    print('Could not load demo_in.npz:', e)
    X = np.zeros((30,6,1), dtype=np.float32)
results = []
for n_est in [5, 10, 50]:
    choice = {'library':'sktime', 'model_name':'forest', 'model_args':{'n_estimators': n_est}}
    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    t0 = time.perf_counter()
    res = mr.run_model_from_choice(choice, X, horizon=3)
    dt = time.perf_counter() - t0
    y = res.get('y_pred')
    # Best effort: a missing or non-numeric prediction yields mean_pred=None.
    try:
        mean_pred = float(np.mean(y))
    except Exception:
        mean_pred = None
    results.append({'n_estimators': n_est, 'time': dt, 'mean_pred': mean_pred})
    print(f'n_estimators={n_est}  time={dt:.3f}s  mean_pred={mean_pred}')
# Save results (a list of dicts is stored as a pickled object array).
np.savez('demo_n_estimators_results.npz', results=results)
print('Saved demo_n_estimators_results.npz')

### Эксперимент: параметры для `AutoARIMA` (пример `model_args`)
В этом эксперименте мы протестируем несколько конфигураций `model_args` для `AutoARIMA` — включая сезонность (`seasonal`), период сезонности `m` и ограничения `max_p`/`max_q` — чтобы увидеть влияние параметров на время выполнения и простую статистику прогнозов.
Если `sktime` недоступен в среде, ячейка перехватит исключение и сохранит для каждой конфигурации запись с `success: False` и текстом ошибки.

In [None]:
# Experiment: several AutoARIMA model_args configurations, timed one by one.
import time
import importlib
import numpy as np
from pathlib import Path
repo_root = Path('.').resolve()
import sys
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))
mr = importlib.import_module('app.model_runner')
# Load the demo input, or fall back to an all-zero array if it is missing.
try:
    # The context manager closes the NpzFile once the array has been read.
    # NOTE(review): allow_pickle=True unpickles object arrays — only use it
    # on files you trust.
    with np.load('demo_in.npz', allow_pickle=True) as d:
        X = d['X'] if 'X' in d else d[d.files[0]]
except Exception as e:
    # Report the fallback instead of switching to zeros silently.
    print('Could not load demo_in.npz:', e)
    X = np.zeros((30,6,1), dtype=np.float32)
# define AutoARIMA configurations to try
configs = [
    {'model_name':'auto_arima', 'model_args': {'seasonal': False, 'max_p': 2, 'max_q':2}},
    {'model_name':'auto_arima', 'model_args': {'seasonal': True, 'm': 12, 'max_p': 3, 'max_q':3}},
    {'model_name':'auto_arima', 'model_args': {'seasonal': True, 'm': 4, 'max_p': 2, 'max_q':2, 'start_p':0}},
]
results = []
for cfg in configs:
    choice = {'library':'sktime', 'model_name': cfg['model_name'], 'model_args': cfg['model_args']}
    print('Running cfg:', cfg)
    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    t0 = time.perf_counter()
    try:
        res = mr.run_model_from_choice(choice, X, horizon=3)
        dt = time.perf_counter() - t0
        y = res.get('y_pred')
        # Best effort: a missing or non-numeric prediction yields None.
        try:
            mean_pred = float(np.mean(y))
        except Exception:
            mean_pred = None
        results.append({'config': cfg, 'time': dt, 'mean_pred': mean_pred, 'success': True})
        # The original statement here was a truncated `print(f ...)` that
        # evaluated the bare name `f`; with `f` undefined the resulting
        # NameError was caught below, so a run that succeeded was recorded
        # a second time as a failure.
        print(f'time={dt:.3f}s  mean_pred={mean_pred}')
    except Exception as e:
        # Any failure of this configuration is recorded; the loop continues.
        dt = time.perf_counter() - t0
        print('config failed:', e)
        results.append({'config': cfg, 'time': dt, 'mean_pred': None, 'success': False, 'error': str(e)})
# Save results (a list of dicts is stored as a pickled object array).
np.savez('demo_autoarima_results.npz', results=results)
print('Saved demo_autoarima_results.npz')

### Эксперимент: tslearn — конфигурации с holdout RMSE, графики и экспорт
В этой ячейке мы запускаем набор конфигураций `n_neighbors` × `window_size` для адаптера `tslearn`,
делаем holdout (последние `horizon` шагов) для оценки RMSE, строим графики с легендами и сохраняем результаты
в нескольких форматах: PNG, PDF и простая HTML-страница с изображениями и таблицей результатов.

In [None]:
# tslearn experiment with holdout RMSE + plots + export
import time
import importlib
import numpy as np
from pathlib import Path
repo_root = Path('.').resolve()
import sys
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))
mr = importlib.import_module('app.model_runner')
# Load the demo input, or generate a synthetic series if it cannot be read.
try:
    # The context manager closes the NpzFile once the array has been read.
    # NOTE(review): allow_pickle=True unpickles object arrays — only use it
    # on files you trust.
    with np.load('demo_in.npz', allow_pickle=True) as d:
        X = d['X'] if 'X' in d else d[d.files[0]]
except Exception as e:
    # Report the fallback instead of switching datasets silently.
    print('Could not load demo_in.npz, using a synthetic series:', e)
    T, N, F = 60, 6, 1
    x = np.linspace(0, 4 * np.pi, T)
    X = np.zeros((T, N, F), dtype=np.float32)
    for n in range(N):
        X[:, n, 0] = np.sin(x + n * 0.25) + 0.05 * n
# ensure we have enough length
horizon = 3
if X.shape[0] <= horizon:
    raise RuntimeError('input series too short for holdout')
# Grid of n_neighbors x window_size configurations.
configs = []
for n_neighbors in [1, 3, 5, 9]:
    for window_size in [5, 10, 15]:
        configs.append({'n_neighbors': n_neighbors, 'window_size': window_size})
# Holdout split: the last `horizon` steps are the truth, the rest is training
# data. It is the same for every configuration, so it is built once here.
X_train = X[:-horizon]
X_truth = X[-horizon:, :, 0]  # shape (H, N)
results = []
for cfg in configs:
    choice = {'library': 'tslearn', 'model_name': 'knn', 'model_args': {'n_neighbors': cfg['n_neighbors'], 'window_size': cfg['window_size']}}
    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    t0 = time.perf_counter()
    try:
        res = mr.run_model_from_choice(choice, X_train, horizon=horizon)
        dt = time.perf_counter() - t0
        y = res.get('y_pred')
        y_arr = np.asarray(y)
        # normalize prediction shape to (H, N)
        if y_arr.ndim == 1:
            pred = y_arr.reshape((horizon, 1))
        elif y_arr.ndim == 2:
            # (H, N) expected
            pred = y_arr
        elif y_arr.ndim == 3:
            # keep only the first feature channel
            pred = y_arr[:, :, 0]
        else:
            pred = y_arr.reshape((horizon, -1))[:, :X_truth.shape[1]]
        # Align the node axis when shapes differ: both sides are trimmed to
        # the smaller number of columns. The horizon axis is not adjusted.
        if pred.shape != X_truth.shape:
            minN = min(pred.shape[1], X_truth.shape[1])
            pred = pred[:, :minN]
            truth = X_truth[:, :minN]
        else:
            truth = X_truth
        # compute RMSE across horizons and nodes
        rmse = float(np.sqrt(np.mean((pred - truth) ** 2)))
        mean_pred = float(np.mean(pred))
        results.append({'cfg': cfg, 'time': dt, 'mean_pred': mean_pred, 'rmse': rmse, 'success': True})
        print(f'cfg={cfg} dt={dt:.3f}s rmse={rmse:.6f} mean={mean_pred:.6f}')
    except Exception as e:
        # Any failure of this configuration is recorded; the loop continues.
        dt = time.perf_counter() - t0
        print('cfg failed', cfg, e)
        results.append({'cfg': cfg, 'time': dt, 'mean_pred': None, 'rmse': None, 'success': False, 'error': str(e)})
# Save full results (a list of dicts is stored as a pickled object array).
np.savez('demo_tslearn_results.npz', results=results)
print('Saved demo_tslearn_results.npz')
# Plotting and export: two PNG charts, a two-page PDF built from those PNGs,
# and a standalone HTML report. Everything runs inside one try block, so any
# failure (including a missing matplotlib) only prints a message.
try:
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    import base64
    # One tick label per configuration, e.g. "k=3,w=10".
    labels = [f"k={r['cfg']['n_neighbors']},w={r['cfg']['window_size']}" for r in results]
    times = [r['time'] for r in results]
    # Failed runs store None for rmse/mean_pred; plot them as NaN.
    rmses = [r['rmse'] if r['rmse'] is not None else float('nan') for r in results]
    means = [r['mean_pred'] if r['mean_pred'] is not None else float('nan') for r in results]
    # Figure 1: runtime per configuration as a bar chart.
    fig, ax = plt.subplots(figsize=(10,4))
    ax.bar(range(len(times)), times, color='C0')
    ax.set_xticks(range(len(times)))
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_ylabel('time (s)')
    ax.set_title('tslearn experiment runtime')
    fig.tight_layout()
    p_time = repo_root / 'demo_tslearn_time.png'
    fig.savefig(p_time)
    plt.close(fig)
    # Figure 2: RMSE and mean prediction per configuration on shared axes.
    fig, ax = plt.subplots(figsize=(10,4))
    ax.plot(range(len(rmses)), rmses, marker='o', label='RMSE')
    ax.plot(range(len(means)), means, marker='x', label='mean_pred')
    ax.set_xticks(range(len(means)))
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_ylabel('value')
    ax.set_title('tslearn experiment RMSE and mean prediction')
    ax.legend()
    fig.tight_layout()
    p_mean = repo_root / 'demo_tslearn_mean.png'
    fig.savefig(p_mean)
    plt.close(fig)
    # PDF report: each saved PNG is read back and drawn as an image on its
    # own page, so the pages are raster copies of the PNG files.
    pdf_path = repo_root / 'demo_tslearn_report.pdf'
    with PdfPages(pdf_path) as pdf:
        import matplotlib.image as mpimg
        img = mpimg.imread(str(p_time))
        fig = plt.figure(figsize=(10,4))
        # plt.axis/plt.imshow act on the figure created on the previous line.
        plt.axis('off')
        plt.imshow(img)
        pdf.savefig(fig)
        plt.close(fig)
        img = mpimg.imread(str(p_mean))
        fig = plt.figure(figsize=(10,4))
        plt.axis('off')
        plt.imshow(img)
        pdf.savefig(fig)
        plt.close(fig)
    print('Saved', p_time, p_mean, pdf_path)
    # HTML report: images are inlined as base64 data URIs.
    try:
        def _img_to_base64(path: Path) -> str:
            # Return the file's bytes as base64 text, or '' if it cannot be read.
            try:
                with open(path, 'rb') as f:
                    return base64.b64encode(f.read()).decode('ascii')
            except Exception:
                return ''

        p_time_b64 = _img_to_base64(p_time)
        p_mean_b64 = _img_to_base64(p_mean)
        pdf_str = str(pdf_path)
    except Exception:
        # Fall back to empty values; the matching HTML sections are then skipped.
        p_time_b64 = p_mean_b64 = ''
        pdf_str = ''

    # Assemble the page line by line; sections with empty values are omitted.
    html = []
    html.append('<!doctype html>')
    html.append('<html>')
    html.append('<head><meta charset="utf-8"><title>tslearn experiment</title></head>')
    html.append('<body>')
    html.append('<h1>tslearn experiment results</h1>')
    if p_time_b64:
        html.append(f"<p>Time plot:</p><p><img src=\"data:image/png;base64,{p_time_b64}\" style=\"max-width:100%;height:auto;\"></p>")
    if p_mean_b64:
        html.append(f"<p>RMSE/mean plot:</p><p><img src=\"data:image/png;base64,{p_mean_b64}\" style=\"max-width:100%;height:auto;\"></p>")
    if pdf_str:
        # NOTE(review): the link target is str(pdf_path), i.e. a local
        # filesystem path, so it only works on the machine that wrote it.
        html.append(f"<p>PDF report: <a href=\"{pdf_str}\">{pdf_str}</a></p>")

    html.append('<h2>Summary table</h2>')
    html.append('<table border="1" cellpadding="4"><tr><th>n_neighbors</th><th>window_size</th><th>time_s</th><th>mean_pred</th><th>rmse</th></tr>')
    # One table row per configuration.
    for r in results:
        cfg = r.get('cfg', {})
        nn = cfg.get('n_neighbors', '')
        ws = cfg.get('window_size', '')
        time_s = r.get('time', float('nan'))
        # The '' default applies only when the key is absent; failed runs store
        # None under these keys, which is rendered as the text "None".
        mean_pred_v = r.get('mean_pred', '')
        rmse_v = r.get('rmse', '')
        html.append(f"<tr><td>{nn}</td><td>{ws}</td><td>{time_s:.3f}</td><td>{mean_pred_v}</td><td>{rmse_v}</td></tr>")
    html.append('</table>')
    html.append('</body></html>')
    html_path = repo_root / 'demo_tslearn_report.html'
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(html))
    print('Saved HTML report:', html_path)
except Exception as e:
    # Deliberate best effort: report the problem and leave the results untouched.
    print('Plot/export skipped or partially failed:', e)

cfg={'n_neighbors': 1, 'window_size': 5} dt=0.001s rmse=0.362799 mean=0.108119
cfg={'n_neighbors': 1, 'window_size': 10} dt=0.000s rmse=0.362799 mean=0.108119
cfg={'n_neighbors': 1, 'window_size': 15} dt=0.000s rmse=0.362799 mean=0.108119
cfg={'n_neighbors': 3, 'window_size': 5} dt=0.000s rmse=0.362799 mean=0.108119
cfg={'n_neighbors': 3, 'window_size': 10} dt=0.000s rmse=0.362799 mean=0.108119
cfg={'n_neighbors': 3, 'window_size': 15} dt=0.000s rmse=0.362799 mean=0.108119
cfg={'n_neighbors': 5, 'window_size': 5} dt=0.000s rmse=0.362799 mean=0.108119
cfg={'n_neighbors': 5, 'window_size': 10} dt=0.000s rmse=0.362799 mean=0.108119
cfg={'n_neighbors': 5, 'window_size': 15} dt=0.000s rmse=0.362799 mean=0.108119
cfg={'n_neighbors': 9, 'window_size': 5} dt=0.000s rmse=0.362799 mean=0.108119
cfg={'n_neighbors': 9, 'window_size': 10} dt=0.000s rmse=0.362799 mean=0.108119
cfg={'n_neighbors': 9, 'window_size': 15} dt=0.000s rmse=0.362799 mean=0.108119
Saved demo_tslearn_results.npz
Saved /home/s