# Hackathon Pipeline From Scratch (Offline)

Цель: обучить локальный pipeline для `Symptoms -> Top-3 ICD-10` без внешних API.

Что делаем:
1. Libraries
2. Data load + head
3. Preprocessing
4. Train/test split
5. Train models
6. Evaluate (`Accuracy@1`, `Recall@3`)
7. Final `diagnose()` function in hackathon format
8. Save artifacts for server


## 1) Libraries


In [None]:
from __future__ import annotations

import json
import re
from collections import Counter
from pathlib import Path
from typing import Any

import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


## 2) Data Load + Data Head


In [None]:
def find_project_root() -> Path:
    """Return the closest directory that contains ``data/test_set``.

    Only the current working directory, its parent and its grandparent are
    inspected, in that order.

    Raises:
        FileNotFoundError: if none of the three directories has ``data/test_set``.
    """
    cwd = Path.cwd()
    search_order = (cwd, cwd.parent, cwd.parent.parent)
    root = next(
        (folder for folder in search_order if folder.joinpath('data', 'test_set').exists()),
        None,
    )
    if root is None:
        raise FileNotFoundError('Could not find data/test_set from current working directory.')
    return root

ROOT = find_project_root()
DATA_DIR = ROOT / 'data' / 'test_set'

# Load every *.json file in DATA_DIR (sorted by path) and remember where
# each record came from under the '_path' key.
records: list[dict[str, Any]] = []
for json_path in sorted(DATA_DIR.glob('*.json')):
    record = json.loads(json_path.read_text(encoding='utf-8'))
    record['_path'] = str(json_path)
    records.append(record)

print('Loaded records:', len(records))


In [None]:
# One summary row per record: identifier, raw query, ground-truth code and
# the number of entries the record lists under 'icd_codes'.
rows = [
    {
        'protocol_id': rec.get('protocol_id'),
        'query': rec.get('query'),
        'gt': rec.get('gt'),
        'num_valid_icd_codes': len(rec.get('icd_codes', [])),
    }
    for rec in records
]

df = pd.DataFrame(rows)
print('Shape:', df.shape)
df.head(5)


In [None]:
# Frequency of each ground-truth code, most frequent first.
class_counts = df['gt'].value_counts()
print('Unique target classes:', len(class_counts))
print('Top 15 frequent classes:')
class_counts.head(15)


## 3) Preprocessing


In [None]:
# A token is a run of Latin letters, Russian Cyrillic letters or digits.
# "ё" is listed explicitly: its code point lies outside the contiguous а-я
# range, so without it words such as "ребёнок" were split into fragments.
TOKEN_RE = re.compile(r'[a-zа-яё0-9]+', flags=re.IGNORECASE)
STOPWORDS = {
    'и', 'в', 'во', 'на', 'по', 'с', 'со', 'к', 'ко', 'у', 'о', 'об', 'от', 'до', 'за',
    'что', 'как', 'это', 'а', 'но', 'или', 'не', 'нет', 'есть', 'уже', 'еще', 'очень',
    'the', 'a', 'an', 'and', 'or', 'to', 'of', 'for', 'in', 'on', 'is', 'are'
}


def normalize_text(text: Any) -> str:
    """Turn arbitrary input into a space-separated string of lowercase tokens.

    Args:
        text: Raw input. ``None`` yields an empty string; non-string values
            are converted with ``str()`` first.

    Returns:
        Lowercased tokens matched by ``TOKEN_RE``, joined with single spaces.
        Tokens of one or two characters and tokens in ``STOPWORDS`` are dropped.
    """
    if text is None:
        return ''
    if not isinstance(text, str):
        text = str(text)
    lowered = (match.lower() for match in TOKEN_RE.findall(text))
    return ' '.join(tok for tok in lowered if len(tok) > 2 and tok not in STOPWORDS)

# Per-record model inputs: normalized query text, ground-truth code as a
# string, and the set of codes listed under 'icd_codes'.
X, y, valid_sets = [], [], []
for rec in records:
    X.append(normalize_text(rec.get('query')))
    y.append(str(rec.get('gt', '')))
    valid_sets.append(set(rec.get('icd_codes', [])))

print('Empty normalized queries:', X.count(''))
print('Total samples:', len(X))


## 4) Train/Test Split


In [None]:
# Stratify only when the stratified splitter can accept the data:
#   * every class appears at least twice, and
#   * both partitions are large enough to hold one sample of every class.
# Checking only the first condition lets train_test_split raise on datasets
# with many classes instead of falling back to a plain random split.
TEST_SIZE = 0.2
counts = Counter(y)
n_classes = len(counts)
# Mirrors the ceil rounding train_test_split applies to a fractional test_size.
n_test = int(np.ceil(TEST_SIZE * len(y)))
n_train = len(y) - n_test
can_stratify = (
    all(v >= 2 for v in counts.values())
    and n_test >= n_classes
    and n_train >= n_classes
)

X_train, X_test, y_train, y_test, valid_train, valid_test = train_test_split(
    X,
    y,
    valid_sets,
    test_size=TEST_SIZE,
    random_state=42,
    stratify=y if can_stratify else None,
)

print('Train size:', len(X_train))
print('Test size :', len(X_test))
print('Stratify  :', can_stratify)


## 5) Creating + Training the Model (Ensemble)

Мы обучим 2 модели и объединим их вероятности взвешенным средним (вес word-модели `alpha = 0.65`, вес char-модели `1 - alpha`):
- Model A: word n-grams TF-IDF + Logistic Regression
- Model B: char n-grams TF-IDF + Logistic Regression

Почему это работает:
- word-модель ловит медицинские термины,
- char-модель устойчивее к опечаткам/морфологии.


In [None]:
# Two text classifiers trained on the same split:
#   * word_model: word uni/bi-gram TF-IDF features,
#   * char_model: character 3-5-gram TF-IDF features (word-boundary aware).
# `multi_class` is intentionally not passed: 'auto' is already the default and
# the parameter is deprecated in recent scikit-learn releases.
word_model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_features=50000, sublinear_tf=True)),
    ('clf', LogisticRegression(max_iter=2500, C=2.0)),
])

char_model = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), min_df=1, max_features=80000, sublinear_tf=True)),
    ('clf', LogisticRegression(max_iter=2500, C=1.2)),
])

word_model.fit(X_train, y_train)
char_model.fit(X_train, y_train)

print('Both models trained.')
print('Num classes:', len(word_model.classes_))


## 6) Evaluating (Accuracy, Recall)


In [None]:
def blend_proba(word_proba: np.ndarray, char_proba: np.ndarray, alpha: float = 0.65) -> np.ndarray:
    """Return the weighted sum of two probability arrays.

    Args:
        word_proba: Probabilities from the word-level model.
        char_proba: Probabilities from the char-level model.
        alpha: Weight of ``word_proba``; ``char_proba`` gets ``1 - alpha``.
    """
    word_part = alpha * word_proba
    char_part = (1.0 - alpha) * char_proba
    return word_part + char_part

# Labels are taken from the word model; the char model was fitted on the same
# targets, so its column order is assumed to match.
classes = word_model.classes_
word_proba_test = word_model.predict_proba(X_test)
char_proba_test = char_model.predict_proba(X_test)
blend_test = blend_proba(word_proba_test, char_proba_test, alpha=0.65)

# Top-1 metrics: the predicted label is the class with the highest blended score.
best_idx = blend_test.argmax(axis=1)
pred_top1 = classes[best_idx]
acc1 = accuracy_score(y_test, pred_top1)
recall_macro = recall_score(y_test, pred_top1, average='macro', zero_division=0)

print(f'Accuracy@1: {acc1*100:.2f}%')
print(f'Recall (macro): {recall_macro*100:.2f}%')


In [None]:
# Challenge-style Recall@3

def top_k_from_proba(proba: np.ndarray, classes: np.ndarray, k: int = 3) -> list[list[str]]:
    """Return the ``k`` highest-scoring class labels for every row of ``proba``.

    Args:
        proba: 2-D array of scores, one row per sample and one column per class.
        classes: Class labels, indexed by the column positions of ``proba``.
        k: Number of labels per row. If ``k`` exceeds the number of columns,
            all labels are returned; a non-positive ``k`` yields empty lists.

    Returns:
        One list per row with labels as ``str``, ordered from highest to
        lowest score.
    """
    # Guard explicitly: the slice [-0:] would select the whole row, so k=0
    # would otherwise return every class instead of none.
    if k <= 0:
        return [[] for _ in proba]
    return [
        [str(classes[i]) for i in np.argsort(row)[-k:][::-1]]
        for row in proba
    ]

top3 = top_k_from_proba(blend_test, classes, k=3)

# A test sample is a hit when at least one of its top-3 predicted codes
# belongs to that sample's set of valid codes.
hits = sum(
    1
    for preds, valid in zip(top3, valid_test)
    if not valid.isdisjoint(preds)
)

recall_at_3 = hits / len(top3)
print(f'Recall@3 (challenge): {recall_at_3*100:.2f}%')


## 7) Final Inference Function (`diagnose`)

Формат совместим с `evaluate.py`:
```json
{"diagnoses": [{"rank": 1, "diagnosis": "...", "icd10_code": "...", "explanation": "..."}]}
```


In [None]:
# Retrain on ALL available data for final inference, using the same
# hyper-parameters as the models evaluated on the held-out split.
# `multi_class` is intentionally not passed: 'auto' is already the default and
# the parameter is deprecated in recent scikit-learn releases.
word_model_full = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_features=50000, sublinear_tf=True)),
    ('clf', LogisticRegression(max_iter=2500, C=2.0)),
])

char_model_full = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), min_df=1, max_features=80000, sublinear_tf=True)),
    ('clf', LogisticRegression(max_iter=2500, C=1.2)),
])

word_model_full.fit(X, y)
char_model_full.fit(X, y)

# Label frequencies over the full dataset; used as the fallback ranking when
# a query normalizes to an empty string.
code_frequency = Counter(y)


In [None]:
def diagnose(symptoms: str, top_k: int = 3, alpha: float = 0.65) -> dict[str, Any]:
    """Rank ICD-10 codes for a free-text symptom description.

    Args:
        symptoms: Raw symptom text; it is passed through ``normalize_text``.
        top_k: Maximum number of diagnoses to return. A non-positive value
            yields an empty ``diagnoses`` list.
        alpha: Weight of the word-level model in the blend; the char-level
            model gets ``1 - alpha``.

    Returns:
        ``{'diagnoses': [...]}`` where each entry has the keys ``rank``
        (starting at 1), ``diagnosis``, ``icd10_code`` and ``explanation``.
        If the normalized text is empty, the most frequent training codes
        are returned instead of model predictions.
    """
    # Guard explicitly: the slice [-0:] further down would select every class,
    # whereas the fallback path already returns nothing for top_k <= 0.
    if top_k <= 0:
        return {'diagnoses': []}

    text = normalize_text(symptoms)

    if not text:
        # Nothing to classify: fall back to the most frequent codes.
        fallback = [c for c, _ in code_frequency.most_common(top_k)]
        return {
            'diagnoses': [
                {
                    'rank': i + 1,
                    'diagnosis': f'Likely ICD-10 category {code}',
                    'icd10_code': code,
                    'explanation': 'Fallback for empty symptoms input.'
                }
                for i, code in enumerate(fallback)
            ]
        }

    # Blend the per-class probabilities of both full-data models.
    wp = word_model_full.predict_proba([text])[0]
    cp = char_model_full.predict_proba([text])[0]
    bp = alpha * wp + (1.0 - alpha) * cp

    classes = word_model_full.classes_
    # Indices of the top_k highest blended scores, best first.
    idx = np.argsort(bp)[-top_k:][::-1]

    diagnoses = []
    for rank, i in enumerate(idx, start=1):
        code = str(classes[i])
        conf = float(bp[i])
        diagnoses.append({
            'rank': rank,
            'diagnosis': f'Probable condition mapped to {code}',
            'icd10_code': code,
            'explanation': f'Offline ensemble baseline (word+char TF-IDF), confidence={conf:.3f}.'
        })

    return {'diagnoses': diagnoses}


In [None]:
# Smoke test: run the final function on the first loaded record and print
# at most the first 1500 characters of the pretty-printed result.
example = records[0].get('query', '')
pred = diagnose(example, top_k=3)
pretty = json.dumps(pred, ensure_ascii=False, indent=2)
print(pretty[:1500])


## 8) Save Model Artifacts (for mock_server integration)


In [None]:
ARTIFACT_DIR = ROOT / 'artifacts'
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)
artifact_path = ARTIFACT_DIR / 'offline_ensemble_icd10.joblib'

# Bundle both full-data models with the blend weight and the label
# frequencies used for the empty-input fallback.
artifact_payload = {
    'word_model': word_model_full,
    'char_model': char_model_full,
    'alpha': 0.65,
    'code_frequency': code_frequency,
}
joblib.dump(artifact_payload, artifact_path)

print('Saved:', artifact_path)


## Next Step

Подключить `artifacts/offline_ensemble_icd10.joblib` в `src/mock_server.py` и использовать `diagnose()`-логику в эндпоинте `/diagnose`.
