In [4]:
import warnings
from collections import defaultdict

import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from utils import get_scores
from xgboost import XGBClassifier

from origami.preprocessing import load_df_from_mongodb
from origami.utils.guild import load_secrets, print_guild_scalars


In [5]:
# experiment flags
# NOTE(review): presumably these top-level assignments are overridden per run by
# the experiment tracker (see the guild helpers imported above) — confirm before
# renaming or restructuring them.

# Model family to train. get_classifier accepts "LogisticRegression", "XGBoost",
# "RandomForest" and "LightGBM"; any other value raises ValueError.
model_name = "LogisticRegression"  # "XGBoost" # "RandomForest"
# Passed as `limit` to the loader for each of the train / test / validate collections.
limit = 1000
# Number of training runs; the training loop uses seeds 0 .. n_random_seeds - 1.
n_random_seeds = 5

print(f"Running {model_name=}, {limit=}, {n_random_seeds=}")

Running model_name='LogisticRegression', limit=1000, n_random_seeds=5


In [6]:
# default model hyperparameters
#
# Each group below is read by the matching branch of get_classifier and passed
# to the estimator constructor. The string "none" is a sentinel that
# get_classifier translates to Python's None.

# logistic regression
lr_C = 1.0  # only used when lr_penalty != "none"; otherwise C is fixed to 1.0
lr_penalty = "none"  # "none" -> penalty=None
lr_max_iter = 50
lr_fit_intercept = True  # the intercept is fitted when this compares equal to 1

# xgboost
xgb_learning_rate = 0.1
xgb_max_depth = 5
xgb_subsample = 1.0
xgb_colsample_bytree = 1.0
xgb_colsample_bylevel = 1.0
xgb_min_child_weight = 1.0
xgb_reg_alpha = 0.0
xgb_reg_lambda = 1.0
xgb_gamma = 0
xgb_n_estimators = 100

# random forest
rf_n_estimators = 100
rf_max_features = "none"  # "none" -> max_features=None
rf_max_depth = "none"  # "none" -> max_depth=None
rf_min_samples_split = 5

# lightgbm
lgb_num_leaves = 10
lgb_max_depth = 5
lgb_learning_rate = 0.1
lgb_n_estimators = 100
lgb_min_child_weight = 1.0
lgb_subsample = 0.8
lgb_colsample_bytree = 0.8
lgb_reg_alpha = 0.0
lgb_reg_lambda = 1.0

In [7]:
# Connection settings for the data store; load_docs reads the "MONGO_URI" and
# "DATABASE" entries from this mapping.
secrets = load_secrets()


loading local secrets.



# Data


In [8]:
# Projection handed to the loader in load_docs.
# NOTE(review): presumably MongoDB exclusion syntax (0 = omit the field), i.e.
# drop `_id` and the probability-weighted DIFFERENTIAL_DIAGNOSIS — confirm.
PROJECTION = {"_id": 0, "DIFFERENTIAL_DIAGNOSIS": 0}
# Name of the target column. Not referenced in the visible cells, which spell
# the same string out literally.
TARGET_FIELD = "DIFFERENTIAL_DIAGNOSIS_NOPROB"


def load_docs(collection_name):
    """Load one collection into a DataFrame via ``load_df_from_mongodb``.

    The connection URI and database name come from the module-level ``secrets``
    mapping; the projection and document limit come from the module-level
    ``PROJECTION`` and ``limit`` settings.

    Args:
        collection_name: Name of the collection to read.

    Returns:
        Whatever ``load_df_from_mongodb`` returns for the given collection.
    """
    query_options = {
        "uri": secrets["MONGO_URI"],
        "db": secrets["DATABASE"],
        "coll": collection_name,
        "projection": PROJECTION,
        # Fixed sort key so that, combined with `limit`, the same documents are
        # requested on every run (presumably ascending `_id` — confirm).
        "sort": [("_id", 1)],
        "limit": limit,
    }
    return load_df_from_mongodb(**query_options)


def preprocess_dataset(df):
    """Promote selected fields of the nested ``docs`` column to top-level columns.

    For every row, the ``EVIDENCES``, ``DIFFERENTIAL_DIAGNOSIS_NOPROB`` and
    ``PATHOLOGY`` entries of the ``docs`` value are copied into columns of the
    same name. The frame is modified in place and also returned, so the
    function can be used with ``DataFrame.pipe``.

    Args:
        df: DataFrame with a ``docs`` column whose values support key lookup
            for the three field names above.

    Returns:
        The same ``df`` object, with the three columns added.
    """
    promoted_fields = ("EVIDENCES", "DIFFERENTIAL_DIAGNOSIS_NOPROB", "PATHOLOGY")
    for field in promoted_fields:
        # Bind the field name as a default argument so each lambda looks up its own key.
        df[field] = df["docs"].apply(lambda doc, key=field: doc[key])
    return df

In [9]:
# load data
# Each split is read from its own collection and passed through
# preprocess_dataset; the collections are visited in train, test, validate order.
train_docs_df, test_docs_df, val_docs_df = (
    load_docs(collection_name=split_collection).pipe(preprocess_dataset)
    for split_collection in ("train-noprob", "test-noprob", "validate-noprob")
)

# ML


In [11]:
def get_classifier(model_name, seed):
    match model_name:
        case "LogisticRegression":
            clf = LogisticRegression(
                random_state=seed,
                C=lr_C if lr_penalty != "none" else 1.0,
                penalty=lr_penalty if lr_penalty != "none" else None,
                max_iter=lr_max_iter,
                fit_intercept=True if lr_fit_intercept == 1 else False,
                solver="saga",
            )
        case "XGBoost":
            clf = XGBClassifier(
                random_state=seed,
                max_depth=xgb_max_depth,
                learning_rate=xgb_learning_rate,
                n_estimators=xgb_n_estimators,
                subsample=xgb_subsample,
                colsample_bytree=xgb_colsample_bytree,
                colsample_bylevel=xgb_colsample_bylevel,
                min_child_weight=xgb_min_child_weight,
                reg_alpha=xgb_reg_alpha,
                reg_lambda=xgb_reg_lambda,
                gamma=xgb_gamma,
            )
        case "RandomForest":
            clf = RandomForestClassifier(
                random_state=seed,
                n_estimators=rf_n_estimators,
                max_features=rf_max_features if rf_max_features != "none" else None,
                max_depth=rf_max_depth if rf_max_depth != "none" else None,
                min_samples_split=rf_min_samples_split,
            )
        case "LightGBM":
            clf = LGBMClassifier(
                random_state=seed,
                verbose=-1,
                num_leaves=lgb_num_leaves,
                max_depth=lgb_max_depth,
                learning_rate=lgb_learning_rate,
                n_estimators=lgb_n_estimators,
                min_child_weight=lgb_min_child_weight,
                subsample=lgb_subsample,
                colsample_bytree=lgb_colsample_bytree,
                reg_alpha=lgb_reg_alpha,
                reg_lambda=lgb_reg_lambda,
            )

        case _:
            raise ValueError(f"Unknown model {model_name}")
    return clf

In [12]:
# encode data
mlb_ddx = MultiLabelBinarizer()
mlb_evd = MultiLabelBinarizer()


def _encode_pathology(binarizer, pathologies):
    """Return, for each row, the class index of its pathology under ``binarizer``.

    Each pathology value is wrapped in a one-element list so that the fitted
    multi-label binarizer can encode it; the position of the single positive
    entry per row is then returned.

    Args:
        binarizer: A fitted binarizer exposing ``transform``.
        pathologies: pandas Series holding one pathology label per row.

    Returns:
        1-D integer array of column indices, one per input row.

    Raises:
        ValueError: If any row does not map to exactly one known class. Such
            rows would otherwise be dropped silently by ``np.where`` and leave
            the result misaligned with the feature and target matrices.
    """
    one_hot = np.asarray(binarizer.transform(pathologies.apply(lambda pathology: [pathology])))
    hits_per_row = (one_hot > 0.5).sum(axis=1)
    n_bad_rows = int((hits_per_row != 1).sum())
    if n_bad_rows:
        raise ValueError(
            f"{n_bad_rows} pathology value(s) do not map to exactly one class known to the binarizer"
        )
    return np.where(one_hot > 0.5)[1]


# train
X_train = mlb_evd.fit_transform(train_docs_df["EVIDENCES"])
y_train = mlb_ddx.fit_transform(train_docs_df["DIFFERENTIAL_DIAGNOSIS_NOPROB"])

# val
X_val = mlb_evd.transform(val_docs_df["EVIDENCES"])
y_val = mlb_ddx.transform(val_docs_df["DIFFERENTIAL_DIAGNOSIS_NOPROB"])
y_pathology_val = _encode_pathology(mlb_ddx, val_docs_df["PATHOLOGY"])

# test
X_test = mlb_evd.transform(test_docs_df["EVIDENCES"])
y_test = mlb_ddx.transform(test_docs_df["DIFFERENTIAL_DIAGNOSIS_NOPROB"])
y_pathology_test = _encode_pathology(mlb_ddx, test_docs_df["PATHOLOGY"])



In [134]:
# X_test[:3,:], y_test[:3,:], y_pathology_test[:3]

In [13]:
# Accumulates one entry per training run under each score name; filled by the
# training loop and summarised in the aggregation cell.
results = defaultdict(list)

In [14]:
# Start from an empty accumulator so that re-running this cell does not append
# a second copy of every score and skew the aggregated statistics.
results = defaultdict(list)


def _positive_class_proba(per_label_proba):
    """Collect the second probability column of each per-label array.

    Args:
        per_label_proba: Sequence of 2-D arrays, one per output label, as
            returned by ``MultiOutputClassifier.predict_proba``.

    Returns:
        2-D array with one column per label, holding column 1 of each input.
    """
    return np.column_stack([label_proba[:, 1] for label_proba in per_label_proba])


for clf_seed in range(n_random_seeds):
    clf = get_classifier(model_name=model_name, seed=clf_seed)
    multi_output_clf = MultiOutputClassifier(clf, n_jobs=4)
    print(f"Training {clf}")

    # train (convergence warnings are expected with a small max_iter and are muted)
    with warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=ConvergenceWarning)
        multi_output_clf.fit(X_train, y_train)

    # evaluate dev
    y_pred_val = _positive_class_proba(multi_output_clf.predict_proba(X_val))
    scores_val = get_scores(y_target=y_val, y_pred=y_pred_val, y_pathology=y_pathology_val, postfix="_val")
    for score_name, score in scores_val.items():
        results[score_name].append(score)

    # evaluate test
    y_pred_test = _positive_class_proba(multi_output_clf.predict_proba(X_test))
    scores_test = get_scores(y_target=y_test, y_pred=y_pred_test, y_pathology=y_pathology_test, postfix="_test")
    for score_name, score in scores_test.items():
        results[score_name].append(score)

    # Report this run's scores, keyed by the seed as the step.
    guild_output = {"step": clf_seed} | scores_val | scores_test
    print_guild_scalars(**guild_output)

Training LogisticRegression(max_iter=50, penalty=None, random_state=0, solver='saga')




|  step: 0  |  recall_val: 0.8314068786891555  |  precision_val: 0.8327029902140332  |  f1_val: 0.832054429705749  |  gtpa_val: 0.978  |  gtpa_at_1_val: 0.638  |  recall_test: 0.8282854920990792  |  precision_test: 0.8312029383603198  |  f1_test: 0.8297416507367719  |  gtpa_test: 0.982  |  gtpa_at_1_test: 0.645  |
Training LogisticRegression(max_iter=50, penalty=None, random_state=1, solver='saga')




|  step: 1  |  recall_val: 0.8318345179341555  |  precision_val: 0.8339688463232376  |  f1_val: 0.8329003148131509  |  gtpa_val: 0.979  |  gtpa_at_1_val: 0.645  |  recall_test: 0.8292770518667296  |  precision_test: 0.8300828439218094  |  f1_test: 0.8296797522462857  |  gtpa_test: 0.98  |  gtpa_at_1_test: 0.645  |
Training LogisticRegression(max_iter=50, penalty=None, random_state=2, solver='saga')




|  step: 2  |  recall_val: 0.8316663097649025  |  precision_val: 0.8342577736418247  |  f1_val: 0.8329600260998039  |  gtpa_val: 0.976  |  gtpa_at_1_val: 0.64  |  recall_test: 0.8296960450080911  |  precision_test: 0.8312407077770055  |  f1_test: 0.830467658128272  |  gtpa_test: 0.981  |  gtpa_at_1_test: 0.646  |
Training LogisticRegression(max_iter=50, penalty=None, random_state=3, solver='saga')




|  step: 3  |  recall_val: 0.8312018429906443  |  precision_val: 0.833947785482118  |  f1_val: 0.8325725501154765  |  gtpa_val: 0.978  |  gtpa_at_1_val: 0.641  |  recall_test: 0.8289024348875614  |  precision_test: 0.8315551347531643  |  f1_test: 0.8302266658813712  |  gtpa_test: 0.979  |  gtpa_at_1_test: 0.641  |
Training LogisticRegression(max_iter=50, penalty=None, random_state=4, solver='saga')




|  step: 4  |  recall_val: 0.83062080252698  |  precision_val: 0.8328349155410779  |  f1_val: 0.8317263855062441  |  gtpa_val: 0.976  |  gtpa_at_1_val: 0.644  |  recall_test: 0.830492785049955  |  precision_test: 0.8315300786054587  |  f1_test: 0.8310111081324244  |  gtpa_test: 0.979  |  gtpa_at_1_test: 0.646  |


In [15]:
print("Aggregated metrics:")
keys = list(results.keys())

# Summary statistics reported for every metric, in output order.
# np.std is called with its defaults, i.e. the population standard deviation.
summary_stats = (("mean", np.mean), ("std", np.std), ("min", np.min), ("max", np.max))

scalars = {}
for key in keys:
    run_values = results[key]
    for stat_name, stat_fn in summary_stats:
        scalars[f"{key}_{stat_name}"] = stat_fn(run_values)

# print rounded scalars
rounded_scalars = {name: f"{value:.4f}" for name, value in scalars.items()}
print_guild_scalars(**rounded_scalars)

Aggregated metrics:
|  recall_val_mean: 0.8313  |  recall_val_std: 0.0004  |  recall_val_min: 0.8306  |  recall_val_max: 0.8318  |  precision_val_mean: 0.8335  |  precision_val_std: 0.0006  |  precision_val_min: 0.8327  |  precision_val_max: 0.8343  |  f1_val_mean: 0.8324  |  f1_val_std: 0.0005  |  f1_val_min: 0.8317  |  f1_val_max: 0.8330  |  gtpa_val_mean: 0.9774  |  gtpa_val_std: 0.0012  |  gtpa_val_min: 0.9760  |  gtpa_val_max: 0.9790  |  gtpa_at_1_val_mean: 0.6416  |  gtpa_at_1_val_std: 0.0026  |  gtpa_at_1_val_min: 0.6380  |  gtpa_at_1_val_max: 0.6450  |  recall_test_mean: 0.8293  |  recall_test_std: 0.0007  |  recall_test_min: 0.8283  |  recall_test_max: 0.8305  |  precision_test_mean: 0.8311  |  precision_test_std: 0.0005  |  precision_test_min: 0.8301  |  precision_test_max: 0.8316  |  f1_test_mean: 0.8302  |  f1_test_std: 0.0005  |  f1_test_min: 0.8297  |  f1_test_max: 0.8310  |  gtpa_test_mean: 0.9802  |  gtpa_test_std: 0.0012  |  gtpa_test_min: 0.9790  |  gtpa_test_max: 0.9