In [None]:
import sqlalchemy as sa
import numpy as np
import pandas as pd
from scipy.special import softmax
from scipy.linalg import toeplitz
import hashlib
from pathlib import Path
from dotenv import load_dotenv
import os
import zlib
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, cross_validate, StratifiedKFold

# Load variables from a dotenv file. ".env" is a relative path, so it is looked
# up relative to the current working directory (adjust if needed).
env_path = ".env"
# override=False: variables already present in the process environment are kept
# and are not replaced by values from the file.
load_dotenv(dotenv_path=env_path, override=False)  # don't clobber real env vars

True

In [58]:
def simulate_mnlogit(
    n_samples: int,
    n_classes: int,
    n_features: int,
    *,
    base_rates=None,          # e.g. [0.6,0.3,0.1]; if None => uniform
    coef_scale: float = 1.0,  # std for coefficients
    class_sep: float = 1.0,   # scales logits (↑ easier, ↓ harder)
    intercept: bool = True,
    n_informative: int | None = None,  # if None => all features informative
    feature_scale: float = 1.0,
    flip_y: float = 0.0,               # label noise fraction in [0,1]
    seed: int | None = None,           # if None => hash of params (deterministic)
    return_probas: bool = False,
):
    """Deterministic multinomial-logit simulator with minimal dependencies."""
    rng = np.random.default_rng(seed)

    # ----- sizes / basic checks -----
    K, p, n = int(n_classes), int(n_features), int(n_samples)
    if K < 2: raise ValueError("n_classes must be >= 2")
    if p < 1: raise ValueError("n_features must be >= 1")
    n_inf = p if n_informative is None else int(n_informative)
    if not (1 <= n_inf <= p): raise ValueError("n_informative must be in [1, n_features]")

    # ----- class priors -----
    if base_rates is None:
        pri = np.full(K, 1.0 / K)
    else:
        pri = np.asarray(base_rates, float)
        if pri.shape != (K,) or not np.isclose(pri.sum(), 1.0):
            raise ValueError("base_rates must be length K and sum to 1")

    # ----- features: Gaussian -----
    X = rng.normal(0.0, feature_scale, size=(n, p))

    # ----- parameters -----
    W = np.zeros((p, K))
    W[:n_inf, :] = rng.normal(0.0, coef_scale, size=(n_inf, K))
    W *= class_sep
    b = np.log(np.clip(pri, 1e-12, None)) - (0 if not intercept else np.mean(np.log(np.clip(pri,1e-12,None))))
    if not intercept: b = np.zeros(K)

    # ----- probabilities & labels -----
    P = softmax(X @ W + b, axis=1)
    cumP = np.cumsum(P, axis=1)
    y = (rng.random(n)[:, None] > cumP).sum(axis=1).astype(int)

    # optional label noise (flip to a different class uniformly)
    if flip_y > 0:
        flips = rng.random(n) < float(flip_y)
        if flips.any():
            r = rng.integers(0, K-1, size=flips.sum())
            yf = y[flips]
            y[flips] = r + (r >= yf)

    df = pd.DataFrame(X, columns=[f"X{j}" for j in range(p)])
    df["Y"] = y + 1
    if return_probas:
        for k in range(K): df[f"p{k}"] = P[:, k]

    params = {"W": W, "b": b, "seed_used": seed}
    return df, params

# --- set values ---
# Module-level settings named after the keyword arguments of simulate_mnlogit.
seed = 42
n_samples = 500
n_classes = 4
n_features = 100
base_rates = [0.5, 0.2, 0.2, 0.1]  # one prior per class
coef_scale = 0.7
class_sep = 1.2
n_informative = 8
flip_y = 0.05
# --- set values ---


# return_probas=False

# df, params = simulate_mnlogit(
#     seed=seed,
#     n_samples=n_samples, n_classes=n_classes, n_features=n_features,
#     base_rates=base_rates,
#     coef_scale=coef_scale, class_sep=class_sep,
#     n_informative=n_informative, flip_y=flip_y,
#     return_probas=False
# )
# print(df.head(), "\nW:", params["W"].shape, "b:", params["b"].shape, "seed:", params["seed_used"])

# df



In [None]:
# Columns: 
    # seed=42,
    # n_samples=500, n_classes=4, n_features=100,
    # base_rates=[0.5, 0.2, 0.2, 0.1],
    # coef_scale=0.7, class_sep=1.2,
    # n_informative=8, flip_y=0.05,
    # return_probas=False

In [9]:
from sklearn.metrics import (accuracy_score, roc_curve, precision_recall_curve, classification_report, confusion_matrix, precision_score, recall_score,
                             f1_score, roc_auc_score, log_loss, balanced_accuracy_score,
                             cohen_kappa_score, matthews_corrcoef, jaccard_score, hamming_loss)

def _quote_pg_identifier(identifier):
    """Return ``identifier`` wrapped in double quotes, doubling embedded quotes."""
    return '"' + str(identifier).replace('"', '""') + '"'


def df_upsert_postgres(data_frame, table_name, engine, schema=None, match_columns=None, insert_only=False):
    """
    Perform an "upsert" on a PostgreSQL table from a DataFrame.
    Constructs an INSERT … ON CONFLICT statement, uploads the DataFrame to a
    temporary table, and then executes the INSERT.
    Parameters
    ----------
    data_frame : pandas.DataFrame
        The DataFrame to be upserted.
    table_name : str
        The name of the target table.
    engine : sqlalchemy.engine.Engine
        The SQLAlchemy Engine to use.
    schema : str, optional
        The name of the schema containing the target table.
    match_columns : str or list of str, optional
        The column name(s) on which to match. If omitted, the
        primary key columns of the target table will be used.
    insert_only : bool, optional
        On conflict do not update. (Default: False)
    Raises
    ------
    ValueError
        If no match columns are given and the target table reports no
        primary key columns.
    Notes
    -----
    When every DataFrame column is a match column there is nothing left to
    update, so the statement uses ``DO NOTHING`` regardless of
    ``insert_only``.
    """
    table_spec = _quote_pg_identifier(table_name)
    if schema:
        table_spec = _quote_pg_identifier(schema) + "." + table_spec

    df_columns = list(data_frame.columns)

    # A bare string would otherwise be iterated character by character.
    if isinstance(match_columns, str):
        match_columns = [match_columns]
    if not match_columns:
        insp = sa.inspect(engine)
        match_columns = insp.get_pk_constraint(table_name, schema=schema)[
            "constrained_columns"
        ]
    if not match_columns:
        # "ON CONFLICT ()" is not valid SQL; fail before touching the database.
        raise ValueError(
            "match_columns was not given and the target table has no primary key"
        )
    match_columns = list(match_columns)

    columns_to_update = [col for col in df_columns if col not in match_columns]
    insert_col_list = ", ".join(_quote_pg_identifier(col) for col in df_columns)
    match_col_list = ", ".join(_quote_pg_identifier(col) for col in match_columns)

    stmt = f"INSERT INTO {table_spec} ({insert_col_list})\n"
    stmt += f"SELECT {insert_col_list} FROM temp_table\n"
    stmt += f"ON CONFLICT ({match_col_list}) DO "
    if insert_only or not columns_to_update:
        # An empty "DO UPDATE SET" clause would be a syntax error.
        stmt += "NOTHING"
    else:
        stmt += "UPDATE SET\n"
        stmt += ", ".join(
            f"{quoted} = EXCLUDED.{quoted}"
            for quoted in map(_quote_pg_identifier, columns_to_update)
        )

    # engine.begin() scopes all four steps to one transaction block.
    # NOTE(review): "temp_table" is unqualified in the DROP; confirm that no
    # permanent table of that name can be reached by this connection.
    with engine.begin() as conn:
        conn.exec_driver_sql("DROP TABLE IF EXISTS temp_table")
        # "WHERE false" copies the target's column layout without any rows.
        conn.exec_driver_sql(
            f"CREATE TEMPORARY TABLE temp_table AS SELECT * FROM {table_spec} WHERE false"
        )
        data_frame.to_sql("temp_table", conn, if_exists="append", index=False)
        conn.exec_driver_sql(stmt)

def calculate_metrics(config, y_true, y_pred):
    """Score predictions against true labels and return a one-row DataFrame.

    Parameters
    ----------
    config
        Accepted for the caller's convenience; not read by this function.
    y_true, y_pred
        True and predicted labels, passed unchanged to the scikit-learn
        metric functions.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns accuracy, precision/recall/f1 (macro
        and micro), balanced_accuracy, cohen_kappa, matthews_corrcoef,
        jaccard_macro, hamming_loss and confusion_matrix. The confusion
        matrix is stored as its string representation.
    """
    # The matrix is stringified so that it fits in a single cell.
    matrix_text = str(confusion_matrix(y_true, y_pred))

    scores = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision_macro": precision_score(y_true, y_pred, average='macro'),
        "precision_micro": precision_score(y_true, y_pred, average='micro'),
        "recall_macro": recall_score(y_true, y_pred, average='macro'),
        "recall_micro": recall_score(y_true, y_pred, average='micro'),
        "f1_macro": f1_score(y_true, y_pred, average='macro'),
        "f1_micro": f1_score(y_true, y_pred, average='micro'),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "cohen_kappa": cohen_kappa_score(y_true, y_pred),
        "matthews_corrcoef": matthews_corrcoef(y_true, y_pred),
        "jaccard_macro": jaccard_score(y_true, y_pred, average='macro'),
        "hamming_loss": hamming_loss(y_true, y_pred),
        "confusion_matrix": matrix_text,
    }

    # Wrap each value in a one-element list so the frame has exactly one row.
    return pd.DataFrame({label: [value] for label, value in scores.items()})
 

In [114]:
import os

# ----- run switches -----
postgres_df = pd.DataFrame()   # stays empty unless save_db is set
save_db = True                 # write the simulation's parameter row to PostgreSQL
traversal_type = "nd_stepwise"
underscored_model_types = "a"

# ----- simulation parameters -----
seed = 42
n_samples = 1000
n_classes = 6
n_features = 50
base_rates = [0.2, 0.2, 0.15, 0.15, 0.15, 0.15]
coef_scale = 0.9
class_sep = 1.2
n_informative = 25
flip_y = 0.001

df, params = simulate_mnlogit(
    seed=seed,
    n_samples=n_samples, n_classes=n_classes, n_features=n_features,
    base_rates=base_rates,
    coef_scale=coef_scale, class_sep=class_sep,
    n_informative=n_informative, flip_y=flip_y,
    return_probas=False
)

if save_db:
    # The name encodes the parameters and is hashed (CRC32) into simulation_id.
    # NOTE(review): there is no separator between n_informative and seed, and
    # there is a doubled "_" after n_samples. The format is kept byte-identical
    # so that ids computed here match ids computed previously; confirm before
    # changing it.
    name = (
        "mlr_" + str(n_classes) + "_class_" + str(n_features) + "_"
        + str(n_informative) + str(seed) + "_" + str(n_samples) + "_" + "_"
        + str(coef_scale) + "_" + str(flip_y) + "_" + str(class_sep) + "_"
        + str(base_rates).replace('[', '').replace(']', '').replace(' ','')
    )

    # Build the row in one shot so each field keeps its own dtype; filling an
    # empty frame cell by cell with .loc creates every new column as a
    # NaN-filled column first.
    postgres_df = pd.DataFrame([{
        "simulation_id": int(zlib.crc32(name.encode())),
        "name": name,
        "seed": seed,
        "n_classes": n_classes,
        "n_features": n_features,
        "n_samples": n_samples,
        "n_informative": n_informative,
        "flip_y": flip_y,
        "base_rates": str(base_rates),
        "coef_scale": coef_scale,
        "class_sep": class_sep,
        "notes": "",
    }])

    # NOTE(review): takes the text between the second ':' and the next '@', so
    # this assumes a "scheme://user:password@..." value whose password contains
    # neither ':' nor '@' - confirm against the real ML_POSTGRESS_URL.
    password = os.environ['ML_POSTGRESS_URL'].split(':')[2].split("@")[0]
    host = os.environ['ML_POSTGRESS_HOST']
    database = "simulation"
    engine = sa.create_engine(f"postgresql://max:{password}@{host}" + f"/{database}")
    # if_exists='append' adds a row on every run of this cell.
    postgres_df.to_sql('mlr_simulations_dictionary', engine, schema="public", if_exists='append', index=False)
    # engine = None

  P = softmax(X @ W + b, axis=1)
  P = softmax(X @ W + b, axis=1)
  P = softmax(X @ W + b, axis=1)


In [None]:
# 70/30 split, stratified on the label column 'Y', with a fixed random_state.
# NOTE(review): X_full_train is not defined in the cells shown here - confirm
# it exists on a fresh kernel.
# NOTE(review): the whole frame is passed as the features, so 'Y' remains a
# column of X_train / X_test; confirm it is dropped before model fitting.
X_train, X_test, y_train, y_test = train_test_split(
    X_full_train,
    X_full_train['Y'],
    test_size=0.3,
    stratify=X_full_train['Y'],
    random_state=42,
)
