In [None]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import src.utils.io as io_utils

In [None]:
load_dotenv()
warnings.filterwarnings("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2


ROOT = Path(os.getenv("ROOT"))
CONFIG_DIR = ROOT / Path("src/config")
DATA_CFG = io_utils.load_yaml(CONFIG_DIR / Path("data.yaml"))
MODELS_CFG = io_utils.load_yaml(CONFIG_DIR / Path("models.yml"))
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [None]:
# Load the four parquet inputs named in the data config, in the same order as
# before: train features, test features, train target, test ids.
# NOTE(review): judging by the config keys, train/test hold base-model
# predictions used as meta-features — confirm against data.yaml.
train, test, target, ids = (
    io_utils.load_df_parquet(ROOT / DATA_CFG[cfg_key])
    for cfg_key in ("preds_on_syn", "preds_on_test", "target_on_syn", "test_ids")
)

In [None]:
# Stratified 5-fold splitter, shuffled with the notebook-wide seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Extra estimator keyword arguments come from the YAML file referenced in the
# models config. They must not contain `Cs` or `cv`, which are set explicitly below.
PARAM_PATH = ROOT / MODELS_CFG["log_reg_meta"]
param = io_utils.load_yaml(PARAM_PATH)

# Logistic-regression meta-model; C is selected by cross-validation over
# 13 log-spaced values from 1e-3 to 1e3.
meta_model = LogisticRegressionCV(
    Cs=np.logspace(start=-3, stop=3, num=13),
    cv=cv,
    **param,
)

# NOTE(review): `target` is passed as loaded (a DataFrame); presumably it has a
# single label column — verify. The fitted estimator is this cell's displayed output.
meta_model.fit(train, target)

In [None]:
# Second column of predict_proba, i.e. the probability of `meta_model.classes_[1]`.
test_proba = meta_model.predict_proba(test)
pred_test_final = test_proba[:, 1]

# Submission frame: the "id" column taken from `ids`, plus the predicted
# probability as "y" (assigned positionally from the NumPy array).
df_sub = pd.DataFrame(ids, columns=["id"]).assign(y=pred_test_final)

print(f"Size of submission: {df_sub.shape}")

df_sub.head()

In [None]:
# Write the submission to the path configured under final.sub_path.
# index=False keeps the file to the "id" and "y" columns only; the default
# (index=True) would prepend an unnamed column holding the DataFrame index.
df_sub.to_csv(ROOT / DATA_CFG["final"]["sub_path"], index=False)