# 03 — LAMA baselines (2 configs)

Requirements satisfied:
- **minimum 2 different LAMA configurations**
- choose the configuration with the best validation score
- avoid leakage: we use a time-aware holdout split

In [2]:
import sys
from pathlib import Path

# Locate the project root so that the `src` package can be imported.
# The current working directory is used unless it has no `src` folder while
# its parent does (i.e. the notebook was started from a subfolder).
cwd = Path.cwd().resolve()
started_in_subfolder = not (cwd / "src").exists() and (cwd.parent / "src").exists()
PROJECT_ROOT = cwd.parent if started_in_subfolder else cwd

# Put the root first on sys.path so `import src...` resolves to this checkout.
sys.path.insert(0, str(PROJECT_ROOT))

print("PROJECT_ROOT:", PROJECT_ROOT)
print("src exists:", (PROJECT_ROOT / "src").exists())

PROJECT_ROOT: /Users/sergey/code/renthop-lightautoml-vs-custom
src exists: True


In [7]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss

from src.config import Paths, TARGET_COL, ID_COL, SEED
from src.models.lama import fit_lama_tabular, fit_lama_tabular_nlp
from src.utils.seed import set_global_seed

# Seed everything up front so the runs below are repeatable.
set_global_seed(SEED)
paths = Paths()

# Load the prepared modelling table and the feature specification.
# (read_pickle should only be used on trusted, project-generated files.)
processed_dir = paths.data_processed
df = pd.read_pickle(processed_dir / "model_table.pkl")
spec = json.loads((processed_dir / "feature_spec.json").read_text(encoding="utf-8"))

# Rows with a target are training rows; rows without one form the test set.
has_target = df[TARGET_COL].notna()
test_df = df.loc[~has_target].copy()

# Time-aware holdout: order by creation time, then keep the most recent 20%
# of the training rows for validation.
train_df = df.loc[has_target].sort_values("created_dt")
cut = int(len(train_df) * 0.8)
tr = train_df.iloc[:cut]
va = train_df.iloc[cut:]
print("train:", tr.shape, "valid:", va.shape)

train: (39481, 34) valid: (9871, 34)


In [9]:
# Class names in the column order used for the submission file.
labels = ["high", "medium", "low"]
y_va = va[TARGET_COL].values

# Columns that are never passed to the model as features.
drop_cols = [ID_COL, "created_dt"]

# --- Config A: TabularAutoML (fast-ish) ---
# Tabular-only run: raw text columns are dropped on top of `drop_cols`.
config_a_kwargs = dict(
    train_df=tr,
    valid_df=va,
    target_col=TARGET_COL,
    drop_cols=drop_cols + spec["text_cols"],
    timeout=600,
    cpu_limit=4,
    params={"use_algos": [["lgb"]]},  # single LightGBM level as a simple baseline
    verbose=2,
)
res_a = fit_lama_tabular(**config_a_kwargs)

# NOTE(review): sklearn's log_loss assumes the columns of the prediction
# matrix follow the *sorted* label order; `labels=` does not reorder them.
# Confirm that `valid_pred` columns are returned in that order.
score_a = log_loss(y_va, res_a.valid_pred, labels=labels)
print("LAMA Config A logloss:", score_a)

[14:08:58] Stdout logging level is INFO2.
[14:08:58] Task: multiclass

[14:08:58] Start automl preset with listed constraints:
[14:08:58] - time: 600.00 seconds
[14:08:58] - CPU: 4 cores
[14:08:58] - memory: 16 GB

[14:08:59] [1mTrain data shape: (39481, 34)[0m

[14:09:02] Layer [1m1[0m train process start. Time left 596.45 secs
[14:09:13] [1mSelector_LightGBM[0m fitting and predicting completed
[14:09:14] Start fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ...
[14:09:14] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m =====
[14:09:27] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m =====
[14:09:43] ===== Start working with [1mfold 2[0m for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m =====
[14:09:58] ===== Start working with [1mfold 3[0m for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m =====
[14:10:16] ===== Start working with [1mfold 4[0m for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m =====
[14:10:28] Fitting [1mLvl_0_Pipe_0_Mod_0_Lig

In [None]:
# --- Config B: TabularAutoML (stronger: more time + more algos) ---
# Same data and dropped columns as config A, but with a 1800 s budget and a
# linear model added next to LightGBM.
config_b_kwargs = dict(
    train_df=tr,
    valid_df=va,
    target_col=TARGET_COL,
    drop_cols=drop_cols + spec["text_cols"],
    timeout=1800,
    cpu_limit=4,
    params={"use_algos": [["lgb", "linear_l2"]]},
    verbose=2,
)
res_b = fit_lama_tabular(**config_b_kwargs)

# NOTE(review): sklearn's log_loss assumes prediction columns follow the
# sorted label order — confirm `valid_pred` is laid out that way.
score_b = log_loss(y_va, res_b.valid_pred, labels=labels)
print("LAMA Config B logloss:", score_b)

[14:11:16] Stdout logging level is INFO2.
[14:11:16] Task: multiclass

[14:11:16] Start automl preset with listed constraints:
[14:11:16] - time: 1800.00 seconds
[14:11:16] - CPU: 4 cores
[14:11:16] - memory: 16 GB

[14:11:16] [1mTrain data shape: (39481, 34)[0m

[14:11:17] Layer [1m1[0m train process start. Time left 1799.07 secs
[14:11:19] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[14:11:19] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====


## LAMA with text (TabularNLPAutoML)

This may require external model downloads depending on `text_params/autonlp_params`.
If you run in a restricted environment, you can skip this section.

(If it runs, it's a great second baseline to mention in the report.)

In [None]:
# Switch for the optional text baseline. Leave it off unless the environment
# (e.g. Kaggle) has the required NLP dependencies and downloads available.
RUN_NLP = False

if RUN_NLP:
    # Text columns stay in the data here and are declared via `text_cols`.
    nlp_kwargs = dict(
        train_df=tr,
        valid_df=va,
        target_col=TARGET_COL,
        drop_cols=drop_cols,
        text_cols=spec["text_cols"],
        timeout=1800,
        cpu_limit=4,
        params={"use_algos": [["linear_l2", "lgb"]]},
        text_params={"lang": "en"},
        tfidf_params={"ngram_range": (1, 2), "max_features": 80000},
        autonlp_params=None,  # keep TF-IDF only
        verbose=2,
    )
    res_nlp = fit_lama_tabular_nlp(**nlp_kwargs)
    score_nlp = log_loss(y_va, res_nlp.valid_pred, labels=labels)
    print("LAMA NLP logloss:", score_nlp)

In [None]:
# Compare the two tabular configurations on the holdout, best (lowest) first.
comparison_rows = [
    ("TabularAutoML A (lgb, 600s)", score_a),
    ("TabularAutoML B (lgb+linear, 1800s)", score_b),
]
results = pd.DataFrame(comparison_rows, columns=["config", "logloss"])
results = results.sort_values("logloss")
display(results)

# Horizontal bar chart of the same numbers.
logloss_by_config = results.set_index("config")["logloss"]
ax = logloss_by_config.plot(kind="barh", figsize=(8, 3))
ax.set(title="LAMA baseline comparison (lower is better)", xlabel="logloss")
plt.show()

## Save best LAMA model + create submission

In [None]:
# Imports needed only for the final refit below.
from lightautoml.tasks import Task
from lightautoml.automl.presets.tabular_presets import TabularAutoML

# Settings of the two validated configurations. They repeat the values used
# in the Config A / Config B cells so the refit on the full training set uses
# the configuration that actually won on validation (previously config B's
# settings were always used, whatever `best_name` said).
# NOTE(review): assumes fit_lama_tabular forwards `params` to TabularAutoML as
# `general_params` — confirm in the src.models.lama module.
LAMA_CONFIGS = {
    "lama_a": {"timeout": 600, "use_algos": [["lgb"]]},
    "lama_b": {"timeout": 1800, "use_algos": [["lgb", "linear_l2"]]},
}

# Lower logloss wins; a tie goes to config A.
best_name = "lama_a" if score_a <= score_b else "lama_b"
best_model = res_a.model if best_name == "lama_a" else res_b.model
best_cfg = LAMA_CONFIGS[best_name]
print("Best config:", best_name, best_cfg)

# Fit on full train (train + holdout rows) and predict test
roles = {"target": TARGET_COL, "drop": drop_cols + spec["text_cols"]}
task = Task("multiclass")

automl_full = TabularAutoML(
    task=task,
    timeout=best_cfg["timeout"],
    cpu_limit=4,
    general_params={"use_algos": best_cfg["use_algos"]},
)
_ = automl_full.fit_predict(train_df, roles=roles, verbose=2)
test_pred = automl_full.predict(test_df).data

# Fail early with a readable message if the prediction width is unexpected.
if test_pred.shape[1] != len(labels):
    raise ValueError(
        f"Expected {len(labels)} probability columns, got {test_pred.shape[1]}"
    )

sub = pd.DataFrame({ID_COL: test_df[ID_COL].values})

# LightAutoML may re-index class labels internally; when the reader exposes a
# label -> column-index mapping that covers our label names, use it so each
# probability column lands under the right class.
class_mapping = getattr(getattr(automl_full, "reader", None), "class_mapping", None)
if class_mapping is not None and set(labels) <= set(class_mapping):
    for label in labels:
        sub[label] = test_pred[:, class_mapping[label]]
else:
    # NOTE(review): positional fallback (original behaviour) — assumes the
    # prediction columns are already ordered as `labels`; verify.
    sub[labels] = test_pred

out_path = paths.submissions / f"submission_{best_name}.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(out_path, index=False)
print("Saved submission:", out_path)
sub.head()