# Single Table Models Template

In [None]:
# Removes lint errors from VS Code
from typing import Dict, TYPE_CHECKING, Tuple, List

# Static-analysis only: this branch never runs. It declares the types of the
# globals the notebook uses without defining them (presumably injected by the
# Kedro notebook integration -- confirm), so editors stop flagging them.
if TYPE_CHECKING:
    from kedro.framework.session.session import KedroSession
    from kedro.io.data_catalog import DataCatalog
    from kedro.pipeline.pipeline import Pipeline

    catalog: DataCatalog
    session: KedroSession
    pipelines: Dict[str, Pipeline]


In [None]:
import os

import numpy as np
import pandas as pd
from pasteur.metadata import Metadata

# Dataset selection. Each value can be overridden through an environment
# variable; an unset or empty variable falls back to the default on the right.
VIEW = os.environ.get("DATASET_VIEW") or "tab_adult"
TABLE = os.environ.get("DATASET_TABLE") or "table"
ALG = os.environ.get("SYNTH_ALG") or "ref"

# Three versions of the same table, read from the catalog.
# NOTE(review): presumably `wrk` is the working (real) data, `alg` the output
# of the algorithm named by ALG and `dev` a held-out set -- confirm.
wrk: pd.DataFrame = catalog.load(f"{VIEW}.wrk.{TABLE}")
alg: pd.DataFrame = catalog.load(f"{VIEW}.{ALG}.{TABLE}")
dev: pd.DataFrame = catalog.load(f"{VIEW}.dev.{TABLE}")

# Metadata of the selected table, built from the view's metadata parameters
# and the `wrk` frame.
meta = Metadata(catalog.load(f"params:{VIEW}.metadata"), wrk).get_table(TABLE)

# Seed shared by every train/test split further down.
random_state = catalog.load("params:random_state")


2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `tab_adult.wrk.table` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `tab_adult.ref.table` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `tab_adult.dev.table` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `params:tab_adult.metadata` (MemoryDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `params:random_state` (MemoryDataSet)...


In [None]:
# Preview the first rows of the `wrk` table loaded above.
wrk.head()


Unnamed: 0,id,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
1,6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica
2,7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States
3,9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States
4,10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States


In [None]:
# Show which columns the table metadata lists as prediction targets.
meta.targets


['education']

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC, SVR

# Candidate estimators as (short name, classifier, regressor) triples. The
# classifier is instantiated for categorical targets and the regressor for all
# other targets, so the third entry must be a regression estimator: SVR is the
# support-vector regressor, whereas SVC is a classifier.
models = [
    ("lr", LogisticRegression, LinearRegression),
    ("svm", SVC, SVR),
    ("gb", GradientBoostingClassifier, GradientBoostingRegressor),
]


def fit_data(target_col: str, train: pd.DataFrame, *tests: pd.DataFrame):
    """Encode the feature columns of `train` and of every frame in `tests`.

    One transformer is built per column listed in the module-level `meta`:
    the target column and id columns are skipped, categorical columns are
    one-hot encoded and every remaining column is standardised. The
    transformers are fitted on `train` only and then applied as-is to each
    test frame. Columns without a transformer are dropped.

    Args:
        target_col: Name of the column being predicted; excluded from features.
        train: Frame used to fit the transformers.
        *tests: Further frames to transform with the fitted transformers.

    Returns:
        A tuple holding the transformed `train` followed by the transformed
        test frames, in the order they were passed.
    """
    # NOTE(review): only `target_col` itself is excluded. Columns that may
    # encode the same information (e.g. `education-num` when predicting
    # `education`) stay in the feature set -- confirm this is intended.
    transformers = [
        (
            name,
            OneHotEncoder(handle_unknown="infrequent_if_exist")
            if col.is_cat()
            else StandardScaler(),
            [name],
        )
        for name, col in meta.cols.items()
        if name != target_col and not col.is_id()
    ]

    encoder = ColumnTransformer(
        transformers, remainder="drop", verbose_feature_names_out=False
    )

    # Fit on the training frame only, then reuse the fitted encoder everywhere.
    train_t = encoder.fit_transform(train)
    return (train_t, *(encoder.transform(test) for test in tests))


# Fraction of each dataset passed to `train_test_split` as `test_size`.
ratio = 0.2
# Every column a model is trained to predict: the metadata's targets followed
# by its sensitive columns.
targets = meta.targets + meta.sensitive


def test_targets():
    """Fit every model on each dataset and score it for every target column.

    For each ``(model, dataset, target)`` combination the dataset is split
    into train/test parts, a fresh estimator is fitted on the train part and
    scored with ``estimator.score`` on:

    * the train part (``train_results``),
    * the full `dev` table (``dev_results``),
    * the test part and the full `wrk` table (``test_results`` and
      ``wrk_results``), only when the estimator was trained on `alg`; both
      are NaN for estimators trained on `wrk`.

    Reads the module-level `models`, `targets`, `wrk`, `alg`, `dev`, `meta`,
    `ratio` and `random_state`.

    Returns:
        A DataFrame with one row per combination and the columns ``model``,
        ``data``, ``target``, ``train_results``, ``test_results``,
        ``wrk_results`` and ``dev_results``.
    """
    rows = []

    for model, clf_l, clf_r in models:
        # Named `source` rather than `type` so the builtin is not shadowed.
        for source, data in [("wrk", wrk), ("alg", alg)]:
            train, test = train_test_split(
                data, test_size=ratio, random_state=random_state
            )

            for target in targets:
                x_train, x_test, x_wrk, x_dev = fit_data(target, train, test, wrk, dev)
                y_train, y_test, y_wrk, y_dev = (
                    train[target],
                    test[target],
                    wrk[target],
                    dev[target],
                )

                # Classifier for categorical targets, regressor otherwise.
                clf = clf_l() if meta[target].is_cat() else clf_r()
                clf.fit(x_train, y_train)

                res_train = clf.score(x_train, y_train)
                res_dev = clf.score(x_dev, y_dev)

                if source == "alg":
                    res_test = clf.score(x_test, y_test)
                    res_wrk = clf.score(x_wrk, y_wrk)
                else:
                    # `np.nan` is the canonical spelling; the upper-case
                    # aliases are not available in recent NumPy releases.
                    res_test = np.nan
                    res_wrk = np.nan

                rows.append(
                    (model, source, target, res_train, res_test, res_wrk, res_dev)
                )

    return pd.DataFrame(
        rows,
        columns=[
            "model",
            "data",
            "target",
            "train_results",
            "test_results",
            "wrk_results",
            "dev_results",
        ],
    )


# Run the full evaluation once and keep the scores for the tables below.
target_res = test_targets()
target_res.head()


Unnamed: 0,model,data,target,train_results,test_results,wrk_results,dev_results
0,lr,wrk,education,0.873884,,,0.855344
1,lr,wrk,race,0.878203,,,0.876843
2,lr,wrk,relationship,0.790767,,,0.782862
3,lr,alg,education,0.867262,0.849136,0.844287,0.84398
4,lr,alg,race,0.877243,0.874856,0.877764,0.877457


In [None]:
from IPython.display import display


def _hide_null(value):
    """Return CSS that blanks out a missing cell, or no styling otherwise."""
    return (
        "color: transparent; background-color: transparent"
        if pd.isnull(value)
        else ""
    )


# One styled pivot table per predicted column: models as rows, one column per
# (score, training dataset) pair.
for target in targets:
    caption = "Target: " if target in meta.targets else "Sensitive: "
    caption += target.capitalize()

    # TODO: impose an explicit column order. The pivoted columns are
    # (score, data) pairs such as ("train_results", "wrk"), so any
    # `reindex(..., axis=1)` must list its tuples in that order.
    pivot = target_res[target_res["target"] == target].pivot_table(
        index=["model"],
        columns=["data"],
        values=["train_results", "test_results", "dev_results", "wrk_results"],
        sort=False,
    )

    styler = pivot.style.set_caption(caption).background_gradient(axis=1)

    # `Styler.applymap` is deprecated in favour of `Styler.map` (pandas 2.1+);
    # use the new name when it exists and fall back to the old one otherwise.
    per_cell = styler.map if hasattr(styler, "map") else styler.applymap
    pt = per_cell(_hide_null)

    display(pt)

# Column layout of the last table rendered above.
pt.columns

Unnamed: 0_level_0,dev_results,dev_results,test_results,train_results,train_results,wrk_results
data,alg,wrk,alg,alg,wrk,alg
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
lr,0.84398,0.855344,0.849136,0.867262,0.873884,0.844287
svm,0.960381,0.969595,0.957006,0.980996,0.985411,0.960227
gb,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0_level_0,dev_results,dev_results,test_results,train_results,train_results,wrk_results
data,alg,wrk,alg,alg,wrk,alg
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
lr,0.877457,0.876843,0.874856,0.877243,0.878203,0.877764
svm,0.869472,0.869165,0.86833,0.871005,0.874364,0.871545
gb,0.875307,0.872236,0.871401,0.891736,0.895095,0.877918


Unnamed: 0_level_0,dev_results,dev_results,test_results,train_results,train_results,wrk_results
data,alg,wrk,alg,alg,wrk,alg
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
lr,0.785319,0.782862,0.775432,0.787408,0.790767,0.785703
svm,0.790848,0.790541,0.78618,0.807851,0.809387,0.791155
gb,0.79269,0.790233,0.783877,0.822248,0.821096,0.793458


MultiIndex([(  'dev_results', 'alg'),
            (  'dev_results', 'wrk'),
            ( 'test_results', 'alg'),
            ('train_results', 'alg'),
            ('train_results', 'wrk'),
            (  'wrk_results', 'alg')],
           names=[None, 'data'])