# Tabular Model: flat data, without context

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mostly-ai/mostlyai-engine/blob/main/examples/flat.ipynb)

In [None]:
from pathlib import Path
import pandas as pd
from mostlyai import engine

# enable engine logging and pick the workspace directory for all artifacts
engine.init_logging()
ws = Path("ws-tabular-flat")

# fetch the original census data that serves as training input
url = "https://github.com/mostly-ai/public-demo-data/raw/refs/heads/dev/census"
trn_df = pd.read_csv(f"{url}/census.csv.gz")

# run the engine pipeline; every step reads from / writes to the workspace

# 1. split data as PQT files for `trn` + `val` to `{ws}/OriginalData/tgt-data`
engine.split(workspace_dir=ws, tgt_data=trn_df, model_type="TABULAR")

# 2. generate column-level statistics to `{ws}/ModelData/tgt-stats/stats.json`
engine.analyze(workspace_dir=ws)

# 3. encode training data to `{ws}/OriginalData/encoded-data`
engine.encode(workspace_dir=ws)

# 4. train model and store to `{ws}/ModelData/model-data`
engine.train(workspace_dir=ws)

# 5. use model to generate synthetic samples to `{ws}/SyntheticData`
engine.generate(workspace_dir=ws)

In [None]:
# read the generated samples back from the workspace
synthetic_dir = ws / "SyntheticData"
syn_df = pd.read_parquet(synthetic_dir)

# preview the first five synthetic records (last expression -> rendered by the notebook)
syn_df.head(n=5)

### QUALITY ASSURANCE

#### univariate `age`

In [None]:
def _age_quantile_summary(df):
    """Return the 10/25/50/75/90% quantiles of `df["age"]` as one formatted string."""
    return ", ".join(f'q{q*100:.0f}: {df["age"].quantile(q):.0f}' for q in [.1, .25, .5, .75, .9])


# compare selected age quantiles of the original vs. the synthetic data side by side
print("Original Age:  " + _age_quantile_summary(trn_df))
print("Synthetic Age: " + _age_quantile_summary(syn_df))
# for a finer-grained view, inspect the deciles instead, e.g. via
# `.quantile(np.linspace(0, 1, 11))` on the `age` column (requires `import numpy as np`)

#### bivariate `sex` ~ `income`: income gap

In [None]:
def _high_income_share(df, sex):
    """Return the share of rows with `income` equal to ">50K" among rows of the given `sex`."""
    incomes = df.loc[df["sex"] == sex, "income"]
    return (incomes == ">50K").mean()


# income gap = share of high earners among males minus the share among females
trn_gap = _high_income_share(trn_df, "Male") - _high_income_share(trn_df, "Female")
syn_gap = _high_income_share(syn_df, "Male") - _high_income_share(syn_df, "Female")
print(f"Income Gap {trn_gap:.1%} vs. {syn_gap:.1%}")

#### check consistency between `education` and `education_num`

In [None]:
# cross-tabulate the two columns of the synthetic data: rows are `education`
# values, columns are `education_num` values, cells are record counts
pd.crosstab(index=syn_df["education"], columns=syn_df["education_num"])