# Single Table Analysis Template

In [None]:
# Type-only declarations that silence "undefined name" lint errors in VS Code.
# Nothing inside the TYPE_CHECKING branch runs when the notebook is executed.
from typing import Dict, TYPE_CHECKING, Tuple, List

if TYPE_CHECKING:
    import kedro

    # These names are used by later cells without being defined in the notebook;
    # presumably the Kedro IPython extension injects them — TODO confirm.
    catalog: kedro.io.data_catalog.DataCatalog
    session: kedro.framework.session.session.KedroSession
    pipelines: Dict[str, kedro.pipeline.pipeline.Pipeline]

In [None]:
import os

import numpy as np
import pandas as pd
from pasteur.metadata import Metadata

# Dataset selection, overridable through environment variables.
# `or` is used instead of a getenv default so that a variable that is set
# but empty also falls back to the default value.
VIEW = os.environ.get("DATASET_VIEW") or "tab_adult"
TABLE = os.environ.get("DATASET_TABLE") or "table"
ALG = os.environ.get("SYNTH_ALG") or "ref"

# The two tables compared by the rest of the notebook: the `wrk` table and
# the table stored under the selected algorithm's name.
wrk: pd.DataFrame = catalog.load(f"{VIEW}.wrk.{TABLE}")
alg: pd.DataFrame = catalog.load(f"{VIEW}.{ALG}.{TABLE}")

# Per-table metadata, built from the view's parameters and the `wrk` table.
meta = Metadata(catalog.load(f"params:{VIEW}.metadata"), wrk).get_table(TABLE)

2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `tab_adult.wrk.table` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `tab_adult.ref.table` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `params:tab_adult.metadata` (MemoryDataSet)...


In [None]:
# Preview the first rows of the `wrk` table.
wrk.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
1,6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica
2,7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States
3,9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States
4,10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States


In [None]:
# Column dtypes, non-null counts and memory footprint of the `wrk` table.
wrk.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13024 entries, 0 to 13023
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   id              13024 non-null  int64   
 1   age             13024 non-null  int32   
 2   workclass       13024 non-null  category
 3   fnlwgt          13024 non-null  int32   
 4   education       13024 non-null  category
 5   education-num   13024 non-null  int32   
 6   marital-status  13024 non-null  category
 7   occupation      13024 non-null  category
 8   relationship    13024 non-null  category
 9   race            13024 non-null  category
 10  sex             13024 non-null  category
 11  capital-gain    13024 non-null  int32   
 12  capital-loss    13024 non-null  int32   
 13  hours-per-week  13024 non-null  int32   
 14  native-country  13024 non-null  category
dtypes: category(8), int32(6), int64(1)
memory usage: 614.5 KB


In [None]:
# Plotting setup: render figures inline and apply the light.mplstyle style sheet.
%matplotlib inline

from matplotlib import pyplot as plt
import matplotlib.ticker as ticker

# The style path is relative, so it resolves against the kernel's working directory.
plt.style.use("./utils/light.mplstyle")
# Set after style.use so this figure size takes precedence over the style sheet.
plt.rcParams['figure.figsize'] = [6, 4]
# Optional: raise the DPI for sharper figures (e.g. 200 looks very fine, but is slower).
# plt.rcParams['figure.dpi'] = 100

In [None]:
from pandas.api.types import (
    is_datetime64_any_dtype as is_datetime,
    is_timedelta64_dtype as is_timedelta,
)

def gen_freq(
    a: pd.DataFrame,
    b: pd.DataFrame,
    cols,
    is_cat=None,
    fillna: float = 1e-6,
    bins=32,
) -> Tuple[pd.Series, pd.Series]:
    """Return aligned relative frequencies of `cols` in frames `a` and `b`.

    Args:
        a: First frame; bin edges for continuous columns are derived from it.
        b: Second frame; binned with the edges computed from `a`.
        cols: A column name or a list of column names to count jointly.
        is_cat: One flag (Python or numpy bool) or a sequence of flags, paired
            with `cols`, telling whether each column is already discrete.
            ``None`` treats every column as discrete (no binning is applied).
        fillna: Value substituted for combinations that are missing or have
            zero frequency in one of the frames, before re-normalizing.
        bins: Forwarded to ``np.histogram_bin_edges`` for continuous columns.

    Returns:
        Two series (for `a` and `b`) sharing the same index of value
        combinations; each sums to 1.

    Rows with a missing value in any of `cols` are dropped. The input frames
    are not modified.
    """
    if isinstance(cols, str):
        cols = [cols]
    # np.bool_ is not a subclass of bool; without this check a numpy scalar
    # flag would reach all(is_cat) below and raise TypeError.
    if isinstance(is_cat, (bool, np.bool_)):
        is_cat = [bool(is_cat)]

    # Column selection followed by dropna yields new frames, so the
    # assignments below never touch the callers' data.
    # FIXME: stop dropping NAs
    a, b = a[cols].dropna(), b[cols].dropna()

    # Convert any continuous variables to discrete bin indices
    if is_cat is not None and not all(is_cat):
        for col, cat in zip(cols, is_cat):
            if cat:
                continue

            # The dtype check looks at `a` only; `b` is assumed to match.
            if is_datetime(a[col]):
                a_vals, b_vals = a[col].dt.day_of_year, b[col].dt.day_of_year
            else:
                a_vals, b_vals = pd.to_numeric(a[col]), pd.to_numeric(b[col])

            # Edges come from `a` alone so both frames share the same bins;
            # values of `b` outside that range land in the outermost indices.
            edges = np.histogram_bin_edges(a_vals, bins=bins).astype(np.float32)
            a[col] = np.digitize(a_vals, edges)
            b[col] = np.digitize(b_vals, edges)

    # Explicit keys keep the two column labels distinct even when both
    # value_counts() results carry the same series name.
    freq = pd.concat([a.value_counts(), b.value_counts()], axis=1, keys=[0, 1])
    freq = freq / freq.sum()
    # Combinations absent from one frame (NaN) or counted as zero get a small
    # positive mass, then each column is re-normalized to sum to 1.
    freq = freq.fillna(value=fillna)
    freq[freq == 0] = fillna
    freq = freq / freq.sum()
    return freq.iloc[:, 0], freq.iloc[:, 1]


In [None]:
from scipy.stats import chisquare

# Chi-square comparison of each column's frequencies in `wrk` against `alg`.
# NOTE(review): this cell iterates `meta.columns` while the K-S cell uses
# `meta.cols` — confirm both name the same set of columns.
chi_rows = []
for col in meta.columns:
    wrk_freq, alg_freq = gen_freq(wrk, alg, col, meta[col].is_cat())
    # gen_freq returns proportions that sum to 1, but chisquare expects
    # counts: on proportions the statistic is divided by the sample size and
    # every p-value collapses towards 1. Scale both vectors by the number of
    # non-null rows that gen_freq kept from `wrk`, which also keeps the two
    # totals equal as chisquare requires.
    n_obs = wrk[col].notna().sum()
    # Argument order is unchanged: `wrk` is f_obs and `alg` is f_exp.
    chi, p = chisquare(wrk_freq * n_obs, alg_freq * n_obs)
    chi_rows.append([meta[col].type, col, chi, p])

res = pd.DataFrame(chi_rows, columns=["type", "col", "X^2", "p"])
res.set_index(keys=["type", "col"]).sort_index().style.background_gradient(axis=0)


Unnamed: 0_level_0,Unnamed: 1_level_0,X^2,p
type,col,Unnamed: 2_level_1,Unnamed: 3_level_1
categorical,education,0.003617,1.0
categorical,marital-status,0.001804,1.0
categorical,native-country,0.011814,1.0
categorical,occupation,0.00268,1.0
categorical,race,0.000322,1.0
categorical,relationship,0.001108,1.0
categorical,sex,1e-06,0.999347
categorical,workclass,0.001247,1.0
numerical,age,0.005262,1.0
numerical,capital-gain,0.007041,1.0


In [None]:
from scipy.stats import ks_2samp

# Two-sample Kolmogorov-Smirnov test on every column that the metadata does
# not flag as categorical, comparing `wrk` against `alg`.
continuous_cols = [col for col in meta.cols if not meta[col].is_cat()]

ks_rows = []
for col in continuous_cols:
    # Missing values are dropped independently in each table before testing.
    ks_stat, p_value = ks_2samp(wrk[col].dropna(), alg[col].dropna())
    ks_rows.append([meta[col].type, col, ks_stat, p_value])

res = pd.DataFrame(ks_rows, columns=["type", "col", "K-S", "p"])
res.set_index(keys=["type", "col"]).style.background_gradient(axis=0)


Unnamed: 0_level_0,Unnamed: 1_level_0,K-S,p
type,col,Unnamed: 2_level_1,Unnamed: 3_level_1
numerical,age,0.012208,0.283941
numerical,fnlwgt,0.007217,0.884148
numerical,education-num,0.010673,0.445335
numerical,capital-gain,0.004837,0.997884
numerical,capital-loss,0.001229,1.0
numerical,hours-per-week,0.008062,0.787924
