In [None]:
# Use latest code in directory. Alternatively, run poetry install so the package is used instead.
import os

# Remember the directory the kernel started in the first time this cell runs, and
# always move to ITS parent. A bare os.chdir("..") is relative to the current
# directory, so re-running the cell would climb one more level each time and break
# every relative path used below.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [None]:
from pathlib import Path
import logging

import pandas as pd

from ndj_pipeline import db, model, transform, utils

# Configure the root logger for the notebook: INFO and above, timestamped records,
# emitted through a single StreamHandler (which writes to stderr by default).
console_handler = logging.StreamHandler()
logging.basicConfig(
    handlers=[console_handler],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

In [None]:
# Run the project's transform step first.
# NOTE(review): presumably this produces the processed data read later in the
# notebook -- confirm in ndj_pipeline.transform.
transform.run()

# Load the experiment settings from the YAML file and pass them to model training.
config = Path("data") / "example_experiment.yaml"
model_config = utils.load_model_config(config)
model.run_model_training(model_config)

In [None]:
# Read the processed parquet file into the `data` DataFrame used by the cells below.
input_path = Path("data") / "processed" / "titanic.parquet"
logging.info(f"Loading data from {input_path}")
data = pd.read_parquet(path=input_path)

In [None]:
# Set up the database via the project's db module, then run the query stored in
# schemas/queries/window_rows.sql. The query result is the cell's displayed output.
db.create_db()

sql_path = Path("schemas", "queries", "window_rows.sql")
logging.info(f"Loading sql query from {sql_path}")
# read_text() opens, reads and closes the file in one call; the previous
# open().read() left the handle open until garbage collection.
db.query_db(sql_path.read_text())

In [None]:
from scipy import stats

In [None]:
# Keep only rows with a known age and sex, then add 0/1 flags marking whether the
# `name` column contains a lowercase "t" / "u".
# na=False: a missing name yields False instead of NaN, which would otherwise make
# astype(int) raise (dropna above only guards age and sex).
# regex=False: match the letter literally rather than as a regular expression.
data = data.dropna(subset=['age', 'sex']).assign(
    name_t=lambda df: df['name'].str.contains('t', regex=False, na=False).astype(int),
    name_u=lambda df: df['name'].str.contains('u', regex=False, na=False).astype(int),
)

In [None]:
# t-tests: compare mean age between two groups

In [None]:
# Independent two-sample t-test on age: rows flagged name_u == 1 vs. rows flagged 0.
has_u = data['name_u'] == 1
lacks_u = data['name_u'] == 0
stats.ttest_ind(data.loc[has_u, 'age'], data.loc[lacks_u, 'age'])

In [None]:
# Independent two-sample t-test on age: rows with sex == 1 vs. rows with sex == 0.
# NOTE(review): assumes `sex` is already encoded as 0/1 in the processed data; if it
# still holds strings, both groups are empty -- confirm against the transform step.
ages_sex_one = data.loc[data['sex'] == 1, 'age']
ages_sex_zero = data.loc[data['sex'] == 0, 'age']
stats.ttest_ind(ages_sex_one, ages_sex_zero)

In [None]:
# Chi-squared tests of independence between categorical columns

In [None]:
# Cross-tabulate survived (rows) against sex (columns), then run a chi-squared
# test of independence on the resulting contingency table.
xtab = pd.crosstab(index=data['survived'], columns=data['sex'])
stats.chi2_contingency(observed=xtab)

In [None]:
# Cross-tabulate the name_u flag (rows) against sex (columns), then run a
# chi-squared test of independence on the resulting contingency table.
xtab = pd.crosstab(index=data['name_u'], columns=data['sex'])
stats.chi2_contingency(observed=xtab)