In [None]:
# Work from the parent directory so the package source located there is the
# code that gets imported. Installing the package with `poetry install` is the
# alternative. The move is relative to the current working directory, so each
# execution of this cell goes up one more level.
import os

os.chdir(os.pardir)

In [None]:
from pathlib import Path
import logging

import pandas as pd
from scipy import stats

from ndj_pipeline import db, model, transform, utils

# Root logger: INFO and above, rendered as "<timestamp> [<LEVEL>] <message>",
# emitted through a single StreamHandler (which writes to stderr by default).
stream_handler = logging.StreamHandler()
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[stream_handler],
)

In [None]:
# Data cleaning step first, then model training driven by the example
# experiment configuration file.
transform.run()

config = Path("data") / "example_experiment.yaml"
model_config = utils.load_model_config(config)
model.run_model_training(model_config)

In [None]:
# Read the processed Titanic parquet file into a DataFrame
input_path = Path("data") / "processed" / "titanic.parquet"
logging.info(f"Loading data from {input_path}")
data = pd.read_parquet(input_path)

In [None]:
# Run an SQL query from file and display
db.create_db()

sql_path = Path("schemas", "queries", "window_rows.sql")
logging.info(f"Loading sql query from {sql_path}")

# read_text() opens, reads and closes the file in a single call, so no file
# handle is left open behind the query.
# NOTE: this rebinds `data` to the query result, replacing whatever an earlier
# cell stored under that name.
data = db.query_db(sql_path.read_text())
data.head()

In [None]:
# Stat tests
def _log_ttest(frame, flag_col, value_col, title, expectation):
    """Run an independent two-sample t-test and log the outcome.

    The two samples are the `value_col` values of the rows where `flag_col`
    equals 1 and of the rows where it equals 0.

    Args:
        frame: DataFrame holding both columns.
        flag_col: Name of the 0/1 column that splits the rows into two samples.
        value_col: Name of the numeric column compared between the samples.
        title: Headline logged before the statistics line.
        expectation: Short note logged in parentheses after the statistics.

    Returns:
        The result object returned by `scipy.stats.ttest_ind`.
    """
    # Drop missing values per sample: a single NaN would otherwise propagate
    # through the test and turn both the statistic and the p-value into NaN.
    flagged = frame.loc[frame[flag_col] == 1, value_col].dropna()
    unflagged = frame.loc[frame[flag_col] == 0, value_col].dropna()
    outcome = stats.ttest_ind(flagged, unflagged)
    logging.info(title)
    logging.info(f"T-test t-stat: {outcome.statistic:.3f}, p-value: {outcome.pvalue:.3f}. ({expectation})")
    logging.info("")
    return outcome


def _log_chi2(frame, row_col, col_col, title, expectation):
    """Run a chi-squared test on the crosstab of two columns and log the outcome.

    Args:
        frame: DataFrame holding both columns.
        row_col: Name of the column used for the contingency table rows.
        col_col: Name of the column used for the contingency table columns.
        title: Headline logged before the statistics line.
        expectation: Short note logged in parentheses after the statistics.

    Returns:
        The result returned by `scipy.stats.chi2_contingency`; element 0 is the
        chi-squared statistic and element 1 the p-value.
    """
    contingency = pd.crosstab(frame[row_col], frame[col_col])
    outcome = stats.chi2_contingency(contingency)
    logging.info(title)
    logging.info(f"Chi-squared Chi-value: {outcome[0]:.3f}, p-value: {outcome[1]:.3f}. ({expectation})")
    logging.info("")
    return outcome


input_path = Path("data", "processed", "titanic.parquet")
logging.info(f"Loading data from {input_path}")

# Build the analysis frame in one chain so the derived column is added to a
# fresh DataFrame (no assignment onto the result of dropna). `name_u` is 1 when
# the name contains a lowercase 'u'; a missing name counts as 0 instead of
# failing the integer cast.
data = (
    pd.read_parquet(input_path)
    .dropna(subset=['age', 'sex'])
    .assign(name_u=lambda frame: frame['name'].str.contains('u', na=False).astype(int))
)

# T Tests
results = _log_ttest(
    data, 'survived', 'fare',
    "Relationship between survival and fare paid",
    "expected difference",
)
results = _log_ttest(
    data, 'name_u', 'age',
    "Relationship between letter 'u' in name and age",
    "expected no difference",
)

# Chi squared tests
results = _log_chi2(
    data, 'survived', 'sex',
    "Relationship between survival and sex",
    "expected difference",
)
results = _log_chi2(
    data, 'name_u', 'sex',
    "Relationship between letter 'u' in name and sex",
    "expected no difference",
)

In [None]:
import statsmodels.api as sm

import numpy as np

# Statsmodel linear regression
input_path = Path("data", "processed", "titanic.parquet")
logging.info(f"Loading data from {input_path}")

target = "survived"
# "_constant" is the explicit intercept column added below; the regression is
# fitted on exactly these columns, so the intercept has to be part of the data.
features = ["pclass", "age", "_constant"]

# Keep only rows where every model input is present, then add the intercept
# column. Chaining with assign() adds the column to a fresh DataFrame rather
# than assigning onto the result of dropna().
data = (
    pd.read_parquet(input_path)
    .dropna(subset=['age', "pclass", "survived"])
    .assign(_constant=1)
)

# Beware Int64 types: cast the model inputs to plain floats so pandas nullable
# dtypes are not handed to statsmodels as-is.
# NOTE(review): assumes pclass, age and survived are all numeric - confirm.
exog = data[features].astype(float)
endog = data[target].astype(float)

# Deliberately not called `model`: that name is the ndj_pipeline.model module
# imported at the top of the notebook, and rebinding it here would break any
# later (re)run of the cell that calls model.run_model_training.
ols_model = sm.OLS(endog, exog)
results = ols_model.fit()

logging.info(results.pvalues)
logging.info(results.params)
logging.info(results.rsquared)

# Predictions are through 'results' variable
results.predict(exog)