Skip to content

Commit

Permalink
Major changes: EDA (multivariate SVD and UMAP), Model workflow (XGB, …
Browse files Browse the repository at this point in the history
…shape) (#103)

* + updated env
+ included umap
+ included scikit-learn
+ added svd_degeneracy calc
+ added umap calc
+ added umap plot
+ improved makefile to bash install miniforge not zsh install

* + added calc_umap
+ moved pymc to optional deps to allow for lightweight non-pymc install

* + tweak calc_umap

* + minor improvement to datatype corrector

* + breaking changes:
+ renamed model to model_pymc, renamed ModelIO to PYMCIO
+ created new model_xgb dir with xgb_io.py and XGBIO class
+ included in deps: XGBoost, SHAP, category_encoders
+ added all in deps
+ reinstalled dev
+ update version to 0.6 because now starting to include xgb, shap and related features

* + renamed model_io.py to pymc_io.py
+ updated pymc_io.read_idata
+ updated xgb_io.read

* + time for a merge
  • Loading branch information
jonsedar committed Nov 14, 2023
1 parent 25dcb1d commit 66e9c24
Show file tree
Hide file tree
Showing 21 changed files with 280 additions and 85 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[flake8]
ignore = E203, E266, W291, W293, F401, F403, E501, W503, W605
ignore = E203, E266, W291, W293, F401, F403, E501, W503, W605, C901
max-line-length = 88
max-doc-length = 144
max-complexity = 18
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ repos:
- id: no-print-statements
files: ^oreum_core/
- repo: https://github.com/psf/black # black formatter
rev: 23.10.0
rev: 23.11.0
hooks:
- id: black
files: ^oreum_core/
Expand Down
99 changes: 56 additions & 43 deletions LICENSES_THIRD_PARTY.md

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ dev: ## create env for local dev on any machine MacOS x64 (Intel)
export CONDA_ENV_PATH=$(MAMBADIR)/envs/oreum_core/bin; \
export CONDA_DEFAULT_ENV=oreum_core; \
export CONDA_SUBDIR=osx-64; \
$(PYTHON_ENV) -m pip index versions oreum_core; \
$(PYTHON_ENV) -m pip install -e ".[dev]"; \
$(PYTHON_ENV) -m pip install -e ".[all]"; \
$(PYTHON_ENV) -c "import numpy as np; np.__config__.show()" > dev/install_log/blas_info.txt; \
pipdeptree -a > dev/install_log/pipdeptree.txt; \
pipdeptree -a -r > dev/install_log/pipdeptree_rev.txt; \
Expand Down Expand Up @@ -69,7 +68,7 @@ mamba: ## get mamba via miniforge for MacOS x86_64 (Intel via Rosetta2) use zsh
test -f $(MAMBARC) || { echo $(MAMBARCMSG); exit 1; }
wget $(MAMBADL)/$(MAMBAV) -O $(HOME)/miniforge.sh
chmod 755 $(HOME)/miniforge.sh
zsh $(HOME)/miniforge.sh -b -p $(MAMBADIR)
bash $(HOME)/miniforge.sh -b -p $(MAMBADIR)
export PATH=$(MAMBADIR)/bin:$$PATH; \
conda init zsh;
rm $(HOME)/miniforge.sh
Expand Down
8 changes: 4 additions & 4 deletions assets/img/interrogate_badge.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion oreum_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"""Core tools for use on projects by Oreum Industries"""
import logging

__version__ = "0.5.8"
__version__ = "0.6.0"

# logger goes to null handler by default
# packages that import oreum_core can override this and direct elsewhere
Expand Down
16 changes: 8 additions & 8 deletions oreum_core/curate/data_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,12 @@ class DatatypeConverter:
"""Force correct datatypes according to what model expects"""

def __init__(self, ftsd: dict, ftslvlcat: dict = {}, date_format: str = '%Y-%m-%d'):
"""Initialise with fts and fts_dtype_pandas_categorical
The pandas categorical dtype logically sits on top of a str object
giving it order which is critical for patsy dmatrix transform
and thus model structure.
"""Initialise with fts and optionally specify factors with specific levels
Use with a fts dict of form:
ftsd = dict(
fid = [],
fcat = [],
fstr = [],
fbool = [],
fdate = [],
fyear = [],
Expand All @@ -55,8 +52,8 @@ def __init__(self, ftsd: dict, ftslvlcat: dict = {}, date_format: str = '%Y-%m-%
fverbatim = [], # maintain in current dtype)
"""
self.ftsd = dict(
fid=ftsd.get('fid', []),
fcat=ftsd.get('fcat', []),
fstr=ftsd.get('fstr', []),
fbool=ftsd.get('fbool', []),
fdate=ftsd.get('fdate', []),
fyear=ftsd.get('fyear', []),
Expand All @@ -83,14 +80,17 @@ def _force_dtypes(self, dfraw):
fts_all = [w for _, v in self.ftsd.items() for w in v]
df = dfraw[fts_all].copy()

for ft in self.ftsd['fid'] + self.ftsd['fcat']:
for ft in self.ftsd['fcat'] + self.ftsd['fstr']:
# tame string, clean, handle nulls
idx = df[ft].notnull()
vals = df.loc[idx, ft].astype(str, errors='raise').apply(snl.clean)
df.drop(ft, axis=1, inplace=True)
df.loc[~idx, ft] = 'nan'
df.loc[idx, ft] = vals
df[ft] = df[ft].astype('string')
if ft in self.ftsd['fcat']:
df[ft] = pd.Categorical(df[ft].values, ordered=True)
else:
df[ft] = df[ft].astype('string')

for ft in self.ftsd['fbool']:
# tame string, strip, lower, use self.bool_dict, use pd.NA
Expand Down
5 changes: 4 additions & 1 deletion oreum_core/eda/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,16 @@
bootstrap_lr,
calc_geometric_cv,
calc_location_in_ecdf,
calc_svd,
calc_umap,
fit_and_plot_fn,
get_gini,
month_diff,
tril_nan,
)
from .describe import describe, display_fw, display_ht, get_fts_by_dtype
from .eda_io import FigureIO, display_image_file, output_data_dict
from .plot import (
from .plot import ( # plot_umap,
plot_accuracy,
plot_binary_performance,
plot_bool_ct,
Expand All @@ -39,6 +41,7 @@
plot_coverage,
plot_date_ct,
plot_estimate,
plot_explained_variance,
plot_f_measure,
plot_float_dist,
plot_grp_ct,
Expand Down
55 changes: 55 additions & 0 deletions oreum_core/eda/calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

# eda.calc.py
"""Calculations to help EDA"""
import logging
import warnings

import matplotlib.pyplot as plt
Expand All @@ -22,10 +23,15 @@
import seaborn as sns
from matplotlib import figure
from scipy import stats
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from umap.umap_ import UMAP

RSD = 42
rng = np.random.default_rng(seed=RSD)

_log = logging.getLogger(__name__)

__all__ = [
'fit_and_plot_fn',
'get_gini',
Expand All @@ -35,6 +41,8 @@
'calc_location_in_ecdf',
'month_diff',
'tril_nan',
'calc_svd',
'calc_umap',
]


Expand Down Expand Up @@ -270,3 +278,50 @@ def tril_nan(m: np.ndarray, k: int = 0) -> np.ndarray:

# return np.where(mask, m, np.ones(1, m.dtype) * np.nan)
return np.where(mask, m, np.nan)


def calc_svd(df: pd.DataFrame, k: int = 10) -> tuple[pd.DataFrame, TruncatedSVD]:
    """Calc SVD for k components (and preprocess to remove nulls and zscore),
    report degeneracy, return transformed df and fitted TruncatedSVD object

    Rows of `df` containing any null are excluded (and the count is logged),
    the remaining columns are standardized with StandardScaler, and a
    TruncatedSVD is fitted with `min(k, ncols - 1)` components.

    Returns a tuple of:
        dfx: DataFrame of the projected rows, indexed like the retained rows
            of `df`, with one column per component named c0, c1, ...
        svd_fit: the fitted TruncatedSVD object

    Raises AssertionError if any singular value is NaN or < 1e-12
    """

    # protect SVD from nulls
    idx_nulls = df.isnull().any(axis=1)
    n_nulls = int(idx_nulls.sum())
    if n_nulls > 0:
        df = df.loc[~idx_nulls].copy()
        _log.info(f'Excluding {n_nulls} rows containing a null, prior to SVD')

    # standardize
    scaler = StandardScaler().fit(df)
    dfs = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)

    # use scikit learn's TruncatedSVD with randomized
    k = min(k, dfs.shape[1] - 1)
    svd = TruncatedSVD(n_components=k, random_state=RSD)
    svd_fit = svd.fit(dfs)

    # Are any singular values NaN or really small?
    # Raised explicitly rather than via `assert` so the checks still run under
    # `python -O`; AssertionError is retained so existing callers are unaffected
    singular_values = svd_fit.singular_values_
    n_null = int(np.isnan(singular_values).sum())
    if n_null > 0:
        raise AssertionError(f'{n_null} Singular Values are NaN')
    n_tiny = int((singular_values < 1e-12).sum())
    if n_tiny > 0:
        raise AssertionError(f'{n_tiny} Singular Values are < 1e-12')

    # TruncatedSVD.transform yields a bare array: wrap it so the row index is
    # preserved and the return value matches the annotated DataFrame type
    arr = svd_fit.transform(dfs)
    dfx = pd.DataFrame(
        arr, index=dfs.index, columns=[f'c{i}' for i in range(arr.shape[1])]
    )

    return dfx, svd_fit


def calc_umap(df: pd.DataFrame) -> tuple[pd.DataFrame, UMAP]:
    """Calc 2D UMAP (and preprocess to remove nulls), return transformed df
    and fitted UMAP object

    Rows of `df` containing any null are excluded (and the count is logged).
    NOTE: no z-scoring is applied here, the features are passed to UMAP as-is,
    so standardize beforehand if the columns are on very different scales.

    Returns a tuple of:
        dfx: DataFrame with columns c0, c1 holding the 2D embedding, indexed
            like the retained rows of `df`
        umap_fit: the fitted UMAP object
    """

    # protect UMAP from nulls
    idx_nulls = df.isnull().any(axis=1)
    n_nulls = int(idx_nulls.sum())
    if n_nulls > 0:
        df = df.loc[~idx_nulls].copy()
        _log.info(f'Excluding {n_nulls} rows containing a null, prior to UMAP')

    # seed with the module-level RSD (as calc_svd does) so that repeated calls
    # on the same data give the same embedding; n_components is stated
    # explicitly because the output columns below are fixed at two
    umapper = UMAP(n_neighbors=5, n_components=2, random_state=RSD)
    umap_fit = umapper.fit(df)
    dfx = pd.DataFrame(umap_fit.transform(df), columns=['c0', 'c1'], index=df.index)

    return dfx, umap_fit
50 changes: 47 additions & 3 deletions oreum_core/eda/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

# eda.plot.py
"""EDA Plotting"""
import logging
from textwrap import wrap
from typing import Literal

Expand All @@ -24,6 +25,7 @@
from matplotlib import figure, gridspec, lines, ticker
from scipy import integrate, stats

from .calc import calc_svd
from .describe import get_fts_by_dtype

__all__ = [
Expand Down Expand Up @@ -55,9 +57,11 @@
'plot_kj_summaries_for_linear_model',
'plot_grp_ct',
'plot_cdf_ppc_vs_obs',
'plot_explained_variance',
]


_log = logging.getLogger(__name__)
RSD = 42
rng = np.random.default_rng(seed=RSD)

Expand Down Expand Up @@ -384,7 +388,7 @@ def plot_joint_numeric(
ftsd = get_fts_by_dtype(dfp)
linreg = False
if hue in ftsd['int'] + ftsd['float']: # bin into 5 equal quantiles
dfp[hue] = pd.qcut(dfp[hue].values, q=5)
dfp[hue] = pd.qcut(dfp[hue].values, q=7)
kws['palette'] = 'viridis'
else:
ngrps = len(dfp[hue].unique())
Expand All @@ -396,8 +400,9 @@ def plot_joint_numeric(

kde_kws = kws | dict(zorder=0, levels=7, cut=0, fill=kdefill, legend=True)
scatter_kws = kws | dict(
alpha=0.6, marker='o', linewidths=0.05, edgecolor='#dddddd', s=40
alpha=0.6, marker='o', linewidths=0.05, edgecolor='#dddddd', s=50
)
reg_kws = kws | dict(scatter_kws=scatter_kws, robust=True)
rug_kws = kws | dict(height=0.1, legend=False)

if kind == 'kde':
Expand All @@ -410,7 +415,7 @@ def plot_joint_numeric(
_ = gd.plot_joint(sns.scatterplot, **scatter_kws)
_ = gd.plot_marginals(sns.rugplot, **rug_kws)
elif kind == 'reg':
_ = gd.plot_joint(sns.regplot, scatter_kws=scatter_kws, **kws)
_ = gd.plot_joint(sns.regplot, **reg_kws)
_ = gd.plot_marginals(sns.rugplot, **rug_kws)
else:
raise ValueError('kwarg `kind` must be in {kde, scatter, kde+scatter, reg}')
Expand Down Expand Up @@ -1720,3 +1725,42 @@ def plot_cdf_ppc_vs_obs(
_ = axs.set(xlim=(0, np.ceil(y.max())), ylim=(0, 1))

_ = f.suptitle('Cumulative density plot of the posterior predictive vs actual')

return f


def plot_explained_variance(
    df: pd.DataFrame, k: int = 10, topn: int = 3
) -> figure.Figure:
    """Calculate Truncated SVD and plot the cumulative explained variance
    curve, with a vline and annotation at the topn-th component.
    Related to eda.calc.calc_svd

    `k` is the requested number of components; calc_svd may fit fewer when
    `df` has few columns. `topn` is clamped into [1, n components fitted] so
    the annotation always refers to a component that exists; the (possibly
    clamped) value is what the title reports.
    """

    _, svd_fit = calc_svd(df, k)
    evr = pd.Series(
        svd_fit.explained_variance_ratio_.cumsum(), name='explained_variance_csum'
    )
    evr.index = np.arange(1, len(evr) + 1)
    evr.index.name = 'component'

    # calc_svd limits k to ncols - 1, so evr can be shorter than topn:
    # clamp to avoid a KeyError on lookup (also guards against topn < 1)
    topn = int(min(max(topn, 1), len(evr)))
    evr_topn = evr.loc[topn]

    f, axs = plt.subplots(1, 1, figsize=(12, 5))
    _ = sns.pointplot(
        x='component', y='explained_variance_csum', data=evr.reset_index(), ax=axs
    )
    # pointplot x-axis is categorical, so component n is drawn at position n - 1
    xpos = topn - 1
    _ = axs.vlines(xpos, 0, 1, colors='orange', linestyles='-.')
    _ = axs.annotate(
        '{:.1%}'.format(evr_topn),
        xy=(xpos, evr_topn),
        xycoords='data',
        xytext=(-10, 10),
        textcoords='offset points',
        color='orange',
        ha='right',
        fontsize=12,
    )

    _ = axs.set_ylim(0, 1.001)
    _ = f.suptitle(
        f'Explained variance @ top {topn} components ~ {evr_topn:.1%}', fontsize=14
    )
    _ = f.tight_layout()
    return f
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# model/
"""Various classes & functions for modelling, primarily using pymc"""
# model_pymc/
"""Various classes & functions for modelling, using PyMC"""
from .base import BasePYMCModel
from .calc import (
calc_2_sample_delta_prop,
Expand All @@ -37,7 +37,6 @@
print_rvs,
)
from .distributions import lognormal_icdf, mv_dist, normal_icdf, sanity_check_lognorm
from .model_io import ModelIO
from .plot import (
facetplot_krushke,
forestplot_multiple,
Expand All @@ -50,3 +49,4 @@
plot_ppc,
plot_trace,
)
from .pymc_io import PYMCIO
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import pandas as pd
import patsy as pat

from ..model import BasePYMCModel
from ..model_pymc import BasePYMCModel

# import pytensor.tensor as pt
# from IPython.display import Markdown, display
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion oreum_core/model/plot.py → oreum_core/model_pymc/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import xarray as xr
from matplotlib import figure, gridspec

from ..model import BasePYMCModel
from ..model_pymc import BasePYMCModel

__all__ = [
'plot_trace',
Expand Down
Loading

0 comments on commit 66e9c24

Please sign in to comment.