Major changes: EDA (multivariate SVD and UMAP), Model workflow (XGB, …

…shape) (#103) * + updated env + included umap + included scikit-learn + added svd_degeneracy calc + added umap calc + added umap plot + improved makefile to bash install miniforge not zsh install * + added calc_umap + moved pymc to optional deps to allow for lightweight non-pymc install * + tweak calc_umap * + minor improvement to datatype corrector * + breaking changes: + renamed model to model_pymc, renamed ModelIO to PYMCIO + created new model_xgb dir with xgb_io.py and XGBIO class + included in deps: XGBoost, SHAP, category_encoders + added all in deps + reinstalled dev + update version to 0.6 because now starting to include xgb, shap and related features * + renamed model_io.py to pymc_io.py + updated pymc_io.read_idata + updated xgb_io.read * + time for a merge
oreum-industries · Nov 14, 2023 · 66e9c24 · 66e9c24
1 parent 25dcb1d
commit 66e9c24
Show file tree

Hide file tree

Showing 21 changed files with 280 additions and 85 deletions.
diff --git a/.flake8 b/.flake8
@@ -1,5 +1,5 @@
 [flake8]
-ignore = E203, E266, W291, W293, F401, F403, E501, W503, W605
+ignore = E203, E266, W291, W293, F401, F403, E501, W503, W605, C901
 max-line-length = 88
 max-doc-length = 144
 max-complexity = 18

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -31,7 +31,7 @@ repos:
       - id: no-print-statements
         files: ^oreum_core/
   - repo: https://github.com/psf/black  # black formatter
-    rev: 23.10.0
+    rev: 23.11.0
     hooks:
       - id: black
         files: ^oreum_core/

diff --git a/LICENSES_THIRD_PARTY.md b/LICENSES_THIRD_PARTY.md
diff --git a/Makefile b/Makefile
@@ -37,8 +37,7 @@ dev:  ## create env for local dev on any machine MacOS x64 (Intel)
 		export CONDA_ENV_PATH=$(MAMBADIR)/envs/oreum_core/bin; \
 		export CONDA_DEFAULT_ENV=oreum_core; \
 		export CONDA_SUBDIR=osx-64; \
-		$(PYTHON_ENV) -m pip index versions oreum_core; \
-		$(PYTHON_ENV) -m pip install -e ".[dev]"; \
+		$(PYTHON_ENV) -m pip install -e ".[all]"; \
 		$(PYTHON_ENV) -c "import numpy as np; np.__config__.show()" > dev/install_log/blas_info.txt; \
 		pipdeptree -a > dev/install_log/pipdeptree.txt; \
 		pipdeptree -a -r > dev/install_log/pipdeptree_rev.txt; \
@@ -69,7 +68,7 @@ mamba:  ## get mamba via miniforge for MacOS x86_64 (Intel via Rosetta2) use zsh
 	test -f $(MAMBARC) || { echo $(MAMBARCMSG); exit 1; }
 	wget $(MAMBADL)/$(MAMBAV) -O $(HOME)/miniforge.sh
 	chmod 755 $(HOME)/miniforge.sh
-	zsh $(HOME)/miniforge.sh -b -p $(MAMBADIR)
+	bash $(HOME)/miniforge.sh -b -p $(MAMBADIR)
 	export PATH=$(MAMBADIR)/bin:$$PATH; \
 		conda init zsh;
 	rm $(HOME)/miniforge.sh

diff --git a/assets/img/interrogate_badge.svg b/assets/img/interrogate_badge.svg
diff --git a/oreum_core/__init__.py b/oreum_core/__init__.py
@@ -15,7 +15,7 @@
 """Core tools for use on projects by Oreum Industries"""
 import logging
 
-__version__ = "0.5.8"
+__version__ = "0.6.0"
 
 # logger goes to null handler by default
 # packages that import oreum_core can override this and direct elsewhere

diff --git a/oreum_core/curate/data_transform.py b/oreum_core/curate/data_transform.py
@@ -38,15 +38,12 @@ class DatatypeConverter:
     """Force correct datatypes according to what model expects"""
 
     def __init__(self, ftsd: dict, ftslvlcat: dict = {}, date_format: str = '%Y-%m-%d'):
-        """Initialise with fts and fts_dtype_pandas_categorical
-        The pandas categorical dtype logically sits on top of a str object
-        giving it order which is critical for patsy dmatrix transform
-        and thus model structure.
+        """Initialise with fts and optionally specify factors with specific levels
 
         Use with a fts dict of form:
             ftsd = dict(
-                fid = [],
                 fcat = [],
+                fstr = [],
                 fbool = [],
                 fdate = [],
                 fyear = [],
@@ -55,8 +52,8 @@ def __init__(self, ftsd: dict, ftslvlcat: dict = {}, date_format: str = '%Y-%m-%
                 fverbatim = [],        # maintain in current dtype)
         """
         self.ftsd = dict(
-            fid=ftsd.get('fid', []),
             fcat=ftsd.get('fcat', []),
+            fstr=ftsd.get('fstr', []),
             fbool=ftsd.get('fbool', []),
             fdate=ftsd.get('fdate', []),
             fyear=ftsd.get('fyear', []),
@@ -83,14 +80,17 @@ def _force_dtypes(self, dfraw):
         fts_all = [w for _, v in self.ftsd.items() for w in v]
         df = dfraw[fts_all].copy()
 
-        for ft in self.ftsd['fid'] + self.ftsd['fcat']:
+        for ft in self.ftsd['fcat'] + self.ftsd['fstr']:
             # tame string, clean, handle nulls
             idx = df[ft].notnull()
             vals = df.loc[idx, ft].astype(str, errors='raise').apply(snl.clean)
             df.drop(ft, axis=1, inplace=True)
             df.loc[~idx, ft] = 'nan'
             df.loc[idx, ft] = vals
-            df[ft] = df[ft].astype('string')
+            if ft in self.ftsd['fcat']:
+                df[ft] = pd.Categorical(df[ft].values, ordered=True)
+            else:
+                df[ft] = df[ft].astype('string')
 
         for ft in self.ftsd['fbool']:
             # tame string, strip, lower, use self.bool_dict, use pd.NA

diff --git a/oreum_core/eda/__init__.py b/oreum_core/eda/__init__.py
@@ -19,14 +19,16 @@
     bootstrap_lr,
     calc_geometric_cv,
     calc_location_in_ecdf,
+    calc_svd,
+    calc_umap,
     fit_and_plot_fn,
     get_gini,
     month_diff,
     tril_nan,
 )
 from .describe import describe, display_fw, display_ht, get_fts_by_dtype
 from .eda_io import FigureIO, display_image_file, output_data_dict
-from .plot import (
+from .plot import (  # plot_umap,
     plot_accuracy,
     plot_binary_performance,
     plot_bool_ct,
@@ -39,6 +41,7 @@
     plot_coverage,
     plot_date_ct,
     plot_estimate,
+    plot_explained_variance,
     plot_f_measure,
     plot_float_dist,
     plot_grp_ct,

diff --git a/oreum_core/eda/calc.py b/oreum_core/eda/calc.py
@@ -14,6 +14,7 @@
 
 # eda.calc.py
 """Calculations to help EDA"""
+import logging
 import warnings
 
 import matplotlib.pyplot as plt
@@ -22,10 +23,15 @@
 import seaborn as sns
 from matplotlib import figure
 from scipy import stats
+from sklearn.decomposition import TruncatedSVD
+from sklearn.preprocessing import StandardScaler
+from umap.umap_ import UMAP
 
 RSD = 42
 rng = np.random.default_rng(seed=RSD)
 
+_log = logging.getLogger(__name__)
+
 __all__ = [
     'fit_and_plot_fn',
     'get_gini',
@@ -35,6 +41,8 @@
     'calc_location_in_ecdf',
     'month_diff',
     'tril_nan',
+    'calc_svd',
+    'calc_umap',
 ]
 
 
@@ -270,3 +278,50 @@ def tril_nan(m: np.ndarray, k: int = 0) -> np.ndarray:
 
     # return np.where(mask, m, np.ones(1, m.dtype) * np.nan)
     return np.where(mask, m, np.nan)
+
+
+def calc_svd(df: pd.DataFrame, k: int = 10) -> tuple[pd.DataFrame, TruncatedSVD]:
+    """Calc SVD for k components (and preprocess to remove nulls and zscore),
+    report degeneracy, return transformed df and fitted TruncatedSVD object"""
+
+    # protect SVD from nulls
+    idx_nulls = df.isnull().sum(axis=1) > 0
+    if sum(idx_nulls) > 0:
+        df = df.loc[~idx_nulls].copy()
+        _log.info(f'Excluding {sum(idx_nulls)} rows containing a null, prior to SVD')
+
+    # standardize
+    scaler = StandardScaler().fit(df)
+    dfs = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)
+
+    # use scikit learn's TruncatedSVD with randomized
+    k = min(k, dfs.shape[1] - 1)
+    svd = TruncatedSVD(n_components=k, random_state=RSD)
+    svd_fit = svd.fit(dfs)
+
+    # Are any singular values NaN or really small?
+    n_null = sum(np.isnan(svd_fit.singular_values_))
+    assert n_null == 0, f'{n_null} Singular Values are NaN'
+    n_tiny = sum(svd_fit.singular_values_ < 1e-12)
+    assert n_tiny == 0, f'{n_tiny} Singular Values are < 1e-12'
+
+    dfx = svd_fit.transform(dfs)
+
+    return dfx, svd_fit
+
+
+def calc_umap(df: pd.DataFrame) -> tuple[pd.DataFrame, UMAP]:
+    """Calc 2D UMAP (preprocess to drop rows containing nulls; note that no
+    zscore is applied here), return transformed df and fitted UMAP object"""
+
+    # protect UMAP from nulls
+    idx_nulls = df.isnull().sum(axis=1) > 0
+    if sum(idx_nulls) > 0:
+        df = df.loc[~idx_nulls].copy()
+        _log.info(f'Excluding {sum(idx_nulls)} rows containing a null, prior to UMAP')
+
+    umapper = UMAP(n_neighbors=5)
+    umap_fit = umapper.fit(df)
+    dfx = pd.DataFrame(umap_fit.transform(df), columns=['c0', 'c1'], index=df.index)
+
+    return dfx, umap_fit
diff --git a/oreum_core/eda/plot.py b/oreum_core/eda/plot.py
@@ -14,6 +14,7 @@
 
 # eda.plot.py
 """EDA Plotting"""
+import logging
 from textwrap import wrap
 from typing import Literal
 
@@ -24,6 +25,7 @@
 from matplotlib import figure, gridspec, lines, ticker
 from scipy import integrate, stats
 
+from .calc import calc_svd
 from .describe import get_fts_by_dtype
 
 __all__ = [
@@ -55,9 +57,11 @@
     'plot_kj_summaries_for_linear_model',
     'plot_grp_ct',
     'plot_cdf_ppc_vs_obs',
+    'plot_explained_variance',
 ]
 
 
+_log = logging.getLogger(__name__)
 RSD = 42
 rng = np.random.default_rng(seed=RSD)
 
@@ -384,7 +388,7 @@ def plot_joint_numeric(
         ftsd = get_fts_by_dtype(dfp)
         linreg = False
         if hue in ftsd['int'] + ftsd['float']:  # bin into 5 equal quantiles
-            dfp[hue] = pd.qcut(dfp[hue].values, q=5)
+            dfp[hue] = pd.qcut(dfp[hue].values, q=7)
             kws['palette'] = 'viridis'
         else:
             ngrps = len(dfp[hue].unique())
@@ -396,8 +400,9 @@ def plot_joint_numeric(
 
     kde_kws = kws | dict(zorder=0, levels=7, cut=0, fill=kdefill, legend=True)
     scatter_kws = kws | dict(
-        alpha=0.6, marker='o', linewidths=0.05, edgecolor='#dddddd', s=40
+        alpha=0.6, marker='o', linewidths=0.05, edgecolor='#dddddd', s=50
     )
+    reg_kws = kws | dict(scatter_kws=scatter_kws, robust=True)
     rug_kws = kws | dict(height=0.1, legend=False)
 
     if kind == 'kde':
@@ -410,7 +415,7 @@ def plot_joint_numeric(
         _ = gd.plot_joint(sns.scatterplot, **scatter_kws)
         _ = gd.plot_marginals(sns.rugplot, **rug_kws)
     elif kind == 'reg':
-        _ = gd.plot_joint(sns.regplot, scatter_kws=scatter_kws, **kws)
+        _ = gd.plot_joint(sns.regplot, **reg_kws)
         _ = gd.plot_marginals(sns.rugplot, **rug_kws)
     else:
         raise ValueError('kwarg `kind` must be in {kde, scatter, kde+scatter, reg}')
@@ -1720,3 +1725,42 @@ def plot_cdf_ppc_vs_obs(
         _ = axs.set(xlim=(0, np.ceil(y.max())), ylim=(0, 1))
 
     _ = f.suptitle('Cumulative density plot of the posterior predictive vs actual')
+
+    return f
+
+
+def plot_explained_variance(
+    df: pd.DataFrame, k: int = 10, topn: int = 3
+) -> figure.Figure:
+    """Calculate Truncated SVD and plot explained variance curve, with a vline
+    marking the topn components. Related to eda.calc.calc_svd"""
+
+    _, svd_fit = calc_svd(df, k)
+    evr = pd.Series(
+        svd_fit.explained_variance_ratio_.cumsum(), name='explained_variance_csum'
+    )
+    evr.index = np.arange(1, len(evr) + 1)
+    evr.index.name = 'component'
+
+    f, axs = plt.subplots(1, 1, figsize=(12, 5))
+    _ = sns.pointplot(
+        x='component', y='explained_variance_csum', data=evr.reset_index(), ax=axs
+    )
+    _ = axs.vlines(topn - 1, 0, 1, 'orange', '-.')
+    _ = axs.annotate(
+        '{:.1%}'.format(evr[topn]),
+        xy=(topn - 1, evr[topn]),
+        xycoords='data',
+        xytext=(-10, 10),
+        textcoords='offset points',
+        color='orange',
+        ha='right',
+        fontsize=12,
+    )
+
+    _ = axs.set_ylim(0, 1.001)
+    _ = f.suptitle(
+        f'Explained variance @ top {topn} components ~ {evr[topn]:.1%}', fontsize=14
+    )
+    _ = f.tight_layout()
+    return f
diff --git a/oreum_core/model/__init__.py → oreum_core/model_pymc/__init__.py b/oreum_core/model/__init__.py → oreum_core/model_pymc/__init__.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# model/
-"""Various classes & functions for modelling, primarily using pymc"""
+# model_pymc/
+"""Various classes & functions for modelling, using PyMC"""
 from .base import BasePYMCModel
 from .calc import (
     calc_2_sample_delta_prop,
@@ -37,7 +37,6 @@
     print_rvs,
 )
 from .distributions import lognormal_icdf, mv_dist, normal_icdf, sanity_check_lognorm
-from .model_io import ModelIO
 from .plot import (
     facetplot_krushke,
     forestplot_multiple,
@@ -50,3 +49,4 @@
     plot_ppc,
     plot_trace,
 )
+from .pymc_io import PYMCIO
diff --git a/oreum_core/model/base.py → oreum_core/model_pymc/base.py b/oreum_core/model/base.py → oreum_core/model_pymc/base.py
diff --git a/oreum_core/model/calc.py → oreum_core/model_pymc/calc.py b/oreum_core/model/calc.py → oreum_core/model_pymc/calc.py
diff --git a/oreum_core/model/describe.py → oreum_core/model_pymc/describe.py b/oreum_core/model/describe.py → oreum_core/model_pymc/describe.py
@@ -21,7 +21,7 @@
 import pandas as pd
 import patsy as pat
 
-from ..model import BasePYMCModel
+from ..model_pymc import BasePYMCModel
 
 # import pytensor.tensor as pt
 # from IPython.display import Markdown, display

diff --git a/oreum_core/model/distributions.py → oreum_core/model_pymc/distributions.py b/oreum_core/model/distributions.py → oreum_core/model_pymc/distributions.py
diff --git a/oreum_core/model/plot.py → oreum_core/model_pymc/plot.py b/oreum_core/model/plot.py → oreum_core/model_pymc/plot.py
@@ -24,7 +24,7 @@
 import xarray as xr
 from matplotlib import figure, gridspec
 
-from ..model import BasePYMCModel
+from ..model_pymc import BasePYMCModel
 
 __all__ = [
     'plot_trace',