# Length of stay summary dev - hillmaker (v0.5.0)

<div class="alert alert-block alert-warning">
    <b>WARNING</b> Numerous API and core code changes have happened to hillmaker recently and this notebook is specific to v0.5.x. 
</div>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from IPython.display import Image

import hillmaker as hm

In [None]:
# Path to the stop-data CSV (relative path, resolved against the working directory)
ssu_stopdata = '../data/ShortStay.csv'
# Parse the InRoomTS and OutRoomTS columns as datetimes while loading
stops_df = pd.read_csv(ssu_stopdata, parse_dates=['InRoomTS','OutRoomTS'])
stops_df.info()  # Print the columns, dtypes and non-null counts of the loaded DataFrame

In [None]:
# Preview the first rows of the loaded stop data
stops_df.head()

## Length of stay summary

We can precompute LOS during the pydantic model pre-processing step. Instead of storing it as a Timedelta, we could allow the user to specify the units for the LOS summary (default = 'h'), where the unit string codes are the same as those used for Timedelta conversions.

https://en.wikipedia.org/wiki/ISO_8601#Durations

In [None]:
# Time unit for length of stay; used below both as the suffix of the LOS column
# name and as the unit argument of pd.Timedelta
units = 'hours'

In [None]:
# The LOS column name embeds the chosen unit (e.g. 'los_hours' when units == 'hours')
los_field_name = f'los_{units}'
# OutRoomTS - InRoomTS is a timedelta; dividing by a one-unit Timedelta converts
# it to a plain number of `units`
stops_df[los_field_name] = (stops_df['OutRoomTS'] - stops_df['InRoomTS']) / pd.Timedelta(1, units)
stops_df.head()

In [None]:
# Re-inspect the DataFrame structure now that the LOS column has been added
stops_df.info()

In addition to statistical summaries as shown below, it would be nice to have histograms.

In [None]:
# Built-in pandas descriptive statistics of LOS, computed separately for each PatType
stops_df.groupby(['PatType'])[los_field_name].describe()

In [None]:
from pandas.core.groupby import DataFrameGroupBy
from typing import Dict, List, Tuple

In [None]:
def summary_stats(group: DataFrameGroupBy,
                  percentiles: Tuple[float] | List[float] = (0.25, 0.5, 0.75, 0.95, 0.99),
                  stub: str = ''):
    """
    Compute summary statistics on a pandas `DataFrameGroupBy` object.

    Parameters
    ----------
    group : pd.DataFrameGroupBy
        The grouping is by category
    percentiles : list or tuple of floats (e.g. [0.5, 0.75, 0.95]), optional
        Which percentiles to compute. Default is (0.25, 0.5, 0.75, 0.95, 0.99)
    stub : str
        Used to create field names (e.g. '{stub}_mean')

    Returns
    -------
    Dict whose keys are '{stub}_{statistic}'. Dict values are `DataFrame` objects.

    """
    stats = {stub + 'count': group.count(), stub + 'mean': group.mean(),
             stub + 'min': group.min(),
             stub + 'max': group.max(), 'stdev': group.std(), 'sem': group.sem(),
             stub + 'var': group.var(), 'cv': group.std() / group.mean() if group.mean() > 0 else 0,
             stub + 'skew': group.skew(), 'kurt': group.kurt()}

    if percentiles is not None:
        pctile_vals = group.quantile(percentiles)

        for p in percentiles:
            pctile_name = f'{stub}p{int(100 * p):d}'
            stats[pctile_name] = pctile_vals[p]

    return stats

In [None]:
# Group the stops by patient type, then call summary_stats once per group on
# that group's LOS values
cat_field_grp = stops_df.groupby(['PatType'])
occ_stats = cat_field_grp[los_field_name].apply(summary_stats)
# Move the innermost index level of the result into columns for display
occ_stats.unstack()

How can we make nice-looking tabular outputs, similar to gtable in R?

In [None]:
# Import seaborn
import seaborn as sns

# Apply seaborn's default theme to the plots that follow
sns.set_theme()

In [None]:
# Histogram of LOS with one facet column (and colour) per PatType.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
sns.displot(data=stops_df, x=los_field_name, hue="PatType", col="PatType")

In [None]:
# Kernel density estimate of LOS with one facet column (and colour) per PatType.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
sns.displot(data=stops_df, x=los_field_name, hue="PatType", col="PatType", kind='kde')

In [None]:
# Density-scaled histogram of LOS with a KDE overlay, one facet column per PatType.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
sns.displot(data=stops_df, x=los_field_name, hue="PatType", col="PatType", stat='density', kde=True)

I think to have a non-shared y-axis, you have to use the `FacetGrid` and `map` approach.

In [None]:
# Empirical CDF of LOS, one curve per PatType.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
sns.displot(data=stops_df, x=los_field_name, hue="PatType", kind='ecdf')

In [None]:
# Kernel density estimate of LOS, one curve per PatType.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
sns.displot(data=stops_df, x=los_field_name, hue="PatType", kind='kde')

In [None]:
# Box plot of LOS for each PatType.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
sns.catplot(data=stops_df, x="PatType", y=los_field_name, kind="box")

In [None]:
# Violin plot of LOS for each PatType.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
sns.catplot(data=stops_df, x="PatType", y=los_field_name, kind="violin")

In [None]:
# Boxen (letter-value) plot of LOS for each PatType.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
sns.catplot(data=stops_df, x="PatType", y=los_field_name, kind="boxen")

In [None]:
# Bar plot of LOS for each PatType (bar height is seaborn's default estimate, the mean).
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
sns.catplot(data=stops_df, x="PatType", y=los_field_name, kind="bar")

Need to change y-axis to be relative frequency instead of raw counts.

In [None]:
# Step-outline histograms of LOS, one per PatType.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
sns.displot(data=stops_df, x=los_field_name, hue="PatType", element="step")

In [None]:
# One histogram of LOS per PatType, with x and y axes not shared between facets.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
g = sns.FacetGrid(data=stops_df, col="PatType", sharex=False, sharey=False)
g.map(sns.histplot, los_field_name)

In [None]:
# One density-scaled histogram of LOS per PatType; only the x axis is left unshared.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
# Trailing semicolons suppress the notebook's echo of the returned object.
g = sns.FacetGrid(data=stops_df, col="PatType", sharex=False);
g.map(sns.histplot, los_field_name, stat='density');

In [None]:
# Switch to the 'ticks' style, then draw one density-scaled histogram of LOS per
# PatType with common_norm=False passed through to histplot.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
# Trailing semicolons suppress the notebook's echo of the returned object.
sns.set_style('ticks')
g = sns.FacetGrid(data=stops_df, col="PatType", sharex=False);
g.map(sns.histplot, los_field_name, stat='density', common_norm=False);

In [None]:
# One kernel density estimate of LOS per PatType, with x and y axes not shared
# between facets.
# Use los_field_name rather than a hard-coded column so the plot follows `units`.
# Trailing semicolons suppress the notebook's echo of the returned object.
g = sns.FacetGrid(data=stops_df, col="PatType", sharex=False, sharey=False);
g.map(sns.kdeplot, los_field_name);