In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import shapiro, norm
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, html, dcc, Input, Output
import dash_bootstrap_components as dbc

### Input

In [2]:
# Define input path
path_input: Path = Path("../../../data/original/calcium/").resolve()

In [3]:
# Instantiate file descriptor dictionary
fd: pd.DataFrame = pd.DataFrame(columns=["filepath"])

# Iterate over files in input path using pathlib
for p in path_input.rglob("*"):
    if p.is_file():
        # Append file path as string to file descriptor
        fd.loc[len(fd)] = str(p)

In [4]:
# Parse month and year columns
fd["month"], fd["year"] = (
    fd["filepath"].str.split(pat=" ").str[-2],
    fd["filepath"].str.split(pat=" ").str[-1].str[:-5].astype(int),
)
fd["year"] = fd["year"].map(
    {
        fd["year"].min(): f"{fd['year'].min()} (Vista)",
        fd["year"].max(): f"{fd['year'].max()} (Roche)",
    }
)
fd["site"] = fd["filepath"].str.split(pat="\\").str[-2]

In [5]:
# Columns to read from each file and their new names
usecols_: list[str] = ["AGE", "PAT_ENC_CSN_ID", "COMPONENT_RESULT", "DEPARTMENT_NAME"]
colnames: list[str] = ["age", "RID", "calcium", "dept"]

In [6]:
# Read all files and store them in a list
data_list: list[pd.DataFrame] = []
for _, entry in fd.iterrows():
    single_file: pd.DataFrame = (
        pd.read_excel(entry["filepath"], usecols=usecols_)
        .rename(columns=dict(zip(usecols_, colnames)))
        .dropna()
    )
    single_file["month"], single_file["year"], single_file["site"] = (
        entry["month"],
        entry["year"],
        entry["site"],
    )
    data_list.append(single_file)
# Concatenate all files in the list to a single DataFrame
data: pd.DataFrame = pd.concat(data_list, axis=0).reset_index(drop=True)

### Processing

In [7]:
# Segment age into categorical groups
age_group_count: pd.Series = (
    pd.cut(x=data["age"], bins=[0, 18, 60, np.inf], right=False)
    .value_counts(normalize=True)
    .sort_index()
)
age_group_labels: list[str] = [
    f"{k} ({(v * 100):.1f}%)" for k, v in age_group_count.to_dict().items()
]
data["age_group"] = pd.cut(
    x=data["age"], bins=[0, 18, 60, np.inf], right=False, labels=age_group_labels
)

In [8]:
# Handle calcium values with '<' or '>' by censoring at the boundaries
data["calcium"] = (
    data["calcium"]
    .astype(str)
    .str.replace(pat="[><]", repl="", regex=True)
    .astype(float)
)

In [9]:
# List the most common departments and their percentages
depts_count: pd.Series = data["site"].value_counts(normalize=True, dropna=False)
depts: pd.DataFrame = pd.DataFrame(
    {
        "dept_name": depts_count.index,
        "dept_percentage": [format(dept, ".1%") for dept in depts_count.values],
    }
)
depts["dept_label"] = depts["dept_name"] + " (" + depts["dept_percentage"] + ")"

### Dashboard

In [10]:
app: Dash = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

In [11]:
# fmt: off
app.layout = html.Div([
    dbc.Row([
        dbc.Col([
            html.Hr(),
            dbc.Row([
                dbc.Col(dbc.Label(children="plot height"), width=3),
                dbc.Col(dcc.Slider(id="height", min=200, max=1000, step=25, value=450, marks={x: str(x) for x in range(200, 1000 + 200, 200)}), width=8),
            ], align="center"),
            dbc.Row([
                dbc.Col(dbc.Label(children="plot width"), width=3),
                dbc.Col(dcc.Slider(id="width", min=800, max=1800, step=25, value=1000, marks={x: str(x) for x in range(800, 1800 + 200, 200)}), width=8),
            ], align="center"),
            html.Hr(),
            dbc.Switch(id="normalize", value=True, label="normalize by probability density"),
            html.Hr(),
            dbc.Label(children="select month"),
            dbc.Checklist(id="month", options=fd["month"].unique(), value=fd["month"].unique(), inline=True),
            html.Hr(),
            dbc.Label(children="select site"),
            dbc.Checklist(id="site", options=depts["dept_label"], value=depts["dept_label"], inline=False),
            html.Hr(),
            dbc.Label(children="select age group"),
            dbc.Checklist(id="age_group", options=age_group_labels, value=age_group_labels, inline=False),
        ], width={"size": 3, "offset": 1}),
        dbc.Col(dcc.Graph(id="calcium_histogram"), width=7, align="end"),
    ], align="end"),
], style={"backgroundColor": "white"})

In [12]:
# fmt: off
@app.callback(
    Output("calcium_histogram", "figure"),
    Input("height", "value"),
    Input("width", "value"),
    Input("normalize", "value"),
    Input("month", "value"),
    Input("site", "value"),
    Input("age_group", "value"),
    prevent_initial_call=False)
def update_histogram(height: int, width: int, normalize: bool, month: list[str], site: list[str], age_group: list[str]) -> go.Figure:
    # Filter data based on selected month, site, and age group
    if len(month) == 0 or len(site) == 0 or len(age_group) == 0:
        return go.Figure()
    dept: pd.Series = depts.loc[depts["dept_label"].isin(site), "dept_name"]
    data_plot: pd.DataFrame = data.loc[(data["month"].isin(month)) & (data["site"].isin(dept)) & (data["age_group"].isin(age_group))]
    
    # Create histogram figure object
    fig = px.histogram(data_plot, x="calcium", histnorm="probability density" if normalize else "density",
                       color="year", marginal="box", barmode="overlay", opacity=0.7)
    
    # Update histogram bin size
    fig.data[0].xbins.update(size=0.1)
    
    # Add normal distribution curve and normality test result to the histogram
    for k, trace in enumerate(fig.data):
        if isinstance(trace, go.Histogram):
            # Fit a normal distribution to the histogram data
            hist_data: np.ndarray = np.array(trace.x)
            mu, std = norm.fit(hist_data)
            norm_pdf: np.ndarray = np.vstack((hist_data, norm.pdf(hist_data, mu, std)), dtype=None)
            norm_pdf_plot: np.ndarray = norm_pdf[:, np.argsort(norm_pdf[0])]
            multiplier: int = 1 if normalize else len(hist_data)
            
            # Add normal distribution curve to the histogram
            fig.add_trace(go.Scatter(x=norm_pdf_plot[0], y=multiplier * norm_pdf_plot[1], mode="lines",
                                     line=dict(color=trace.marker.color, width=2), name=None, showlegend=False))
            
            # Perform Shapiro-Wilk test for normality
            _, shapiro_p = shapiro(hist_data)
            shapiro_result: str = "Parametric" if shapiro_p > 0.05 else "Non-parametric"
            
            # Add annotation with normality test result
            annot: str = f"{shapiro_result} (p={shapiro_p:.4f})<br>{mu:.1f} \u00B1 {std:.1f}<br>{np.array2string(np.quantile(hist_data, [0.1, 0.25, 0.5, 0.75, 0.9]), precision=1, separator=', ')}"
            fig.add_annotation(x=mu, y=multiplier * norm.pdf(mu, mu, std), xref="x", yref="y", text=annot, showarrow=False,
                               xshift=150 * (k - 1), yshift=10, font=dict(size=18, color=trace.marker.color, family="Arial"))
    
    fig.update_layout(
        template="simple_white",
        paper_bgcolor="rgba(255, 255, 255, 1)",
        plot_bgcolor="rgba(255, 255, 255, 1)",
        xaxis=dict(title="calcium", showticklabels=True, mirror=False, ticks="outside", showline=True, range=[6, 14]),
        yaxis=dict(showticklabels=True, mirror=True, ticks="outside", showline=True),
        font=dict(color="black", size=18, family="Arial"),
        title=dict(text=None, font=dict(color="black", size=22, family="Arial"), x=0.5),
        height=height,
        width=width,
        margin=dict(t=0, b=100, l=0, r=0),
    )
    
    return fig

In [None]:
app.run(debug=True, jupyter_height=1000, port=7604, use_reloader=False)


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 68426.


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 118955.


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 68426.


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 118955.


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 63065.


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 112664.


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 57482.


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 105739.


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 62843.


scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 112030.


scipy.stats.shapiro: For N > 5000, computed 