# Hierarchical Modeling
What models fit the data?

# Setup

### Imports

In [1]:
# Stdlib imports
import os
import shutil

In [2]:
# Calculation imports
import cmdstanpy
import numpy as np
import pandas as pd
import scipy
from scipy.stats import norm, binned_statistic
from sklearn.metrics import root_mean_squared_error
import sklearn.metrics as sk_metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [3]:
# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" style for the figures drawn in this notebook.
sns.set_style("whitegrid")

### Parameters

In [4]:
# Notebook-wide settings.
config = {
    # NOTE(review): "bins", "min_bin_percentile" and "nonden_fit_cols" are not
    # read by any cell visible here -- presumably used further down; confirm.
    "bins": 16,
    "min_bin_percentile": 10,
    "nonden_fit_cols": ["references", "year"],
    # Directory scanned for one sub-directory of chain CSVs per model, and read
    # for results.csv, train_data.csv and test_data.csv.
    "fit_dir": "../../analysis_data/stan_fits/stan_fit",
    # Same name as the 1-based field-code column added to df_all below.
    "cat_col": "stan_field_code",
}

In [5]:
def _one_minus(loss):
    """Wrap ``loss`` so the result is ``1 - loss(y_true, y_pred)``."""

    def score(y_true, y_pred):
        return 1 - loss(y_true, y_pred)

    return score


# Display label -> callable(y_true, y_pred).
# The labels (and their order) are reused as metric column names further down.
metrics = {
    r"$R^2$": sk_metrics.r2_score,
    "1-MAE": _one_minus(sk_metrics.mean_absolute_error),
    "1-RMSE": _one_minus(sk_metrics.root_mean_squared_error),
    # "1-MAPE" (sk_metrics.mean_absolute_percentage_error) is currently disabled.
    "1-MedAE": _one_minus(sk_metrics.median_absolute_error),
    "1-MPinL": _one_minus(sk_metrics.mean_pinball_loss),
    r"$D^2_{pin}$": sk_metrics.d2_pinball_score,
    r"$D^2_{abs}$": sk_metrics.d2_absolute_error_score,
}

In [6]:
# Colors of the currently active seaborn palette (no arguments given).
palette = sns.color_palette()

In [7]:
def log_sum_exp(mu):
    """Return ``log(sum(exp(mu)))`` computed in a numerically stable way.

    The maximum of ``mu`` is pulled out before exponentiating, so large
    entries do not overflow and precision is maintained.

    Parameters
    ----------
    mu : array-like
        Log-scale values; all elements are summed together.

    Returns
    -------
    float
        The log of the summed exponentials. When the maximum is not finite
        (every entry is ``-inf``, some entry is ``+inf``, or a NaN is
        present) that maximum is returned unchanged.

    Raises
    ------
    ValueError
        If ``mu`` is empty (raised by ``np.max``).
    """
    mu = np.asarray(mu, dtype=float)
    mu_max = np.max(mu)

    # Subtracting a non-finite maximum would evaluate inf - inf = NaN, turning
    # e.g. an all -inf input (log of zero total mass) into NaN instead of -inf.
    if not np.isfinite(mu_max):
        return mu_max

    summation = np.sum(np.exp(mu - mu_max))
    return mu_max + np.log(summation)

In [8]:
def log_ppd(log_p):
    """Return the log of the mean of ``exp(log_p)``.

    Evaluated as ``log_sum_exp(log_p) - log(M)`` where ``M = len(log_p)``,
    so the averaging happens on the log scale.
    """
    n_terms = len(log_p)
    log_total = log_sum_exp(log_p)
    return log_total - np.log(n_terms)

### Load data

In [None]:
# Read all_data.csv into df_all; the bare name displays it as the cell output.
df_all = pd.read_csv("../../analysis_data/all_data.csv")
df_all

In [None]:
# Basic nan handling across all obs:
# log10 of zero citations per year is -inf, which is stored as NaN instead.
df_all["log_cpy"] = np.log10(df_all["citations_per_year"]).replace(-np.inf, np.nan)

In [None]:
# Make the field category more useful
fields_as_category = df_all["fields_of_study_0"].astype("category")
df_all["fields_of_study_0"] = fields_as_category
# Category codes start at 0; shift by one to get 1-based codes.
# NOTE(review): a missing field has code -1 and therefore becomes 0 here -- confirm
# that is acceptable downstream.
df_all["stan_field_code"] = fields_as_category.cat.codes + 1
field_names = fields_as_category.cat.categories
field_names

In [None]:
# Get the vectorizer names: the distinct values of the "vectorizer" column, sorted
vectorizer_names = sorted(df_all["vectorizer"].unique())
vectorizer_names

In [None]:
# Load raw data: every sub-directory of `fit_dir` is treated as one model and
# every .csv file inside it as one chain of that model.
# Paths are built with os.path.join rather than an f-string that reuses double
# quotes, which is a SyntaxError on Python < 3.12.
# Directory listings are sorted because os.listdir returns entries in arbitrary
# order, which would make the row order of `draws` vary between machines.
models = sorted(
    name
    for name in os.listdir(config["fit_dir"])
    if os.path.isdir(os.path.join(config["fit_dir"], name))
)
draws = []
# Loop through models
for model_i in models:
    model_dir_i = os.path.join(config["fit_dir"], model_i)
    files_j = sorted(name for name in os.listdir(model_dir_i) if name.endswith(".csv"))
    draws_i = []
    # Loop through chains per model
    for file_j in files_j:
        # Anything after a "#" is skipped by the parser
        # (presumably the metadata lines of the Stan output -- confirm).
        chain_j = pd.read_csv(os.path.join(model_dir_i, file_j), comment="#")
        # Chain label: the text after the last "_" in the part of the file name
        # that precedes the first ".".
        chain_j["chain"] = file_j.split(".")[0].split("_")[-1]
        draws_i.append(chain_j)
    draws_i = pd.concat(draws_i, ignore_index=True)
    draws_i["model"] = model_i
    draws.append(draws_i)
draws = pd.concat(draws, ignore_index=True)
draws

In [None]:
# deep=True measures object-dtype columns (the string-valued "chain" and "model")
# by their contents; the default shallow count only includes the pointers and
# therefore under-reports the footprint.
print(f"Draws df using {draws.memory_usage(deep=True).sum() / (1024 ** 3):.3f} GB of memory")

In [15]:
# Load results, train, and test data (first CSV column becomes the index).
# os.path.join is used instead of an f-string that reuses double quotes, which
# is a SyntaxError on Python < 3.12.
results = pd.read_csv(os.path.join(config["fit_dir"], "results.csv"), index_col=0)
df_train = pd.read_csv(os.path.join(config["fit_dir"], "train_data.csv"), index_col=0)
df_test = pd.read_csv(os.path.join(config["fit_dir"], "test_data.csv"), index_col=0)

# EDA

## Model Performance

In [16]:
# Work on a copy: `performance` is reshaped below while `results` is reused later.
performance = results.copy()

In [17]:
# List out the columns related to metrics:
# one per metric label, plus a "<label>_descaled" twin for each.
metric_cols = list(metrics)
descaled_metric_cols = [label + "_descaled" for label in metric_cols]
all_metric_cols = [*metric_cols, *descaled_metric_cols]

In [None]:
# Add standardized versions of the metrics: StandardScaler rescales every metric
# column to zero mean and unit variance across the rows of `performance`
# (it is NOT a min-max scaling to the [0, 1] range).
# This is different from the "descaled" below, which refers to whether or not y_pred
# was returned to its unscaled version before the metric was calculated
metric_scaler = StandardScaler()
relative_performance = performance.copy()
relative_performance[all_metric_cols] = metric_scaler.fit_transform(X=performance[all_metric_cols])
relative_performance["metric_is_scaled"] = True
performance["metric_is_scaled"] = False
# Keep the original index labels on both halves (no ignore_index): the index is
# turned into the "model" column afterwards, so renumbering the rows here would
# give the scaled copy of each row a different label than its unscaled original.
performance = pd.concat([performance, relative_performance])
performance

In [None]:
# Convert metrics into long format: one row per (original row, metric column).
nonmetric_cols = performance.columns.difference(all_metric_cols)
performance = (
    performance.melt(
        id_vars=nonmetric_cols,
        var_name="metric_type",
        value_name="metric",
        ignore_index=False,
    )
    # The retained index becomes a regular column, renamed from "index" to "model".
    .reset_index()
    .rename(columns={"index": "model"})
)
performance

In [None]:
# Split out the "descaled" into a separate column:
# rows whose metric name carries "_descaled" are flagged as not scaled, and the
# marker is then removed so both variants share one metric name.
is_descaled = performance["metric_type"].str.contains("_descaled", regex=False)
performance["y_pred_is_scaled"] = ~is_descaled
performance["metric_type"] = performance["metric_type"].str.replace(
    "_descaled", "", regex=False
)
performance

In [None]:
# Typical ranges for metrics (rows where the metric itself was not standardized)
sns.stripplot(
    data=performance.loc[performance["metric_is_scaled"].eq(False)],
    x="metric",
    y="metric_type",
    hue="y_pred_is_scaled",
)

In [None]:
# Performance for metrics scaled by their mean and std
sns.stripplot(
    data=performance.loc[performance["metric_is_scaled"].eq(True)],
    x="metric",
    y="metric_type",
    hue="y_pred_is_scaled",
)

## Slopes

In [None]:
# Melt betas into new DF: one row per (results row, "beta[...]" column).
# Every column without "beta" in its name is carried along as an identifier.
beta_cols = [col for col in results.columns if col.startswith("beta[")]
nonbeta_cols = [col for col in results.columns if "beta" not in col]
betas = (
    results.melt(
        id_vars=nonbeta_cols,
        value_vars=beta_cols,
        var_name="beta_type",
        value_name="beta",
        ignore_index=False,
    )
    # Turn the index into a column
    .reset_index()
    .rename(columns={"index": "model"})
)
betas

In [None]:
# Get the field and variable out of the beta name.
# beta_vars holds whatever sits between the square brackets of "beta[...]".
beta_vars = betas["beta_type"].str.findall(r"beta\[(.*)\]").str[0]
# Field: everything after the first comma; names without a comma get "All".
betas["field"] = beta_vars.str.findall(r"\,(.*)").str[0].fillna("All")
# Variable: everything before the first comma.
betas["variable"] = beta_vars.str.split(",").str[0]
betas

In [None]:
# Add in the mu_betas, since those are mean betas
mubeta_cols = [col for col in results.columns if "mu_beta" in col]
mubetas = (
    results.melt(
        id_vars=nonbeta_cols,
        value_vars=mubeta_cols,
        var_name="beta_type",
        value_name="beta",
        ignore_index=False,
    )
    .reset_index()
    .rename(columns={"index": "model"})
)
# Variable name is the text between the brackets of "mu_beta[...]";
# these rows are all labelled with the field "All".
mubetas["variable"] = mubetas["beta_type"].str.findall(r"mu_beta\[(.*)\]").str[0]
mubetas["field"] = "All"
mubetas

In [26]:
# Combine and clean up: stack betas and mu_betas, strip "_j" from the variable
# names, store variable/field as categoricals and replace missing betas by 0.
# (Alternative kept for reference: drop the missing betas with
# dropna(subset="beta") instead of filling them.)
betas = pd.concat([betas, mubetas], ignore_index=True).assign(
    variable=lambda frame: frame["variable"].str.replace("_j", "").astype("category"),
    field=lambda frame: frame["field"].astype("category"),
    beta=lambda frame: frame["beta"].fillna(value=0.),
)

In [None]:
# Get an overview of the range of possible correlations
# (histogram of the whole "beta" column; betas that were missing have been
# filled with 0 in the previous cell, so they are counted at zero).
# NOTE(review): the plotted values are the betas of the "Slopes" section --
# confirm that "correlations" is the intended wording.
sns.histplot(betas["beta"])

In [None]:
# Look at betas per variable
fig, ax = plt.subplots()

sns.stripplot(data=betas, x="beta", y="variable", hue="field", ax=ax)

# Vertical reference line at beta = 0, drawn behind the points.
ax.axvline(0, color="k", zorder=-1)

In [None]:
# Same plot as above, but with a y-axis that shows model performance
fg = sns.relplot(
    data=betas,
    x="beta",
    y=r"$R^2$_descaled",
    row="variable",
    hue="field",
)

# One panel per variable, stacked in a single column of the grid.
for ax in fg.axes[:, 0]:
    # Reference line at beta = 0, drawn behind the points.
    ax.axvline(0, color="k", zorder=-1)
    # Show the x tick labels on every panel.
    ax.tick_params(labelbottom=True)

plt.tight_layout()