# Hierarchical Modeling
What models fit the data?

# Setup

### Imports

In [1]:
# Stdlib imports
import os
import shutil

In [2]:
# Calculation imports
import cmdstanpy
import numpy as np
import pandas as pd
import scipy
from scipy.stats import norm, binned_statistic
from sklearn.metrics import root_mean_squared_error
import sklearn.metrics as sk_metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [3]:
# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

### Parameters

In [4]:
# Notebook-wide settings, looked up by later cells as config[...].
config = {
    # Number of bins — presumably for binned statistics computed later; usage not visible here.
    "bins": 16,
    # Presumably a percentile cutoff applied when binning; TODO confirm against the cell that uses it.
    "min_bin_percentile": 10,
    # Column names; presumably the non-density covariates used in the fits — TODO confirm.
    "nonden_fit_cols": ["references", "year"],
    # Directory holding results.csv, train_data.csv, test_data.csv and one sub-directory per model.
    "fit_dir": "../../analysis_data/stan_fits/stan_fit_1000",
    # Name of the integer field-code column that is added to df_all further down.
    "cat_col": "stan_field_code",
}


In [5]:
# Current seaborn color palette; presumably used to color later plots (usage not visible here).
palette = sns.color_palette()

In [6]:
def log_sum_exp(mu):
    """Calculate the log of a sum of exponentials, ``log(sum(exp(mu)))``.

    The maximum is pulled out of the sum before exponentiating to maintain
    precision (and avoid overflow/underflow) for large-magnitude inputs.

    Parameters
    ----------
    mu : array_like
        Log-scale values to combine.

    Returns
    -------
    float
        ``log(sum(exp(mu)))``. When the maximum of ``mu`` is a non-finite
        scalar it is returned as is: ``-inf`` if every element is ``-inf``,
        ``+inf`` if the largest element is ``+inf``, ``nan`` if the maximum
        is ``nan``.

    Raises
    ------
    ValueError
        If ``mu`` is empty (raised by ``np.max``).
    """
    mu_max = np.max(mu)

    # With a non-finite maximum, ``mu - mu_max`` would evaluate inf - inf = nan
    # and poison the result, so return the limiting value directly. The check
    # is restricted to scalar maxima so any non-scalar pass-through behaviour
    # is left untouched.
    if np.ndim(mu_max) == 0 and not np.isfinite(mu_max):
        return mu_max

    summation = np.sum(np.exp(mu - mu_max))
    return mu_max + np.log(summation)

In [7]:
def log_ppd(log_p):
    """Return the log of the average of ``exp(log_p)`` for a 1-D input.

    Evaluated in log space as ``log_sum_exp(log_p) - log(M)`` with
    ``M = len(log_p)``. Judging by the name, ``log_p`` presumably holds
    per-draw log predictive densities — not confirmed by the code shown here.
    """
    log_count = np.log(len(log_p))
    return log_sum_exp(log_p) - log_count

### Load data

In [None]:
# Load all_data.csv (path is relative to the notebook's working directory)
df_all = pd.read_csv("../../analysis_data/all_data.csv")
# Bare expression so the notebook shows the frame as the cell output
df_all

In [None]:
# Basic nan handling across all obs
# Mask non-positive citation rates *before* taking the log: they become NaN
# directly, with no divide-by-zero / invalid-value RuntimeWarnings and no
# -inf values to clean up afterwards. The resulting column is unchanged:
# log10 of positive values, NaN everywhere else.
df_all["log_cpy"] = np.log10(
    df_all["citations_per_year"].where(lambda cpy: cpy > 0)
)

In [None]:
# Make the field category more useful
# Converting to a categorical dtype gives every distinct field an integer code.
df_all["fields_of_study_0"] = df_all["fields_of_study_0"].astype("category")
# Shift the 0-based category codes to 1-based (presumably because Stan indexes from 1).
# NOTE(review): pandas assigns code -1 to missing values, so rows with a missing
# field end up with stan_field_code 0 — confirm they are dropped before fitting.
df_all["stan_field_code"] = df_all["fields_of_study_0"].cat.codes + 1
# Category labels in code order: field_names[k - 1] is the label for stan_field_code k.
field_names = df_all["fields_of_study_0"].cat.categories
field_names

In [None]:
# Get the vectorizer names: the distinct values of the "vectorizer" column, sorted
vectorizer_names = sorted(df_all["vectorizer"].unique())
# Bare expression so the notebook shows the list as the cell output
vectorizer_names

In [12]:
# Load results, train, and test data from the fit directory
# (the first CSV column of each file is used as the index).
# The inner quotes are single quotes on purpose: reusing the outer quote type
# inside an f-string replacement field is a SyntaxError before Python 3.12.
results = pd.read_csv(f"{config['fit_dir']}/results.csv", index_col=0)
df_train = pd.read_csv(f"{config['fit_dir']}/train_data.csv", index_col=0)
df_test = pd.read_csv(f"{config['fit_dir']}/test_data.csv", index_col=0)

In [None]:
# Load raw data: every sub-directory of the fit directory is treated as one
# model, and every .csv file inside it as one chain (presumably the per-chain
# Stan output files — confirm against the fitting notebook).
# Entries are sorted because os.listdir returns them in arbitrary order, which
# would make the model order and the row order of `draws` irreproducible.
# Inner quotes in the f-strings are single quotes so the cell also parses on
# Python versions before 3.12.
models = sorted(
    name
    for name in os.listdir(config["fit_dir"])
    if os.path.isdir(f"{config['fit_dir']}/{name}")
)
draws = []
# Loop through models
for model_i in models:
    model_dir_i = f"{config['fit_dir']}/{model_i}"
    files_j = sorted(
        name for name in os.listdir(model_dir_i) if name.endswith(".csv")
    )
    draws_i = []
    # Loop through chains per model
    for file_j in files_j:
        # Lines starting with "#" are skipped as comments rather than parsed as data
        chain_j = pd.read_csv(f"{model_dir_i}/{file_j}", comment="#")
        # Chain label (kept as a string): the last "_"-separated token of the
        # file name before its first "."
        chain_j["chain"] = file_j.split(".")[0].split("_")[-1]
        draws_i.append(chain_j)
    # pd.concat raises ValueError if a model directory contains no .csv files
    draws_i = pd.concat(draws_i, ignore_index=True)
    draws_i["model"] = model_i
    draws.append(draws_i)
# One long frame: all chains of all models, tagged by "chain" and "model"
draws = pd.concat(draws, ignore_index=True)
draws

In [None]:
# deep=True also counts the contents of object/string columns such as "chain"
# and "model"; the default shallow count only covers their per-row references
# and therefore under-reports the frame's real footprint.
print(f"Draws df using {draws.memory_usage(deep=True).sum() / (1024 ** 3):.3f} GB")