# Hierarchical Modeling
What models fit the data?

# Setup

In [None]:
import pandas as pd
from tqdm import tqdm
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

#### Parameters

In [None]:
# Notebook-wide settings, read by the binning helpers and the evaluation cells.
config = dict(
    bins=16,  # number of histogram bins
    min_bin_percentile=10,  # bin edges span this percentile up to (100 - this)
    eval_sample_size=1000,  # rows kept for model evaluation; None disables downsampling
)

#### Load data

In [None]:
# Load the full analysis table; the bare name on the last line displays it in the notebook.
df_all = pd.read_csv(filepath_or_buffer="../../analysis_data/all_data.csv")
df_all

In [None]:
# Basic nan handling across all obs:
# log10 of a zero citation rate is -inf, which is recorded as missing instead.
df_all["log_cpy"] = np.log10(df_all["citations_per_year"]).replace(-np.inf, np.nan)

In [None]:
# Sorted list of the distinct primary fields of study present in the data
field_names = sorted(pd.unique(df_all["fields_of_study_0"]))
field_names

In [None]:
# Sorted list of the distinct vectorizers present in the data
vectorizer_names = sorted(pd.unique(df_all["vectorizer"]))
vectorizer_names

# Plot

### Brief aside on logscale plotting

In [None]:
# Compare log10(x) against the shifted variants log10(1 + x) and log10(1 + x) - 1
# over six decades of x.
fig, ax = plt.subplots()

x = np.logspace(-3, 3, 100)
log10 = np.log10(x)
log10p = np.log10(1 + x)
log10p2 = log10p - 1

ax.plot(x, log10, label="log10")
ax.plot(x, log10p, label="log10p")
ax.plot(x, log10p2, label="log10p - 1")

ax.legend()

# Logarithmic x axis so every decade gets equal width
ax.set_xscale("log")

### Overview Normalized 2D Histogram

In [None]:
def get_bins(x, n_bins=None):
    """Return equally spaced bin edges covering the central percentile range of ``x``.

    The edges run from the ``config["min_bin_percentile"]``-th percentile of
    ``x`` to the ``100 - config["min_bin_percentile"]``-th percentile; NaNs in
    ``x`` are ignored when computing the percentiles.

    Parameters
    ----------
    x : array-like
        Values to be binned.
    n_bins : int, optional
        Number of bins. Defaults to ``config["bins"]``, looked up at call time
        so that editing ``config`` takes effect without redefining this
        function (a default written in the signature would be frozen when the
        ``def`` is executed).

    Returns
    -------
    numpy.ndarray
        ``n_bins + 1`` bin edges.
    """
    if n_bins is None:
        n_bins = config["bins"]

    trim = config["min_bin_percentile"]
    lower = np.nanpercentile(x, trim)
    upper = np.nanpercentile(x, 100 - trim)

    # n_bins bins need n_bins + 1 edges
    return np.linspace(lower, upper, n_bins + 1)


# Density bin edges computed separately for each vectorizer, keyed by vectorizer name
density_bins = {
    name: get_bins(densities)
    for name, densities in df_all.groupby("vectorizer")["density"]
}

# The n_bins=17 is to ensure we don't accidentally flip axes: with a different
# number of bins on each axis, a transposed histogram no longer matches the edges.
cpy_bins = get_bins(df_all["log_cpy"], n_bins=17)

In [None]:
def custom_histplot(data, x, y, normed=True, *args, **kwargs):
    """Draw a 2D histogram of ``data[y]`` against ``data[x]`` on the current axes.

    Written to be passed to ``FacetGrid.map_dataframe``, which calls it once
    per facet with that facet's rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows for one facet.
    x, y : str
        Column names for the horizontal and vertical axes. The x bin edges are
        computed from ``data[x]`` with ``get_bins``; the y bin edges are the
        module-level ``cpy_bins``.
    normed : bool, default True
        If True, divide the counts in every x bin by the total number of
        observations whose x value falls in that bin and draw the result with
        ``plt.pcolormesh``. If False, delegate to ``sns.histplot``.
    *args, **kwargs
        Forwarded to ``sns.histplot`` when ``normed`` is False; ignored
        otherwise.
    """
    density_bins = get_bins(data[x])

    if not normed:
        sns.histplot(data, x=x, y=y, bins=(density_bins, cpy_bins), *args, **kwargs)
        return

    hist2d, _, _ = np.histogram2d(data[x], data[y], bins=(density_bins, cpy_bins))
    density_hist, _ = np.histogram(data[x], bins=density_bins)

    # hist2d is indexed (x bin, y bin), so the per-x-bin totals are broadcast
    # down the rows. An empty x bin gives 0/0 = NaN; that result is kept, only
    # the accompanying RuntimeWarning is silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        hist2d_normed = hist2d / density_hist[:, np.newaxis]

    # Plot the data; pcolormesh expects (y, x) ordering, hence the transpose
    plt.pcolormesh(
        density_bins,
        cpy_bins,
        hist2d_normed.T,
        shading="auto",
        cmap="viridis",
    )

In [None]:
# One panel per (vectorizer, field) pair; each row keeps its own x range
fg = sns.FacetGrid(df_all, col="fields_of_study_0", row="vectorizer", sharex=False)
fg.map_dataframe(custom_histplot, normed=True, x="density", y="log_cpy")
fg.set_titles(template="{row_name} | {col_name}")

### Focused Normalized Histogram

In [None]:
# Restrict the focused plots to the two vectorizers of interest
df = df_all.query(expr="vectorizer in ['SciBERT', 'Word2Vec']")

In [None]:
# Same normalized histogram grid as the overview, on the focused subset only
fg = sns.FacetGrid(df, col="fields_of_study_0", row="vectorizer", sharex=False)
fg.map_dataframe(custom_histplot, normed=True, x="density", y="log_cpy")
fg.set_titles(template="{row_name} | {col_name}")

# Statistical Modeling

Note to future self: cmdstanpy and cmdstan advertise conda as *the* way to install.
Don't listen to them.
I couldn't get it to compile when I used conda for anything cmdstanpy-related.
What worked: I downloaded the cmdstan repo and compiled it manually, and used pip for cmdstanpy.

In [None]:
import cmdstanpy

## Train-Test Split

In [None]:
import scipy
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [None]:
from sklearn.preprocessing import scale

In [None]:
from scipy.stats import binned_statistic

In [None]:
# Pivot to wide format (one row per identifier, one density column per vectorizer)
# so we can use the same train_test division for every vectorizer
df_eval = pd.pivot_table(
    df_all, values="density", index="identifier", columns="vectorizer"
)

# Capture the vectorizer column labels before the response column is appended
vectorizers = df_eval.columns

# NOTE(review): the response is read from the first vectorizer's column only, so an
# identifier with no row for that vectorizer ends up with NaN log_cpy — confirm
# every identifier is present for every vectorizer.
log_cpy = pd.pivot_table(
    df_all, values="log_cpy", index="identifier", columns="vectorizer"
).iloc[:, 0]
df_eval["log_cpy"] = log_cpy

# Drop the leftover "vectorizer" label on the column index
df_eval.columns.name = None
df_eval

In [None]:
# Apply scaling (normalization by mean and sigma) to help with modelling.
# Every column, including log_cpy, is standardized independently.
# NOTE(review): this runs before the train/test split, so the scaling statistics
# include the held-out rows — confirm that is acceptable.
df_eval = df_eval.apply(scale, axis=0)
df_eval

In [None]:
# Downsample to at most config["eval_sample_size"] rows (None keeps everything).
if config["eval_sample_size"] is not None:
    # DataFrame.sample raises when asked for more rows than exist (without
    # replacement), so clamp the request to the available row count.
    df_eval = df_eval.sample(min(config["eval_sample_size"], len(df_eval)))

In [None]:
# Train-test split: one split of the wide table, so every vectorizer is fit and
# evaluated on the same rows. No random_state is passed, so the split changes
# from run to run.
df_train, df_test = train_test_split(df_eval)

## Global fits
$(c \sim \rho_t)$ vs $(c \sim \rho_v)$

In [None]:
# Base name of the Stan program; resolved to ../stan_models/<name>.stan when the model is compiled
stan_model = "globalreg"

In [None]:
# Dictionary to hold modeling output; each list gains one entry per vectorizer
# and the whole thing is converted to a DataFrame once the loop finishes.
output = {
    "vectorizer": [],
    "alpha": [],
    "beta": [],
    "sigma": [],
    "rmse": [],
    "log_ppd": [],
}

# Compile model once: the Stan program is the same for every vectorizer
model = cmdstanpy.CmdStanModel(stan_file=f"../stan_models/{stan_model}.stan")

# Loop through vectorizers
for vectorizer_i in vectorizers:

    # Drop na, separately for each split. The held-out rows must come from
    # df_test; building them from df_train would score the model on the data
    # it was fit to.
    df_train_i = df_train.dropna(subset=[vectorizer_i, "log_cpy"])
    df_test_i = df_test.dropna(subset=[vectorizer_i, "log_cpy"])

    # Format data
    data = {
        "N": len(df_train_i),
        "x": df_train_i[vectorizer_i].values,
        "y": df_train_i["log_cpy"].values,
        "N_test": len(df_test_i),
        "x_test": df_test_i[vectorizer_i].values,
        "y_test": df_test_i["log_cpy"].values,
    }

    # Fit model
    fit = model.sample(
        data=data,
    )

    # Parse output: split the draws into y_tilde columns and everything else.
    # NOTE(review): assumes the Stan program emits one y_tilde column per test
    # row, in the order of x_test — confirm against the .stan file.
    draws = fit.draws_pd()
    y_tilde_cols = [col for col in draws.columns if 'y_tilde' in col]
    columns = [col for col in draws.columns if 'y_tilde' not in col]

    # Point prediction per y_tilde column: the mean over all draws
    y_pred_i = draws[y_tilde_cols].mean(axis=0).values

    # Store label
    output["vectorizer"].append(vectorizer_i)

    # Store parameters (median over draws)
    output["alpha"].append(draws["alpha"].median())
    output["beta"].append(draws["beta"].median())
    output["sigma"].append(draws["sigma"].median())

    # Calculate rmse on the held-out rows
    rmse_i = root_mean_squared_error(df_test_i["log_cpy"], y_pred_i)
    output["rmse"].append(rmse_i)

    # Calculate log posterior predictive density: the log of the mean over
    # draws of exp(log_p), evaluated stably with logsumexp
    log_ppd = scipy.special.logsumexp(draws["log_p"]) - np.log(len(draws))
    output["log_ppd"].append(log_ppd)
output = pd.DataFrame(output)

In [None]:
# Display the per-vectorizer summary table (parameter medians, rmse, log_ppd)
output

In [None]:
def medianplot(data, x, y, bins, ax):
    """Plot the binned median of ``data[y]`` versus ``data[x]`` with a percentile band.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both columns.
    x, y : str
        Names of the column to bin on and the column to summarize.
    bins : numpy.ndarray
        Bin edges along x.
    ax : matplotlib axes
        Axes to draw on. The median is drawn as a black line, the 16th to 84th
        percentile range as a light black band, and the x limits are set to
        the first and last bin edge.
    """
    xs = data[x]
    ys = data[y]

    def _per_bin(stat):
        # binned_statistic returns (statistic, bin_edges, binnumber);
        # only the per-bin statistic is needed here
        return binned_statistic(xs, ys, statistic=stat, bins=bins)[0]

    midpoints = (bins[1:] + bins[:-1]) / 2

    # Running median (NaN-aware)
    ax.plot(midpoints, _per_bin(np.nanmedian), color="k")

    # Running 16th and 84th percentiles (NaN-aware)
    lower = _per_bin(lambda v: np.nanpercentile(v, 16))
    upper = _per_bin(lambda v: np.nanpercentile(v, 84))
    ax.fill_between(midpoints, lower, upper, color="k", alpha=0.1)

    ax.set_xlim(bins[0], bins[-1])

In [None]:
# Current seaborn color palette; palette[0] colors the regression overlay in the fit plots
palette = sns.color_palette()

In [None]:
# Shared bin edges for the fit plots, pooled over all vectorizer columns.
# The right edge mirrors the left one about zero.
# NOTE(review): this yields config["bins"] edges, i.e. one bin fewer than get_bins
# produces for the same setting — confirm that is intended.
left_edge = np.nanpercentile(
    df_eval[vectorizers].to_numpy().ravel(),
    config["min_bin_percentile"],
)
right_edge = -left_edge
bins = np.linspace(left_edge, right_edge, num=config["bins"])

In [None]:
# One figure per fitted vectorizer: binned median of the evaluation data with
# the fitted regression line and its +/- sigma band drawn on top.
for i, row in output.iterrows():
    fig = plt.figure()
    ax = plt.gca()

    medianplot(
        data=df_eval,
        x=row["vectorizer"],
        y="log_cpy",
        bins=bins,
        ax=ax,
    )

    # Plot the regression, evaluated at the bin edges
    xs = bins
    ys = row["alpha"] + row["beta"] * xs
    ax.plot(
        xs,
        ys,
        color=palette[0],
    )
    ax.fill_between(
        xs,
        ys - row["sigma"],
        ys + row["sigma"],
        color=palette[0],
        alpha=0.4,
    )

    # Single quotes inside the f-string: reusing the enclosing double quotes
    # is a SyntaxError on Python versions before 3.12.
    ax.set_xlabel(f"density [{row['vectorizer']}]")
    ax.set_ylabel("log_cpy")

    