# Hierarchical Modeling
What models fit the data?

# Setup

In [None]:
import pandas as pd
from tqdm import tqdm
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

#### Parameters

In [None]:
# Notebook-wide settings; the cells below look these up by key.
config = dict(
    bins=16,  # default number of bins in get_bins
    min_bin_percentile=10,  # get_bins spans this percentile up to (100 - this)
    eval_sample_size=1000,  # rows kept when downsampling df_eval; None skips it
)

#### Load data

In [None]:
# Read the combined analysis table. The cells below use its
# "citations_per_year", "fields_of_study_0", "vectorizer", "density" and
# "identifier" columns.
df_all = pd.read_csv("../../analysis_data/all_data.csv")
df_all

In [None]:
# Add a log10 citations-per-year column for every observation; any entry that
# comes out as -inf is stored as NaN instead.
df_all["log_cpy"] = np.log10(df_all["citations_per_year"]).replace(-np.inf, np.nan)

In [None]:
# Distinct values of the "fields_of_study_0" column, in sorted order.
field_names = sorted(pd.unique(df_all["fields_of_study_0"]))
field_names

In [None]:
# Distinct values of the "vectorizer" column, in sorted order.
vectorizer_names = sorted(pd.unique(df_all["vectorizer"]))
vectorizer_names

# Plot

### Brief aside on logscale plotting

In [None]:
# Compare log10(x), log10(1 + x) and log10(1 + x) - 1 on a log-scaled x axis.
fig, ax = plt.subplots()

# 100 points spaced evenly in log space between 1e-3 and 1e3
x = np.logspace(-3, 3, 100)
log10 = np.log10(x)
log10p = np.log10(1 + x)
log10p2 = log10p - 1

ax.plot(x, log10, label="log10")
ax.plot(x, log10p, label="log10p")
ax.plot(x, log10p2, label="log10p - 1")

ax.legend()

ax.set_xscale("log")

### Overview Normalized 2D Histogram

In [None]:
def get_bins(x, n_bins=config["bins"]):
    """Return evenly spaced bin edges covering the central range of ``x``.

    The edges start at the ``config["min_bin_percentile"]``-th percentile of
    ``x`` and end at the ``100 - config["min_bin_percentile"]``-th percentile.
    NaN entries are ignored when the percentiles are computed.

    Parameters
    ----------
    x : array-like
        Values to derive the bin range from.
    n_bins : int, optional
        Number of bins. The default is ``config["bins"]`` as it was when this
        function was defined.

    Returns
    -------
    numpy.ndarray
        ``n_bins + 1`` bin edges.
    """
    tail = config["min_bin_percentile"]
    lowest_edge = np.nanpercentile(x, tail)
    highest_edge = np.nanpercentile(x, 100 - tail)
    return np.linspace(lowest_edge, highest_edge, n_bins + 1)


# Bin edges for "density", computed separately for each vectorizer and stored
# as a dict keyed by vectorizer name.
# NOTE(review): custom_histplot assigns its own local `density_bins` from the
# data it is given, so this dict does not appear to be read by the plotting
# cells visible here - confirm whether it is still needed.
density_bins = df_all.groupby("vectorizer")["density"].apply(get_bins).to_dict()
# Bin edges for log_cpy, computed once over all rows; custom_histplot reads
# this module-level name for the y axis.
cpy_bins = get_bins(
    df_all["log_cpy"], n_bins=17
)  # The n_bins=17 is to ensure we don't accidentally flip axes

In [None]:
def custom_histplot(data, x, y, normed=True, *args, **kwargs):
    """Draw a 2D histogram of ``data[y]`` against ``data[x]`` on the current axes.

    Used as the plotting callback for ``FacetGrid.map_dataframe``.

    Parameters
    ----------
    data : DataFrame
        Rows to plot; must contain the columns named by ``x`` and ``y``.
    x, y : str
        Column names for the horizontal and vertical axes.
    normed : bool, optional
        If True, the 2D counts in each x-bin are divided by the number of
        points whose ``x`` falls in that bin (the 1D histogram of ``x``), and
        the result is drawn with ``plt.pcolormesh``. If False, plotting is
        delegated to ``sns.histplot`` with the same bin edges.
    *args, **kwargs
        Forwarded to ``sns.histplot`` when ``normed`` is False; ignored
        otherwise.

    Returns
    -------
    None
    """
    # x edges are recomputed from the rows handed in; y edges come from the
    # module-level cpy_bins.
    x_edges = get_bins(data[x])

    if not normed:
        sns.histplot(data, x=x, y=y, bins=(x_edges, cpy_bins), *args, **kwargs)
        return

    counts, _, _ = np.histogram2d(data[x], data[y], bins=(x_edges, cpy_bins))
    x_totals, _ = np.histogram(data[x], bins=x_edges)
    x_totals = x_totals[:, np.newaxis]  # column vector: one total per x-bin

    # An x-bin with no points would be 0/0. Leave it as NaN (the value the
    # plain division produced) without triggering a RuntimeWarning.
    fractions = np.full_like(counts, np.nan)
    np.divide(counts, x_totals, out=fractions, where=x_totals > 0)

    # pcolormesh expects rows to run along y, hence the transpose.
    plt.pcolormesh(
        x_edges,
        cpy_bins,
        fractions.T,
        shading="auto",
        cmap="viridis",
    )

In [None]:
# One facet per (vectorizer, field of study) pair, built from every row of
# df_all. Each facet is drawn by custom_histplot with density on x and log_cpy
# on y, using the normalized branch.
fg = sns.FacetGrid(data=df_all, row="vectorizer", col="fields_of_study_0", sharex=False)
fg.map_dataframe(custom_histplot, x="density", y="log_cpy", normed=True)
fg.set_titles("{row_name} | {col_name}")

### Focused Normalized Histogram

In [None]:
# Keep only the rows whose vectorizer is SciBERT or Word2Vec.
df = df_all.query("vectorizer in ['SciBERT', 'Word2Vec']")

In [None]:
# Same faceted, normalized histogram as the overview plot, but drawn from the
# filtered frame `df` instead of df_all.
fg = sns.FacetGrid(data=df, row="vectorizer", col="fields_of_study_0", sharex=False)
fg.map_dataframe(custom_histplot, x="density", y="log_cpy", normed=True)
fg.set_titles("{row_name} | {col_name}")

# Statistical Modeling

Note to future self: cmdstanpy and cmdstan advertise conda as *the* way to install.
Don't listen to them.
I couldn't get it to compile when I used conda for anything cmdstanpy-related.
What worked: I downloaded the repo and compiled it manually, and installed cmdstanpy with pip.

In [None]:
import cmdstanpy

## Train-Test Split

In [None]:
import scipy
from sklearn.model_selection import train_test_split

In [None]:
# Pivot to wide format so we can use the same train_test division
# Result: one row per identifier, one density column per vectorizer.
df_eval = df_all.pivot_table(index="identifier", columns="vectorizer", values="density")
# log_cpy is pivoted the same way and then only the first vectorizer column is
# kept (.iloc[:, 0]).
# NOTE(review): this presumably relies on log_cpy being identical across
# vectorizers for a given identifier; an identifier with no value in that first
# column would end up with NaN here - confirm.
log_cpy = df_all.pivot_table(index="identifier", columns="vectorizer", values="log_cpy").iloc[:, 0]
df_eval["log_cpy"] = log_cpy
# Clear the "vectorizer" label that the pivot leaves on the column index.
df_eval.columns.name = None
df_eval

In [None]:
# Downsample to at most config["eval_sample_size"] rows; None keeps every row.
# The request is capped at the table size because DataFrame.sample raises when
# asked for more rows than exist (sampling without replacement).
if config["eval_sample_size"] is not None:
    df_eval = df_eval.sample(min(config["eval_sample_size"], len(df_eval)))

In [None]:
# Train-test split
# Called with train_test_split's defaults: no test_size and no random_state
# are passed, so the split is not seeded from this notebook.
df_train, df_test = train_test_split(df_eval)

## Global fits
$(c \sim \rho_t)$ vs $(c \sim \rho_v)$

In [None]:
# Vectorizer whose density column is the predictor for this fit
vectorizer = "Word2Vec"

# Keep only rows where both the predictor and the target are present.
df_train_i = df_train.dropna(subset=[vectorizer, "log_cpy"])
# The held-out rows must come from df_test. This line previously re-filtered
# df_train, which made the "test" set a copy of the training set.
df_test_i = df_test.dropna(subset=[vectorizer, "log_cpy"])

In [None]:
# Inputs handed to model.sample below: row count, predictor values and target
# values, all taken from df_train_i.
data = dict(
    N=len(df_train_i),
    x=df_train_i[vectorizer].values,
    y=df_train_i["log_cpy"].values,
)

In [None]:
# Load the linear-regression Stan program as a CmdStanModel.
model = cmdstanpy.CmdStanModel(stan_file="../stan_models/linear_regression.stan")

In [None]:
# Sample from the model on the current `data` dict, with every other sampler
# option left at its default.
fit = model.sample(data=data)

In [None]:
# Tabular view of the fit's draws; the next cell indexes it by column name.
draws = fit.draws_pd()

In [None]:
# Log of the average of exp(log_p) over all draws, written as
# logsumexp(log_p) - log(number of draws).
# NOTE(review): "log_p" is presumably a quantity emitted by the Stan program -
# confirm against linear_regression.stan.
scipy.special.logsumexp(draws["log_p"]) - np.log(len(draws))

In [None]:
# Rebind `data` to the row count, predictor values and target values of
# df_test_i; the posterior-predictive cells below read this dict.
data = dict(
    N=len(df_test_i),
    x=df_test_i[vectorizer].values,
    y=df_test_i["log_cpy"].values,
)

In [None]:
# NOTE(review): `new_quantities` is first assigned further down, in the
# posterior predictive check section (model_ppc.generate_quantities). When the
# notebook is run top to bottom this cell is reached before that assignment,
# so the name is not yet defined here.
new_quantities

In [None]:
# Show parameters
# Pairwise view of the alpha, beta and sigma draws: 2D histograms below the
# diagonal, KDE curves on it, scatter plots above it.
pg = sns.PairGrid(data=fit.draws_pd(), vars=["alpha", "beta", "sigma"], diag_sharey=False)
pg.map_lower(sns.histplot)
pg.map_diag(sns.kdeplot)
pg.map_upper(sns.scatterplot)

### Posterior Predictive Check

In [None]:
# Second Stan program, used with generate_quantities in the next cell.
model_ppc = cmdstanpy.CmdStanModel(stan_file='../stan_models/linear_regression_ppc.stan')

In [None]:
# Run model_ppc's generate_quantities on the current `data` dict, passing the
# earlier `fit` as previous_fit.
new_quantities = model_ppc.generate_quantities(data=data, previous_fit=fit)

In [None]:
# Tabular draws of the generated quantities.
# NOTE(review): inc_sample=True presumably adds the columns of the original
# fit alongside the new quantities - confirm against the cmdstanpy docs.
samples = new_quantities.draws_pd(inc_sample=True)

In [None]:
# Names of every column of `samples` that contains the substring 'y_rep'.
y_rep_cols = list(filter(lambda name: 'y_rep' in name, samples.columns))

In [None]:
# Average each y_rep column over the rows of `samples`: one mean per column.
means = samples[y_rep_cols].mean(axis='rows')

In [None]:
# Per-column mean minus the corresponding entry of data["y"].
# NOTE(review): this assumes y_rep_cols is in the same order, and has the same
# length, as data["y"] - confirm.
diffs = means - data["y"]

In [None]:
# Histogram of every y_rep value (all rows and columns flattened together),
# with the histogram of data["y"] drawn on the same axes.
ax = sns.histplot(samples[y_rep_cols].values.flatten())
sns.histplot(data["y"], ax=ax)

In [None]:
# Distribution of the differences stored in `diffs`.
sns.histplot(data=diffs)