# Hierarchical Modeling
What models fit the data?

# Setup

### Imports

In [None]:
# Calculation imports
import cmdstanpy
import numpy as np
import pandas as pd
import scipy
from scipy.stats import norm, binned_statistic
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from tqdm import tqdm

In [None]:
# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns

#### Parameters

In [None]:
# Notebook-wide settings read by the cells below.
config = {
    # Default bin count for get_bins, and the number of linspace points used
    # for the binned-median and per-bin rmse plots.
    "bins": 16,
    # Percentile p used to clip binning ranges (get_bins uses [p, 100 - p]).
    "min_bin_percentile": 10,
    # Number of rows kept when downsampling df_eval; None skips downsampling.
    "eval_sample_size": 1000,
    # Non-density predictor columns used by the multivariate regressions.
    "nonden_fit_cols": ["references", "year"],
}

In [None]:
# Current seaborn color palette; palette[0] colors the regression overlays below.
palette = sns.color_palette()

In [None]:
def log_ppd(log_p):
    """Return the log of the mean of ``exp(log_p)``.

    This is ``logsumexp(log_p) - log(len(log_p))``, evaluated with the usual
    max-shift so that very negative log densities do not underflow.

    Parameters
    ----------
    log_p : array-like
        Log densities, one entry per draw.
        NOTE(review): callers pass the ``log_p`` column of the Stan draws;
        presumably that is the held-out log predictive density per draw —
        confirm against the Stan programs.

    Returns
    -------
    float
        The log of the average density. ``-inf`` if every entry is ``-inf``.

    Raises
    ------
    ValueError
        If ``log_p`` is empty (raised by ``np.max``).
    """
    log_p = np.asarray(log_p, dtype=float)
    log_p_max = np.max(log_p)

    # If the max is not finite, the shift below would compute inf - inf = nan
    if not np.isfinite(log_p_max):
        return log_p_max

    log_sum_exp = log_p_max + np.log(np.sum(np.exp(log_p - log_p_max)))

    # Averaging in linear space means subtracting log(N), not N, in log space
    return log_sum_exp - np.log(log_p.size)

#### Load data

In [None]:
# Load the full analysis table; the path is relative to the working directory.
df_all = pd.read_csv("../../analysis_data/all_data.csv")
df_all  # display the frame as the cell output

In [None]:
# log10 of citations per year, with the -inf produced by zero counts mapped to NaN
df_all["log_cpy"] = np.log10(df_all["citations_per_year"]).replace(-np.inf, np.nan)

In [None]:
# Sorted distinct values of the primary field-of-study column
field_names = sorted(df_all["fields_of_study_0"].unique())
field_names

In [None]:
# Sorted distinct vectorizer labels; these become column names after the pivot below
vectorizer_names = sorted(df_all["vectorizer"].unique())
vectorizer_names

# Visual Overview

### Brief aside on logscale plotting

In [None]:
# Compare log10(x) with the shifted variants log10(1 + x) and log10(1 + x) - 1
fig = plt.figure()
ax = fig.gca()

x = np.logspace(-3, 3, 100)
log10 = np.log10(x)
log10p = np.log10(1 + x)
log10p2 = log10p - 1

for curve, curve_label in (
    (log10, "log10"),
    (log10p, "log10p"),
    (log10p2, "log10p - 1"),
):
    ax.plot(x, curve, label=curve_label)

ax.legend()

# Log-scale x so the small-x behavior of the three curves is visible
ax.set_xscale("log")

### Overview Normalized 2D Histogram

In [None]:
def get_bins(x, n_bins=config["bins"]):
    """Return ``n_bins + 1`` evenly spaced bin edges over the central range of ``x``.

    The range runs from the ``config["min_bin_percentile"]``-th percentile of
    ``x`` to the mirrored upper percentile, ignoring NaNs.
    """
    tail = config["min_bin_percentile"]
    lower = np.nanpercentile(x, tail)
    upper = np.nanpercentile(x, 100 - tail)
    return np.linspace(lower, upper, n_bins + 1)


# Density bin edges per vectorizer, as a dict keyed by vectorizer name.
# NOTE(review): custom_histplot recomputes its own density bins per facet, so
# this dict does not appear to be read by the cells visible here.
density_bins = df_all.groupby("vectorizer")["density"].apply(get_bins).to_dict()
# log_cpy bin edges computed over all rows and shared by every facet.
cpy_bins = get_bins(
    df_all["log_cpy"], n_bins=17
)  # The n_bins=17 is to ensure we don't accidentally flip axes

In [None]:
def custom_histplot(data, x, y, normed=True, *args, **kwargs):
    """Draw a 2D histogram of ``data[y]`` against ``data[x]`` on the current axes.

    Written for ``FacetGrid.map_dataframe``, which passes one facet's subset
    as ``data``.

    Parameters
    ----------
    data : DataFrame
        Rows to plot.
    x, y : str
        Column names for the horizontal and vertical axes. The x bin edges
        come from ``get_bins(data[x])``; the y bin edges are the module-level
        ``cpy_bins``.
    normed : bool
        If True, divide every x-bin column by the number of rows in that x
        bin and draw the result with ``pcolormesh``. If False, defer to
        ``sns.histplot`` with the same bin edges.
    *args, **kwargs
        Forwarded to ``sns.histplot`` when ``normed`` is False; ignored
        otherwise.
    """
    # Bin edges along x are recomputed for each facet from its own data
    x_edges = get_bins(data[x])

    if not normed:
        sns.histplot(data, x=x, y=y, bins=(x_edges, cpy_bins), *args, **kwargs)
        return

    hist2d, _, _ = np.histogram2d(data[x], data[y], bins=(x_edges, cpy_bins))
    x_counts, _ = np.histogram(data[x], bins=x_edges)

    # An empty x bin gives 0 / 0; the cell stays NaN, but without the warning
    with np.errstate(divide="ignore", invalid="ignore"):
        hist2d_normed = hist2d / x_counts[:, np.newaxis]

    # pcolormesh expects the array as (rows = y, columns = x), hence the transpose
    plt.pcolormesh(
        x_edges,
        cpy_bins,
        hist2d_normed.T,
        shading="auto",
        cmap="viridis",
    )

In [None]:
# One facet per (vectorizer, field) pair. sharex=False because the density
# bins are computed separately inside each facet by custom_histplot.
fg = sns.FacetGrid(data=df_all, row="vectorizer", col="fields_of_study_0", sharex=False)
fg.map_dataframe(custom_histplot, x="density", y="log_cpy", normed=True)
fg.set_titles("{row_name} | {col_name}")

### Focused Normalized Histogram

In [None]:
# Keep only the rows for the two vectorizers shown in the focused plot
df = df_all.query("vectorizer in ['SciBERT', 'Word2Vec']")

In [None]:
# Same faceted normalized histogram as the overview, restricted to df
fg = sns.FacetGrid(data=df, row="vectorizer", col="fields_of_study_0", sharex=False)
fg.map_dataframe(custom_histplot, x="density", y="log_cpy", normed=True)
fg.set_titles("{row_name} | {col_name}")

# Statistical Modeling

Note to future self: cmdstanpy and cmdstan advertise conda as *the* way to install.
Don't listen to them.
I couldn't get it to compile when I used conda for anything cmdstanpy related.
I downloaded the repo and compiled manually, and used pip for cmdstanpy.

## Train-Test Split

In [None]:
# Reshape to one row per identifier with one density column per vectorizer,
# so a single train/test division serves every model and several densities
# can be used in the same fit
df_vectorizers = df_all.pivot_table(
    values="density",
    index="identifier",
    columns="vectorizer",
)

# The non-density columns repeat across a given identifier's rows, so the
# first entry per identifier is enough
nonden_cols = ["log_cpy", *config["nonden_fit_cols"]]
df_others = df_all.groupby("identifier")[nonden_cols].first()

df_eval = pd.concat([df_others, df_vectorizers], axis=1)
df_eval

In [None]:
# Drop all rows where log_cpy is na (dropna with subset removes rows, not columns)
df_eval = df_eval.dropna(subset="log_cpy")

In [None]:
# Apply scaling (normalization by mean and sigma) to help with modelling
# Every column is scaled, log_cpy included, so fitted parameters and rmse
# values below are in scaled units.
# NOTE(review): scaling happens before the train/test split, so the scaling
# statistics include the rows that are later held out — confirm this is intended.
df_eval = df_eval.apply(scale)
df_eval

In [None]:
# Downsample to at most config["eval_sample_size"] rows (None keeps everything).
# The size is capped at the number of available rows because DataFrame.sample
# raises when asked for more rows than exist (without replacement).
# An optional config["random_seed"] makes the draw reproducible; when the key
# is absent the draw is unseeded, as before.
if config["eval_sample_size"] is not None:
    df_eval = df_eval.sample(
        min(config["eval_sample_size"], len(df_eval)),
        random_state=config.get("random_seed"),
    )

In [None]:
# Train-test split (default proportions of train_test_split).
# An optional config["random_seed"] makes the split reproducible; when the key
# is absent the split is unseeded, as before.
df_train, df_test = train_test_split(df_eval, random_state=config.get("random_seed"))

In [None]:
# Dictionary to store results in: one entry per fitted model, keyed by model
# name, later assembled into the `results` summary table
results_dict = {}

## Base model
The base model is just a normal distribution.

In [None]:
# Name of the Stan program (../stan_models/<name>.stan) and results key for this section
stan_model = "base"

In [None]:
# Dict for storing this model's point estimates and log_ppd
output = {}

In [None]:
# Drop na from each split separately; the test rows must come from df_test,
# otherwise the model is scored on the rows it was fitted to
df_train_i = df_train.dropna(subset=["log_cpy"])
df_test_i = df_test.dropna(subset=["log_cpy"])

In [None]:
# Format data for Stan: training targets plus held-out targets.
# NOTE(review): the key names must match the data block of base.stan (not visible here).
data = {
    "N": len(df_train_i),
    "y": df_train_i["log_cpy"].values,
    "N_test": len(df_test_i),
    "y_test": df_test_i["log_cpy"].values,
}

In [None]:
# Compile model from the Stan program named by stan_model
model = cmdstanpy.CmdStanModel(stan_file=f"../stan_models/{stan_model}.stan")

In [None]:
# Run the sampler with cmdstanpy's default settings
fit = model.sample(data=data)

In [None]:
# Parse output into a DataFrame of posterior draws
draws = fit.draws_pd()

In [None]:
# Store parameters: posterior medians are used as point estimates
output["alpha"] = draws["alpha"].median()
output["sigma"] = draws["sigma"].median()

In [None]:
# Calculate log posterior predictive density
# NOTE(review): assumes the Stan program emits a per-draw `log_p` quantity — confirm in base.stan
output["log_ppd"] = log_ppd(draws["log_p"])

In [None]:
# Register this model's summary under its Stan model name
results_dict[stan_model] = pd.Series(output)

## Linear regression on density
$(c \sim \rho_t)$ vs $(c \sim \rho_v)$

In [None]:
# Stan program for the single-predictor regression; also the prefix of the results keys
stan_model = "reg"

### Fit

In [None]:
# Fit the single-predictor regression once per vectorizer density.

# Compile once: the Stan program is the same for every vectorizer
model = cmdstanpy.CmdStanModel(stan_file=f"../stan_models/{stan_model}.stan")

# Dictionary to hold modeling output, keyed by "<stan_model>_<vectorizer>"
outputs_for_this_model = {}

# Loop through vectorizers
for vectorizer_i in vectorizer_names:
    output = {}

    # Drop na from each split separately; the test rows must come from
    # df_test, otherwise the model is scored on the rows it was fitted to
    cols_i = [vectorizer_i, "log_cpy"]
    df_train_i = df_train.dropna(subset=cols_i)
    df_test_i = df_test.dropna(subset=cols_i)

    # Format data
    data = {
        "N": len(df_train_i),
        "x": df_train_i[vectorizer_i].values,
        "y": df_train_i["log_cpy"].values,
        "N_test": len(df_test_i),
        "x_test": df_test_i[vectorizer_i].values,
        "y_test": df_test_i["log_cpy"].values,
    }

    # Fit model
    fit = model.sample(data=data)

    # Parse output
    draws = fit.draws_pd()

    # Store parameters (posterior medians as point estimates)
    output["alpha"] = draws["alpha"].median()
    output[f"beta[{vectorizer_i}]"] = draws["beta"].median()
    output["sigma"] = draws["sigma"].median()

    # Calculate log posterior predictive density
    output["log_ppd"] = log_ppd(draws["log_p"])

    key = f"{stan_model}_{vectorizer_i}"
    outputs_for_this_model[key] = output
    # Stored as a Series, like the entries written by the other model sections
    results_dict[key] = pd.Series(output)

### Evaluate

In [None]:
# Format for showing here: one row per fitted model
output = pd.DataFrame(outputs_for_this_model).T
# Rows follow the insertion order of the fit loop, which iterates
# vectorizer_names, so the labels line up positionally
output["vectorizer"] = vectorizer_names

In [None]:
def medianplot(data, x, y, bins, ax):
    """Plot the binned median of ``data[y]`` against ``data[x]`` with a 16-84 band.

    Parameters
    ----------
    data : DataFrame
        Table holding both columns.
    x, y : str
        Column names for the binning variable and the summarized variable.
    bins : array of bin edges
        Edges handed to ``binned_statistic``; bin centers are used as the
        plotted x positions.
    ax : matplotlib Axes
        Axes to draw on. Its x limits are set to the outer bin edges.
    """
    x_values = data[x]
    y_values = data[y]

    centers = (bins[1:] + bins[:-1]) / 2

    def binned(statistic):
        # binned_statistic also returns edges and bin numbers; only the
        # per-bin values are needed here
        values, _, _ = binned_statistic(
            x_values, y_values, statistic=statistic, bins=bins
        )
        return values

    # Running median, NaN-aware
    median = binned(np.nanmedian)
    ax.plot(centers, median, color="k")

    # Running 16th and 84th percentiles, drawn as a shaded band
    low = binned(lambda v: np.nanpercentile(v, 16))
    high = binned(lambda v: np.nanpercentile(v, 84))
    ax.fill_between(centers, low, high, color="k", alpha=0.1)

    ax.set_xlim(bins[0], bins[-1])

In [None]:
# Left edge: the min_bin_percentile-th percentile of all scaled densities pooled together
left_edge = np.nanpercentile(
    df_eval[vectorizer_names].values.flatten(), config["min_bin_percentile"]
)
# The right edge mirrors the left one about zero (the densities were scaled above)
right_edge = -left_edge
# config["bins"] edges, i.e. config["bins"] - 1 bins
bins = np.linspace(left_edge, right_edge, config["bins"])

In [None]:
# One figure per fitted regression: binned median of the data with the
# fitted line and a +/- sigma band on top
for _, row in output.iterrows():
    # Pull the label out first: reusing double quotes inside a double-quoted
    # f-string is a SyntaxError before Python 3.12
    vectorizer_i = row["vectorizer"]

    fig = plt.figure()
    ax = plt.gca()

    medianplot(
        data=df_eval,
        x=vectorizer_i,
        y="log_cpy",
        bins=bins,
        ax=ax,
    )

    # Plot the regression
    xs = bins
    ys = row["alpha"] + row[f"beta[{vectorizer_i}]"] * xs
    ax.plot(
        xs,
        ys,
        color=palette[0],
    )
    ax.fill_between(
        xs,
        ys - row["sigma"],
        ys + row["sigma"],
        color=palette[0],
        alpha=0.4,
    )

    ax.set_xlabel(f"density [{vectorizer_i}]")
    ax.set_ylabel("log_cpy")

    

## Multivariate linear regression w/o density
The next model is a multivariate linear regression with no density.

In [None]:
# Multivariate regression on the non-density predictors only
stan_model = "multireg"
fit_cols = config["nonden_fit_cols"]
cols = ["log_cpy", *fit_cols]

In [None]:
# Dict for storing this model's point estimates and log_ppd
output = {}

In [None]:
# Drop na from each split separately; the test rows must come from df_test,
# otherwise the model is scored on the rows it was fitted to
df_train_i = df_train[cols].dropna()
df_test_i = df_test[cols].dropna()

In [None]:
# Format data for Stan: K predictors per row, for the training and held-out rows.
# NOTE(review): the key names must match the data block of multireg.stan (not visible here).
data = {
    "N": len(df_train_i),
    "K": len(fit_cols),
    "x": df_train_i[fit_cols].values,
    "y": df_train_i["log_cpy"].values,
    "N_test": len(df_test_i),
    "x_test": df_test_i[fit_cols].values,
    "y_test": df_test_i["log_cpy"].values,
}

In [None]:
# Compile model from the Stan program named by stan_model
model = cmdstanpy.CmdStanModel(stan_file=f"../stan_models/{stan_model}.stan")

In [None]:
# Run the sampler with cmdstanpy's default settings
fit = model.sample(data=data)

In [None]:
# Parse output into a DataFrame of posterior draws
draws = fit.draws_pd()

In [None]:
# Record posterior medians as point estimates
output["alpha"] = draws["alpha"].median()
output["sigma"] = draws["sigma"].median()

# Slope columns all start with "beta"; the n-th one is matched to the n-th fit column
beta_cols = [name for name in draws.columns if name.startswith("beta")]
betas = draws[beta_cols].median(axis="rows")
output.update(
    {f"beta[{name}]": betas.iloc[pos] for pos, name in enumerate(fit_cols)}
)

In [None]:
# Calculate log posterior predictive density
# NOTE(review): assumes the Stan program emits a per-draw `log_p` quantity — confirm in multireg.stan
output["log_ppd"] = log_ppd(draws["log_p"])

In [None]:
# Register this model's summary under its Stan model name
results_dict[stan_model] = pd.Series(output)

## Multivariate linear regression
Now with density.

In [None]:
# Same Stan program as the no-density fit; results here are keyed "multireg_<vectorizer>"
stan_model = "multireg"

In [None]:
# Fit the multivariate regression once per vectorizer: that vectorizer's
# density plus the non-density predictors.

# Compile once: the Stan program is the same for every vectorizer
model = cmdstanpy.CmdStanModel(stan_file=f"../stan_models/{stan_model}.stan")

# Dictionary to hold modeling output, keyed by "<stan_model>_<vectorizer>"
outputs_for_this_model = {}

# Loop through vectorizers
for vectorizer_i in vectorizer_names:
    output = {}

    # Drop na from each split separately; the test rows must come from
    # df_test, otherwise the model is scored on the rows it was fitted to
    fit_cols = [vectorizer_i, *config["nonden_fit_cols"]]
    cols = ["log_cpy", *fit_cols]
    df_train_i = df_train.dropna(subset=cols)
    df_test_i = df_test.dropna(subset=cols)

    # Format data
    data = {
        "N": len(df_train_i),
        "K": len(fit_cols),
        "x": df_train_i[fit_cols].values,
        "y": df_train_i["log_cpy"].values,
        "N_test": len(df_test_i),
        "x_test": df_test_i[fit_cols].values,
        "y_test": df_test_i["log_cpy"].values,
    }

    # Fit model
    fit = model.sample(data=data)

    # Parse output
    draws = fit.draws_pd()

    # Store parameters (posterior medians as point estimates)
    output["alpha"] = draws["alpha"].median()
    output["sigma"] = draws["sigma"].median()
    beta_cols = [col for col in draws.columns if col.startswith("beta")]
    betas = draws[beta_cols].median(axis="rows")
    # The n-th beta column is matched to the n-th fit column
    for beta_pos, fit_col in enumerate(fit_cols):
        output[f"beta[{fit_col}]"] = betas.iloc[beta_pos]

    # Calculate log posterior predictive density
    output["log_ppd"] = log_ppd(draws["log_p"])

    key = f"{stan_model}_{vectorizer_i}"
    outputs_for_this_model[key] = output
    # Stored as a Series, like the entries written by the other model sections
    results_dict[key] = pd.Series(output)

## Multivariate linear regression w/ all fields
The next model is a multivariate linear regression using every variable we have.

In [None]:
# Multivariate regression on the non-density predictors plus every vectorizer density
stan_model = "multireg"
fit_cols = config["nonden_fit_cols"] + vectorizer_names
cols = ["log_cpy", *fit_cols]

In [None]:
# Dict for storing this model's point estimates and log_ppd
output = {}

In [None]:
# Drop na from each split separately; the test rows must come from df_test,
# otherwise the model is scored on the rows it was fitted to
df_train_i = df_train[cols].dropna()
df_test_i = df_test[cols].dropna()

In [None]:
# Format data for Stan: K predictors per row, for the training and held-out rows.
# NOTE(review): the key names must match the data block of multireg.stan (not visible here).
data = {
    "N": len(df_train_i),
    "K": len(fit_cols),
    "x": df_train_i[fit_cols].values,
    "y": df_train_i["log_cpy"].values,
    "N_test": len(df_test_i),
    "x_test": df_test_i[fit_cols].values,
    "y_test": df_test_i["log_cpy"].values,
}

In [None]:
# Compile model from the Stan program named by stan_model
model = cmdstanpy.CmdStanModel(stan_file=f"../stan_models/{stan_model}.stan")

In [None]:
# Run the sampler with cmdstanpy's default settings
fit = model.sample(data=data)

In [None]:
# Parse output into a DataFrame of posterior draws
draws = fit.draws_pd()

In [None]:
# Record posterior medians as point estimates
output["alpha"] = draws["alpha"].median()
output["sigma"] = draws["sigma"].median()

# Slope columns all start with "beta"; the n-th one is matched to the n-th fit column
beta_cols = [name for name in draws.columns if name.startswith("beta")]
betas = draws[beta_cols].median(axis="rows")
output.update(
    {f"beta[{name}]": betas.iloc[pos] for pos, name in enumerate(fit_cols)}
)

In [None]:
# Calculate log posterior predictive density
# NOTE(review): assumes the Stan program emits a per-draw `log_p` quantity — confirm in multireg.stan
output["log_ppd"] = log_ppd(draws["log_p"])

In [None]:
# Prefixed with "full_" so it does not overwrite the no-density "multireg" entry
results_dict[f"full_{stan_model}"] = pd.Series(output)

# Summarize

In [None]:
# One row per model, one column per stored quantity; a parameter a model does
# not have shows up as NaN
results = pd.DataFrame(results_dict).T
results

## Visualize metrics

### Log Posterior Predictive Density

In [None]:
# Log posterior predictive density per model, in results-table order
fig = plt.figure()
ax = plt.gca()

x = np.arange(len(results))
ax.scatter(
    x,
    results["log_ppd"],
)

ax.set_xticks(x)
# Right-align the rotated labels and name the y axis, matching the other metric plots
ax.set_xticklabels(results.index, rotation=45, ha="right")
ax.set_ylabel("log_ppd")

### RMSE

In [None]:
# Slope columns are named "beta[<predictor>]"; strip the wrapper to recover
# the predictor column names
beta_cols = [name for name in results.columns if name.startswith("beta")]
x_cols = [name[5:-1] for name in beta_cols]

# A missing slope (model does not use that predictor) or a missing predictor
# value contributes nothing to the prediction
betas = results[beta_cols].fillna(0.)
x_test = df_test[x_cols].fillna(0.)

In [None]:
# Calculate y_pred: one row per test observation, one column per model.
# The product is done on plain arrays and the labels are attached explicitly,
# because x_test's columns ("<predictor>") and betas' columns
# ("beta[<predictor>]") do not share labels, and the downstream cells index
# y_pred by model name.
y_pred = pd.DataFrame(
    x_test.to_numpy(dtype=float) @ betas.to_numpy(dtype=float).T
    + results["alpha"].to_numpy(dtype=float).reshape(1, -1),
    index=df_test.index,
    columns=results.index,
)
y_pred

In [None]:
# Calculate rmse per model: apply runs once per y_pred column, and the result
# is aligned to the results index on assignment
# (assumes y_pred's columns are the model names — see the cell above)
results["rmse"] = y_pred.apply(lambda x: root_mean_squared_error(df_test["log_cpy"], x))

In [None]:
# rmse per model, in results-table order
fig = plt.figure()
ax = plt.gca()

x = np.arange(len(results))
ax.scatter(
    x,
    results["rmse"],
)

ax.set_xticks(x)
ax.set_xticklabels(results.index, rotation=45, ha="right")
ax.set_ylabel("rmse")

# Anchor the y axis at zero and keep the automatic upper limit
ax.set_ylim(0, ax.get_ylim()[1])

### Maximum slopes

In [None]:
# Largest slope per model (row-wise max over the beta columns, NaNs skipped);
# a model with no beta entries gets NaN
max_betas = results[beta_cols].max(axis="columns")

In [None]:
# Largest slope per model, in results-table order
fig = plt.figure()
ax = plt.gca()

x = np.arange(len(results))
ax.scatter(
    x,
    max_betas,
)

ax.set_xticks(x)
ax.set_xticklabels(results.index, rotation=45, ha="right")
ax.set_ylabel(r"max($\beta_i$)")

# Anchor the y axis at zero and keep the automatic upper limit
ax.set_ylim(0, ax.get_ylim()[1])

In [None]:
# max slopes correlation with rmse: one point per model, plotted as 1 - rmse
# against the largest slope
ax = sns.scatterplot(
    results,
    x=max_betas,
    y=1 - results["rmse"],
)
# Anchor both axes at zero and keep the automatic upper limits
ax.set_xlim(0, ax.get_xlim()[1])
ax.set_ylim(0, ax.get_ylim()[1])

ax.set_xlabel(r'$\max(\beta_i)$')
ax.set_ylabel(r'1 - rmse')

In [None]:
# Display the summary table again, now including the rmse column
results

In [None]:
# Inspect rmse as a function of distance from the center

# Bin edges and centers are the same for every model, so build them once
bins = np.linspace(-3, 3, config["bins"])
centers = 0.5 * (bins[:-1] + bins[1:])

for model_i in results.index:

    # Only models keyed "<stan_model>_<vectorizer>" have a single density to
    # bin on. Match on the suffix (longest match wins) so that vectorizer
    # names containing "_" are still recognized.
    matches = [name for name in vectorizer_names if model_i.endswith(f"_{name}")]
    if not matches:
        continue
    vectorizer_i = max(matches, key=len)

    den_key = f"rho_{vectorizer_i}"
    cut_key = f"{den_key}_bin"
    model_key = f"y_pred_{model_i}"

    # Density, prediction and truth for every test row, plus the density bin
    df_test_to_group = pd.DataFrame()
    df_test_to_group[den_key] = df_test[vectorizer_i]
    df_test_to_group[model_key] = y_pred[model_i]
    df_test_to_group[cut_key] = pd.cut(df_test_to_group[den_key], bins)
    df_test_to_group["y_test"] = df_test["log_cpy"]

    def get_rmse_of_df(df):
        # A group with no rows has no rmse
        if len(df) == 0:
            return np.nan
        return root_mean_squared_error(df["y_test"], df[model_key])

    # observed=False keeps one entry per bin, including empty ones, so the
    # result stays the same length as `centers`
    rmse_per_bin = df_test_to_group.groupby(cut_key, observed=False).apply(
        get_rmse_of_df
    )

    fig = plt.figure()
    ax = plt.gca()
    ax = sns.scatterplot(
        x=centers,
        y=rmse_per_bin,
        ax=ax,
    )
    # Reference lines: zero scaled density and the model's overall rmse
    ax.axvline(
        0,
        c='0.2',
        linestyle="--",
    )
    ax.axhline(
        results.loc[model_i, "rmse"],
        c='0.2',
        linestyle="--",
    )

    ax.set_ylabel("rmse")
    ax.set_xlabel(den_key)

    ax.set_ylim(0, ax.get_ylim()[1])