# Setup


## Imports


In [None]:
import os

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib
from matplotlib import patheffects
import matplotlib.pyplot as plt
import seaborn as sns

## Parameters


In [None]:
# Notebook-wide settings; every later cell reads its knobs from this dict.
config = {
    "dir": "../../outputs/librarian=S2",  # root folder; one subdirectory per vectorizer
    "bins": 16,  # number of quantile bins, also the histogram bin count
    # Tail percentile for linear density bin edges.
    # NOTE(review): the quantile bins appear to supersede this; confirm it is still needed.
    "min_bin_percentile": 1,
    "min_diff": 2,  # minimum bin gap for two densities to count as different
    "left_density": "SciBERT",  # vectorizer on the left side of the comparison
    "right_density": "Word2Vec",  # vectorizer on the right side of the comparison
}

# Data Preprocessing


In [None]:
# Build `df_combined`: rows indexed by identifier, carrying the density and
# edginess columns of every vectorizer found under config["dir"].
# Layout read here: <config["dir"]>/<vectorizer_dir>/<center_dir>/all_data.csv

# Ensure we don't have any old dataframes (re-running the cell starts clean)
df_combined = None

# Main loop
# os.listdir returns entries in arbitrary order, and the first vectorizer
# becomes the left side of every later join, so sort for reproducible results.
for vectorizer_dir in sorted(os.listdir(config["dir"])):
    vectorizer_path = os.path.join(config["dir"], vectorizer_dir)
    # Skip stray files sitting next to the vectorizer directories
    if not os.path.isdir(vectorizer_path):
        continue

    # Combine same vectorizer, different centers
    dfs = []
    for center_dir in sorted(os.listdir(vectorizer_path)):
        center_path = os.path.join(vectorizer_path, center_dir)
        if not os.path.isdir(center_path):
            continue

        # Open
        fp = os.path.join(center_path, "all_data.csv")
        df = pd.read_csv(fp)

        # Skip dfs with no identifier
        if "identifier" not in df.columns:
            continue

        # Add df to list
        dfs.append(df.set_index("identifier"))

    # Combine across centers
    if not dfs:
        continue
    df = pd.concat(dfs)

    # Rename columns: suffix the per-vectorizer metrics with the vectorizer
    # name (the text after the last "=" in the directory name). pop + assign
    # moves the renamed column to the end of the frame.
    vectorizer = vectorizer_dir.split("=")[-1]
    for col in ["density", "edginess"]:
        df[f"{col}_{vectorizer}"] = df.pop(col)

    # Combine across vectorizers
    if df_combined is None:
        # The first vectorizer keeps all of its columns and is the join base
        df_combined = df
    else:
        # Later vectorizers contribute only their own metric columns
        # NOTE(review): identifiers repeated across centers make the index
        # non-unique, so this join can multiply rows; confirm that the later
        # drop_duplicates is enough.
        df_combined = df_combined.join(
            df.drop(columns=["citations_per_year", "year", "references", "is_center"]),
            how="left",
        )

# Fail here with a clear message instead of in a later cell
if df_combined is None:
    raise ValueError(
        f"No all_data.csv with an 'identifier' column found under {config['dir']!r}"
    )

In [None]:
# Keep only rows that have a density for both compared vectorizers, then
# remove rows that are exact repeats of an earlier row.
compared_density_cols = [
    f"density_{config[side]}" for side in ("left_density", "right_density")
]
df_combined = df_combined.dropna(
    axis="index", how="any", subset=compared_density_cols
).drop_duplicates()

In [None]:
# Assign each row a quantile-bin label in 1..config["bins"] for every density
# column, stored as bin_<vectorizer>.
quantiles = np.linspace(0, 1, config["bins"] + 1)
density_vars = [var for var in df_combined.columns if "density" in var]
for var in density_vars:

    # Get density
    # NOTE(review): the earlier dropna only checks the two compared columns,
    # so other density columns may still hold NaN, which yields NaN edges
    # here. Confirm whether those columns matter.
    density = df_combined[var].values

    # Calculate quantile bin edges (config["bins"] + 1 of them, min to max).
    # Split only once so vectorizer names containing "_" stay intact and match
    # the bin_<vectorizer> lookups built from config.
    vectorizer = var.split("_", 1)[-1]
    bins = np.quantile(density, quantiles)

    # Digitize against the interior edges only. With the outer edges included
    # the column maximum lands alone in an extra bin (config["bins"] + 1).
    # The + 1 keeps the labels 1-based.
    df_combined.loc[:, f"bin_{vectorizer}"] = (
        np.digitize(density, bins=bins[1:-1]) + 1
    )

In [None]:
# Isolate entries where the density differs by >= min_diff bin
# The keys inside the f-strings use single quotes: reusing the enclosing
# double quote inside a replacement field is a SyntaxError before Python 3.12.
left_bin = df_combined[f"bin_{config['left_density']}"]
right_bin = df_combined[f"bin_{config['right_density']}"]
df_combined["different_densities"] = (
    np.abs(left_bin - right_bin) >= config["min_diff"]
)
# Cell output: how many rows are flagged as different vs. not
df_combined["different_densities"].value_counts()

# Visualization


In [None]:
# Pairwise overview of all density columns: histograms on the diagonal and on
# the off-diagonal panels, all with the configured bin count.
hist_kwargs = {"bins": config["bins"]}
g = sns.PairGrid(df_combined, vars=density_vars)
g.map_diag(sns.histplot, **hist_kwargs)
g.map_offdiag(sns.histplot, **hist_kwargs)