# Setup

## Imports

In [None]:
import os

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib
from matplotlib import patheffects
import matplotlib.pyplot as plt
import seaborn as sns

## Parameters

In [None]:
# Center identifiers; each one is also the name used in the
# `center=<key>` sub-directory that the data is read from.
keys = [
    'Bacon2019', 'hafenLowredshiftLymanLimit2017', 'Imeletal2022',
    'Ololube2012', 'Torres2013', 'West2003',
]
# Default single center: the second entry of `keys`.
key = keys[1]
# Settings that select which output tree is read.
librarian, vectorizer = 'S2', 'SciBERT'

## Process Parameters

In [None]:
# Root of the output tree for the chosen librarian / vectorizer pair.
_output_parts = (
    '../../outputs',
    f'librarian={librarian}',
    f'vectorizer={vectorizer}',
)
directory = os.path.join(*_output_parts)

# Data Preprocessing

In [None]:
# Load every center's table and tag each row with its center and with whether
# its density lies above that center's own median density.
#
# NOTE: the loop variable rebinds the module-level `key`; once this cell has
# run, `key` is the last entry of `keys`.
density_labels = ['< median', '> median']
dfs = []
for key in keys:
    fp = os.path.join(directory, f'center={key}/all_data.csv')
    df = pd.read_csv(fp)

    # Density categorization, relative to this center's median
    med_density = df['density'].median()
    density_cat = (df['density'] > med_density).map({
        True: '> median',
        False: '< median',
    })
    # A NaN density compares False against the median, which would silently
    # file it under '< median'; leave such rows uncategorized instead.
    density_cat = density_cat.where(df['density'].notna())
    # Declaring both categories explicitly gives every frame the same
    # categorical dtype (and category order), even when a center has no rows
    # on one side of its median, so the frames can be combined without the
    # column falling back to plain objects.
    df['density_cat'] = pd.Categorical(density_cat, categories=density_labels)

    # Center category
    df['center'] = key

    dfs.append(df)

In [None]:
# Combine the per-center frames into one table.
# Each frame carries the default 0..n-1 index from `read_csv`, so keeping those
# labels would leave the combined frame with repeated row labels; renumber the
# rows instead so every row has a unique label.
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Combined "<center>_<density class>" category for each row
center_str = df['center'].astype(str)
density_str = df['density_cat'].astype(str)
df['label'] = (center_str + '_' + density_str).astype('category')

In [None]:
# Make a log scale version
# log10 is undefined for zero or negative citation rates. Mask those to NaN
# before taking the log instead of cleaning up -inf afterwards; the resulting
# values are the same, but NumPy no longer emits divide-by-zero /
# invalid-value warnings.
cpy = df['citations_per_year']
df['log_cpy'] = np.log10(cpy.where(cpy > 0))

# Visualization

In [None]:
import matplotlib.transforms


# Split violins of log10(citations per year) for each center, one half per
# density class, annotated with the sample size and the ratios between the
# two density classes.

# The annotations below are placed at x = position of the center in `keys`,
# so pin the violin order to `keys` rather than relying on the order in which
# centers happen to appear in `df`. Likewise pin which density class is drawn
# on which side of each violin.
hue_order = ['< median', '> median']

fig = plt.figure(figsize=(len(keys) * 2.5, 6))
ax = plt.gca()

sns.violinplot(
    ax=ax,
    data=df,
    x='center',
    y='log_cpy',
    hue='density_cat',
    order=keys,
    hue_order=hue_order,
    split=True,
    inner='quart',
    dodge=True,
    gap=0,
)

# x in data coordinates (violin position), y in axes coordinates (1 = top).
# This does not depend on the center, so build it once.
annotation_coords = matplotlib.transforms.blended_transform_factory(
    ax.transData,
    ax.transAxes,
)

df_by_center = df.groupby('center')
for i, key in enumerate(keys):

    # Get the group
    df_center = df_by_center.get_group(key)
    n = df_center.shape[0]

    # Group by density class. `observed` is stated explicitly because pandas
    # is changing its default for categorical groupers; False keeps both
    # classes in the result even if one has no rows for this center.
    df_center_by_density = df_center.groupby('density_cat', observed=False)
    log_cpy_by_density = df_center_by_density['log_cpy']

    # Ratio of the median citations per year, above vs. below median density
    med_cpy = 10.**log_cpy_by_density.median()
    fraction_change = med_cpy['> median'] / med_cpy['< median']
    median_change_str = (
        r'$c_{>50} = '
        f'{fraction_change:.2f}'
        r'c_{<50}$'
    )

    # Ratio of the widths: the standard deviation of log_cpy is exponentiated
    # first, so this compares 10**std between the two density classes.
    std_cpy = 10.**log_cpy_by_density.std()
    fraction_change = std_cpy['> median'] / std_cpy['< median']
    std_change_str = (
        r'$\sigma_{>50} = '
        f'{fraction_change:.2f}'
        r'\sigma_{<50}$'
    )

    # Place the summary just below the top edge of the axes, centered on the
    # violin for this center.
    text = ax.annotate(
        text=f'n={n}\n' + median_change_str + '\n' + std_change_str,
        xy=(i, 1),
        xycoords=annotation_coords,
        xytext=(0, -5),
        textcoords='offset points',
        ha='center',
        va='top',
    )
    # White outline keeps the text readable on top of the violins.
    text.set_path_effects([
        patheffects.Stroke(linewidth=3, foreground='w'),
        patheffects.Normal()
    ])

# Move the automatically created hue legend below the annotations.
legend = ax.get_legend()
legend.set_loc('upper center')
legend.set_bbox_to_anchor((0.5, 0.75))
legend.set_alignment('left')