# Diversity country

Modelling diversity at the country level

## Preamble

In [None]:
# Run the shared notebook preamble. Later cells use names that are never
# imported in this notebook (pd, np, project_dir, fig_path) — presumably they
# are defined by this preamble; verify.
%run ../notebook_preamble.ipy
# Turn off jedi-based tab completion in IPython
%config Completer.use_jedi = False


# Silence pandas' chained-assignment (SettingWithCopy) warnings
pd.options.mode.chained_assignment = None 

# Star import of the project's Altair helpers (presumably the source of
# altair_visualisation_setup and save_altair used below — confirm)
from narrowing_ai_research.utils.altair_utils import *

In [None]:
# Run this cell if you want to save figures: `driv` is the object passed to
# save_altair by the figure-saving code further down the notebook
driv = altair_visualisation_setup()

In [None]:
import altair as alt
import random
import logging
import yaml
from statsmodels.api import OLS, Logit
from statsmodels.tools.tools import add_constant
from scipy.stats import zscore

from narrowing_ai_research.paper.make_category_diversity import make_category_diversity, read_process_data, diversity_estimation
# Silence pandas' chained-assignment (SettingWithCopy) warnings
pd.options.mode.chained_assignment = None  # default='warn'
# Lift Altair's row limit on the data frames passed to charts
alt.data_transformers.disable_max_rows()

In [None]:
def make_regression_dataset(papers_orgs, df, reg_year, category="institute_country"):
    """Attach yearly paper counts to a diversity table.

    Args:
        papers_orgs: paper-organisation table. The function reads its
            ``is_ai``, ``year``, ``org_type`` and ``article_id`` columns
            plus the column named by ``category``.
        df: diversity table with ``year`` and ``category`` columns.
        reg_year: first year (inclusive) of the papers that are kept.
        category: column of ``papers_orgs`` used to group the papers.

    Returns:
        A tuple ``(df_, paper_counts_all)`` where ``df_`` is a copy of ``df``
        with an extra ``number_of_papers`` column and ``paper_counts_all``
        is a Series counting distinct (category, article_id) pairs per
        category value over the whole period.

    Raises:
        KeyError: if a (year, category) pair of ``df`` has no relevant paper.
    """
    # AI papers from reg_year onwards, leaving out company rows
    recent_ai = (papers_orgs["is_ai"] == True) & (papers_orgs["year"] >= reg_year)
    relevant = papers_orgs.loc[recent_ai].query("org_type!='Company'")

    # Whole-period counts: each article counted once per category value
    distinct_pairs = relevant.drop_duplicates([category, "article_id"])
    paper_counts_all = distinct_pairs[category].value_counts()

    # (year, category value) -> number of rows with a non-null article_id
    yearly_counts = (
        relevant.groupby(["year", category])["article_id"].count().to_dict()
    )

    enriched = df.copy()
    enriched["number_of_papers"] = [
        yearly_counts[(year, cat)]
        for year, cat in zip(enriched["year"], enriched["category"])
    ]

    return enriched, paper_counts_all


def fit_regression(diversity_df, cov_type="HC1"):
    """Fit OLS models of standardised diversity on paper counts and year.

    For every diversity metric and every parameter set (0, 1, 2) two models
    are fitted on the matching rows of ``diversity_df``: one with a constant,
    ``papers_log`` and ``year`` as regressors, and one that additionally
    includes a dummy column per ``category`` value (fixed effects).

    Args:
        diversity_df: table with ``metric``, ``parametre_set``, ``score``,
            ``number_of_papers``, ``year`` and ``category`` columns.
        cov_type: covariance estimator forwarded to ``OLS(...).fit``.

    Returns:
        Nested dict ``results[metric][parameter_set][spec]`` of fitted models,
        where ``spec`` is ``"no_fe"`` or ``"fe"``.
    """
    metric_names = ("balance", "weitzman", "rao_stirling")
    reg_results = {name: {} for name in metric_names}

    for name in metric_names:
        metric_rows = diversity_df.loc[diversity_df["metric"] == name]

        for set_id in range(3):
            subset = metric_rows.loc[
                metric_rows["parametre_set"] == f"Parametre Set {set_id}"
            ]

            # Dependent variable: z-scored diversity score
            outcome = zscore(subset["score"]).astype(float)

            # Regressor: natural log of the paper count
            subset["papers_log"] = np.log(subset["number_of_papers"])
            controls = subset[["papers_log", "year"]]

            # One dummy column per category value (fixed effects).
            # NOTE(review): the full dummy set is used together with a
            # constant — confirm this is intended.
            dummies = pd.get_dummies(subset["category"])

            # Regressor matrices without and with the fixed effects
            designs = {
                "no_fe": add_constant(controls).astype(float),
                "fe": add_constant(
                    pd.concat([controls, dummies], axis=1).astype(float)
                ),
            }

            reg_results[name][set_id] = {
                label: OLS(outcome, design).fit(cov_type=cov_type)
                for label, design in designs.items()
            }

    return reg_results


def make_regression_table(
    reg_results,
    org_names_list,
    save=True,
    metrics=("balance", "weitzman", "rao_stirling"),
):
    """Create the regression table for the paper.

    Args:
        reg_results: nested dict ``{metric: {parameter_set: {spec: model}}}``
            where ``spec`` is ``"fe"`` or ``"no_fe"`` and each model exposes
            ``params``, ``tvalues``, ``pvalues``, ``rsquared``, ``nobs`` and
            ``conf_int()``.
        org_names_list: names of the fixed-effect coefficients to extract
            from the ``"fe"`` models.
        save: if True, write one LaTeX table per metric to
            ``{project_dir}/reports/tables_country/{metric}.tex``
            (``project_dir`` is a notebook-level name).
        metrics: metric names for which a LaTeX table is written.

    Returns:
        A tuple ``(reg_results_table, org_coeffs)``. The table has one column
        per fitted model, named ``"{metric}_{parameter_set}"`` (the "fe" and
        "no_fe" models of a pair share the same column name). ``org_coeffs``
        maps the same name to a DataFrame with ``beta``, ``lower`` and
        ``upper`` columns for the names in ``org_names_list``.
    """
    var_names = ["papers_log", "year"]

    tables = []
    org_coeffs = {}

    for metric, by_param_set in reg_results.items():
        for param_set, by_spec in by_param_set.items():
            column_name = "_".join([metric, str(param_set)])

            for spec, model in by_spec.items():
                params = []
                names = []

                for var in var_names:
                    # Builtin float: the np.float alias no longer exists in
                    # current NumPy releases
                    coef = float(model.params[var])
                    t_val = float(model.tvalues[var])
                    p_val = float(model.pvalues[var])

                    # Significance stars; none when p >= 0.1
                    if p_val < 0.01:
                        stars = "***"
                    elif p_val < 0.05:
                        stars = "**"
                    elif p_val < 0.1:
                        stars = "*"
                    else:
                        stars = ""

                    params.append(str(np.round(coef, 2)) + stars)
                    params.append(f"({round(t_val, 2)})")

                    names.append(var)
                    names.append(var + "_t_val")

                main_results = pd.Series(params, index=names)

                other_details = pd.Series(
                    [
                        np.round(model.rsquared, 2),
                        int(model.nobs),
                        "Yes" if spec == "fe" else "No",
                    ],
                    index=["$R^2$", "obs", "FE"],
                )

                results_series = pd.concat([main_results, other_details])
                results_series.name = column_name

                if spec == "fe":
                    # Fixed-effect coefficients with their confidence bounds
                    conf = model.conf_int()
                    org_vals = pd.concat(
                        [
                            model.params.loc[org_names_list],
                            conf[0].loc[org_names_list],
                            conf[1].loc[org_names_list],
                        ],
                        axis=1,
                    )
                    org_vals.columns = ["beta", "lower", "upper"]
                    org_coeffs[column_name] = org_vals

                tables.append(results_series)

    reg_results_table = pd.concat(tables, axis=1)

    if save is True:
        clean_name_lookup = {
            "papers_log": "Papers (log)",
            "year": "Year",
            "$R^2$": "$R^2$",
            "obs": "N",
            "FE": "Fixed Effects",
        }

        for m in metrics:
            # Columns whose name contains the metric name
            rel_table = reg_results_table.loc[
                :, [m in x for x in reg_results_table.columns]
            ].copy()

            # Keep only the parameter-set number as the column header
            rel_table.columns = [x.split("_")[-1] for x in rel_table.columns]
            rel_table = rel_table.reset_index(drop=False)

            # Rows without a clean name (the t-value rows) get a blank label
            rel_table["index"] = [
                clean_name_lookup.get(x, "") for x in rel_table["index"]
            ]

            rel_table.rename(columns={"index": m}, inplace=True)

            rel_table.to_latex(f"{project_dir}/reports/tables_country/{m}.tex", index=False)

    return reg_results_table, org_coeffs


def make_chart_organisational_diversity(
    org_coeffs,
    num_orgs,
    metric_params,
    org_type_lookup,
    paper_counts,
    save=True,
    fig_num=14,
):
    """Plot comparing the organisational diversity coefficients.

    Args:
        org_coeffs: dict of DataFrames with ``beta``, ``lower`` and ``upper``
            columns, indexed by organisation name.
        num_orgs: number of organisations to show; the ones with the
            smallest ``beta`` are kept.
        metric_params: key of ``org_coeffs`` to plot.
        org_type_lookup: mapping from organisation name to organisation type.
        paper_counts: Series of paper counts indexed by organisation name.
        save: if True, save the chart with ``save_altair`` using the
            notebook-level ``driv`` object.
        fig_num: figure number used in the saved file name.

    Returns:
        The horizontally concatenated Altair chart: coefficient bars with
        error bars next to a bar chart of paper counts.
    """

    # Regression coefficients sorted in ascending order
    selected = (
        org_coeffs[metric_params]
        .sort_values("beta")
        .head(n=num_orgs)
        .reset_index(drop=False)
    )

    selected["org_type"] = selected["index"].map(org_type_lookup)
    selected["order"] = range(0, len(selected))

    # Paper counts by organisation, in the same order as the coefficients.
    # Naming the index explicitly yields the "org" column whatever name the
    # index of paper_counts carried before.
    recent_papers_orgs = (
        paper_counts.loc[selected["index"]]
        .rename_axis("org")
        .reset_index(name="papers")
    )
    recent_papers_orgs["order"] = range(0, len(recent_papers_orgs))
    recent_papers_orgs["org_type"] = recent_papers_orgs["org"].map(org_type_lookup)

    # Coefficient bars
    b_ch = (
        alt.Chart(selected)
        .mark_bar()
        .encode(
            y=alt.Y("index", sort=alt.EncodingSortField("order"), title=""),
            x=alt.X("beta", title="Coefficient on diversity"),
            color=alt.Color("org_type", title="Organisation type"),
        )
    ).properties(width=150, height=600)

    # Error bars spanning the lower and upper bounds, without axis labels
    b_err = (
        alt.Chart(selected)
        .mark_errorbar()
        .encode(
            y=alt.Y(
                "index",
                sort=alt.EncodingSortField("order"),
                title="",
                axis=alt.Axis(ticks=False, labels=False),
            ),
            x=alt.X("lower", title=""),
            x2="upper",
        )
    ).properties(width=150, height=600)

    # Paper counts for the same organisations
    b_act = (
        alt.Chart(recent_papers_orgs)
        .mark_bar()
        .encode(
            y=alt.Y(
                "org",
                title=None,
                sort=alt.EncodingSortField("order"),
                axis=alt.Axis(labels=False, ticks=False),
            ),
            x=alt.X("papers"),
            color="org_type",
        )
    ).properties(width=100, height=600)

    out = (b_ch + b_err).resolve_scale(y="independent")
    out_2 = alt.hconcat(out, b_act, spacing=0).resolve_scale(y="shared")

    if save is True:
        save_altair(out_2, f"fig_{fig_num}_comp", driv)

    return out_2

## Read Data

In [None]:
# Load the paper configuration from paper_config.yaml in the project directory
with open(f"{project_dir}/paper_config.yaml", "r") as infile:
        pars = yaml.safe_load(infile)

# Diversity parameters are read from the section_4 entry of the config; the
# other settings used below come from its section_8 entry
div_params = pars["section_4"]["div_params"]
section_pars = pars["section_8"]

In [None]:
# Read the processed inputs (contents inferred from the variable names:
# papers, paper-organisation table, topic mix — confirm in read_process_data)
papers, paper_orgs, topic_mix = read_process_data()

In [None]:
# Estimate diversity using institute_country as the category
df = diversity_estimation(
    paper_orgs, div_params, section_pars, papers, topic_mix, cat='institute_country'
)

In [None]:
# Add yearly paper counts to the diversity table, using papers from 2014 onwards
reg_df,paper_counts = make_regression_dataset(paper_orgs,df,reg_year=2014)

In [None]:
# Fit the models with and without fixed effects for each metric and parameter set
reg_output = fit_regression(reg_df)

In [None]:
# Build the regression table without writing LaTeX files (save=False) and
# collect the fixed-effect coefficients for every category value in reg_df
reg_table,country_coeffs = make_regression_table(reg_output,
                                  list(set(reg_df['category'])),save=False)

## AI index charts

In [None]:
# Diversity scores by country
# Scores are standardised (z-score) within each metric / parameter set / year
# group; the index built by the groupby is then flattened so that category
# is an ordinary column again
zs = df.groupby(['metric','parametre_set','year']
          ).apply(lambda x: x.set_index('category').assign(z_score=lambda y: zscore(y['score']))
                 ).reset_index(level=3).reset_index(drop=True)

# Categories ordered from highest to lowest mean standardised score
c_order = zs.groupby('category')['z_score'].mean().sort_values(ascending=False).index.tolist()

In [None]:
# Boxplot of the standardised diversity scores, one row per country,
# ordered by mean score (c_order)
box = (alt.Chart(zs)
       .mark_boxplot()
       .encode(y=alt.Y('category',
                       sort=c_order,title='Country'),
               x=alt.X('z_score',title=['Thematic diversity','(standardised)']))).properties(
    width=200,height=400)

# Diversity coefficients (controlling for size)

# Stack the fixed-effect coefficients of every metric / parameter set,
# standardising beta within each of them
coeff_df = pd.concat(
    [v.assign(z_score = lambda x: zscore(x.beta)).assign(metric=k) for k,v in country_coeffs.items(
    )]).reset_index(drop=False)

# NOTE(review): coeff_order is computed but not used below
coeff_order = coeff_df.groupby('index')['z_score'].mean().sort_values(ascending=False).index.to_list()

# The rows of this chart are sorted with c_order (not coeff_order) and its axis
# labels are hidden — presumably so the rows line up with the country labels
# of `box` in the side-by-side layout; confirm this is intended
box_coeff = (alt.Chart(coeff_df)
       .mark_boxplot()
       .encode(y=alt.Y('index',
                       sort=c_order,axis=alt.Axis(labels=False,ticks=False),title=''),
               x=alt.X('z_score',title=['Thematic diversity', 'regression coefficient','(standardised)']
                      ))).properties(width=200,height=400)

# Place the two boxplots side by side
country_div = alt.hconcat(box,box_coeff)

# Save the combined chart; driv and fig_path are notebook-level names
save_altair(country_div,"country_diversity_boxplot",driv,fig_path)

In [None]:
# Display the combined chart as the cell output
country_div