# Diversity: topic view

What topics contribute to diversity?



## Preamble

In [None]:
# Execute the shared notebook preamble in this namespace
# (presumably where `pd`, `np` and `project_dir` used below come from — confirm).
%run ../notebook_preamble.ipy
# Turn off jedi-based tab completion in IPython.
%config Completer.use_jedi = False


# Disable pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.options.mode.chained_assignment = None 

In [None]:
# Uncomment and run if you want to save figures
# driv = altair_visualisation_setup()

In [None]:
import altair as alt
import random
import logging
import yaml
from scipy.stats import zscore

from narrowing_ai_research.utils.list_utils import *
from narrowing_ai_research.utils.altair_utils import *
from narrowing_ai_research.utils.read_utils import *
from narrowing_ai_research.transformers.diversity import Diversity, remove_zero_axis
from narrowing_ai_research.paper.s5_network_view import read_process_data
from narrowing_ai_research.paper.s9_topic_comparison import topic_rep
from narrowing_ai_research.paper.make_topic_diversity_contribution import *


# Lift Altair's default limit on the number of rows a chart's dataset may hold.
alt.data_transformers.disable_max_rows()

## Read data

In [None]:
# Parse the paper configuration once, then pick out the section 9 categories.
with open(f"{project_dir}/paper_config.yaml", 'r') as infile:
    _paper_config = yaml.safe_load(infile)

cats = _paper_config['section_9']['categories']

### Metadata

In [None]:
# # We use the same diversity parameters as in the analysis of diversity
# with open(f"{project_dir}/paper_config.yaml",'r') as infile:
#     div_params = yaml.safe_load(infile)['section_4']['div_params']

### Data

In [None]:
# Paper table, plus the paper-organisation table post-processed against it.
papers = read_papers()
_raw_papers_orgs = read_papers_orgs()
papers_orgs = paper_orgs_processing(_raw_papers_orgs, papers)

# Lookups used to label arXiv categories and to map topics to categories.
arxiv_categories = read_arxiv_cat_lookup()
topic_category_map = read_topic_category_map()

In [None]:
# Topic mix table, indexed by article id so papers can be selected via `.index`.
topic_mix = read_topic_mix()
topic_mix = topic_mix.set_index('article_id')

In [None]:
# Load the pre-computed diversity contribution table from the processed data folder.
div_contr = pd.read_csv(f"{project_dir}/data/processed/diversity_contribution.csv")

In [None]:
# Derive the publication year from each row's `date` value.
papers_orgs['year'] = [stamp.year for stamp in papers_orgs['date']]

## Analysis

In [None]:
#div_contr['category'] = div_contr['topic'].map(topic_category_map)

In [None]:
pset = 'param_set_1'
var = 'balance'

# Keep only the 'presence' contribution method for the selected metric.
_presence = div_contr.query("diversity_contribution_method == 'presence'")
_presence = _presence.query(f"metric == '{var}'")

# One row per topic, one column per parameter set; z-score each column.
_by_param_set = _presence.pivot_table(
    index=['topic'], columns=['parametre_set'], values='div_contr')
_standardised = _by_param_set.apply(zscore).reset_index(drop=False)

# Topics ordered from highest to lowest standardised contribution under `pset`.
pres = (
    _standardised[['topic', pset]]
    .sort_values(pset, ascending=False)
    .reset_index(drop=True)
)

In [None]:
def _dominant_topic_counts(article_ids):
    """Count papers by their highest-weight topic, restricted to `article_ids`."""
    selected = topic_mix.loc[topic_mix.index.isin(set(article_ids))]
    return selected.idxmax(axis=1).value_counts().to_dict()


# Recent papers: AI papers from 2019 onwards
_from_2019 = papers_orgs['year'] >= 2019
paper_orgs_rec = papers_orgs.loc[_from_2019].query("is_ai==True")

topics_rec = topic_mix.loc[topic_mix.index.isin(set(paper_orgs_rec['article_id']))]

# Topic distribution (all)
paper_counts_all = topics_rec.idxmax(axis=1).value_counts().to_dict()

# Topic distribution (academic first, then corporate)
paper_counts_ac, paper_counts_corp = (
    _dominant_topic_counts(
        paper_orgs_rec.query(f"org_type=='{org}'")['article_id'])
    for org in ['Education', 'Company']
)

In [None]:
def _topic_counts_for_orgs(org_names):
    """Count recent papers by highest-weight topic for the given organisations."""
    in_orgs = paper_orgs_rec['org_name'].isin(set(org_names))
    article_ids = set(paper_orgs_rec.loc[in_orgs]['article_id'])
    selected = topic_mix.loc[topic_mix.index.isin(article_ids)]
    return selected.idxmax(axis=1).value_counts().to_dict()


# Topic distribution (top / bottom distr)
# Papers per organisation, keeping organisations with more than 10 papers and
# binning them by paper count.
# `rename_axis('index')` pins the organisation column name to 'index': without
# it the column is called 'index' on pandas < 2.0 but 'org_name' on pandas >= 2.0.
# NOTE: `q` has steps of 0.2, so the bins are quintiles even though the labels
# below say "quartile". With `duplicates='drop'` fewer than five bins may be
# produced, in which case `rank==4` selects nothing.
papers_orgs_quant = (
    paper_orgs_rec.query("is_ai==True")['org_name']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='count')
    .query("count>10")
    .assign(rank=lambda x: pd.qcut(x['count'],
                                   q=np.arange(0,1.1,0.2),
                                   labels=False,
                                   duplicates='drop'))
)

# Highest bin first, lowest bin second.
papers_orgs_quant_top, papers_orgs_quant_bot = (
    _topic_counts_for_orgs(papers_orgs_quant.query(f"rank=={q}")['index'])
    for q in [4, 0]
)

# Topics missing from the map get NaN (column assignment aligns on the index,
# so dropping NaNs before assigning would have no effect).
pres['category'] = pres['topic'].map(topic_category_map)

names = ['all','academic','corporate','top_quartile','bottom_quartile']
_count_dicts = [paper_counts_all, paper_counts_ac, paper_counts_corp,
                papers_orgs_quant_top, papers_orgs_quant_bot]

# For each group: share of papers per topic, accumulated along the current row
# order of `pres`.
for n, d in zip(names, _count_dicts):
    pres[n] = pres['topic'].map(d).fillna(0)
    pres[n] = (pres[n]/pres[n].sum()).cumsum()

pres['order'] = np.arange(0,len(pres))

# Long format: one row per (order, group) pair for plotting.
pres_cumul = pres[['order']+names].melt(id_vars='order',var_name='variable')



In [None]:
# Bar chart: standardised diversity contribution per topic, coloured by category.
_topic_x = alt.X(
    'topic',
    axis=alt.Axis(labels=False, ticks=False),
    sort=alt.EncodingSortField(pset, order='descending'),
)
_category_colour = alt.Color(
    'category',
    sort=alt.EncodingSortField(pset, op='mean', order='descending'),
    legend=alt.Legend(columns=2),
)
div = (
    alt.Chart(pres)
    .mark_bar()
    .encode(x=_topic_x, y=pset, color=_category_colour, tooltip=['topic'])
    .properties(width=400, height=200)
)

# Line chart: the cumulative shares held in `pres_cumul`, one line per group.
_order_x = alt.X(
    'order',
    scale=alt.Scale(domain=[0, 550]),
    axis=alt.Axis(grid=False),
)
contr = (
    alt.Chart(pres_cumul)
    .mark_line()
    .encode(x=_order_x, y='value', color='variable')
    .properties(width=400, height=200)
)

# Stack the two charts; each keeps its own colour scale and legend.
alt.vconcat(div, contr).resolve_scale(color='independent')

### Org profiles

In [None]:
# Paper-organisation rows flagged as AI (all years).
papers_orgs_ai = papers_orgs.query("is_ai==True")

In [None]:
# Number of papers with a non-zero weight for each topic (column of `topic_mix`).
# A vectorised comparison replaces `applymap`, which is deprecated in recent
# pandas and calls a Python function once per cell.
ordered_cats = (topic_mix > 0).sum()

In [None]:
# Attach a category to every topic count, then rank the categories by their
# mean count (highest first).
_topic_counts = ordered_cats.reset_index(name='count')
_topic_counts['cat'] = _topic_counts['index'].map(topic_category_map)
_mean_count_by_cat = _topic_counts.groupby('cat')['count'].mean()
my_ordered_cats = _mean_count_by_cat.sort_values(ascending=False).index.tolist()

In [None]:
def strip_plot(df,ordered_cats,name):
    """Build a jittered strip plot with one narrow column per category.

    Args:
        df: data for the chart; the encodings read the fields `ratio`,
            `levels`, `index` and `cat_sel`.
        ordered_cats: order in which the `cat_sel` columns are laid out.
        name: chart title.

    Returns:
        The configured Altair chart.
    """
    # Horizontal position is random noise only, so the axis shows a single
    # tick and no labels or grid.
    jitter_x = alt.X(
        'jitter:Q',
        title=None,
        axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
        scale=alt.Scale())
    ratio_y = alt.Y('ratio:Q', title='Specialisation')
    paper_size = alt.Size('levels', title=['Number', 'of papers'])
    category_colour = alt.Color(
        'cat_sel:N', legend=None, scale=alt.Scale(scheme='tableau10'))
    category_header = alt.Header(
        labelFontSize=12,
        labelAngle=270,
        titleOrient='top',
        labelOrient='bottom',
        labelAlign='center',
        labelPadding=25)
    category_column = alt.Column(
        'cat_sel:N',
        title='arXiv category',
        sort=ordered_cats,
        header=category_header)

    points = alt.Chart(df).mark_circle(size=14, stroke='grey', strokeWidth=0.5)
    encoded = points.encode(
        x=jitter_x,
        y=ratio_y,
        tooltip=['index'],
        size=paper_size,
        color=category_colour,
        column=category_column)

    # Gaussian jitter generated with a Box-Muller transform.
    jittered = encoded.transform_calculate(
        jitter='sqrt(-2*log(random()))*cos(2*PI*random())')

    return (
        jittered
        .configure_facet(spacing=0)
        .configure_view(stroke=None)
        .configure_axis(labelFontSize=12, titleFontSize=12)
        .properties(title=name, width=10, height=200))
    

In [None]:
def make_chart_topic_spec(data,variable,value,ordered_cats=None):
    """Plot the topic specialisation profile of the papers matching a filter.

    Args:
        data: paper-organisation table; must contain `variable` and
            `article_id` columns.
        variable: name of the column to filter on (e.g. 'org_type').
        value: value of `variable` to keep; also used as the chart title.
        ordered_cats: optional ordering of the category columns. When omitted
            or empty, categories are ordered by their summed `levels` in the
            topic representation, highest first.

    Returns:
        The strip plot produced by `strip_plot`.
    """
    logging.info("Extracting IDs %s", value)
    # Use the table passed in rather than a module-level frame, so the `data`
    # argument actually controls which papers are selected.
    _ids = set(data.loc[data[variable] == value]['article_id'])

    # `topic_rep` returns a sequence; only its first element is used here.
    # Rows with missing values are dropped.
    rep = topic_rep(_ids, topic_mix, cats,
                    topic_mix.columns, topic_category_map)[0].dropna().reset_index(drop=True)

    if ordered_cats is None or len(ordered_cats) == 0:
        ordered_cats = (
            rep.groupby('cat_sel')['levels']
            .sum()
            .sort_values(ascending=False)
            .index.tolist())

    logging.info("Plotting")
    return strip_plot(rep, ordered_cats, value)
    


In [None]:
# Specialisation profiles for two organisation types and two named
# organisations, all drawn from the AI-paper table `papers_orgs_ai`.
_profiles = [
    ('org_type', 'Company'),
    ('org_type', 'Education'),
    ('org_name', 'Google'),
    ('org_name', 'Massachusetts Institute of Technology'),
]

c, a, g, m = [make_chart_topic_spec(papers_orgs_ai, v, n) for v, n in _profiles]

In [None]:
c

In [None]:
a

In [None]:
g

In [None]:
m