# Diversity: topic view

What topics contribute to diversity?



## Preamble

In [None]:
# Execute the shared notebook preamble script. Names such as `pd` and
# `project_dir` are used below without being imported in this notebook, so
# they presumably come from this script — TODO confirm.
%run ../notebook_preamble.ipy
# Switch off jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False


# Silence pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None 

In [None]:
# Uncomment and run if you want to save figures
# driv = altair_visualisation_setup()

In [None]:
import altair as alt
import random
import logging
import yaml

from narrowing_ai_research.utils.list_utils import *
from narrowing_ai_research.utils.altair_utils import *
from narrowing_ai_research.utils.read_utils import *
from narrowing_ai_research.transformers.diversity import Diversity, remove_zero_axis
from narrowing_ai_research.paper.s5_network_view import read_process_data

# Lift Altair's default row limit so that larger dataframes can be charted.
alt.data_transformers.disable_max_rows()

## Read data

### Metadata

In [None]:
# Reuse the diversity parameters defined for section 4 of the paper so that
# this notebook is consistent with the main diversity analysis
config_path = f"{project_dir}/paper_config.yaml"

with open(config_path) as config_file:
    div_params = yaml.safe_load(config_file)['section_4']['div_params']

### Data

In [None]:
# Load the inputs through the reader imported from the s5_network_view module.
# Further down, `papers` is used as a DataFrame with `year`, `is_ai` and
# `article_id` columns and `topic_mix` as a DataFrame with an `article_id`
# column; the other two objects are not used in the visible part of this
# notebook.
papers, topic_mix, topic_category_map, arxiv_cat_lookup = read_process_data()

## Analysis

In [None]:
# Restrict the corpus to AI papers from 2020 onwards
is_recent_ai = (papers['year'] >= 2020) & (papers['is_ai'] == True)
papers_rec = papers.loc[is_recent_ai]

# Topic mix of those papers, indexed by article id, after remove_zero_axis
recent_ids = set(papers_rec['article_id'])
in_recent = topic_mix['article_id'].isin(recent_ids)
topics_rec = remove_zero_axis(topic_mix.loc[in_recent].set_index('article_id'))

In [None]:
def _diversity_metric(topic_mix, metric, params, name):
    '''Run the `metric` method of a Diversity object built on `topic_mix`
    and return the value it stores in its `.metric` attribute.
    '''
    d = Diversity(topic_mix, name)
    getattr(d, metric)(params)
    return d.metric


def topic_diversity_contribution(topic_mix,metric,params,name,method='max',threshold=None):
    '''Compares the diversity of the corpus with / without the papers that have a topic

    For every topic (column) the diversity metric is recomputed after dropping
    the papers associated with that topic; the contribution of the topic is the
    benchmark diversity of the full corpus minus the diversity of the reduced one.

    Args:
        topic_mix: DataFrame with one row per paper and one column per topic.
            With method='max' the index must be named `article_id`.
        metric: name of the Diversity method to call. It is called with
            `params` and its result is read from the `.metric` attribute.
        params: parameters passed to the metric method.
        name: label of the parameter set, passed to Diversity and stored in
            the output.
        method: if 'max', a paper belongs only to its highest-weight topic.
            Any other value means that a paper has a topic when its weight
            is above `threshold`.
        threshold: weight above which a paper has a topic. Required when
            method is not 'max'.

    Returns:
        A list with one pd.Series per topic, with fields `topic`, `presence`
        (number of papers associated with the topic), `div_contr`, `metric`
        and `parametre_set`.

    Raises:
        ValueError: if method is not 'max' and no threshold is given.
    '''
    # Fail early and explicitly instead of hitting an obscure comparison
    # error against None after the benchmark has already been computed
    if method != 'max' and threshold is None:
        raise ValueError("A threshold is required when method is not 'max'")

    print(params)

    bench = _diversity_metric(topic_mix, metric, params, name)

    if method == 'max':
        # Set of papers for which each topic is the highest-weight one
        papers_max_topic = topic_mix.idxmax(axis=1).reset_index(name='topic').groupby(
            'topic')['article_id'].apply(lambda x: set(x))
    else:
        # Vectorised equivalent of an element-wise `x > threshold`
        paper_bin = topic_mix > threshold

    results = []

    for n,t in enumerate(topic_mix.columns):
        if n%20==0:
            logging.info(f"processed {n} topics")

        if method == 'max':
            if t in papers_max_topic.index:
                presence = len(papers_max_topic[t])
                with_topic = topic_mix.index.isin(papers_max_topic[t])
            else:
                # The topic is never the top topic of a paper: nothing to remove
                presence = 0
                with_topic = None
        else:
            presence = int(paper_bin[t].sum())
            with_topic = paper_bin[t].to_numpy()

        if with_topic is None:
            difference = 0
        else:
            # A boolean mask keeps exactly the rows without the topic, also
            # when index labels are repeated
            topic_mix_reduced = remove_zero_axis(topic_mix.loc[~with_topic])
            difference = bench - _diversity_metric(topic_mix_reduced, metric, params, name)

        out = pd.Series([t,presence,difference,metric,name],index=['topic','presence',
                                                                       'div_contr','metric','parametre_set'])
        results.append(out)

    return results

In [None]:
def topic_diversity_calculation_all(topic_mix,param_dict,method='max',threshold=None):
    '''Calculates topic diversity contributions for every metric and parameter set

    Args:
        topic_mix: paper x topic table passed on to topic_diversity_contribution
        param_dict: maps a metric name to an iterable of parameter sets
        method: passed on to topic_diversity_contribution
        threshold: passed on to topic_diversity_contribution

    Returns:
        A list with one element (the output of topic_diversity_contribution)
        per metric / parameter set combination, in iteration order. The i-th
        parameter set of each metric is labelled `param_set_{i}`.
    '''
    collected = []

    for metric_name, parameter_sets in param_dict.items():
        logging.info(metric_name)

        # One result list per parameter set, appended in order
        collected.extend(
            topic_diversity_contribution(
                topic_mix,
                metric_name,
                parameter_set,
                f'param_set_{position}',
                method=method,
                threshold=threshold,
            )
            for position, parameter_set in enumerate(parameter_sets)
        )

    return collected

In [None]:
# Contribution of every topic to diversity using the default 'max' method,
# for each metric / parameter set in div_params
diversity_contribution_max = topic_diversity_calculation_all(topics_rec,div_params)

In [None]:
# Stack the per-parameter-set results into a single table tagged with the method
max_frames = [pd.DataFrame(rows) for rows in diversity_contribution_max]
diversity_contribution_df = pd.concat(max_frames).assign(method='max')

# Save the table ordered by the number of papers associated with each topic
max_output_path = f"{project_dir}/data/processed/diversity_contribution_max.csv"
diversity_contribution_df.sort_values('presence').to_csv(max_output_path, index=False)

In [None]:
# Same analysis, but a paper has a topic when its weight is above 0.1
diversity_contribution_pres = topic_diversity_calculation_all(
    topics_rec,div_params,method='pres',threshold=0.1)

# Stack the per-parameter-set results into a single table before tagging it
# (the table has to exist before the `method` column can be assigned)
diversity_contribution_pres_df = pd.concat(
    [pd.DataFrame(x) for x in diversity_contribution_pres])
diversity_contribution_pres_df['method']='pres'

# Uncomment to save the table
# diversity_contribution_pres_df.to_csv(
#     f"{project_dir}/data/processed/diversity_contribution_pres.csv",index=False)