# Section 5: Quality and track record

* What are the levels of quality (impact) of Covid AI research papers?
* What are the levels of experience of AI researchers focusing on Covid?
* How do the above differ between COVID-19 research clusters?

## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import random
import altair as alt
from toolz.curried import *
from ast import literal_eval
from scipy.stats import ttest_ind, mannwhitneyu
from ai_covid_19.utils.utils import *


In [None]:
def citation_distr(_vector,bins=[0,1,2,3,5,10,20,100,1000]):
    '''Bins citations according to intervals and returns the share of papers in each

    Args:
        _vector: distribution of citations (e.g. a series with citation counts)
        bins: (list) edges of the intervals for binning. Intervals are closed on the
            left and open on the right e.g. [0,1), [1,2)...

    Returns a series with the share of observations in each interval. The index
    contains labels with the form 'lower_upper' (e.g. '0_1'). Values outside the bins
    are not counted.

    '''

    #Work on a copy so that the default list is never modified. list() also lets
    #callers pass a tuple or an array
    bins_ = list(bins)

    cut = pd.cut(_vector,bins=bins_,right=False,include_lowest=True)

    out = cut.value_counts(normalize=True)

    #Turn intervals such as '[5, 10)' into labels such as '5_10'. The pattern is a raw
    #string so that '\[' and '\)' are not read as (invalid) string escape sequences
    out.index = ['_'.join([x.strip() for x in re.sub(r'\[|\)','',str(inter)).split(',')])
                 for inter in out.index]

    return(out)

def get_author_profile(x,reference_year=2020):
    '''Extract an author track record

    Args:
        x (df) is a df with the publications that the author has been involved in. It
            needs to contain the columns 'citation_count' and 'year'
        reference_year (int) is the year used to measure experience. It defaults to
            2020, the value that was previously hard-coded

    Returns a series with the number of papers she has authored (paper_n), her citation
    median and mean and her experience (number of years between her first paper in the
    data and the reference year)

    '''

    stats = [len(x),
             x['citation_count'].median(),
             x['citation_count'].mean(),
             reference_year-x['year'].min()]

    return(pd.Series(stats,
                     index=['paper_n','citation_median','citation_mean','experience']))


def make_mean_comp(table,var_name,table_name):
    '''Reshapes a table of group statistics into a long table that is easy to compare

    Args:
        table (df) with the groups in the index (named var_name) and one column per statistic
        var_name (str) name of the index with the groups
        table_name (str) label stored in the 'table' column of the output

    Returns a long df with the columns category (the group), statistic, value,
    variable (a 'var_name: group' label) and table

    '''

    long_table = (table
                  .reset_index(drop=False)
                  .melt(id_vars=var_name)
                  .rename(columns={var_name:'category','variable':'statistic'}))

    #Label that combines the name of the grouping variable and the group
    long_table['variable'] = [f"{var_name}: {group}" for group in long_table['category']]
    long_table['table'] = table_name

    return(long_table)

def get_tests_table(table,variable_name,test=ttest_ind,
                    stat_names=('paper_n','citation_mean','citation_median','experience')):
    '''Tests differences between two groups of observations for a set of statistics

    Args:
        table (df) with one row per observation, a boolean column (variable_name) and
            one column per statistic
        variable_name (str) name of the boolean column that splits the table in two groups
        test (function) that takes two samples and returns a result whose second
            element is the p-value (e.g. ttest_ind or mannwhitneyu)
        stat_names (list) columns to test. It is an argument so that the function does
            not depend on a global variable defined elsewhere in the notebook

    Returns a dict where keys are the statistics and values are the p-values

    '''

    is_true = table[variable_name]==True
    is_false = table[variable_name]==False

    res = {}

    for x in stat_names:

        t_1 = test(table.loc[is_true][x],
                   table.loc[is_false][x])

        #The p-value is the second element of the test result
        res[x] = t_1[1]

    return(res)
    



## 1. Read data

In [None]:
#All arXiv data. Paper ids are read as strings and the AI / Covid flags as booleans
rxiv = pd.read_csv(
    f"{data_path}/processed/rxiv_metadata.csv",
    dtype={'id':str,'is_ai':bool,'is_covid':bool},
).pipe(preview)

In [None]:
#Create the covid df: papers flagged as Covid, with a fresh index
cov = (rxiv
       .query("is_covid == True")
       .reset_index(drop=True)
       .pipe(preview))

In [None]:
#Create a paper-cluster membership lookup (article id -> cluster).
#If an article appears more than once we keep its first row
cluster_lookup = (pd.read_csv(f"{data_path}/processed/covid_semantic.csv",
                              dtype={'article_id':str})
                  .drop_duplicates('article_id')
                  .set_index('article_id')['cluster']
                  .to_dict())

## 2. Data analysis

### 1. Covid vs non covid citations

How do the levels of citations for Covid and non-Covid research compare?

#### Cited / non-cited comparison

In [None]:
#Papers published in 2020 labelled with their cluster (missing for papers that are not in
#the cluster lookup). The column is created with assign so that we get a new dataframe
#instead of writing into a slice of rxiv (which raises a SettingWithCopyWarning)
rxiv_2020 = rxiv.query('year == 2020').assign(
    cluster=lambda df: df['id'].map(cluster_lookup))

In [None]:
#Mean citations for every combination of Covid (rows) and AI (columns) status
(rxiv_2020
 .groupby(['is_covid','is_ai'])['citation_count']
 .mean()
 .reset_index(drop=False)
 .pivot_table(index='is_covid',columns='is_ai'))

In [None]:
#Mean citations by Covid status, AI status and article source
cit_groups = (rxiv_2020
              .groupby(['is_covid','is_ai','article_source'])['citation_count']
              .mean()
              .reset_index())

#Small multiples: one row per source, one column per AI status
(alt.Chart(cit_groups)
 .mark_bar()
 .encode(x='is_covid:N',
         y='citation_count',
         column='is_ai:N',
         row='article_source')
 .properties(height=100,width=50))

In [None]:
#Comparison of papers with at least one citation: Covid vs non Covid by article source.
#rep is the ratio between the share of cited Covid papers and the share of cited
#non-Covid papers. The columns of the pivot are the boolean values of is_covid, so we
#select them with True / False instead of 1 / 0
(rxiv_2020
 .assign(has_cit = lambda x: x['citation_count']>0)
 .groupby(['article_source','is_covid'])['has_cit']
 .mean()
 .reset_index(name='share')
 .pivot_table(index='article_source',columns='is_covid',values='share')
 .assign(rep = lambda x: x[True]/x[False]))

#### Full distribution of citations by source

In [None]:
#Lookup between the interval labels created by citation_distr and the labels used in the
#chart. The numeric prefix keeps the categories in the right order when they are sorted
int_lookup = {'0_1':'0: 0-1',
              '1_2':'1: 1-2',
              '2_3':'2: 2-3',
              '3_5':'3: 3-5',
              '5_10':'4: 5-10',
              '10_20':'5: 10-20',
              '20_100':'6: 20-100',
              '100_1000':'7: >100'}

In [None]:
#Distribution of citations (share of papers in each interval) by Covid status, AI status
#and article source
rxiv_cit_distrs = (rxiv_2020
                   .groupby(['is_covid','is_ai','article_source'])['citation_count']
                   .apply(citation_distr)
                   .reset_index(drop=False)
                   .pipe(preview))

#After resetting the index the interval labels are in 'level_3'
rxiv_cit_distrs['int_sorted'] = rxiv_cit_distrs['level_3'].map(int_lookup)

In [None]:
#Clean up the variable names and categories
for converter,variable in zip([convert_covid,convert_ai,convert_source],
                              ['is_covid','is_ai','article_source']):
    rxiv_cit_distrs[variable] = converter(rxiv_cit_distrs[variable])

#Express the shares as percentages. The output of make_pc is used in the chart tooltip
rxiv_cit_distrs['citation_count'] = rxiv_cit_distrs['citation_count']*100
rxiv_cit_distrs['% of papers'] = make_pc(rxiv_cit_distrs['citation_count'])

In [None]:
#Chart: stacked bars with the distribution of citations for Covid and non-Covid papers.
#Columns are article sources and rows are AI categories
b = (alt
     .Chart(rxiv_cit_distrs)
     .mark_bar(stroke='black',strokeWidth=0.5)
     .encode(
         x=alt.X('is_covid:N',title=''),
         y=alt.Y('citation_count',title='% of papers'),
         color=alt.Color('int_sorted:O',
                         scale=alt.Scale(scheme='orangered'),
                         title=['Number of','citations']),
         column=alt.Column('article_source',title='Source'),
         row=alt.Row('is_ai:N',title=''),
         tooltip=['% of papers'])
     .properties(height=77,width=100)
     .resolve_scale(color='shared'))

b.save(f"{fig_path}/fig_10.html")

b

#### Citations by cluster

In [None]:
#Focus on covid papers for which we have cluster information
rxiv['cluster'] = rxiv['id'].map(cluster_lookup)
cov = (rxiv
       .query('is_covid==True')
       .reset_index(drop=True)
       .dropna(axis=0,subset=['cluster']))

#List of top 12 clusters in terms of AI publications
top_ai_clusters = (cov.query('is_ai==1')['cluster']
                   .value_counts()
                   .sort_values(ascending=False)
                   .index[:12]
                   .tolist())

In [None]:
#Top AI clusters sorted by the mean citations of their Covid papers (in descending order)
cit_sorted = clean_cluster(
    [x for x in cov.groupby('cluster')['citation_count'].mean().sort_values(ascending=False).index
     if x in top_ai_clusters])

#Clean variable names
cov['cluster'] = clean_cluster(cov['cluster'])

#Papers in the top clusters. We take a copy so that the columns we add below are written
#into an independent dataframe and not into a slice of cov
top_clust_cov = cov.loc[cov['cluster'].isin(cit_sorted)].copy()

#Position of each cluster in the citation ranking (0 is the most cited)
top_clust_cov['rank'] = top_clust_cov['cluster'].map({c:n for n,c in enumerate(cit_sorted)})

In [None]:
#NOTE(review): the cluster names in cov had already been passed through clean_cluster
#before top_clust_cov was created - confirm that applying it a second time is harmless
top_clust_cov['cluster'] = clean_cluster(top_clust_cov['cluster'])
#Replace the AI flag with the output of convert_ai (presumably readable labels - TODO confirm)
top_clust_cov['is_ai'] = convert_ai(top_clust_cov['is_ai'])

In [None]:
#Calculate citation means (rounded to two decimals) by AI category and cluster
citation_means = (top_clust_cov
                  .groupby(['is_ai','cluster'])['citation_count']
                  .mean()
                  .round(2)
                  .reset_index(name='Citation mean'))

#Merge with the cluster info so every paper carries the mean of its group
top_clust_cov_2 = top_clust_cov.merge(citation_means,on=['is_ai','cluster'])

In [None]:
#One chart per cluster, following the order of cit_sorted. Each chart compares the
#citations of the papers in the two AI categories of the cluster
out = []

for c in cit_sorted:

    #Papers in the cluster
    base = alt.Chart(top_clust_cov_2).transform_filter(alt.datum.cluster==c)

    #Points sized by the number of publications with each number of citations
    l1 = (base
          .mark_point(opacity=0.5,stroke='black',strokeWidth=2,filled=True)
          .encode(x=alt.X('is_ai:N',title=''),
                  y=alt.Y('citation_count:Q',title=['Number','of citations']),
                  size=alt.Size('count()',
                                scale=alt.Scale(range=[0,100],type='log'),
                                title=['Number', 'of publications']),
                  color=alt.Color('is_ai:N',title='Category'),
                  tooltip=['Citation mean:Q']))

    #Ticks marking the citation values
    l2 = (base
          .mark_tick(strokeWidth=1,opacity=0.7)
          .encode(x='is_ai:N',y='citation_count:Q',
                  color='is_ai:N',tooltip=['Citation mean:Q']))

    out.append((l1+l2).properties(height=100,width=150,title=c))

#Arrange things so they fit in a page: rows of three charts. Slicing by position works
#for any number of clusters and never creates an empty row
charts_per_row = 3
out_threes = [out[i:i+charts_per_row] for i in range(0,len(out),charts_per_row)]

r = [alt.hconcat(*p) for p in out_threes]

fin_fig = alt.vconcat(*r)

fin_fig.save(f"{fig_path}/fig_11.html")

fin_fig

In [None]:
#Calculate citation means by group: how often is the AI mean equal to or above the non-AI mean?
#The columns of the pivot are the boolean values of is_ai, so we refer to them with
#True / False instead of 1 / 0
citation_means = (rxiv_2020
                  .query('is_covid==True')
                  .groupby(['cluster','is_ai'])['citation_count']
                  .mean()
                  .reset_index(name='statistic')
                  .pivot_table(index='cluster',columns='is_ai',values='statistic')
                  .loc[top_ai_clusters[:10]]
                  .sort_values(True,ascending=False))

#Clusters without a mean for one of the groups count as False
citation_means['status'] = citation_means[True]>=citation_means[False]

#Share of clusters where the AI mean is equal to or above the non-AI mean
citation_means['status'].mean()

### 2. Track record of authors

Here we compare the publication records of authors focusing on different COVID-19 topics and in different categories (e.g. AI vs. non-AI).

In [None]:
#Focus on papers with authors
rxiv_auth = rxiv.dropna(axis=0,subset=['mag_authors'])

#mag_authors is stored as a string so we parse it into Python objects first
rxiv_auth['mag_authors'] = rxiv_auth['mag_authors'].map(literal_eval)

#Each parsed element is a list with one dict per author: keep the author ids
rxiv_auth['author_id'] = rxiv_auth['mag_authors'].map(
    lambda authors: [author['author_id'] for author in authors])

In [None]:
#Create the lookup between papers and authors: one row per paper - author pair
paper_author_lookup = (rxiv_auth[['id','author_id']]
                       .explode('author_id')
                       .pipe(preview))

In [None]:
#Ids of covid, ai and covid ai papers
#NOTE(review): at this point cov only contains Covid papers with cluster information, so
#Covid papers without a cluster are not included in the covid sets - confirm this is intended
covid_paper_ids = set(cov['id'])
ai_paper_ids = set(rxiv.query('is_ai == 1')['id'])
covid_ai_paper_ids = set(cov.query('is_ai == 1')['id'])

#Get sets of authors with at least one covid, ai, covid ai paper
cov_paper_auths,ai_paper_auths,covid_ai_paper_auths = [
    set(paper_author_lookup.loc[paper_author_lookup['id'].isin(ids),'author_id'])
    for ids in [covid_paper_ids,ai_paper_ids,covid_ai_paper_ids]]

In [None]:
#Merge the paper metadata with the paper - author lookup and drop rows without an author id.
#The result has one row per paper - author pair
trajectories = (rxiv
                .merge(paper_author_lookup,on='id')
                .dropna(axis=0,subset=['author_id']))

#### Covid and non-Covid authors.

In [None]:
#Focus on non Covid publications in recent years (2018 and 2019)
trajectories_non_covid = trajectories.loc[~trajectories['id'].isin(covid_paper_ids)]
trajectories_recent = trajectories_non_covid.query("(year == 2018) | (year == 2019)")

#Set of authors that have published in each cluster
author_cluster = trajectories.groupby('cluster')['author_id'].apply(lambda x: set(x))

#Extract researcher profiles (focusing on those with publications in 2018 or 2019)
#Note - this takes some time to run
author_profile = trajectories_recent.groupby('author_id').apply(get_author_profile)

#Label authors with whether they have one covid, one ai or one covid_ai paper
author_profile['has_cov'], author_profile['has_ai'],author_profile['has_covid_ai'] = [
    author_profile.index.isin(group) for group in [cov_paper_auths,ai_paper_auths,covid_ai_paper_auths]]

#Label them with the clusters where they have published.
#We use items() because Series.iteritems() is not available in recent versions of pandas
author_profile['cluster'] = [[cid for cid,cgr in author_cluster.items() if auth in cgr] for
                             auth in author_profile.index]

In [None]:
#Here we compare the statistics for authors in various categories
stat_names = ['paper_n','citation_mean','citation_median','experience']

#Create tables that compare the mean of each statistic between:
#authors with and without Covid papers (all authors)
cov_comp = author_profile.groupby('has_cov')[stat_names].mean()
#authors with and without Covid AI papers (authors with AI papers)
ai_comp = (author_profile
           .query('has_ai == True')
           .groupby('has_covid_ai')[stat_names]
           .mean())
#authors with and without Covid AI papers (authors with Covid papers)
cov_ai_comp = (author_profile
               .query('has_cov == True')
               .groupby('has_covid_ai')[stat_names]
               .mean())

tables = [cov_comp,ai_comp,cov_ai_comp]
var_names = ['has_cov','has_covid_ai','has_covid_ai']
table_names = ['all_papers','all_ai_papers','all_covid_papers']

#Reshape every table into a long format and stack them
all_tables = pd.concat([make_mean_comp(*spec) for spec in zip(tables,var_names,table_names)])

all_tables.head()

In [None]:
#Here we test the statistical significance of the differences between groups with a
#Mann-Whitney U test. There is one row per corpus and one column per statistic
test_df = pd.DataFrame(
    [get_tests_table(table,variable_name,mannwhitneyu)
     for table,variable_name in zip(
         [author_profile,
          author_profile.query('has_ai == True'),
          author_profile.query('has_cov == True')],
         ['has_cov','has_covid_ai','has_covid_ai'])],
    index=['all_papers','all_ai_papers','all_covid_papers'])

#Turn into a long df so we can merge with the means table
test_df_long = (test_df
                .reset_index(drop=False)
                .melt(id_vars='index',var_name='statistic',value_name='significant'))

#p-values below 0.05 are flagged as significant
test_df_long['significant'] = test_df_long['significant'].lt(0.05)

all_tables_tests = all_tables.merge(test_df_long,
                                    left_on=['statistic','table'],
                                    right_on=['statistic','index'])

In [None]:
#Tidy up variable names for the chart

#Columns that we relabel (note that this reuses the name var_names)
var_names = ['statistic','variable','table','index','significant']

#Lookups between variables
stat_lookup = {'paper_n':'Number of papers',
               'citation_mean':'Citation (mean)',
               'citation_median':'Citation (median)',
               'experience':'Experience'}

#NOTE(review): in the AI research corpus authors with has_covid_ai == False are AI authors
#without Covid AI papers, so 'COVID-19 and not AI' looks wrong for them - confirm the label
var_lookup = {'has_cov: False':'Not COVID-19',
              'has_cov: True':'COVID-19',
              'has_covid_ai: True': 'COVID-19 and AI',
              'has_covid_ai: False': 'COVID-19 and not AI'}

table_lookup = {'all_papers':'All research',
                'all_ai_papers':'AI research',
                'all_covid_papers':'COVID-19 research'}

significant = {True:'Significant',False:'Insignificant'}

#Convert variables using the lookups (the table lookup applies to 'table' and 'index')
lookups = [stat_lookup,var_lookup,table_lookup,table_lookup,significant]

for column,lookup in zip(var_names,lookups):
    all_tables_tests[column] = convert_var(all_tables_tests[column],lookup)

#Create a rounded variable for tooltips
all_tables_tests['value_label'] = [str(np.round(x,2)) for x in all_tables_tests['value']]

#We are not interested in the experience variable (we are focusing on authors with recent publications)
all_tables_tests = all_tables_tests.query("statistic != 'Experience'")

In [None]:
#Make chart: bars with the mean of each statistic (columns) for the groups of authors in
#each corpus (rows), coloured by the significance of the difference between groups
#NOTE(review): the colour range has no explicit domain, so the colour of each category
#presumably depends on which categories are present in the data - confirm
mean_comp = (alt.Chart(all_tables_tests)
             .mark_bar(height=15,stroke='black')
             .encode(
                 y=alt.Y('variable:N',title=''),
                 x=alt.X('value:Q',title='Score'),
                 color=alt.Color('significant',
                                 scale=alt.Scale(range=['lightpink','steelblue']),title='Significance'),
                 column=alt.Column('statistic:N'),
                 #The names need to match the values of the table variable exactly
                 row=alt.Row('table:N',
                             sort=['All research','AI research','COVID-19 research'],
                             title='Corpus'),
                 tooltip=['value_label'])
             .resolve_scale(y='independent',x='shared').properties(height=70,width=70))

mean_comp.save(f"{fig_path}/fig_13.html")

mean_comp

#### AI and non-AI authors between clusters

In this case we want to consider the full trajectory of researchers working in Covid, not just the most recent two years, but excluding Covid papers

In [None]:
#Focus on the non-Covid papers of Covid authors in recent years (2018 and 2019).
#The mask is built from the dataframe that we filter so that they share the same index
#NOTE(review): the text above mentions the full trajectory of researchers but the filter
#below keeps 2018 and 2019 only - confirm which one is intended
trajectories_covid_authors = trajectories_non_covid.loc[
    trajectories_non_covid['author_id'].isin(cov_paper_auths)]
trajectories_covid_authors = trajectories_covid_authors.query('(year > 2017) & (year <2020)')

In [None]:
#Extract author profile
author_profile_cov = trajectories_covid_authors.groupby('author_id').apply(get_author_profile)

#Label authors with whether they have a covid_ai paper
author_profile_cov['has_covid_ai'] = author_profile_cov.index.isin(covid_ai_paper_auths)

#Label them with the clusters where they have published.
#We use items() because Series.iteritems() is not available in recent versions of pandas
author_profile_cov['cluster'] = [[cid for cid,cgr in author_cluster.items() if auth in cgr] for
                                 auth in author_profile_cov.index]

In [None]:
#Author profile in cluster

#Explode the dataframe by the author cluster: one row per author - cluster pair
author_profile_exploded = author_profile_cov.explode('cluster')

#Calculate means for each cluster and Covid AI status, keeping the first ten top AI clusters
prof_clust = (author_profile_exploded
              .groupby(['cluster','has_covid_ai'])[stat_names]
              .mean()
              .loc[top_ai_clusters[:10]]
              .reset_index(drop=False))

In [None]:
#Calculate statistical significance of differences

#One row per cluster with the p-value of the Mann-Whitney U test for each statistic
cluster_test_df = pd.DataFrame(
    [get_tests_table(author_profile_exploded.loc[author_profile_exploded['cluster']==x],
                     'has_covid_ai',mannwhitneyu)
     for x in top_ai_clusters[:10]],
    index=top_ai_clusters[:10])

cluster_test_long = (cluster_test_df
                     .reset_index(drop=False)
                     .melt(id_vars='index',var_name='statistic',value_name='significance'))


def significance_label(p_value):
    '''Returns the label of the strictest significance threshold met by a p-value'''

    for threshold,label in [(0.01,'p < 0.01'),(0.05,'p < 0.05'),(0.1,'p < 0.1')]:
        if p_value<threshold:
            return(label)

    #Everything else (including missing p-values) is labelled as not significant
    return('p > 0.1')


cluster_test_long['significance_thres'] = [significance_label(x) for
                                           x in cluster_test_long['significance']]

In [None]:
#Make charts: one column of charts per statistic with a row for each cluster
charts = []

for v in ['paper_n','citation_mean','citation_median']:

    #Copy the columns we need so that the changes below do not write into a slice of prof_clust
    d = prof_clust[['cluster','has_covid_ai',v]].copy()

    d['has_covid_ai'] = convert_ai(d['has_covid_ai'])

    #Lookup between clusters and the significance of the differences in this statistic
    s = cluster_test_long.loc[cluster_test_long['statistic']==v].set_index(
        'index')['significance_thres'].to_dict()

    #The lookup uses the original cluster names so we map before cleaning them
    d['significance_thres'] = d['cluster'].map(s)
    d['cluster'] = clean_cluster(d['cluster'])

    #Rounded value for the tooltip
    d[f'Mean {stat_lookup[v]}'] = [str(np.round(x,2)) for x in d[v]]

    c = (alt.Chart(d)
         .mark_bar(height=10,stroke='black',strokeWidth=1,strokeOpacity=1)
         .encode(y=alt.Y('has_covid_ai',title=None),
                 x=alt.X(v,title=stat_lookup[v]),
                 color=alt.Color('has_covid_ai',title='Category'),
                 opacity=alt.Opacity('significance_thres:N',scale=alt.Scale(range=[0.3,1]),
                                     title='Significance',
                                     sort=['p > 0.1','p < 0.1','p < 0.05','p < 0.01']),
                 row=alt.Row('cluster',
                             sort=alt.EncodingSortField(v,'max',order='descending')),
                 tooltip=[f'Mean {stat_lookup[v]}']))

    charts.append((c).properties(height=40,width=40,title=stat_lookup[v]))

cluster_comp = alt.hconcat(*charts).configure_axis(grid=True)

cluster_comp.save(f"{fig_path}/fig_14.html")

cluster_comp

In [None]:
#For each statistic, print the share of clusters where the mean for authors without
#Covid AI papers is above the mean for authors with Covid AI papers
for x in stat_names:

    piv = prof_clust.pivot_table(index='cluster',columns='has_covid_ai',values=x)

    non_ai_above = piv[False]>piv[True]

    print(x)
    print(np.mean(non_ai_above))
    print('\n')