# Machines of healing grace?

Code with basic analysis and results from the AI v Covid paper

**Sections**

1. Descriptive analysis
  * How much Covid and AI activity do we detect in our data sources?
  * Is AI over- or underrepresented in Covid research?
  * How has AI activity evolved over time?
2. Topical analysis
  * What is the topical composition of Covid research and in what areas is AI focusing?
  * What are some examples of AI research to tackle Covid?
  * How has it evolved over time?
3. Geography
  * Where is AI research happening?
  * Who is doing it?
  * Do we find any differences in the topics that different countries focus on?
  * What reflects whether a country focuses on Covid research? Demand pull or supply push?
4. Knowledge base
  * What topics do AI researchers draw on?
5. **Analysis of quality**
  * What are the levels of quality (impact) of Covid AI research papers?
  * What are the levels of experience of AI researchers focusing on Covid?
  * How does the above differ between AI research clusters?
  * Could we look at other data sources such as altmetrics?

## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import altair as alt
from altair_saver import save
from toolz.curried import *
import random
import geopandas as gp

In [None]:
# Directory that save_fig writes rendered figures into.
# NOTE(review): project_dir is not defined in this notebook; presumably it comes
# from the preamble that is %run above - confirm.
FIG_PATH = f"{project_dir}/reports/figures/report_1"
# Directory holding the processed input tables read in section 1.
SRC_PATH = f"{project_dir}/data/processed/ai_research"


In [None]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning: later cells add
# columns to frames derived from `xiv` (e.g. `xiv_auth`).
pd.options.mode.chained_assignment = None

In [None]:
def save_fig(figure,name):
    """Write ``figure`` to ``{FIG_PATH}/{name}.png`` via altair_saver.

    Args:
        figure: chart object handed to ``altair_saver.save``.
        name: file name without extension.

    The export uses the selenium method at 3x scale with the module-level
    ``DRIVER`` (not defined in this notebook; presumably set by the preamble).
    """
    target = f'{FIG_PATH}/{name}.png'
    save(figure, target, method='selenium', webdriver=DRIVER, scale_factor=3)
    
def preview(x):
    """Print ``x.head()`` and ``x.shape``, then return ``x`` unchanged.

    Returning the input lets this be dropped into a ``.pipe(...)`` chain.
    """
    print(x.head(), x.shape, sep='\n')
    return x

In [None]:
def citation_distr(_vector, bins=(0, 1, 2, 3, 5, 10, 20, 100, 1000)):
    """Return the share of values falling in each left-closed citation bin.

    Args:
        _vector: pandas Series of citation counts.
        bins: increasing bin edges; each bin is ``[lower, upper)``.

    Returns:
        A Series of shares indexed by ``"<lower>_<upper>"`` labels
        (e.g. ``"0_1"``, ``"5_10"``).

    Values outside ``[bins[0], bins[-1])`` fall in no bin, become NaN in
    ``pd.cut`` and are therefore left out of the normalisation.
    """
    # Work on a private list so the caller's sequence is never touched.
    edges = list(bins)

    cut = pd.cut(_vector, bins=edges, right=False, include_lowest=True)

    out = cut.value_counts(normalize=True)
    # Turn interval labels such as "[0, 1)" into "0_1".
    out.index = [
        '_'.join(part.strip() for part in re.sub(r'\[|\)', '', str(inter)).split(','))
        for inter in out.index
    ]

    return out
    


## 1. Read data

In [None]:
# Labelled paper table; ids are read as strings.
xiv = preview(
    pd.read_csv(f"{SRC_PATH}/xiv_papers_labelled.csv", dtype={'id': str})
)

In [None]:
# Normalise column names to lower case.
xiv.columns = list(map(str.lower, xiv.columns))

In [None]:
# Ids of the papers flagged as AI.
ai_ids = set(xiv.loc[xiv['is_ai'] == True, 'id'])

In [None]:
# Covid-only subset of the papers, re-indexed from zero.

cov = preview(xiv.query("is_covid == True").reset_index(drop=True))

In [None]:
# Paper -> cluster assignments from a headerless CSV (column 0 maps to column 1).
# Column 0 is read as str so its keys match `xiv['id']`, which is also read as
# str and is later mapped through this lookup.
cluster_memberships = pd.read_csv(
    f"{SRC_PATH}/paper_cluster.csv", header=None, dtype={0: str}
)
cluster_lookup = cluster_memberships.set_index(0)[1].to_dict()

In [None]:
# Load the author data HERE (or recalculate myself)

## 2. Analyse data

### Compare citations covid vs non covid

In [None]:
# How do the levels of citations for Covid and non-Covid research compare?

In [None]:
# Restrict the comparison to papers whose `year` is 2020.
xiv_2020 = xiv.query('year == 2020')

In [None]:
# Mean citation count per (is_covid, is_ai) pair, shown as an is_covid x is_ai table.
(
    xiv_2020
    .groupby(['is_covid', 'is_ai'])['citation_count']
    .mean()
    .reset_index()
    .pivot_table(index='is_covid', columns='is_ai')
)

In [None]:
# Mean citations per covid flag, AI flag and article source.
cit_groups = preview(
    xiv_2020
    .groupby(['is_covid', 'is_ai', 'article_source'])['citation_count']
    .mean()
    .reset_index()
)

# Small multiples: one row per source, one column per AI flag.
(
    alt.Chart(cit_groups)
    .mark_bar()
    .encode(
        x='is_covid:N',
        y='citation_count',
        column='is_ai:N',
        row='article_source',
    )
    .properties(height=100, width=50)
)

In [None]:
# Binned citation distribution for every (is_covid, is_ai, article_source) group.
# After reset_index the bin labels sit in the auto-named `level_3` column.
xiv_cit_distrs = preview(
    xiv_2020
    .groupby(['is_covid', 'is_ai', 'article_source'])['citation_count']
    .apply(citation_distr)
    .reset_index()
)

In [None]:
# Map the raw bin labels produced by citation_distr to display labels whose
# numeric prefix gives the intended sort order.
int_lookup = {
    '0_1': '0: 0-1',
    '1_2': '1: 1-2',
    '2_3': '2: 2-3',
    '3_5': '3: 3-5',
    '5_10': '4: 5-10',
    '10_20': '5: 10-20',
    '20_100': '6: 20-100',
    # Top bin is [100, 1000); it was previously mislabelled "10-1000".
    '100_1000': '7: 100-1000',
}
xiv_cit_distrs['int_sorted'] = xiv_cit_distrs['level_3'].map(int_lookup)

In [None]:
# Figure 10: citation-bin shares for covid vs non-covid papers, with one
# column per article source and one row per AI flag.
b = (
    alt.Chart(xiv_cit_distrs)
    .mark_bar(stroke='black', strokeWidth=0.5)
    .encode(
        x='is_covid:N',
        y='citation_count',
        color=alt.Color('int_sorted:O', scale=alt.Scale(scheme='orangered')),
        column='article_source',
        row='is_ai:N',
    )
    .properties(height=75, width=60)
    .resolve_scale(color='shared')
)

save_fig(b, "fig_10_citation_comp")

b

In [None]:
#Citations by cluster

# Attach each paper's cluster (NaN where the id is missing from cluster_lookup).
xiv['cluster'] = xiv['id'].map(cluster_lookup)
# Rebuild the covid subset so that it carries the new `cluster` column.
cov = xiv.query('is_covid==1').reset_index(drop=True)

In [None]:
# The 12 clusters with the most covid AI papers.
top_ai_clusters = list(
    cov.query('is_ai==1')['cluster']
    .value_counts()
    .sort_values(ascending=False)
    .index[:12]
)

# The same clusters, ordered by the mean citation count of their AI papers.
ai_clusters_citation_sorted = list(
    cov.loc[cov.cluster.isin(top_ai_clusters)]
    .groupby(['cluster', 'is_ai'])['citation_count']
    .mean()
    .reset_index()
    .query("is_ai == 1")
    .pipe(preview)
    .sort_values('citation_count', ascending=False)['cluster']
)

In [None]:
# Binned citation distribution of covid papers per (is_ai, cluster) group.
# The bin labels land in the auto-named `level_2` column and are mapped to
# their display labels in `name_clean`.
cov_clust_cit = preview(
    cov
    .groupby(['is_ai', 'cluster'])['citation_count']
    .apply(citation_distr)
    .reset_index()
    .assign(name_clean=lambda df: df['level_2'].map(int_lookup))
)

### Compare citations between clusters

In [None]:
# Figure 11: citation-bin shares for AI vs non-AI covid papers, one facet per
# top AI cluster, with facets ordered by mean AI citations.
b2 = (
    alt.Chart(cov_clust_cit)
    .transform_filter(
        alt.FieldOneOfPredicate('cluster', ai_clusters_citation_sorted)
    )
    .mark_bar(stroke='black', strokeWidth=0.5)
    .encode(
        x='is_ai:N',
        y=alt.Y('citation_count', title=['% of paper in', 'citation category']),
        color=alt.Color('name_clean:O', scale=alt.Scale(scheme='orangered')),
        facet=alt.Facet('cluster', sort=ai_clusters_citation_sorted, columns=6),
    )
    .properties(height=100)
)

save_fig(b2, "fig_11_citation_cluster")

b2

### Compare citations between types of authors

In [None]:
def get_author_profile(x, reference_year=2020):
    """Summarise one author's papers.

    Args:
        x: DataFrame of the author's papers with ``citation_count`` and
            ``year`` columns.
        reference_year: year that experience is measured against
            (defaults to 2020, the value previously hard-coded).

    Returns:
        A Series with ``paper_n`` (row count), ``citation_median``,
        ``citation_mean`` and ``experience`` (``reference_year`` minus the
        earliest ``year`` in ``x``).
    """
    citations = x['citation_count']
    stats = [len(x),
             citations.median(),
             citations.mean(),
             reference_year - x['year'].min()]
    return pd.Series(
        stats,
        index=['paper_n', 'citation_median', 'citation_mean', 'experience'])


In [None]:
from ast import literal_eval

# Keep only the papers that have author information.
xiv_auth = xiv.dropna(subset=['mag_authors'])

In [None]:
# Parse the stringified author records into Python objects (the ids are
# extracted from them in the next cell).
xiv_auth['mag_authors'] = [
    literal_eval(raw) for raw in xiv_auth['mag_authors']
]

In [None]:
# For every paper, collect the `author_id` of each of its author records.
xiv_auth['author_id'] = xiv_auth['mag_authors'].map(
    lambda authors: [author['author_id'] for author in authors]
)

In [None]:
# Long-format paper/author lookup: one row per (paper id, author id) pair.
paper_author_lookup = preview(
    xiv_auth[['id', 'author_id']].explode('author_id')
)

In [None]:
#Find authors with at least one covid paper
covid_paper_ids = set(cov['id'])

# Vectorised membership test instead of a per-row Python loop.
cov_paper_auths = set(
    paper_author_lookup.loc[
        paper_author_lookup['id'].isin(covid_paper_ids), 'author_id'
    ]
)

In [None]:
# Attach author ids to every paper, dropping rows without an author id.
trajectories = xiv.merge(paper_author_lookup, on='id').dropna(
    subset=['author_id'])

# Keep only the papers written by authors who have at least one covid paper ...
trajectories_covid_authors = trajectories.loc[
    trajectories['author_id'].isin(cov_paper_auths)]

# ... then drop the covid papers themselves, so that what remains describes
# those authors' other publications.
trajectories_covid_authors_2 = trajectories_covid_authors.loc[
    ~trajectories_covid_authors['id'].isin(covid_paper_ids)]

In [None]:
# One profile row per author, computed from their non-covid papers.
author_profile = (
    trajectories_covid_authors_2
    .groupby('author_id')
    .apply(get_author_profile)
)

In [None]:
#author_profile_disc = author_profile.apply(lambda x: pd.qcut(x,q=np.arange(0,1.1,0.25),labels=False))

In [None]:
# Covid papers joined to their authors and then to each author's profile;
# the result has one row per (covid paper, profiled author) pair.
cov_auth_prof = (
    cov
    .merge(paper_author_lookup, on='id')
    .merge(author_profile.reset_index(), on='author_id')
)

In [None]:
# Author-profile columns summarised below. Defined in this cell because it runs
# before the next one, which assigns the same list; without it a clean
# top-to-bottom run fails with a NameError.
stat_names = ['paper_n','citation_mean','citation_median','experience']

# Per paper, take the maximum of each statistic across its authors, then
# average those paper-level values for AI and non-AI covid papers.
# NOTE(review): cov_auth_prof has one row per paper-author pair, so the merge
# below repeats each paper once per matched author and the group means are
# weighted accordingly - confirm this is intended.
paper_stats_ai_non_ai = (cov_auth_prof
                   .groupby('id')[stat_names]
                   .max()
                   .reset_index(drop=False)
                   .merge(cov_auth_prof[['id','is_ai','cluster']],left_on='id',right_on='id')
                   .groupby('is_ai')[stat_names].mean())
paper_stats_ai_non_ai

In [None]:
# Author-profile columns that are summarised per paper.
stat_names = ['paper_n', 'citation_mean', 'citation_median', 'experience']

# Per paper, take the maximum of each statistic across its authors; average
# those values per (is_ai, cluster); then melt to long format for plotting.
paper_stats_ind = preview(
    cov_auth_prof
    .groupby('id')[stat_names]
    .max()
    .reset_index()
    .merge(cov_auth_prof[['id', 'is_ai', 'cluster']], on='id')
    .groupby(['is_ai', 'cluster'])[stat_names]
    .mean()
    .reset_index()
    .melt(id_vars=['is_ai', 'cluster'])
)

In [None]:
sorted_stats = ['paper_n', 'experience', 'citation_median', 'citation_mean']

# Figure 12: one strip of small multiples per author statistic, comparing AI
# and non-AI papers within each of the top AI clusters.
all_chs = [
    (alt.Chart(paper_stats_ind)
     .transform_filter(alt.FieldOneOfPredicate('cluster', top_ai_clusters))
     .transform_filter(alt.datum.variable == stat)
     .mark_bar()
     .encode(
         x='is_ai:N',
         color='is_ai:N',
         y=alt.Y('value', title=stat),
         column=alt.Column(
             'cluster',
             sort=alt.EncodingSortField('value',
                                        op='max',
                                        order='descending')),
     )
     .properties(width=30, height=50))
    for stat in sorted_stats
]

comp = alt.vconcat(*all_chs)

save_fig(comp, 'fig_12_qual')

comp

In [None]:
# Paper-level maximum of each author statistic, joined back to the paper's AI
# flag, cluster and citation count.
paper_stats_ind_2 = (
    cov_auth_prof
    .groupby('id')[stat_names]
    .max()
    .reset_index()
    .merge(cov_auth_prof[['id', 'is_ai', 'cluster', 'citation_count']], on='id')
)

In [None]:
# Correlation matrix between the author statistics and citation counts inside
# every (cluster, is_ai) group, melted to long format. The matrix row labels
# end up in the auto-named `level_2` column.
cluster_corr = (
    paper_stats_ind_2
    .groupby(['cluster', 'is_ai'])
    .apply(lambda grp: grp[stat_names + ['citation_count']].corr())
    .reset_index()
    .melt(id_vars=['cluster', 'level_2', 'is_ai'])
)

In [None]:
# ordered_vars = stat_names+['citation_count']

# (alt.Chart(cluster_corr)
#  .transform_filter(alt.FieldOneOfPredicate('cluster',top_ai_clusters))
#  .mark_rect(stroke='white',strokeWidth=0.5)
#  .encode(x=alt.X('level_1',sort=ordered_vars),
#          y=alt.Y('variable',sort=ordered_vars),
#          color=alt.Color('value',scale=alt.Scale(scheme='viridis')))
# .facet('cluster',columns=4))

In [None]:
# Figure 13: correlation between each author statistic and paper citations,
# for AI vs non-AI papers in each of the top AI clusters.
perf_corr = (
    alt.Chart(cluster_corr)
    .transform_filter(alt.FieldOneOfPredicate('cluster', top_ai_clusters))
    # Keep only the `citation_count` row of each correlation matrix and drop
    # its correlation with itself.
    .transform_filter(alt.datum.level_2 == 'citation_count')
    .transform_filter(alt.datum.variable != 'citation_count')
    .mark_bar(stroke='black', strokeWidth=0.1, width=10)
    .encode(
        x=alt.X('is_ai:N'),
        y='value',
        row=alt.Row('variable', sort=sorted_stats, title='Correlation coefficient'),
        color=alt.Color('value', scale=alt.Scale(scheme='redblue'), sort='descending'),
        column=alt.Column('cluster', sort=top_ai_clusters),
    )
    .properties(height=50, width=40)
    .configure_axis(grid=True)
)

save_fig(perf_corr, "fig_13_perf_corr")

perf_corr