# Machines of healing grace?

Code with basic analysis and results from the AI v Covid paper

**Sections**

1. Descriptive analysis
  * How much Covid and AI activity do we detect in our data sources?
  * Is AI over- or underrepresented in Covid research?
  * How has AI activity evolved over time?
2. **Topical analysis**
  * What is the topical composition of Covid research and in what areas is AI focusing?
  * What are some examples of AI research to tackle Covid?
  * How has it evolved over time?
3. Geography
  * Where is AI research happening?
  * Who is doing it?
  * Do we find any differences in the topics that different countries focus on?
  * What explains whether a country focuses on Covid research: demand pull or supply push?
4. Knowledge base
  * What topics do AI researchers draw on?
5. Analysis of diffusion
  * What determines the focus of AI researchers on particular topics?
  * Does Covid-oriented AI research reflect the composition of the broader field?
  * What researchers have been attracted to AI research and why?
  

## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import altair as alt
from altair_saver import save
from toolz.curried import *
import random

In [None]:
# Folder where the figures and tables produced below are written.
# NOTE(review): project_dir is not defined in this notebook; presumably it is
# set by the preamble that is %run above - confirm.
FIG_PATH = f"{project_dir}/reports/figures/report_1"
# Folder holding the processed input CSV files read in section 1.
SRC_PATH = f"{project_dir}/data/processed/ai_research"


In [None]:
# Silence pandas' chained-assignment (SettingWithCopy) warnings for the
# column assignments made on filtered frames further down.
pd.set_option('mode.chained_assignment', None)

In [None]:
def save_fig(figure,name):
    """Write a chart to the report figure folder as a PNG.

    Args:
        figure: chart object passed straight to altair_saver's ``save``.
        name: file name without extension; the image ends up at
            ``{FIG_PATH}/{name}.png``.
    """
    # NOTE(review): DRIVER is not defined in this notebook; presumably the
    # selenium webdriver created by the preamble - confirm.
    target = f'{FIG_PATH}/{name}.png'
    save(figure, target, method='selenium', webdriver=DRIVER, scale_factor=3)
    
def preview(x):
    """Print the head and the shape of ``x``, then return ``x`` unchanged.

    Returning the input makes the function usable inside a ``.pipe()`` chain.
    """
    print(x.head(), x.shape, sep='\n')
    return x

## 1. Read data

In [None]:
# Labelled paper table; the id column is forced to string on load
xiv = preview(
    pd.read_csv(f"{SRC_PATH}/xiv_papers_labelled.csv", dtype={'id': str}))

In [None]:
# Normalise all column names to lower case
xiv.columns = list(map(str.lower, xiv.columns))

In [None]:
# Ids of the papers flagged as AI, kept as a set for fast membership tests
ai_ids = set(xiv.loc[xiv['is_ai'] == True, 'id'])

In [None]:
# Tidy paper-topic table, one row per (article, topic) weight
tops = preview(pd.read_csv(f"{SRC_PATH}/tidy_paper_topics_ai_2.csv"))

# Flag every row whose article is in the AI id set
tops['is_ai'] = [article in ai_ids for article in tops['article_id']]

In [None]:
# Covid subset of the paper table, re-indexed from zero

cov = preview(xiv.query("is_covid == True").reset_index(drop=True))

## 2. Analyse data

### Clusters and topics

In [None]:
# One row per (article, cluster, AI flag) instead of one row per topic
tops_no_dupes = tops.drop_duplicates(subset=['article_id', 'cluster', 'is_ai'])

In [None]:
# Prep - variables used to build and sort the charts below

# Column-normalised crosstab: for AI and non-AI papers separately, the share
# that falls in each cluster. Rows are sorted by the AI share (largest first)
# and then scaled to percentages.
cluster_shares = pd.crosstab(
    tops_no_dupes['cluster'], tops_no_dupes['is_ai'], normalize=1)
cluster_ai = 100 * cluster_shares.sort_values(True, ascending=False)

# Cluster ordering reused by every chart
bar_order = cluster_ai.index

# Long format (cluster, is_ai, value) for plotting
cluster_distr = cluster_ai.reset_index(drop=False).melt(id_vars='cluster')

# Only topic assignments above this weight are counted
w = 0.1
above_threshold = tops.loc[tops['weight'] > w]

# Number of rows per topic and cluster
topic_count = (above_threshold
               .groupby(['topic', 'cluster'])
               .size()
               .reset_index(name='count'))

# Topics ranked by the ratio of AI to non-AI counts, highest ratio first
topic_by_ai = (above_threshold
               .groupby(['topic', 'is_ai'])
               .size()
               .reset_index(name='count')
               .pivot_table(index='topic', columns='is_ai', values='count')
               .fillna(0))
ai_topics = list(
    topic_by_ai
    .assign(share=lambda x: x[True]/x[False])
    .sort_values('share', ascending=False)
    .index)

In [None]:
# Bar chart component: overlaid (unstacked) AI / non-AI cluster shares
bar_x = alt.X('cluster', sort=list(bar_order),
              axis=alt.Axis(labels=False, title="", ticks=False))
bar_y = alt.Y('value', title='% in cluster', stack=False)

bar_b = (alt.Chart(cluster_distr)
         .mark_bar(opacity=0.5, width=15, stroke='black', strokeWidth=1)
         .encode(x=bar_x, y=bar_y, color='is_ai'))
bar = bar_b.properties(height=100)

In [None]:
# Heatmap component: topic counts per cluster, limited to the first 40
# entries of ai_topics
hm_topics = ai_topics[:40]
hm_b = alt.Chart(topic_count).transform_filter(
    alt.FieldOneOfPredicate('topic', hm_topics))

hm_x = alt.X('cluster', sort=list(bar_order),
             axis=alt.Axis(grid=True, gridDash=[1, 1], gridOpacity=1))
hm_y = alt.Y('topic', sort=list(ai_topics))

hm = hm_b.mark_rect().encode(
    x=hm_x, y=hm_y, color='count:Q', tooltip=['topic', 'cluster'])

In [None]:
# Stack the bar chart on top of the heatmap with no gap between them
upper_panel = bar.properties(width=400)
lower_panel = hm.properties(height=500, width=400)
conc = alt.vconcat(upper_panel, lower_panel, spacing=0)

save_fig(conc,"fig_3_topic_mix")

conc

### Clusters and sources

In [None]:
# Lookup from paper id to the source the paper comes from
source_by_id = xiv.set_index('id')['article_source']
paper_source_lookup = source_by_id.to_dict()

In [None]:
# Attach the source of each article; ids missing from the lookup become NaN
article_sources = tops_no_dupes['article_id'].map(paper_source_lookup)
tops_no_dupes['source'] = article_sources

In [None]:
# Paper counts per source, AI flag and cluster
source_sizes = tops_no_dupes.groupby(['source', 'is_ai', 'cluster']).size()
source_cluster_ai = preview(source_sizes).reset_index(name='paper_count')

In [None]:
# Bars of paper counts per cluster coloured by source, one row per AI flag,
# each row with its own y scale
source_encoding = {
    'x': alt.X('cluster', sort=list(bar_order)),
    'y': 'paper_count',
    'color': 'source',
    'row': alt.Row('is_ai', sort=[True, False]),
}
s = (alt.Chart(source_cluster_ai)
     .mark_bar()
     .encode(**source_encoding)
     .resolve_scale(y='independent')
     .properties(width=400, height=100))

save_fig(s,'fig_4_sources')

In [None]:
def get_examples(_list,values):
    """Pick up to ``values`` distinct random items from ``_list``.

    Sampling is done without replacement, so the same item is never returned
    twice (unless it appears several times in ``_list`` itself).

    Args:
        _list: list of candidate items.
        values: maximum number of items to return.

    Returns:
        An empty string when ``_list`` is empty (kept for backward
        compatibility: joining it yields an empty string, same as before);
        otherwise a list with ``min(len(_list), values)`` items in random
        order.
    """
    if not _list:
        return ''
    # Clamp k: never more than what is available, never negative
    k = max(0, min(len(_list), values))
    return random.sample(_list, k=k)

### Table with examples

In [None]:
# Table with examples: per cluster, its three topics with the highest mean
# weight plus two randomly drawn AI and non-AI Covid paper titles

example_table_content = {'cluster':[],'salient_topics':[],'example_ai_papers':[],'example_non_ai_papers':[]}

for c in bar_order:
    # Topic rows that belong to this cluster
    in_cluster = tops.loc[tops['cluster'] == c]

    # Three topics with the highest mean weight, one per line
    mean_weight = in_cluster.groupby('topic')['weight'].mean()
    top_three = mean_weight.sort_values(ascending=False).index[:3]

    # Covid papers assigned to the cluster, split by AI flag
    cluster_ids = set(in_cluster['article_id'])
    cluster_papers = cov.loc[cov['id'].isin(cluster_ids)]
    ai_titles = list(cluster_papers.loc[cluster_papers['is_ai'] == True]['title'])
    non_ai_titles = list(cluster_papers.loc[cluster_papers['is_ai'] == False]['title'])

    # AI examples are drawn before non-AI ones
    ai_examples = '\n'.join(get_examples(ai_titles, values=2))
    non_ai_examples = '\n'.join(get_examples(non_ai_titles, values=2))

    example_table_content['cluster'].append(c)
    example_table_content['salient_topics'].append('\n'.join(top_three))
    example_table_content['example_ai_papers'].append(ai_examples)
    example_table_content['example_non_ai_papers'].append(non_ai_examples)
    


In [None]:
# Turn the collected columns into a table and store it with the figures
example_table = pd.DataFrame.from_dict(example_table_content)
table_path = f"{FIG_PATH}/table_1_examples.csv"
example_table.to_csv(table_path, mode='w')

In [None]:
# Show the first five rows of the example table
example_table.head(n=5)

### Evolution of topics over time

In [None]:
# Two-column table (article_id, cluster) taken from the de-duplicated topics
cluster_mapping = pd.DataFrame(tops_no_dupes[['article_id','cluster']])
# Save the mapping to the interim data folder.
# NOTE(review): index_label=False only suppresses the header of the index
# column; the index values themselves are still written. If the index is not
# wanted in the file, index=False may have been intended - confirm against
# whatever reads this CSV.
cluster_mapping.to_csv(f"{project_dir}/data/interim/ai_cluster_mapping.csv",index_label=False)

In [None]:
# Series.map expects a function, dict or Series, not a two-column DataFrame,
# so build an article_id -> cluster lookup first. Duplicate article ids are
# dropped (first cluster kept) because the lookup index has to be unique.
cluster_lookup = (cluster_mapping
                  .drop_duplicates('article_id')
                  .set_index('article_id')['cluster'])
# Papers without an entry in the lookup get NaN as their cluster
cov['cluster'] = cov['id'].map(cluster_lookup)
# Parse the creation date so it can be filtered and grouped on
cov['date'] = pd.to_datetime(cov['created'])

In [None]:
# Paper counts per date, AI flag and cluster, from 2020 onwards

papers_since_2020 = cov.query("date>='2020'")
cluster_trends = preview(
    papers_since_2020
    .groupby(['date', 'is_ai', 'cluster'])
    .size()
    .reset_index(name='count'))

# Keep the first seven clusters of bar_order, lump the rest into 'other'
leading_clusters = set(bar_order[:7])
cluster_trends['cluster_short'] = [
    name if name in leading_clusters else 'other'
    for name in cluster_trends['cluster']]

In [None]:
# Keep AI papers only. The selection is done in pandas because the Altair
# expression `alt.datum.is_ai==1` is compiled to a strict comparison
# (datum.is_ai === 1), which never matches a boolean is_ai value and would
# leave the chart empty. The pandas comparison works for both True and 1.
ai_cluster_trends = cluster_trends.loc[cluster_trends['is_ai'] == True]

# Stacked bars of AI paper counts over time, coloured by cluster
(alt.Chart(ai_cluster_trends)
 .mark_bar()
 .encode(x='date',y='count',color='cluster_short'))