# Machines of healing grace?

Code with basic analysis and results from the AI v Covid paper

**Sections**

1. Descriptive analysis
  * How much Covid and AI activity do we detect in our data sources?
  * Is AI over- or under-represented in Covid research?
  * How has AI activity evolved over time?
2. Topical analysis
  * What is the topical composition of Covid research and in what areas is AI focusing?
  * What are some examples of AI research to tackle Covid?
  * How has it evolved over time?
3. **Geography**
  * Where is AI research happening?
  * Who is doing it?
  * Do we find any differences in the topics that different countries focus on?
  * What explains whether a country focuses on Covid research: demand pull or supply push?
4. Knowledge base
  * What topics do AI researchers draw on?
5. Analysis of diffusion
  * What determines the focus of AI researchers on particular topics?
  * Does Covid-oriented AI research reflect the composition of the broader field?
  * What researchers have been attracted to AI research and why?
  

## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import altair as alt
from altair_saver import save
from toolz.curried import *
import random
import geopandas as gp

In [None]:
# NOTE(review): project_dir is not defined in this notebook; presumably it is
# set by ../notebook_preamble.ipy - confirm.
# Directory that save_fig writes the report figures to.
FIG_PATH = f"{project_dir}/reports/figures/report_1"
# Directory the input CSV tables are read from.
SRC_PATH = f"{project_dir}/data/processed/ai_research"


In [None]:
# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [None]:
def save_fig(figure, name):
    """Write ``figure`` to ``{FIG_PATH}/{name}.png``.

    The file is produced by ``save`` using the selenium method and
    ``scale_factor=3``.

    Args:
        figure: chart object handed to ``save``.
        name: file name without the ``.png`` extension.
    """
    # NOTE(review): DRIVER is not defined in this notebook; presumably it is
    # created by ../notebook_preamble.ipy - confirm.
    target = f'{FIG_PATH}/{name}.png'
    save(figure, target, method='selenium', webdriver=DRIVER, scale_factor=3)
    
def preview(x):
    """Print the first rows and the shape of ``x``, then hand ``x`` back.

    Returning the input unchanged lets the function sit at the end of a
    ``.pipe(...)`` chain.

    Args:
        x: object exposing ``head()`` and ``shape``.

    Returns:
        The same object that was passed in.
    """
    first_rows = x.head()
    print(first_rows)
    print(x.shape)
    return x


## 1. Read data

In [None]:
# All arXiv data; the id column is read as a string
xiv_file = f"{SRC_PATH}/xiv_papers_labelled.csv"
xiv = preview(pd.read_csv(xiv_file, dtype={'id': str}))

In [None]:
# Lower-case every column name so later cells can use one spelling
xiv.columns = xiv.columns.str.lower()

In [None]:
# Ids of the papers flagged as AI, kept as a set for membership tests
ai_mask = xiv['is_ai'] == True
ai_ids = set(xiv.loc[ai_mask, 'id'])

In [None]:
# Covid subset of the arXiv table, re-indexed from zero
covid_rows = xiv.query("is_covid == True")
cov = covid_rows.reset_index(drop=True).pipe(preview)

In [None]:
# All topics; article_id is read as a string so it compares with the ids in ai_ids
topics_file = f"{SRC_PATH}/tidy_paper_topics_ai_2.csv"
tops = preview(pd.read_csv(topics_file, dtype={'article_id': str}))

# Flag the rows whose article is in the AI id set
tops['is_ai'] = tops['article_id'].isin(ai_ids)

In [None]:
# Load the GRID data
from data_getters.arxiv_grid import get_arxiv_grid

# Configuration location is taken from the config_path environment variable
# (None when the variable is unset)
conf_path = os.environ.get('config_path')
grid = get_arxiv_grid(conf_path, all_articles=True)

## 2. Analyse data

In [None]:
# Attach the GRID rows to the papers. The default inner join keeps only
# papers whose id appears in grid's article_id column.
xiv_geo = xiv.merge(grid, left_on='id', right_on='article_id')

In [None]:
# Label rows with a missing institute country as 'Unmatched'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selected column is chained assignment, which warns in recent pandas
# and leaves xiv_geo unchanged under copy-on-write.
xiv_geo['institute_country'] = xiv_geo['institute_country'].fillna('Unmatched')

### Country frequencies

How active in Covid research are different countries?

In [None]:
# Number of rows of the merged table per institute country
country_counts = xiv_geo['institute_country'].value_counts()
country_freqs = country_counts.rename('all_arxiv')


In [None]:
queries = ["is_covid == 1","is_ai == 1","(is_covid ==1) & (is_ai ==1)"]
names = ['covid','ai','covid_ai']

# One country-count column per category, named after the category
category_counts = pd.concat(
    [
        xiv_geo.query(q)['institute_country'].value_counts().rename(n)
        for n, q in zip(names, queries)
    ],
    axis=1,
)

# Countries absent from a category get a count of zero
all_acts = pd.concat([country_freqs, category_counts], axis=1).fillna(0)

#Top countries: the 25 with the highest overall arXiv counts
ranked_acts = all_acts.sort_values('all_arxiv', ascending=False)
top_countries = ranked_acts.head(25).index.tolist()

In [None]:
# Express every column of all_acts as a percentage of that column's total
activity_shares = 100 * all_acts.apply(lambda col: col / col.sum())

# Long table (index / variable / value) restricted to the top countries, in
# top_countries order; .loc already imposes that order, so no sort is needed.
geo_activity_long_norm = (
    activity_shares
    .loc[top_countries]
    # pandas >= 2.0 names the value_counts index after the counted column, so
    # reset_index would no longer create the 'index' column that melt and the
    # chart encodings rely on; set the axis name explicitly.
    .rename_axis('index')
    .reset_index(drop=False)
    .melt(id_vars=['index'])
    .pipe(preview)
)

In [None]:
#Second component of the chart
#Topic representation by country
# Keep the first row per article_id, then map article_id -> cluster
paper_clusters = tops.drop_duplicates('article_id').set_index('article_id')
cluster_mapping = paper_clusters['cluster'].to_dict()

In [None]:
# Rows flagged as both AI and Covid; the previous index is kept as a column
cov_geo = xiv_geo.query("(is_ai == 1) & (is_covid == 1)").reset_index(drop=False)

# Attach the topic cluster of each paper (NaN when the id has no mapping)
cov_geo['cluster'] = cov_geo['id'].map(cluster_mapping)

In [None]:
# Five clusters with the most AI articles, counting each article_id once
unique_papers = tops.drop_duplicates('article_id')
ai_cluster_sizes = unique_papers.groupby(['is_ai', 'cluster']).size()[True]
top_ai_clusters = ai_cluster_sizes.sort_values(ascending=False).head(5).index

# Row counts per country and cluster
country_cluster = (
    cov_geo
    .groupby(['institute_country', 'cluster'])
    .size()
    .reset_index(name='count')
)

# Clusters outside the top five are pooled under 'Other'
in_top_clusters = country_cluster['cluster'].isin(top_ai_clusters)
country_cluster['cluster_short'] = country_cluster['cluster'].where(in_top_clusters, 'Other')

In [None]:
#Chart

In [None]:
# Shared encodings for the left panel: countries on y (in top_countries
# order), percentage share on x
base = alt.Chart(geo_activity_long_norm).encode(
    y=alt.Y('index', sort=top_countries, title=''),
    x=alt.X('value', title='% of all activity in category'),
)

# One marker per country and category
p = base.mark_point(
    filled=True,
    size=100,
    opacity=0.75,
    stroke='black',
    strokeWidth=1,
).encode(
    color=alt.Color('variable', title='Category'),
    shape=alt.Shape(
        'variable',
        scale=alt.Scale(range=['circle', 'circle', 'cross', 'cross']),
    ),
)

# Thin line joining the markers of the same country
l = base.mark_line(strokeWidth=0.7, color='black').encode(detail='index')

# Right panel: stacked bars of counts per country, coloured by topic cluster
stack = (
    alt.Chart(country_cluster)
    .transform_filter(alt.FieldOneOfPredicate('institute_country', top_countries))
    .mark_bar()
    .encode(
        y=alt.Y('institute_country', sort=top_countries, title=''),
        x='count',
        color=alt.Color('cluster_short', title='Topic cluster'),
    )
)

# Place both panels side by side, each with its own colour and shape scales
left_panel = (p + l).properties(width=250, height=500)
right_panel = stack.properties(height=500, width=150)
comp = alt.hconcat(left_panel, right_panel).resolve_scale(
    color='independent',
    shape='independent',
)

save_fig(comp,'fig_5_country')

comp

### Evolution of activity



In [None]:
# All Covid rows (AI or not) with the 'created' column parsed into a
# datetime 'date' column
cov_geo_all = (
    xiv_geo
    .query("is_covid == 1")
    .reset_index(drop=False)
    .assign(date=lambda df: pd.to_datetime(df['created']))
)

In [None]:
# Row counts per country, AI flag and date for 2020, limited to the first
# twelve entries of top_countries
covid_2020 = cov_geo_all.query('year ==2020')
dated_counts = covid_2020.groupby(['institute_country', 'is_ai', 'date']).size()
cov_geo_trend = dated_counts.loc[top_countries[:12]].reset_index(name='count')

In [None]:
# Small multiples (one facet per country, four per row) of a moving average
# of the counts, with a separate line for each value of is_ai
tr = (
    alt.Chart(cov_geo_trend)
    .mark_line(opacity=0.9)
    # Mean over a window of three rows before to three rows after the current
    # one, computed within each country / is_ai group
    .transform_window(
        m='mean(count)',
        frame=[-3, 3],
        groupby=['institute_country', 'is_ai'],
    )
    .encode(
        x='date',
        y=alt.Y('m:Q', title=['Research', 'participations']),
        color='is_ai:N',
        facet=alt.Facet(
            'institute_country',
            columns=4,
            sort=alt.EncodingSortField('count', 'sum', order='descending'),
        ),
    )
    .properties(width=100, height=85)
    .resolve_scale(y='independent')
)

save_fig(tr,"fig_6_geo_trend")

tr

In [None]:
import os

# Export the AI-flagged papers as a zip-compressed CSV on the current user's
# Desktop. The home directory is resolved at run time instead of hard-coding
# one user's absolute path, so the cell also runs on other machines.
ai_export_path = os.path.join(os.path.expanduser("~"), "Desktop", "arxiv_ai.csv")
xiv.query('is_ai==1').to_csv(ai_export_path, compression='zip')

In [None]:
# Display the column names of the arXiv table
xiv.columns