# Machines of healing grace?

Code with basic analysis and results from the AI v Covid paper

**Sections**

1. **Descriptive analysis**
  * How much Covid and AI activity do we detect in our data sources?
  * Is AI over- or underrepresented in Covid research?
  * How has AI activity evolved over time?
2. Topical analysis
  * What is the topical composition of Covid research and in what areas is AI focusing?
  * What are some examples of AI research to tackle Covid?
  * How has it evolved over time?
3. Geography
  * Where is AI research happening?
  * Who is doing it?
  * Do we find any differences in the topics that different countries focus on?
  * What explains whether a country focuses on Covid research: demand pull or supply push?
4. Knowledge base
  * What topics do AI researchers draw on?
5. Analysis of diffusion
  * What determines the focus of AI researchers on particular topics?
  * Does Covid-oriented AI research reflect the composition of the broader field?
  * What researchers have been attracted to AI research and why?
  

## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import altair as alt
from altair import expr, datum
from altair_saver import save
from toolz.curried import *

In [None]:
# Directory that save_fig writes the report figures to.
# NOTE(review): project_dir is not defined in this notebook; presumably it is
# set by the preamble script — confirm.
FIG_PATH = f"{project_dir}/reports/figures/report_revised"
# Directory holding the labelled papers CSV that is read in section 1.
SRC_PATH = f"{project_dir}/data/processed/ai_research"


In [None]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None

In [None]:
def save_fig(figure,name):
    """Write a chart to ``{FIG_PATH}/{name}.png`` through ``altair_saver.save``.

    Args:
        figure: chart object passed straight to ``save``.
        name: output file name, without the ``.png`` extension.

    The image is rendered with the selenium method at 3x scale.
    NOTE(review): DRIVER is not defined in this notebook; presumably the
    preamble provides it — confirm.
    """
    target = f'{FIG_PATH}/{name}.png'
    save(figure, target, method='selenium', webdriver=DRIVER, scale_factor=3)
    
def preview(x):
    """Print the head and the shape of ``x``, then return ``x`` unchanged.

    Returning the input lets the function sit inside a ``.pipe(...)`` chain.
    """
    first_rows = x.head()
    print(first_rows)
    dimensions = x.shape
    print(dimensions)
    return x

## 1. Read data

In [None]:
# Load the labelled papers table and echo its head and shape as a quick check.
xiv = preview(pd.read_csv(f"{SRC_PATH}/xiv_papers_labelled.csv"))

In [None]:
# Normalise every column name to lower case.
xiv.columns = xiv.columns.str.lower()

## 2. Analysis

In [None]:
# Percentage of papers flagged as AI, then as Covid.
for _flag in ('is_ai', 'is_covid'):
    print(xiv[_flag].mean()*100)

# Percentage of papers carrying both flags at once.
print(100*np.mean(xiv['is_covid'] & xiv['is_ai']))

In [None]:
# Parse the 'created' column into pandas datetimes. pd.to_datetime replaces an
# earlier row-by-row strptime("%Y-%m-%d") conversion.

xiv['date'] = pd.to_datetime(xiv['created'])

In [None]:
# We explore some of the trends in sources. It looks like the data collections are misaligned.
# Papers per (date, source) since the start of 2020.
s = xiv.groupby(['date','article_source']).size().reset_index(name='count')
s = s.loc[pd.to_datetime(s['date'])>datetime.datetime(2020,1,1)]

tr = (alt
 .Chart(s)
 # Centred 21-row window (10 rows either side), computed separately per source.
 .transform_window(rolling='mean(count)',frame=[-10,10],groupby=['article_source'])
 .mark_line()
 # Encode the smoothed 'rolling' field so the window transform above is
 # actually what gets drawn; the axis keeps its 'count' label.
 .encode(x='date',y=alt.Y('rolling:Q',title='count'),row='article_source')).properties(height=100)

save_fig(tr,"test_trends")
tr

In [None]:
# Most recent paper date available for each article source.
xiv.groupby('article_source')['date'].max()

In [None]:
# Drop papers dated after 19 May 2020.
# NOTE(review): presumably the last date covered by every source — confirm.
xiv = xiv.loc[pd.to_datetime(xiv['date'])<= datetime.datetime(2020,5,19)]

In [None]:
# Papers whose 'year' is later than 2016.
xiv_recent = xiv.query("year > 2016")

In [None]:
# Number of recent papers per date, stored as a series named 'all_xiv'.
xiv_daily = xiv_recent['date'].value_counts().rename('all_xiv')

In [None]:
# Filters that select each subset of interest, and the column name each one gets.
queries = ["is_covid == 1","is_ai == 1","(is_covid ==1) & (is_ai ==1)"]
names = ['covid','ai','covid_ai']

# Daily paper counts for every subset next to the overall daily count; dates on
# which a subset has no papers are filled with 0.
all_series = pd.concat([xiv_daily,
    pd.concat([xiv_recent.query(q)['date'].value_counts().rename(n) for n,q in zip(names,queries)],axis=1)],axis=1).fillna(0)

# pandas >= 2.0 names a value_counts index after its column ('date'), so the
# axis is renamed explicitly: reset_index must yield the 'index' column that
# melt uses as its identifier, whatever the pandas version.
all_series_long = all_series.rename_axis('index').reset_index(drop=False).melt(id_vars='index')

all_series_long['index'] = pd.to_datetime(all_series_long['index'])



In [None]:
### Representation

# Flag papers that are both Covid and AI.
xiv['covid_ai'] = xiv['is_covid'] & xiv['is_ai']

# For 2020 papers: within each is_covid group, the share of papers in each
# is_ai group, reshaped to long format (is_covid, is_ai, value).
cov_ai_ct = (
    xiv.query('year == 2020')
    .groupby(['is_covid','is_ai'])
    .size()
    .reset_index(name='n_papers')
    .pivot(index='is_covid',columns='is_ai',values='n_papers')
    # Normalise each row so the shares within an is_covid group sum to 1.
    .apply(lambda row: row/row.sum(),axis=1)
    .reset_index(drop=False)
    .melt(id_vars='is_covid')
)


In [None]:
# One stacked bar per is_covid group; shares are converted to percentages.
base = (
    alt.Chart(cov_ai_ct)
    .transform_calculate(value=100*alt.datum.value)
    .mark_bar()
    .encode(
        y='is_covid:N',
        x=alt.X('value:Q',title='% of all papers in category'),
    )
)

bar = base.encode(color='is_ai:N')

# Label only the AI segment. my_text is already a string formatted to two
# decimals, so it is encoded as nominal: a quantitative text field would be
# re-parsed and re-formatted as a number, which can drop trailing zeros.
text = (
    base.mark_text(align='left',color='white',dx=3)
    .transform_filter(alt.datum.is_ai==1)
    .transform_calculate(my_text=expr.format(datum.value,".2f"))
    .encode(text="my_text:N")
)

save_fig(bar+text,'fig_1_representation')

(bar+text)

In [None]:
# Ratio of row 3 to row 2 of the melted shares table.
# NOTE(review): presumably the AI share among Covid papers divided by the AI
# share among non-Covid papers; this assumes the table holds the four
# is_covid x is_ai combinations in sorted order — confirm.
cov_ai_ct['value'][3]/cov_ai_ct['value'][2]

In [None]:
#Trends

In [None]:
# Daily shares in percent: Covid papers over all papers, Covid-AI papers over
# AI papers, and Covid-AI papers over Covid papers. The two ratio series are
# unnamed, so concat labels them 0 and 1 before the rename below.
# NOTE(review): no column is called 'covid_ai' at this point, so that rename
# key never matches — confirm whether a "% all" series was intended.
norm = 100*pd.concat([all_series[['covid']].apply(lambda x: x/all_series['all_xiv']),
                     all_series['covid_ai']/all_series['ai'],
                     all_series['covid_ai']/all_series['covid']],axis=1).rename(
columns={'covid':'Covid-19 (% all)','covid_ai':'Covid-19 AI papers (% all)',0:'Covid-19 AI papers (% AI)',
         1:'Covid-19 AI papers (% Covid)'})
# Keep dates after 1 Nov 2019 and reshape to long format. The axis is renamed
# explicitly because pandas >= 2.0 names a value_counts index after its column,
# which would leave reset_index without the 'index' column that melt needs.
norm_rec = norm.loc[pd.to_datetime(norm.index)>datetime.datetime(2019,11,1)].rename_axis('index').reset_index(drop=False).melt(id_vars='index')

In [None]:
# One panel per share series, each with its own y-scale, smoothed with a
# centred 15-row window computed per series.
lch = (
    alt.Chart(norm_rec)
    .transform_window(rolling_mean='mean(value)',frame=[-7,7],groupby=['variable'])
    .mark_line()
    .encode(
        x=alt.X('index',title=''),
        y=alt.Y('rolling_mean:Q',title=''),
        row='variable',
    )
    .resolve_scale(y='independent')
    .properties(height=110,width=600)
)

save_fig(lch,"fig_2_evolution")

lch

In [None]:
# By data source

# For each category (covid, ai, covid_ai): percentage of its papers that come
# from each article source.
source_shares = 100*pd.concat(
    [xiv_recent.query(q)['article_source'].value_counts(normalize=True).rename(n) for n,q in zip(names,queries)],axis=1)

# pandas >= 2.0 names the value_counts index 'article_source'; renaming the
# axis keeps the 'index' column that melt uses as its identifier.
source_shares_long = source_shares.rename_axis('index').reset_index(drop=False).melt(id_vars=['index'])

In [None]:
# Stacked bars: for each category, the percentage of its papers from each source.
base = alt.Chart(
    source_shares_long).mark_bar().encode(y=alt.Y('variable',title='Category'),
                                          x=alt.X('value',title='% of papers in category'),
                                          color=alt.Color('index:N',title='Source'))

# Size the bar chart built above for the report. It is referenced as `base`,
# the only chart defined in this cell.
source_prop = base.properties(width=400,height=100)

save_fig(source_prop,'fig_3_source_shares')

source_prop

In [None]:
# Total number of papers flagged as AI. Series.sum is vectorised and skips
# missing values, matching the .mean() calls used for the shares above.
xiv['is_ai'].sum()