## Section 1: Descriptive analysis

  * How much Covid and AI activity do we detect in our data sources?
  * Is AI over or underrepresented in Covid research
  * How has AI activity evolved over time?
  

## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import altair as alt
from altair import expr, datum
from altair import expr
from ai_covid_19.utils.utils import *

In [None]:
import os

# Folder that receives the HTML figures saved by the chart cells below
fig_path = f'{project_dir}/reports/figures/mohg_figures'

# Chart.save() opens the target file directly and fails if the folder is
# missing (e.g. on a fresh checkout), so make sure it exists up front.
os.makedirs(fig_path, exist_ok=True)

## 1. Read data

In [None]:
# Load the processed paper table and pass it through the project's
# `preview` helper (imported from ai_covid_19.utils.utils)
rxiv_file = f"{data_path}/processed/rxiv_papers_update.csv"
rxiv = preview(pd.read_csv(rxiv_file))

In [None]:
# Parse the 'created' field into a datetime column; every time-based
# filter and chart below works off 'date'
rxiv['date'] = rxiv['created'].pipe(pd.to_datetime)

In [None]:
# Flag papers that are tagged as both Covid-19 and AI
covid_flag, ai_flag = rxiv['is_covid'], rxiv['is_ai']
rxiv['is_covid_ai'] = covid_flag & ai_flag

## 2. Data analysis

### Initial exploration and processing

In [None]:
# Headline counts and shares of AI, Covid-19 and Covid-19 AI papers.
# Percentages are rounded *after* scaling to 100: rounding the fraction first
# and then multiplying re-introduces binary floating point noise
# (e.g. 5.800000000000001) in the printed value.
covid_ai_flag = rxiv['is_covid'] & rxiv['is_ai']

print(f"""
total: {len(rxiv)}
ai: {rxiv['is_ai'].sum()} ({np.round(100*rxiv['is_ai'].mean(),1)}%)
covid = {rxiv['is_covid'].sum()} ({np.round(100*rxiv['is_covid'].mean(),2)}%)
covid_ai = {covid_ai_flag.sum()} ({np.round(100*covid_ai_flag.mean(),5)}%)
""")

In [None]:
#We explore some of the trends in sources. It looks like the data collections are misaligned

# Papers per date and source, restricted to dates after 1 January 2020
s = rxiv.groupby(['date','article_source']).size().reset_index(name='count')
s = s.loc[pd.to_datetime(s['date'])>datetime.datetime(2020,1,1)]

# The window transform stores the smoothed series in a new field called
# 'rolling'; that field (not the raw 'count') has to be encoded on the y axis
# for the smoothing to be visible. The window is sorted by date explicitly so
# the result does not depend on the row order of `s`.
tr = (alt
 .Chart(s)
 .transform_window(rolling='mean(count)',frame=[-10,10],groupby=['article_source'],
                   sort=[alt.SortField('date')])
 .mark_line()
 .encode(x='date',
         y=alt.Y('rolling:Q',title='count (rolling mean)'),
         row='article_source')).properties(height=100)
tr

Data collection for biorXiv and medrXiv finished before arXiv. We therefore drop all papers published after the last date for which every source has data.

In [None]:
# Most recent paper date available in each source; used to choose a cut-off
# date that every source covers
rxiv.groupby('article_source')['date'].agg('max')

In [None]:
# Keep only papers dated on or before the cut-off (17 June 2020)
cutoff_date = datetime.datetime(2020, 6, 17)
on_or_before_cutoff = rxiv['date'] <= cutoff_date
rxiv = rxiv.loc[on_or_before_cutoff]

### AI representation in Covid-19 research

In [None]:
# Share of AI / non-AI papers inside and outside Covid-19 research (2020 papers only)

# Paper counts: one row per is_covid value, one column per is_ai value
counts_2020 = (rxiv.query('year == 2020')
               .groupby(['is_covid','is_ai'])
               .size()
               .unstack('is_ai'))

# Turn counts into within-row shares, then reshape to long format for plotting
row_shares = counts_2020.div(counts_2020.sum(axis=1), axis=0)
cov_ai_ct = row_shares.reset_index(drop=False).melt(id_vars='is_covid')

# Human-readable labels for the two flags
cov_lookup = {True:'COVID_19',False:'Not COVID_19'}
ai_lookup = {True:'AI',False:'Not AI'}

for column, labels in (('is_covid', cov_lookup), ('is_ai', ai_lookup)):
    cov_ai_ct[column] = cov_ai_ct[column].map(labels)

# Percentage string shown in the chart tooltip
cov_ai_ct['value_label'] = cov_ai_ct['value'].map(lambda share: f"{np.round(100*share,2)}%")

In [None]:
# Stacked horizontal bars: AI vs non-AI share within Covid-19 and non-Covid-19 papers
share_bar_encoding = dict(
    y=alt.Y('is_covid:N',title=''),
    x=alt.X('value:Q',title='% of all papers in category'),
    color=alt.Color('is_ai:N',title='Paper category'),
    order=alt.Order('is_ai:N',sort='ascending'),
    tooltip=['value_label:N'],
)

base = (
    alt.Chart(cov_ai_ct)
    # rescale the share to a percentage inside the chart spec
    .transform_calculate(value = 100*alt.datum.value)
    .mark_bar()
    .encode(**share_bar_encoding)
)

base.save(f'{fig_path}/fig_1.html')

base

In [None]:
# AI share among Covid-19 papers relative to the AI share among all other
# papers, as a percentage. Rows are picked by label instead of by their
# position in the melted table, so the result does not silently change if the
# row order of `cov_ai_ct` does.
ai_share = cov_ai_ct.loc[cov_ai_ct['is_ai'] == 'AI'].set_index('is_covid')['value']
ai_ratio = ai_share['COVID_19']/ai_share['Not COVID_19']

print(f"{np.round(100*ai_ratio,2)}%")

### Research trends

We compare research trends in all the data, COVID-19 applications as a share of AI, and AI applications as a share of COVID-19.

In [None]:
# Restrict the analysis to papers with year > 2016
rxiv_recent = rxiv.loc[rxiv['year'] > 2016]

# Number of papers per date across the whole corpus
papers_per_date = rxiv_recent['date'].value_counts()
rxiv_daily = papers_per_date.rename('all_xiv')

In [None]:
# Filters selecting each paper category, and the column name given to each
queries = ["is_covid == 1","is_ai == 1","is_covid_ai ==1"]
names = ['covid','ai','covid_ai']

# Papers per date for each category
category_daily = [rxiv_recent.query(q)['date'].value_counts().rename(n) for n,q in zip(names,queries)]

# One column per series; dates with no papers in a category count as 0.
# value_counts() orders by frequency, so sort chronologically afterwards.
all_series = pd.concat([rxiv_daily]+category_daily,axis=1).fillna(0).sort_index()

# pandas >= 2.0 names the value_counts() index after the source column, so the
# axis is renamed explicitly to keep the 'index' column expected downstream.
all_series_long = all_series.rename_axis('index').reset_index(drop=False).melt(id_vars='index')

In [None]:
#Normalise the time series
# Each column is a percentage share; columns are named directly rather than
# relying on the auto-generated 0/1 labels that concat gives unnamed Series.
norm = 100*pd.DataFrame({
    'Covid-19 (% all)': all_series['covid']/all_series['all_xiv'],
    'Covid-19 AI papers (% AI)': all_series['covid_ai']/all_series['ai'],
    'Covid-19 AI papers (% Covid)': all_series['covid_ai']/all_series['covid']})

#Focus on recent times
# Sorted chronologically and reshaped to long format. The axis is renamed
# because pandas >= 2.0 would otherwise call the date column 'date', not 'index'.
norm_rec = (norm.loc[pd.to_datetime(norm.index)>datetime.datetime(2019,11,1)]
            .sort_index()
            .rename_axis('index')
            .reset_index(drop=False)
            .melt(id_vars='index'))

In [None]:
#Create the chart

# One row per normalised series, each smoothed with a centred 15-row rolling
# mean. The window is sorted by date explicitly: without a sort the window
# follows the row order of `norm_rec`, which is not guaranteed to be
# chronological.
linechart = (alt.Chart(norm_rec)
 .transform_window(rolling_mean = 'mean(value)',frame=[-7,7],groupby=['variable'],
                   sort=[alt.SortField('index')])
 .mark_line().encode(x=alt.X('index',title=''),
                     y=alt.Y('rolling_mean:Q',title=''),
                             row=alt.Row('variable',title='')).resolve_scale(y='independent')
 .properties(height=110,width=600))

linechart.save(f'{fig_path}/fig_2.html')

linechart

### Representation by data source

We compare shares of activity by data source

In [None]:
#Calculate shares of covid and ai activity in different data sources (arXiv, biorXiv, medrXiv)
#Note we are only focusing on publications after 2016 (rxiv_recent)

# Percentage of each category's papers coming from each source
source_shares = 100*pd.concat(
    [rxiv_recent.query(q)['article_source'].value_counts(normalize=True).rename(n) for n,q in zip(names,queries)],axis=1)

# pandas >= 2.0 names the value_counts() index 'article_source'; rename the
# axis so the source column is always called 'index', as used below.
source_shares_long = source_shares.rename_axis('index').reset_index(drop=False).melt(id_vars=['index'])

# Display labels for categories and sources
variable_lookup = {'covid':'COVID-19','ai':'AI','covid_ai':'AI and COVID-19'}
source_lookup = {'medrxiv':'medrXiv','arxiv':'arXiv','biorxiv':'biorXiv'}

for column, labels in (('variable', variable_lookup), ('index', source_lookup)):
    source_shares_long[column] = source_shares_long[column].map(labels)

# Percentage string shown in the chart tooltip
source_shares_long['value_label'] = [str(np.round(x,2))+'%' for x in source_shares_long['value']]

In [None]:
# Stacked bars: share of each paper category coming from each source
source_bar_encoding = dict(
    y=alt.Y('variable',title='Category'),
    x=alt.X('value',title='% of papers in category'),
    color=alt.Color('index:N',title='Source'),
    tooltip=['index','value_label'],
)

base_source = alt.Chart(source_shares_long).mark_bar().encode(**source_bar_encoding)

# Fix the figure size before exporting
source_prop = base_source.properties(width=400,height=100)

source_prop.save(f'{fig_path}/fig_3.html')

source_prop