# Machines of healing grace?

Code with basic analysis and results from the AI v Covid paper

**Sections**

1. Descriptive analysis
  * How much Covid and AI activity do we detect in our data sources?
  * Is AI over- or underrepresented in Covid research?
  * How has AI activity evolved over time?
2. Topical analysis
  * What is the topical composition of Covid research and in what areas is AI focusing?
  * What are some examples of AI research to tackle Covid?
  * How has it evolved over time?
3. Geography
  * Where is AI research happening?
  * Who is doing it?
  * Do we find any differences in the topics that different countries focus on?
  * What explains whether a country focuses on Covid research: demand pull or supply push?
4. **Knowledge base**
  * What topics do AI researchers draw on?
5. Analysis of diffusion
  * What determines the focus of AI researchers on particular topics?
  * Does Covid-oriented AI research reflect the composition of the broader field?
  * What researchers have been attracted to AI research and why?
  

## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import altair as alt
from altair_saver import save

from scipy.stats import entropy, zscore
from cord19.estimators.complexity import *

from data_getters.inspector import get_schemas
from data_getters.core import get_engine
from dotenv import load_dotenv,find_dotenv

In [None]:
#Load the environment variables declared in the .env file located by find_dotenv
load_dotenv(find_dotenv())
#Value of the config_path environment variable; it is passed to get_engine further down
sql_creds = os.getenv('config_path')

In [None]:
#Folder where save_fig writes the figures
FIG_PATH = f"{project_dir}/reports/figures/report_1"
#Folder with the processed AI research data
SRC_PATH = f"{project_dir}/data/processed/ai_research"


In [None]:
#Switch off the pandas chained assignment warning (several cells assign columns to filtered dfs)
pd.options.mode.chained_assignment = None

In [None]:
def preview(x):
    '''Prints the first rows and the shape of a table and hands the table back
    
    Returning the input means the function can be dropped into a .pipe() chain
    
    '''
    for summary in (x.head(),'\n',x.shape):
        print(summary)
    
    return x

def herf(vector):
    '''Calculates the complement of the herfindahl concentration index for a vector
    
    The value is 1 - sum(share**2), where share is each element divided by the total
    of the vector. It is 0 when a single element accounts for the whole total and it
    grows as the total is spread over more elements.
    
    Args:
        vector: iterable of numbers
    
    Returns:
        1 minus the herfindahl index of the vector
    
    '''
    values = list(vector)
    
    #The total is calculated once rather than once per element
    total = sum(values)
    
    return 1-np.sum([(x/total)**2 for x in values]) #NB we are doing the reverse of herf

def binarise(vector,thres=0.1):
    '''Flags the elements of a vector that are strictly above a threshold
    
    Args:
        vector: iterable of values that can be compared with thres
        thres: threshold; values equal to it are not flagged
    
    Returns:
        A list with the result of the comparison for each element
    
    '''
    flags = []
    
    for value in vector:
        flags.append(value>thres)
    
    return flags

def calculate_diversities(topic_mix,measures,measure_names):
    '''Calculates several diversity measures for every row of a df
    
    Args:
        topic_mix: df where each row is the unit we want to measure
        measures: functions that are applied to each row of topic_mix
        measure_names: column names for the output, in the same order as measures
    
    Returns:
        A df with the index of topic_mix and the results of the measures as columns
    
    '''
    results = []
    
    for measure in measures:
        results.append(topic_mix.apply(measure,axis=1))
    
    div = pd.concat(results,axis=1)
    div.columns = measure_names
    
    return div
    
def div_ai_corr(div_long,ai_map,method='spearman'):
    '''Calculates correlation between diversity measures and AI shares
    
    Args:
        div_long: long df with the columns cluster, variable (name of the diversity
            measure) and value
        ai_map: lookup between clusters and their AI share (anything Series.map accepts)
        method: correlation method passed to DataFrame.corr
    
    Returns:
        Correlation matrix between ai_share and each of the diversity measures
    
    '''
    dv = div_long.copy()
    
    #Use the lookup passed as an argument instead of depending on a global variable
    dv['ai_share'] = dv['cluster'].map(ai_map)
    
    wide = dv.pivot_table(index=['cluster','ai_share'],
                          columns='variable',values='value')
    
    #Moving ai_share from the index into the columns includes it in the correlation matrix
    corr = wide.reset_index(level=1).corr(method=method)
    return corr

def complexity_index(x):
    '''
    Calculates complexity index for a group / topic mix matrix
    
    The matrix is passed through create_lq and the output of that step through calc_eci
    '''
    
    return calc_eci(create_lq(x))

def save_fig(figure,name):
    '''Saves a chart as a png file in FIG_PATH
    
    Args:
        figure: chart to save
        name: name of the file without extension
    
    '''
    target = f'{FIG_PATH}/{name}.png'
    
    save(figure,target,method='selenium',webdriver=DRIVER,scale_factor=3)


In [None]:
def flatten(_list,freq=False,norm=True):
    '''Flattens a list of lists by one level, optionally returning element frequencies
    
    Args:
        _list: iterable of iterables. Note that strings are iterables too, so a
            string element is split into its characters
        freq: if truthy, returns the frequency of each element instead of the flat list
        norm: passed to value_counts as normalize (shares if True, counts if False).
            It is only used when freq is truthy
    
    Returns:
        A list with the elements of all the sub-lists or, when freq is truthy, 
        a series with the frequency of each element sorted in descending order
    
    '''
    
    flat = [x for el in _list for x in el]
    
    if not freq:
        return flat
    
    return pd.Series(flat).value_counts(normalize=norm)

## Load data

In [None]:
#Paper - topic table. The is_ai flag is turned into a boolean after reading the file
tops = pd.read_csv(
    f"{project_dir}/data/processed/ai_research/tidy_paper_topics_ai_2.csv"
).astype({'is_ai':bool})



In [None]:
#Information about articles, keyed by id. Further down it is indexed with the ids in 
#citation_lookup to read each article's 'fields_of_study'
with open(f"{project_dir}/data/processed/ai_research/ai_article_mag_info.json",'r') as infile:
    article_mag = json.load(infile)
    
#Lookup between the mag id of a paper (as a string) and the ids of the articles it cites
with open(f"{project_dir}/data/processed/ai_research/citation_lookup.json",'r') as infile:
    citation_lookup = json.load(infile)

In [None]:
#This MAG fields of study table contains information about the mag hierarchy we can use to parse it
con = get_engine(sql_creds)

#The table is read in chunks of 1000 rows that are then stacked into a single df
mag_fos = pd.concat(pd.read_sql('mag_fields_of_study',con,chunksize=1000))

## 1. Knowledge base of various clusters

### Process fields of study info

In [None]:
#Lowercased version of the field of study names
mag_fos['name_l'] = [x.lower() for x in mag_fos['name']]

In [None]:
#Series indexed by hierarchy level with the set of (lowercased) field names in each level
mag_levels = mag_fos.groupby('level')['name_l'].apply(set)

### Extract cited fos by paper

In [None]:
#Convert mag ids to strs in the corpus df (the keys of citation_lookup are strings)
tops_ = tops.dropna(axis=0,subset=['mag_id'])
tops_['mag_id'] = tops_['mag_id'].map(lambda mag_id: str(int(mag_id)))

#One row per paper (mag id) with its cluster and AI flag
cov_short = (tops_[['article_id','mag_id','cluster','is_ai']]
             .drop_duplicates('mag_id')
             .reset_index(drop=True))

#Ids cited by each paper; papers missing from the lookup get a missing value
cov_short['cited'] = cov_short['mag_id'].map(citation_lookup)

In [None]:
#Collect field of study sets for each element in cited
cov_cits = cov_short.dropna(axis=0,subset=['cited'])

#Cited articles without a 'fields_of_study' entry do not contribute any field
cov_cits['fos_cited'] = [
    flatten([article_mag[cited_id].get('fields_of_study',[]) for cited_id in cit])
    for cit in cov_cits['cited']]

cov_cits['fos_cited_unique'] = [set(fields) for fields in cov_cits['fos_cited']]

In [None]:
#Keep the cited fields whose name is in the level 0 set of mag_levels
#NOTE(review): the column is called l1 but the filter uses mag_levels[0] - confirm the level intended
cov_cits['fos_cited_l1'] = [[x for x in cited if x in mag_levels[0]] for cited in cov_cits['fos_cited']]

### High level analysis: distribution of citations at level 1

In [None]:
#In total

#def 

In [None]:
# For AI vs non-AI papers: % of the cited fields in fos_cited_l1 accounted for by each field

l1_cits = cov_cits.groupby('is_ai')['fos_cited_l1'].apply(lambda x: 100*flatten(x,freq=True)).reset_index(
    drop=False).pipe(preview)

In [None]:
#Overlaid (unstacked) bars comparing the citation shares of AI and non-AI papers by field.
#level_1 is the default name that reset_index gave to the unnamed index level with the field names
base = (alt
        .Chart(l1_cits)
        .mark_bar(opacity=0.5,stroke='black')
        .encode(
            y=alt.Y('level_1',sort=alt.EncodingSortField('fos_cited_l1','sum',order='descending'),
                   title='Field of Study'),
            x=alt.X('fos_cited_l1',stack=None,title=['% of citations by papers','in category']),color='is_ai'))

out = base.properties(height=300,width=200)

#Write the chart to FIG_PATH as a png
save_fig(out,"fig_7_field1_citations")

#Display the chart in the notebook
out

In [None]:
def get_field_citation_distribution(df):
    '''
    Calculates distribution of citations by paper and bins them into categories
    
    Args:
        df: df with one row per paper and the columns is_ai, article_id and 
            fos_cited_l1 (list with the fields cited by the paper)
    
    Returns:
        A df with one row per observed combination of is_ai, variable (field being cited) 
        and bin. The bin is the position, out of 20 equal-width bins calculated inside each 
        field / is_ai group, of the share of a paper's citations that go to the field. 
        share_fields_cited contains the number of papers in the bin and norm contains that 
        number as a share of the papers in the same is_ai / variable group
    
    '''
    
    #Papers that do not cite any field are dropped
    df_w_cit = df.loc[[len(x)>0 for x in df['fos_cited_l1']]].reset_index(drop=False)
    
    logger.info(f"total with citations={len(df_w_cit)} and total_ai={sum(df_w_cit['is_ai']==1)}")
    
    #Citation distribution by paper. The index is reset so that rows line up by position with
    #df_w_cit whatever name value_counts gives to its output (it differs between pandas versions)
    p_f1_citations = pd.DataFrame(
        [pd.Series(x).value_counts() for x in df_w_cit['fos_cited_l1']]).reset_index(drop=True).apply(
        lambda x: x/x.sum(),axis=1).fillna(0)
    
    paper_cit_distr = pd.concat([df_w_cit[['is_ai','article_id']],p_f1_citations],axis=1)

    paper_cit_distr_long = paper_cit_distr.melt(id_vars=['is_ai','article_id'],value_name='share_fields_cited')
    
    #transform returns the bins with the index of its input so they can be joined back 
    #without depending on the order in which groupby returns its results
    bins = paper_cit_distr_long.groupby(
        ['variable','is_ai'])['share_fields_cited'].transform(
        lambda x: pd.cut(x,bins=20,labels=False)).rename('bin')

    paper_cit_distr_long_2 = pd.concat([paper_cit_distr_long,bins],axis=1)
    
    #Number of papers in each bin (size keeps the name of the selected column)
    paper_cit_bins = paper_cit_distr_long_2.groupby(
    ['is_ai','variable','bin'])['share_fields_cited'].size().reset_index(drop=False)

    #Normalise the counts by the number of papers in each is_ai / field group
    group_totals = paper_cit_bins.groupby(['is_ai','variable'])['share_fields_cited'].transform('sum')
    paper_cit_bins['norm'] = paper_cit_bins['share_fields_cited']/group_totals
    
    return paper_cit_bins
    
    

In [None]:
#Binned citation distribution for all the papers with citation information
all_paper_cit_bins = get_field_citation_distribution(cov_cits)

In [None]:
def make_hist(df,t):
    '''Plots the share of papers in each bin for a selection of cited fields
    
    Args:
        df: df with the columns variable, bin, norm and is_ai
        t: title for the chart
    
    Returns:
        A chart with one row per field where the bars are coloured by is_ai and overlaid
    
    '''
    #Fields that are plotted, in the order in which their rows are displayed
    fields = ['computer science','medicine','biology','mathematics']
    
    bars = (alt.Chart(df)
            .transform_filter(alt.FieldOneOfPredicate('variable',fields))
            .mark_bar(opacity=0.5,width=5,stroke='black',strokeWidth=1))
    
    h = bars.encode(
        x='bin',
        y=alt.Y('norm',stack=None,title='Share of papers'),
        row=alt.Row('variable',sort=fields,title='Field being cited'),
        color='is_ai:N')
    
    return h.properties(height=100,width=120,title=t)

In [None]:
#Histograms for the whole corpus
all_hist = make_hist(all_paper_cit_bins,"All papers")

In [None]:
#Clusters sorted by their number of AI papers (value_counts sorts in descending order)
ai_clusters_sorted = list(cov_short.query('is_ai==1')['cluster'].value_counts().index)
#NOTE(review): the variable is called top_5 but only the first 2 clusters are kept
top_5_ai_clusters = ai_clusters_sorted[:2]

In [None]:
#Binned citation distribution for the papers in each of the selected clusters
clusts_distr = [get_field_citation_distribution(cov_cits.loc[cov_cits['cluster']==c]) for c in top_5_ai_clusters]

#One chart for the whole corpus followed by one chart per cluster, titled with the cluster name
hists = [all_hist]+[make_hist(cl,n) for cl,n in zip(clusts_distr,top_5_ai_clusters)]

In [None]:
#Put the charts side by side with a shared y scale. Note that this replaces the list 
#of charts stored in hists with the combined chart
hists = alt.hconcat(*hists).resolve_scale(y='shared')

#Write the chart to FIG_PATH as a png
save_fig(hists,"fig_8_hists")

#Display the chart in the notebook
hists

In [None]:
#Heatmap with citations to group 3 by category?

In [None]:
#Overwrite fos_cited_l1 with the cited fields whose name is in the level 1 set of mag_levels.
#From here on the column no longer contains the level 0 fields used in the charts above
cov_cits['fos_cited_l1'] = [[x for x in cited if x in mag_levels[1]] for cited in cov_cits['fos_cited']]

In [None]:
#Most cited fields in fos_cited_l1 
#NOTE(review): the variable is called top_50 but only the first 30 fields are kept
top_50_topics = list(flatten(cov_cits['fos_cited_l1'],freq=True)[:30].index)

In [None]:
#% of the cited fields in fos_cited_l1 accounted for by each field, by is_ai and cluster.
#After reset_index the field names end up in a column with the default name level_2
cov_fos1 = cov_cits.groupby(
    ['is_ai','cluster'])['fos_cited_l1'].apply(lambda x: 100*flatten(list(x),freq=True)).reset_index(drop=False)

In [None]:
#Get a lookup
#Id to name for the level 0 fields
fos_0_lu = {r['id']:r['name'] for idx,r in mag_fos.query("level == 0").iterrows()}

#Names of the level 0 fields that are the first parent of some level 1 field
#NOTE(review): despite its name this is a set, not a lookup, and it is not used in the code visible here
fos_1_to_0_lu = {fos_0_lu[[int(x) for x in pars.split(',')][0]] for pars in mag_fos.loc[mag_fos['level']==1]['parent_ids']}

#Lowercased name of each level 1 field to the name of the first of its parents in parent_ids
name_lookup = {r['name'].lower():fos_0_lu[int(r['parent_ids'].split(',')[0])] for rid,r in mag_fos.loc[mag_fos['level']==1].iterrows()}

In [None]:
#Add higher level discipline to table
#Fields that are not in name_lookup get a missing value

cov_fos1['discipline'] = cov_fos1['level_2'].map(name_lookup)

In [None]:
#Bubble chart with the fields in top_50_topics on the x axis, one row per cluster (first 10 
#clusters in ai_clusters_sorted) and AI / non-AI papers on the y axis. The size of a bubble 
#is the value of fos_cited_l1 in cov_fos1 and its colour is the discipline of the field
point_ch = (alt.Chart(cov_fos1)
 .transform_filter(alt.FieldOneOfPredicate('level_2',top_50_topics))
 .transform_filter(alt.FieldOneOfPredicate('cluster',ai_clusters_sorted[:10]))
 .mark_point(filled=True,strokeWidth=0.7,stroke='black')
 .encode(y='is_ai',
         x=alt.X('level_2',sort=top_50_topics),
         size=alt.Size('fos_cited_l1',title=['% of all citations','in category']),
         color='discipline:N',
      row=alt.Row('cluster',sort=ai_clusters_sorted[:10]))).properties(width=500)

point_ch = point_ch.configure_axis(grid=True)

#Write the chart to FIG_PATH as a png
save_fig(point_ch,"fig_9_bubble")

#Display the chart in the notebook
point_ch