# Section 4: Knowledge base

* What topics do AI researchers draw on?  

## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import altair as alt
from itertools import chain
from scipy.stats import entropy, zscore
from data_getters.inspector import get_schemas
from dotenv import load_dotenv,find_dotenv
from ai_covid_19.utils.utils import *

In [None]:
def overlap(set_1,set_2):
    """Return the overlap between two sets as a percentage of their union.

    Args:
        set_1: first set.
        set_2: second set.

    Returns:
        100 * |intersection| / |union| as a float. When both sets are empty
        the union is empty and the ratio is undefined, so NaN is returned
        instead of raising ZeroDivisionError.
    """
    union_size = len(set_1.union(set_2))

    #Two empty sets have no defined overlap
    if union_size == 0:
        return float('nan')

    return 100*len(set_1.intersection(set_2))/union_size

In [None]:
def flatten(_list,freq=False,norm=True):
    """Flatten an iterable of iterables into one list, optionally counting items.

    Args:
        _list: iterable whose elements are themselves iterable.
        freq: when equal to False the flat list is returned; otherwise the
            frequency of every flattened element is returned.
        norm: forwarded to `value_counts(normalize=...)`; True gives
            proportions, False gives raw counts.

    Returns:
        A list of all inner elements in order, or a pandas Series as produced
        by `value_counts`.
    """
    merged = []
    for inner in _list:
        merged.extend(inner)

    #Plain flattening requested: nothing to count
    if freq == False:
        return merged

    return pd.Series(merged).value_counts(normalize=norm)

## 1. Load data

In [None]:
#Processed rXiv metadata: ids are read as strings and the AI / covid flags as
#booleans. `preview` is a helper that is not defined in this notebook section.
rxiv = (
    pd.read_csv(
        f"{data_path}/processed/rxiv_metadata.csv",
        dtype={'id': str, 'is_ai': bool, 'is_covid': bool},
    )
    .pipe(preview)
)

In [None]:
#Processed covid semantic table; article ids are kept as strings
topics = (
    pd.read_csv(
        f"{data_path}/processed/covid_semantic.csv",
        dtype={'article_id': str},
    )
    .pipe(preview)
)

In [None]:
#Fields of study table. Rows without a name are dropped after the preview
#(names are lower-cased later in the notebook, which needs every name present)
mag_fos = (
    pd.read_csv(f"{data_path}/processed/mag_fos.csv")
    .pipe(preview)
    .dropna(axis=0, subset=['name'])
)

In [None]:
#MAG metadata for articles, parsed from JSON
with open(f"{data_path}/processed/ai_article_mag_info.json", 'r') as infile:
    article_mag = json.load(infile)

#Citation lookup, parsed from JSON (keys are therefore strings)
with open(f"{data_path}/processed/citation_lookup.json", 'r') as infile:
    citation_lookup = json.load(infile)

## 2. Data analysis

### 0. Process fields of study info

In [None]:
#Lower-cased version of every field of study name
mag_fos['name_l'] = [fos_name.lower() for fos_name in mag_fos['name']]

#Lookup from level to the set of (lower-cased) names found at that level
mag_levels = (
    mag_fos
    .groupby('level')['name_l']
    .apply(set)
)

In [None]:
#Keep the papers that have a MAG id and convert that id to a string of
#digits, so it can be matched against the string keys of citation_lookup.
#The column is built with `assign` rather than set on the output of `dropna`,
#which is the pattern that triggers pandas' SettingWithCopyWarning.
topics_ = (
    topics
    .dropna(axis=0, subset=['mag_id'])
    .assign(mag_id=lambda df: [str(int(x)) for x in df['mag_id']])
)

#One row per MAG id with the columns needed for the citation analysis
cov_short = (
    topics_[['article_id','mag_id','cluster','is_ai']]
    .drop_duplicates('mag_id')
    .reset_index(drop=True)
)

#This gives us all the cited papers by a paper in the covid dataset
#(ids that are missing from citation_lookup map to NaN)
cov_short['cited'] = cov_short['mag_id'].map(citation_lookup)

In [None]:
#Extract field of study sets for each element in cited.
#The explicit copy makes cov_cits an independent frame, so adding columns
#below does not trigger pandas' SettingWithCopyWarning.
cov_cits = cov_short.dropna(axis=0,subset=['cited']).copy()

#Fields of study (with repetition) of every cited paper found in article_mag;
#cited papers without a 'fields_of_study' entry contribute nothing
cov_cits['fos_cited'] = [
    flatten([article_mag[x].get('fields_of_study', []) for x in cit if x in article_mag])
    for cit in cov_cits['cited']]

cov_cits['fos_cited_unique'] = [set(x) for x in cov_cits['fos_cited']]

#Fetch the level 0 name set once instead of once per cited field
level_0_names = mag_levels[0]

cov_cits['fos_cited_l0'] = [[x for x in cited if x in level_0_names] for cited in cov_cits['fos_cited']]

### 1. Distribution of citations across level-0 fields of study (disciplines)

What is the distribution of citations to high level disciplines inside AI vs outside?

In [None]:
#Share (%) of level 0 field citations going to each field, computed
#separately for each value of is_ai
l1_cits = (
    cov_cits
    .groupby('is_ai')['fos_cited_l0']
    .apply(lambda cited_fields: 100 * flatten(cited_fields, freq=True))
    .reset_index(drop=False)
    .pipe(preview)
)

In [None]:
#Readable labels for the chart: capitalised field names, a category label
#derived from is_ai and a formatted label for the share
l1_cits['level_1'] = [field.capitalize() for field in l1_cits['level_1']]
l1_cits['Category'] = convert_ai(l1_cits['is_ai'])
l1_cits['value_label'] = make_pc(l1_cits['fos_cited_l0'])

##### Chart

In [None]:
#Horizontal bars, one per field and category. stack=None draws the bars of
#the categories over each other (hence the opacity) instead of stacking them.
#Fields are ordered by their summed share, largest first.
bar = (
    alt.Chart(l1_cits)
    .mark_bar(opacity=0.5, stroke='black')
    .encode(
        y=alt.Y(
            'level_1',
            sort=alt.EncodingSortField('fos_cited_l0', 'sum', order='descending'),
            title='Field of Study',
        ),
        x=alt.X(
            'fos_cited_l0',
            stack=None,
            title=['% of citations by papers', 'in category'],
        ),
        color='Category',
        tooltip=['Category', 'level_1', 'value_label'],
    )
)

bar_2 = bar.properties(height=300, width=400)

#Export the figure and display it in the notebook
bar_2.save(f"{fig_path}/fig_8.html")

bar_2

##### Over or underrepresentation of citations in a topic

In [None]:
#Fields as rows, categories as columns. prop is the AI share divided by the
#non-AI share, so values above 1 mean a field is cited relatively more by AI papers.
cit_rep = (
    l1_cits
    .pivot_table(index='level_1', columns='Category', values='fos_cited_l0')
    .assign(prop=lambda shares: shares['AI'] / shares['Not AI'])
    .dropna()
)

#Show the ratio for three fields of interest
cit_rep.loc[['Medicine','Biology','Computer science']]

##### Overlap in citations between AI and non-AI papers

In [None]:
#Set of all cited papers for each value of is_ai
cit_sets = cov_cits.groupby('is_ai')['cited'].apply(lambda x: set(chain.from_iterable(x)))

#Share of all cited papers that are cited by both groups. This reuses the
#overlap helper defined at the top of the notebook instead of repeating its formula.
print(str(np.round(overlap(cit_sets[True],cit_sets[False]),2))+'%')

### Distribution of citations across level-1 fields of study (subfields)

Here we compare citations between AI / non-AI research at a higher level of field of study granularity

In [None]:
#Fetch the level 1 name set once instead of once per cited field
level_1_names = mag_levels[1]

#Cited fields of study restricted to those found at level 1
cov_cits['fos_cited_l1'] = [[x for x in cited if x in level_1_names] for cited in cov_cits['fos_cited']]

In [None]:
#We want to focus on the most cited topics within the corpus
#NOTE(review): the name says 50 but only the first 30 entries are kept - confirm which is intended
top_50_topics = list(
    flatten(cov_cits['fos_cited_l1'], freq=True)
    .iloc[:30]
    .index
)

#Clusters of the AI papers, most frequent cluster first
ai_clusters_sorted = list(
    cov_short
    .query('is_ai==True')['cluster']
    .value_counts()
    .index
)

In [None]:
#This extracts the distribution of citations per category / cluster:
#within every (is_ai, cluster) group, the % of level 1 field citations going to each field
cov_fos1 = (
    cov_cits
    .groupby(['is_ai','cluster'])['fos_cited_l1']
    .apply(lambda group_fields: 100 * flatten(list(group_fields), freq=True))
    .reset_index(drop=False)
)

In [None]:
#Lookup from the id of each level 0 field to its name
fos_0_lu = {
    row['id']: row['name']
    for idx, row in mag_fos.query("level == 0").iterrows()
}

#Names of the first listed parent of every level 1 field
#NOTE(review): despite the _lu suffix this is a set, not a mapping - confirm that is intended
fos_1_to_0_lu = {
    fos_0_lu[list(map(int, parents.split(',')))[0]]
    for parents in mag_fos.loc[mag_fos['level'] == 1, 'parent_ids']
}

#Lookup from the lower-cased name of a level 1 field to the name of its first listed parent
name_lookup = {
    row['name'].lower(): fos_0_lu[int(row['parent_ids'].split(',')[0])]
    for rid, row in mag_fos.loc[mag_fos['level'] == 1].iterrows()
}

In [None]:
#Add higher level discipline to table
cov_fos1['discipline'] = cov_fos1['level_2'].map(name_lookup)

#Some cleaning of variable names etc for the chart
cov_fos1['Cluster'] = clean_cluster(cov_fos1['cluster'])
cov_fos1['Subfield'] = [x.capitalize() for x in cov_fos1['level_2']]
cov_fos1['% of citations in Cluster'] = make_pc(cov_fos1['fos_cited_l1'])
top_50_cap = [x.capitalize() for x in top_50_topics]

#Cleaned names of the ten clusters with most AI papers
clean_clust = clean_cluster(ai_clusters_sorted[:10])

#The chart filters the 'cluster' column against clean_clust, so both must hold
#labels that went through clean_cluster exactly once. Reuse the column cleaned
#above rather than running clean_cluster again on already cleaned labels.
cov_fos1['cluster'] = cov_fos1['Cluster']

#NOTE(review): is_ai is overwritten with its label, so this cell is presumably
#not safe to run twice without rebuilding cov_fos1 - confirm against convert_ai
cov_fos1['is_ai'] = convert_ai(cov_fos1['is_ai'])

##### Chart

In [None]:
#One row of points per cluster (restricted to the clusters in clean_clust and
#the subfields in top_50_cap). Point size encodes the share of citations and
#colour the parent discipline.
point_ch = (
    alt.Chart(cov_fos1)
    .transform_filter(alt.FieldOneOfPredicate('Subfield',top_50_cap))
    .transform_filter(alt.FieldOneOfPredicate('cluster',clean_clust))
    .mark_point(filled=True,strokeWidth=0.7,stroke='black')
    .encode(
        y=alt.Y('is_ai:N',title=''),
        #The title is displayed as is, so it must not carry the ':N' type shorthand
        x=alt.X('Subfield:N',sort=top_50_cap,title='Subfield'),
        size=alt.Size('fos_cited_l1',title=['% of all citations','in cluster']),
        color=alt.Color('discipline:N',title='Discipline'),
        tooltip=['Cluster','Subfield','discipline','% of citations in Cluster'],
        row=alt.Row('Cluster',sort=clean_clust,title='Cluster'))
    .properties(width=500)
)

point_ch = point_ch.configure_axis(grid=True)

#Export the figure and display it in the notebook
point_ch.save(f"{fig_path}/fig_9.html")

point_ch

#### Overlaps between AI / non-AI citations across clusters

In [None]:
#Creates a set of cited references per cluster and AI / non AI pair
cit_cluster_sets = cov_cits.groupby(['cluster','is_ai'])['cited'].apply(lambda x: set(chain.from_iterable(x)))

#Overlap (%) between the references of AI and non-AI papers in each cluster
p = {}

for x in set(cov_cits['cluster']):
    rel = cit_cluster_sets[x]
    try:
        p[x] = overlap(rel[True],rel[False])
    except (KeyError, ZeroDivisionError):
        #KeyError: the cluster has no AI papers or no non-AI papers.
        #ZeroDivisionError: neither group has any reference.
        #Only these expected cases become missing values; anything else surfaces.
        p[x] = np.nan

#Clusters sorted from highest to lowest overlap, with cleaned cluster names
ov = pd.Series(p).sort_values(ascending=False).reset_index(name='citation_overlap')
ov['index'] = clean_cluster(ov['index'])

##### Plot

In [None]:
#Bars: reference overlap per cluster, highest first
b = (
    alt.Chart(ov)
    .mark_bar(width=9)
    .encode(
        x=alt.X(
            'index',
            sort=alt.EncodingSortField('citation_overlap', order='descending'),
        ),
        y=alt.Y('citation_overlap'),
    )
)

#Dashed red horizontal reference line at a constant value
#NOTE(review): 3.7 is hard-coded; presumably a figure computed elsewhere in the
#analysis - confirm, and update it if the data change
r = (
    alt.Chart(ov)
    .transform_calculate(mean='3.7')
    .mark_rule(color='red', strokeDash=[1, 1])
    .encode(
        y=alt.Y(
            'mean:Q',
            title=['% reference overlap', 'between AI and non-AI papers'],
        ),
    )
)

#Layer the rule over the bars
fin = (b + r).properties(height=200)

#Export the figure and display it in the notebook
fin.save(f"{fig_path}/fig_10.html")

fin