# Modularity analysis

We examine whether fields with higher levels of AI activity have more knowledge modularity than fields with lower levels.

Actions:
1. Load relevant data
  * topic mix
  * citation data
2. Exploratory analysis
  * Knowledge basis for other clusters
3. Analyse modularity
  * Topic distribution in different clusters
  * Modularity of citation network for each cluster
  * Modularity of topics leveraged in each cluster
  
The above will probably require a function to extract networks from co-occurrence item lists and an algorithm to calculate the modularity of those networks.

## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import altair as alt
from altair_saver import save
from selenium import webdriver

from scipy.stats import entropy, zscore
from cord19.estimators.complexity import *

from data_getters.inspector import get_schemas
from data_getters.core import get_engine
from dotenv import load_dotenv,find_dotenv

In [None]:
# Load environment variables from the .env file located by find_dotenv
load_dotenv(find_dotenv())
# Value of the 'config_path' environment variable; handed to get_engine when connecting to the database
# NOTE(review): `os` is not imported in this notebook - presumably provided by notebook_preamble.ipy
sql_creds = os.getenv('config_path')

# Path read from 'chrome_driver_path'; DRIVER is later passed to altair_saver's save() to export charts
driver_path = os.getenv('chrome_driver_path')
DRIVER = webdriver.Chrome(executable_path=driver_path)

In [None]:
# Turn off pandas' chained-assignment warning: later cells add columns to
# frames that were obtained by filtering other frames (e.g. after dropna)
pd.options.mode.chained_assignment = None

In [None]:
def preview(x):
    '''Print the head and the shape of a dataframe-like object and return it.

    Returning the input unchanged lets the function sit inside a ``.pipe`` chain.
    '''
    head, shape = x.head(), x.shape
    # Same layout as three consecutive prints: head, a blank gap, then the shape
    print(head, '\n', shape, sep='\n')
    return x

def herf(vector):
    '''Calculates a diversity score based on the Herfindahl concentration index.

    NB the value returned is the reverse of the Herfindahl index, i.e.
    1 - sum(share_i ** 2), so higher values mean a less concentrated vector.

    Args:
        vector: iterable of non-negative weights (they do not need to sum to 1).

    Returns:
        1 minus the sum of squared shares. An empty vector returns 1.0.
    '''
    # Materialise once so one-shot iterables work and the total is computed a
    # single time instead of once per element
    values = list(vector)
    total = sum(values)

    return 1 - np.sum([(x / total) ** 2 for x in values])

def binarise(vector,thres=0.1):
    '''Binarises the values in a vector based on whether they are above a threshold

    Returns a list with one boolean per element: True when the element is
    strictly greater than thres.
    '''
    return list(value > thres for value in vector)

def calculate_diversities(topic_mix,measures,measure_names):
    '''Applies several diversity measures to every row of a dataframe

    Each function in measures is applied row-wise to topic_mix; the results
    are placed side by side and the columns are labelled with measure_names.
    '''
    per_measure = []
    for measure in measures:
        per_measure.append(topic_mix.apply(measure, axis=1))

    scores = pd.concat(per_measure, axis=1)
    scores.columns = measure_names

    return scores
    
def div_ai_corr(div_long,ai_map,method='spearman'):
    '''Calculates correlation between diversity measures and AI shares

    Args:
        div_long: long dataframe with 'cluster', 'variable' and 'value' columns.
        ai_map: lookup from cluster to its AI share.
        method: correlation method passed to DataFrame.corr.

    Returns:
        Correlation matrix between 'ai_share' and every measure in 'variable'.
    '''
    dv = div_long.copy()

    # Use the lookup passed by the caller rather than a notebook-level global
    dv['ai_share'] = dv['cluster'].map(ai_map)

    # One row per cluster, one column per measure; ai_share is moved out of
    # the index so that it takes part in the correlation
    wide = dv.pivot_table(index=['cluster','ai_share'],
                          columns='variable',values='value').reset_index(level=1)

    return wide.corr(method=method)

def complexity_index(x):
    '''
    Calculates the complexity index for a group / topic mix matrix

    The matrix is passed through create_lq and the result through calc_eci.
    '''
    return calc_eci(create_lq(x))
    

In [None]:
def flatten(_list,freq=False,norm=True):
    '''Flattens a list of iterables by one level

    With freq left at False the flat list is returned. Otherwise the function
    returns the value counts of the flat list, normalised when norm is True.
    '''
    flat = []
    for el in _list:
        flat.extend(el)

    if freq != False:
        return pd.Series(flat).value_counts(normalize=norm)

    return flat

## Load data

In [None]:
# Long table of paper topics; later cells read its 'index', 'topic', 'weight',
# 'cluster', 'is_ai' and 'mag_id' columns
tops = pd.read_csv(f"{project_dir}/data/processed/ai_research/tidy_paper_topics_ai.csv")

# Cast the AI flag to boolean
tops['is_ai']=tops['is_ai'].astype(bool)



In [None]:
# MAG metadata keyed by article id; entries may carry a 'fields_of_study' list (used further down)
with open(f"{project_dir}/data/processed/ai_research/ai_article_mag_info.json",'r') as infile:
    article_mag = json.load(infile)
    
# Lookup from a paper's MAG id (as a string) to the ids it cites
# NOTE(review): values are presumably lists of ids that are keys of article_mag - confirm
with open(f"{project_dir}/data/processed/ai_research/citation_lookup.json",'r') as infile:
    citation_lookup = json.load(infile)

In [None]:
#This MAG fields of study table contains information about the mag hierarchy we can use to parse it
# Database connection built from the config path read in the preamble
con = get_engine(sql_creds)

# Read the table in chunks of 1000 rows and stitch the chunks into a single dataframe
mag_fos = pd.concat(pd.read_sql('mag_fields_of_study',con,chunksize=1000))

## Analysis

### 1. Link between AI share and topic concentration

* Do research fields with more AI activity tend to have less topic diversity / complexity?

In [None]:
#Prep
# Diversity measures applied to each topic mix, and the column names used for their outputs
div_measures = [herf,entropy]
div_measure_names = ['herfindahl','entropy']

#paper-cluster lookup
# One row per paper ('index') is kept before building the paper -> cluster dict
paper_cluster_lu = tops.drop_duplicates('index').set_index('index')['cluster'].to_dict()

#cluster-share_lookup
# Mean of is_ai per cluster (one row per paper), ordered from the highest to the lowest share
ai_shares = tops.drop_duplicates('index').groupby('cluster')['is_ai'].mean().sort_values(
    ascending=False).to_dict()


In [None]:
#Focus on papers
# Wide table: one row per paper, one column per topic, cells hold the topic weight
paper_mixes = tops.pivot_table(index='index',columns='topic',values='weight')

#Calculate diversity measures
div = calculate_diversities(paper_mixes,div_measures,div_measure_names)

#Allocate papers to clusters and calculate means
div['cluster'] = div.index.map(paper_cluster_lu)

# Cluster means are z-scored per measure and melted to long format ('cluster', 'variable', 'value')
div_means = div.groupby('cluster').mean().apply(zscore).reset_index(drop=False).melt(id_vars=['cluster'])

In [None]:
# Correlation between cluster-level mean paper diversity and the clusters' AI shares
div_ai_corr(div_means,ai_shares)

In [None]:
#Calculate field level diversities

In [None]:
# Flag a topic as present in a paper (1) when its weight is above 0.1, otherwise 0
top_bin = paper_mixes.applymap(lambda x: int(x>0.1))
#Label papers with clusters
top_bin['cluster'] = top_bin.index.map(paper_cluster_lu)

# Long format: one row per (paper, topic) presence flag, labelled with the paper's cluster
top_bin_long = top_bin.melt(id_vars='cluster')

# Number of papers with each topic present per cluster, reshaped into a cluster x topic table
cl_top_distr = top_bin_long.groupby(['cluster','topic'])['value'].sum().reset_index(
    name='count').pivot(index='cluster',columns='topic',values='count')

In [None]:
# Diversity measures computed on each cluster's topic counts, in long format
cl_div = calculate_diversities(cl_top_distr,[herf,entropy],['herfindahl','entropy']).reset_index(
    drop=False).melt(id_vars='cluster')

# Complexity index computed from the same cluster x topic table, in long format
eci = complexity_index(cl_top_distr).reset_index(drop=False).melt(id_vars='cluster')

# Stack both sets of measures into one long table
cl_div_2 = pd.concat([cl_div,eci])

In [None]:
# Correlation between the cluster-level diversity / complexity measures and the clusters' AI shares
div_ai_corr(cl_div_2,ai_shares)

## 2. Knowledge base of various clusters

### Process fields of study info

In [None]:
# Lower-cased copy of the field-of-study names
mag_fos['name_l'] = [x.lower() for x in mag_fos['name']]

In [None]:
# Set of lower-cased field names for each value of 'level' (indexed by level)
mag_levels = mag_fos.groupby('level')['name_l'].apply(set)

### Extract cited fos by paper

In [None]:
#Convert mag ids to strs in the corpus df
# Keep only rows that have a MAG id
tops_ = tops.dropna(axis=0,subset=['mag_id'])

# MAG ids become integer-formatted strings so they can be matched against the keys of citation_lookup
tops_['mag_id'] = [str(int(x)) for x in tops_['mag_id']]


# One row per MAG id with the paper-level metadata needed below
cov_short = tops_[['index','mag_id','cluster','is_ai']].drop_duplicates(
    'mag_id').reset_index(drop=True)


# Items cited by each paper; NaN when the paper's id is not in citation_lookup
cov_short['cited'] = cov_short['mag_id'].map(citation_lookup)


In [None]:
#Collect field of study sets for each element in cited
# Papers for which citation information is available
cov_cits = cov_short.dropna(axis=0,subset=['cited'])

# For every paper, the flat list of fields of study of all the items it cites.
# Cited items without a 'fields_of_study' entry contribute nothing.
# NOTE(review): a cited id missing from article_mag would raise KeyError - confirm all ids are present
cov_cits['fos_cited'] = [flatten([article_mag[x]['fields_of_study'] if 'fields_of_study' in 
                          article_mag[x].keys() else [] for x in cit]) for cit in cov_cits['cited']]

# Same information with duplicates removed within each paper
cov_cits['fos_cited_unique'] = [set(x) for x in cov_cits['fos_cited']]

In [None]:
# Keep only the cited fields whose name is in the MAG level-0 set (the column
# is called 'l1' but it is level 0 of the hierarchy that is used here).
# The set is looked up once rather than once per cited field.
_level_0_fields = mag_levels[0]

cov_cits['fos_cited_l1'] = [[x for x in cited if x in _level_0_fields] for cited in cov_cits['fos_cited']]

### High level analysis: distribution of citations at level 0

In [None]:
#In total

#def 

In [None]:
# For AI vs AI

# For AI vs non-AI papers: percentage of level-0 field citations going to each field.
# After reset_index the field name sits in the 'level_1' column.
l1_cits = cov_cits.groupby('is_ai')['fos_cited_l1'].apply(lambda x: 100*flatten(x,freq=True)).reset_index(
    drop=False).pipe(preview)

In [None]:
# Overlaid (unstacked) horizontal bars: percentage of fields cited, coloured by is_ai,
# with fields ordered by their summed percentage
base = (alt
        .Chart(l1_cits)
        .mark_bar(opacity=0.5,stroke='black')
        .encode(
            y=alt.Y('level_1',sort=alt.EncodingSortField('fos_cited_l1','sum',order='descending')),
            x=alt.X('fos_cited_l1',stack=None,title='% of fields cited'),color='is_ai'))

out = base.properties(height=300,width=200)

# Export the chart as a PNG using the selenium webdriver created in the preamble
save(out,"test.png",method='selenium',
         webdriver=DRIVER,scale_factor=2)

In [None]:
#What is the distribution?

In [None]:
# Papers that cite at least one level-0 field; the index is reset to 0..n-1 (old index kept as a column)
cov_w_cit = cov_cits.loc[[len(x)>0 for x in cov_cits['fos_cited_l1']]].reset_index(drop=False)

# One row per paper: share of its level-0 citations that goes to each field (0 when a field is not cited)
p_f1_citations = pd.DataFrame(
    [pd.Series(x).value_counts() for x in cov_w_cit['fos_cited_l1']]).apply(
    lambda x: x/x.sum(),axis=1).fillna(0)

In [None]:
# Attach the paper-level labels to the citation shares (both frames are indexed 0..n-1)
paper_cit_distr = pd.concat([cov_w_cit[['is_ai','cluster']],p_f1_citations],axis=1)

# Long format: one row per (paper, field) with the field name in 'variable'
paper_cit_distr_long = paper_cit_distr.melt(id_vars=['is_ai','cluster'],value_name='share_fields_cited').pipe(
    preview)

In [None]:
# Bin each paper's citation share into 20 equal-width bins, computed separately
# for every (field, is_ai) group. transform returns a result aligned with the
# original row index, so every bin stays attached to its own row regardless of
# how groupby.apply orders or labels its output in the installed pandas version.
paper_cit_distr_long_2 = paper_cit_distr_long.assign(
    bin=paper_cit_distr_long
    .groupby(['variable','is_ai'])['share_fields_cited']
    .transform(lambda x: pd.cut(x,bins=20,labels=False)))

In [None]:
# Number of papers in every (is_ai, field, bin) cell. After reset_index the
# count is stored in the 'share_fields_cited' column.
paper_cit_bins = paper_cit_distr_long_2.groupby(
    ['is_ai','variable','bin'])['share_fields_cited'].size().reset_index(drop=False)

# Share of papers per bin within each (is_ai, field) group. transform keeps the
# result aligned with the rows of paper_cit_bins, which a groupby.apply result
# is not guaranteed to be once group keys are added to its index.
paper_cit_bins['norm'] = paper_cit_bins.groupby(
    ['is_ai','variable'])['share_fields_cited'].transform(lambda x: x/x.sum())

In [None]:
# Histogram of citation-share bins for four selected fields, one row per field,
# with AI and non-AI papers overlaid (unstacked)
(alt.Chart(paper_cit_bins)
 .transform_filter(alt.FieldOneOfPredicate('variable',['computer science','medicine','biology','mathematics']))
 .mark_bar(opacity=0.5,width=7,stroke='black',
           strokeWidth=1).encode(x='bin',y=alt.Y('norm',stack=None,title='Share of papers'),
                               row='variable',color='is_ai:N')
 .properties(
    height=100,width=170))

In [None]:
def extract_fos_cited(df,unique):
    '''Extracts the distribution of fields of study cited in a subcorpus

    Args:
        df: dataframe with a 'fos_cited' column (list of cited fields per
            paper) and a 'fos_cited_unique' column (set of cited fields per
            paper).
        unique: if truthy, every field counts at most once per paper
            ('fos_cited_unique'); otherwise every mention counts ('fos_cited').

    Returns:
        A Series indexed by field of study with its share of all the counted
        mentions (the shares sum to 1; they are not divided by the number of
        papers).
    '''
    column = 'fos_cited_unique' if unique else 'fos_cited'

    # Flatten one level only: each row already holds field names, so a second
    # flattening would split the names into single characters
    fields = [field for cited in df[column] for field in cited]

    return pd.Series(fields).value_counts(normalize=True)
    
    

In [None]:
cluster_freqs = {}
cluster_freqs_unique = {}

for x in set(tops['cluster']):

    # Missing cluster labels are skipped
    if pd.isnull(x):
        continue

    rel = cov_cits.loc[cov_cits['cluster']==x]

    # Field-of-study frequencies for the papers in this cluster
    cluster_freqs[x] = extract_fos_cited(rel,unique=False)

    # Same with unique=True, rescaled to percentages
    # (intended as the share of papers in a cluster that cite a topic)
    cluster_freqs_unique[x] = 100*extract_fos_cited(rel,unique=True)

In [None]:
# Fields of study as rows, clusters as columns
field_mix = pd.DataFrame(cluster_freqs_unique)
# The 50 fields with the largest total across clusters
top_fos = field_mix.sum(axis=1).sort_values(ascending=False)[:50].index
# Long format for plotting: field ('index'), cluster, share
field_mix_long= field_mix.loc[top_fos].reset_index(drop=False).melt(id_vars='index',var_name='cluster',
                                                                      value_name='share')

In [None]:
#Plot

# Heatmap of field shares: clusters on x in the key order of ai_shares (built in
# descending AI share), fields on y ordered by their total share
(alt.Chart(field_mix_long)
 .mark_rect()
 .encode(
    x=alt
     .X('cluster:N',sort=list(ai_shares.keys())),
    y=alt
     .Y('index',sort=alt.EncodingSortField('share',op='sum',order='descending')),
    color='share:Q',tooltip=['cluster','index'])
 .properties(height=550))

In [None]:
#Now check this

# Restrict to the papers in cluster_11
cl_11 = cov_cits.loc[cov_cits['cluster']=='cluster_11']

cl_11_res = {}

# Field-of-study frequencies computed separately for AI (True) and non-AI (False) papers
for x in [True,False]:
    
    rel_2 = cl_11.loc[cl_11['is_ai']==x]
    # NB this rebinds `out`, which an earlier cell used for a chart
    out = extract_fos_cited(rel_2,unique=True)
    cl_11_res[x]=out
    
# Fields as rows, is_ai values as columns, ordered by the non-AI (False) column
check = pd.DataFrame(cl_11_res).fillna(0).sort_values(False,ascending=False)

# First 40 fields in that order
top_f = check.index[:40]

# Long format restricted to those fields: field ('index'), is_ai ('variable'), 'value'
check_long = pd.melt(check.loc[[x in top_f for x in check.index]].reset_index(drop=False),id_vars='index')

# Shared base chart: semi-transparent bars with fields on y in the top_f order
bas = alt.Chart(check_long).mark_bar(opacity=0.4).encode(
    y=alt.Y('index',sort=list(top_f)))

# One layer per group so that the two sets of bars overlap
ai = bas.transform_filter(alt.datum.variable==True).encode(x='value',color='variable')

nai = bas.transform_filter(alt.datum.variable==False).encode(x='value',color='variable')

(ai+nai).properties(height=500)