# arXiv analysis

We use the arXiv processed data and a topic model trained on a subset of the data to explore various questions about the trajectory of AI research:

* Do we find any topical differences between the research undertaken in countries with different levels of political freedom / civil liberty?
* Do we find any topical differences between the research undertaken in teams with / without women involved?
* Do we find any topical differences between the research undertaken in public / private research organisations?


## Preamble

In [None]:
# Run the shared preamble script. NOTE(review): names used later but never imported
# in this notebook (plt, pickle, json, today_str) presumably come from it - confirm.
%run notebook_preamble.ipy

In [None]:
# Put functions etc here

def flatten_list(my_list):
    '''
    Flattens one level of nesting.

    Args:
        my_list: an iterable whose elements are themselves iterable

    Returns:
        A new list with the items of every inner iterable, in their original order
    '''

    flat = []

    for el in my_list:
        flat.extend(el)

    return flat


In [None]:
# %load lda_pipeline.py
from gensim import corpora, models
from string import punctuation
from string import digits
import re
import pandas as pd
import numpy as np

#Characters to drop: every punctuation mark except the hyphen, followed by all digits
drop_characters = re.sub('-','',punctuation)+digits

#Stopwords
from nltk.corpus import stopwords

#NLTK stores its stopword lists under lowercase file ids; 'English' only resolves on
#case-insensitive file systems
stop = stopwords.words('english')

#Stem functions
#NOTE: the wildcard import is what brings PorterStemmer into scope
from nltk.stem import *
#Module-level stemmer used by CleanTokenize.stem
stemmer = PorterStemmer()


def clean_tokenise(string,drop_characters=drop_characters,stopwords=stop):
    '''
    Takes a string, makes it lowercase, removes unwanted characters and splits it into tokens.

    Args:
        string: the text to clean
        drop_characters: string with the characters to delete from the text
            (by default the module-level drop_characters)
        stopwords: collection of tokens to discard (by default the module-level stop list)

    Returns:
        A list with the remaining tokens. The text is split on single spaces and
        empty tokens are removed.

    '''

    #Lowercase
    str_low = string.lower()

    #Remove symbols and numbers. The characters are escaped so that those with a special
    #meaning inside a regex character class (']', '^', the backslash) are matched literally.
    #An empty drop_characters would build the invalid pattern '[]', so we skip the step.
    if drop_characters:
        str_letters = re.sub('[{drop}]'.format(drop=re.escape(drop_characters)),'',str_low)
    else:
        str_letters = str_low

    #Remove stopwords. We use the stopwords passed by the caller (not the global list);
    #a set makes the membership test cheap
    stopword_set = set(stopwords)

    clean = [x for x in str_letters.split(' ') if x != '' and x not in stopword_set]

    return(clean)


class CleanTokenize():
    '''
    This class takes a list of strings and returns a tokenised, clean list of token lists ready
    to be processed with the LdaPipeline

    It has a clean method to remove symbols and stopwords. It has to run first because it
    creates self.tokenised, which the other methods read.

    It has a bigram method to detect collocated words (optional)

    It has a stem method to stem words (optional)

    Every method stores its result in self.tokenised and returns self so calls can be chained.

    '''

    def __init__(self,corpus):
        '''
        Takes a corpus (list where each element is a string)
        '''

        #Store
        self.corpus = corpus

    def clean(self,drop=drop_characters,stopwords=stop):
        '''
        Removes symbols and stopwords from every document and tokenises it.

        Args:
            drop: characters to remove from the documents
            stopwords: tokens to remove from the documents

        '''

        #Pass on the stopwords we received rather than the global list
        cleaned = [clean_tokenise(doc,drop_characters=drop,stopwords=stopwords) for doc in self.corpus]

        self.tokenised = cleaned
        return(self)

    def stem(self):
        '''
        Optional: stems words

        '''
        #Stems each word in each tokenised sentence
        stemmed = [[stemmer.stem(word) for word in sentence] for sentence in self.tokenised]

        self.tokenised = stemmed
        return(self)


    def bigram(self,threshold=10):
        '''
        Optional: creates bigrams.

        Args:
            threshold: threshold passed to the gensim Phrases model

        '''

        #Colocation detector trained on the data
        phrases = models.Phrases(self.tokenised,threshold=threshold)

        bigram = models.phrases.Phraser(phrases)

        #Indexing the phraser with a corpus gives a lazy wrapper; we materialise it so that
        #self.tokenised is always a list of token lists
        self.tokenised = list(bigram[self.tokenised])

        return(self)
        
        
        
        

class LdaPipeline():
    '''
    This class processes lists of keywords.
    How does it work?
    -It is initialised with a list where every element is a collection of keywords
    -It has a method to filter keywords removing those that appear less than a set number of times

    -It has a method to process the filtered df into an object that gensim can work with
    -It has a method to train the LDA model with the right parameters
    -It has a method to predict the topics in a corpus

    Every method returns self so the steps can be chained:
        LdaPipeline(tokens).filter().process().fit_lda().predict_topics()

    '''

    def __init__(self,corpus):
        '''
        Takes the list of terms
        '''

        #Store the corpus
        self.tokenised = corpus

    def filter(self,minimum=5):
        '''
        Removes keywords that appear less than `minimum` times in the whole corpus.

        Stores the filtered documents in self.tokenised and the number of documents
        left without any keyword in self.empty_groups.

        '''

        #Load
        tokenised = self.tokenised

        #Count tokens
        token_counts = pd.Series([x for el in tokenised for x in el]).value_counts()

        #Tokens to keep: those that appear at least `minimum` times
        keep = set(token_counts.index[token_counts>=minimum])

        #Filter
        tokenised_filtered = [[x for x in el if x in keep] for el in tokenised]

        #Store
        self.tokenised = tokenised_filtered
        self.empty_groups = np.sum([len(x)==0 for x in tokenised_filtered])

        return(self)

    def clean(self):
        '''
        Placeholder for removing symbols and numbers. It is not implemented: the data
        is left untouched and self is returned so that chained calls keep working.

        '''

        return(self)

    def process(self):
        '''
        This creates the bag of words we use in the gensim analysis.

        Stores the gensim dictionary in self.dictionary and the bag of words in self.corpus.

        '''
        #Load the list of keywords
        tokenised = self.tokenised

        #Create the dictionary
        dictionary = corpora.Dictionary(tokenised)

        #Create the Bag of words. This converts keywords into ids
        corpus = [dictionary.doc2bow(x) for x in tokenised]

        self.corpus = corpus
        self.dictionary = dictionary
        return(self)

    def tfidf(self):
        '''
        This is optional: We extract the term-frequency inverse document frequency of the words in
        the corpus. The idea is to identify those keywords that are more salient in a document by normalising over
        their frequency in the whole corpus.

        Requires process() to have been run. Replaces self.corpus with its transformed version.

        '''
        #Load the corpus
        corpus = self.corpus

        #Fit a TFIDF model on the data
        tfidf = models.TfidfModel(corpus)

        #Transform the corpus and save it
        self.corpus = tfidf[corpus]

        return(self)

    def fit_lda(self,num_topics=20,passes=5,iterations=75,random_state=1803):
        '''

        This fits the LDA model taking a set of keyword arguments:
        number of topics, passes, iterations and random state for reproducibility.

        Requires process() to have been run. Stores the model in self.lda_model and
        its topics in self.lda_topics.

        '''

        #Load the corpus
        corpus = self.corpus

        #Train the LDA model with the parameters we supplied
        lda = models.LdaModel(corpus,id2word=self.dictionary,
                              num_topics=num_topics,passes=passes,iterations=iterations,random_state=random_state)

        #Save the outputs
        self.lda_model = lda
        self.lda_topics = lda.show_topics(num_topics=num_topics)


        return(self)

    def predict_topics(self):
        '''
        This predicts the topic mix for every observation in the corpus.

        Stores in self.predicted_df a df with one row per document (numbered from 0) and one
        column per topic id. Topics that are not reported for a document get a weight of 0.

        '''
        #Load the attributes we will be working with
        lda = self.lda_model
        corpus = self.corpus

        #Each prediction is a list of (topic id, weight) pairs
        predicted = lda[corpus]

        #Convert this into a dataframe in one go: building a one-row df per document
        #and concatenating them is very slow with large corpora
        predicted_df = pd.DataFrame([dict(topics) for topics in predicted]).fillna(0)

        self.predicted_df = predicted_df

        return(self)
    

## 1. Main data loads

* Arxiv enriched dataset
* Topic model
* Freedom data

### ArXiv

We load the enriched arXiv dataset, which includes paper metadata, gender indicators and field predictions.

In [None]:
#The identifier columns are read as strings
arx_id_dtypes = {column:str for column in ['id','article_id','paper_id']}

arx = pd.read_csv('../data/processed/1_8_2019_arxiv_enriched.csv',
                  compression='zip',dtype=arx_id_dtypes)

In [None]:
#Add years to arXiv (the part of 'created' before the first '-') - TODO - do this in the load mag notebook
arx['year'] = arx['created'].map(lambda created: created.split('-')[0])

In [None]:
# These are unique papers so don't include the information about location

We load the arXiv-mag-grid matched dataset, which has information about the institutions and locations for papers

In [None]:
#Load information about location. The article id is read as a string
grid_matched_dtypes = {'article_id':str}

grid_matched = pd.read_csv('../data/external/17_8_2019_papers_institution_ucl_cleaned.csv',
                           compression='zip',dtype=grid_matched_dtypes)

### Topic models

We trained a topSBM topic model on 25K AI papers. We will use it for our semantic comparison between papers with and without female co-authors, between countries with different levels of political liberty, and between different types of institutions. This will require working with some auxiliary datasets such as the press freedom indices and GRID.

In [None]:
#Load the pickled topic model object
#NOTE(review): pickle.load can execute arbitrary code - only load model files from a trusted source
with open('../models/2_8_2019_arxiv_sbm.p','rb') as infile:
    topic_model = pickle.load(infile)

In [None]:
# Here is the model: the first element of the unpickled object
model = topic_model[0]

### Freedom data

This data has information about political and civil liberties in various countries

In [None]:
#Freedom data. The cells below use its 'Country', 'Status' and 'year' columns
freed = pd.read_csv('../data/processed/19_7_2019_freedom_data.csv')

### GRID roles

This has GRID roles (what an organisation 'does').

In [None]:
#GRID organisation roles. The merge below uses its 'grid_id' and 'type' columns
grid = pd.read_csv('../data/external/1_8_2019_grid_org_roles.csv',compression='zip')

### Combine the GRID matches and metadata before we focus on the analysis

In [None]:
#Attach the GRID organisation type to every paper-institution match.
#pd.merge defaults to an inner join, so matches without a GRID record are dropped
grid_merged = pd.merge(grid_matched,grid[['grid_id','type']],left_on='institute_id',right_on='grid_id')

grid_merged.head()

In [None]:
#We don't care about the location of multinationals since we can't match those:
#for them we only keep the part of the institute name before ' ('.
#We iterate over the two columns directly, which is much faster than iterrows.
grid_merged['institute_name'] = [name.split(' (')[0].strip() if multinational!=0 else name
                                 for name,multinational in zip(grid_merged['institute_name'],
                                                               grid_merged['is_multinational'])]

In [None]:
#Label the country of multinational institutions as 'multinational'.
#This step used to take ages with iterrows; iterating over the two columns directly is much faster.
grid_merged['institute_country'] = ['multinational' if multinational!=0 else country
                                    for country,multinational in zip(grid_merged['institute_country'],
                                                                     grid_merged['is_multinational'])]

In [None]:
#For each paper we collect the lists of names, countries and types of its institutions.
#We can loop through those later to create dummies for comparisons and regressions

#Rows without an institute id are left out
papers_with_institute = grid_merged.dropna(axis=0, subset=['institute_id'])
institutes_by_paper = papers_with_institute.groupby('article_id')

list_variables = ['institute_name','institute_country','type']

grid_grouped = pd.concat([institutes_by_paper[var].apply(lambda values: list(values)) for var in list_variables],
                         axis=1).reset_index(drop=False)

In [None]:
#Rename the grouped columns to flag that they contain lists
list_column_names = {'institute_name':'institute_list',
                     'institute_country':'country_list',
                     'type':'type_list'}

grid_grouped = grid_grouped.rename(columns=list_column_names)

## 2. Data processing

We are going to enrich the topic modelled data with metadata about gender / country / institutional affiliations etc.


In [None]:
def make_document_topic_df(model,level,n_words):
    '''

    Builds a document-topic df from the topic model.

    Arguments:
        model: model object. We use its documents attribute and its topicdist and topics methods
        level (int): level of the model at which we want to extract the topics
        n_words: number of words we want to use to label the columns in the document-topic df

    Outputs:
        A document topic df where every row is a paper (the index is called paper_id) and each
        column is the weight for a topic. The columns are labelled with the first n_words words
        of each topic joined with '-'


    '''

    #One single-column df per document: the index is the topic and the column is the paper id
    per_document = []

    for n,pid in enumerate(model.documents):

        weights = pd.DataFrame(model.topicdist(n,l=level),columns=['topic',pid])
        per_document.append(weights.set_index('topic'))

    #Side by side they make a topic x document table. We transpose it to have papers in the rows
    d_t_df = pd.concat(per_document,axis=1).T

    #Label every topic with its first words
    topic_names = []

    for topic_comp in model.topics(l=level).values():

        words = [x[0] for x in topic_comp]
        topic_names.append('-'.join(words[:n_words]))

    d_t_df.columns = topic_names

    #Naming the index makes merging simpler later
    d_t_df.index.name = 'paper_id'

    return d_t_df

In [None]:
#Create the topic mix at level 0 of the model, labelling each topic with its first 5 words

doc_topic_l0 = make_document_topic_df(model,0,5)


In [None]:
#We store the topic names (the columns of the topic mix) as we will use them later when working with the metadata
topic_names = list(doc_topic_l0.columns)

In [None]:
def expand_document_topic_df(doc_top_df,metadata_df,variables,merge_id):
    '''

    We append metadata to the document-topic df. This will be useful for crosstabbing and for regression analysis later

    Arguments:
        doc_top_df: document topic df where we want to append metadata. Its index has to be called paper_id
        metadata_df: df with the metadata we want to append.
        variables: variables we want to append. They have to include merge_id
        merge_id: id in the metadata df that we will use for merging. It is matched against paper_id

    Outputs:
        A document-topic df (indexed by paper_id) with additional columns capturing the metadata.
        Papers without metadata are kept with missing values. The article_id column is removed if present.

    '''

    #Subset the metadata df with the variables we are interested in
    meta_df_selected = metadata_df[variables]

    #Reset index in the dtf for merging
    doc_top_df_temp = doc_top_df.reset_index(drop=False)

    #Merge. Note that we also set the index again.
    doc_top_enr = pd.merge(doc_top_df_temp,meta_df_selected,left_on='paper_id',right_on=merge_id,how='left').set_index('paper_id')

    #errors='ignore' lets us use metadata that doesn't have an article_id column
    doc_top_enr = doc_top_enr.drop('article_id',axis=1,errors='ignore')

    return(doc_top_enr)
    
    

In [None]:
#Metadata columns of the arXiv df that we append to the topic mix
meta_variables = ['paper_id','article_id','title','abstract','year','top_field','has_female','citation_count']


#The topic mix and the arXiv metadata are matched on paper_id
doc_topic_l0_exp = expand_document_topic_df(doc_topic_l0,arx,variables=meta_variables,merge_id='paper_id')

# Note that there will be missing values for female authors.

Now we will expand with the location and org type data. This requires some work with those datasets

In [None]:
#Here the paper_id index of the topic mix is matched against the article_id of the grouped GRID data
doc_topic_l0_exp_2 = expand_document_topic_df(doc_topic_l0_exp,grid_grouped,variables=['article_id','country_list','institute_list','type_list'],
                                             merge_id='article_id')

In [None]:
# Create discipline fixed effects: one dummy column per value of top_field

field_dummies = pd.get_dummies(doc_topic_l0_exp_2['top_field'])

#Put the names here to use as controls later
field_names =field_dummies.columns

#Create the analysis DF
analysis_df = pd.concat([doc_topic_l0_exp_2,field_dummies],axis=1)

#Cast the year as integer
analysis_df['year'] = analysis_df['year'].astype(int)

#And add the log of the year
analysis_df['year_log'] = np.log(analysis_df['year'])

#### Enrich the data with the political information

Here we match the freedom data with the AI research data.

In [None]:
from ast import literal_eval
from fuzzywuzzy import fuzz,process
from itertools import product


def flatten_list(a_list):
    '''
    Flattens one level of nesting: returns a list with the items of every element of a_list
    '''

    flat = []

    for el in a_list:
        for x in el:
            flat.append(x)

    return flat

In [None]:
#These are the countries in the AI data (unique values across all the country lists, ignoring missing lists)
countries = set(flatten_list(analysis_df['country_list'].dropna()))

In [None]:
#Now we fuzzy match them with the Freedom data

#The candidate names are the same for every country so we build the list once, outside the loop
freedom_countries = list(set(freed['Country']))

results = []

for c in countries:

    out = process.extract(c,freedom_countries)

    #We only keep the candidates with a perfect score (100)
    results.append([c,[x[0] for x in out if x[1]==100]])

In [None]:
#Lookup between AI country names and freedom country names.
#We take the first perfect match and skip countries without one
ai_country_freed_country_lu = {}

for ai_country,perfect_matches in results:

    if len(perfect_matches)>0:
        ai_country_freed_country_lu[ai_country] = perfect_matches[0]


#Lookup between freedom country names and their status in 2018
freed_2018 = freed.loc[freed['year']==2018]

country_status_lookup = dict(zip(freed_2018['Country'],freed_2018['Status']))

In [None]:
#For each paper, the freedom status of every country in its country list that we could
#match with the freedom data. Papers without a country list get a missing value
freedom_lists = []

for c_list in analysis_df['country_list']:

    if type(c_list) is list:
        freedom_lists.append([country_status_lookup[ai_country_freed_country_lu[c]]
                              for c in c_list if c in ai_country_freed_country_lu.keys()])
    else:
        freedom_lists.append(np.nan)

analysis_df['freedom_list'] = freedom_lists

### Community detection

We are going to implement some community detection with two objectives:

First, we want to explore rules to remove topics with high centrality since they are less informative about the purpose of a paper

Second, we want to visualise the structure of the AI field and cluster its disciplines into communities. 

We will write a quick function to do this.

In [None]:
from itertools import combinations, product, chain
import networkx as nx
import community

In [None]:
def make_network_from_doc_term_matrix(mat,threshold,id_var):
    '''
    Create a topic co-occurrence network from a document term matrix.

    Args
        mat: document term matrix where the rows are documents and the columns are topics.
            Its index has to be called id_var
        threshold: a topic is considered present in a document if its weight is above the threshold
        id_var: name of the document identifier (the name of the index of mat)

    Returns:
        A networkx network where the nodes are topics and the weight of an edge is the
        number of documents where its two topics are present together

    '''

    #Melt the topic mix and remove the topics that are not present in a document
    cd = pd.melt(mat.reset_index(drop=False),id_vars=[id_var])

    cd = cd.loc[cd['value']>threshold]

    #This gives us the list of topics present in each document
    co_occurrence = cd.groupby(id_var)['variable'].apply(lambda x: list(x))

    #Turn co-occurrences into pairs of topics. Documents with a single topic don't generate pairs.
    #We don't need to sort the pairs because the groupby below sorts them
    sector_combs = flatten_list([list(combinations(x,2)) for x in co_occurrence])

    #Turn the pairs into an edgelist
    edge_list = pd.DataFrame(sector_combs,columns=['source','target'])

    edge_list['weight']=1

    #Group over edge pairs to aggregate weights
    edge_list_weighted = edge_list.groupby(['source','target'])['weight'].sum().reset_index(drop=False)

    #Create network
    net = nx.from_pandas_edgelist(edge_list_weighted,edge_attr=True)

    return(net)

def extract_community(net,resolution,verbose=False,random_state=None):
    '''

    Extracts communities from a network.

    Args:
        net: a networkx object
        resolution: level of granularity in the number of communities that are extracted
        verbose: if true, print the members of each community
        random_state: optional seed for the partition. It is only passed to best_partition
            when it is set, so the default call is the same as before.
            NOTE(review): this needs a version of python-louvain where best_partition accepts random_state - confirm

    Returns:
        A dict where the keys are the nodes and the values are the ids of their communities

    '''

    partition_args = {'resolution':resolution,'weight':'weight'}

    if random_state is not None:
        partition_args['random_state'] = random_state

    comms = community.best_partition(net,**partition_args)

    if verbose:

        #Group the nodes by community
        comm_strings = pd.DataFrame(comms,index=['comm']).T.groupby('comm')

        #This is just to show the participation in communities.
        #We print the id of the community, which is the value used in the output
        for comm_id,members in comm_strings.groups.items():
            print(comm_id)
            print('====')
            print('\t'.join(str(member) for member in members))

    return(comms)
        

In [None]:
#Topic co-occurrence network: a topic is present in a paper if its weight is above 0.05
topic_net = make_network_from_doc_term_matrix(doc_topic_l0,0.05,'paper_id')

Explore the degree distribution of the network

In [None]:
#This is looking at the degree distribution: one row per topic, sorted from highest to lowest degree
degree_distr = pd.DataFrame(list(topic_net.degree)).sort_values(1,ascending=False).set_index(0)

degree_distr.columns = ['degree']

#Degree as a share of the number of topics in the network
degree_distr['share'] = degree_distr['degree']/len(degree_distr)

In [None]:
#Degree share of the 50 topics with the highest degree
degree_distr[:50]['share'].plot.barh(figsize=(5,8))

In [None]:
#We drop topics with a degree share above 0.7, that is, topics that co-occur with more than
#70% of the topics in the network - they are not very informative.
topic_drop = degree_distr.loc[degree_distr['share']>0.7].index

In [None]:
#Set random seed
#NOTE(review): this seeds Python's random module only. Depending on the version of the
#community package, best_partition may use its own random state instead - confirm
import random
random.seed(123)

In [None]:
#Same network as before but without the columns of the topics in topic_drop
topic_net_2 = make_network_from_doc_term_matrix(doc_topic_l0[[x for x in doc_topic_l0 if x not in topic_drop]],0.05,'paper_id')

In [None]:
# Alternative: load communities that were saved previously instead of extracting them again
# with open(f'../models/13_8_2019_topic_communities.json') as infile:
#     comms = json.load(infile)

# Extract the communities and print their members so we can label them
comms = extract_community(topic_net_2,resolution=0.3,verbose=True)

Create topic lookup

In [None]:
# Labels for the community ids (the values in comms).
# Ids that share a label end up in the same column when convert_topic_mix regroups the topics,
# and the 'mixed' column is dropped there.
# NOTE(review): the labels presumably describe the communities printed above, so they need
# checking if the community detection is run again - confirm
comm_names = {
  0:'algorithms',
  1:'robotics_agents',
  2:'social',
  3:'statistics',
  4:'security',
  5:'clustering',
  6:'technology',
  7:'symbolic',
  8:'optimisation',
  9:'classification',
  10:'graphs',
  11:'question_answering_systems',
  12:'health',
  13:'computer_vision',
  14:'mixed',
  15:'communication',
  16:'mathematics',
  17:'generative_transfer',
  18:'language',
  19:'mixed',
  20:'finance',
  21:'mixed',
  22:'mixed',
  23:'neuroscience',
  24:'mixed',
  25:'mixed',
  26:'physics',
  27:'genomics',
  28:'deep_learning',
  29:'recommendations',
  30:'deep_learning_sound',
  31:'computer_vision',
  32:'imaging_materials',
  33:'generative_transfer',
  34:'mixed',
  }

In [None]:
# save the communities (topic -> community id) in a json file with today's date in its name

#json.dumps(f'../models/{today_str}_topic_communities.json')

with open(f'../models/{today_str}_topic_communities.json','w') as outfile:
    json.dump(comms, outfile)

In [None]:
#doc_topic_l0_filtered = doc_topic_l0[[x for x in doc_topic_l0.columns if x not in topic_drop]]
#Names of the topics that we keep after removing those in topic_drop
topics_filtered = [x for x in topic_names if x not in topic_drop]

### 3. Data analysis

In [None]:
def make_exog(df,value_container,value,make_dummy=True):
    '''
    Adds to a copy of the df a column that flags (or counts) a value inside a column of lists.
    We use the new columns as exogenous variables for modelling later.

    Argument:
        -df contains the column where we want to find a value
        -value_container is the name of the column (of lists) where we look for the value
        -value is the value we are looking for
        -make_dummy: if true we just check whether the value is present. Otherwise, we count how many times it happens.

    Output
        -A copy of the df with the new column. Its name is the value in lowercase with
        underscores instead of spaces. Rows where the container is not a list get a missing value.


    '''

    #Tidy variable name
    column_name = value.lower().replace(' ','_')

    #What we measure in each list: presence of the value or number of occurrences
    if make_dummy == True:
        measure = lambda container: value in container
    else:
        measure = lambda container: container.count(value)

    df_2 = df.copy()

    #There are some missing values ('non lists') so we have some control flow to manage that
    new_column = []

    for x in df_2[value_container]:

        if type(x) is list:
            new_column.append(measure(x))
        else:
            new_column.append(np.nan)

    df_2[column_name] = new_column

    return df_2
    
    
    

In [None]:
def topic_regression(df,target_list,exog,controls,model,binarise=False,standardise=True,cov='HC1'):
    '''

    This function regresses topic weights (or their binarisation) on predictors.
    It fits one regression for each target.

    Arguments:
        -df: df with the variables
        -target_list: list of target variables. This is a list we loop over.
        -exog: list with the exogenous variable. The coefficient and p-value are converted
            with float() so it should contain a single name
        -controls: list of control variables
        -model: statsmodels model class. OLS? Logit? TODO fix the logit
            We report rsquared for OLS and prsquared for any other model
        -binarise: in case we are using logit. If not False, the value is the threshold
            TODO when we binarise the highly detailed models, some of them become all zeros. This will work better
            with the more aggregate topics
        -standardise: if True we log the topic weights (after adding 0.00000001) and standardise them
        -cov: covariance type used when fitting the models

    Returns
        -A list with two elements: a list of statsmodels summaries and a df with the coefficient,
        p-value and (pseudo) r-square for each target, sorted by coefficient

    '''

    #Drop rows with missing values - sm doesn't like them.
    #We copy so that the assignments below don't write into a slice of the original df
    df_2 = df[target_list+exog+controls].dropna(axis=0).copy()

    #Standardise targets?
    if standardise==True:
        df_2[target_list] = (np.log(df_2[target_list]+0.00000001)).apply(zscore).astype(float)

    #Binarise targets if we are doing a logit
    if binarise!=False:
        df_2[target_list] = df_2[target_list].applymap(lambda x: x>binarise).astype(float)


    #Extract the exogenous and controls, add constant and cast as float
    exog_controls = add_constant(df_2[exog+controls]).astype(float)

    #The measure of fit depends on the model
    if model == OLS:
        fit_attribute,fit_name = 'rsquared','r_square'
    else:
        fit_attribute,fit_name = 'prsquared','pr_square'

    #Container output
    out = []
    coeffs = []

    #One regression for each target
    for t in list(target_list):

        reg = model(endog=df_2[t],exog=exog_controls).fit(cov_type=cov,disp=0)

        out.append(reg.summary())

        coeffs.append(pd.Series([float(reg.params[exog]),float(reg.pvalues[exog]),
                                 float(getattr(reg,fit_attribute))],name=t))

    #Build the table once, after the loop. Without targets we return an empty table
    coeff_names = ['coefficient','p_value',fit_name]

    if len(coeffs)>0:
        reg_coeff = pd.concat(coeffs,axis=1).T
        reg_coeff.columns = coeff_names
    else:
        reg_coeff = pd.DataFrame(columns=coeff_names)

    return([out,reg_coeff.sort_values('coefficient',ascending=False)])
        
        
def topic_comparison(df,target_list,exog,concept_lookup,quantiles=np.arange(0,1.1,0.2),thres=0):
    '''
    This function compares the distribution of activity in various topics depending on an exogenous variable of interest. 
    
    Args:
        df: df with the topic mix and metadata
        target_list: the topics to consider
        exog: the variable to crosstab topics against
        concept_lookup: a df with the median proximity of each topic to the concepts
            (presumably indexed by topic, with one column per concept - TODO confirm)
        quantiles: how we discretise the concept lookup (default value is quintiles)
        thres: limit for considering a topic as present

    Returns:
        A dict with one df per concept (column of concept_lookup). Its rows are the quantiles of
        the concept, its columns are the values of exog and its cells add up the number of papers
        where the topics in that quantile are present

    '''
    
    #Copy df
    
    df_2 = df.copy()
    
    #Discretise the concept lookup: every column is cut into quantile bins (numbered, not labelled)
    
    conc_discr = concept_lookup.apply(lambda x: pd.qcut(x,q=quantiles,labels=False,duplicates='drop'))

    
    #Calculate levels of activity per topic based on the exog variable
    #For each topic we count the papers above the threshold for each value of exog.
    #Selecting [True] raises a KeyError if no paper is above the threshold for a topic
    
    topic_distr = pd.concat([pd.crosstab(df_2[exog],df_2[t]>thres)[True] for t in target_list],axis=1).T
    topic_distr.index = target_list
    
    
    #Merge the count with the concept lookup (aligned on the topic index) and make it long:
    #one row per topic and value of exog
    disc = pd.melt(pd.concat([topic_distr,conc_discr],axis=1).reset_index(drop=False),id_vars=['index']+list(conc_discr.columns))
    
    #This is the dict where we store the results
    store={}
    
    for c in concept_lookup.columns:
        
        #Add up the counts for each quantile of the concept and value of exog
        out = pd.pivot_table(disc.groupby([c,'variable'])['value'].sum().reset_index(drop=False),index=c,columns='variable',values='value')
        #out.apply(lambda x: x/x.sum()).plot.bar()
        
        store[c] = out
                                      
    #Output dfs with the comparisons
    return(store)

In [None]:
from statsmodels.api import OLS, Logit
from statsmodels.tools.tools import add_constant
from scipy.stats import zscore

### Descriptive analysis

Add a bunch of exogenous variables to the analysis df

In [None]:
#Variables of interest: pairs with the column (of lists) to look into and the value to look for
interesting_cuts = [['freedom_list','NF'],
                    ['country_list','China'],['country_list','Russia'],['country_list','Turkey'],
                    ['type_list','Company'],['type_list','Government'],['type_list','Education'],
                    ['institute_list','Google'],['institute_list','Facebook'],['institute_list','IBM'],['institute_list','Microsoft']]

#Create the expanded df
analysis_df_expanded = analysis_df.copy()

#For each interesting variable we expand the df with a dummy column.
#make_exog names the column after the value in lowercase (e.g. 'china')
for detect in interesting_cuts:
    
    analysis_df_expanded = make_exog(analysis_df_expanded,value_container=detect[0],value=detect[1])


In [None]:
#hf = topic_comparison(analysis_df_2,topics_filtered,'has_female',mean_sim_df)

In [None]:
#hf['health'].apply(lambda x: x/x.sum(),axis=0).plot.bar()

**This doesn't work very well**

There are several reasons for this:

* The documents I am using to measure ethics, surveillance etc are not very good
* The topics are too aggregated to pick up similarity with a concept
* Topics co-occur with each other. Their relation with the concepts aren't linear.
* Let's park this for now


### Trend analysis

We will create a function for this that creates a df with activity per year and topic. 

Another function to plot results.

My idea is to highlight trends of interest for different categories - papers with female authors, papers with companies, papers with non-free countries etc.

In [None]:
#Font sizes for all the plots below
import matplotlib
matplotlib.rc('xtick', labelsize=12) 
matplotlib.rc('ytick', labelsize=12)
matplotlib.rc('axes',labelsize='large')
matplotlib.rc('legend',fontsize='large')
matplotlib.rc('font',size=12)
#The legend font size is set twice; this is the one that stays
matplotlib.rc('legend',**{'fontsize':12})


In [None]:
def convert_topic_mix(topic_mix,communities,community_lookup,function='sum'):
    '''
    Regroups the topics in a topic mix into communities

    Args:
        topic_mix: df where the rows are papers (the index has to be called paper_id) and the columns are topics
        communities: lookup between topics and community ids
        community_lookup: lookup between community ids and community names
        function: function we use to aggregate the weights of the topics in a community

    Returns:
        A df where the rows are papers and the columns are communities, without the 'mixed' community

    '''

    #Long format: one row per paper and topic
    long_mix = pd.melt(topic_mix.reset_index(drop=False),id_vars=['paper_id'])

    #Name of the community of each topic
    community_labels = []

    for top in long_mix['variable']:
        community_labels.append(community_lookup[communities[top]])

    long_mix['comm'] = community_labels

    #Back to wide format, aggregating the topics in the same community
    regrouped = pd.pivot_table(long_mix,index='paper_id',columns='comm',values='value',aggfunc=function)

    #Leave out the mixed community
    not_mixed = regrouped.columns != 'mixed'

    return regrouped.loc[:,not_mixed]
    #return(topic_long_2)
    
    

In [None]:
def trend_analysis(topic_mix,topics,year_var='year',year_lim = [2000,2019],thres=0.1):
    '''
    Takes a df and analyses topics trends

    Args:
        -topic_mix: the topic mix where the rows are papers and the columns are topics (plus the year variable)
        -topics: the topics to consider
        -year_var: the year variable to consider
        -year_lim: first year to include and first year to leave out
        -thres: threshold for topic occurrence. A topic is present in a paper if its weight is above it

    Returns:
        -A table with levels of activity (number of papers) per topic and year. The rows are the years
        between year_lim[0] and year_lim[1]-1, the columns are the topics. Years and topics without
        activity have zeros.

    '''

    years = topic_mix[year_var]

    #Topic count per year. We add up the papers above the threshold in each year. Unlike selecting
    #the True column of a crosstab, this also works for topics that are never above the threshold
    topic_count = pd.concat([(topic_mix[t]>thres).astype(int).groupby(years).sum() for t in topics],axis=1).fillna(0)
    topic_count.columns = topics

    #Normalise years. We use reindex because .loc raises an error when a year is missing
    topic_count = topic_count.reindex(np.arange(year_lim[0],year_lim[1])).fillna(0)

    return(topic_count)
    
    
    
    
    
def plot_trend_of_interest(trend_df,topics,ax,wind=3,norm=False,**kwargs):
    '''
    Plots the rolling mean of one or several trends.

    Args:
        trend_df: the df where rows = years and column = topics
        topics: topic or topics of interest
        ax: the axis where we plot
        wind: window of the rolling mean
        norm: if 2 = normalise for year (importance of a topic in the period) if 1 = normalise for topic (share of year activity in the topic). If False = don't normalise
        kwargs: other arguments passed to the plot

    The plot is drawn in ax; nothing is returned

    '''

    #Normalise or not? norm-1 is the axis along which every value is divided by the total
    if norm==False:
        to_plot = trend_df
    else:
        to_plot = trend_df.apply(lambda x: x/x.sum(),axis=norm-1).fillna(0)

    #Smooth the trends of interest. The first periods don't have a full window and are dropped
    smoothed = to_plot[topics].rolling(window=wind).mean().dropna()

    smoothed.plot(ax=ax,**kwargs)
    

def trend_comparison(topic_mix,topics,var,ax,year_var='year',year_lim = [2000,2019],thres=0,norm=2):
    '''
    Compares two groups in a trend of interest

    Args:
        -topic_mix = topic mix
        -topics: topics of interest
        -var: variable we want to compare. We compare the papers where it is False with those where it is True
        -ax: collection with two matplotlib axes. The first one is used for the papers
            where var is False and the second one for those where it is True
        -year_var: the year variable to consider
        -year_lim: first year to include and first year to leave out
        -thres: threshold for topic occurrence.
        -norm: normalisation passed to plot_trend_of_interest

    The plots are drawn in ax; nothing is returned

    '''

    #One table of trends for each group. We pass the year and threshold options on
    #so that they are not silently replaced by the defaults of trend_analysis
    outputs = [trend_analysis(topic_mix.loc[topic_mix[var]==val],topics,
                              year_var=year_var,year_lim=year_lim,thres=thres) for val in [False,True]]

    #We loop over the tables of trends (one per group), not over the topics
    for n,out in enumerate(outputs):

        plot_trend_of_interest(out,topics,norm=norm,ax=ax[n])
    
    

#### Some initial exploration

In [None]:
def save_fig(name,path='../reports/figures/slide_deck/'):
    '''
    Saves the current figure, after tightening its layout, in the folder path.
    The name of the file is today_str and name joined with an underscore.
    '''
    plt.tight_layout()

    file_name = f'{today_str}_{name}'

    plt.savefig(path+file_name)

##### Total activity

In [None]:
#normalize=1 normalises over each column, so the column for is_ai = 1 is the share of all the AI papers that falls in each year
pd.crosstab(arx['year'],arx['is_ai'],normalize=1)[1].plot()

#### Activity by field

In [None]:
#Fields to plot: we leave out the field names that contain a '1' or a '2'
fields_to_plot = [x for x in field_names if not any(num in x for num in ['1','2'])]

#Years as integers so we can select them by number below
arx['year'] = [int(x) for x in arx['year']]

In [None]:
#Share of AI papers by year in each field. A paper is in a field if its value for the field is above 0.5.
#normalize=0 normalises over each row (year) and [1] keeps the share of papers with is_ai = 1
papers_by_field = (arx.loc[arx[t]>0.5] for t in fields_to_plot)

ai_in_fields = pd.concat([pd.crosstab(field_papers['year'],field_papers['is_ai'],normalize=0)[1]
                          for field_papers in papers_by_field],axis=1).fillna(0)

ai_in_fields.columns = fields_to_plot

In [None]:
#The 9 fields with the highest share of AI papers in 2018, from highest to lowest
top_ai_fields = ai_in_fields.loc[2018].sort_values().index[::-1][:9]

In [None]:
#Share of AI papers (as a percentage, 3-year rolling mean) in the top fields for the years 2000 to 2018
ax = (100*ai_in_fields.loc[np.arange(2000,2019),top_ai_fields].rolling(window=3).mean()).dropna().plot(figsize=(10,6),cmap='tab10',linewidth=3)

#Legend outside the plot
ax.legend(bbox_to_anchor=(1,1),title='Scientific field')

ax.set_title('Share of AI activity by scientific field')

save_fig('field_trends.pdf')

In [None]:
#Create the topic variable: the weights of the filtered topics aggregated into named communities
topic_comms = convert_topic_mix(analysis_df[topics_filtered],comms,comm_names)

In [None]:
#Create a df with all the information: community columns next to the expanded analysis df
#(the two tables are concatenated side by side, aligned on their index)
analysis_fin = pd.concat([topic_comms,analysis_df_expanded],axis=1)

In [None]:
#The community names are the columns of the community-level topic mix
community_names = topic_comms.columns

In [None]:
#Communities highlighted in the community trend chart ('adversarial' is left out)
topics_for_plot = ['computer_vision','machine_learning','symbolic','health','robotics','language',
                   #'adversarial',
                   'statistics','deep_learning',
                  'robotics_agents']

In [None]:
def make_highlight_plot(trends,vars_interest,ax,cmap,alpha=0.3):
    '''
    Plots all the trends in a df, highlighting the ones we want to focus on.
    
    The values are multiplied by 100 and smoothed with a 4-row rolling mean before plotting.
    The columns in vars_interest are drawn with a colour from the colour map; the rest are
    drawn in translucent grey. Only the highlighted columns are shown in the legend, with
    their labels cut to 50 characters.
    
    Args:
        trends is a trend df (one column per topic / variable)
        vars_interest are the topics or variables we want to focus on
        ax the axis
        cmap is the color map we want to use
        alpha is the transparency of the lines that are not highlighted
    
    Draws on ax and returns nothing
    
    '''
    
    #Create a lookup with numbers for values (the number selects the colour in the colour map)
    topic_lookup = {name:val for val,name in enumerate(vars_interest)}

    #Color map. We use plt.get_cmap because plt.cm.get_cmap was removed in matplotlib 3.9
    cols = plt.get_cmap(cmap)

    #Create a vector of colors: grey for the background lines, colour map for the rest
    grey = (0.5,0.5,0.5,alpha)
    cols_to_show = [cols(topic_lookup[v]) if v in topic_lookup else grey for v in trends.columns]
    
    #Plot
    (100*trends.rolling(window=4).mean()).dropna().plot(color=cols_to_show,ax=ax,linewidth=3)

    #Fix the legend to focus on key topics
    hand,labs = ax.get_legend_handles_labels()
    highlighted = [(h,l) for h,l in zip(hand,labs) if l in topic_lookup]

    ax.legend(bbox_to_anchor=(1,1),handles = [h for h,_ in highlighted],
              labels=[l[:50] for _,l in highlighted])

In [None]:
# This is to normalise the years
#Community trends (a community is present in a paper above the 0.05 threshold)
comm_trends = trend_analysis(analysis_fin,community_names,thres=0.05)
#Total number of papers per year
all_years = analysis_fin['year'].value_counts()
#Divide every community column by the yearly totals and drop the years with missing values
comm_norm = comm_trends.apply(lambda x: x/all_years).dropna()

In [None]:
fig,ax = plt.subplots(figsize=(10,6))

#Highlight the selected communities; the rest are shown in grey
make_highlight_plot(comm_norm,topics_for_plot,cmap='tab10_r',ax=ax,alpha=0.15)

#ax.legend(bbox_to_anchor=(1,1),title='Research area')

ax.set_title('Share of AI activity by research area')

#save_fig also calls tight_layout before saving
plt.tight_layout()

save_fig('community_trends.pdf')

### With topics

In [None]:
#Detailed topics highlighted in the topic trend chart.
#Every topic is named after its joined keywords; the commented ones are switched off
notable_topics = [
    #'face-faces-identity-face_recognition-facial','person-surveillance-persons-pedestrian-pedestrians',
    #'attacks-attack-adversary-vulnerable-threat',
    #'emotions-emotion-neutral-emotional-spontaneous',
    'reinforcement_learning-policy-policies-reward-deep_reinforcement_learning',
    'cnn-convolutional_neural_networks-cnns-convolutional_neural_network-convolutional_neural_network_cnn',
    'training-trained-deep_learning-deep-train',
    'generator-gan-discriminator-generative_adversarial_networks_gans-gans',
    'translation-neural_machine_translation-machine_translation-translate-translations',
    'recurrent-lstm-rnn-recurrent_neural_network-recurrent_neural_networks']

In [None]:
#Same normalisation as for the communities, now with the detailed topics
topic_trends = trend_analysis(analysis_fin,topics_filtered,thres=0.05)
#Total number of papers per year
all_years = analysis_fin['year'].value_counts()
#Divide every topic column by the yearly totals and drop the years with missing values
topic_trends_norm = topic_trends.apply(lambda x: x/all_years).dropna()

In [None]:
fig,ax = plt.subplots(figsize=(14,6))

#Focus on 2005-2018 and highlight the notable topics
make_highlight_plot(topic_trends_norm.loc[np.arange(2005,2019)],notable_topics,cmap='Dark2',ax=ax,alpha=0.1)

ax.set_title('Share of AI activity by detailed topic')

ax.set_ylabel('Share of AI papers with topic')

#save_fig also calls tight_layout before saving
plt.tight_layout()

save_fig('trending_topics.pdf')

#### Additional analysis that identifies growing areas in recent years

In [None]:
def make_trend_plot(df,topics_to_consider,top_n,ax,top_year=2018,thres=0.05,period=[2005,2019],alpha=0.3):
    '''
    Highlight plot where the topics to highlight are chosen automatically: they are the
    top_n topics with the highest share of papers in top_year.
    
    Args:
        df is the topic mix (we will often have subsetted this to focus on a particular type of organisation)
        topics_to_consider are the candidate topics (columns of df)
        top_n is the number of topics to label and highlight
        ax is the axis where we draw
        top_year is the year used to rank the topics
        thres is the threshold for considering that a topic is present in a paper
        period is a list with the period we are considering (passed to trend_analysis as year_lim)
        alpha is the transparency of the topics that are not highlighted
        
    Draws on ax and returns nothing
    
    '''
    
    #Topics that never go above the threshold are left out
    papers_with_topic = (df[topics_to_consider]>thres).sum()
    inactive = papers_with_topic.index[papers_with_topic==0]
    
    active_topics = [t for t in topics_to_consider if t not in inactive]
    
    #Yearly activity in the remaining topics
    trends = trend_analysis(df,active_topics,thres=thres,year_lim=period)
    
    #Papers per year are the denominator of the shares
    papers_per_year = df['year'].value_counts()
    
    shares = trends.apply(lambda col: col/papers_per_year).dropna()
    
    #Rank the topics by their share in the reference year and keep the first top_n
    ranked = shares.T.sort_values(top_year,ascending=False)
    leaders = ranked.index[:top_n]
    
    make_highlight_plot(shares,leaders,cmap='Dark2',ax=ax,alpha=alpha)
    
    
    
    

In [None]:
def quick_plot(df,var_subset,topics_to_consider=topics_filtered,n_tops=8):
    '''
    Creates a trend plot for the papers in a category, highlighting its top topics in 2018.
    
    Args:
        df with papers and topics
        var_subset is the variable we want to consider (will generally be a boolean). We keep
            the papers where it is True. It is also used as the title of the plot
        topics_to_consider are the topics to plot (by default, the filtered topics)
        n_tops: number of top topics to highlight
    
    Creates a new figure and returns nothing
    
    '''
    
    fig,ax = plt.subplots(figsize=(10,8))

    #Papers in the category
    my_df = df.loc[df[var_subset]==True]
    
    #We use the topics passed by the caller (the global topics_filtered was hard-coded here,
    #so the topics_to_consider argument had no effect)
    make_trend_plot(my_df,topics_to_consider,n_tops,ax=ax,top_year=2018,alpha=0.2)
    
    ax.set_title(var_subset)
    
    

In [None]:
# fig,ax = plt.subplots(figsize=(10,8))

# make_trend_plot(analysis_fin,topics_filtered,8,ax=ax,top_year=2018,alpha=0.2)

In [None]:
#quick_plot(analysis_fin,'nf')

In [None]:
#quick_plot(analysis_fin,'has_female')

In [None]:
#quick_plot(analysis_fin,'company')

In [None]:
#quick_plot(analysis_fin,'education')

In [None]:
#quick_plot(analysis_fin,'government')

In [None]:
#quick_plot(analysis_fin,'google')

In [None]:
#quick_plot(analysis_fin,'microsoft')

In [None]:
# chinese_govt = analysis_fin.loc[(analysis_fin['china']==True)&(analysis_fin['government']==True)]

# fig,ax = plt.subplots(figsize=(10,8))
    
# make_trend_plot(chinese_govt,topics_filtered,top_n=8,ax=ax,top_year=2018,alpha=0.2)

## Network analysis

Strategy:

* We need to visualise the network - which is quite dense. How do we do this?
  * 



### End of experimentation

In [None]:
#We want to make the size of the nodes comparable between years
#For every set of years we count the papers where each topic is above the 0.05 threshold.
#The sets are 1990-2018, 1990-2011, 2012-2014 and 2015-2018 (np.arange excludes its upper bound).
#NOTE(review): these year sets do not match the year filters used further down to build the
#period networks - confirm which boundaries are intended
size_lookup = pd.concat([(analysis_fin.loc[[x in year_set for x in analysis_fin['year']]][topics_filtered]>0.05).sum() for 
                         year_set in [
                             set(np.arange(1990,2019)),
                             set(np.arange(1990,2012)),
                             set(np.arange(2012,2015)),
                             set(np.arange(2015,2019))]],axis=1)

#One column per period
size_lookup.columns = ['all','pre','mid','late']

#Same information as a nested dictionary (period -> topic -> number of papers)
size_lookup_dict = size_lookup.to_dict()

In [None]:
#Display the community names (they are the keys we can use in the colour lookup below)
comm_names

In [None]:
#Lookup between community names and the colours used for nodes, bars and legend patches.
#Communities that are not in the lookup get a fallback colour in the plotting functions
color_lookup = {
    'deep_learning':'blue',
    'robotics_agents':'cornflowerblue',
    'computer_vision':'aqua',
    'symbolic':'red',
    'health':'lime',
    'social':'forestgreen',
    'technology':'magenta',
    'statistics':'orange',
    'language':'yellow'
}

In [None]:
# color_lookup = {
#     2:'magenta',
#     1: 'cornflowerblue',
#     4:'cornflowerblue',
#     28:'cornflowerblue',
#     7:'red',
#     26:'yellow',
#     27:'orange',
#     14:'aqua',
#     28:'aqua',
#     #17:'plum',
#     13:'lime'}

In [None]:
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt

#One legend patch per community in the colour lookup (used as legend handles in the charts)
patches = [mpatches.Patch(facecolor=c, label=l,edgecolor='black') for l,c in color_lookup.items()]


In [None]:
def show_network(net,top_edge_share,label,loc,color_lookup=color_lookup,norm=1000,norm_2=1.2,layout=nx.kamada_kawai_layout,size_lookup=size_lookup,ax=None):
    '''
    Plots a network visualisation of the topic network, keeping only its heaviest edges.
    
    Args:
        net: network with a 'weight' attribute in its edges
        top_edge_share: share of the edges to keep, starting from the heaviest ones
        label: text we write close to the top left corner of the layout
        loc: not used in the body (kept so existing calls keep working)
        color_lookup: lookup between community names and node colours. Nodes in communities
            outside the lookup are white
        norm: we divide the edge weights by this value to get the edge widths
        norm_2: exponent we apply to the node sizes
        layout: networkx layout function. We call it with the network and center=(0.5,0.5)
        size_lookup: lookup between nodes and their size (the calls in this notebook pass
            one column of the size table)
        ax: axis where we draw. If None we use the current matplotlib axis
    
    The function also relies on the comms and comm_names lookups defined elsewhere.
    It prints the number of edges kept and returns nothing.
    
    '''
    
    #Nodes, edges and label go to the same axis. Before, the label was written on whatever
    #the global `ax` was, while networkx drew on the current axis
    if ax is None:
        ax = plt.gca()
    
    new_net = net.copy()

    #Sort the edges by weight and keep the top share
    net_weight = sorted(new_net.edges(data=True),key=lambda x: x[2]['weight'],reverse=True)

    length = int(top_edge_share*len(net_weight))
    
    print(length)
    
    top_edges = net_weight[:length]

    #Nodes without any top edge are not part of the new network
    new_net_2 = nx.Graph(top_edges)
    
    pos = layout(new_net_2,
                 #weight='weight',
                 center=(0.5,0.5)
                )
    
    #Get positions (we use them to place the label)
    x,y = [[v[val] for v in pos.values()] for val in [0,1]]    
    
    #Node colour depends on the community of the topic
    node_communities = [comm_names[comms[node]] for node in new_net_2.nodes]
    
    nx.draw_networkx_nodes(new_net_2,pos,
                       node_size=[size_lookup[node]**norm_2 for node in new_net_2.nodes],
                       node_color = [color_lookup[c] if c in color_lookup else 'white' for c in node_communities],
                       cmap='tab20c',
                       alpha=0.9,edgecolors='darkgrey',ax=ax)
    
    ax.annotate(label,xy=(np.min(x)+0.02,np.max(y)-0.02),size=24,color='white',fontweight='bold')

    nx.draw_networkx_edges(new_net_2,pos,width=[e[2]['weight']/norm for e in new_net_2.edges(data=True)],edge_color='white',ax=ax)

In [None]:
fig,ax = plt.subplots(figsize=(12,8))

#White edges and labels need a dark background
ax.set_facecolor('black')

#Top 5% of the edges, with node sizes based on the activity in all years
show_network(topic_net_2,0.05,norm=100,norm_2=0.9,layout=nx.kamada_kawai_layout,size_lookup=size_lookup['all'],label='All years',loc=(-0.5,1.48))

#The legend uses the community patches; node positions are not meaningful so we hide the ticks
ax.legend(handles=patches,facecolor='white',loc='upper right',title='Area')
ax.set_xticks([])
ax.set_yticks([])

plt.tight_layout()

save_fig('network_all_years.pdf')


In [None]:
#Topic mix for the papers published before 2011.
#NOTE(review): the label below reads 'Before 2012' and size_lookup['pre'] covers 1990-2011,
#but this filter leaves 2011 out - confirm the intended boundary
old_period = analysis_fin.loc[analysis_fin['year']<2011][topics_filtered]


#Topic network for the period
top_net_old= make_network_from_doc_term_matrix(old_period,0.025,'paper_id')

fig,ax = plt.subplots(figsize=(10,10))

#Top 2% of the edges
show_network(top_net_old,0.02,norm=900,norm_2=0.9,layout=nx.kamada_kawai_layout,size_lookup=size_lookup['pre'],label='Before 2012',loc=(-0.29,1.1))

ax.legend(handles=patches,facecolor='white',loc='lower left',title='Area')
ax.set_facecolor('black')
ax.set_xticks([])
ax.set_yticks([])

plt.tight_layout()

save_fig('network_early.pdf')

In [None]:
#Topic mix for the papers published between 2011 and 2015 (both included).
#NOTE(review): the label below reads 'Between 2012 and 2015' and size_lookup['mid'] covers
#2012-2014 - confirm the intended boundaries
mid_period = analysis_fin.loc[(analysis_fin['year']>=2011)&(analysis_fin['year']<2016)][topics_filtered]


#Topic network for the period
top_net_mid= make_network_from_doc_term_matrix(mid_period,0.025,'paper_id')

fig,ax = plt.subplots(figsize=(12,8))

#Top 2% of the edges
show_network(top_net_mid,0.02,norm=700,norm_2=0.9,layout=nx.kamada_kawai_layout,size_lookup=size_lookup['mid'],label='Between 2012 and 2015',loc=(-0.36,1.3))

ax.legend(handles=patches,facecolor='white',loc='lower left',title='Area')
ax.set_facecolor('black')
ax.set_xticks([])
ax.set_yticks([])

plt.tight_layout()

save_fig('network_mid.pdf')

In [None]:
#Topic mix for the papers published from 2016 onwards ('After 2015').
#The filter was year>2016, which left the 2016 papers out of every period network
#(the mid period filter stops at 2015).
#NOTE(review): size_lookup['late'] covers 2015-2018 - confirm the node sizes should use that range
late_period = analysis_fin.loc[(analysis_fin['year']>=2016)][topics_filtered]


#Topic network for the period
top_net_late= make_network_from_doc_term_matrix(late_period,0.025,'paper_id')

fig,ax = plt.subplots(figsize=(10,10))

#Top 2% of the edges. This is the only period drawn with the Fruchterman-Reingold layout
show_network(top_net_late,0.02,norm=700,norm_2=0.9,layout=nx.fruchterman_reingold_layout,size_lookup=size_lookup['late'],label='After 2015',loc=(-0.52,1.4))

ax.legend(handles=patches,facecolor='white',loc='lower left',title='Area')
ax.set_facecolor('black')
ax.set_xticks([])
ax.set_yticks([])

plt.tight_layout()

save_fig('network_late.pdf')

### Plot centralities

In [None]:
#Reduced colour lookup with four communities; the commented ones are switched off
color_lookup_2 = {
    'deep_learning':'blue',
    #'robotics_agents':'cornflowerblue',
    'computer_vision':'aqua',
    'symbolic':'red',
    #'health':'lime',
    #'social_biological':'forestgreen',
    #'technology':'magenta',
    'statistics':'orange',
    #'language':'yellow'
}

In [None]:
def plot_centrality(network,measure,cl,ax,plot_name):
    '''
    Bar chart with the standardised centrality of the topics in a topic network, sorted
    from most to least central and coloured by the community of each topic.
    
    Args:
        -network is the network whose centralities we want to plot
        -measure is the centrality function; we call it with the network and weight='weight'
        -cl is the lookup between community names and bar colours (other communities are light grey)
        -ax is the axis
        -plot_name is the title of the plot
    
    Draws on ax and returns nothing. Uses the comms, comm_names and patches globals.
    
    '''
    
    #Centrality per topic, as z-scores
    raw = pd.Series(measure(network,weight='weight'))
    
    standardised = pd.Series(zscore(raw),index=raw.index)
    
    #Most central topics first
    ranked = standardised.sort_values(ascending=False)
    
    #Colour of every bar, in the same order as the bars
    bar_colours = []
    
    for topic in ranked.index:
        community = comm_names[comms[topic]]
        bar_colours.append(cl[community] if community in cl.keys() else 'lightgrey')
    
    ranked.plot.bar(color=bar_colours,ax=ax,width=1)
    
    #Formatting: community legend, no tick labels (there are too many topics to show)
    ax.legend(handles=patches,ncol=3)
    ax.set_xticklabels([])
    ax.set_xticks([])
    ax.set_ylabel('Normalised centrality')
    ax.set_title(plot_name)

    



In [None]:
#Rebuild the legend patches from the full colour lookup (same statement as before the network plots)
patches = [mpatches.Patch(facecolor=c, label=l,edgecolor='black') for l,c in color_lookup.items()]

In [None]:
#Eigenvector centralities in the early and late networks, side by side (the mid period is switched off)
fig,ax = plt.subplots(ncols=2,figsize=(20,8))

plot_centrality(top_net_old,nx.eigenvector_centrality,cl=color_lookup,ax=ax[0],plot_name='Before 2012')
#plot_centrality(top_net_mid,nx.eigenvector_centrality,cl=color_lookup,ax=ax[1],plot_name='Between 2011 and 2015')
plot_centrality(top_net_late,nx.eigenvector_centrality,cl=color_lookup,ax=ax[1],plot_name='After 2015')

plt.tight_layout()

save_fig('network_centrality_change.pdf')

### Consider disruption

Our final descriptive analysis considers disruption over time: what have been the changes in the composition of AI since the 2000s?

We create a matrix that compares the topic vector for every year (a normalised sum) across years.

In [None]:
from sklearn.metrics import pairwise_distances

import seaborn as sns

In [None]:
#We want to measure distances between activity profiles in years
#The period goes from 2000 to 2018 (np.arange excludes its upper bound)
period=np.arange(2000,2019)

#We create a vector with counts of papers with activity in a year
#(a paper has activity in a topic when the topic is above 0.05). Topics in rows, years in columns
year_topics = pd.concat([(analysis_fin.loc[analysis_fin['year']==y,topics_filtered]>0.05).sum() for y in period],axis=1)

year_topics.columns = period

#We normalise the results (we want to consider the relative importance of topics, not absolute)
#After transposing, the years are in the rows, so every topic column is z-scored over the years.
#Topics with missing values after the z-score are dropped
topics_years_norm = year_topics.T.apply(lambda x: zscore(x)).dropna(axis=1)

In [None]:
#We calculate distances between years
#One minus the cosine distance gives the cosine similarity between the topic profiles of two years

year_sims = pd.DataFrame(1-pairwise_distances(topics_years_norm,metric='cosine'),index=period,columns=period)


In [None]:
#We also calculate rolling intra-year distances. We focus on the diagonal for visualisation
#The rolling mean runs down the rows, so the diagonal value for a year is the average of its
#similarity with itself and with the two years before. The first two years have no value
mean_sims = pd.Series(np.diag(np.matrix(year_sims.rolling(window=3).mean())))
mean_sims.index = period

In [None]:
#We plot the results, which show quite starkly the disruption in AI research before and after 2012.

#Two panels: the similarity matrix on top and the rolling similarity below
fig,ax = plt.subplots(figsize=(10,8),nrows=2,gridspec_kw={'height_ratios':[3,1.2]})

ax[0].imshow(year_sims,cmap='seismic',aspect='auto')

#Some formatting of labels etc
#The top panel shows no ticks on the x axis; the years are on the y axis
ax[0].set_xticks([])
ax[0].set_xticklabels([])
ax[0].set_yticks(np.arange(0,len(period)))
ax[0].set_yticklabels(period)
ax[0].set_title('Year on year topic similarity',size=14)


ax[1].set_ylabel('Year-on-year \n similarity \n (rolling mean)')

mean_sims.plot(ax=ax[1])

plt.subplots_adjust(hspace=0.05)

plt.tight_layout()


save_fig('disruption_measure.pdf')

Can we calculate the half life of similarity?

In [None]:
# def make_five_year_disruption(table,year,span):
#     '''
#     This calculates the rate at which a year becomes more dissimilar from other years
    
#     Args:
#         Table with similarities
#         Year is the year of interest
#         span is how many years to consider in the analysis
    
#     '''
    
#     #This extracts the five years before the year and extracts their similarities
#     out = pd.Series(make_growth_rate(table.loc[year,(year-span):year+1][::-1])).mean()

#     return(out)




In [None]:
# out = []
# for y in np.arange(2005,2019):
    
#     dist = make_five_year_disruption(year_sims,y,4)
    
#     out.append(dist)

In [None]:
# pd.Series(out,index=np.arange(2005,2019)).rolling(window=3).mean().dropna().plot()

## Case studies

### Women in AI

Our prior is that papers with women tend to be more focused on fields such as health and social. We explore this here.

We will constrain our analysis to two issues.

a. Distribution of topics over 'communities'
b. Analysis of diversity in topics: are female papers more interdisciplinary?

### Simple comparison

In [None]:
def cross_sectional_comp(df,variable,topics,threshold):
    '''
    Compares how often a boolean variable is True between the papers with a topic and the
    papers without it, for every topic.
    
    Args:
        df is the dataframe we are using (generally analysis_fin, with rows = papers and columns = variables and metadata)
        variable is the (boolean) variable we are using for the comparison
        topics is the topics where we want to compare (generally the community names)
        threshold is the threshold we want to use to determine if a paper is in a topic or not
    
    Returns a df with one row per topic and three columns:
        variable_false: share of papers where the variable is True when the topic is absent
        variable_true: share of papers where the variable is True when the topic is present
        difference: ratio between the second and the first column, minus one
    The rows are sorted by the difference, in descending order.
    
    '''
    
    #For every topic: share of papers where the variable is True, without and with the topic
    shares_by_topic = []
    
    for topic in topics:
        in_topic = df[topic]>threshold
        
        #normalize=1 normalises inside the 'topic absent' and 'topic present' columns
        table = pd.crosstab(df[variable],in_topic,normalize=1)
        
        shares_by_topic.append(table.loc[True,:])
    
    group_counts = pd.concat(shares_by_topic,axis=1)
    
    #Topics as labels, then topics in the rows
    group_counts.columns = topics
    group_counts = group_counts.T
    
    #Label the columns with the name of the variable
    group_counts.columns = [variable+f'_{value}' for value in ['false','true']]
    
    #Relative difference between topic present and topic absent
    ratio = group_counts.iloc[:,1]/group_counts.iloc[:,0]
    group_counts['difference'] = ratio-1
    
    return(group_counts.sort_values('difference',ascending=False))
    

In [None]:
def plot_regression_coefficients(var,cov='HC1',size=(8,6)):
    '''
    Plots the regression coefficients of a predictor on every topic.
    
    We run topic_regression on analysis_fin with the filtered topics, the predictor and the
    controls (all of them globals of the notebook) and plot the 'coefficient' column of the
    second element of its output as a topic bar chart.
    
    Args:
        var: variable we use as predictor.
        cov: covariance type, passed to topic_regression
        size: size of the figure
    
    Creates a new figure and returns nothing
    
    '''
    
    #We pass the covariance type chosen by the caller ('HC1' was hard-coded in the call,
    #so the cov argument had no effect)
    reg = topic_regression(analysis_fin,topics_filtered,[var],controls,OLS,cov=cov)
    
    fig,ax = plt.subplots(figsize=size)

    plot_topic_bar(reg[1]['coefficient'],cl=color_lookup,ax=ax)

    ax.set_title(f'Regression coefficient using {var} as predictor')
    

In [None]:
#Share of papers where has_female is True (missing values are not counted by value_counts)
woman_average = analysis_fin['has_female'].value_counts(normalize=True)[True]

In [None]:
#Compare the communities (a community is present in a paper above the 0.1 threshold)
woman_community_comp = cross_sectional_comp(analysis_fin,'has_female',community_names,threshold=0.1)

fig,ax = plt.subplots(figsize=(10,6),ncols=2,sharey=True)

#Left: share of papers with a female author when the community is present (second column).
#The order is reversed so the first row ends up at the top of the horizontal bars
(100*woman_community_comp.iloc[:,1][::-1]).plot.barh(ax=ax[0])

#Right: relative difference with the papers where the community is absent
(100*woman_community_comp['difference'][::-1]).plot.barh(ax=ax[1])

#Reference lines: the average share on the left, no difference on the right
ax[0].vlines(x=100*woman_average,ymin=-0.5,ymax=len(woman_community_comp),linestyle=':',color='red')
ax[0].set_xlabel('Papers with at least one female author as \n share of the total')


ax[1].vlines(x=0,ymin=-0.5,ymax=len(woman_community_comp),linestyle=':',color='red')
ax[1].set_xlabel('Representation of papers \n with at least one female author')


ax[0].set_ylabel('')

fig.suptitle('              Representation of topics for papers with one female author',y=1.01)

plt.tight_layout()

#Saved directly (not with save_fig) so we can pass bbox_inches
plt.savefig(f'../reports/figures/slide_deck/{today_str}_women_representation.pdf',bbox_inches='tight')

#### Comparison by topics

In [None]:
#Same comparison at the level of detailed topics (threshold of 0.05)
woman_topic_comp = cross_sectional_comp(analysis_fin,'has_female',topics_filtered,threshold=0.05)

In [None]:
def plot_topic_bar(table,cl,ax):
    '''
    Bar chart of a value by topic where every bar takes the colour of the community of its topic.
    
    Args:
        table has topics in the index and the value to plot
        cl is the colour lookup for the communities (communities outside it are light grey)
        ax is the plotting axis
    
    Draws on ax and returns nothing. Uses the comms, comm_names and patches globals.
    
    '''
    
    #Colour of every bar, following the order of the index
    bar_colours = []
    
    for topic in table.index:
        community = comm_names[comms[topic]]
        bar_colours.append(cl[community] if community in cl.keys() else 'lightgrey')
    
    table.plot.bar(color=bar_colours,ax=ax,width=1)
    
    #Community legend and no ticks in the topic axis
    ax.legend(handles=patches,ncol=3)
    ax.set_xticks([])
    ax.set_xticklabels([])

In [None]:
fig,ax = plt.subplots(figsize=(8,5))

#Difference by topic; the table comes sorted in descending order from cross_sectional_comp
plot_topic_bar(woman_topic_comp['difference'],cl=color_lookup,ax=ax)

ax.set_title('Representation of papers with female topics')

#### Report regression analysis

In [None]:
#Controls for the topic regressions: the year and the field variables
controls = ['year']+list(field_names)

In [None]:
#Coefficients of has_female in the topic regressions
plot_regression_coefficients('has_female',size=(8,6))

save_fig('woman_regression.pdf')

#### Compare paper multidisciplinarity between female and male

We conclude our comparison of papers with and without female authors by looking at how interdisciplinary each group of papers is.

In [None]:
#AI papers where has_female is not missing
arx_field_comp = arx.loc[arx['is_ai']==True,:].dropna(axis=0,subset=['has_female'])

In [None]:
from scipy.stats import entropy

In [None]:
def calculate_entropy(df,categories,category):
    '''
    Entropy of every paper over a set of semantic variables (eg discipline, community or topic).
    The values of every paper are turned into shares (they sum to one) before the calculation.
    
    arguments:
        df is the analysis df with relevant topics and metadata
        categories are the columns we calculate the entropy over
        category is a label stored in the 'cat' column of the output, to identify the group
        
    outputs
        A df with one row per paper and two columns: 'entropy' and 'cat'
    
    '''
    
    def _to_shares(row):
        #Shares of a paper over the categories
        return(row/row.sum())
    
    shares = df[categories].apply(_to_shares,axis=1)
    
    #One entropy value per paper
    paper_entropy = shares.apply(entropy,axis=1)
    
    ent = pd.DataFrame(paper_entropy,columns=['entropy'])
    
    #Label of the group
    ent['cat']=category
    
    return(ent)
    

In [None]:
# #Compare the entropies between disciplines
# gender_field_entropy = pd.concat([calculate_entropy(
#     arx_field_comp.loc[(arx_field_comp['has_female']==value)],field_names,category) for 
#                            value,category in zip([False,True],['no_female','has_female'])],axis=0)

# gender_field_entropy.groupby('cat')['entropy'].mean()

In [None]:
#Compare the entropies between topics
#We calculate the entropy separately for the papers without and with a female author,
#label each group and stack the two tables
gender_ent = pd.concat([calculate_entropy(
    analysis_fin.loc[(analysis_fin['has_female']==value)],topics_filtered,category) for 
                           value,category in zip([False,True],['no_female','has_female'])],axis=0)

#Mean entropy by group
gender_ent.groupby('cat')['entropy'].mean()

In [None]:
fig,ax = plt.subplots(figsize=(2,5))

#Distribution of the entropy in each group (the violin plot alternative is switched off)
#ax.violinplot([list(gender_ent.loc[gender_ent['cat']==val,'entropy']) for val in ['has_female','no_female']])
gender_ent.boxplot(column='entropy',by='cat',ax=ax)

#ax.set_title('')
#ax.set_title('Entropy by \n female participation in AI paper')


#### Very preliminary gender analysis

Create the regression df

In [None]:
#Work on a copy so analysis_fin is not modified
div_reg = analysis_fin.copy()

#Add entropy
#The category label is not needed here: we only keep the entropy column
div_reg['entropy'] = calculate_entropy(div_reg,topics_filtered,category='drop')['entropy']

#Drop missing values
#NOTE(review): this drops the papers with a missing value in ANY column of the df, not only
#in the columns used in the regression - confirm this is intended
div_reg.dropna(inplace=True)

#Create endogenous variable
endog = div_reg['entropy'].astype(float)

#Create predictors
#has_female plus the year and field controls, with a constant
exog = add_constant(div_reg[['has_female','year'] + list(field_names)].astype(float))

Fit model

In [None]:
#OLS of the entropy on the predictors, with the 'HC2' covariance type
reg = OLS(endog=endog,exog=exog).fit(cov_type='HC2')

In [None]:
#Display the regression table
reg.summary()

There is some preliminary evidence suggesting that papers involving women tend, on average, to have more diverse combinations of topics.

### Company analysis

What are we going to do?

* Measure the distribution over terms as before
* Study trends (share of DL / Reinforcement learning / Computer vision accounted by companies)

#### Some basic descriptives

**How many companies?**

In [None]:
#Sum of the company variable divided by the number of papers (rows) in the df
np.sum(analysis_fin['company'])/len(analysis_fin)

In [None]:
#Top 20 institutions by share (in %) of all paper-institution pairs. An institution is counted
#once per paper (we take the set), and the entries that are not strings are ignored
100*pd.Series(flatten_list([list(set([inst for inst in x if type(inst)==str])) for x in analysis_fin['institute_list'].dropna()])).value_counts(normalize=True)[:20]

### Trends

In [None]:
#Yearly share of papers where each company variable is True (normalize=0 normalises every year)
comps = pd.concat([pd.crosstab(analysis_fin['year'],analysis_fin[var],normalize=0)[True] for var in ['company','google','facebook','microsoft','ibm']],axis=1)
comps.columns = ['company','google','facebook','microsoft','ibm']
#What is left of the company share after subtracting the four named companies.
#NOTE(review): a paper involving several named companies is subtracted more than once - confirm this is acceptable
comps['other companies'] = comps['company']-comps.iloc[:,1:].sum(axis=1)

#Data for the chart: 2000-2018, without the total 'company' column, in % and smoothed
#with a 3-year rolling mean
comps_data = 100*comps.loc[np.arange(2000,2019)].iloc[:,1:].rolling(window=3).mean().dropna()

In [None]:
fig,ax = plt.subplots(figsize=(10,6))

#Colours for the stacked areas
pal = sns.color_palette('Accent')

#One area per company, stacked. The colours come from the palette passed in colors
ax.stackplot(comps_data.index,comps_data.T,cmap='Dark2',labels=[x.capitalize() for x in comps_data.columns],colors=pal,edgecolor='grey')

#Legend outside of the plotting area
ax.legend(bbox_to_anchor=(1.35,1))

ax.set_ylabel('% of all AI papers')
ax.set_title('Corporate participation in AI research')

plt.tight_layout()

save_fig('stacked_chart.pdf')

In [None]:
#Compare the presence of companies between papers with and without each topic
company_topic_comp = cross_sectional_comp(analysis_fin,'company',topics_filtered,threshold=0.05)

fig,ax = plt.subplots(figsize=(8,5))

#Relative difference by topic, coloured by community
plot_topic_bar(company_topic_comp['difference'],cl=color_lookup,ax=ax)

ax.set_title('Representation of papers involving companies')

In [None]:
#Same comparison for the papers involving Google
google_topic_comp = cross_sectional_comp(analysis_fin,'google',topics_filtered,threshold=0.05)

fig,ax = plt.subplots(figsize=(8,5))

#Relative difference by topic, coloured by community
plot_topic_bar(google_topic_comp['difference'],cl=color_lookup,ax=ax)

ax.set_title('Representation of papers involving Google')

### Regression

In [None]:
#Coefficients of the company variable in the topic regressions
plot_regression_coefficients('company',size=(8,6))

plt.tight_layout()

save_fig('company_regression.pdf')



In [None]:
#Coefficients of the education variable in the topic regressions
plot_regression_coefficients('education',size=(8,6))

plt.tight_layout()

save_fig('education_regression.pdf')


In [None]:
#Coefficients of the google variable in the topic regressions
plot_regression_coefficients('google',size=(8,6))

plt.tight_layout()

save_fig('google_regression.pdf')

### Time series analysis

I want to study the level of activity in a topic accounted for by different types of organisations.

The target chart shows the share of all papers in a topic accounted for by each type of organisation.


In [None]:
#True for the papers where education is False (a missing value in education gives False here)
analysis_fin['no_education'] = analysis_fin['education']==False

In [None]:
def extract_topic_trend(df,cat,year_lims=[2000,2019]):
    '''
    Extracts the yearly distribution of a category in a set of papers.
    
    Args:
        df: the usual dataframe (generally subset to the papers in a topic of interest)
        cat: the category we are interested in
        year_lims: first year to consider and the year after the last one (the upper limit is excluded)
    
    Returns a df with one row per year in the period and one column per value of the category.
    Every row contains the shares of the papers in the year with each value. The years in the
    period without papers are kept as rows of missing values.

    '''
    
    #Shares of each value of the category inside every year
    out = pd.crosstab(df['year'],df[cat],normalize=0)
    
    #We reindex instead of using .loc with the list of years: since pandas 1.0, .loc raises
    #a KeyError if any of the years is missing, which happens in topics without papers in
    #some of the years
    return(out.reindex(np.arange(year_lims[0],year_lims[1])))

def plot_topic_trend(df,cat,topics,ax,cmap,year_lims=[2000,2019],threshold=0.05,focus_topics=False,alpha=0.2):
    '''
    Plots topic trends: for every topic, the yearly share of its papers where a category is True.
    The trends are smoothed with a 4-year rolling mean.
    
    Args:
        df the usual dataframe
        cat: the category of interest
        topics: topics we want to display
        ax: the axis
        cmap: colour map for the topics in focus
        year_lims: first and last year to consider (passed to extract_topic_trend)
        threshold: threshold to consider that a paper is in a topic
        focus_topics: topics to highlight in colour, or False to plot all the topics in the same way
        alpha: transparency of the topics that are not in focus
    
    Draws on ax and returns nothing.
    
    NOTE(review): the trends are multiplied by 100 when there are focus topics but not in
    the other case - confirm whether that is intended
    
    '''
    activity = []
    names = []
    
    #Use a loop to deal with cases where a category has no activity in a topic
    for t in topics:
        try:
            levels = extract_topic_trend(df.loc[df[t]>threshold],cat,year_lims)
            activity.append(levels[True])
            names.append(t)
        
        except KeyError:
            #A missing label (eg no paper of the category in the topic): we skip the topic.
            #We only catch KeyError so other problems are not hidden
            continue
        
        
    topic_trends = pd.concat(activity,axis=1).fillna(0)
    topic_trends.columns = names
    
    smoothed = topic_trends.rolling(window=4).mean().dropna()
    
    #Identity check instead of !=False so array-like focus topics do not raise
    if focus_topics is False:
        
        smoothed.plot(ax=ax)
        ax.legend(bbox_to_anchor=(1,1))
        
        return
    
    #Lookup with a number for every topic in focus (the number selects its colour)
    topic_lookup = {name:val for val,name in enumerate(focus_topics)}

    #Color map. We use plt.get_cmap because plt.cm.get_cmap was removed in matplotlib 3.9
    cols = plt.get_cmap(cmap)

    #Create a vector of colors: grey for the background topics, colour map for the rest
    grey = (0.5,0.5,0.5,alpha)
    cols_to_show = [cols(topic_lookup[v]) if v in topic_lookup else grey for v in topic_trends.columns]

    #Plot
    (100*smoothed).plot(color=cols_to_show,ax=ax,linewidth=3)

    #Fix the legend to focus on key topics
    hand,labs = ax.get_legend_handles_labels()
    in_focus = [(h,l) for h,l in zip(hand,labs) if l in topic_lookup]

    ax.legend(bbox_to_anchor=(1,1),handles = [h for h,_ in in_focus],
              labels=[l[:50] for _,l in in_focus])
    

    


In [None]:
#Topics we focus on in the trend charts and in the collaboration analysis
core_ai_topics = ['cnn-convolutional_neural_networks-cnns-convolutional_neural_network-convolutional_neural_network_cnn',
                  'recurrent-lstm-rnn-recurrent_neural_network-recurrent_neural_networks',
                 'reinforcement_learning-policy-policies-reward-deep_reinforcement_learning',
                 'translation-neural_machine_translation-machine_translation-translate-translations',
                  'latent-generative_model-generative-generative_models-latent_variables',
                 ]

In [None]:
fig,ax = plt.subplots(figsize=(14,6))

#Share of company papers in every topic, highlighting the core AI topics.
#A topic is present in a paper above a threshold of 0.01 here
plot_topic_trend(analysis_fin,'company',cmap='Dark2',topics=topics_filtered,ax=ax,threshold=0.01,focus_topics=core_ai_topics,alpha=0.07,year_lims=[2004,2019])

ax.set_title('Share of all papers with company presence')
ax.set_ylabel('%')

save_fig('company_trends.pdf')

In [None]:
#Two topics related to computation and hardware
hardware = ['processing-implementation-computations-frameworks-running','hardware-energy_consumption-power_consumption-energy_efficiency-fpga']

fig,ax = plt.subplots(figsize=(14,6))

#Same chart as for the core AI topics, now highlighting the hardware topics (this one is not saved)
plot_topic_trend(analysis_fin,'company',cmap='Dark2',topics=topics_filtered,ax=ax,threshold=0.01,focus_topics=hardware,alpha=0.07,year_lims=[2004,2019])

ax.set_title('Share of all papers with company presence')
ax.set_ylabel('%')


In [None]:
fig,ax = plt.subplots(figsize=(14,6))

#Share of Google papers in every topic, highlighting the core AI topics
plot_topic_trend(analysis_fin,'google',cmap='Dark2',topics=topics_filtered,ax=ax,threshold=0.01,focus_topics=core_ai_topics,alpha=0.07,year_lims=[2004,2019])

ax.set_title('Share of all papers with Google presence')
ax.set_ylabel('%')

save_fig('google_trends.pdf')

In [None]:
fig,ax = plt.subplots(figsize=(14,6))

#Share of education papers in every topic, highlighting the core AI topics
plot_topic_trend(analysis_fin,'education',cmap='Dark2',topics=topics_filtered,ax=ax,threshold=0.01,focus_topics=core_ai_topics,alpha=0.07,year_lims=[2004,2019])

#Here the description goes in the y label and the chart has no title
ax.set_ylabel('Share of all papers with education presence')

save_fig('ed_trends.pdf')

In [None]:
# fig,ax = plt.subplots(figsize=(8,6))

# plot_topic_trend(analysis_fin,'no_education',cmap='Dark2',topics=topics_filtered,ax=ax,threshold=0.01,focus_topics=core_ai_topics,alpha=0.1,year_lims=[2004,2019])

# ax.set_ylabel('Share of all papers with no education presence')

### What are the levels of university / industry collaboration?

In [None]:
#Create a variable that captures collaborations
#A paper is a collaboration when both types of organisation are in its type list.
#Papers where the type list is not a list get a missing value
analysis_fin['university_industry_collab'] = [all(entity in x for entity in ['Education','Company']) if type(x)==list else np.nan for x in analysis_fin['type_list']]
analysis_fin['govt_industry_collab'] = [all(entity in x for entity in ['Government','Company']) if type(x)==list else np.nan for x in analysis_fin['type_list']]


In [None]:
#Number of papers with a university - industry collaboration
analysis_fin['university_industry_collab'].sum()

In [None]:
#Number of papers with a government - industry collaboration
analysis_fin['govt_industry_collab'].sum()

In [None]:
#Yearly share (in %) of papers with a university - industry collaboration, smoothed with a
#3-year rolling mean
(100*pd.crosstab(analysis_fin['year'],analysis_fin['university_industry_collab'],normalize=0))[True].rolling(window=3).mean().plot(
    title='Share of papers with university industry collaborations')

In [None]:
def get_university_industry_collab_trends(df,variable,topic,threshold=0.05):
    '''
    Calculates the yearly share of papers in a topic that involve a collaboration.
    
    Args:
        df as usual (it needs a 'year' column)
        variable is the collaboration variable we want to study (a boolean column of df)
        topic the topic
        threshold is the threshold to accept a paper in a topic
    
    Returns a series with the years in the index and, for each of them, the share (in %) of
    the papers in the topic where the collaboration variable is True

    '''
    
    #Papers in the topic
    df_with_topic = df.loc[df[topic]>threshold]
    
    #We use the variable passed by the caller (the university - industry column was
    #hard-coded here, so the variable argument had no effect).
    #normalize=0 gives shares inside every year
    topic_collabs = (100*pd.crosstab(df_with_topic['year'],df_with_topic[variable],normalize=0))[True]
    
    
    return(topic_collabs)
    

In [None]:
#Extract collaborations on 'core AI topics'
#One column per topic; the years without a value in a topic are set to zero

collabs_in_topics = pd.concat([get_university_industry_collab_trends(analysis_fin,'university_industry_collab',t) for t in core_ai_topics],axis=1).fillna(0)

collabs_in_topics.columns = core_ai_topics

#Get average collaborations (we set a negative threshold to select all projects)
#The column we pass (the first community) does not matter as long as all its values are above -1
all_collabs = get_university_industry_collab_trends(analysis_fin,'university_industry_collab',community_names[0],threshold=-1)
all_collabs.name = 'All subjects'

#Concatenate everything
#The average goes in the first column
collabs_in_topics = pd.concat([all_collabs,collabs_in_topics],axis=1)

In [None]:
#Plot the trends, smoothed with a rolling mean over 5 rows
plot_years = np.arange(1995,2019)
smoothed_collabs = collabs_in_topics.loc[plot_years].rolling(window=5).mean().dropna()

ax = smoothed_collabs.plot(figsize=(14,6),linewidth=3)
ax.legend(bbox_to_anchor=(1,1))

ax.set_ylabel('Share of all papers')
ax.set_title('Collaborations between university and industry')

#Redraw the legend with the labels cut to 50 characters
hand,labs = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1,1),handles=list(hand),labels=[lab[:50] for lab in labs])

save_fig('collaboration_trends.pdf')

### A network visualisation?

Not for now

In [None]:
# def make_network_from_list(co_occ):
#     '''
#     Create a network from a document term matrix.
    
#     Args
#         co_occ - a list where every element is a collection of co-occurrences
        
#     Returns: 
#         A network
    
#     '''
    
#     #Melt the topic mix and remove empty entries
#     #cd = pd.melt(mat.reset_index(drop=False),id_vars=[id_var])

#     #cd = cd.loc[cd['value']>threshold]

#     #This gives us the topic co-occurrence matrix
#     #co_occurrence = cd.groupby(id_var)['variable'].apply(lambda x: list(x))
    
#     #Here the idea is to create a proximity matrix based on co-occurrences

#     #Turn co-occurrences into combinations of pairs we can use to construct a similarity matrix
#     sector_combs = flatten_list([sorted(list(combinations(x,2))) for x in co_occ])
#     sector_combs = [x for x in sector_combs if len(x)>0]

#     #Turn the sector combs into an edgelist
#     edge_list = pd.DataFrame(sector_combs,columns=['source','target'])

#     edge_list['weight']=1

#     #Group over edge pairs to aggregate weights
#     edge_list_weighted = edge_list.groupby(['source','target'])['weight'].sum().reset_index(drop=False)

#     edge_list_weighted.sort_values('weight',ascending=False).head(n=10)
    
#     #Create network and extract communities
#     net = nx.from_pandas_edgelist(edge_list_weighted,edge_attr=True)
    
#     return(net)

# # def show_org_network(net,
# #                      top_edge_share,
# #                      color_lookup=color_lookup,
# #                      norm=1000,norm_2=1.2,
# #                      layout=nx.kamada_kawai_layout,size_lookup=size_lookup):
# #     '''
# #     Plots a network visualisation of the topic network.
    
    
# #     '''
    
# #     new_net = net.copy()
    
# #     #We drop the 
# #     #drop_bad_edges = [e for e in new_net.edges(data=True) if not any(x in topic_drop for x in e[:2])]

# #     #new_net_2 = nx.Graph(drop_bad_edges)

# #     net_weight = sorted(new_net.edges(data=True),key=lambda x: x[2]['weight'],reverse=True)

# #     length = int(top_edge_share*len(net_weight))
# #     #
# #     print(length)
    
# #     top_edges = net_weight[:length]

# #     new_net_2 = nx.Graph(top_edges)
    
# #     pos = layout(new_net_2,
# #                  #weight='weight',
# #                  center=(0.5,0.5)
# #                 )
    
# #     #Get positions
# #     x,y = [[v[val] for v in pos.values()] for val in [0,1]]    
    
# #     nx.draw_networkx_nodes(new_net_2,pos,
# #                        #node_size=list([size_lookup[x]**norm_2 for x in dict(new_net_2.degree).keys()]),
# #                        #node_color = [color_lookup[comm_names[comms[x]]] if comm_names[comms[x]] in color_lookup.keys() else 'white' for x in dict(new_net_2.nodes).keys()],
# #                        #cmap='tab20c',
# #                        alpha=0.9,edgecolors='darkgrey')
    
# #     #ax.annotate(label,xy=(np.min(x)+0.02,np.max(y)-0.02),size=16,color='white',fontweight='bold')

# #     nx.draw_networkx_edges(new_net_2,pos,width=[e[2]['weight']/norm for e in new_net_2.edges(data=True)],edge_color='white')
    
# # org_net = make_network_from_list(analysis_fin['institute_list'].dropna())
# # # # fig,ax = plt.subplots(figsize=(12,8))

# # # # show_org_network(org_net,top_edge_share=0.01)

### Final analysis: places

We load the lookup between article ids and lads we created in `supp_6` and use it to study the geography of AI research in the UK.

More specifically, we want to create three charts:

* Concentration trends
* Concentration in AI 'core topics'
* Comparison between concentration of AI activity and areas at risk of automation


In [None]:
#Lookup between article ids and their LADs
lad_lookup_path = '../data/processed/17_8_2019_arxiv_lads.json'

with open(lad_lookup_path) as lad_file:
    lad_lookup = json.load(lad_file)

In [None]:
#Keep the papers that have country information
analysis_w_countries = analysis_fin.dropna(axis=0,subset=['country_list'])

In [None]:
#Focus on papers in the UK (papers with at least one UK institution)
#We take a copy because we add the LAD columns to this table below
analysis_uk = analysis_w_countries.loc[['United Kingdom' in x for x in analysis_w_countries['country_list']]].copy()

In [None]:
#Label papers with their lad codes and names (np.nan when the paper id is not in the lookup)
lad_codes = [lad_lookup[pid]['lad18cd'] if pid in lad_lookup else np.nan for pid in analysis_uk.index]
lad_names = [lad_lookup[pid]['lad18nm'] if pid in lad_lookup else np.nan for pid in analysis_uk.index]

analysis_uk['lad_code'],analysis_uk['lad_name'] = lad_codes,lad_names

In [None]:
#Drop missing LADs for this analysis (papers whose id was not in the lookup were labelled np.nan above)
analysis_uk = analysis_uk.dropna(axis=0,subset=['lad_name'])

### Point one: Geographical trends

In [None]:
#All the LADs in the data, ordered by how often they appear (value_counts puts the most frequent first)
all_lads = pd.Series(flatten_list(analysis_uk['lad_name'])).value_counts().index

In [None]:
def plot_local_research_concentration(df,top_n,ax,subset_topics=False,lad_list = all_lads,year_lims=[2000,2019]):
    '''
    Draws a stacked bar chart with the yearly share of papers accounted for by the most active LADs.
    
    Args:
        df (df) is the df with papers and lads; it needs a 'year' column and a 'lad_name' column
        top_n (int) is the number of LADs to show (those with most papers over all years)
        ax is the axis where we draw the chart
        subset_topics (list) if not False, a list where the first element is the list of topics (or communities) to focus on
            and the second is the threshold for inclusion. A paper is kept if any of the topics is above the threshold
        lad_list (list) is the list of LADs to consider
        year_lims is the years to plot: from year_lims[0] up to (not including) year_lims[1]
    
    Nothing is returned: the chart and its legend are drawn on ax.
    
    '''
    
    #Optionally focus on the papers in the selected topics
    if subset_topics!=False:
        topics = subset_topics[0]
        threshold = subset_topics[1]
        in_topics = df[topics].apply(lambda row: any(row>threshold),axis=1)
        df = df.loc[in_topics]
    
    #Papers per year in every LAD: years in the rows, LADs in the columns
    yearly_counts = []
    
    for lad in lad_list:
        in_lad = [lad in names for names in df['lad_name']]
        yearly_counts.append(df.loc[in_lad]['year'].value_counts())
    
    activity_year = pd.concat(yearly_counts,axis=1).fillna(0)
    activity_year.columns = lad_list
    
    #LADs with most papers over the whole period
    top_lads = activity_year.sum(axis=0).sort_values(ascending=False).index[:top_n]
    
    #Share of each LAD in every year, smoothed with a rolling mean over 3 rows and expressed as a percentage
    yearly_shares = activity_year.apply(lambda row: row/row.sum(),axis=1)
    smoothed_shares = (100*yearly_shares.rolling(window=3).mean()).dropna()
    
    years_to_plot = np.arange(year_lims[0],year_lims[1])
    
    smoothed_shares.loc[years_to_plot,top_lads].plot.bar(
        stacked=True,width=0.9,cmap='Accent',edgecolor='lightgrey',ax=ax)
    
    ax.legend(bbox_to_anchor=(1,1))

    
    

In [None]:
#Lorenz Curves of concentration: cumulative share of LAD mentions, starting from the most frequent LAD

#All AI papers
all_lad_mentions = pd.Series(flatten_list(analysis_uk['lad_name']))
all_ai_concentration = all_lad_mentions.value_counts(normalize=True).cumsum()

#Papers with any core AI topic above 0.05
in_core_topics = analysis_uk[core_ai_topics].apply(lambda x: any(x>0.05),axis=1)
core_lad_mentions = pd.Series(flatten_list(analysis_uk.loc[in_core_topics]['lad_name']))
core_ai_concentration = core_lad_mentions.value_counts(normalize=True).cumsum()




In [None]:
#Plot both curves against the rank of the LAD (we drop the LAD names from the index so that the curves line up by position)
ranked_curves = [curve.reset_index(drop=True) for curve in [all_ai_concentration,core_ai_concentration]]

pd.concat(ranked_curves,axis=1).plot()

In [None]:
#Cumulative share of LAD mentions in the five most frequent LADs (all AI papers)
all_ai_concentration.iloc[:5]

In [None]:
#Cumulative share of LAD mentions in the five most frequent LADs (papers in core AI topics)
core_ai_concentration.iloc[:5]

In [None]:
#Chart with the evolution of local activity in all AI papers
fig,ax = plt.subplots(figsize=(10,5))

#Stacked bars for the 8 LADs with most papers (default years and list of LADs)
plot_local_research_concentration(analysis_uk,8,ax=ax)

plt.tight_layout()

ax.set_ylabel('Share of all papers \n with LAD presence')
ax.set_title('Evolution of local AI research activity in the UK (top 8 locations)')

#Adjust the layout again now that the label and title are in place
plt.tight_layout()

save_fig('ai_research_all.pdf')

In [None]:
#Tower Hamlets includes Queen Mary University
#analysis_uk.loc[['Tower Hamlets' in x for x in analysis_uk['lad_name']]]['institute_list']

#### What about the core topics?

In [None]:
#Chart with the evolution of local activity in the core AI topics
fig,ax = plt.subplots(figsize=(10,5))

#Only papers where any core AI topic is above 0.05, for the years 2005 to 2018
plot_local_research_concentration(analysis_uk,8,ax=ax,subset_topics=[core_ai_topics,0.05],year_lims=[2005,2019])

plt.tight_layout()

ax.set_ylabel('Share of all papers with LAD presence')
ax.set_title('Evolution of local AI research activity (state of the art AI topics) in the UK (top 8 locations)')

#Adjust the layout again now that the label and title are in place
plt.tight_layout()

save_fig('ai_research_core.pdf')

In [None]:
#analysis_uk.loc[['Wokingham' in x for x in analysis_uk['lad_name']]]['institute_list']
#Wokingham is University of Reading

### What about, say, health?

In [None]:
# health = [x for x in topics_filtered if comm_names[comms[x]]=='health']

In [None]:
# fig,ax = plt.subplots(figsize=(10,6))

# plot_local_research_concentration(analysis_uk,8,ax=ax,subset_topics=[health,0.05],year_lims=[2005,2019])

# plt.tight_layout()

# #ax.set_ylabel('Share of all papers with LAD presence')
# #ax.set_title('Evolution of local AI research activity (new AI topics) in the UK (top 8 locations)')


### Compare geography of AI activity and geography of automation

In [None]:
#Load automation data
#index_col=None keeps the default integer index; lad_name is set as the index when we combine this with the AI data
aut = pd.read_csv('../data/processed/19_7_2019_ons_automation_clean.csv',index_col=None)

In [None]:
def get_lad_activity(df,name,subset_topics=False,lad_list=None):
    '''
    Extracts the distribution of activity by LAD.
    
    Args:
        df (df) with the data; it needs a 'lad_name' column where every element is the collection of LADs in a paper
        name (str) is the name of the variable (the column in the output)
        subset_topics (list) if not False, a list where the first element is the list of topics to focus on
            and the second is their threshold for inclusion. A paper is kept if any of the topics is above the threshold
        lad_list (list) is the list of LADs to count. If None we use all_lads, the list of all LADs created in the notebook
    
    Returns:
        A df with one row per LAD and a single column (name) with the number of papers that mention the LAD
    
    '''
    #Fall back on all the LADs in the data, as before this argument was available
    if lad_list is None:
        lad_list = all_lads
    
    if subset_topics != False:
        df = df.loc[df[subset_topics[0]].apply(lambda x: any(x>subset_topics[1]),axis=1)]
    
    #Count the papers that mention each LAD. We don't need to build the filtered table to count its rows
    lad_counts = [sum(lad in x for x in df['lad_name']) for lad in lad_list]
    
    counts = pd.DataFrame({name:lad_counts},index=list(lad_list))
    
    return(counts)
    
    

In [None]:
# Combine automation data with AI

#Activity by LAD for all AI papers and for papers in core AI topics: one column each
activity_variables = zip(['All AI','Core AI topics'],[False,[core_ai_topics,0.02]])
ai_lad_counts = pd.concat([get_lad_activity(analysis_uk,name,topic_subset) for name,topic_subset in activity_variables],axis=1)

#Line both tables up on the LAD name. We drop the LADs without a lad_code and fill the rest of missing values with 0
aut_by_lad = aut.set_index('lad_name')
aut_ai = pd.concat([aut_by_lad,ai_lad_counts],axis=1).dropna(axis=0,subset=['lad_code']).fillna(0)

aut_ai.head()

In [None]:
def benchmark_concentrations(df,ranking_var,quantiles,comparisons,ax):
    '''
    Plots the share of several variables (eg automation, AI activity) accounted for by groups of locations.
    
    The groups are created by binning the locations on the quantiles of a ranking variable.
    
    Args:
        df is a table with automation and AI activity (one row per location)
        ranking_var is the variable we use to create the groups to analyse the distribution
        quantiles is passed to pd.qcut: the number of groups, or the list of quantiles that separate them
        comparisons are the variables we want to benchmark
        ax is the axis where we draw the bar chart
    
    Nothing is returned: the bars are drawn on ax.
    
    '''
    
    #Work on a copy with the group of every location (labels=False gives integer group codes)
    ranked = df.assign(aut_rank=pd.qcut(df[ranking_var],q=quantiles,labels=False))
    
    #Every location as a share of the total of each variable
    shares = ranked[comparisons].apply(lambda col: col/col.sum())
    shares['aut_rank'] = ranked['aut_rank']
    
    #Percentage of each variable in every group
    group_shares = 100*shares.groupby('aut_rank')[comparisons].sum()
    
    group_shares.plot.bar(ax=ax)
    

In [None]:
#Compare the distribution of automation and AI activity over groups of LADs
fig,ax = plt.subplots(figsize=(8,5))

#Variables to compare
comps = ['number_high','All AI','Core AI topics']
#Quantiles 0, 0.25, 0.5, 0.75 and 1: they split the LADs in four groups
q = np.arange(0,1.1,0.25)

#The groups are based on aut_prob
benchmark_concentrations(aut_ai,'aut_prob',q,comps,ax)

ax.set_xlabel('Workforce automation ranking (quartile)')
ax.set_ylabel('% of the total in the UK')

#The labels follow the order of comps
ax.legend(title='Variable',labels = ['Workforce with high risk of automation','AI research activity','AI state of the art activity'])
ax.set_title('Distribution of AI activity and population at risk of automation')

plt.tight_layout()

save_fig('lad_comparison.pdf')

### Country comparison (free / not free)

#### Evolution of activity in not free countries

In [None]:
#All countries in the data, from most to least frequent
country_mentions = pd.Series(flatten_list(analysis_fin['country_list'].dropna()))
countries = country_mentions.value_counts().index

#Which are not free? (status 'NF' in the lookup; countries that are not in the lookup are left out)
not_free_countries_all = [c for c in countries if c in country_status_lookup and country_status_lookup[c]=='NF']

#Focus on the top countries
not_free_countries = not_free_countries_all[:10]


In [None]:
#Compare trends in not free countries vs average

analysis_w_countries = analysis_fin.dropna(axis=0,subset=['country_list'])

#Papers per year in every country: years in the rows, countries in the columns
yearly_counts_by_country = [
    analysis_w_countries.loc[[country in paper_countries for paper_countries in analysis_w_countries['country_list']]]['year'].value_counts()
    for country in countries]

all_country_activity = pd.concat(yearly_counts_by_country,axis=1).fillna(0)

all_country_activity.columns = countries

In [None]:
#Normalised country activity: share of every country in each year's activity
#(the first column of all_country_activity is left out), for the years 2000 to 2018
yearly_country_shares = all_country_activity.iloc[:,1:].apply(lambda x: x/x.sum(),axis=1)
country_norm = yearly_country_shares.loc[np.arange(2000,2019)]

#Put the not free countries in the first columns
other_countries = [x for x in country_norm.columns if x not in not_free_countries_all]
country_ranked = country_norm.loc[:,not_free_countries_all + other_countries]

In [None]:
# auth_colours = [red_cols(n) if c in not_free_countries else 'lightblue' for n,c in enumerate(country_ranked.columns)]

# not_free_to_plot =  country_ranked.rolling(window=3).mean().dropna()

# ax = (100*not_free_to_plot[not_free_countries[:-2]]).plot.bar(stacked=True,cmap='Accent',figsize=(10,6),width=0.9,edgecolor='lightgrey',linewidth=0.5)
# ax.legend(bbox_to_anchor=(1,1))
# ax.set_title('Share of all AI research activity in non-free countries')
# ax.set_ylabel('%')

# plt.tight_layout()

# save_fig('not_free_shares.pdf')

In [None]:
#all_country_activity_norm = all_country_activity.apply(lambda x: x/analysis_w_countries['year'].value_counts())

#Share of each year in the total activity: for all papers with countries...
year_shares_all = analysis_w_countries['year'].value_counts(normalize=True)

#...and for every country
year_shares_by_country = all_country_activity.apply(lambda x: x/x.sum(),axis=0)

all_country_activity_norm_2 = pd.concat([year_shares_all,year_shares_by_country],axis=1)

In [None]:
#Call the column with the yearly shares for all papers 'All'
#value_counts(normalize=True) names its output after the variable ('year') in pandas < 2 and 'proportion' in pandas >= 2: we cover both
all_country_activity_norm_2.rename(columns={'year':'All','proportion':'All'},inplace=True)

In [None]:
#Smoothed shares of activity by country (rolling mean over 3 rows)
#We create it here because the only other place where it was defined is a cell that is now commented out
not_free_to_plot =  country_ranked.rolling(window=3).mean().dropna()

fig,ax = plt.subplots(figsize=(8,8),nrows=2,sharex=False)

#Top: share of all activity accounted for by the not free countries (we leave out the last two in the list)
(100*not_free_to_plot[not_free_countries[:-2]]).plot.bar(stacked=True,cmap='Accent',width=0.9,edgecolor='lightgrey',linewidth=0.5,ax=ax[0],legend=True)

#Bottom: every year as a share of the total activity of the country, against the same measure for all papers (dotted line)
(100*all_country_activity_norm_2[not_free_countries[:-2]].loc[np.arange(2000,2019)]).rolling(window=4).mean().dropna().plot(cmap='Accent',ax=ax[1],legend=False,linewidth=3)
(100*all_country_activity_norm_2['All'].loc[np.arange(2000,2019)]).rolling(window=4).mean().dropna().plot(color='black',legend=True,
                                                                                                   #title='AI research trends in countries with low freedom indices',
                                                                                                         linewidth=3,ax=ax[1],linestyle=':')

ax[0].set_xticks([])


ax[0].legend(bbox_to_anchor=(1,1))
ax[1].legend(bbox_to_anchor=(1,1))
ax[0].set_title('AI Research activity in non-free countries')
ax[0].set_ylabel('% of all \n AI research activity')

#plt.tight_layout()

plt.subplots_adjust(hspace=0.1)


ax[1].set_ylabel('Year as \n share of total AI research')


save_fig('political_country_trends.pdf')

#### Plot regression coefficients

In [None]:
#Plot the regression coefficients for the 'nf' variable
#NOTE(review): plot_regression_coefficients is defined earlier in the notebook; check there how the coefficients are obtained
plot_regression_coefficients('nf',size=(8,6))

plt.tight_layout()

save_fig('not_free_specialisation.pdf')

Is the above just driven by China? We create a new variable excluding it

In [None]:
#True for the papers where nf is True and china is False
#We compare the whole columns at once instead of looping over the rows, which is slow in a big table
analysis_fin['not_free_not_china'] = (analysis_fin['nf']==True)&(analysis_fin['china']==False)

In [None]:
#Number of papers where nf is True and china is False
analysis_fin['not_free_not_china'].sum()

In [None]:
#Same plot as above for the variable that excludes China
plot_regression_coefficients('not_free_not_china')

In [None]:
#Topics we use in the surveillance analysis. The topics that are commented out are not included
surv_topics = ['face-faces-identity-face_recognition-facial','person-surveillance-persons-pedestrian-pedestrians',
               #'sentiment_analysis-aspect-sentiment-reviews-opinion',
               #'malicious-files-malware-file-analysts',
               #'security-privacy-private-secure-trust'
              ]

**Cross-sectional comparison**

Here we calculate how over- (or under-) represented a topic is in a country

In [None]:
#'difference' in every surveillance topic for China and for the other not free countries
cross_groups = ['china','not_free_not_china']
cross = pd.concat([cross_sectional_comp(analysis_fin,group,surv_topics,threshold=0.05)['difference'] for group in cross_groups],axis=1)

cross.columns = ['china','not_free_other_than_china']

#One group of bars per set of countries (in reverse order), as a percentage
cross_to_plot = 100*cross.T.iloc[::-1]
ax = cross_to_plot.plot.barh(title='Specialisation in visual surveillance topics',figsize=(12,5))

#Legend with the labels cut to 50 characters
hand,labs = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1,1),handles=list(hand),labels=[lab[:50] for lab in labs])

#Reference line at zero
ax.vlines(x=0,ymin=-1,ymax=2,linestyle=':',color='red')

save_fig('activity_in_surveillance_topics.pdf')

### Trends

In [None]:
def trend_in_topic(df,topic,name,threshold=0.05,year_lim=(2005,2019),normalize=0):
    '''
    This returns trends of activity in a topic, by default as a share of all activity in each year
    
    Args:
        df is the df with one row per paper; it needs a 'year' column and a column for the topic
        topic is the topic of interest
        name is the name we give to the series we return
        threshold is the value the topic has to exceed for a paper to be counted in the topic
        year_lim is the years to consider: from year_lim[0] up to (not including) year_lim[1]
        normalize is passed to pd.crosstab (0 gives shares of each year's papers). Use 'none' to get paper counts
    
    Returns:
        A series indexed by year. Years without papers in the topic, or without papers at all, are 0
    
    '''
    
    in_topic = df[topic]>threshold
    
    if normalize!='none':
        trend = pd.crosstab(df['year'],in_topic,normalize=normalize)
        
    else:
        trend = pd.crosstab(df['year'],in_topic)
    
    trend = trend.rename(columns={True:name})
    
    #If no paper is above the threshold the crosstab has no True column to rename
    if name not in trend.columns:
        trend[name] = 0
    
    #We reindex (instead of using .loc) because .loc raises a KeyError when any of the years is missing from the data.
    #The missing years come out as nan and are filled with 0
    years = np.arange(year_lim[0],year_lim[1])
    
    return(trend[name].reindex(years).fillna(0))
    

In [None]:
#Variable that is True for every paper, so that we can select 'All' papers in the same way as 'china' or 'not_free_not_china'
analysis_fin['All']=True

In [None]:
#For every surveillance topic, a table with the yearly share of papers in the topic for each group of papers
surv_groups = ['china','not_free_not_china','All']

surv_trends = [
    pd.concat(
        [trend_in_topic(analysis_fin.loc[analysis_fin[var]==True],topic=t,name=var,threshold=0.05,normalize=0)
         for var in surv_groups],axis=1)
    for t in surv_topics[:5]]


In [None]:
#One panel per surveillance topic. There are two rows of axes, so this works with up to two topics
fig,ax = plt.subplots(figsize=(8,7),nrows=2,sharex=True)

for num,x in enumerate(surv_topics):
    
    #Percentages smoothed with a rolling mean over 3 rows
    (100*surv_trends[num]).rolling(window=3).mean().dropna().plot(ax=ax[num],linewidth=3)
    
    ax[num].set_title(x)
    ax[num].set_ylabel('% of papers in topic')
    
plt.tight_layout()

#Hide the legend in the second panel
ax[1].legend().set_visible(False)

save_fig('surveillance_trends.pdf')

What is the bump in 2010?

In [None]:
#Display the list of surveillance topics
surv_topics

In [None]:
#Papers per year (counts, not shares) in the first surveillance topic for each of the top not free countries
#Note the lower threshold (0.01) to count a paper in the topic
face_topic = surv_topics[0]
paper_countries = analysis_w_countries['country_list']

face_rec = pd.concat(
    [trend_in_topic(analysis_w_countries.loc[[c in c_list for c_list in paper_countries]],
                    topic=face_topic,name=c,threshold=0.01,normalize='none')
     for c in not_free_countries],axis=1)


In [None]:
# iran_face = analysis_w_countries.loc[(analysis_w_countries[surv_topics[0]]>0.05)&(['Russia' in c for c in analysis_w_countries['country_list']])]

# for f in iran_face['abstract']:
#     print(f)

In [None]:
# h = analysis_w_countries.loc[(analysis_w_countries['health']>0.1)]

# for f in h['abstract']:
#     print(f)

### Outputs for the paper

Here we will create a dictionary of key results which we will output as a text file

**Growth rates**

In [None]:
#Number of papers per year for every value of is_ai
year_counts = pd.crosstab(arx['year'],arx['is_ai'])

In [None]:
def make_growth_rate(series):
    '''
    This function creates the growth rates (in %) between consecutive values of a series
    
    Every value is compared with the value before it: 100*((value/previous value)-1).
    
    Returns a list with the same length as the series. The first element is nan (there is
    nothing to compare it with) and so are the elements where the previous value is zero.
    
    '''
    
    values = list(series)
    
    if not values:
        return([])
    
    #Nothing to compare the first value with
    rates = [np.nan]
    
    #Pair every value with the one before it
    for previous,current in zip(values,values[1:]):
        
        if previous==0:
            rates.append(np.nan)
        else:
            rates.append(100*((current/previous)-1))
    
    return(rates)
    

In [None]:
#Year on year growth rates (%) for every column of year_counts
year_growth = year_counts.apply(make_growth_rate)

In [None]:
#Average growth rate over the last five rows (years) for every column of year_growth
g_ai = year_growth.iloc[-5:].mean()

In [None]:
#AI papers per year (column 1 of the is_ai crosstab) among the papers with a weight above 0.5 in each field
#The inner generator selects the papers in the field once, so we can use them for both sides of the crosstab
ai_counts_by_field = [
    pd.crosstab(field_papers['year'],field_papers['is_ai'])[1]
    for field_papers in (arx.loc[arx[t]>0.5] for t in field_names)]

ai_in_fields_total = pd.concat(ai_counts_by_field,axis=1).fillna(0)
ai_in_fields_total.columns = field_names

In [None]:
#Fields we leave out of the average
excluded_fields = ['field_machine_learning_data','field_statistics_probability','field_informatics']
remaining_fields = [x for x in field_names if x not in excluded_fields]

#Average growth rate over the last five rows (years) in every field, averaged over the remaining fields
recent_field_growth = ai_in_fields_total.apply(make_growth_rate).iloc[-5:].mean()
g_non_ai = recent_field_growth.loc[remaining_fields].mean()


In [None]:
#For every surveillance topic, a table with the number of papers per year in the topic (counts, not shares) for each group of papers
surv_groups_total = ['china','not_free_not_china','All']

surv_trends_total = [
    pd.concat(
        [trend_in_topic(analysis_fin.loc[analysis_fin[var]==True],topic=t,name=var,threshold=0.05,normalize='none')
         for var in surv_groups_total],axis=1)
    for t in surv_topics[:5]]


In [None]:
#Average yearly growth rate (%) from 2014 onwards in the first surveillance topic, for each group of papers
surv_trends_total[0].apply(make_growth_rate).loc[2014:].mean()

In [None]:
#Average yearly growth rate (%) from 2014 onwards in the second surveillance topic, for each group of papers
surv_trends_total[1].apply(make_growth_rate).loc[2014:].mean()

**Examples**

In [None]:
def get_example(df,number,length):
    '''
    Prints the beginning of a random selection of abstracts
    
    Args:
        df is the dataframe we want to use; it needs an 'abstract' column
        number is the number of examples we want (they are sampled without replacement from the index of df)
        length is the number of characters of every abstract that we print
    
    '''
    
    sampled_ids = random.sample(list(df.index),number)
    
    sampled_abstracts = df.loc[sampled_ids]['abstract']
    
    for abstract in sampled_abstracts:
        
        #The extra line break separates the examples
        print(abstract[:length])
        print('\n')
    

In [None]:
# for x in ['field_astrophysics','field_biological','field_complex_systems','field_materials_quantum','field_societal']:
    
#     print(x)
#     print('====')
    
#     d = arx.loc[(arx['is_ai']==True) & (arx['top_field']==x)]
    
#     get_example(d,5,1000)