# Regional analysis

Here we focus on the regional analysis. As part of this we look at:

* Levels of concentration and its evolution at the subnational level for all arXiv, AI and SotA topics
* Detailed evolution of concentration in the UK
* Comparison with distribution of automation in England

## 0. Preamble

In [None]:
%run notebook_preamble.ipy

In [None]:
# Ignore future warnings (for when I concatenate dfs)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Other imports

In [None]:
import random

from statsmodels.api import OLS, Logit
from statsmodels.tools.tools import add_constant
from scipy.stats import zscore
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from scipy.stats import entropy
import seaborn as sns


### Functions

Add a bunch of exogenous variables to the analysis df

In [None]:
#Generic functions
def save_fig(name,path='../reports/figures/paper_rev/'):
    '''
    Tightens the layout of the current matplotlib figure and saves it.

    Args:
        name (str) is the file name of the figure (callers include the extension)
        path (str) is the prefix (directory) where the figure is saved

    The file name is prefixed with the global `today_str`, which is not defined in this block
    (presumably it comes from the notebook preamble).
    '''
    plt.tight_layout()

    file_name = f'{today_str}_{name}'
    plt.savefig(path+file_name)

def flatten_list(my_list):
    '''
    Flattens a nested list by one level.

    Args:
        my_list is an iterable of iterables

    Returns a single list with the elements of every inner iterable, in order.
    '''
    flat = []

    for inner in my_list:
        flat.extend(inner)

    return(flat)


def get_example(df,number,length):
    '''
    Prints the beginning of randomly chosen abstracts.

    Args:
        df is the dataframe we sample from (it needs an 'abstract' column)
        number is the number of examples we want (sampled without replacement from the index)
        length is the number of characters of each abstract that we print

    '''
    sampled_ids = random.sample(list(df.index),number)
    abstracts = df.loc[sampled_ids]['abstract']

    for text in abstracts:
        print(text[:length])
        print('\n')
        
def flatten_freq(nested_list):
    '''
    Counts how many times each element appears across all the inner lists of a nested list.

    Args:
        nested_list is a list of lists

    Returns a pandas Series with the count of each element (the output of value_counts).
    '''
    flat_series = pd.Series(flatten_list(nested_list))

    return(flat_series.value_counts())


In [None]:
def make_tidy_lookup(names_list,length=False):
    '''
    Creates a cheap lookup between names and a tidy version of them
    (underscores replaced with spaces and the first letter capitalised).

    Args:
        names_list (list) is the list of names we want to tidy
        length (int or False) is the maximum number of characters to keep in each tidy name.
            With False (the default) or None the tidy names are not truncated.

    Returns a dict where the keys are the original names and the values are the tidy names.

    '''
    #An identity check, so that only False / None switch the truncation off
    keep_all = length is False or length is None

    out = {}

    for name in names_list:
        tidy = name.replace('_',' ').capitalize()
        out[name] = tidy if keep_all else tidy[:length]

    return(out)



In [None]:
def cross_sectional_comp(df,variable,topics,threshold):
    '''
    Compares, topic by topic, the share of papers where a boolean variable is True
    depending on whether the topic is present in the paper or not.

    Args:
        df is the dataframe we are using (rows = papers and columns = variables and metadata)
        variable is the (boolean) variable we are using for the comparison
        topics is the list of topics where we want to compare
        threshold is the weight above which we consider that a paper is in a topic

    Returns a df with one row per topic. It contains the share of papers with variable == True
    when the topic is absent ('<variable>_false') and when it is present ('<variable>_true'),
    and the relative 'difference' between both. Rows are sorted by the difference (descending).

    '''
    shares_by_topic = []

    for topic in topics:
        #Column-normalised crosstab: distribution of the variable among papers without / with the topic
        has_topic = df[topic]>threshold
        shares = pd.crosstab(df[variable],has_topic,normalize='columns')

        #We only keep the row where the variable is True
        shares_by_topic.append(shares.loc[True,:])

    #Topics in the rows, topic absent / present in the columns
    group_counts = pd.concat(shares_by_topic,axis=1)
    group_counts.columns = topics
    group_counts = group_counts.T
    group_counts.columns = [variable+f'_{suffix}' for suffix in ['false','true']]

    #Relative difference between the share when the topic is present and when it is absent
    absent,present = group_counts.iloc[:,0],group_counts.iloc[:,1]
    group_counts['difference'] = (present/absent)-1

    return(group_counts.sort_values('difference',ascending=False))

def topic_regression(df,target_list,exog,controls,model,binarise=False,standardise=True,cov='HC1'):
    '''

    This function regresses topic weights (or their binarisation) on predictors.

    Arguments:
        -df: dataframe with the variables
        -target_list: target variables. This is a list we loop over (one regression per target).
        -exog: list with the exogenous variable (its coefficient and p-value are cast with float(),
            so presumably a list with a single variable name)
        -controls: list of control variables
        -model: model type (the statsmodels class, eg OLS or Logit). TODO fix the logit
        -binarise: in case we are using logit. If not False, the value is the threshold
            (a threshold of 0 is valid).
            TODO when we binarise the highly detailed models, some of them become all zeros. This will work better
            with the more aggregate topics
        -standardise: if True we log and standardise (z-score) the topic weights
        -cov: the covariance type passed to the model fit

    Returns
        -A list with two elements: the list of statsmodels summaries (one per target) and a df with the
        coefficient and p-value of exog and the (pseudo) r-square of each regression, sorted by coefficient
        (descending). The df is empty if target_list is empty.

    '''

    #Drop rows with missing values - sm doesn't like them
    #We copy so the transformations below never write into a view of the input df
    df_2 = df[target_list+exog+controls].dropna(axis=0).copy()

    #Standardise targets?
    if standardise==True:
        df_2[target_list] = (np.log(df_2[target_list]+0.00000001)).apply(zscore).astype(float)

    #Binarise targets if we are doing a logit
    #Identity check: `0 != False` is False, so a threshold of 0 would otherwise be skipped silently
    if binarise is not False:
        df_2[target_list] = (df_2[target_list]>binarise).astype(float)

    #Extract the exogenous and controls, add constant and cast as float
    exog_controls = add_constant(df_2[exog+controls]).astype(float)

    #OLS reports an r-square, the other models a pseudo r-square
    is_ols = model == OLS
    fit_label = 'r_square' if is_ols else 'pr_square'

    #Container output
    out = []
    coeffs = []

    #One regression for each target
    for t in list(target_list):

        reg = model(endog=df_2[t],exog=exog_controls).fit(cov_type=cov,disp=0)

        out.append(reg.summary())

        fit_stat = reg.rsquared if is_ols else reg.prsquared
        coeffs.append(pd.Series([float(reg.params[exog]),float(reg.pvalues[exog]),float(fit_stat)],name=t))

    #Build the coefficient table once, after the loop
    coeff_columns = ['coefficient','p_value',fit_label]

    if coeffs:
        reg_coeff = pd.concat(coeffs,axis=1).T
        reg_coeff.columns = coeff_columns
    else:
        #No targets: return an empty table instead of failing on an undefined variable
        reg_coeff = pd.DataFrame(columns=coeff_columns)

    return([out,reg_coeff.sort_values('coefficient',ascending=False)])
        
       

def plot_regression_coefficients(df,var,cov='HC1',size=(8,6),ax=False,ncols=3):
    '''
    Plots the coefficients of OLS regressions of every topic on a predictor.

    Args:
        df is the dataframe with topics, predictor and controls
        var is the variable we use as predictor
        cov is the covariance type used when fitting the regressions
        size is the figure size (only used when no ax is passed)
        ax is the axis where we plot. If False, a new figure is created
        ncols is the number of columns in the legend

    The topics, controls and colour lookup are read from the globals `topics`, `controls` and `color_lookup`.

    '''

    #Pass the cov argument through instead of always using 'HC1'
    reg = topic_regression(df,topics,[var],controls,OLS,cov=cov)

    #Identity check so we never compare a matplotlib object (or array of axes) with False
    if ax is False:
        fig,ax = plt.subplots(figsize=size)

    plot_topic_bar(reg[1]['coefficient'],cl=color_lookup,ax=ax,ncols=ncols)

    ax.set_title(f'Regression coefficient using {var} as predictor')

def topic_comparison(df,target_list,exog,concept_lookup,quantiles=np.arange(0,1.1,0.2),thres=0):
    '''
    Compares levels of activity in groups of topics depending on an exogenous variable of interest.
    The topics are grouped by their quantile in each column of the concept lookup.

    Args:
        df is the dataframe with the topic mix and metadata
        target_list are the topics to consider
        exog is the variable to crosstab topics against
        concept_lookup is a df with topics in the index and one column per concept
            (the docs of the original describe the values as the median proximity of each topic to the concept)
        quantiles is how we discretise the concept lookup (default value is quintiles)
        thres is the weight above which we consider a topic as present

    Returns a dict with one df per concept: rows are the quantile groups of the concept, columns are the
    values of exog, and cells are the number of papers with the topics of that group present.

    '''

    def discretise(column):
        return(pd.qcut(column,q=quantiles,labels=False,duplicates='drop'))

    #Quantile position of each topic in every concept
    conc_discr = concept_lookup.apply(discretise)

    #Number of papers with each topic present, by value of the exog variable
    per_topic_counts = []

    for topic in target_list:
        per_topic_counts.append(pd.crosstab(df[exog],df[topic]>thres)[True])

    topic_distr = pd.concat(per_topic_counts,axis=1).T
    topic_distr.index = target_list

    #Merge the counts with the discretised lookup and melt so there is one row per topic and exog value
    combined = pd.concat([topic_distr,conc_discr],axis=1).reset_index(drop=False)
    id_columns = ['index']+list(conc_discr.columns)
    disc = pd.melt(combined,id_vars=id_columns)

    #One table per concept
    store = {}

    for concept in concept_lookup.columns:

        totals = disc.groupby([concept,'variable'])['value'].sum().reset_index(drop=False)
        store[concept] = pd.pivot_table(totals,index=concept,columns='variable',values='value')

    return(store)

def plot_topic_bar(table,cl,ax,ncols):
    '''
    Plots a bar per topic, coloured by the community of the topic.

    Args:
        table has topics in the index and the value to plot
        cl is the colour lookup between community names and colours
        ax is the plotting axis
        ncols is the number of columns in the legend

    The community of each topic is read from the globals `comms` and `comm_names`, and the legend
    handles from the global `patches`. Communities that are not in the colour lookup are light grey.

    '''
    cols = []

    for topic in table.index:
        community = comm_names[comms[topic]]
        cols.append(cl[community] if community in cl.keys() else 'lightgrey')

    table.plot.bar(color=cols,ax=ax,width=1)

    #The legend shows communities; the topic names in the x axis are removed
    ax.legend(handles=patches,ncol=ncols)
    ax.set_xticks([])
    ax.set_xticklabels([])
    
    
def calculate_entropy(df,categories,category):
    '''
    Calculates the entropy of each paper over a set of semantic variables (eg disciplines, communities or topics).
    The weights are normalised inside each paper before calculating the entropy.

    Arguments:
        df is the analysis df with relevant topics and metadata
        categories are the columns (topics) whose distribution we measure
        category is a label stored in the 'cat' column of the output

    Outputs
        A df with the entropy of each paper ('entropy') and the label ('cat')

    '''

    def to_shares(row):
        return(row/row.sum())

    #Normalise every row so it adds up to one
    norm = df[categories].apply(to_shares,axis=1)

    row_entropy = norm.apply(entropy,axis=1)

    ent = pd.DataFrame(row_entropy,columns=['entropy'])
    ent['cat']=category

    return(ent)

def make_exog(df,value_container,value,make_dummy=True):
    '''
    Creates an exogenous variable for modelling later.

    Argument:
        -df contains the variable where we want to find a value
        -value_container is the column where we want to look for the value (its cells are lists)
        -value is the value we are looking for
        -make_dummy: if True the new variable flags if the value is present. Otherwise it counts how many times it happens.

    Output
        -A copy of the df with the new column. The column is named after the value (lower case, spaces
        replaced with underscores). Cells of the container that are not lists give a missing value.

    '''

    df_2 = df.copy()

    #Create a tidy variable name
    column_name = re.sub(' ','_',value.lower())

    #What we extract from each list depends on whether we want a dummy or a count
    if make_dummy == True:
        def summarise(container):
            return(value in container)
    else:
        def summarise(container):
            return(container.count(value))

    #There are some missing values (non lists) which we keep as missing
    df_2[column_name] = [summarise(x) if type(x) is list else np.nan for x in df_2[value_container]]

    return(df_2)
    

def extract_topic_trend(df,cat,year_lims=[2000,2019]):
    '''
    Extracts the evolution of the yearly shares of the values of a category.

    Args:
        df: the usual dataframe (it needs a 'year' column)
        cat: the category we are interested in
        year_lims: first year to consider and the year where we stop (the last year is not included)

    Returns a df with years in the rows and the share of each value of cat in the columns.

    '''

    #Row-normalised crosstab: the shares of every year add up to one
    yearly_shares = pd.crosstab(df['year'],df[cat],normalize='index')

    first_year,stop_year = year_lims[0],year_lims[1]
    years = np.arange(first_year,stop_year)

    return(yearly_shares.loc[years])

def plot_topic_trend(df,cat,topics,ax,cmap,year_lims=[2000,2019],threshold=0.05,focus_topics=False,alpha=0.2):
    '''
    Plots topic trends (yearly share of papers in a topic where a category is True), smoothed with a
    4-year rolling mean.

    Args:
        df the usual dataframe
        cat: the (boolean) category of interest
        topics: topics we want to display
        ax: the axis where we plot
        cmap: name of the colour map used for the focus topics
        year_lims: first year to consider and the year where we stop
        threshold: the weight above which a paper is considered to be in a topic
        focus_topics: if not False, the list of topics to highlight. The rest are plotted in grey
        alpha: transparency of the topics that are not highlighted

    '''
    activity = []
    names = []

    #Use a loop to deal with cases where a category has no activity in a topic
    for t in topics:
        try:
            levels = extract_topic_trend(df.loc[df[t]>threshold],cat,year_lims)
            activity.append(levels[True])
            names.append(t)

        except KeyError:
            #The topic has no True column for the category or misses some of the years: skip it.
            #Any other exception is a real problem so we let it propagate.
            continue

    topic_trends = pd.concat(activity,axis=1).fillna(0)
    topic_trends.columns = names

    if focus_topics is not False:

        topic_lookup = {name:val for val,name in enumerate(focus_topics)}

        #Color map (plt.cm.get_cmap is no longer available in recent matplotlib versions)
        cols = plt.get_cmap(cmap)

        #Create a vector of colors: grey for the topics outside the focus
        cols_to_show = [(0.5,0.5,0.5,alpha) if v not in topic_lookup else cols(topic_lookup[v]) for v in topic_trends.columns]

        #Plot
        (100*topic_trends.rolling(window=4).mean().dropna()).plot(color=cols_to_show,ax=ax,linewidth=3)

        #Fix the legend to focus on key topics (labels are cut at 50 characters)
        hand,labs = ax.get_legend_handles_labels()
        focus_pairs = [(h,l) for h,l in zip(hand,labs) if l in focus_topics]

        ax.legend(bbox_to_anchor=(1,1),handles = [h for h,l in focus_pairs],
                  labels=[l[:50] for h,l in focus_pairs])

    else:

        #NOTE(review): this branch plots shares while the focus branch plots percentages (x100) - confirm this is intended
        topic_trends.rolling(window=4).mean().dropna().plot(ax=ax)
        ax.legend(bbox_to_anchor=(1,1))
    

    

    

In [None]:
def get_university_industry_collab_trends(df,variable,topic,threshold=0.05):
    '''
    Study university industry collaborations

    Args:
        df as usual (it needs a 'year' column)
        variable is the (boolean) collaboration variable we want to study
        topic the topic
        threshold is the threshold for accepting a paper in a topic

    Returns a series with the yearly percentage of papers in the topic where the variable is True.

    '''

    df_with_topic = df.loc[df[topic]>threshold]

    #Use the variable passed by the caller rather than a fixed column name
    yearly_shares = 100*pd.crosstab(df_with_topic['year'],df_with_topic[variable],normalize=0)

    topic_collabs = yearly_shares[True]

    return(topic_collabs)
    

## 1. Load data

`analysis_pack` contains the metadata and data that we serialised at the end of the `06` data integration notebook.

This includes:

* Community names for the communities (`index->community name`)
* Community indices for topics (`topic -> community index`)
* Filtered topic names (`topic names`)
* Network object with topic co-occurrences
* Analysis df
* arx is the enriched arXiv dataset



In [None]:
#Load the serialised analysis pack (a pickle file)
#NOTE(review): pickle.load can run arbitrary code, so this file should only ever come from our own pipeline
with open('../data/processed/24_8_2019_analysis_pack.p','rb') as infile:
    analysis_pack = pickle.load(infile)

In [None]:
#Unpack the first six elements of the analysis pack (by position):
#community names, community indices, topic names, topic network, analysis df and enriched arXiv data
comm_names,comms,topics,network,data,arx = (analysis_pack[position] for position in range(6))

In [None]:
#Geocoded arXiv data (a zip-compressed csv). The article ids are read as strings
arx_geo = pd.read_csv('../data/processed/26_8_2019_grid_geo_admin_all.csv',compression='zip',dtype={'article_id':str})

In [None]:
#Lookups and lists used in the charts below

#Colour of each topic community
color_lookup = dict(
    deep_learning='blue',
    robotics_agents='cornflowerblue',
    computer_vision='aqua',
    symbolic='red',
    health='lime',
    social='forestgreen',
    technology='magenta',
    statistics='orange',
    language='yellow',
)

#These are the field names
field_names = [
    'field_astrophysics',
    'field_biological',
    'field_complex_systems',
    'field_informatics',
    'field_machine_learning_data',
    'field_materials_quantum',
    'field_mathematical_physics',
    'field_mathematics_1',
    'field_mathematics_2',
    'field_optimisation',
    'field_particle_physics',
    'field_physics_education',
    'field_societal',
    'field_statistics_probability',
]

#Topics that we use to identify state of the art (SotA) AI papers
core_ai_topics = [
    'cnn-convolutional_neural_networks-cnns-convolutional_neural_network-convolutional_neural_network_cnn',
    'recurrent-lstm-rnn-recurrent_neural_network-recurrent_neural_networks',
    'reinforcement_learning-policy-policies-reward-deep_reinforcement_learning',
    'translation-neural_machine_translation-machine_translation-translate-translations',
    'latent-generative_model-generative-generative_models-latent_variables',
    'training-trained-deep_learning-deep-train',
]

#Create tidy field names for legend etc (the first six characters, 'field_', are removed)
tidy_field_lookup = {field:re.sub('_',' ',field[6:]).capitalize() for field in field_names}

#Unique community names excluding the mixed community
community_names = [community for community in set(comm_names.values()) if community!='mixed']

tidy_comms_lookup = make_tidy_lookup(community_names)

#Legend handles: one patch per community in the colour lookup
patches = [
    mpatches.Patch(facecolor=colour, label=tidy_comms_lookup[community],edgecolor='black')
    for community,colour in color_lookup.items()
]

### b. Minor processing

Add AI and SotA topic labels to the geo data


In [None]:
#Ids of the papers flagged as AI in the arXiv data
ai_ids = set(arx.loc[arx['is_ai']==True]['paper_id'])

#Ids of the modelled papers where at least one of the core AI topics has a weight above 0.05.
#This is a vectorised comparison: it gives the same mask as checking every row in a Python loop, without the loop
sota_ids = set(data.loc[(data[core_ai_topics]>0.05).any(axis=1)].index)

#These are the IDS of the datasets that we have modelled

modelled_ai = set(data.index)

In [None]:
#Lookup between paper ids and years (used below to label the geo-coded data with years)

arx_year = arx[['paper_id','year']]

In [None]:
#Label the geo-coded df with AI and SoTa
#Here we need to turn the not-modelled paper ids into missing:
#an article gets True / False if its id is in the modelled set, and a missing value otherwise

arx_geo['has_ai'],arx_geo['has_sota'] = [[x in relevant_set if x in modelled_ai else np.nan for x in arx_geo['article_id']] for relevant_set in [ai_ids,sota_ids]]

#Keep the rows that are not flagged as multinational
arx_geo = arx_geo.loc[arx_geo['is_multinational']==False]

In [None]:
#Add the year of each paper. pd.merge does an inner join by default, so rows without a match are dropped
arx_geo = pd.merge(arx_geo,arx_year,left_on='article_id',right_on='paper_id')

In [None]:
#Keep one row per article and city-country pair, with the variables we use in the analysis.
#The index is reset, so it no longer matches the index of arx_geo
arx_geo_ded = arx_geo.drop_duplicates(['article_id','city_country'])[[
    'article_id','city_country','name_en','institute_country','has_ai','has_sota','year']].reset_index(drop=True)

In [None]:
#Years as integers (top_concentration compares them with integer years)
arx_geo_ded['year'] = arx_geo_ded['year'].astype(int)

In [None]:
#Quick look at the de-duplicated geo data
arx_geo_ded.head()

In [None]:
#Number of rows in the de-duplicated geo data (in millions)
arx_geo_ded.shape[0]/1e6

In [None]:
#Top 10 `name_en` values by their share (%) of the AI rows.
#The mask is built from arx_geo_ded itself: arx_geo has a different row index, so its mask would be
#aligned on index labels and select the wrong rows
100*arx_geo_ded.loc[arx_geo_ded['has_ai']==1]['name_en'].value_counts(normalize=True)[:10]

In [None]:
#Top 10 `name_en` values by their share (%) of the SotA rows.
#The mask is built from arx_geo_ded itself: arx_geo has a different row index, so its mask would be
#aligned on index labels and select the wrong rows
100*arx_geo_ded.loc[arx_geo_ded['has_sota']==1]['name_en'].value_counts(normalize=True)[:10]

## 2. Analysis

What are we going to do?

* Study levels and evolution of sub-national AI concentration
* Consider the UK


### Concentration analysis

**Steps**

* What is the level of subnational concentration of AI research and how has it evolved over time?

In [None]:
def top_concentration(df,agg_cat,var='All',thres=2015,n=3):
    '''

    Measures how concentrated activity is across the values of a category.

    Args:
        df (df) a dataframe where every row is an observation with a category whose distribution we want to study
            (it needs a 'year' column).
        agg_cat is the variable whose distribution we want to study
        var is a variable to subset the dataframe (we assume this is a boolean). With 'All' we use all rows
        thres is not used in the function
        n is the number of top categories used to calculate the change in concentration

    Returns a list with:
        -The Herfindahl index of agg_cat (sum of the squares of the shares)
        -The shares of the values of agg_cat ranked from largest to smallest (the index is the rank)
        -The % change between 2017 and 2018 in the share accounted for by the top n values of each year

    '''

    #Make copy
    df_2 = df.copy()

    #If we want to focus on a particular variable
    if var!='All':
        df_2 = df_2.loc[df_2[var]==True]

    #Shares of activity, sorted from largest to smallest
    shares = df_2[agg_cat].value_counts(normalize=True)

    #Herfindahl index
    gr = np.sum([share**2 for share in shares])

    top = shares.reset_index(drop=True)

    def top_n_share(year):
        in_year = df_2.loc[(df_2['year']==year)]
        return(in_year[agg_cat].value_counts(normalize=True)[:n].sum())

    #Concentration change between the two years
    p_1 = top_n_share(2018)
    p_2 = top_n_share(2017)

    ch = (100*p_1/p_2)-100

    return([gr,top,ch])
    

In [None]:
#The ten countries with the most rows in the de-duplicated geo data
top_10_c = arx_geo_ded['institute_country'].value_counts().index[:10]

In [None]:
#Concentration of activity across `name_en` values inside each top country,
#for all papers, AI papers and SotA papers. The result is a list (one element per category)
#of lists (one element per country) with the outputs of top_concentration
categories = ['All','has_ai','has_sota']
conc_countries = [[top_concentration(arx_geo_ded.loc[arx_geo_ded['institute_country']==c],'name_en',var) for c in top_10_c] for var in categories]

In [None]:
def plot_concentration_bar(df,ax):
    '''

    Plots a stacked horizontal bar per country with the share of activity of each of its regions,
    and a vertical mark at the share accumulated by its top 3 regions.

    Args:
        df is a df with the information to plot (countries in the columns, ranked region shares in the rows)
        ax is the axis where we plot

    Returns the countries in the order in which they are plotted (ascending share of the top 3 regions).

    '''

    #Shares as percentages
    shares = 100*df

    #Sorts by the countries with the largest share of activity accounted by the top 3 regions
    top_3_cuml = shares.loc[0:2].sum().sort_values()
    sort_countries = top_3_cuml.index

    by_country = shares[sort_countries].T

    by_country.plot.barh(cmap='Reds',stacked=True,legend=False,edgecolor='grey',width=0.75,ax=ax)

    #Mark the cumulative share of the top 3 regions on top of each bar (the bars are 0.75 wide)
    for position,country in enumerate(by_country.index):

        ax.vlines(x=top_3_cuml[country],ymin=position-0.375,ymax=position+0.375,linewidth=3,color='black')

    return(sort_countries)

        


In [None]:
#Figure with one row per corpus (all arXiv, AI and SotA). The left panel shows the regional shares of
#activity in each country and the right panel the change in the share accounted by the top 3 regions
fig,ax = plt.subplots(figsize=(10,10),nrows=3,ncols=2,
                      sharex='col',sharey='row',gridspec_kw={'width_ratios':[1,0.3]})

name = ['ArXiv','AI','SotA']

#NOTE(review): `out` is not used in the rest of this cell
out = []

for n in np.arange(0,3):
    #Ranked regional shares (second output of top_concentration), with one column per country
    c = pd.concat([x[1] for x in conc_countries[n]],axis=1)
    c.columns = top_10_c
    
    #This also outputs the columns so we can rearrange the changes in the same order as the bars
    cols = plot_concentration_bar(c,ax=ax[n][0])
    
    ax[n][0].set_title(name[n])
    
    
    #Change in concentration (third output of top_concentration), sorted like the bars
    ch = pd.Series([x[2] for x in conc_countries[n]])
    ch.index = top_10_c
    
    ch = ch[cols]
    
    #One marker per country: it points right if the change is positive and left otherwise
    for num,ind in enumerate(ch):
        ax[n][1].scatter(ind,num,color='coral',edgecolor='grey',s=50,
                    marker = '>' if ind>0 else '<' )
        
        ax[n][1].hlines(y=num,xmin=0,xmax=ind,color='grey',linestyle='-',linewidth=1)
    
    
    #Reference line at zero change
    ax[n][1].vlines(x=0,ymin=0,ymax=len(ch),linestyle=':',color='black',linewidth=1)
    
ax[2][0].set_xlabel('% of all activity accounted by region')
ax[2][1].set_xlabel('% change in % \n accounted by top 3')

plt.subplots_adjust(wspace=0.05)    

plt.tight_layout()


#This figure is saved directly with plt.savefig, so the file name has no date prefix
plt.savefig('../reports/figures/paper_rev/fig_15_subn_share.pdf')

In [None]:
#Mean across the top countries of the share of AI activity accounted by their top 3 regions
pd.concat([x[1] for x in conc_countries[1]],axis=1).loc[:2].sum().mean()

In [None]:
#Mean across the top countries of the share of SotA activity accounted by their top 3 regions
pd.concat([x[1] for x in conc_countries[2]],axis=1).loc[:2].sum().mean()

### Final analysis: places

We load the lookup between article ids and lads we created in `supp_6` and use it to study the geography of AI research in the UK.

More specifically, we want to create three charts:

* Concentration trends
* Concentration in AI 'core topics'
* Comparison between concentration of AI activity and areas at risk of automation


In [None]:
#Lookup between article ids and their LADs (the cell below reads the 'lad18cd' and 'lad18nm' keys of each entry)
with open('../data/processed/26_8_2019_arxiv_lads.json','r') as infile:
    lad_lookup = json.load(infile)

In [None]:
#Drop the papers without a country list
data_w_countries = data.dropna(axis=0,subset=['country_list'])

In [None]:
#Focus on papers in the UK. We include Australia because there was a mess-up with some of the geocoding
#A paper is kept if any of the two countries appears in its country list

data_uk = data_w_countries.loc[[any(var in x for var in ['United Kingdom','Australia']) for x in data_w_countries['country_list']]]

In [None]:
#Label papers with their lad codes and names
#Papers whose id (the index of data_uk) is not in the lookup get a missing value
data_uk['lad_code'],data_uk['lad_name'] = [[lad_lookup[x][var] if x in lad_lookup.keys() else np.nan for x in data_uk.index] for var in ['lad18cd','lad18nm']]

In [None]:
#Drop missing LADs for this analysis (papers whose id was not in the LAD lookup)
data_uk = data_uk.dropna(axis=0,subset=['lad_name'])

### Point one: Geographical trends

In [None]:
#All the LADs in the data, sorted by the number of times they appear.
#lad_name is flattened first, so every paper contributes each of the LADs in its list
all_lads = pd.Series(flatten_list(data_uk['lad_name'])).value_counts().index

In [None]:
def plot_local_research_concentration(df,top_n,ax,subset_topics=False,lad_list = all_lads,year_lims=[2000,2019]):
    '''
    This function plots the concentration of research activity in LADs: the yearly share (%) of LAD
    appearances accounted by the top LADs, smoothed with a 3-year rolling mean.

    Args:
        df (df) is the df with papers and lads (so this will have been processed as above)
        top_n (int) is how many of the lads do we want to show
        ax is the axis
        subset_topics (list) is a list where the first element is the list of topics (or communities) we want to focus on; the second is the threshold for inclusion
        lad_list (list) is the list of LADs to consider
        year_lims is the years to consider (the last year is not included)

    '''

    if subset_topics!=False:
        topic_cols,min_weight = subset_topics[0],subset_topics[1]
        df = df.loc[df[topic_cols].apply(lambda x: any(x>min_weight),axis=1)]

    #Number of papers per year with a presence in each LAD
    yearly_counts = []

    for lad in lad_list:
        in_lad = [lad in x for x in df['lad_name']]
        yearly_counts.append(df.loc[in_lad]['year'].value_counts())

    #value_counts sorts by frequency, so the years can come out of the concat in any order.
    #We sort them because the rolling mean below has to run over consecutive years
    activity_year = pd.concat(yearly_counts,axis=1).fillna(0).sort_index()
    activity_year.columns = lad_list

    top_lads = activity_year.sum(axis=0).sort_values(ascending=False).index[:top_n]

    #Yearly shares, smoothed and expressed as percentages
    yearly_shares = activity_year.apply(lambda x: x/x.sum(),axis=1)
    smoothed = (100*yearly_shares.rolling(window=3).mean()).dropna()

    years = np.arange(year_lims[0],year_lims[1])

    smoothed.loc[years,top_lads].plot.bar(
        stacked=True,width=0.9,cmap='Accent',edgecolor='lightgrey',ax=ax)

    ax.legend(bbox_to_anchor=(1,1))

    
    

In [None]:
#Evolution of the share of activity accounted by the top 8 LADs (all papers in data_uk)
fig,ax = plt.subplots(figsize=(10,4))

plot_local_research_concentration(data_uk,8,ax=ax)

plt.tight_layout()

ax.set_ylabel('Share of all papers \n with LAD presence')
#ax.set_title('Evolution of local AI research activity in the UK (top 8 locations)')

plt.tight_layout()

#save_fig adds the date (today_str) to the file name
save_fig('fig_16_ai_research_all.pdf')

In [None]:
#Tower Hamlets involves Queen Mary University
#analysis_uk.loc[['Tower Hamlets' in x for x in analysis_uk['lad_name']]]['institute_list']

#### What about the core topics?

In [None]:
#Same chart as above, focusing on papers with at least one core AI topic above 0.05, from 2009 onwards
fig,ax = plt.subplots(figsize=(10,4))

plot_local_research_concentration(data_uk,8,ax=ax,subset_topics=[core_ai_topics,0.05],year_lims=[2009,2019])

plt.tight_layout()

ax.set_ylabel('Share of all papers with LAD presence')
#ax.set_title('Evolution of local AI research activity (state of the art AI topics) in the UK (top 8 locations)')

plt.tight_layout()

#save_fig adds the date (today_str) to the file name
save_fig('fig_17_ai_research_core.pdf')

In [None]:
#analysis_uk.loc[['Wokingham' in x for x in analysis_uk['lad_name']]]['institute_list']
#Wokingham is University of Reading

### Compare geography of AI activity and geography of automation

In [None]:
#Load automation data (the cells below use its lad_name, lad_code, aut_prob and number_high columns)
aut = pd.read_csv('../data/processed/19_7_2019_ons_automation_clean.csv',index_col=None)

In [None]:
def get_lad_activity(df,name,subset_topics=False):
    '''
    Extracts the distribution of activity by LAD.

    Args:
        df (df) with the data (lad_name contains the LADs of each paper)
        name (str) is the name of the variable (the column in the output)
        subset_topics (list) if not false, the topics to focus on and their threshold for inclusion

    Returns a df with the LADs in the global `all_lads` in the rows and a column with the number of
    papers with a presence in each of them.

    '''
    if subset_topics != False:
        topic_cols,min_weight = subset_topics[0],subset_topics[1]
        df = df.loc[df[topic_cols].apply(lambda x: any(x>min_weight),axis=1)]

    #One single-value series per LAD with its number of papers
    per_lad = []

    for lad in all_lads:
        n_papers = len(df.loc[[lad in x for x in df['lad_name']]])
        per_lad.append(pd.Series(n_papers,name=lad,index=[name]))

    counts = pd.concat(per_lad,axis=1).fillna(0).T

    return(counts)
    
    

In [None]:
# Combine automation data with AI

#Paper counts by LAD for all AI papers and for papers in the core AI topics
#NOTE(review): the threshold for the core topics is 0.02 here while the rest of the notebook uses 0.05 - confirm this is intended
ai_lad_counts = pd.concat([get_lad_activity(data_uk,name,topic_subset) for name,topic_subset in zip(['All AI','Core AI topics'],[False,[core_ai_topics,0.02]])],axis=1)

#Keep the LADs with a code in the automation data. LADs without AI activity get a zero
aut_ai = pd.concat([aut.set_index('lad_name'),ai_lad_counts],axis=1).dropna(axis=0,subset=['lad_code']).fillna(0)

aut_ai.head()

In [None]:
def benchmark_concentrations(df,ranking_var,quantiles,comparisons,ax):
    '''

    Plots the share (%) of the total of several variables accounted by groups of locations,
    where the groups are the quantiles of a ranking variable.

    Args:
        df is a table with automation and AI activity (one row per location)
        ranking_var is the variable we use to create the groups to analyse the distribution
        quantiles is the number of groups we create (or the quantile edges; it is passed to pd.qcut)
        comparisons are the variables we want to benchmark
        ax is the axis where we plot

    '''

    #Quantile group of every location
    ranks = pd.qcut(df[ranking_var],q=quantiles,labels=False)

    #Share of each location in the total of every variable
    shares = df[comparisons].apply(lambda col: col/col.sum())
    shares['aut_rank'] = ranks

    group_totals = shares.groupby('aut_rank')[comparisons].sum()

    (100*group_totals).plot.bar(ax=ax)
    

In [None]:
#Compare the distribution of workforce at risk of automation and AI activity by quartile of aut_prob
fig,ax = plt.subplots(figsize=(8,3.5))

comps = ['number_high','All AI','Core AI topics']
#Quantile edges for the quartiles: 0, 0.25, 0.5, 0.75 and 1
q = np.arange(0,1.1,0.25)

benchmark_concentrations(aut_ai,'aut_prob',q,comps,ax)

ax.set_xlabel('Workforce automation ranking (quartile)')
ax.set_ylabel('% of the total in the UK')

#The labels follow the order of the variables in comps
ax.legend(title='Variable',labels = ['Workforce with high risk of automation','AI research activity','AI state of the art activity'])
#ax.set_title('Distribution of AI activity and population at risk of automation')

plt.tight_layout()

#save_fig adds the date (today_str) to the file name
save_fig('fig_18_lad_comparison.pdf')

In [None]:
#Number of rows per `name_en` value in Canada (this uses arx_geo, ie before de-duplication)
arx_geo.loc[arx_geo['institute_country']=='Canada']['name_en'].value_counts()