# Country analysis

This is the country analysis that concludes our mapping of AI research.

We will load and process the data, analyse semantic differences between Free / Not Free countries, and study research trends in controversial, surveillance-enabling technologies.

## 0. Preamble

In [None]:
# Run the shared preamble notebook.
# NOTE(review): pd, np, re, json, pickle and today_str are used below without being
# imported or defined in this notebook, so they are presumably provided here - confirm.
%run notebook_preamble.ipy

In [None]:
# Silence FutureWarnings (they are raised when dataframes are concatenated)

import warnings
warnings.simplefilter('ignore',FutureWarning)

### Other imports

In [None]:
import random

from statsmodels.api import OLS, Logit
from statsmodels.tools.tools import add_constant
from scipy.stats import zscore
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from scipy.stats import entropy
import seaborn as sns


### Functions

Add a bunch of exogenous variables to the analysis df

In [None]:
#Generic functions
def save_fig(name,path='../reports/figures/paper_rev/'):
    '''
    Tightens the layout of the current figure and writes it to disk.
    
    Args:
        name is the file name (with extension); it is prefixed with today_str and an underscore
        path is the folder (with trailing slash) where the figure is written
    
    '''
    #Tidy up the layout first, as the saved file reflects the current layout
    plt.tight_layout()
    
    target = path+f'{today_str}_{name}'
    plt.savefig(target)

def flatten_list(my_list):
    '''
    Flattens one level of nesting: returns a single list with the elements of every
    iterable contained in my_list, in order.
    '''
    
    flat = []
    
    for inner in my_list:
        flat.extend(inner)
    
    return(flat)


def get_example(df,number,length):
    '''
    Prints the beginning of randomly chosen abstracts.
    
    Args:
        df is the dataframe to sample from (it needs an 'abstract' column)
        number is how many rows to sample (without replacement) from the index
        length is how many characters of each abstract to print
    
    '''
    
    #Sample index labels rather than positions
    sampled_ids = random.sample(list(df.index),number)
    
    abstracts = df.loc[sampled_ids]['abstract']
    
    for text in abstracts:
        
        print(text[:length])
        print('\n')
        
def flatten_freq(nested_list):
    '''
    
    Counts how often each element appears across all the inner lists of a nested list.
    
    Returns a pandas Series of counts (the output of value_counts).
    
    '''
    
    flat = pd.Series(flatten_list(nested_list))
    
    return(flat.value_counts())


In [None]:
def make_tidy_lookup(names_list,length=False):
    '''
    
    Creates a cheap lookup between names and tidy labels (underscores replaced with
    spaces, first letter capitalised and the rest lower-cased).
    
    Args:
        names_list (list) is the list of names we want to tidy
        length (int or False) is the maximum number of characters to keep in each tidy
            label. The default (False) keeps the whole label.
    
    Returns a dict {original name: tidy label}
    
    '''
    
    out = {}
    
    for name in names_list:
        
        tidy = name.replace('_',' ').capitalize()
        
        #Truncate only when a length was requested
        if length:
            tidy = tidy[:length]
        
        out[name] = tidy
    
    return(out)



In [None]:
def cross_sectional_comp(df,variable,topics,threshold):
    '''
    Compares how common a category is among papers with and without each topic.
    
    Args:
        df is the dataframe we are using (generally analysis_fin, with rows = papers and columns = variables and metadata)
        variable is the (True / False) variable we are using for the comparison
        topics is the topics where we want to compare (generally the community names)
        threshold is the weight above which a paper is considered to be in a topic
    
    Returns a df with one row per topic and three columns:
        <variable>_false: share of papers where variable is True among papers WITHOUT the topic
        <variable>_true: share of papers where variable is True among papers WITH the topic
        difference: (<variable>_true / <variable>_false) - 1
    Rows are sorted by difference (descending).
    
    '''
    
    shares_by_topic = []
    
    for topic in topics:
        
        has_topic = df[topic]>threshold
        
        #Columns (topic absent / present) are normalised to one; we keep the row where variable is True
        table = pd.crosstab(df[variable],has_topic,normalize=1)
        shares_by_topic.append(table.loc[True,:])
    
    #Topics in the rows, topic absent / present in the columns
    group_counts = pd.concat(shares_by_topic,axis=1)
    group_counts.columns = topics
    group_counts = group_counts.T
    
    group_counts.columns = [variable+'_false',variable+'_true']
    
    #Relative over (or under) representation of the category when the topic is present
    topic_absent = group_counts.iloc[:,0]
    topic_present = group_counts.iloc[:,1]
    group_counts['difference'] = (topic_present/topic_absent)-1
    
    return(group_counts.sort_values('difference',ascending=False))

def topic_regression(df,target_list,exog,controls,model,binarise=False,standardise=True,cov='HC1'):
    '''
    
    This function regresses topic weights (or their binarisation) on a predictor and controls.
    
    Arguments:
        -df: dataframe with the targets, the predictor and the controls as columns
        -target_list: target variables. One regression is fitted for each of them. 
        -exog: list with the name of the (single) exogenous variable of interest
        -controls: list with the names of the control variables
        -model: model class called as model(endog=..., exog=...), eg OLS or Logit. TODO fix the logit
        -binarise: False leaves the targets as they are. Any other value is the threshold above
            which a target is recoded as 1.0 (0.0 otherwise); a threshold of 0 is valid.
            TODO when we binarise the highly detailed models, some of them become all zeros. This will work better
            with the more aggregate topics
        -standardise: if True the targets are replaced with the z-score of their log (plus a 1e-8 offset)
        -cov: covariance type passed to the model fit
    
    Returns
        -A list with two elements: the list of statsmodels summaries (one per target) and a df with the
        coefficient and p-value of the exogenous variable and the (pseudo) r-square of each regression,
        sorted by coefficient. The fit column is 'r_square' for OLS and 'pr_square' otherwise.
    
    Raises
        -ValueError if exog does not contain exactly one variable
    
    '''
    
    target_list = list(target_list)
    
    #The reported coefficient is a single number per target, so we need a single predictor
    if len(exog)!=1:
        raise ValueError('exog must be a list with exactly one variable name')
    
    predictor = exog[0]
    
    #Drop rows with missing values - sm doesn't like them. We copy so the assignments below never touch df
    df_2 = df[target_list+exog+controls].dropna(axis=0).copy()
    
    #Standardise targets?
    if standardise==True:
        df_2[target_list] = (np.log(df_2[target_list]+0.00000001)).apply(zscore).astype(float)
    
    #Binarise targets if we are doing a logit. We compare with 'is' so a threshold of 0 is not read as False
    if binarise is not False:
        df_2[target_list] = (df_2[target_list]>binarise).astype(float)
    
    #Extract the exogenous and controls, add constant and cast as float
    exog_controls = add_constant(df_2[exog+controls]).astype(float)
    
    #OLS results expose rsquared, the other models we use expose a pseudo r-square
    if model == OLS:
        fit_attribute,fit_column = 'rsquared','r_square'
    else:
        fit_attribute,fit_column = 'prsquared','pr_square'
    
    #Container output
    out = []
    coeffs = []
    
    #One regression for each target
    for t in target_list:
        
        reg = model(endog=df_2[t],exog=exog_controls).fit(cov_type=cov,disp=0)
        
        out.append(reg.summary())
        
        coeffs.append(pd.Series([float(reg.params[predictor]),
                                 float(reg.pvalues[predictor]),
                                 float(getattr(reg,fit_attribute))],name=t))
    
    #Build the coefficient table once, after all the regressions have run
    if coeffs:
        reg_coeff = pd.concat(coeffs,axis=1).T
        reg_coeff.columns = ['coefficient','p_value',fit_column]
    else:
        reg_coeff = pd.DataFrame(columns=['coefficient','p_value',fit_column])
    
    return([out,reg_coeff.sort_values('coefficient',ascending=False)])
        
       
def plot_regression_coefficients(df,var,cov='HC1',size=(8,6),ax=False,ncols=3):
    '''
    Plots, for every topic, the OLS coefficient of a predictor as a bar coloured by community.
    
    Uses the notebook-level topics, controls and color_lookup.
    
    Args:
        df is the dataframe with topics, predictor and controls
        var is the variable we use as predictor
        cov is the covariance type passed on to topic_regression
        size is the figure size (only used when no axis is given)
        ax is the axis to plot in. If False a new figure is created
        ncols is the number of columns in the legend
    
    '''
    
    reg = topic_regression(df,topics,[var],controls,OLS,cov=cov)
    
    #Identity check: comparing an axis (or an array of axes) with == is not reliable
    if ax is False:
        fig,ax = plt.subplots(figsize=size)

    plot_topic_bar(reg[1]['coefficient'],cl=color_lookup,ax=ax,ncols=ncols)

    ax.set_title(f'Regression coefficient using {var} as predictor')

def topic_comparison(df,target_list,exog,concept_lookup,quantiles=np.arange(0,1.1,0.2),thres=0):
    '''
    Compares levels of activity in topics, grouped by how close the topics are to a set of
    concepts, across the values of an exogenous variable.
    
    Args:
        df has the topic mix and metadata
        target_list are the topics to consider
        exog is the variable to crosstab topics against
        concept_lookup is a df with topics in the index and one column per concept (eg the median proximity of each topic to the concept)
        quantiles is how we discretise each concept column (default value is quintiles)
        thres is the weight above which a topic is considered present in a paper
    
    Returns a dict {concept: df} where rows are the quantile bins of the concept, columns are the
    values of exog and cells are the number of papers with a topic in that bin.
    
    '''
    
    def _discretise(column):
        return(pd.qcut(column,q=quantiles,labels=False,duplicates='drop'))
    
    #Quantile bin of every topic for every concept
    conc_discr = concept_lookup.apply(_discretise)
    
    #Number of papers with each topic, by value of the exogenous variable
    counts = [pd.crosstab(df[exog],df[t]>thres)[True] for t in target_list]
    topic_distr = pd.concat(counts,axis=1).T
    topic_distr.index = target_list
    
    #Long table: one row per topic and value of the exogenous variable, with the concept bins attached
    merged = pd.concat([topic_distr,conc_discr],axis=1).reset_index(drop=False)
    disc = pd.melt(merged,id_vars=['index']+list(conc_discr.columns))
    
    store = {}
    
    for c in concept_lookup.columns:
        
        totals = disc.groupby([c,'variable'])['value'].sum().reset_index(drop=False)
        
        store[c] = pd.pivot_table(totals,index=c,columns='variable',values='value')
    
    return(store)

def plot_topic_bar(table,cl,ax,ncols):
    '''
    Bar chart of a value by topic, with bars coloured by the community of each topic.
    
    Uses the notebook-level comms (topic -> community index), comm_names (index -> name) and
    patches (legend handles).
    
    Args:
        table has topics in the index and the value to plot
        cl is the colour lookup (community name -> colour). Other communities are drawn in light grey
        ax is the plotting axis
        ncols is the number of columns in the legend
    
    '''
    
    cols = []
    
    for topic in table.index:
        
        community = comm_names[comms[topic]]
        
        cols.append(cl[community] if community in cl else 'lightgrey')
    
    table.plot.bar(color=cols,ax=ax,width=1)
    
    #One bar per topic is too many to label: drop the ticks and rely on the community legend
    ax.legend(handles=patches,ncol=ncols)
    ax.set_xticks([])
    ax.set_xticklabels([])
    
    
def calculate_entropy(df,categories,category):
    '''
    Calculates, for every paper, the entropy of its distribution over a set of semantic
    variables (eg disciplines, communities or topics). Weights are normalised inside each paper first.
    
    arguments:
        df is the analysis df with relevant topics and metadata
        categories are the columns that make up the distribution
        category is a label stored in the 'cat' column of the output
        
    outputs
        A df with the index of df and two columns: 'entropy' and 'cat'
    
    '''
    
    def _to_shares(weights):
        return(weights/weights.sum())
    
    def _row_entropy(shares):
        return(entropy(shares))
    
    #Normalise every row so that it adds up to one
    norm = df[categories].apply(_to_shares,axis=1)
    
    row_entropies = norm.apply(_row_entropy,axis=1)
    
    ent = pd.DataFrame(row_entropies,columns=['entropy'])
    ent['cat'] = category
    
    return(ent)

def make_exog(df,value_container,value,make_dummy=True):
    '''
    Adds a column that flags (or counts) a value inside a column of lists. Used to create
    exogenous variables for modelling later.
    
    Argument:
        -df contains the column where we want to find a value
        -value_container is the column (of lists) where we look for the value
        -value is the value we are looking for
        -make_dummy: if True the new column says whether the value is present. Otherwise it counts how many times it appears.
        
    Output
        -A copy of df with the new column, named after the value (lower case, spaces replaced with underscores).
         Rows where the container is not a list get a missing value.
    
    '''
    
    df_2 = df.copy()
    
    #Create a tidy variable name
    column_name = re.sub(' ','_',value.lower())
    
    #Choose how we measure the value inside each list
    if make_dummy == True:
        
        def measure(container):
            return(value in container)
    
    else:
        
        def measure(container):
            return(container.count(value))
    
    #Some rows have missing values instead of lists: they stay missing
    df_2[column_name] = [measure(x) if type(x)==list else np.nan for x in df_2[value_container]]
        
    return(df_2)
    

def extract_topic_trend(df,cat,year_lims=[2000,2019]):
    '''
    Extracts the evolution of the yearly shares of papers by value of a category
    
    Args:
        df: the usual dataframe (it needs a 'year' column)
        cat: the category we are interested in
        year_lims: first year to consider and the year after the last one (the upper limit is excluded)
    
    Returns a df with one row per year in the range and one column per value of cat. Every row
    adds up to one. Years without any paper are kept as rows of missing values.

    '''
    
    out = pd.crosstab(df['year'],df[cat],normalize=0)
    
    #We reindex instead of using .loc because .loc raises a KeyError if any year in the range is missing
    years = np.arange(year_lims[0],year_lims[1])
    
    return(out.reindex(years))

def plot_topic_trend(df,cat,topics,ax,cmap,year_lims=[2000,2019],threshold=0.05,focus_topics=False,alpha=0.2):
    '''
    Plots topic trends (yearly share of the papers in a topic that belong to a category),
    smoothed with a four year rolling mean.
    
    Args:
        df the usual dataframe
        cat: the category of interest
        topics: topics we want to display
        ax: the axis where we plot
        cmap: name of the colour map used for the focus topics
        year_lims: passed to extract_topic_trend
        threshold: weight above which a paper is considered to be in a topic
        focus_topics: False, or the list of topics to highlight. When given, shares are plotted as
            percentages, the other topics are drawn in grey and only focus topics are in the legend
        alpha: transparency of the lines for non-focus topics
    
    '''
    activity = []
    names = []
    
    #Use a loop to deal with cases where a category has no activity in a topic
    for t in topics:
        try:
            levels = extract_topic_trend(df.loc[df[t]>threshold],cat,year_lims)
            activity.append(levels[True])
            names.append(t)
        
        #Missing labels (eg no True column) or empty tables: skip the topic. Anything else is a real error
        except (KeyError,ValueError):
            continue
        
        
    topic_trends = pd.concat(activity,axis=1).fillna(0)
    topic_trends.columns = names
    
    #Identity check so that lists or arrays of focus topics are handled safely
    if focus_topics is not False:
        
        topic_lookup = {name:val for val,name in enumerate(focus_topics)}

        #Color map (plt.get_cmap is available across matplotlib versions)
        cols = plt.get_cmap(cmap)

        #Create a vector of colors: grey for the background topics, a colour from the map for focus topics
        cols_to_show = [(0.5,0.5,0.5,alpha) if v not in topic_lookup else cols(topic_lookup[v]) for v in topic_trends.columns]

        #Plot
        (100*topic_trends.rolling(window=4).mean().dropna()).plot(color=cols_to_show,ax=ax,linewidth=3)

        #Fix the legend to focus on key topics (labels are cut at 50 characters)
        hand,labs = ax.get_legend_handles_labels()
        focus_entries = [(h,l) for h,l in zip(hand,labs) if l in focus_topics]

        ax.legend(bbox_to_anchor=(1,1),handles=[h for h,l in focus_entries],
                  labels=[l[:50] for h,l in focus_entries])
    
    else:

        topic_trends.rolling(window=4).mean().dropna().plot(ax=ax)
        ax.legend(bbox_to_anchor=(1,1))
    

    

#Labels for the three groups compared in this notebook.
#NOTE(review): this variable is not referenced in any of the cells visible here - confirm it is still needed
labels=['China','Not free excluding China','All']

In [None]:
def make_growth_rate(series):
    '''
    This function creates a growth rate (in %) for a series
    
    Every value is divided by the previous value. The first element has no previous
    value so it is nan. Divisions by zero are nan too.
    
    Returns a list with the same length as the series
    
    '''
    
    values = list(series)
    
    if not values:
        return([])
    
    growth_rate = [np.nan]
    
    #Walk over (previous, current) pairs
    for previous,current in zip(values,values[1:]):
        
        if previous!=0:
            growth_rate.append(100*((current/previous)-1))
        else:
            growth_rate.append(np.nan)

    return(growth_rate)
    

In [None]:
def trend_in_topic(df,topic,name,threshold=0.05,year_lim=[2005,2019],normalize=0):
    '''
    This returns the yearly trend of activity in a topic
    
    Args:
        df is the df (it needs a 'year' column)
        topic is the topic of interest
        name is the name given to the output series
        threshold is the weight above which a paper is considered to be in the topic
        year_lim is the first year to consider and the year after the last one (the upper limit is excluded)
        normalize is passed to pd.crosstab (the default, 0, gives the share of each year's papers
            that are in the topic). Use 'none' to get paper counts instead
    
    Returns a series indexed by year. Years without papers are filled with zero.
    
    '''
    
    in_topic = df[topic]>threshold
    
    if normalize!='none':
        trend = pd.crosstab(df['year'],in_topic,normalize=normalize)
        
    else:
        trend = pd.crosstab(df['year'],in_topic)
    
    trend = trend.rename(columns={True:name})
    
    #We reindex instead of using .loc because .loc raises a KeyError if any year in the range is missing
    years = np.arange(year_lim[0],year_lim[1])
    
    return(trend.reindex(years).fillna(0)[name])
    

## 1. Load data

`analysis_pack` contains the metadata and data that we serialised at the end of the `06` data integration notebook.

This includes:

* Community names for the communities (`index->community name`)
* Community indices for topics (`topic -> community index`)
* Filtered topic names (`topic names`)
* Network object with topic co-occurrences
* Analysis df
* arx is the enriched arXiv dataset



In [None]:
#Load the objects serialised at the end of the data integration notebook.
#NOTE(review): pickle.load can execute arbitrary code, so this file must come from a trusted source
with open('../data/processed/24_8_2019_analysis_pack.p','rb') as infile:
    analysis_pack = pickle.load(infile)

In [None]:
#Unpack the first six elements of the analysis pack, in the order they were stored
comm_names,comms,topics,network,data,arx = (analysis_pack[position] for position in range(6))

In [None]:
#Some lookups etc

#Colour used to plot each community
color_lookup = {
    'deep_learning':'blue',
    'robotics_agents':'cornflowerblue',
    'computer_vision':'aqua',
    'symbolic':'red',
    'health':'lime',
    'social':'forestgreen',
    'technology':'magenta',
    'statistics':'orange',
    'language':'yellow'
}

#These are the field names
field_names = ['field_astrophysics',
 'field_biological',
 'field_complex_systems',
 'field_informatics',
 'field_machine_learning_data',
 'field_materials_quantum',
 'field_mathematical_physics',
 'field_mathematics_1',
 'field_mathematics_2',
 'field_optimisation',
 'field_particle_physics',
 'field_physics_education',
 'field_societal',
 'field_statistics_probability']

#Topics used to flag 'state of the art' (SotA) papers later in the notebook
core_ai_topics = ['cnn-convolutional_neural_networks-cnns-convolutional_neural_network-convolutional_neural_network_cnn',
                  'recurrent-lstm-rnn-recurrent_neural_network-recurrent_neural_networks',
                 'reinforcement_learning-policy-policies-reward-deep_reinforcement_learning',
                 'translation-neural_machine_translation-machine_translation-translate-translations',
                  'latent-generative_model-generative-generative_models-latent_variables',
                  'training-trained-deep_learning-deep-train'
                 ]

#Topics used to flag surveillance papers later in the notebook
surv_topics = ['face-faces-identity-face_recognition-facial','person-surveillance-persons-pedestrian-pedestrians'
              ]

#Create tidy field names for legend etc (x[6:] drops the 'field_' prefix)
tidy_field_lookup = {x:re.sub('_',' ',x[6:]).capitalize() for x in field_names}

#Unique community names, leaving the 'mixed' community out
community_names = [x for x in list(set((comm_names.values()))) if x!='mixed']

tidy_comms_lookup = make_tidy_lookup(community_names)

#Legend handles: one coloured patch per community in the colour lookup.
#Every key of color_lookup has to be a community name, otherwise the tidy lookup raises a KeyError
patches = [mpatches.Patch(facecolor=c, label=tidy_comms_lookup[l],edgecolor='black') for l,c in color_lookup.items()]

In [None]:
#Lookup between countries and their freedom status
with open('../data/processed/26_8_2019_country_status_lookup') as infile:
    country_status_lookup = json.load(infile)

### Work with the full geo-data

We will use the full geo-data to analyse evolution of activity in arXiv, AI and surveillance topics

In [None]:
#Article ids are read as strings
arx_geo = pd.read_csv('../data/processed/26_8_2019_grid_geo_admin_all.csv',
                      compression='zip',
                      dtype={'article_id':str})

#### Label the geo data with various relevant fields

In [None]:
#Papers flagged as AI in the enriched arXiv data
ai_ids = set(arx.loc[arx['is_ai']==True]['paper_id'])

#Papers with at least one core AI topic above the 0.05 threshold (vectorised rather than row by row)
sota_ids = set(data.loc[(data[core_ai_topics]>0.05).any(axis=1)].index)

#All the papers in the topic model
modelled_ai = set(data.index)

#Papers with at least one surveillance topic above the 0.05 threshold
surv_ids = set(data.loc[(data[surv_topics]>0.05).any(axis=1)].index)

In [None]:
#Paper id - year lookup used to label the geo data

arx_year = arx.loc[:,['paper_id','year']]

In [None]:
#Label the geo-coded df with AI, SoTa and surveillance.
#Papers outside the topic model get a missing value instead of False

for flag,relevant_set in zip(['has_ai','has_sota','has_surv'],[ai_ids,sota_ids,surv_ids]):
    arx_geo[flag] = [x in relevant_set if x in modelled_ai else np.nan for x in arx_geo['article_id']]

#Drop the rows flagged as multinational
arx_geo = arx_geo.loc[arx_geo['is_multinational']==False]

In [None]:
#Add the year of each paper (inner join: rows without a match are dropped)
arx_geo = arx_geo.merge(arx_year,left_on='article_id',right_on='paper_id')

arx_geo['year'] = arx_geo['year'].astype(int)

### Country comparison (free / not free)

In [None]:
#Variables of interest: [column with lists, value to look for in the lists]
interesting_cuts = [['freedom_list','NF'],
                    ['country_list','China'],['country_list','Russia'],['country_list','Turkey'],
                    ['country_list','United States'],['country_list','United Kingdom'],['country_list','Germany'],
                    ['type_list','Company'],['type_list','Government'],['type_list','Education']]

#Create the expanded df
data_2 = data.copy()

#Each cut adds a dummy column named after the value (eg 'nf', 'china', 'united_states')
for container,value in interesting_cuts:
    
    data_2 = make_exog(data_2,value_container=container,value=value)


#### Evolution of activity in not free countries


In [None]:
#Countries sorted by their number of appearances in the data
countries = pd.Series(flatten_list(data_2['country_list'].dropna())).value_counts().index

#Which are not free? (countries missing from the lookup are left out)
not_free_countries_all = [c for c in countries if country_status_lookup.get(c)=='NF']

#Focus on the top countries
not_free_countries = not_free_countries_all[:10]


In [None]:
#Flag institutions in not free countries (isin avoids scanning the country list for every row)
arx_geo['not_free'] = arx_geo['institute_country'].isin(not_free_countries_all)

arx_geo['year'] = arx_geo['year'].astype(int)

In [None]:
def calulate_trends_2(df,years=[2000,2019],select=False):
    '''
    TODO I am sure I have already written this function
    
    Function to calculate yearly levels of activity
    
    Args:
        df including a 'year' column
        years is the years to focus the analysis on. It is currently not applied: all years are returned
        select is False to use all rows, or the name of a column: only rows where it is True are counted
    
    Returns a series with the number of rows per year, sorted by year
    
    '''
    
    #Identity check so that only False switches the filter off
    if select is not False:
        df = df.loc[df[select]==True]
        
    #value_counts sorts by frequency: we sort by year so that rolling windows run in chronological order
    return(df['year'].value_counts().sort_index())
    



In [None]:
#Yearly share of activity accounted for by free / not free countries, for all papers (False)
#and for AI, SotA and surveillance papers

nf_out = []

for subset in [False,'has_ai','has_sota','has_surv']:
    
    by_status = [calulate_trends_2(arx_geo.loc[arx_geo['not_free']==status],select=subset) for status in [False,True]]
    nf_count = pd.concat(by_status,axis=1)
    nf_count.columns = ['Free','Not Free']
    
    #Total activity per year is the denominator
    all_activity = calulate_trends_2(arx_geo,select=subset)
    
    nf_count_norm = nf_count.apply(lambda counts: counts/all_activity) 
    
    nf_out.append(nf_count_norm)
    


In [None]:
#Sort by year: the rolling mean applied to this table needs the years in chronological order
not_free_shares = pd.concat([x['Not Free'] for x in nf_out],axis=1).fillna(0).sort_index()

not_free_shares.columns = ['All','has_ai','has_sota','has_surv']

In [None]:
#Plot

In [None]:
fig,ax = plt.subplots(figsize=(7,3))

#Three year rolling mean of the shares, as percentages
smoothed_shares = (100*not_free_shares.rolling(window=3).mean()).dropna()
smoothed_shares.plot(linewidth=3,ax=ax)

ax.set_ylabel('Not free countries as share of all')
ax.legend(labels=['All arXiv','AI','SotA','Surveillance topics'])

save_fig('fig_19_topic_focus.pdf')

### Compare the top 8 not free countries with all

In [None]:
#pd.concat([calulate_trends_2(arx_geo.loc[arx_geo['institute_country']==v],select='has_ai') for v in focus_not_free],axis=1,join='outer').fillna(0)

In [None]:
#Keep the first 8 of the (at most 10) not free countries selected above, which are sorted by activity
focus_not_free = not_free_countries[:8]


In [None]:
#Yearly share of activity accounted for by each focus country, for all papers (False)
#and for AI, SotA and surveillance papers
nf_detailed = []

for x in [False,'has_ai','has_sota','has_surv']:
    nf_count = pd.concat([calulate_trends_2(arx_geo.loc[arx_geo['institute_country']==v],select=x) for v in focus_not_free],axis=1).fillna(0)
    
    all_activity = calulate_trends_2(arx_geo,select=x)
    
    nf_count.columns = focus_not_free
    
    #Sort by year: the rolling means applied to these tables need the years in chronological order
    nf_count_norm = nf_count.apply(lambda counts: counts/all_activity).fillna(0).sort_index()
    
    nf_detailed.append(nf_count_norm)
    

In [None]:
fig,ax = plt.subplots(figsize=(12,5),nrows=2,ncols=4,sharey='row',sharex=True)

titles = ['All arXiv','AI','SotA','Surveillance']

#One column of panels per group of papers: the first country in the top row, the rest in the bottom row
for n,p in enumerate(nf_detailed):
    
    first_country = 100*p.iloc[:,0].rolling(window=5).mean()
    other_countries = 100*p.iloc[:,1:].rolling(window=5).mean()
    
    first_country.plot(ax=ax[0][n],linewidth=2,legend=False,c='black')
    other_countries.plot(ax=ax[1][n],linewidth=2,legend=False)

for row in [0,1]:
    ax[row][0].set_ylabel('%')

#Only the right-most panels get a legend
ax[0][3].legend(bbox_to_anchor=(1.05,1),ncol=2)
ax[1][3].legend(bbox_to_anchor=(1,1.1),ncol=2)

for panel,title in zip(ax[0],titles):
    panel.set_title(title)

plt.tight_layout()

plt.savefig(f'../reports/figures/paper_rev/{today_str}_fig_20_not_free_detail.pdf')

**Cross-sectional comparison**

Here we calculate how over- (or under-) represented a topic is in a country.

In [None]:
#Keep papers with country information. We copy so that the new column below is added to
#an independent dataframe rather than to a slice of data_2
analysis_w_countries = data_2.loc[[type(x)==list for x in data_2['country_list']]].copy()

#Calculate activity for all countries (papers per year where the country appears)
all_country_activity = pd.concat(
    [analysis_w_countries.loc[[c in paper_countries for paper_countries in analysis_w_countries['country_list']]]['year'].value_counts() for c in countries],axis=1).fillna(0)

all_country_activity.columns = countries

#Papers with a not free country where China is not involved
analysis_w_countries['not_free_not_china'] = (analysis_w_countries['nf']==True)&(analysis_w_countries['china']==False)

In [None]:
#Relative over / under representation in each surveillance topic, for China and for the other not free countries
surv_differences = [cross_sectional_comp(analysis_w_countries,group,surv_topics,threshold=0.05)['difference']
                    for group in ['china','not_free_not_china']]

cross = pd.concat(surv_differences,axis=1)
cross.columns = ['china','not_free_other_than_china']

#Groups in the rows (in reverse order), topics in the legend
ax = (100*cross.T.iloc[::-1]).plot.barh(title='Specialisation in visual surveillance topics',figsize=(10,3))

#Rebuild the legend with labels cut at 50 characters
hand,labs = ax.get_legend_handles_labels()

ax.legend(loc='lower right',handles=list(hand),labels=[lab[:50] for lab in labs])

ax.set_yticklabels(['Not Free (Excluding China)','China'])
ax.set_xlabel('% deviation from the average')

#Reference line at zero
ax.vlines(x=0,ymin=-1,ymax=2,linestyle=':',color='red')

save_fig('fig_21_activity_in_surveillance_topics.pdf')

#### Plot regression coefficients

In [None]:
#Controls for the regressions: year and the field dummies
controls = ['year',*field_names]

In [None]:
#Coefficient of the not free dummy for every topic
plot_regression_coefficients(df=analysis_w_countries,var='nf',size=(8,4))

plt.tight_layout()

save_fig(name='fig_22_nf_specialisation.pdf')

Is the above just driven by China? We create a new variable excluding it

In [None]:
#Papers with a not free country where China is not involved (vectorised instead of looping over rows)
analysis_w_countries['not_free_not_china'] = (analysis_w_countries['nf']==True)&(analysis_w_countries['china']==False)

In [None]:
#Number of papers flagged as not free excluding China
analysis_w_countries['not_free_not_china'].sum()

### Who is doing the facial recognition research?

We want to see if government organisations are over- or underrepresented in facial recognition research.

In [None]:
#Share (%) of papers in surveillance topics, among papers without and with government involvement
is_surv_paper = (data_2[surv_topics]>0.05).any(axis=1)

govt = 100*pd.crosstab(is_surv_paper,data_2['government'],normalize=1).loc[True]

In [None]:
#Share (%) of papers in surveillance topics, among papers without and with Chinese government involvement
#(the product of the two dummies is only true when both China and a government organisation are involved)
is_surv_paper = (data_2[surv_topics]>0.05).any(axis=1)

ch_govt = 100*pd.crosstab(is_surv_paper,data_2['government']*data_2['china'],normalize=1).loc[True]

In [None]:
#One group of bars per measure (China and government first, then government)
govt_shares = pd.concat([ch_govt,govt],axis=1).T

ax = govt_shares.plot.barh()

ax.set_xlabel('Share of Activity')
ax.set_yticklabels(['China and government involved in research','Government involved in research'])



#### What are the levels of international collaboration in AI research?


In [None]:
#Papers with at least one surveillance topic above the 0.05 threshold
surv = data_2.loc[(data_2[surv_topics]>0.05).any(axis=1)]


In [None]:
#Surveillance papers where China is involved
surv_china = surv.loc[surv['china']==True]


In [None]:
#Distribution of organisation types in surveillance papers: China (first column) vs all (second column)
type_counts = pd.concat([flatten_freq(surv_china['type_list']),flatten_freq(surv['type_list'])],axis=1).fillna(0)

type_counts.apply(lambda counts: counts/counts.sum()).plot.bar()


### Other countries and facial technologies

In [None]:
#Yearly surveillance activity in three free countries
comparison_countries = ['United Kingdom','United States','Germany']

all_surv_count = pd.concat(
    [calulate_trends_2(arx_geo.loc[arx_geo['institute_country']==v],select='has_surv') for v in comparison_countries],
    axis=1).fillna(0)

all_surv_count.columns = comparison_countries

In [None]:
#Surveillance activity in 2018 for the three countries
all_surv_count.loc[2018]

## 3d map

Here we want to create a 3d map of facial recognition technology activity by country.

We use some code that we found [here](https://medium.com/@lkhphuc/how-to-plot-a-3d-earth-map-using-basemap-and-matplotlib-2bc026483fe4)

See also [here](https://basemaptutorial.readthedocs.io/en/latest/basemap3d.html)

In [None]:
%matplotlib inline

In [None]:
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.basemap import Basemap
from mpl_toolkits.mplot3d import Axes3D

#### Step 1. Plot the basemap

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.basemap import Basemap

#Basemap with default settings
world_map = Basemap()

fig = plt.figure(figsize=(20,10))
#NOTE(review): recent matplotlib releases may not attach an Axes3D created this way to the figure
#automatically, in which case nothing is drawn - confirm against the installed version
ax = Axes3D(fig)


#ax.azim = 360
#Camera elevation and distance
ax.elev = 45
ax.dist = 9.5

#The coastlines and country borders returned by basemap are added to the 3d axis
ax.add_collection3d(world_map.drawcoastlines(linewidth=0.5))
ax.add_collection3d(world_map.drawcountries(linewidth=0.25))

plt.show()

#### Work with arxGeo

* Label papers with whether they relate to surveillance or not
* Label countries with their freedom status (free / not free / partially free?)
* Calculate LQs and Totals by country
* Log the totals

In [None]:
#Independent copy of the geo data for plotting. The same multinational filter was already applied
#to arx_geo when it was labelled, so this selection is only a safeguard
arx_geo_plot = arx_geo.loc[arx_geo['is_multinational']==False].copy()

In [None]:
#Label surveillance papers. We read the ids from arx_geo_plot itself (not from arx_geo) so the
#labels always line up with the rows we are labelling
arx_geo_plot['is_surv'] = [x in surv_ids for x in arx_geo_plot['article_id']]

#Freedom status of the country of the institution (missing when the country is not in the lookup)
arx_geo_plot['free'] = [country_status_lookup.get(x,np.nan) for x in arx_geo_plot['institute_country']]

In [None]:
#Simple plot: share (%) of rows in surveillance topics by freedom status
surv_by_status = 100*pd.crosstab(arx_geo_plot['is_surv'],arx_geo_plot['free'],normalize=1)

surv_by_status.loc[True].sort_values(ascending=False).plot.bar()

In [None]:
import geopandas as gp


In [None]:
# Specialisation

def create_lq(X, binary=False):
    """ Calculate the location quotient.

    Divides the share of each sector in a location by the share of that sector in the
    total over all the locations in X.

    Args:
        X (pandas.DataFrame): DataFrame where rows are locations, columns are sectors and values are activity in a given sector at a location.
        binary (bool, optional): Accepted for compatibility; it is currently not used by the function.

    Returns:
        pandas.DataFrame with the same index and columns as X
    """
    activity = X.values

    #Share of each sector inside every location (rows add up to one)
    location_totals = activity.sum(1)[:, np.newaxis]
    local_shares = activity/location_totals

    #Share of each sector in the overall total
    overall_shares = activity.sum(0)/activity.sum()

    lq = pd.DataFrame(local_shares/overall_shares,
            index=X.index, columns=X.columns)

    return(lq)

**We will focus on the top 50 countries**



In [None]:
#The 50 countries with most rows in the geo data (despite its name, the variable holds 50 countries)
top_75 = arx_geo_plot['institute_country'].value_counts()[:50].index

In [None]:
#Measure surveillance related activity
country_counts = pd.crosstab(arx_geo_plot['institute_country'],arx_geo_plot['is_surv'])

#Relative specialisation (location quotient) in surveillance, for the top countries
spec = create_lq(country_counts).loc[top_75].sort_values(True,ascending=False)[True]

#Total surveillance activity, for the top countries
tots = country_counts.loc[top_75].sort_values(True,ascending=False)[True]

surv_activity = pd.concat([spec,tots],axis=1)

surv_activity.columns = ['spec','total']

#Quantile bin of the total (quintiles, duplicated edges dropped), divided by 20
total_bins = pd.qcut(surv_activity['total'],q=np.arange(0,1.1,0.2),labels=False,duplicates='drop')
surv_activity['total_discretised'] = total_bins.apply(lambda x: x/20)

#Small offset so that zeros can be logged
surv_activity['total_logged'] = np.log(surv_activity['total']+0.001)

In [None]:
#Freedom status of every country (missing when the country is not in the lookup)
surv_activity['status'] = [country_status_lookup.get(x,np.nan) for x in surv_activity.index]

#Not free in red, partially free in orange. Anything else (including countries without a status) is light green
status_colors = {'NF':'red','PF':'orange'}
surv_activity['color'] = [status_colors.get(x,'lightgreen') for x in surv_activity['status']]


In [None]:
#Median longitude and latitude of the institutions in each country (a proxy for its centroid),
#stored as {country: {'institute_lon': ..., 'institute_lat': ...}}

country_lat_lon = (arx_geo_plot
                   .groupby('institute_country')[['institute_lon','institute_lat']]
                   .median()
                   .to_dict(orient='index'))

In [None]:
#Attach the coordinates of every country to the activity table
surv_activity['lon'] = [country_lat_lon[x]['institute_lon'] for x in surv_activity.index]
surv_activity['lat'] = [country_lat_lon[x]['institute_lat'] for x in surv_activity.index]

### Map

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.basemap import Basemap
from matplotlib.collections import PolyCollection
from matplotlib.lines import Line2D
from matplotlib.patches import Patch


world_map = Basemap()

fig = plt.figure(figsize=(15,8))
ax = Axes3D(fig)


#ax.azim = 360
#Camera elevation and distance
ax.elev = 45
ax.dist = 9.5

ax.add_collection3d(world_map.drawcoastlines(linewidth=0.1))
ax.add_collection3d(world_map.drawcountries(linewidth=0.25))

#Draw the land masses as white polygons
polys = [polygon.get_coords() for polygon in world_map.landpolygons]

lc = PolyCollection(polys,
                    facecolor='white', closed=False)

ax.add_collection3d(lc)

#One bar per country: the height is the total surveillance activity and the depth
#(along the latitude axis) is the relative specialisation
ax.bar3d(surv_activity['lon'], #x
         surv_activity['lat'],  #y
         np.zeros(len(surv_activity)), #z (the bars start at zero)
         
         2, #dx
         
         surv_activity['spec'], #dy
         
         surv_activity['total'], #dz
         
         color= surv_activity['color'],edgecolor='black',linewidth=0.01)
ax.set_zlim(0,180)

#The height of the bars is the total, not the specialisation (which is the depth)
ax.set_zlabel('Total activity in AI surveillance topics')
ax.set_ylabel('Bar depth represents relative specialisation in \n AI surveillance topics')

#Square markers used as legend handles for the three country classifications.
#Line2D is imported in this cell because matplotlib itself is not imported under that name in this notebook
scatter1_proxy = Line2D([0],[0], linestyle="none", c='red', marker = 's',linewidth=5)
scatter2_proxy = Line2D([0],[0], linestyle="none", c='orange', marker = 's',linewidth=5)
scatter3_proxy = Line2D([0],[0], linestyle="none", c='lightgreen', marker = 's',linewidth=5)
ax.legend([scatter1_proxy, scatter2_proxy,scatter3_proxy], ['Not free', 'Partially free','Free'], numpoints = 1,bbox_to_anchor=(0.9,0.7),
         title='Country classification')

plt.savefig(f'../reports/figures/paper_rev/{today_str}_neurips_facial_recognition.png')