# Organisational character

In this notebook we concentrate on the link between the types of organisations involved in research and the topics they focus on:

* Levels of company activity
* Topical differences in focus between industry and research
* Collaboration differences

## 0. Preamble

In [None]:
%run notebook_preamble.ipy

In [None]:
# Silence FutureWarning messages (they show up when dataframes are concatenated)
# so that they do not clutter the notebook output.

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Other imports

In [None]:
import random

from statsmodels.api import OLS, Logit
from statsmodels.tools.tools import add_constant
from scipy.stats import zscore
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from scipy.stats import entropy
import seaborn as sns


### Functions

Add a bunch of exogenous variables to the analysis df

In [None]:
#Generic functions
def save_fig(name,path='../reports/figures/paper_rev/'):
    '''
    Write the current matplotlib figure to disk.

    The layout is tightened first, then the figure is saved under
    `path` with the file name `<today_str>_<name>`.

    Args:
        name: file name (including extension) appended after the date prefix
        path: directory prefix for the output file

    Note: `today_str` is not defined in this cell; it is presumably set by
    notebook_preamble.ipy - TODO confirm.
    '''
    plt.tight_layout()

    file_name = f'{today_str}_{name}'
    plt.savefig(path+file_name)

def flatten_list(my_list):
    '''
    Flattens a list of iterables by one level.

    Args:
        my_list: an iterable whose elements are themselves iterable

    Returns a new list with the elements of every inner iterable, in order.
    '''
    flat = []

    for inner in my_list:
        flat.extend(inner)

    return(flat)


def get_example(df,number,length):
    '''
    Prints the beginning of a random selection of abstracts.

    Args:
        df: dataframe with an 'abstract' column
        number: how many rows to sample (without replacement) from the index
        length: number of leading characters of each abstract to print

    Returns nothing; every excerpt is printed, followed by a blank separator.
    '''

    sampled_rows = random.sample(list(df.index),number)
    abstracts = df.loc[sampled_rows]['abstract']

    for text in abstracts:
        excerpt = text[:length]
        print(excerpt)
        print('\n')

In [None]:
def make_tidy_lookup(names_list,length=False):
    '''
    Creates a cheap lookup between raw names and tidy labels: underscores
    become spaces and the label is capitalised (first letter upper case,
    the rest lower case).

    Args:
        names_list (list) is the list of names we want to tidy
        length: if set to a number, only the first `length` characters of the
            tidy label are kept. The default (False) keeps the full label.

    Returns a dict {raw name: tidy label}.
    '''

    out = {}

    for name in names_list:
        label = name.replace('_',' ').capitalize()

        # Truncate only when a length was actually requested
        if length:
            label = label[:length]

        out[name] = label

    return(out)



In [None]:
def cross_sectional_comp(df,variable,topics,threshold):
    '''
    Compares, topic by topic, how common a boolean category is among papers
    with and without the topic.

    Args:
        df is the dataframe we are using (rows = papers, columns = variables and metadata)
        variable is the (boolean) column used for the comparison
        topics is the list of topic columns to compare
        threshold is the weight above which a paper counts as being in a topic

    Returns a df indexed by topic with three columns:
        <variable>_false: share of papers with variable == True among papers where the topic is absent
        <variable>_true: share of papers with variable == True among papers where the topic is present
        difference: ratio of the second to the first, minus one
    sorted by 'difference' in descending order.

    '''

    shares = []

    for topic in topics:
        in_topic = df[topic]>threshold

        #Column-normalised crosstab: each column (topic absent / present) sums to one
        table = pd.crosstab(df[variable],in_topic,normalize=1)

        #Keep the row where the variable is True
        shares.append(table.loc[True,:])

    group_counts = pd.concat(shares,axis=1)
    group_counts.columns = topics

    #Topics in the rows, topic absent / present in the columns
    group_counts = group_counts.T
    group_counts.columns = [variable+'_'+suffix for suffix in ['false','true']]

    share_absent = group_counts.iloc[:,0]
    share_present = group_counts.iloc[:,1]
    group_counts['difference'] = (share_present/share_absent)-1

    return(group_counts.sort_values('difference',ascending=False))

def topic_regression(df,target_list,exog,controls,model,binarise=False,standardise=True,cov='HC1'):
    '''

    This function regresses topic weights (or their binarisation) on predictors.
    One model is fitted per target, always with the same right-hand side
    (constant + exog + controls).

    Arguments:
        -df with the variables
        -target_list: target variables. This is a list we loop over.
        -exog: list with the exogenous variable(s). The coefficient and p-value
            reported in the output table are those of the first element.
        -controls: list of control variables
        -model: model class, called as model(endog=..., exog=...). OLS reports
            `rsquared`; any other model is assumed to expose `prsquared` (as Logit does).
        -binarise: if not False (or None), the threshold above which a target is
            recoded as 1.0 (0.0 otherwise). A threshold of 0 is allowed.
            TODO when we binarise the highly detailed models, some of them become
            all zeros. This will work better with the more aggregate topics
        -standardise: if true, targets are logged (after adding 1e-8) and z-scored
        -cov: covariance type passed to fit()

    Returns
        -A list with two elements: the list of statsmodels summaries, and a df
         (one row per target) with the coefficient, p-value and (pseudo) R-squared,
         sorted by coefficient in descending order.

    Raises
        -ValueError if exog is empty

    '''

    target_list = list(target_list)
    exog = list(exog)
    controls = list(controls)

    if len(exog)==0:
        raise ValueError('exog needs to contain at least one variable')

    #Drop rows with missing values - sm doesn't like them.
    #We take an explicit copy because we overwrite the target columns below.
    df_2 = df[target_list+exog+controls].dropna(axis=0).copy()

    #Standardise targets?
    if standardise:
        df_2[target_list] = (np.log(df_2[target_list]+0.00000001)).apply(zscore).astype(float)

    #Binarise targets if we are doing a logit.
    #We compare identity with False so that a threshold of 0 is not mistaken for 'off'.
    if binarise is not False and binarise is not None:
        df_2[target_list] = (df_2[target_list]>binarise).astype(float)

    #Extract the exogenous and controls, add constant and cast as float
    exog_controls = add_constant(df_2[exog+controls]).astype(float)

    #The goodness of fit attribute depends on the model
    if model == OLS:
        fit_attribute,fit_name = 'rsquared','r_square'
    else:
        fit_attribute,fit_name = 'prsquared','pr_square'

    coeff_columns = ['coefficient','p_value',fit_name]

    #Variable whose estimates we report
    reported = exog[0]

    #Container output
    out = []
    coeffs = []

    #One regression for each target
    for t in target_list:

        reg = model(endog=df_2[t],exog=exog_controls).fit(cov_type=cov,disp=0)

        out.append(reg.summary())

        coeffs.append(pd.Series([float(reg.params[reported]),
                                 float(reg.pvalues[reported]),
                                 float(getattr(reg,fit_attribute))],name=t))

    #Build the coefficient table once, after all models have been fitted
    if len(coeffs)>0:
        reg_coeff = pd.concat(coeffs,axis=1).T
        reg_coeff.columns = coeff_columns
    else:
        reg_coeff = pd.DataFrame(columns=coeff_columns,dtype=float)

    return([out,reg_coeff.sort_values('coefficient',ascending=False)])
        
       

def plot_regression_coefficients(df,var,cov='HC1',size=(8,6),ax=False,ncols=3):
    '''
    Plots, for every topic, the OLS coefficient of one predictor.

    The regressions are run with topic_regression using the notebook-level
    `topics` as targets and `controls` as controls; bars are coloured with
    the notebook-level `color_lookup`.

    Args:
        df: dataframe with topics, predictor and controls
        var: variable we use as predictor
        cov: covariance type passed on to the regressions
        size: figure size, only used when a new figure is created
        ax: axis to draw on. If False (or None) a new figure is created
        ncols: number of columns in the legend

    '''

    reg = topic_regression(df,topics,[var],controls,OLS,cov=cov)

    #Identity check: comparing an arbitrary object with == False is fragile
    if ax is False or ax is None:
        fig,ax = plt.subplots(figsize=size)

    plot_topic_bar(reg[1]['coefficient'],cl=color_lookup,ax=ax,ncols=ncols)

    ax.set_title(f'Regression coefficient using {var} as predictor')

def topic_comparison(df,target_list,exog,concept_lookup,quantiles=np.arange(0,1.1,0.2),thres=0):
    '''
    Compares activity across topics by the values of an exogenous variable,
    with topics grouped by their (discretised) proximity to each concept.

    Args:
        df: df with the topic mix and metadata
        target_list: the topics to consider
        exog: the variable to crosstab topics against
        concept_lookup: df indexed by topic, one column per concept
        quantiles: quantiles used to discretise each concept column (default is quintiles)
        thres: weight above which a topic counts as present in a paper

    Returns a dict {concept: df} where each df has the concept bins in the
    index, the values of exog in the columns and, in the cells, the number of
    papers with a topic of that bin present.

    '''

    #Discretise every concept column into quantile bins (integer labels)
    def _discretise(column):
        return pd.qcut(column,q=quantiles,labels=False,duplicates='drop')

    conc_discr = concept_lookup.apply(_discretise)

    #Count, per value of exog, the papers where each topic is present
    counts = []

    for topic in target_list:
        present = df[topic]>thres
        counts.append(pd.crosstab(df[exog],present)[True])

    topic_distr = pd.concat(counts,axis=1).T
    topic_distr.index = target_list

    #Attach the concept bins to the counts and go to long format
    combined = pd.concat([topic_distr,conc_discr],axis=1).reset_index(drop=False)
    disc = pd.melt(combined,id_vars=['index']+list(conc_discr.columns))

    #One table per concept
    store = {}

    for concept in concept_lookup.columns:

        totals = disc.groupby([concept,'variable'])['value'].sum().reset_index(drop=False)

        store[concept] = pd.pivot_table(totals,index=concept,columns='variable',values='value')

    return(store)

def plot_topic_bar(table,cl,ax,ncols):
    '''
    Draws a bar chart of topic values, colouring each bar by the community
    of its topic.

    The community of a topic is looked up through the notebook-level `comms`
    (topic -> community index) and `comm_names` (index -> community name).
    Communities without an entry in `cl` are drawn in light grey. The legend
    uses the notebook-level `patches`.

    Args:
        table has topics in the index and the values to plot
        cl is the colour lookup (community name -> colour)
        ax is the plotting axis
        ncols is the number of columns in the legend

    '''

    bar_colours = []

    for topic in table.index:
        community = comm_names[comms[topic]]

        if community in cl:
            bar_colours.append(cl[community])
        else:
            bar_colours.append('lightgrey')

    table.plot.bar(color=bar_colours,ax=ax,width=1)

    ax.legend(handles=patches,ncol=ncols)

    #There are too many topics to label them individually
    ax.set_xticks([])
    ax.set_xticklabels([])
    
    
def calculate_entropy(df,categories,category):
    '''
    Calculates, for every row (paper), the entropy of its distribution over
    a set of semantic variables (eg disciplines, communities or topics).
    Weights are rescaled to sum to one inside each row first.

    arguments:
        df is the analysis df with relevant topics and metadata
        categories are the columns that make up the distribution
        category is a label stored in the 'cat' column of the output

    outputs
        A df with one row per paper and two columns: 'entropy' and 'cat'

    '''

    def _to_shares(row):
        return row/row.sum()

    shares = df[categories].apply(_to_shares,axis=1)

    row_entropy = shares.apply(entropy,axis=1)

    ent = pd.DataFrame(row_entropy,columns=['entropy'])
    ent['cat'] = category

    return(ent)

def make_exog(df,value_container,value,make_dummy=True):
    '''
    Adds a column that flags (or counts) the presence of a value in a
    list-valued column. Used to create exogenous variables for modelling.

    Argument:
        -df contains the column where we want to find a value
        -value_container is the column (of lists) where we look for the value
        -value is the value we are looking for
        -make_dummy: if true the new column says whether the value is present.
            If false, it counts how many times the value occurs.

    Output
        -A copy of df with the new column. Its name is the value in lower case
         with spaces replaced by underscores. Rows where the container is not
         a list (missing values) get NaN.

    '''

    df_2 = df.copy()

    #Tidy name for the new column
    column_name = value.lower().replace(' ','_')

    #Choose how a single (list) entry is summarised
    if make_dummy == True:
        def _summarise(entry):
            return value in entry
    else:
        def _summarise(entry):
            return entry.count(value)

    new_column = []

    for entry in df_2[value_container]:
        #Missing values are not lists
        if type(entry) is list:
            new_column.append(_summarise(entry))
        else:
            new_column.append(np.nan)

    df_2[column_name] = new_column

    return(df_2)
    

def extract_topic_trend(df,cat,year_lims=[2000,2019]):
    '''
    Extracts the yearly distribution of a category.

    Args:
        df: the usual dataframe (needs a 'year' column)
        cat: the category (column) we are interested in
        year_lims: first year to consider and the year to stop at (the second
            year itself is not included)

    Returns a df with years in the index, the values of cat in the columns
    and, in each row, the shares of papers of that year (rows sum to one).

    '''

    #Row-normalised crosstab: shares inside each year
    shares_by_year = pd.crosstab(df['year'],df[cat],normalize=0)

    first_year = year_lims[0]
    stop_year = year_lims[1]

    selected_years = np.arange(first_year,stop_year)

    return(shares_by_year.loc[selected_years])

def plot_topic_trend(df,cat,topics,ax,cmap,year_lims=[2000,2019],threshold=0.05,focus_topics=False,alpha=0.2):
    '''
    Plots topic trends: for every topic, the yearly share of its papers where
    a category is True, smoothed with a 4-year rolling mean.

    Args:
        df the usual dataframe
        cat: the category of interest
        topics: topics we want to display
        ax: axis to draw on
        cmap: name of the colour map used for the focus topics
        year_lims: passed to extract_topic_trend to select the years
        threshold: weight above which a paper counts as being in a topic
        focus_topics: if not False (or None), topics to highlight. Their lines
            are coloured with cmap and shown in the legend; all other topics
            are drawn in grey with transparency alpha.
        alpha: transparency of the topics that are not in focus

    Raises
        ValueError if no topic has any usable activity

    '''
    activity = []
    names = []

    #Use a loop to deal with cases where a category has no activity in a topic
    for t in topics:
        try:
            levels = extract_topic_trend(df.loc[df[t]>threshold],cat,year_lims)
            share = levels[True]

        except Exception:
            #Best effort: skip topics where the trend cannot be extracted.
            #We do not use a bare except so that interrupts are not swallowed.
            continue

        activity.append(share)
        names.append(t)

    if len(activity)==0:
        raise ValueError(f'No topic has activity for {cat} in the selected years')

    topic_trends = pd.concat(activity,axis=1).fillna(0)
    topic_trends.columns = names

    smoothed = topic_trends.rolling(window=4).mean().dropna()

    if focus_topics is not False and focus_topics is not None:

        topic_lookup = {name:val for val,name in enumerate(focus_topics)}

        #Color map (plt.cm.get_cmap is no longer available in recent matplotlib)
        cols = plt.get_cmap(cmap)

        #Create a vector of colors: grey unless the topic is in focus
        cols_to_show = [cols(topic_lookup[v]) if v in topic_lookup else (0.5,0.5,0.5,alpha)
                        for v in topic_trends.columns]

        #Plot (as percentages)
        (100*smoothed).plot(color=cols_to_show,ax=ax,linewidth=3)

        #Fix the legend to focus on key topics
        hand,labs = ax.get_legend_handles_labels()

        in_focus = [(h,l) for h,l in zip(hand,labs) if l in focus_topics]

        #Labels are truncated to 50 characters to keep the legend readable
        ax.legend(bbox_to_anchor=(1,1),handles=[h for h,l in in_focus],
                  labels=[l[:50] for h,l in in_focus])

    else:

        #NOTE(review): this branch plots shares (0-1) while the focus branch plots percentages
        smoothed.plot(ax=ax)
        ax.legend(bbox_to_anchor=(1,1))
    

    

    

In [None]:
def get_university_industry_collab_trends(df,variable,topic,threshold=0.05):
    '''
    Study collaborations over time inside a topic.

    Args:
        df as usual (needs a 'year' column)
        variable is the (boolean) collaboration column we want to study
        topic the topic (column) used to select papers
        threshold is the weight a paper needs to exceed to be in the topic

    Returns a series indexed by year with the percentage of papers in the
    topic where the collaboration variable is True.

    '''

    df_with_topic = df.loc[df[topic]>threshold]

    #Row-normalised crosstab: shares inside each year, as percentages
    yearly_shares = 100*pd.crosstab(df_with_topic['year'],df_with_topic[variable],normalize=0)

    topic_collabs = yearly_shares[True]

    return(topic_collabs)
    

## 1. Load data

`analysis_pack` contains the metadata and data that we serialised at the end of the `06` data integration notebook.

This includes:

* Community names for the communities (`index->community name`)
* Community indices for topics (`topic -> community index`)
* Filtered topic names (`topic names`)
* Network object with topic co-occurrences
* Analysis df
* arx is the enriched arXiv dataset



In [None]:
# Load the serialised analysis pack.
# NOTE(review): unpickling can execute arbitrary code, so this should only ever be
# pointed at files produced by our own pipeline.
# `pickle` is not imported in the cells shown here; presumably it comes from
# notebook_preamble.ipy - TODO confirm.
with open('../data/processed/24_8_2019_analysis_pack.p','rb') as infile:
    analysis_pack = pickle.load(infile)

In [None]:
# Unpack the first six elements of the pack, in the order in which they were serialised:
# community names, topic -> community lookup, topic names, topic network,
# analysis df and the enriched arXiv data.
comm_names, comms, topics, network, data, arx = (analysis_pack[position] for position in range(6))

In [None]:
#Colour used for each community in the charts
color_lookup = {
    'deep_learning':'blue',
    'robotics_agents':'cornflowerblue',
    'computer_vision':'aqua',
    'symbolic':'red',
    'health':'lime',
    'social':'forestgreen',
    'technology':'magenta',
    'statistics':'orange',
    'language':'yellow'
}

#These are the field names
field_names = ['field_astrophysics',
 'field_biological',
 'field_complex_systems',
 'field_informatics',
 'field_machine_learning_data',
 'field_materials_quantum',
 'field_mathematical_physics',
 'field_mathematics_1',
 'field_mathematics_2',
 'field_optimisation',
 'field_particle_physics',
 'field_physics_education',
 'field_societal',
 'field_statistics_probability']

#Tidy field names for legends etc: drop the first six characters ('field_'),
#turn underscores into spaces and capitalise
tidy_field_lookup = {name:name[6:].replace('_',' ').capitalize() for name in field_names}

#Unique community names, leaving out the 'mixed' community
community_names = [name for name in set(comm_names.values()) if name!='mixed']

tidy_comms_lookup = make_tidy_lookup(community_names)

#Legend handles: one patch per coloured community, labelled with its tidy name
patches = [mpatches.Patch(facecolor=colour, label=tidy_comms_lookup[community],edgecolor='black')
           for community,colour in color_lookup.items()]

## 2. Analysis

What are we going to do?

* Measure the distribution over terms as before
* Study trends (share of DL / Reinforcement learning / Computer vision accounted for by companies)

### Additional processing

We detected a few companies that were not classified as multinationals even though they seem to be.

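We strip the parenthesised country suffix from their names so that they match the plain company name (for example, a name starting with `Google (` becomes `Google`).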

In [None]:
#Look for these patterns and split on the space if present
my_comps = ['Google (','Apple (', 'Amazon (']

data['institute_list_2'] = [[x if not any(var in str(x) for var in my_comps) else x.split(' ')[0] for x in company_list] 
                            if type(company_list)==list else np.nan for company_list in data['institute_list']]

#### Enrich the data with relevant variables

In [None]:
#Variables of interest
interesting = [['type_list','Company'],['type_list','Government'],['type_list','Education'],
               ['institute_list_2','Google'],['institute_list_2','Facebook'],['institute_list_2','IBM'],['institute_list_2','Microsoft'],
              ['institute_list_2','Apple'],['institute_list_2','Amazon']]

#Create the expanded df
data_2 = data.copy()

#For each interesting variable we expand the df
for detect in interesting:
    
    data_2 = make_exog(data_2,value_container=detect[0],value=detect[1])

#### Some basic descriptives

**How many companies?**

In [None]:
# Share of papers flagged as involving a company. The denominator is every row of data_2,
# including rows where the 'company' flag is missing (papers without a type list).
np.sum(data_2['company'])/len(data_2)

In [None]:
# Percentage of paper-organisation pairs accounted for by each organisation:
# every organisation is counted once per paper (duplicates inside a paper are removed with set),
# non-string entries are ignored and papers without an institute list are dropped.
orgs = 100*pd.Series(flatten_list([list(set([inst for inst in x if type(inst)==str])) for x in data_2['institute_list_2'].dropna()])).value_counts(normalize=True)

### Trends

In [None]:
#Yearly share of papers involving any company and each of the named companies
company_vars = ['company','google','microsoft','ibm','facebook','amazon']

yearly_shares = [pd.crosstab(data_2['year'],data_2[var],normalize=0)[True] for var in company_vars]

comps = pd.concat(yearly_shares,axis=1)
comps.columns = company_vars

#Whatever is left of the company share after removing the named companies
comps['other companies'] = comps['company']-comps.iloc[:,1:].sum(axis=1)

#Percentages for the named companies and the rest (the 'company' total is dropped),
#smoothed with a 3-year rolling mean
comps_data = 100*comps.loc[np.arange(2000,2019)].iloc[:,1:].rolling(window=3).mean().dropna()

In [None]:
# Stacked area chart of the share of papers involving each company group over time
fig,ax = plt.subplots(figsize=(10,2.5))

pal = sns.color_palette('Accent_r')

# One layer per column of comps_data.
# NOTE(review): `cmap` is passed together with an explicit `colors` palette; the palette
# looks like the one that matters - confirm whether `cmap` is needed at all.
ax.stackplot(comps_data.index,comps_data.T,cmap='Dark2',labels=[x.capitalize() for x in comps_data.columns],
             colors=pal,edgecolor='black',linewidth=0.3)

handles,labels = ax.get_legend_handles_labels()

# Reverse the legend entries so that they follow the top-to-bottom order of the stack
ax.legend(bbox_to_anchor=(1.26,1),handles=handles[::-1],labels=labels[::-1])

ax.set_ylabel('% of all AI papers')
ax.set_title('Corporate participation in AI research')

# save_fig applies tight_layout again before saving
plt.tight_layout()

#save_fig('fig_12_corporate_participation.pdf')


save_fig('neurips_corp.pdf')

In [None]:
# Cumulative distribution of the 2018 shares across the named companies and 'other companies'
# ([1:] leaves out the overall 'company' column).
(comps.loc[2018][1:]/comps.loc[2018][1:].sum()).cumsum()

### Topic representation

In [None]:
# company_topic_comp = cross_sectional_comp(data_2,'company',topics,threshold=0.05)

# fig,ax = plt.subplots(figsize=(8,5))

# plot_topic_bar(company_topic_comp['difference'],cl=color_lookup,ax=ax)

# ax.set_title('Representation of papers involving companies')

In [None]:
# google_topic_comp = cross_sectional_comp(data_2,'google',topics,threshold=0.05)

# fig,ax = plt.subplots(figsize=(8,5))

# plot_topic_bar(google_topic_comp['difference'],cl=color_lookup,ax=ax)


### Regression

In [None]:
#Controls used in every regression: the year plus the field variables
controls = ['year',*field_names]

In [None]:
# Three panels side by side with a shared y axis, one per predictor
fig,ax = plt.subplots(figsize=(15,6),ncols=3,sharey=True)

plot_regression_coefficients(data_2,'google',ax=ax[0],ncols=2)
plot_regression_coefficients(data_2,'company',ax=ax[1],ncols=2)
plot_regression_coefficients(data_2,'education',ax=ax[2],ncols=2)

# Bring the panels closer together
plt.subplots_adjust(wspace=0.05)

# The y axis is shared, so only the first panel is labelled
ax[0].set_ylabel('Regression coefficient in multivariate model')

#ax[0].legend([])
#ax[1].legend([])
#ax[2].legend([])

#plt.tight_layout()

save_fig('fig_13_estimates.pdf')



### Time series analysis

I want to study the level of activity in a topic accounted for by different types of organisations.

The target chart contains the share of all papers in a topic accounted for by different types of organisations.


In [None]:
#Flag papers where the education dummy is explicitly False
#(rows where the dummy is missing do not compare equal to False, so they are not flagged)
data_2['no_education'] = data_2['education'].eq(False)

In [None]:
# Topics highlighted in the trend charts below. Each entry is the full topic name,
# i.e. the name of the corresponding topic column.
core_ai_topics = ['cnn-convolutional_neural_networks-cnns-convolutional_neural_network-convolutional_neural_network_cnn',
                  'recurrent-lstm-rnn-recurrent_neural_network-recurrent_neural_networks',
                 'reinforcement_learning-policy-policies-reward-deep_reinforcement_learning',
                 'translation-neural_machine_translation-machine_translation-translate-translations',
                  'latent-generative_model-generative-generative_models-latent_variables',
                  'training-trained-deep_learning-deep-train'
                 ]

In [None]:
# Two stacked panels sharing the x axis: any company (top) and Google (bottom)
fig,ax = plt.subplots(figsize=(12,6),nrows=2,sharex=True)

# All topics are drawn; only the core AI topics are coloured and shown in the legend
plot_topic_trend(data_2,'company',cmap='Dark2',topics=topics,ax=ax[0],threshold=0.02,focus_topics=core_ai_topics,alpha=0.07,year_lims=[2004,2019])

#ax[0].set_title('Share of all papers with company presence')
ax[0].set_ylabel('% of all papers in topic \n with company presence')


plot_topic_trend(data_2,'google',cmap='Dark2',topics=topics,ax=ax[1],threshold=0.02,focus_topics=core_ai_topics,alpha=0.07,year_lims=[2004,2019])

ax[1].set_ylabel('% of all papers in topic \n with Google presence')

save_fig('fig_14_trends.pdf')

### What are the levels of university / industry collaboration?

In [None]:
#Create a variable that captures collaborations
data_2['university_industry_collab'] = [all(entity in x for entity in ['Education','Company']) if type(x)==list else np.nan for x in data_2['type_list']]
data_2['govt_industry_collab'] = [all(entity in x for entity in ['Government','Company']) if type(x)==list else np.nan for x in data_2['type_list']]


In [None]:
# Percentage of papers that are university-industry collaborations
100*data_2['university_industry_collab'].mean()

In [None]:
# University-industry collaborations as a percentage of the papers involving a company
100*data_2['university_industry_collab'].sum()/data_2['company'].sum()

In [None]:
#Extract collaborations on 'core AI topics'

collabs_in_topics = pd.concat([get_university_industry_collab_trends(data_2,'university_industry_collab',t) for t in core_ai_topics],axis=1).fillna(0)

collabs_in_topics.columns = core_ai_topics

#Get average collaborations (we set a negative threshold to select all projects)
all_collabs = get_university_industry_collab_trends(data_2,'university_industry_collab',community_names[0],threshold=-1)
all_collabs.name = 'All subjects'

#Concatenate everything
collabs_in_topics = pd.concat([all_collabs,collabs_in_topics],axis=1)

In [None]:
#Plot

to_plot = collabs_in_topics.loc[np.arange(1995,2019)].rolling(window=4).mean().dropna()

ax = to_plot.plot(figsize=(8,4.5),linewidth=3)


hand,labs = ax.get_legend_handles_labels()

ax.legend(handles = [x[0] for x in zip(hand,labs)],
          labels=[x[1][:50] for x in zip(hand,labs)],loc='upper left',title='Topic')


ax.set_xticks(np.arange(to_plot.index[0],2019,2))

ax.set_xlim(2000,2018.1)

ax.set_ylabel('% of all papers in topic with a \n university-industry collaboration')


save_fig('fig_15_collab_trends.pdf')