# Team diversity

In this notebook we concentrate on the link between gender diversity and research topics

* Levels of female participation in the field (briefly)
* Multivariate analysis

## 0. Preamble

In [None]:
%run notebook_preamble.ipy

In [None]:
# Ignore future warnings (for when I concatenate dfs)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Other imports

In [None]:
import random

from statsmodels.api import OLS, Logit
from statsmodels.tools.tools import add_constant
from scipy.stats import zscore
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from scipy.stats import entropy


### Functions

Add a bunch of exogenous variables to the analysis df

In [None]:
#Generic functions
def save_fig(name,path='../reports/figures/paper_rev/'):
    '''
    Saves the current matplotlib figure under a date-stamped file name

    Args:
        name is the file name (including the extension) that follows the date prefix
        path is the directory where the figure is written

    The file is written to <path>/<today_str>_<name>. `today_str` is read from the
    notebook namespace (presumably set in the preamble - to be confirmed).
    '''
    import os

    #Adjust the subplot padding before writing the file
    plt.tight_layout()

    #os.path.join works whether or not path ends with a separator
    plt.savefig(os.path.join(path,f'{today_str}_{name}'))

def flatten_list(my_list):
    '''
    Removes one level of nesting from a list of iterables

    Args:
        my_list is an iterable whose elements are themselves iterable

    Returns a single list with the items of every inner iterable, in order
    '''
    flat = []

    for inner in my_list:
        flat.extend(inner)

    return flat


def get_example(df,number,length):
    '''
    Prints the beginning of a random sample of abstracts

    Args:
        df is the dataframe we sample from (it needs an 'abstract' column)
        number is the number of examples we want to print
        length is the number of characters we keep from each abstract

    '''
    #Draw the row labels without replacement
    sampled_rows = random.sample(list(df.index),number)

    for abstract in df.loc[sampled_rows,'abstract']:
        #The truncated abstract is followed by blank lines that separate the examples
        print(abstract[:length],'\n',sep='\n')

In [None]:
def make_tidy_lookup(names_list,length=False):
    '''

    Creates a cheap lookup between names and tidy labels, replacing underscores with
    spaces and capitalising the first letter

    Args:
        names_list (list) is the list of names we want to tidy
        length is the maximum number of characters we keep from each tidy label.
            The default (False) keeps the whole label.

    Returns a dict where the keys are the original names and the values the tidy labels

    '''

    tidy = {name:name.replace('_',' ').capitalize() for name in names_list}

    #Booleans, None and 0 all mean 'do not truncate', so the default call is unchanged
    if isinstance(length,bool) or not length:
        return tidy

    return {name:label[:length] for name,label in tidy.items()}



In [None]:
def cross_sectional_comp(df,variable,topics,threshold):
    '''
    Compares, topic by topic, how common a boolean category is among the papers where
    the topic is present and among the papers where it is absent.

    Args:
        df is the dataframe we are using (generally analysis_fin, with rows = papers and columns = variables and metadata)
        variable is the boolean variable we are using for the comparison
        topics is the list of topics where we want to compare (generally the community names)
        threshold is the weight a paper needs to exceed to be considered in a topic

    Returns a df with one row per topic. `<variable>_false` is the share of papers where
    variable is True among papers without the topic, `<variable>_true` is the same share
    among papers with the topic, and `difference` is their ratio minus one. Rows are
    sorted by `difference` in descending order.

    '''

    #For each topic, keep the row of the column-normalised crosstab where variable is True
    shares_by_topic = []

    for t in topics:
        in_topic = df[t]>threshold
        shares = pd.crosstab(df[variable],in_topic,normalize=1)
        shares_by_topic.append(shares.loc[True,:])

    #Topics in the rows, topic absent / topic present in the columns
    group_counts = pd.concat(shares_by_topic,axis=1)
    group_counts.columns = topics
    group_counts = group_counts.T
    group_counts.columns = [variable+f'_{value}' for value in ['false','true']]

    #Relative gap between the share inside the topic and the share outside it
    outside,inside = group_counts.iloc[:,0],group_counts.iloc[:,1]
    group_counts['difference'] = (inside/outside)-1

    return group_counts.sort_values('difference',ascending=False)

def topic_regression(df,target_list,exog,controls,model,binarise=False,standardise=True,cov='HC1'):
    '''

    This function regresses topic weights (or their binarisation) on predictors.

    Arguments:
        -df: dataframe with the variables
        -target_list: list of target variables. We fit one regression per target.
        -exog: list with the exogenous variable(s). The coefficient and p-value reported
            in the output table are those of the first element of this list.
        -controls: list of control variables
        -model: statsmodels model class (OLS or Logit). TODO fix the logit
        -binarise: in case we are using logit. If not False, the value is the threshold
            above which a target is coded as 1 (a threshold of 0 is allowed).
            TODO when we binarise the highly detailed models, some of them become all zeros. This will work better
            with the more aggregate topics
        -standardise: if True we log the topic weights and turn them into z-scores
        -cov: covariance type passed to the fit method of the model

    Returns
        -A list with two elements: the list of statsmodels summaries (one per target)
        and a df with the coefficient, p-value and (pseudo) r-square of every regression,
        sorted by coefficient in descending order. The df is empty if target_list is empty.


    '''

    #Drop rows with missing values - sm doesn't like them
    df_2 = df[target_list+exog+controls].dropna(axis=0)

    #Standardise targets? The small constant avoids taking the log of zero
    if standardise:
        df_2[target_list] = (np.log(df_2[target_list]+0.00000001)).apply(zscore).astype(float)

    #Binarise targets if we are doing a logit. We compare with `is not False` so that
    #a threshold of 0 is not mistaken for 'do not binarise'
    if binarise is not False:
        df_2[target_list] = (df_2[target_list]>binarise).astype(float)

    #Extract the exogenous and controls, add constant and cast as float
    exog_controls = add_constant(df_2[exog+controls]).astype(float)

    #Predictor whose coefficient we report
    focal = exog[0]

    #OLS reports an r-square, the other models a pseudo r-square
    if model == OLS:
        fit_attribute,fit_label = 'rsquared','r_square'
    else:
        fit_attribute,fit_label = 'prsquared','pr_square'

    #Containers for the output
    out = []
    rows = []

    #One regression for each target
    for t in list(target_list):

        reg = model(endog=df_2[t],exog=exog_controls).fit(cov_type=cov,disp=0)

        out.append(reg.summary())

        rows.append([float(reg.params[focal]),float(reg.pvalues[focal]),float(getattr(reg,fit_attribute))])

    #Build the coefficient table once, after the loop (this also works with no targets)
    reg_coeff = pd.DataFrame(rows,index=list(target_list),columns=['coefficient','p_value',fit_label])

    return([out,reg_coeff.sort_values('coefficient',ascending=False)])
        
       

def plot_regression_coefficients(df,var,cov='HC1',size=(8,6)):
    '''
    Plots, as a bar chart, the regression coefficient of one predictor on every topic.

    Args:
        df is the analysis dataframe with the topics, the predictor and the controls
        var is the variable we use as predictor
        cov is the covariance type used when fitting the regressions
        size is the size of the figure

    The topics, the controls and the colour lookup come from the notebook-level
    variables `topics`, `controls` and `color_lookup`.

    '''

    #Fit one OLS per topic using the covariance type requested by the caller
    reg = topic_regression(df,topics,[var],controls,OLS,cov=cov)

    fig,ax = plt.subplots(figsize=size)

    #The second element of the output is the table of coefficients
    plot_topic_bar(reg[1]['coefficient'],cl=color_lookup,ax=ax)

    ax.set_title(f'Regression coefficient using {var} as predictor')

def topic_comparison(df,target_list,exog,concept_lookup,quantiles=np.arange(0,1.1,0.2),thres=0):
    '''
    This function compares the distribution of activity in various topics depending on an exogenous variable of interest. 
    
    Args:
        df is the dataframe with the topic mix and metadata
        target_list are the topics to consider
        exog is the variable to crosstab topics against
        concept_lookup is a df with the median proximity of each topic to the concepts
        quantiles is how we discretise the concept lookup (default value is quintiles)
        thres is the weight a paper needs to exceed for a topic to be considered present

    Returns a dict with one entry per column of concept_lookup. Each value is a pivot
    table with the quantile bins of that concept in the index and the summed paper counts
    of the topics in the bin in the cells. The columns come from the melted 'variable'
    field (presumably the values of exog - to be confirmed).
    
    '''
    
    #Copy df
    
    df_2 = df.copy()
    
    #Discretise the concept lookup: each column is cut into quantile bins (integer codes, duplicated edges dropped)
    
    conc_discr = concept_lookup.apply(lambda x: pd.qcut(x,q=quantiles,labels=False,duplicates='drop'))

    
    #Calculate levels of activity per topic based on the exog variable:
    #for each topic we count the papers above the threshold by value of exog (topics end up in the rows)
    
    topic_distr = pd.concat([pd.crosstab(df_2[exog],df_2[t]>thres)[True] for t in target_list],axis=1).T
    topic_distr.index = target_list
    
    
    #Merge the counts with the concept lookup (aligned on the topic index) and melt into long format,
    #keeping the topic and its quantile bins as identifiers.
    #NOTE(review): 'index' assumes the merged index has no name - confirm against the concept lookup
    disc = pd.melt(pd.concat([topic_distr,conc_discr],axis=1).reset_index(drop=False),id_vars=['index']+list(conc_discr.columns))
    
    #This is the dict where we store the results (one table per concept)
    store={}
    
    for c in concept_lookup.columns:
        
        #Add up the counts of all the topics in the same quantile bin and put the bins in the rows
        out = pd.pivot_table(disc.groupby([c,'variable'])['value'].sum().reset_index(drop=False),index=c,columns='variable',values='value')
        #out.apply(lambda x: x/x.sum()).plot.bar()
        
        store[c] = out
                                      
    #Output dfs with the comparisons
    return(store)

def plot_topic_bar(table,cl,ax):
    '''
    Draws one bar per topic, coloured according to the community the topic belongs to

    Args:
        table has topics in the index and the value to plot
        cl is the colour lookup between community names and colours
        ax is the axis where we draw the plot

    The community of each topic comes from the notebook-level `comms` and `comm_names`
    lookups, and the legend handles from the notebook-level `patches`. Topics in a
    community without a colour in cl are drawn in light grey.

    '''

    cols = []

    for topic in table.index:
        if comm_names[comms[topic]] in cl.keys():
            cols.append(cl[comm_names[comms[topic]]])
        else:
            cols.append('lightgrey')

    table.plot.bar(color=cols,ax=ax,width=1)

    ax.legend(handles=patches,ncol=3)

    #There are too many topics to label them individually so we hide the x ticks
    ax.set_xticks([])
    ax.set_xticklabels([])
    
    
def calculate_entropy(df,categories,category):
    '''
    Calculates the entropy inside each paper using its distribution over semantic variables
    (eg discipline, community or topic). The weights are normalised to add up to one in each paper.

    arguments:
        df is the analysis df with relevant topics and metadata
        categories are the columns (topics) that form the distribution
        category is a label stored in the 'cat' column of the output

    outputs
        A df with one row per paper, its entropy ('entropy') and the label ('cat')

    '''

    def _to_shares(row):
        #Turn the weights of one paper into shares
        return row/row.sum()

    shares = df[categories].apply(_to_shares,axis=1)

    paper_entropy = shares.apply(lambda row: entropy(row),axis=1)

    ent = pd.DataFrame(paper_entropy,columns=['entropy'])
    ent['cat'] = category

    return ent
    

## 1. Load data

`analysis_pack` contains the metadata and data that we serialised at the end of the `06` data integration notebook.

This includes:

* Community names for the communities (`index->community name`)
* Community indices for topics (`topic -> community index`)
* Filtered topic names (`topic names`)
* Network object with topic co-occurrences
* Analysis df
* arx is the enriched arXiv dataset



In [None]:
#Load the serialised analysis pack.
#NOTE(review): pickle.load can execute arbitrary code while loading, so this file should
#only ever come from a trusted source
with open('../data/processed/24_8_2019_analysis_pack.p','rb') as infile:
    analysis_pack = pickle.load(infile)

In [None]:
#Unpack the first six elements of the analysis pack, in this order:
#community names (index -> community name), community indices (topic -> community index),
#filtered topic names, topic co-occurrence network, analysis df and enriched arXiv dataset
(comm_names,
 comms,
 topics,
 network,
 data,
 arx) = (analysis_pack[position] for position in range(6))

In [None]:
#len(data['has_female'].dropna())

In [None]:
#Colour we use to plot each community
color_lookup = dict(
    deep_learning='blue',
    robotics_agents='cornflowerblue',
    computer_vision='aqua',
    symbolic='red',
    health='lime',
    social='forestgreen',
    technology='magenta',
    statistics='orange',
    language='yellow',
)

#These are the field names
field_names = [
    'field_astrophysics', 'field_biological', 'field_complex_systems',
    'field_informatics', 'field_machine_learning_data', 'field_materials_quantum',
    'field_mathematical_physics', 'field_mathematics_1', 'field_mathematics_2',
    'field_optimisation', 'field_particle_physics', 'field_physics_education',
    'field_societal', 'field_statistics_probability',
]

#Create tidy field names for legend etc (the first six characters are the 'field_' prefix)
tidy_field_lookup = {field:re.sub('_',' ',field[6:]).capitalize() for field in field_names}

#Unique community names, leaving out the 'mixed' community
community_names = [community for community in set(comm_names.values()) if community!='mixed']

tidy_comms_lookup = make_tidy_lookup(community_names)

#Legend handles: one patch per coloured community
patches = [
    mpatches.Patch(facecolor=colour, label=tidy_comms_lookup[community],edgecolor='black')
    for community,colour in color_lookup.items()
]

## 2. Analysis

In [None]:
#Share of the papers counted by value_counts where has_female is True
#(used below as the reference line in the community chart)
woman_average = data['has_female'].value_counts(normalize=True)[True]

#d = data.loc[data['top_field']=='field_informatics']

#woman_average = d['has_female'].value_counts(normalize=True)[True]

#print(woman_average)

# #(100*pd.crosstab(d['year'],d['has_female'],normalize=0)).loc[np.arange(2000,2019)][True].rolling(window=3).mean().plot()

In [None]:
#Compare the presence of papers with at least one female author inside and outside each community
woman_community_comp = cross_sectional_comp(data,'has_female',community_names,threshold=0.1)

fig,ax = plt.subplots(figsize=(10,6),ncols=2,sharey=True)

#Left panel: share (%) of papers with a female author among the papers in each community.
#We reverse the order so that the community with the biggest difference is at the top
(100*woman_community_comp.iloc[:,1][::-1]).plot.barh(ax=ax[0])

#Right panel: relative difference (%) with the papers outside the community
(100*woman_community_comp['difference'][::-1]).plot.barh(ax=ax[1])

#The red line is the share of papers with a female author in the whole dataset
ax[0].vlines(x=100*woman_average,ymin=-0.5,ymax=len(woman_community_comp),linestyle=':',color='red')
ax[0].set_xlabel('Papers with at least one female author as \n share of the total')

#set_yticklabels expects a sequence of labels so we build a list instead of passing a generator
ax[0].set_yticklabels([tidy_comms_lookup[x] for x in woman_community_comp.index[::-1]])


#The red line marks no difference between papers inside and outside the community
ax[1].vlines(x=0,ymin=-0.5,ymax=len(woman_community_comp),linestyle=':',color='red')
ax[1].set_xlabel('Representation of papers \n with at least one female author')


ax[0].set_ylabel('')

#fig.suptitle('              Representation of topics for papers with one female author',y=1.01)


save_fig('fig_9_woman_regression.pdf')

#### Comparison by topics

In [None]:
#Same comparison as for the communities, now topic by topic and with a lower threshold
woman_topic_comp = cross_sectional_comp(
    df=data,
    variable='has_female',
    topics=topics,
    threshold=0.05,
)

In [None]:
#Bar chart with the relative difference in each topic, coloured by community
fig,ax = plt.subplots(figsize=(8,5))

plot_topic_bar(
    table=woman_topic_comp['difference'],
    cl=color_lookup,
    ax=ax,
)

#No title in this chart
ax.set_title('')

#### Report regression analysis

In [None]:
#Controls for the regressions: the year plus the field variables
controls = ['year',*field_names]

In [None]:
#Coefficient of has_female in the regression for each topic
plot_regression_coefficients(df=data,var='has_female',size=(8,2.6))

#Alternative file name used previously:
#save_fig('fig_10_woman_regression.pdf')

save_fig(name='neurips_woman_regression.pdf')

## 3. Topic disciplinarity: entropy of the topic mix and female participation

In [None]:
#Entropy of the topic mix in each paper
data['entropy'] = calculate_entropy(df=data,categories=topics,category='entropy')['entropy']

#Decile edges
q = np.arange(0,1.1,0.1)

#Entropy decile, calculated only with the papers whose top field is machine learning / data
#(the rest of the papers get a missing value)
data['entropy_q'] = pd.qcut(
    data.loc[data['top_field']=='field_machine_learning_data','entropy'],
    q=q,
    labels=False,
)

In [None]:
fig,ax = plt.subplots(figsize=(8,4))

#Distribution (%) of papers over entropy deciles, separately for each value of has_female
ent_grouped = 100*pd.crosstab(data['entropy_q'],data['has_female'],normalize=1)

decile_positions = np.arange(0,len(q)-1)

#One set of markers and one dotted line per group: blue without a female author, orange with one.
#The lines are drawn on the current axes (presumably `ax`, as the figure has just been created)
for female_flag,group_colour in [(False,'blue'),(True,'orange')]:
    ax.scatter(decile_positions,ent_grouped[female_flag],c=group_colour,s=50,edgecolor='grey')
    ent_grouped[female_flag].plot(color=group_colour,linestyle=':')

#Reference line at 10% (presumably the share in each decile if papers were evenly spread)
ax.hlines(y=10,xmin=0,xmax=10,edgecolor='grey',linestyle='--')

tick_positions = np.arange(0,len(q))
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_positions)

ax.set_xlim(-0.5,9.5)

ax.set_ylabel('% of all papers')
ax.set_xlabel('Entropy decile')


save_fig('fig_11_women_disc_representation.pdf')
