### Notebook for analyzing probabilities in our dataset.

In [None]:
# install the following packages in the environment:
# python3 -m pip install pandas
# python3 -m pip install seaborn

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import pingouin

import numpy as np
import json

import os

from read_jsondata import read_jsons

import time

#### Configure local paths

In [None]:
# Resolve the notebook's working directory with the standard library rather than
# the IPython-only `! pwd` shell escape, so the cell is valid Python and does not
# depend on a POSIX shell being available.
root = os.getcwd()

# Directory that holds the raw data handed to read_jsons().
RAW_DIR=root+"/author_allgenders/"

# Best-effort check: only warn, the notebook is allowed to continue.
if not os.path.exists(RAW_DIR):
    print("The directory {} does not exist.\nThere is no raw data for statistical analysis.".format(RAW_DIR))

#### Load publication data into pandas dataframe

In [None]:
# Load the raw data found under RAW_DIR into a dataframe via the project helper
# read_jsons (presumably one row per publication -- TODO confirm against read_jsondata).
df = read_jsons(RAW_DIR) 
# Last expression of the cell: display the loaded dataframe.
df

#### Create new columns in the dataframe extracting useful information from list of coauthors

In [None]:
# Per-paper author statistics derived from the parallel lists `all_genders`
# (one label per author) and `all_percent` (one value per author).

# Size of the author list and how many of its entries carry the label "init".
df['Number_authors'] = df['all_genders'].map(len)
df['Number_init'] = df['all_genders'].map(
    lambda labels: sum(1 for label in labels if "init" == label))

# Label and percentage of the first entry of each list ...
df['First_Author_gend'] = df['all_genders'].map(lambda labels: labels[0])
df['First_Author_perc'] = df['all_percent'].map(lambda percents: percents[0])

# ... and of the last entry.
df['Last_Author_gend'] = df['all_genders'].map(lambda labels: labels[-1])
df['Last_Author_perc'] = df['all_percent'].map(lambda percents: percents[-1])

df

#### Dropping init (unidentified initialed names)

In [None]:
# Keep only the papers whose author list contains no "init" entry; .copy()
# detaches the result so later column assignments do not touch a view.
df = df.loc[df['Number_init'] == 0].copy()
df

#### Check number of papers in each journal

In [None]:
# Print, for every distinct journal, how many rows carry that journal name.
journals = df['journal'].unique()

for jou in journals:
    papers_in_journal = df.loc[df['journal'] == jou, 'journal']
    print(jou, papers_in_journal.count())


#### It is easier if all the probabilities are expressed with respect to the same gender (female)

In [None]:
# Express the last- and first-author percentages as prob(female):
# rows labelled 'male' store prob(male), so they become 1 - value;
# every other row keeps its percentage unchanged.
for role in ('Last', 'First'):
    is_male = df[role + '_Author_gend'] == 'male'
    df[role + '_Author_probF'] = df[role + '_Author_perc']  # start from the raw percentage
    df.loc[is_male, role + '_Author_probF'] = 1 - df.loc[is_male, role + '_Author_perc']

df

## Now we can compute some interesting probabilities:

### Useful formulas:

Suppose $x_i$ refers to article $i$ and $N$ is the total number of articles. Then, by the law of total probability, the probability of an article having a female author is:

$$p(\text{female}) = \sum_{i}^N p(\text{female}|x_i) p(x_i). $$

If we have all the probabilities with respect to the female gender, then the probability of having a male author will be:

$$p(\text{male}) = \sum_{i}^N (1 - p(\text{female}|x_i)) p(x_i). $$

$p(x_i)$ is the probability of the article $x_i$. All articles have the same probability, therefore $p(x_i) = \frac{1}{N}$. This means that the formulas above are the same as taking the average of $p(\text{female}|x_i)$ or $(1 - p(\text{female}|x_i))$, respectively.


#### Let's compute some easy probabilities to start

In [None]:
# Average prob(female) / prob(male) over all articles for the first and the
# last author position (sum divided by the number of rows).
first_probF = df['First_Author_probF']
last_probF = df['Last_Author_probF']

p_ff = first_probF.sum()/df.shape[0]
p_mf = (1 - first_probF).sum()/df.shape[0]
p_fl = last_probF.sum()/df.shape[0]
p_ml = (1 - last_probF).sum()/df.shape[0]

for position, p_female, p_male in (('first', p_ff, p_mf), ('last', p_fl, p_ml)):
    print('Probability of having a female ' + position + ' author:', p_female)
    print('Probability of having a male ' + position + ' author:', p_male)

In [None]:
# The same quantities scaled by the number of articles (expected counts),
# plus the male/female ratio for each author position.
for position, p_female, p_male in (('first', p_ff, p_mf), ('last', p_fl, p_ml)):
    print('Number of articles having a female ' + position + ' author:', p_female*df.shape[0])
    print('Number of articles having a male ' + position + ' author:', p_male*df.shape[0])
    print('Ratio between them:', p_male/p_female)

#### Overall probability of female or male authorship

In [None]:
def Prob_author(x,y, kind="female"):
    """Sum, over all authors of one paper, the probability of being `kind`.

    x    : sequence of labels, one per author.
    y    : sequence of values indexable in parallel with x; each is passed to
           float() and read as the probability of the author's own label.
    kind : label whose probability is accumulated.

    An author labelled `kind` contributes float(y[i]); any other label
    contributes 1 - float(y[i]). Returns 0 for an empty x.
    """
    total = 0
    for position, label in enumerate(x):
        p_own_label = float(y[position])
        total += p_own_label if label == kind else 1 - p_own_label
    return total


# Overall authorship probabilities: per-paper sums from Prob_author, added up
# over all papers and divided by the total number of authors.
n_authors_all = df['Number_authors'].sum()

female_sum_per_paper = df.apply(
    lambda row: Prob_author(row.all_genders, row.all_percent, kind="female"), axis=1)
male_sum_per_paper = df.apply(
    lambda row: Prob_author(row.all_genders, row.all_percent, kind="male"), axis=1)

p_female_all = female_sum_per_paper.sum() / n_authors_all
p_male_all = male_sum_per_paper.sum() / n_authors_all

# Report the two probabilities and their sum as a sanity check.
print("All probabilities sum: ", p_female_all + p_male_all)
print("Overall P of female authorship: ", p_female_all)
print("Overall P of male authorship: ", p_male_all)

#### Overall probability of female or male coauthorship [not first or last]

In [None]:
def Prob_coauthor(x,y, kind="female"):
    """Sum the probability of being `kind` over the middle authors of one paper.

    Same accumulation as for the full author list, but the first and the last
    position of x are skipped. x is the sequence of labels, y the parallel
    sequence of values (each passed to float()). With fewer than three
    entries nothing is accumulated and 0 is returned.
    """
    total = 0
    for position in range(1, len(x) - 1):
        p_own_label = float(y[position])
        if x[position] == kind:
            total += p_own_label
        else:
            total += 1 - p_own_label
    return total


# Overall probabilities for middle authors (neither first nor last).
# Only papers with more than two authors have such positions.
df_co = df.loc[df['Number_authors'] > 2].copy()

# Every remaining paper contributes (number of authors - 2) middle positions.
n_coauthors = df_co['Number_authors'].sum() - 2*df_co.shape[0]

female_co_sum_per_paper = df_co.apply(
    lambda row: Prob_coauthor(row.all_genders, row.all_percent, kind="female"), axis=1)
male_co_sum_per_paper = df_co.apply(
    lambda row: Prob_coauthor(row.all_genders, row.all_percent, kind="male"), axis=1)

p_female_all_co = female_co_sum_per_paper.sum() / n_coauthors
p_male_all_co = male_co_sum_per_paper.sum() / n_coauthors

# Report the two probabilities and their sum as a sanity check.
print("All probabilities sum: ", p_female_all_co + p_male_all_co)
print("Overall P of female coauthorship: ", p_female_all_co)
print("Overall P of male coauthorship: ", p_male_all_co)

#### Make bar plot with all previous descriptive probabilities

In [None]:
# Stacked horizontal bar chart of the female / male probabilities for the
# first, coauthor and last positions computed in the cells above.
import matplotlib.patches as mpatches

sns.set(style="ticks")
sns.set_context("notebook", font_scale=1.3, rc={"lines.linewidth": 2.5})




# One row per (position, gender) pair; 'Prob' holds the matching probability.
data = {'Prob': [p_ff, p_female_all_co, p_fl, p_mf, p_male_all_co, p_ml], 
        'Gender': ['Female', 'Female', 'Female', 'Male', 'Male', 'Male'],
       'Pos': ['First', 'Coauthor','Last','First', 'Coauthor','Last']}


df_plot = pd.DataFrame(data=data)

# Background bar: estimator=sum adds the female and male rows of each position,
# so the gray bar spans the total (female + male) probability.
ax = bar1 = sns.barplot(y="Pos",  x="Prob", data=df_plot,  estimator=sum,ci = None, color='lightgray', #color='darkorange'
                        dodge = False)
# Foreground bar: only the female rows, drawn on top of the gray bar so the
# visible gray remainder corresponds to the male share.
bar2 = sns.barplot(y="Pos", x="Prob", data=df_plot[df_plot.Gender == 'Female'], ci=None,  color='rebeccapurple',dodge = False)

# The two overlaid barplots carry no hue, so the legend is built by hand.
top_bar = mpatches.Patch(color='lightgray', label='Male')
bottom_bar = mpatches.Patch(color='rebeccapurple', label='Female')
plt.legend(handles=[ bottom_bar, top_bar], bbox_to_anchor=(0.99, 0.64),
          ncol=1, fancybox=True, shadow=False)

#ax.axvline(0.2347, ls=':', color = 'olive', lw = '2.')

ax.set(xlabel='Probability', ylabel='')
sns.despine()

# Uncomment to export the figure:
#plt.savefig('./Figures/descriptive_stats_seaborn.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/descriptive_stats_seaborn.png',dpi = 300, bbox_inches="tight")

#### Probabilities of having at least one male/female author in an article

Having at least one female author refers to any coauthor combination excluding the case in which all authors are male:

$$p(\text{at least 1 female}|x_i) = 1 - p(\text{all male}|x_i)$$

Computing probability for all male coauthors is easier. In the following, we drop the dependency on $x_i$ for clarity.

$$p(\text{all male}) = p(\text{male}_1)p(\text{male}_2|\text{male}_1)p(\text{male}_3|\text{male}_1,\text{male}_2)... = \prod_i^n p(\text{male}_i)$$

where n is the number of authors and the last step assumes that the gender probability of each authorship is independent of the gender of other coauthors (just to simplify the problem). 

In [None]:
#Define functions to multiply probabilities in each row

def Prob_atleast(x,y, gender_atleast, gender_other):
    """Probability that at least one author of a paper is `gender_atleast`.

    x is the sequence of labels and y the parallel sequence of values (each
    passed to float() and read as the probability of the author's own label).
    The product of every author's probability of being `gender_other` is
    built up -- y[i] for authors labelled `gender_other`, 1 - y[i] for authors
    labelled `gender_atleast`; any other label is ignored -- and the
    complement of that product is returned.
    """
    p_nobody = 1
    for position, label in enumerate(x):
        if label == gender_other:
            factor = float(y[position])
        elif label == gender_atleast:
            factor = 1 - float(y[position])
        else:
            continue  # unknown label: contributes nothing to the product
        p_nobody *= factor
    return 1 - p_nobody


def Prob_atleast_bin(x,y, gender_atleast, gender_other):
    """Binary variant of the "at least one `gender_atleast` author" probability.

    Every per-author value is first rounded to 0 or 1 (hard assignment), then
    the same product as in the probabilistic version is formed: round(y[i])
    for authors labelled `gender_other`, 1 - round(y[i]) for authors labelled
    `gender_atleast`; any other label is ignored. Returns 1 - product.

    Values are coerced with float() before rounding, matching the other
    probability helpers in this notebook, so percentages stored as strings
    no longer raise a TypeError.

    Note: Python's round() rounds exact halves to the nearest even integer,
    so a value of exactly 0.5 is rounded down to 0.
    """
    prod = 1
    for i, elem in enumerate(x):
        if elem == gender_other:
            prod *= round(float(y[i]))
        elif elem == gender_atleast:
            prod *= 1 - round(float(y[i]))
    return 1 - prod


# One column per (helper, gender) combination: the probabilistic and the
# binary (rounded) chance that a paper has at least one female / male author.
atleast_specs = (
    ('Prob_atleast_Fauthor', Prob_atleast, "female", "male"),
    ('Prob_atleast_Mauthor', Prob_atleast, "male", "female"),
    ('Prob_atleast_Fauthor_binary', Prob_atleast_bin, "female", "male"),
    ('Prob_atleast_Mauthor_binary', Prob_atleast_bin, "male", "female"),
)

for column, helper, wanted, other in atleast_specs:
    # apply() runs immediately, so the loop variables are read right away.
    df[column] = df.apply(
        lambda row: helper(row.all_genders, row.all_percent, wanted, other), axis=1)

df

#### Compute overall probabilities

In [None]:
# Average the per-paper "at least one" probabilities over all articles.
p_atleast_f = df['Prob_atleast_Fauthor'].sum()/df.shape[0]
p_atleast_m = df['Prob_atleast_Mauthor'].sum()/df.shape[0]

for gender, p_atleast in (('female', p_atleast_f), ('male', p_atleast_m)):
    print('Probability of having at least one ' + gender + ' author in an article', p_atleast)

print('or the opposite...')

# "all female" is the complement of "at least one male" and vice versa.
for gender, p_atleast_other in (('female', p_atleast_m), ('male', p_atleast_f)):
    print('Probability of having all ' + gender + ' authors in an article', 1 - p_atleast_other)

#### Plot previous results

In [None]:
# Grouped bar chart: "at least one" vs "all" for each gender.
data = {
    'Prob': [p_atleast_f, 1 - p_atleast_m, p_atleast_m, 1 - p_atleast_f],
    'Gender': ['Female', 'Female', 'Male', 'Male'],
    'Pos': ['At least one', 'All', 'At least one', 'All'],
}
df_plot = pd.DataFrame(data=data)

ax = sns.barplot(
    data=df_plot, x="Pos", y="Prob", hue='Gender',
    ci=None, palette=['rebeccapurple', 'lightgray'],
)
ax.set(xlabel='', ylabel='Probability')

# Re-create the legend on the current axes and blank out its title.
plt.gca().legend().set_title('')

sns.despine()

#plt.savefig('./Figures/descriptive_stats_atleast1_all.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/descriptive_stats_atleast1_all.png',dpi = 300, bbox_inches="tight")

#### Absolute numbers:

In [None]:
# Expected article counts (probability times number of articles) and ratios.
p_all_f_articles = 1 - p_atleast_m
p_all_m_articles = 1 - p_atleast_f

print('Number of articles with all female authors:', p_all_f_articles*df.shape[0])
print('Number of articles with all male authors:', p_all_m_articles*df.shape[0])
print('Ratio between them:', p_all_m_articles/p_all_f_articles)

for gender, p_atleast in (('female', p_atleast_f), ('male', p_atleast_m)):
    print('Number of articles with at least one ' + gender + ' authors:', p_atleast*df.shape[0])
print('Ratio between them:', p_atleast_m/p_atleast_f)

#### Plot in a nicer way:

In [None]:
# Pie chart of the expected number of mixed / all-male / all-female papers.
labels = ['Mixed', 'All male', 'All female']
colors = ['rosybrown','lightgray','rebeccapurple']
explode = (0., 0., 0.)

n_all_male = (1 - p_atleast_f)*df.shape[0]
n_all_female = (1 - p_atleast_m)*df.shape[0]
# "Mixed" is whatever remains once both single-gender groups are removed.
sizes = [df.shape[0] - n_all_male - n_all_female, n_all_male, n_all_female]

fig1, ax1 = plt.subplots()
patches, texts, pcts = ax1.pie(
    sizes, explode=explode, labels=labels, autopct='%1.1f%%',
    shadow=False, startangle=90, labeldistance=None, colors=colors,
    wedgeprops={'linewidth': 2.0, 'edgecolor': 'white'}, rotatelabels=True,
)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax1.legend(bbox_to_anchor=(0.8, 0.95), ncol=1, fancybox=True, shadow=False)

# Style the three percentage labels; the third one is moved by hand.
plt.setp(pcts, color='black')
pcts[2].set_position((0.1,1.1))
for wedge_index in (0, 1, 2):
    pcts[wedge_index].set_fontsize(14)

#plt.savefig('./Figures/descriptive_stats_atleast1_all_pieplot.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/descriptive_stats_atleast1_all_pieplot.png',dpi = 300, bbox_inches="tight")

#### In between: overall probabilities of female / male authors per number of authors in papers

In [None]:
# Observed vs theoretical share of all-female / mixed / all-male papers as a
# function of the number of authors. "Theory" assumes every author position
# is filled independently with the overall probabilities p_female_all and
# p_male_all.

# --- A few theoretical reference values ------------------------------------
print("=" * 100)
print("Probability that a paper with x author(s) has at least 1 female author:")
print("x=1: p=", round(1. - p_male_all, 3), ";   x=2: p=", round(1. - p_male_all ** 2, 3),
      ";   x=3: p=", round(1. - p_male_all ** 3, 3), ";   x=10: p=", round(1. - p_male_all ** 10, 3))
print("=" * 100)
# and the reverse:
print("Probability that a paper with x author(s) has only female authors:")
print("x=1: p=", round(p_female_all, 3), ";   x=2: p=", round(p_female_all ** 2, 3),
      ";   x=3: p=", round(p_female_all ** 3, 3), ";   x=10: p=", round(p_female_all ** 10, 3))
print("=" * 100)

# --- Author counts present in the data -------------------------------------
# At most the 20 smallest distinct author counts are analysed.
n_authors_data = np.sort(df.Number_authors.unique())[0:20]

# --- Theoretical probabilities ----------------------------------------------
# Evaluated at the observed author counts, so theory and data always have the
# same length and refer to the same n, even when counts are missing or fewer
# than 20 distinct values exist.
n_authors = n_authors_data
p_all_f_given_n = p_female_all ** n_authors
p_all_m_given_n = p_male_all ** n_authors
p_one_f_given_n = 1. - p_all_m_given_n
p_one_m_given_n = 1. - p_all_f_given_n

# --- Observed probabilities per author count -------------------------------
p_atleast_f_per_n = []
p_atleast_m_per_n = []
p_all_f_per_n = []
p_all_m_per_n = []
nr_papers_n = []
for i in n_authors_data:
    papers_n = df[df.Number_authors == i]  # filter once per author count
    p_atleast_f_per_n.append(papers_n.Prob_atleast_Fauthor.mean())
    p_atleast_m_per_n.append(papers_n.Prob_atleast_Mauthor.mean())
    # "all female" is the complement of "at least one male" and vice versa
    p_all_f_per_n.append((1. - papers_n.Prob_atleast_Mauthor).mean())
    p_all_m_per_n.append((1. - papers_n.Prob_atleast_Fauthor).mean())
    nr_papers_n.append(len(papers_n))
# The *_binary columns can be averaged in the same way if the hard-assignment
# variant is wanted.

# --- Table used for plotting -----------------------------------------------
dummy = pd.DataFrame()
dummy["n"] = n_authors_data
# barplot treats "n" as categorical and draws bar k at x = k, so the theory
# markers use the bar position rather than n - 1 (identical when the author
# counts are 1, 2, 3, ... without gaps).
dummy["n_t"] = np.arange(len(n_authors_data))
dummy["all_f"] = p_all_f_per_n
dummy["all_f_theory"] = p_all_f_given_n
dummy["all_m"] = p_all_m_per_n
dummy["all_m_theory"] = p_all_m_given_n
dummy["all_m_plot"] = dummy["all_m"] + dummy["all_f"]
# Upper edge of the theoretical "mixed" band: everything except all-male.
dummy["all_m_theory_plot"] = 1. - dummy["all_m_theory"]
dummy["mixed"] = 1. - dummy["all_m"] - dummy["all_f"]
dummy["mixed_plot"] = 1.
dummy["all_m_plot2"] = 1.  # full-height background bar (all male on top)
dummy["mixed_plot2"] = dummy["mixed"] + dummy["all_f"]

## Plot results:
# Hide the first upper marker. Written through .loc on the frame itself so the
# assignment also takes effect under pandas copy-on-write.
dummy.loc[dummy.index[0], "all_m_theory_plot"] = np.nan

# Bars are overlaid back to front: all male (full height), mixed + all female,
# and finally all female.
sns.barplot(y="all_m_plot2", x="n", data=dummy ,color="lightgrey", dodge=False)
sns.barplot(y="mixed_plot2", x="n", data=dummy ,color="rosybrown", dodge=False)
sns.barplot(y="all_f",x="n", data=dummy, color="rebeccapurple")
# White markers: theoretical band edges.
sns.scatterplot(x="n_t", y="all_f_theory", data=dummy, marker="o", linewidth=1, zorder=10, 
                color="white", edgecolor="k")
sns.scatterplot(x="n_t", y="all_m_theory_plot", data=dummy, marker="o", linewidth=1,zorder=10, 
                color="white", edgecolor="k")
plt.ylabel("Probability")

sns.despine()
#plt.xticks([i for i in range(0, 13, 2)], [i for i in range(0, 13, 2)])
plt.xlim(-0.5, 11.5)  # show the first 12 bars only
plt.xlabel("Number of authors")
print(dummy["n"].values)
print(nr_papers_n)

#plt.savefig('./Figures/probs_vs_nauthors.pdf',dpi = 300, bbox_inches="tight")

#### plot biases to better understand the figures above

In [None]:
# Bar chart of (theoretical - observed) all-male probability per author count.
plt.figure()
ax = plt.subplot(111)

discrepancy_all_m = dummy.all_m_theory - dummy.all_m
ax.bar(dummy.n, discrepancy_all_m, color="rebeccapurple")
#plt.bar(dummy.n, dummy.all_f_theory - dummy.all_f, color="lightgray")

ax.set_xlim(0, 12.5)
ax.set_ylim(-0.08, 0.08)
#plt.legend(["At least one female", "At least one male"])#["P(f | n, obs) - P(f | n)", "P(m | n, obs) - P(m | n)"])
even_ticks = list(range(0, 13, 2))
plt.xticks(even_ticks, even_ticks)
ax.set_xlabel("Number of authors")
ax.set_ylabel("Discrepancy")

# Zero reference line.
ax.hlines(y=0.0, xmin=0., xmax=13, linestyles="-", linewidth=0.75, color="k")
# plt.title("Bias at least one female")
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
#plt.savefig("./Figures/bias_pAtLeast1Female.pdf", bbox_inches="tight", dpi=150)

#### check how many papers in each bin

In [None]:
# Number of papers for each author count from 1 to 12.
for i in range(12):
    n_auth = i + 1
    cond = df['Number_authors'] == n_auth
    print("N = ", len(df[cond].values), " for number of authors ", n_auth)

#### consequences:

In [None]:
import math

# Smallest whole n such that p**n <= 0.05, i.e. n >= log(0.05) / log(p),
# where p is the overall probability of the *other* gender.
min_authors_for_female = math.ceil(np.log(0.05)/np.log(p_male_all))
min_authors_for_male = math.ceil(np.log(0.05)/np.log(p_female_all))

print('A female author will have 95% probability of being in a publication if the article has at least',
      min_authors_for_female, 'auhtors')

print('A male author will have 95% probability of being in a publication if the article has at least',
      min_authors_for_male, 'auhtors')

#### average number of authors in dataset

In [None]:
# Total number of authors divided by the number of articles.
total_author_slots = df['Number_authors'].sum()
mean_articles = total_author_slots/df.shape[0]

print('Average number of authors for articles', mean_articles)

#### We can compute the same probabilities per each journal

In [None]:
journals = df['journal'].unique()  # distinct journal names

# Target column -> per-paper source column that is averaged within a journal.
journal_averages = (
    ('P_atleast_F_journal', 'Prob_atleast_Fauthor'),
    ('P_atleast_M_journal', 'Prob_atleast_Mauthor'),
    ('P_first_F_journal', 'First_Author_probF'),
    ('P_last_F_journal', 'Last_Author_probF'),
)

for i in journals:
    # Journal-level values are written back to every row of that journal.
    cond = df['journal']==i
    print("N = ", len(df[cond].values), " for journal ", i)
    for target, source in journal_averages:
        df.loc[cond, target] = df.loc[cond, source].sum()/df[cond].shape[0]
    df.loc[cond,'Number_authors_journal'] = df.loc[cond,'Number_authors'].mean()

    # Same idea for middle authors, using the >2-author subset df_co.
    cond = df_co['journal']==i
    papers_co = df_co[cond]
    n_coauthors = papers_co.Number_authors.sum() - 2*papers_co.shape[0]

    # Sum of the female probabilities over all middle positions / their number.
    df_co.loc[cond,'P_coauthor_F_journal'] = papers_co.apply(
        lambda row: Prob_coauthor(row.all_genders, row.all_percent, kind="female"), axis=1).sum() / n_coauthors

#### Plot bar plots for fixed number of authors

In [None]:
## Pool Nature, Science and NatGeo under one label so the pooled bin holds more papers

df['journal2'] = df['journal']
pooled_journals = df['journal2'].isin(['Nature', 'Science', 'NatGeo'])
df.loc[pooled_journals, 'journal2'] = 'Nat/Sci'

In [None]:
### First author
# Horizontal bar chart: mean First_Author_probF per (grouped) journal.

# Display order of the journals on the y axis; also reused by the following
# per-journal plots (presumably sorted by impact factor, judging by the name --
# TODO confirm).
dict_IF = ['Nat/Sci', 'EPSL', 'GRL', 
        'JGR', 'G3', 'SRL', 'Tectp', 'SE', 
       'GEOPH.', 'GJI', 'BSSA', 'PEPI']

# `figure` is imported here and used unqualified by later cells as well.
from matplotlib.pyplot import figure

figure(figsize=(4, 4))


# ci=None: no confidence intervals are drawn.
ax = sns.barplot(y="journal2", x="First_Author_probF",  data=df, order=dict_IF, color = 'rebeccapurple',capsize=.2, errwidth = 2,
                ci = None)

plt.xlim([0,0.5])

# NOTE(review): `patches` and `lines_per_err` are assigned but not used in this cell.
patches = ax.patches
lines_per_err = 3

# Recolor any line artists already on the axes (e.g. error bars) in white.
for i, line in enumerate(ax.get_lines()):
    line.set_color('white')
    

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

plt.grid(alpha=0.5)


# Dotted vertical reference line at the overall value p_ff.
ax.axvline(p_ff, ls=':', color = 'black', lw = '2.')  #a line in the overall value

ax.set(xlabel='Probability', ylabel='')
#ax.set_title('Prob. first female author')

#plt.savefig('./Figures/First_f_perJournal.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/First_f_perJournal.png',dpi = 300, bbox_inches="tight")

In [None]:
### Last author
# Horizontal bar chart: mean Last_Author_probF per (grouped) journal.

figure(figsize=(4, 4))

ax = sns.barplot(
    data=df, x="Last_Author_probF", y="journal2", order=dict_IF,
    color='rebeccapurple', ci=None, capsize=.2, errwidth=2,
)
ax.set_xlim([0, 0.5])

patches = ax.patches
lines_per_err = 3

# Recolor any line artists already on the axes in white.
for i, line in enumerate(ax.get_lines()):
    line.set_color('white')

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

ax.grid(alpha=0.5)

# Dotted vertical reference line at the overall value p_fl.
ax.axvline(p_fl, ls=':', color='black', lw='2.')

ax.set(xlabel='Probability', ylabel='')
#ax.set_title('Prob. last female author')

#plt.savefig('./Figures/Last_f_perJournal.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/Last_f_perJournal.png',dpi = 300, bbox_inches="tight")

In [None]:
### co-authors
# Horizontal bar chart: P_coauthor_F_journal per (grouped) journal, based on
# the >2-author subset df_co.

figure(figsize=(4, 4))

# Same journal pooling as for df: Nature, Science and NatGeo share one label.
df_co['journal2'] = df_co['journal']
pooled_journals_co = df_co['journal2'].isin(['Nature', 'Science', 'NatGeo'])
df_co.loc[pooled_journals_co, 'journal2'] = 'Nat/Sci'

ax = sns.barplot(
    data=df_co, x="P_coauthor_F_journal", y="journal2", order=dict_IF,
    color='rebeccapurple', ci=None, capsize=.2, errwidth=2,
)
ax.set_xlim([0, 0.5])

patches = ax.patches
lines_per_err = 3

# Recolor any line artists already on the axes in white.
for i, line in enumerate(ax.get_lines()):
    line.set_color('white')

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

ax.grid(alpha=0.5)

# Dotted vertical reference line at the overall value p_female_all_co.
ax.axvline(p_female_all_co, ls=':', color='black', lw='2.')

ax.set(xlabel='Probability', ylabel='')
#ax.set_title('Prob. female coauthor')

#plt.savefig('./Figures/Coauthor_f_perJournal.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/Coauthor_f_perJournal.png',dpi = 300, bbox_inches="tight")

In [None]:
### At least one female author
# Stacked horizontal bars per journal built by overlaying three barplots
# (widest first): all male / mixed / all female.

# all_f: complement of "at least one male" within the journal.
df['all_f'] = 1 - df["P_atleast_M_journal"]
# all_m is only needed temporarily to derive `mixed` ...
df['all_m'] = 1 - df["P_atleast_F_journal"]
# ... `mixed` is the right edge of the mixed band (= 1 - all-male share,
# i.e. it still includes the all-female part that is drawn on top later).
df['mixed'] = 1- df['all_m'] 
# all_m is then overwritten with 1 so it serves as the full-width background bar.
df['all_m'] = 1 


# Back layer: full-width gray bar.
ax = sns.barplot(y="journal2", x="all_m",  data=df, order=dict_IF, color = 'lightgray',capsize=.2, errwidth = 2,
                ci = None)

# Middle layer: covers everything except the all-male share.
ax = sns.barplot(y="journal2", x="mixed",  data=df, order=dict_IF, color = 'rosybrown',capsize=.2, errwidth = 2,
                ci = None)

# Front layer: all-female share.
sns.barplot(y="journal2", x="all_f",  data=df, order=dict_IF, color = 'rebeccapurple',capsize=.2, errwidth = 2,
ci = None)
plt.xlim([0,1])

ax.set(xlabel='Probability', ylabel='')
ax.set_title('Prob. at least one female author')

#plt.savefig('./Figures/Atleast1_f_perJournal_orderIF_mixed.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/Atleast1_f_perJournal_orderIF_mixed.png',dpi = 300, bbox_inches="tight")

In [None]:
### Average number of authors per (grouped) journal

ax = sns.barplot(
    data=df, x="Number_authors", y="journal2", order=dict_IF,
    estimator=np.mean, color='c', ci=None, capsize=.2, errwidth=2,
)
ax.set(xlabel='Number authors', ylabel='')

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

ax.grid(alpha=0.5)
ax.set_xlim([0, 5.01])

#plt.savefig('./Figures/NumAuthors_perJournal.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/NumAuthors_perJournal_orderNumAuthor.png',dpi = 300, bbox_inches="tight")

#### See the temporal trend of having at least one F or one M in a publication for all journals per year


In [None]:
years = df['year'].unique()  # distinct publication years
years.sort()
print(years)

for i in years:
    # Year-level values are written back to every row of that year.
    cond = df['year']==i
    n_papers_year = df[cond].shape[0]
    print("Number of articles = ", len(df[cond].values), " for year ", i)

    p_any_f = df.loc[cond,'Prob_atleast_Fauthor']
    p_any_m = df.loc[cond,'Prob_atleast_Mauthor']
    df.loc[cond,'P_atleast_F_year'] = p_any_f.sum()/n_papers_year
    df.loc[cond,'P_atleast_M_year'] = p_any_m.sum()/n_papers_year
    # "all male" is the complement of "at least one female" and vice versa.
    df.loc[cond,'All_M_year'] = (1 - p_any_f).sum()/n_papers_year
    df.loc[cond,'All_F_year'] = (1 - p_any_m).sum()/n_papers_year
    df.loc[cond,'Mixed_year'] = (1 - df.loc[cond,'All_M_year'] - df.loc[cond,'All_F_year']).sum()/n_papers_year
    df.loc[cond,'Number_authors_year'] = df.loc[cond,'Number_authors'].mean()

    # Same idea for middle authors, using the >2-author subset df_co.
    cond = df_co['year']==i
    papers_co = df_co[cond]
    n_coauthors = papers_co.Number_authors.sum() - 2*papers_co.shape[0]

    # Sum of the female probabilities over all middle positions / their number.
    df_co.loc[cond,'P_coauthor_F_year'] = papers_co.apply(
        lambda row: Prob_coauthor(row.all_genders, row.all_percent, kind="female"), axis=1).sum() / n_coauthors


#### Plot useful graphs:

In [None]:
### Prob at least one female, all male, and all female authors
# Stacked horizontal bars per year built by overlaying three barplots
# (widest first): all male / mixed / all female.

# Full-width background bar.
df['All_M_year_plot'] = 1
# Right edge of the mixed band: everything except the all-male share.
df['Mixed_year_plot'] = 1 - df['All_M_year']


# Back layer: full-width gray bar.
ax = sns.barplot(y="year", x="All_M_year_plot",  data=df, order=years, color = 'lightgray',capsize=.2, errwidth = 2,
                ci = None)

# Middle layer: covers everything except the all-male share.
ax = sns.barplot(y="year", x="Mixed_year_plot",  data=df, order=years, color = 'rosybrown',capsize=.2, errwidth = 2,
                ci = None)

# Front layer: all-female share.
sns.barplot(y="year", x="All_F_year",  data=df, order=years, color = 'rebeccapurple',capsize=.2, errwidth = 2,
ci = None)
ax.set(xlabel='Probability', ylabel='Year')
plt.xlim([0,1])

# plt.savefig('./Figures/Atleast1_f_perYear.pdf',dpi = 300, bbox_inches="tight")
# plt.savefig('./Figures/Atleast1_f_perYear.png',dpi = 300, bbox_inches="tight")

In [None]:
### Same mixed / all-male / all-female split, one pie per selected year

labels = ['Mixed', 'All male', 'All female']
colors = ['rosybrown','lightgray','rebeccapurple']
explode = (0., 0., 0.)

years_sub = ['2010','2015','2020']
fig1, ax1 = plt.subplots(1,len(years_sub),figsize=(15, 5))

#years_sub = ['2010','2012','2014','2016','2018','2020']
#fig1, ax1 = plt.subplots(2,int(len(years_sub)/2),figsize=(15, 5))

ax1 = ax1.ravel()

for i,year in enumerate(years_sub):
    cond = df['year']==year
    # The year-level columns are constant within a year; the mean reads that value.
    share_all_m = 1 - df.loc[cond,'P_atleast_F_year'].mean()
    share_all_f = 1 - df.loc[cond,'P_atleast_M_year'].mean()
    sizes = [1 - share_all_m - share_all_f, share_all_m, share_all_f]

    print(sizes)

    panel = ax1[i]
    patches, texts, pcts = panel.pie(
        sizes, explode=explode, labels=labels, autopct='%1.1f%%', normalize=True,
        shadow=False, startangle=90, labeldistance=None, colors=colors,
        wedgeprops={'linewidth': 2.0, 'edgecolor': 'white'}, rotatelabels=True,
    )
    panel.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    panel.set_title(year, y=-0.05)

    # Style the three percentage labels; the third one is moved by hand.
    plt.setp(pcts, color='black')
    pcts[2].set_position((0.1,1.1))
    for wedge_index in (0, 1, 2):
        pcts[wedge_index].set_fontsize(14)

# A single legend, attached to the last panel drawn.
ax1[i].legend(bbox_to_anchor=(0.95, 0.65), ncol=1, fancybox=True, shadow=False)

#plt.savefig('./Figures/PerYear_atleast1_all_pieplot.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/PerYear_atleast1_all_pieplot.png',dpi = 300, bbox_inches="tight")

#### Female & male first, last, and co- authorship probabilities per year 

In [None]:
# Horizontal bar chart: mean First_Author_probF per year.
figure(figsize=(4, 4))

#ax = sns.barplot(y="year", x="First_Author_probF",  data=df, estimator = sum, order=years, color = 'lightgray')
ax = sns.barplot(
    data=df, x="First_Author_probF", y="year", order=years,
    color='rebeccapurple', ci=None, dodge=False, capsize=.2, errwidth=2,
)
ax.set(xlabel='Probability first author', ylabel='Year')
ax.set_xlim([0, 0.5])

patches = ax.patches
lines_per_err = 3

# Recolor any line artists already on the axes in white.
for i, line in enumerate(ax.get_lines()):
    line.set_color('white')

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

ax.grid(alpha=0.5)

#plt.savefig('./Figures/FirstAuthor_perYear.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/FirstAuthor_perYear.png',dpi = 300, bbox_inches="tight")

In [None]:
# Horizontal bar chart: mean Last_Author_probF per year.
figure(figsize=(4, 4))
#ax = sns.barplot(y="year", x="Last_Author_probF",  data=df, estimator = sum, order=years, color = 'lightgray')
ax = sns.barplot(
    data=df, x="Last_Author_probF", y="year", order=years,
    color='rebeccapurple', ci=None, dodge=False, capsize=.2, errwidth=2,
)
ax.set(xlabel='Probability last author', ylabel='Year')
ax.set_xlim([0, 0.5])

patches = ax.patches
lines_per_err = 3

# Recolor any line artists already on the axes in white.
for i, line in enumerate(ax.get_lines()):
    line.set_color('white')

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

ax.grid(alpha=0.5)

#plt.savefig('./Figures/LastAuthor_perYear.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/LastAuthor_perYear.png',dpi = 300, bbox_inches="tight")

In [None]:
# Horizontal bar chart: P_coauthor_F_year per year (>2-author subset df_co).
figure(figsize=(4, 4))

#ax = sns.barplot(y="year", x="P_coauthor_F_year",  data=df_co, estimator = sum, order=years, color = 'lightgray',capsize=.2, errwidth = 2,
#                ci = None)
ax = sns.barplot(
    data=df_co, x="P_coauthor_F_year", y="year", order=years,
    color='rebeccapurple', ci=None, capsize=.2, errwidth=2,
)
ax.set_xlim([0, 0.5])

ax.set(xlabel='Probability coauthor', ylabel='Year')
ax.set_title('')

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

ax.grid(alpha=0.5)

#plt.savefig('./Figures/Coauthor_f_perYear.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/Coauthor_f_perYear.png',dpi = 300, bbox_inches="tight")

#### Average number of authors per year

In [None]:
# Horizontal bar chart: mean number of authors per year.
ax = sns.barplot(
    data=df, x="Number_authors", y="year", order=years,
    estimator=np.mean, color='c', ci=None, dodge=False, capsize=.2, errwidth=2,
)
ax.set(xlabel='Number authors', ylabel='Year')

for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

ax.grid(alpha=0.5)
ax.set_xlim([0, 5.01])

#plt.savefig('./Figures/NumberAuthors_perYear.pdf',dpi = 300, bbox_inches="tight")
#plt.savefig('./Figures/NumberAuthors_perYear.png',dpi = 300, bbox_inches="tight")

#### Rates of change in participation of female authors per year


In [None]:
# For every year in `years`, compute the average probability that the first and
# the last author are female (plus the male complement), write that yearly value
# onto each of the year's rows (columns P_<role>_F_year / P_<role>_M_year), and
# print it. The augmented frame is then exported to a dated CSV file.
for i in years:  # one pass per publication year
    cond = (df['year'] == i)
    # alternatively: Do not remove Init and then probabilities do not sum to 1.

    n_articles = int(cond.sum())
    print("Number of articles = ", n_articles, " for year ", i)
    if n_articles == 0:
        # No rows for this year: there is nothing to average, and reading the
        # first stored value back would raise IndexError. Skip it.
        continue

    for role, prob_col in (('first', 'First_Author_probF'), ('last', 'Last_Author_probF')):
        prob_female = df.loc[cond, prob_col]
        # sum / row count, as before (rows are counted even if a value is missing)
        p_female = prob_female.sum() / n_articles
        p_male = (1. - prob_female).sum() / n_articles

        df.loc[cond, 'P_' + role + '_F_year'] = p_female
        df.loc[cond, 'P_' + role + '_M_year'] = p_male

        # sanity check, if needed: p_female + p_male should be 1
        print("Probability that " + role + " author is female: ", p_female)
        print("Probability that " + role + " author is male: ", p_male)

# Side effect: writes analysis_output_<YYYY-MM-DD>.csv into the working directory.
df.to_csv("analysis_output_" + time.strftime("%Y-%m-%d.csv"))

In [None]:
def _fit_yearly_trend(frame, value_col):
    """Regress a per-year statistic on the publication year.

    One row per year is taken from ``frame`` so that every year is paired with
    its own value. Building x and y from two independent ``.unique()`` calls
    (the previous approach) only lines up when no two years share a value.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``year`` column and ``value_col``. ``value_col`` is
        assumed to be constant within a year (the first row of each year is
        used) -- TODO confirm for any new column passed here.
    value_col : str
        Name of the per-year statistic.

    Returns
    -------
    The ``scipy.stats.linregress`` result: unpacks to (slope, intercept,
    rvalue, pvalue, stderr) and also exposes ``intercept_stderr``.
    """
    per_year = frame.drop_duplicates(subset='year')
    return stats.linregress(per_year['year'].astype('float'), per_year[value_col])


# NOTE(review): the number printed after 'SD:' below is linregress's standard
# error of the slope, not a standard deviation.

fit_1 = _fit_yearly_trend(df, 'P_first_F_year')
slope_1, intercept_1, r_value_1, p_value_1, std_err_1 = fit_1
intercept_stderr_1 = fit_1.intercept_stderr

print('Pearson correlation coefficients and increasing rate:')
print('\n')

print('Correlation between first female and year (r,p-value):', r_value_1, p_value_1)
print('Increasing rate first female per year (%):', slope_1*100, ' SD:', std_err_1*100)


fit_l = _fit_yearly_trend(df, 'P_last_F_year')
slope_l, intercept_l, r_value_l, p_value_l, std_err_l = fit_l
intercept_stderr_l = fit_l.intercept_stderr

print('\n')
print('Correlation between last female and year (r,p-value):', r_value_l, p_value_l)
print('Increasing rate last female per year (%):', slope_l*100, ' SD:', std_err_l*100)

# Coauthor probabilities live in their own frame, df_co.
fit_co = _fit_yearly_trend(df_co, 'P_coauthor_F_year')
slope_co, intercept_co, r_value_co, p_value_co, std_err_co = fit_co
intercept_stderr_co = fit_co.intercept_stderr

print('\n')
print('Correlation between coauthor female and year (r,p-value):', r_value_co, p_value_co)
print('Increasing rate coauthor female per year (%):', slope_co*100, ' SD:', std_err_co*100)


fit_au = _fit_yearly_trend(df, 'Number_authors_year')
slope_au, intercept_au, r_value_au, p_value_au, std_err_au = fit_au
intercept_stderr_au = fit_au.intercept_stderr

print('\n')
print('Correlation between number of authors and year (r,p-value):', r_value_au, p_value_au)
# NOTE(review): this slope is in authors per year; the x100 and the '(%)' label
# are kept as they were, but are probably not meaningful for a count -- confirm.
print('Increasing rate number of authors per year (%):', slope_au*100, ' SD:', std_err_au*100)



fit_alf = _fit_yearly_trend(df, 'P_atleast_F_year')
slope_alf, intercept_alf, r_value_alf, p_value_alf, std_err_alf = fit_alf
intercept_stderr_alf = fit_alf.intercept_stderr

print('\n')
print('Correlation between prob. at least one female author and year (r,p-value):', r_value_alf, p_value_alf)
print('Increasing rate prob. at least one female author per year (%):', slope_alf*100, ' SD:', std_err_alf*100)

fit_alm = _fit_yearly_trend(df, 'P_atleast_M_year')
slope_alm, intercept_alm, r_value_alm, p_value_alm, std_err_alm = fit_alm
intercept_stderr_alm = fit_alm.intercept_stderr

print('\n')
print('Correlation between prob. at least one male author and year (r,p-value):', r_value_alm, p_value_alm)
print('Increasing rate prob. at least one male author per year (%):', slope_alm*100, ' SD:', std_err_alm*100)



In [None]:
# Linear trend of the yearly `Mixed_year` value against publication year.
# One row per year is taken so that each year is paired with its own value;
# pairing two independent `.unique()` arrays only lines up when no two years
# share a value. `Mixed_year` is assumed constant within a year -- TODO confirm.
mixed_per_year = df.drop_duplicates(subset='year')
fit_mix = stats.linregress(mixed_per_year['year'].astype('float'), mixed_per_year['Mixed_year'])

# The result unpacks to five values; intercept_stderr is only available as an attribute.
slope_mix, intercept_mix, r_value_mix, p_value_mix, std_err_mix = fit_mix
intercept_stderr_mix = fit_mix.intercept_stderr


print('Correlation between prob. mixed-gender-authored publications and year (r,p-value):', 
      r_value_mix, p_value_mix)
# NOTE(review): the number printed after 'SD:' is the standard error of the slope.
print('Increasing rate prob. mixed-gender-authored publications per year (%):', slope_mix*100, ' SD:', std_err_mix*100)



Linear trends are statistically significant. 

The rates of increase per year are weirdly consistent for first and last authors: 0.28 % for first authors and 0.31 % for last authors. For coauthors, the increase is almost twice as fast: 0.556 %.

# When would seismology reach parity?

In [None]:
# Extrapolate each fitted line (value = intercept + slope * year) to a target
# level and report how many years after the reference year it is reached.
reference_year = 2022    # year the "in N years" figures are counted from
parity_level = 0.5       # equal likelihood of female and male
near_all_level = 0.98    # level the printouts below describe as "all papers"


def _year_reaching(level, intercept, slope):
    """Return the year at which the line intercept + slope * year equals `level`."""
    return (level - intercept) / slope


year_parity_first = _year_reaching(parity_level, intercept_1, slope_1)
year_parity_last = _year_reaching(parity_level, intercept_l, slope_l)
year_parity_co = _year_reaching(parity_level, intercept_co, slope_co)
year_parity_alf = _year_reaching(near_all_level, intercept_alf, slope_alf)
year_parity_alm = _year_reaching(near_all_level, intercept_alm, slope_alm)

print('Female and male first authorship will be equally likely in', year_parity_first - reference_year, 'years')

print('Female and male last authorship will be equally likely in', year_parity_last - reference_year, 'years')

print('Female and male coauthorship will be equally likely in', year_parity_co - reference_year, 'years')

print('Female coauthors will appear in all papers in', year_parity_alf - reference_year, 'years')

print('Male coauthors will appear in all papers in', year_parity_alm - reference_year, 'years')

In [None]:
# Year at which the fitted mixed-gender line (intercept_mix + slope_mix * year)
# reaches 0.98, the level the message below describes as "all papers".
level_gap_mix = 0.98 - intercept_mix
year_parity_mix = level_gap_mix / slope_mix
# Reported relative to 2022.
years_until_mix = year_parity_mix - 2022
print('Mixed-gender authors will appear in all papers in', years_until_mix, 'years')

### Oh...in just about 75 - 95 years! 
While the rate of increase in last authorship is ever so slightly higher, the level of last authorship is lower to begin with, so reaching parity will take a bit longer.