In [1]:
# Core analysis packages
import numpy as np
import os, sys
import pandas as pd
from scipy import stats
from scipy.special import comb
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats import anova
# from patsy import dmatrices
import bff
import pingouin as pg
import researchpy

import networkx as nx
from sklearn.preprocessing import StandardScaler

# Plotting packages
import matplotlib.pyplot as plt
plt.rcdefaults()
import seaborn as sns 
from adjustText import adjust_text # For nonoverlapping text labels on plots
sns.set(style="ticks", color_codes=True)
sns.set_style("white")
sns.set_style({'xtick.bottom': True, 'ytick.left': True})
colorref = ["gray", "royalblue", "crimson", "goldenrod", "mediumorchid", "seagreen"]

# iPython magic commands
%matplotlib notebook
%load_ext autoreload
%autoreload 2
%autosave 30

# Font sizes (in points) shared by every figure in the notebook.
SMALL_SIZE, MEDIUM_SIZE, BIG_SIZE = 12, 12, 14

# Apply the sizes through matplotlib's rc settings.
plt.rc('font', size=SMALL_SIZE)                              # default text size
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)  # axes title / x-y labels
plt.rc('xtick', labelsize=SMALL_SIZE)                        # x tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)                        # y tick labels
plt.rc('legend', fontsize=SMALL_SIZE)                        # legend text
plt.rc('figure', titlesize=BIG_SIZE)                         # figure-level title

# Take entries 6-9 of seaborn's "Paired" palette and swap the order within
# each consecutive pair (positions 1,0,3,2 of the slice).
cust_palette = sns.color_palette("Paired")[6:10]
cust_palette = [cust_palette[1], cust_palette[0], cust_palette[3], cust_palette[2]]

def median_split(S):
    """Flag which values of `S` lie strictly above its median.

    `S` must provide a `.median()` method and support `>` comparison against
    the result (e.g. a pandas Series). Values equal to the median are flagged
    False.
    """
    cutoff = S.median()
    return S > cutoff

  **kwargs
  **kwargs


Autosaving every 30 seconds


In [2]:
def ttest_ind(x1, x2, equivar=False, alpha=0.05, printres=False):
    """Independent two-sample t-test with effect size and confidence interval.

    Parameters
    ----------
    x1, x2 : array-like
        The two independent samples.
    equivar : bool, default False
        If True, run Student's pooled-variance test; otherwise run Welch's
        unequal-variance test. Passed to `scipy.stats.ttest_ind` as
        `equal_var`.
    alpha : float, default 0.05
        Significance level; the interval returned is the (1 - alpha) CI.
    printres : bool, default False
        If True, print a one-line summary and return None instead of the tuple.

    Returns
    -------
    tuple or None
        `(t, df, p, d, ci_low, ci_high)` where `d` is Cohen's d (absolute
        mean difference over the pooled SD) and the CI is for `M1 - M2`.
        The degrees of freedom and the standard error behind the CI follow
        the test that was actually run: `n1 + n2 - 2` with the pooled SE when
        `equivar` is True, Welch-Satterthwaite df with the unpooled SE
        otherwise. This keeps the CI consistent with the reported `t` and `p`.
    """
    n1, n2 = len(x1), len(x2)
    M1, M2 = np.mean(x1), np.mean(x2)
    v1, v2 = np.var(x1, ddof=1), np.var(x2, ddof=1)  # sample variances

    # t statistic and two-sided p-value
    t, p = stats.ttest_ind(x1, x2, equal_var=equivar)

    # Cohen's d is always standardized by the pooled SD
    sp = np.sqrt(((n1 - 1) * v1 + (n2 - 1) * v2) / (n1 + n2 - 2))
    d = np.abs(M1 - M2) / sp

    # Degrees of freedom and SE of the mean difference, matched to the test
    if equivar:
        df = float(n1 + n2 - 2)
        se = sp * np.sqrt(1.0 / n1 + 1.0 / n2)
    else:
        q1, q2 = v1 / n1, v2 / n2
        df = (q1 + q2) ** 2 / (q1 ** 2 / (n1 - 1) + q2 ** 2 / (n2 - 1))
        se = np.sqrt(q1 + q2)

    # Confidence interval: (M1 - M2) +/- t_crit * SE
    diff = M1 - M2
    half_width = stats.t.ppf(1 - alpha / 2, df) * se

    res = (t, df, p, d, diff - half_width, diff + half_width)
    if printres:
        print("t = %.5f, df = %.5f, p = %.5f, d = %.5f, CI = (%.5f, %.5f)" % res)
    else:
        return res

In [15]:
# Read the Study 2 workbook, which is expected to sit next to this notebook.
# The bare file name contains no "~", so no home-directory expansion is needed.
df = pd.read_excel("Study2.xlsx")
df.head()

Unnamed: 0,Country,Rgion,GGG,GGG_inverse,PercentF,PercentM,PercentWomen_population
0,Albania,CSEurope,0.769,0.231,0.52027,0.47973,49.102995
1,Algeria,Middle East and North Africa,0.634,0.366,0.326923,0.673077,49.479405
2,Argentina,SouthAmerica,0.746,0.254,0.475,0.525,51.214146
3,Armenia,CSEurope,0.684,0.316,0.514706,0.485294,52.965341
4,Australia,AngloSaxon,0.731,0.269,0.48,0.52,50.201238


In [4]:
# Figure 2: scatter + regression line of PercentM against GGG_inverse, with
# one text label per row of `df`.
# NOTE(review): the y-limits below look like they expect raw proportions, so
# this cell presumably has to run before any cell that rescales these columns
# -- confirm.
fig, ax = plt.subplots(1, 1, figsize=(7, 5))

sns.regplot(x=df.GGG_inverse, y=df.PercentM, scatter_kws={"color": "#88678E"},
            line_kws={"color": "#88678E", "alpha": 0.7, "lw": 3}, ax=ax)

X = df.GGG_inverse
Y = df.PercentM
T = df.Country
# Walk the three columns positionally. Indexing with X[i] is label-based and
# would raise or mislabel points whenever df's index is not 0..n-1.
texthandles = [ax.text(x, y, label, ha='center', va='center', fontsize=8)
               for x, y, label in zip(X, Y, T)]
# Nudge the labels apart so they do not overlap each other.
adjust_text(texthandles, ax=ax)
ax.set_ylabel('Percent Men in Google Image Search Results')
ax.set(xlabel='Gender Inequality (Global Gender Gap Index)')
ax.set_ylim(.25, .9)

sns.despine(ax=ax)
fig.tight_layout()
fig.savefig('Figure2.pdf', format="pdf")

<IPython.core.display.Javascript object>

In [19]:
# Rescale the outcome and both predictors with scikit-learn's StandardScaler
# and write the result back into the same three columns of `df`.
# NOTE: this overwrites the original values in place, so any cell that needs
# the unscaled columns (e.g. a plot with axis limits on the raw scale) must be
# run before this one, or `df` must be reloaded first.
scaler = StandardScaler()
df[['GGG_inverse', 'PercentM', 'PercentWomen_population']] = scaler.fit_transform(df[['GGG_inverse', 'PercentM', 'PercentWomen_population']])
# Preview the first rows after rescaling.
df.head()


Unnamed: 0,Country,Rgion,GGG,GGG_inverse,PercentF,PercentM,PercentWomen_population
0,Albania,CSEurope,0.769,-0.704,0.52027,-0.710756,-0.875544
1,Algeria,Middle East and North Africa,0.634,1.70442,0.326923,0.964811,-0.642503
2,Argentina,SouthAmerica,0.746,-0.293676,0.475,-0.318439,0.431503
3,Armenia,CSEurope,0.684,0.812413,0.514706,-0.662534,1.515696
4,Australia,AngloSaxon,0.731,-0.026074,0.48,-0.361769,-0.195604


In [25]:
# Simple regression: PercentM predicted from GGG_inverse alone.
md = smf.ols(formula="PercentM ~ GGG_inverse", data=df)
mdf = md.fit()
print(mdf.summary())

                            OLS Regression Results                            
Dep. Variable:               PercentM   R-squared:                       0.250
Model:                            OLS   Adj. R-squared:                  0.235
Method:                 Least Squares   F-statistic:                     16.69
Date:                Fri, 22 Apr 2022   Prob (F-statistic):           0.000159
Time:                        09:45:20   Log-Likelihood:                -66.295
No. Observations:                  52   AIC:                             136.6
Df Residuals:                      50   BIC:                             140.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept   -5.343e-16      0.122  -4.36e-15      

In [26]:
# Same regression with PercentWomen_population added as a second predictor.
md = smf.ols(formula="PercentM ~ GGG_inverse + PercentWomen_population", data=df)
mdf = md.fit()
print(mdf.summary())

                            OLS Regression Results                            
Dep. Variable:               PercentM   R-squared:                       0.258
Model:                            OLS   Adj. R-squared:                  0.228
Method:                 Least Squares   F-statistic:                     8.541
Date:                Fri, 22 Apr 2022   Prob (F-statistic):           0.000657
Time:                        09:45:50   Log-Likelihood:                -66.009
No. Observations:                  52   AIC:                             138.0
Df Residuals:                      49   BIC:                             143.9
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept               -5

In [10]:
# Read the combined Study 1 / Study 2 workbook from the notebook's directory.
# NOTE: this rebinds `df`, replacing whatever frame was loaded before.
# The bare file name contains no "~", so no home-directory expansion is needed.
df = pd.read_excel("study12.xlsx")
df.head()

Unnamed: 0,Country,Study2,Study1
0,Albania,0.52027,
1,Algeria,0.326923,
2,Argentina,0.475,
3,Armenia,0.514706,
4,Australia,0.48,0.363636


In [11]:
# Regress the Study2 column on the Study1 column.
# NOTE(review): rows with a missing Study1 value are presumably dropped by the
# formula interface before fitting -- confirm against the reported N.
md = smf.ols(formula=" Study2 ~ Study1", data=df)
mdf = md.fit()
print(mdf.summary())

                            OLS Regression Results                            
Dep. Variable:                 Study2   R-squared:                       0.276
Model:                            OLS   Adj. R-squared:                  0.251
Method:                 Least Squares   F-statistic:                     11.06
Date:                Wed, 09 Mar 2022   Prob (F-statistic):            0.00240
Time:                        20:52:27   Log-Likelihood:                 27.889
No. Observations:                  31   AIC:                            -51.78
Df Residuals:                      29   BIC:                            -48.91
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1731      0.082      2.115      0.0