In [1]:
import os  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path  # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

for _ in range(5):

    parent_path = str(Path.cwd().parents[_]).split('/')[-1]

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

        code_dir = str(Path.cwd().parents[_])

        if code_dir is not None:
            break

sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [2]:
from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module import researchpy_fork as rp # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module import specification_curve_fork as specy # type:ignore # isort:skip # fmt:skip # noqa # nopep8


Using MPS


0it [00:00, ?it/s]

<Figure size 640x480 with 0 Axes>

# Functions

In [3]:
# Function to order categories
def categorize_df_results_gender_age(df, gender_order=None, age_order=None, ivs=None):
    if gender_order is None:
        gender_order = ['Female', 'Mixed Gender', 'Male']
    if age_order is None:
        age_order = ['Older', 'Mixed Age', 'Younger']
    if ivs is None:
        ivs = ['Gender', 'Age']
    # Arrange Categories
    for iv in ivs:
        if iv == 'Gender':
            order = gender_order
        elif iv == 'Age':
            order = age_order
        try:
            df[iv] = df[iv].astype('category').cat.reorder_categories(order, ordered=True)

            df[iv] = pd.Categorical(
                df[iv], categories=order, ordered=True
            )
            df[f'{iv}_Num'] = pd.to_numeric(df[iv].cat.codes).astype('int64')
        except ValueError as e:
            print(e)

    return df


# READ DATA

In [4]:
with open(f'{data_dir}df_manual_len.txt', 'r') as f:
    df_manual_len = int(f.read())

df_manual = pd.read_pickle(f'{df_save_dir}df_manual_for_training.pkl')
assert len(df_manual) == df_manual_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_manual_len} BUT IS OF LENGTH {len(df_manual)}'
print(f'Dataframe loaded with shape: {df_manual.shape}')


Dataframe loaded with shape: (5947, 68)


In [5]:
with open(f'{data_dir}df_jobs_len.txt', 'r') as f:
    df_jobs_len = int(f.read())

# df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
print(f'Dataframe loaded with shape: {df_jobs.shape}')


Dataframe loaded with shape: (307300, 83)


In [6]:
df_manual = categorize_df_results_gender_age(df_manual)

In [7]:
df_jobs = categorize_df_results_gender_age(df_jobs)

## Set dataframes

In [8]:
dataframes = {
    'df_manual': df_manual,
    'df_jobs': df_jobs,
}

## Analysis plan:

1. [Descriptives, visualizations, and tables](./1.%20descriptives_visualization_and_tables.ipynb)
2. [Frequencies and Normality tests](./2.%20frequencies_and_normality_test.ipynb)
   1. Frequencies, histograms, and QQ plots
      * Normal test
      * Kurtosis test
      * Shapiro
      * Anderson
      * Bartlett
   2. Correlation between independent variables (ivs) and control variables and Multicolinarity test
      * Pearson's R
      * VIF
     - **ivs_dummy** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
     - **ivs_perc** (continous ratio) = Social category percentage per sector (0-100)
     - **% Sector per Workforce** (continous ratio) = Sector percentage per worksforce (0-100)
     - **num_words** (continous ratio) = Number of words in job description
     - **English Requirement in Sentence** (binary nominal) = English requirement in job description (0 vs. 1)
     - **Dutch Requirement in Sentence** (binary nominal) = Dutch requirement in job description (0 vs. 1)

3. [Secondary Analysis](./3.%20chisqt_anova_and_regression.ipynb)

   1. Chi-square
      * df_manual:
        - **dvs** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - **ivs** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
      * df_jobs:
        - **dvs** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - **ivs** (binary nominal) = Social category designation (Female, Male, Mixed Gender)

   2. One-way ANOVA, interactions, and post-hoc test
      * df_manual:
        - **dvs** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - **ivs** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
          - If Levene's test is *not significant*, use classic ANOVA and Tukey's post hoc test
          - If Levene's test is *significant*, use Welch's and Kruskal-Wallis ANOVA and Games Howell's post hoc test
      * df_jobs:
         - **dvs_prob** (continous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
         - **ivs** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
           - If Levene's test is *not significant*, use classic ANOVA and Tukey's post hoc test
           - If Levene's test is *significant*, use Welch's and Kruskal-Wallis ANOVA and Games Howell's post hoc test

   3. Logistic Regression  with all interaction (smf):
      * df_manual:
        - **dvs** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - **ivs_perc** (continous ratio) = Social category percentage per sector (0-100)
      * df_jobs:
        - **dvs** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - **ivs_perc** (continous ratio) = Social category percentage per sector (0-100)
   4. OLS Regression with all interaction:
      * df_jobs:
        - **dvs_prob** (continous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
        - **ivs_perc** (continous ratio) = Social category percentage per sector (0-100)
   5. Multilevel OLS Regression with all interaction:
      * df_jobs:
        - **dvs_prob** (continous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
        - **ivs_perc** (continous ratio) = Social category percentage per sector (0-100)

4. [Main Analysis](./4.%20specification_curve_analysis.ipynb)

   1. Logistic Specification Curve Analysis:
      * df_manual:
        - **dvs** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - **ivs_perc** (continous ratio) = Social category percentage per sector (0-100)
       * df_jobs:
        - **dvs** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - **ivs_perc** (continous ratio) = Social category percentage per sector (0-100)
   2. OLS Specification Curve Analysis:
      * df_jobs:
        - **dvs_prob** (continous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
        - **ivs_perc** (continous ratio) = Social category percentage per sector (0-100)


# Specification Curve Analysis

In [9]:
# Make dicts of models and IVs
sm_models = {
    'logistic': sm.Logit,
    'OLS': sm.OLS,
}

ivs_for_spec = {
    'dummy': ivs_dummy,
    'percentages': ivs_perc,
    'all': ivs_dummy_and_perc,
}


In [10]:
%%time
for (df_name, df), (model_name, model), (iv_type, ivs) in tqdm_product(dataframes.items(), sm_models.items(), ivs_for_spec.items()):

    if df_name == 'df_manual':
        dvs_ = dvs
    elif df_name == 'df_jobs':
        dvs_ = dvs_all

    print(f'{"="*5} {model_name.upper()} REGRESSION SPECIFICATION MODE RESULTS FOR {df_name} USING {iv_type.upper()} {"="*5}')
    print(f'Running specification curve analysis with:\nDEPENDENT VARIABLES = {dvs_}\nINDEPENDENT VARIABLES = {ivs}\nCONTROLS = {controls}')

    with contextlib.suppress(np.linalg.LinAlgError):
        sc = specy.SpecificationCurve(df=df, y_endog=dvs_, x_exog=ivs, controls=controls[:6])
        sc.fit(estimator=model)
        df_results = sc.df_r

        # Plot and save
        for iv, dv in tqdm_product(ivs, dvs_):
            print('~'*80)
            print(f'\n{"="*5} RESULTS FOR {iv.title()} ON {dv.title()} {"="*5}\n')
            print('~'*80)

            for image_save_format in tqdm.tqdm(['eps', 'png', 'svg']):
                if iv_type == 'dummy':
                    plot_title = f"{dv.title()} x {iv.split('_')[1].title()}-dominated Sectors"
                elif iv_type == 'percentages':
                    plot_title = f"{dv.title()} x {' '.join(iv.split('_')[-2:])}"
                save_path = f'{plot_save_path}{df_name} - Specification Curve - {iv} x {dv}.{image_save_format}'
                # Use following if not using forked specification_curve
                # sc.plot(preferred_spec=[iv, dv], save_path=save_path,)
                sc_fig = sc.plot(
                    preferred_spec=[iv, dv].extend(controls),
                    save_path=save_path,
                    show_plot=False,
                    return_fig=True,
                    plot_title=plot_title
                )

        # Get statsmodels results and save
        ## Get controls mask
        controls_mask = df_results['Specification'].apply(lambda x: all(control in x for control in controls))
        ## Get gender only results
        gender_mask = df_results['Specification'].apply(lambda x: any(item for item in ivs[:-len(ivs)//2] if item in x and len(x) == 2))
        df_results_gender = df_results[gender_mask]
        if df_results_gender[controls_mask].empty:
            print('No specification with Gender and all controls.')
        else:
            df_results_gender = df_results_gender[controls_mask]
        # Get age only results
        age_mask = df_results['Specification'].apply(lambda x: any(item for item in ivs[len(ivs)//2:] if item in x and len(x) == 2))
        df_results_age = df_results[age_mask]
        if df_results_age[controls_mask].empty:
            print('No specification with Age and all controls.')
        else:
            df_results_age = df_results_age[controls_mask]

        for df in [df_results_gender, df_results_age]:
            for idx, row in df.iterrows():
                for dv_iv in row['Specification']:
                    if dv_iv in ivs_dummy_and_perc:
                        iv_name = dv_iv
                    elif dv_iv in dvs_:
                        dv_name = dv_iv
                print('\n')
                print('+'*20)
                print(f'{dv_name} x {iv_name}\n')
                print('+'*20)
                print(f'{row["Results"].summary()}')
                print('-'*20)

                # Save results to file
                df_to_save = pd.DataFrame(csv.reader(row['Results'].summary().as_csv().split('\n'), delimiter=','))
                df_to_save.to_csv(f'{table_save_path}{model_name} specification curve {df_name} - {iv_type} - {dv_name} x {iv_name}.csv', index=False)

        # Top 10 significant highest coefficients
        df_coeff_p = df_results.loc[sc.df_r['coeff_pvals'] < 0.05].sort_values(by=['Coefficient'], ascending=False)
        print(f"Top 10 significant coefficients:\n{df_coeff_p[['x_exog', 'y_endog', 'coeff_pvals', 'Coefficient', 'conf_int', 'pvalues']].head(10)}")

        print(f'{"="*5} END OF RESULTS FOR {iv.title()} {"="*5}')
        print('~'*80, '\n')


  0%|          | 0/12 [00:00<?, ?it/s]

===== LOGISTIC REGRESSION SPECIFICATION MODE RESULTS FOR df_manual USING DUMMY =====
Running specification curve analysis with:
DEPENDENT VARIABLES = ['Warmth', 'Competence']
INDEPENDENT VARIABLES = ['Gender_Female', 'Gender_Mixed', 'Gender_Male', 'Age_Older', 'Age_Mixed', 'Age_Younger']
CONTROLS = ['% Sector per Workforce', 'Job Description num_words', 'English Requirement in Job Ad_Yes', 'Dutch Requirement in Job Ad_Yes', 'Platform_LinkedIn', 'Platform_Indeed', 'Platform_Glassdoor', 'English Requirement in Job Ad', 'Dutch Requirement in Job Ad', 'Platform']
Optimization terminated successfully.
         Current function value: 0.677163
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.645774
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.661686
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.673243
         Iterations 5
Optimization terminated su

In [None]:
# %%time
# # Logistic Specification Curve Analysis for 0:1 Warmth and Competence x percentage Gender and Age
# for df_name, df in dataframes.items():

#     print(f'{"="*5} RESULTS FOR {df_name} {"="*5}')
#     print(f'Running specification curve analysis with:\nDEPENDENT VARIABLES = {dvs}\nINDEPENDENT VARIABLES = {ivs_dummy}\nCONTROLS = {controls}')

#     sc = specy.SpecificationCurve(df=df, y_endog=dvs, x_exog=ivs_dummy, controls=controls)
#     sc.fit(estimator=sm.Logit)
#     df_results = sc.df_r

#     # Plot and save
#     for iv_dummy, dv in tqdm_product(ivs_dummy, dvs):
#         print(f'{"="*5} RESULTS FOR {iv_dummy.title()} ON {dv.title()} {"="*5}')

#         for image_save_format in tqdm.tqdm(['eps', 'png', 'svg']):
#             save_path = f'{plot_save_path}Specification Curve - {iv_dummy} x {dv}.{image_save_format}'
#             # Use following if not using forked specification_curve
#             # sc.plot(preferred_spec=[iv, dv], save_path=save_path,)
#             sc_fig  = sc.plot(
#                 preferred_spec=[iv_dummy, dv],
#                 save_path=save_path,
#                 show_plot=False,
#                 return_fig=True,
#                 plot_title=f"{dv.title()} x {iv_dummy.split('_')[1].title()}-dominated Sectors"
#             )
#         print(sc_fig)

#     # Get statsmodels results and save
#     ## Get gender only results
#     gender_mask = df_results['Specification'].apply(lambda x: any(item for item in ['Gender_Female_% per Sector', 'Gender_Male_% per Sector'] if item in x and len(x) == 2))
#     df_results_gender = df_results[gender_mask]
#     # Get age only results
#     age_mask = df_results['Specification'].apply(lambda x: any(item for item in ['Age_Older_% per Sector', 'Age_Younger_% per Sector'] if item in x and len(x) == 2))
#     df_results_age = df_results[age_mask]

#     for df in [df_results_gender, df_results_age]:
#         for idx, row in df.iterrows():
#             for dv_iv in row["Specification"]:
#                 if dv_iv in ivs_dummy:
#                     iv_name = dv_iv
#                 elif dv_iv in dvs:
#                     dv_name = dv_iv
#             print('\n')
#             print('+'*20)
#             print(f'{dv_name} x {iv_name}\n')
#             print('+'*20)
#             print(f'{row["Results"].summary()}')
#             print('-'*20)

#             # Save results to file
#             df_to_save = pd.DataFrame(csv.reader(row["Results"].summary().as_csv().split('\n'), delimiter=','))
#             df_to_save.to_csv(f'{table_save_path}logistic specification curve dummy - {dv_name} x {iv_name} {df_name}.csv', index=False)

#     # Top 10 significant highest coefficients
#     df_coeff_p = df_results.loc[sc.df_r['coeff_pvals'] < 0.05].sort_values(by=['Coefficient'], ascending=False)
#     print(f"Top 10 significant coefficients:\n{df_coeff_p[['x_exog', 'y_endog', 'coeff_pvals', 'Coefficient', 'conf_int', 'pvalues']].head(10)}")

#     print(f'{"="*5} END OF RESULTS FOR {iv_dummy.title()} {"="*5}')


In [None]:
# %%time
# # Logistic Specification Curve Analysis for 0:1 Warmth and Competence x percentage Gender and Age
# for df_name, df in dataframes.items():

#     print(f'{"="*5} RESULTS FOR {df_name} {"="*5}')
#     print(f'Running specification curve analysis with:\nDEPENDENT VARIABLES = {dvs}\nINDEPENDENT VARIABLES = {ivs_perc}\nCONTROLS = {controls}')

#     sc = specy.SpecificationCurve(df=df, y_endog=dvs, x_exog=ivs_perc, controls=controls)
#     sc.fit(estimator=sm.Logit)
#     df_results = sc.df_r

#     # Plot and save
#     for iv_perc, dv in tqdm_product(ivs_perc, dvs):
#         print(f'{"="*5} RESULTS FOR {iv_perc.title()} ON {dv.title()} {"="*5}')

#         for image_save_format in tqdm.tqdm(['eps', 'png', 'svg']):
#             save_path = f'{plot_save_path}Specification Curve - {iv_perc} x {dv}.{image_save_format}'
#             # Use following if not using forked specification_curve
#             # sc.plot(preferred_spec=[iv, dv], save_path=save_path,)
#             sc_fig  = sc.plot(
#                 preferred_spec=[iv_perc, dv],
#                 save_path=save_path,
#                 show_plot=False,
#                 return_fig=True,
#                 plot_title=f"{dv.title()} x {' '.join(ivs_perc[0].split('_')[-2:])}"
#             )
#         print(sc_fig)

#     # Get statsmodels results and save
#     ## Get gender only results
#     gender_mask = df_results['Specification'].apply(lambda x: any(item for item in ['Gender_Female_% per Sector', 'Gender_Male_% per Sector'] if item in x and len(x) == 2))
#     df_results_gender = df_results[gender_mask]
#     # Get age only results
#     age_mask = df_results['Specification'].apply(lambda x: any(item for item in ['Age_Older_% per Sector', 'Age_Younger_% per Sector'] if item in x and len(x) == 2))
#     df_results_age = df_results[age_mask]

#     for df in [df_results_gender, df_results_age]:
#         for idx, row in df.iterrows():
#             for dv_iv in row["Specification"]:
#                 if dv_iv in ivs_perc:
#                     iv_name = dv_iv
#                 elif dv_iv in dvs:
#                     dv_name = dv_iv
#             print('\n')
#             print('+'*20)
#             print(f'{dv_name} x {iv_name}\n')
#             print('+'*20)
#             print(f'{row["Results"].summary()}')
#             print('-'*20)

#             # Save results to file
#             df_to_save = pd.DataFrame(csv.reader(row["Results"].summary().as_csv().split('\n'), delimiter=','))
#             df_to_save.to_csv(f'{table_save_path}logistic specification curve percentages - {dv_name} x {iv_name} {df_name}.csv', index=False)

#     # Top 10 significant highest coefficients
#     df_coeff_p = df_results.loc[sc.df_r['coeff_pvals'] < 0.05].sort_values(by=['Coefficient'], ascending=False)
#     print(f"Top 10 significant coefficients:\n{df_coeff_p[['x_exog', 'y_endog', 'coeff_pvals', 'Coefficient', 'conf_int', 'pvalues']].head(10)}")

#     print(f'{"="*5} END OF RESULTS FOR {iv_perc.title()} {"="*5}')
