In [None]:
import os  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path  # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# Handle on this module/notebook namespace.
mod = sys.modules[__name__]

# Locate the project's code directory by walking up from the current working
# directory, then make it importable so that `setup_module` can be found.
code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'
max_levels_up = 5  # how many ancestors of the working directory to inspect

# Slice the ancestor list instead of indexing parents[0..4] directly, so a
# shallow working directory (fewer than 5 ancestors) does not raise IndexError.
for ancestor in list(Path.cwd().parents)[:max_levels_up]:

    # `.name` is the last path component on every OS (no manual '/' splitting).
    parent_path = ancestor.name

    if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
        code_dir = str(ancestor)
        break

if code_dir is None:
    # Do not push None onto sys.path; report the problem instead so a later
    # import failure of `setup_module` is easy to diagnose.
    print(
        f'WARNING: no directory containing "{code_dir_name}" found in the first '
        f'{max_levels_up} parents of {Path.cwd()}; sys.path left unchanged.',
        file=sys.stderr,
    )
elif code_dir not in sys.path:
    # Guard against duplicate entries when the cell is re-run.
    sys.path.append(code_dir)
# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module import researchpy_fork as rp # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module import specification_curve_fork as specy # type:ignore # isort:skip # fmt:skip # noqa # nopep8


# Functions

# READ DATA

In [None]:
# Expected number of rows, read from a text file stored in data_dir
with open(f'{data_dir}df_manual_len.txt', 'r') as f:
    df_manual_len = int(f.read())

# Load the pickled dataframe and make sure no rows went missing
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_for_training.pkl')
if len(df_manual) != df_manual_len:
    raise AssertionError(
        f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_manual_len} BUT IS OF LENGTH {len(df_manual)}'
    )
print(f'Dataframe loaded with shape: {df_manual.shape}')

# Helper comes from the project's star-import; presumably it adds or recodes
# the gender/age category columns -- TODO confirm against setup_module.
df_manual = categorize_df_gender_age(df_manual)


In [None]:
# Expected number of rows, read from a text file stored in data_dir
with open(f'{data_dir}df_jobs_for_analysis_len.txt', 'r') as f:
    df_jobs_len = int(f.read())

# Load the pickled dataframe and make sure no rows went missing
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
if len(df_jobs) != df_jobs_len:
    raise AssertionError(
        f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
    )
print(f'Dataframe loaded with shape: {df_jobs.shape}')

# Helper comes from the project's star-import; presumably it adds or recodes
# the gender/age category columns -- TODO confirm against setup_module.
df_jobs = categorize_df_gender_age(df_jobs)


# Analysis plan:

1. ## [Descriptives, visualizations, and tables](./1.%20descriptives_visualization_and_tables.ipynb)
2. ## [Frequencies and Normality tests](./2.%20frequencies_and_normality_test.ipynb)
   1. ### Frequencies, histograms, and QQ plots
      * Normal test
      * Kurtosis test
      * Shapiro
      * Anderson
      * Bartlett
   2. ### Correlation between independent variables (IVs) and control variables and Multicollinearity test
      * Pearson's R
      * VIF
     - ***ivs_dummy*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
     - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)
     - ***% Sector per Workforce*** (continuous ratio) = Sector percentage per workforce (0-100)
     - ***num_words*** (continuous ratio) = Number of words in job description
     - ***English Requirement in Job Ad*** (binary nominal) = English requirement in job description (0 vs. 1)
     - ***Dutch Requirement in Job Ad*** (binary nominal) = Dutch requirement in job description (0 vs. 1)
     - ***Platform*** (binary dummy) = LinkedIn (0 vs. 1), Indeed (0 vs. 1), Glassdoor (0 vs. 1)

3. ## [ANOVA and Chi-square (Pearson's R)](./3.%20chisqt_and_anova.ipynb)

   1. ### Chi-square
      * **df_manual:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
      * **df_jobs:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)

   2. ### One-way ANOVA, interactions, and post-hoc test
      * **df_manual:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
          - If Levene's test is *not significant*, use classic ANOVA and Tukey's post hoc test
          - If Levene's test is *significant*, use Welch's and Kruskal-Wallis ANOVA and Games Howell's post hoc test
      * **df_jobs:**
         - ***dvs_prob*** (continuous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
         - ***ivs*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
           - If Levene's test is *not significant*, use classic ANOVA and Tukey's post hoc test
           - If Levene's test is *significant*, use Welch's and Kruskal-Wallis ANOVA and Games Howell's post hoc test

4. ## [Regression Analysis](./3.%20regression_analysis.ipynb)
   1. ### Logistic Regression with all interactions (smf):
      * **df_manual:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)
      * **df_jobs:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)
   2. ### OLS Regression with all interactions:
      * **df_jobs:**
        - ***dvs_prob*** (continuous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)
   3. ### Multilevel OLS Regression with all interactions:
      * **df_jobs:**
        - ***dvs_prob*** (continuous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)

5. ## [Specification Curve Analysis](./4.%20specification_curve_analysis.ipynb)

   1. ### Logistic Specification Curve Analysis:
      * **df_manual:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)
      * **df_jobs:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)
   2. ### OLS Specification Curve Analysis:
      * **df_jobs:**
        - ***dvs_prob*** (continuous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)


## Set dataframes

#### Dataframes dict

In [None]:
# Datasets to analyse, keyed by the name used in printed headers and output file names
dataframes = dict(df_manual=df_manual, df_jobs=df_jobs)

# Specification Curve Analysis

#### Models, IVs, and controls dict

In [None]:
# Estimators to fit, keyed by the model name shown in the printed output
sm_models = dict(logistic=sm.Logit, OLS=sm.OLS)

# Candidate sets of independent variables
ivs_for_spec = dict(dummy=ivs_dummy, percentages=ivs_perc, all=ivs_dummy_and_perc)

# Nested control sets: each one is a longer leading slice of `controls`
controls_for_spec_dict = {
    controls_set_name: controls[:n_controls]
    for controls_set_name, n_controls in (
        ('perc_words', 2),
        ('perc_words_lang', 4),
        ('perc_words_lang_platform', 6),
    )
}

In [None]:
# Set the default font size to 8 on both rcParams handles for readability
for rc_params in (mpl.rcParams, plt.rcParams):
    rc_params['font.size'] = 8

In [None]:
%%time
# Run one specification curve analysis for every combination of dataframe,
# estimator, IV set and control set; then plot, print and save the results.
for (df_name, df), (model_name, model), (ivs_type, ivs_), (controls_name, controls_) in tqdm_product(dataframes.items(), sm_models.items(), ivs_for_spec.items(), controls_for_spec_dict.items()):

    # Set DVs and plot title prefix (depends on the dataframe)
    if df_name == 'df_manual':
        dvs_ = dvs
        dvs_type = 'binary'
        plot_title_prefix = 'Binary Coding of Warmth and Competence x '
    elif df_name == 'df_jobs':
        dvs_ = dvs_all
        dvs_type = 'binary and probability'
        plot_title_prefix = 'Binary Coding and Probability of Warmth and Competence x '

    # Set plot title suffix (depends on the IV set)
    if ivs_type == 'dummy':
        plot_title_suffix = 'Categorical Dominant Social Category of Sector'
    elif ivs_type == 'percentages':
        plot_title_suffix = 'Percentages of Social Category per Sector'
    elif ivs_type == 'all':
        plot_title_suffix = 'Categorical Dominant Social Category of Sector and Percentages of Social Category per Sector'

    # Extend the plot title suffix with a description of the control set
    if controls_name == 'perc_words':
        plot_title_suffix += ' + % Sector per Workforce + Job Description num_words'
    elif controls_name == 'perc_words_lang':
        plot_title_suffix += ' + % Sector per Workforce + Job Description num_words + Language Requirements'
    elif controls_name == 'perc_words_lang_platform':
        plot_title_suffix += ' + % Sector per Workforce + Job Description num_words + Language Requirements + Platform'

    # Run specification curve analysis
    print(f'{"="*5} {model_name.upper()} REGRESSION SPECIFICATION MODE RESULTS FOR {df_name} USING {dvs_type.upper()} x {ivs_type.upper()} + {controls_name.upper()} {"="*5}')
    # Report the control set actually passed to the analysis (`controls_`),
    # not the full global `controls` list.
    print(f'Running specification curve analysis with:\nDEPENDENT VARIABLES = {dvs_}\nINDEPENDENT VARIABLES = {ivs_}\nCONTROLS = {controls_}')

    try:
        sc = specy.SpecificationCurve(df=df, y_endog=dvs_, x_exog=ivs_, controls=controls_)
        sc.fit(estimator=model)
        df_results = sc.df_r

        # Plot and save
        plot_title = f'{plot_title_prefix}{plot_title_suffix}'
        print('~'*80)
        print(f'\n{"="*5} RESULTS FOR {plot_title} {"="*5}\n')
        print('~'*80)

        # One plot call per output format; only the file extension differs.
        for image_save_format in tqdm.tqdm(['eps', 'png', 'svg']):
            # Use following if not using forked specification_curve
            # sc.plot(preferred_spec=[iv, dv], save_path=save_path,)
            save_path = f'{plot_save_path}{df_name} - Specification Curve - {dvs_type} Warmth and Competence x {ivs_type} Gender and Age + {controls_name}.{image_save_format}'
            sc_fig = sc.plot(
                save_path=save_path,
                show_plot=False,
                return_fig=True,
                plot_title=plot_title
            )

        # Get statsmodels results and save
        ## Split the IV list in two halves.
        ## NOTE(review): assumes `ivs_` lists the gender IVs first and the age IVs second -- confirm.
        n_first_half = len(ivs_) // 2
        gender_ivs = ivs_[:n_first_half]
        age_ivs = ivs_[n_first_half:]

        ## Get controls mask (rows whose specification contains every control in use)
        controls_mask = df_results['Specification'].apply(lambda x: all(control in x for control in controls_))

        ## Get gender only results (two-element specifications containing a gender IV)
        gender_mask = df_results['Specification'].apply(lambda x: any(item for item in gender_ivs if item in x and len(x) == 2))
        df_results_gender = df_results[gender_mask]
        # Combine both masks on the full frame instead of indexing the filtered
        # frame with a full-length mask, which relies on implicit reindexing.
        df_results_gender_controls = df_results[gender_mask & controls_mask]
        if df_results_gender_controls.empty:
            print('No specification with Gender and all controls.')
        else:
            df_results_gender = df_results_gender_controls

        ## Get age only results (two-element specifications containing an age IV)
        age_mask = df_results['Specification'].apply(lambda x: any(item for item in age_ivs if item in x and len(x) == 2))
        df_results_age = df_results[age_mask]
        df_results_age_controls = df_results[age_mask & controls_mask]
        if df_results_age_controls.empty:
            print('No specification with Age and all controls.')
        else:
            df_results_age = df_results_age_controls

        # Use a dedicated name here so the outer loop variable `df` is not shadowed.
        for df_subset in [df_results_gender, df_results_age]:
            for _idx, row in df_subset.iterrows():
                # Reset per row so a label can never carry over from the previous row.
                iv_name = dv_name = None
                for dv_iv in row['Specification']:
                    if dv_iv in ivs_dummy_and_perc:
                        iv_name = dv_iv
                    elif dv_iv in dvs_:
                        dv_name = dv_iv
                print('\n')
                print('+'*20)
                print(f'{dv_name} x {iv_name}\n')
                print('+'*20)
                print(f'{row["Results"].summary()}')
                print('-'*20)

                # Save results to file
                # NOTE(review): the file name does not include dv_name/iv_name, so every
                # row overwrites the same CSV and only the last summary is kept on disk.
                # Left unchanged because other code may rely on this file name -- confirm.
                df_to_save = pd.DataFrame(csv.reader(row['Results'].summary().as_csv().split('\n'), delimiter=','))
                df_to_save.to_csv(f'{table_save_path}{model_name} specification curve {df_name} - {dvs_type} Warmth and Competence x {ivs_type} Gender and Age + {controls_name}.csv', index=False)

        # Top 10 significant highest coefficients
        df_coeff_p = df_results.loc[df_results['coeff_pvals'] < 0.05].sort_values(by=['Coefficient'], ascending=False)
        print(f"Top 10 significant coefficients:\n{df_coeff_p[['x_exog', 'y_endog', 'coeff_pvals', 'Coefficient', 'conf_int', 'pvalues']].head(10)}")

    except np.linalg.LinAlgError:
        # Report the failing combination and continue with the next one.
        print(f'Singular matrix when using {model_name} with {controls_name} + {dvs_type} x {ivs_type}')

    print(f'{"="*5} END OF RESULTS FOR {dvs_type.upper()} x {ivs_type.upper()} + {controls_name.upper()} {"="*5}')
    print('~'*80, '\n')
