### START HERE IF SOURCING FROM df_manual_FOR_training
### PLEASE SET CORRECT DIRECTORY PATHS BELOW


# Descriptives and visualization


In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Locate the project's code directory: either the current working directory itself,
# or one of its (at most five) nearest ancestors whose name contains `code_dir_name`
# but not `unwanted_subdir_name`.
current_dir = Path.cwd()

if code_dir_name in current_dir.name:
    code_dir = str(current_dir)
else:
    # Slicing the ancestor list (instead of indexing parents[0..4]) avoids an IndexError
    # when the working directory has fewer than five ancestors.
    for parent_dir in list(current_dir.parents)[:5]:
        if code_dir_name in parent_dir.name and unwanted_subdir_name not in parent_dir.name:
            code_dir = str(parent_dir)
            break

# Only register a directory that was actually found; appending None would corrupt sys.path.
if code_dir is not None:
    sys.path.append(code_dir)

# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module import researchpy_fork as rp # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module.statannotations_fork.Annotator import Annotator # type:ignore # isort:skip # fmt:skip # noqa # nopep8


In [None]:
# Load the pickled sector table; if the pickle file is missing, run the CBS notebook instead.
try:
    df_sectors_all = pd.read_pickle(f'{table_save_path}Sectors Output from script.pkl')
except FileNotFoundError:
    # NOTE(review): str.join over a string inserts a backslash between every character of the
    # path - presumably a way of escaping the path for the %run magic below; confirm intended.
    cbs_notebook = '\\'.join(f'{scraped_data}CBS/CBS.ipynb')
    %run $cbs_notebook import df_sectors_all # type:ignore # isort:skip # fmt:skip # noqa # nopep8


# Functions

In [None]:
def show_and_close_plots():
    """Call ``plt.show()`` and then ``plt.clf()``, ``plt.cla()`` and ``plt.close()`` in that order."""
    plt.show()
    # Tear-down steps, executed in the listed order.
    for teardown in (plt.clf, plt.cla, plt.close):
        teardown()


In [None]:
def close_plots():
    """Call ``plt.clf()``, ``plt.cla()`` and ``plt.close()`` in that order, without showing anything."""
    for teardown in (plt.clf, plt.cla, plt.close):
        teardown()


# Analysis plan:

1. ## [Descriptives and tables](./1.%20descriptives_and_tables.ipynb)
2. ## [Visualization](./2.%20visualization.ipynb)
3. ## [Frequencies and Normality tests](./2.%20frequencies_and_normality_test.ipynb)
   1. ### Frequencies, histograms, and QQ plots
      * Normal test
      * Kurtosis test
      * Shapiro
      * Anderson
      * Bartlett
   2. ### Correlation between independent variables (IVs) and control variables and Multicollinearity test
      * Pearson's R
      * VIF
     - ***ivs_dummy*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
     - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)
     - ***% Sector per Workforce*** (continuous ratio) = Sector percentage per workforce (0-100)
     - ***num_words*** (continuous ratio) = Number of words in job description
     - ***English Requirement in Job Ad*** (binary nominal) = English requirement in job description (0 vs. 1)
     - ***Dutch Requirement in Job Ad*** (binary nominal) = Dutch requirement in job description (0 vs. 1)
     - ***Platform*** (binary dummy) = LinkedIn (0 vs. 1), Indeed (0 vs. 1), Glassdoor (0 vs. 1)

4. ## [ANOVA and Chi-square (Pearson's R)](./3.%20chisqt_and_anova.ipynb)

   1. ### Chi-square
      * **df_manual:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
      * **df_jobs:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)

   2. ### One-way ANOVA, interactions, and post-hoc test
      * **df_manual:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
          - If Levene's test is *not significant*, use classic ANOVA and Tukey's post hoc test
          - If Levene's test is *significant*, use Welch's and Kruskal-Wallis ANOVA and Games Howell's post hoc test
      * **df_jobs:**
         - ***dvs_prob*** (continuous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
         - ***ivs*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
           - If Levene's test is *not significant*, use classic ANOVA and Tukey's post hoc test
           - If Levene's test is *significant*, use Welch's and Kruskal-Wallis ANOVA and Games Howell's post hoc test

5. ## [Regression Analysis](./3.%20regression_analysis.ipynb)
   1. ### Logistic Regression  with all interaction (smf):
      * **df_manual:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)
      * **df_jobs:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)
   2. ### OLS Regression with all interaction:
      * **df_jobs:**
        - ***dvs_prob*** (continuous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)
   3. ### Multilevel OLS Regression with all interaction:
      * **df_jobs:**
        - ***dvs_prob*** (continuous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)

6. ## [Specification Curve Analysis](./4.%20specification_curve_analysis.ipynb)

   1. ### Logistic Specification Curve Analysis:
      * **df_manual:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)
      * **df_jobs:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)
   2. ### OLS Specification Curve Analysis:
      * **df_jobs:**
        - ***dvs_prob*** (continuous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
        - ***ivs_perc*** (continuous ratio) = Social category percentage per sector (0-100)


# Descriptives

### READ DATA

In [None]:
# Expected number of rows, read from a plain-text file in data_dir.
with open(f'{data_dir}df_manual_len.txt') as f:
    df_manual_len = int(f.read())

# Load the pickled dataframe and check that its length matches the recorded one.
df_manual = pd.read_pickle(f'{df_save_dir}df_manual_for_analysis.pkl')
assert df_manual_len == len(df_manual), (
    f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_manual_len} BUT IS OF LENGTH {len(df_manual)}'
)
print(f'Dataframe loaded with shape: {df_manual.shape}')
df_manual = categorize_df_gender_age(df_manual)


In [None]:
# Expected number of rows, read from a plain-text file in data_dir.
with open(f'{data_dir}df_jobs_for_analysis_len.txt', 'r') as f:
    df_jobs_len = int(f.read())

# Load the pickled dataframe and check that its length matches the recorded one.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
# The failure message reports the length of df_jobs itself (the dataframe being checked).
assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
print(f'Dataframe loaded with shape: {df_jobs.shape}')
df_jobs = categorize_df_gender_age(df_jobs)


In [None]:
# Dataframes the cells below operate on, keyed by name.
# To include the manual set as well, add the entry 'df_manual': df_manual.
dataframes = {'df_jobs': df_jobs}


### All info

In [None]:
# All info
analysis_columns = ['Warmth', 'Competence']

# For every registered dataframe: print a banner, re-run the gender/age categorisation
# and show the pandas info() summary.
for df_name, df in dataframes.items():
    print('+' * 20, df_name.upper(), '+' * 20 + '\n')
    df = categorize_df_gender_age(df)
    df.info()


In [None]:
# Integer positions of the df_jobs columns in which no cell is a Python list.
non_list_columns = df_jobs.columns.get_indexer(
    [
        column
        for column in df_jobs.columns
        if not df_jobs[column].apply(lambda cell: isinstance(cell, list)).any()
    ]
)


In [None]:
# Summarise only the list-free columns of df_jobs.
dfSummary(
    df_jobs.iloc[:, non_list_columns],
    is_collapsible=True,
)


In [None]:
for df_name, df in dataframes.items():
    # Skim the dataframe of the current iteration (not always df_jobs), restricted to
    # the columns of that dataframe in which no cell is a Python list.
    skim(
        df.iloc[
            :,
            df.columns.get_indexer(
                [c for c in df.columns if not df[c].apply(lambda x: isinstance(x, list)).any()]
            ),
        ]
    )


## Sentence Level

### All Gender and Age info at Sentence Level

In [None]:
# Gender and Age info by sentence
def run_descriptives_ivs_all_sent(df_name, df, ivs_all=ivs_all):
    """Print a banner for ``df_name`` and hand ``df`` with ``ivs_all`` to ``get_df_info``.

    ``ivs_all`` defaults to the module-level ``ivs_all`` as it was when this function was defined.
    """
    rule = '-' * 30
    print('+' * 20, df_name.upper(), '+' * 20 + '\n')
    print('=' * 30, 'Gender and Age info at Sentence Level', rule, sep='\n')
    get_df_info(df, ivs_all=ivs_all)
    print(rule)
    print('-'*30)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_descriptives_ivs_all_sent(*list(dataframes.items())[0])
else:
    # Several dataframes: let `interact` choose which one to describe.
    @interact(df_name=dataframes.keys())
    def run_descriptives_ivs_all_sent_interact(df_name):
        run_descriptives_ivs_all_sent(df_name, dataframes[df_name])


### % Gender and Age info at Sentence Level

In [None]:
def run_descriptives_iv_percs_sent(df_name, df):
    """For each column named in ``ivs_perc``, print its min/max with the matching sector, mean and std.

    The sector reported is the 'Sector' value of the first row holding the minimum (or maximum).
    """
    print('+' * 20, df_name.upper(), '+' * 20 + '\n')

    for iv_perc in ivs_perc:
        values = df[iv_perc]
        lowest = values.min()
        highest = values.max()
        min_sector = df['Sector'].loc[values == lowest].values[0]
        max_sector = df['Sector'].loc[values == highest].values[0]
        mean = values.mean().round(2).astype(float)
        std = values.std().round(2).astype(float)
        print(
            f'{iv_perc}:\n'
            f'Min Sector: {lowest:.1f}% in {min_sector}\n'
            f'Max Sector: {highest:.1f}% in {max_sector}\n'
            f'Mean: {mean}\n'
            f'Standard Deviation: {std}\n'
        )
        print('-' * 20)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_descriptives_iv_percs_sent(*list(dataframes.items())[0])
else:
    # Several dataframes: let `interact` choose which one to describe.
    @interact(df_name=dataframes.keys())
    def run_descriptives_iv_percs_sent_interact(df_name):
        run_descriptives_iv_percs_sent(df_name, dataframes[df_name])


### All Warmth and Competence info at Sentence Level

In [None]:
# Warmth and Competence percentages info by sentence
def run_descriptives_dvs_sent(df_name, df):
    """Print a banner for ``df_name`` and hand ``df`` with the module-level ``dvs_all`` to ``get_df_info``."""
    rule = '-' * 30
    print('+' * 20, df_name.upper(), '+' * 20 + '\n')
    print('=' * 30, 'Warmth and Competence info at Sentence Level', rule, sep='\n')
    get_df_info(df, ivs_all=dvs_all)
    print(rule)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_descriptives_dvs_sent(*list(dataframes.items())[0])
else:
    # Several dataframes: let `interact` choose which one to describe.
    @interact(df_name=dataframes.keys())
    def run_descriptives_dvs_sent_interact(df_name):
        run_descriptives_dvs_sent(df_name, dataframes[df_name])


## Job Ad Level

### All Gender and Age info at Job Ad Level

In [None]:
# Gender and Age info by job ad
def run_descriptives_ivs_all_job(df_name, df, ivs_all=ivs_all):
    """Print a banner, then hand the first row per 'Job ID' of ``df`` with ``ivs_all`` to ``get_df_info``.

    ``ivs_all`` defaults to the module-level ``ivs_all`` as it was when this function was defined.
    """
    rule = '-' * 30
    print('+' * 20, df_name.upper(), '+' * 20 + '\n')
    print('=' * 30, 'Gender and Age info at Job Advertisement Level', rule, sep='\n')
    # One row per job ad: the first row of every 'Job ID' group.
    first_row_per_job = df.groupby(['Job ID']).first()
    get_df_info(first_row_per_job, ivs_all=ivs_all)
    print(rule)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_descriptives_ivs_all_job(*list(dataframes.items())[0])
else:
    # Several dataframes: let `interact` choose which one to describe.
    @interact(df_name=dataframes.keys())
    def run_descriptives_ivs_all_job_interact(df_name):
        run_descriptives_ivs_all_job(df_name, dataframes[df_name])


### % Gender and Age info at Job Ad Level

In [None]:
def run_descriptives_iv_percs_job(df_name, df):
    """Like the sentence-level variant, but computed on the first row of every 'Job ID' group.

    For each column named in ``ivs_perc`` this prints the min/max with the matching sector,
    the mean and the standard deviation.
    """
    print('+' * 20, df_name.upper(), '+' * 20 + '\n')
    df = df.groupby(['Job ID']).first()

    for iv_perc in ivs_perc:
        values = df[iv_perc]
        lowest = values.min()
        highest = values.max()
        min_sector = df['Sector'].loc[values == lowest].values[0]
        max_sector = df['Sector'].loc[values == highest].values[0]
        mean = values.mean().round(2).astype(float)
        std = values.std().round(2).astype(float)
        print(
            f'{iv_perc}:\n'
            f'Min Sector: {lowest:.1f}% in {min_sector}\n'
            f'Max Sector: {highest:.1f}% in {max_sector}\n'
            f'Mean: {mean}\n'
            f'Standard Deviation: {std}\n'
        )
        print('-' * 20)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_descriptives_iv_percs_job(*list(dataframes.items())[0])
else:
    # Several dataframes: let `interact` choose which one to describe.
    @interact(df_name=dataframes.keys())
    def run_descriptives_iv_percs_job_interact(df_name):
        run_descriptives_iv_percs_job(df_name, dataframes[df_name])


### All Warmth and Competence info at Job Ad Level

In [None]:
# Warmth and Competence info by job ad
def run_descriptives_dvs_job(df_name, df):
    """Print a banner, then hand the first row per 'Job ID' of ``df`` with ``dvs_all`` to ``get_df_info``."""
    rule = '-' * 30
    print('+' * 20, df_name.upper(), '+' * 20 + '\n')
    print('=' * 30, 'Warmth and Competence info at Job Advertisement Level', rule, sep='\n')
    # One row per job ad: the first row of every 'Job ID' group.
    first_row_per_job = df.groupby(['Job ID']).first()
    get_df_info(first_row_per_job, ivs_all=dvs_all)
    print(rule)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_descriptives_dvs_job(*list(dataframes.items())[0])
else:
    # Several dataframes: let `interact` choose which one to describe.
    @interact(df_name=dataframes.keys())
    def run_descriptives_dvs_job_interact(df_name):
        run_descriptives_dvs_job(df_name, dataframes[df_name])


### All Job Ad string info at Job Ad Level

In [None]:
# Get longest and shortest sentence
def run_job_desc_lengths(df_name, df, text_col=None, num_words_col=None):
    """Print the mean, maximum and minimum of the word-count column of ``df``.

    Args:
        df_name: Label printed (upper-cased) in the banner.
        df: Dataframe holding the word-count column.
        text_col: Accepted for backward compatibility only. None of the printed values
            depends on the text column, so it is no longer read.
        num_words_col: Name of the word-count column; defaults to
            'Job Description_num_words'.

    Raises:
        KeyError: If ``num_words_col`` is not a column of ``df``.
    """
    if num_words_col is None:
        num_words_col = 'Job Description_num_words'
    print(f'{"+"*20} {df_name.upper()} {"+"*20}\n')

    print('='*30)
    print('Job Description Length at Sentence Level')
    print('-'*30)

    # Only the word counts are reported. The former character-length statistics were
    # computed (with six full passes over the text column) but never printed or returned.
    num_words = df[num_words_col]
    len_average = num_words.mean()
    len_longest = num_words.max()
    len_shortest = num_words.min()

    print(f'Average Sentence length: {len_average}')
    print('-'*30)
    print(f'Longest Sentence length: {len_longest}')
    print('-'*30)
    print(f'Shortest Sentence length: {len_shortest}')
    print('-'*30)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_job_desc_lengths(*list(dataframes.items())[0])
else:
    # Several dataframes: let `interact` choose which one to describe.
    @interact(df_name=dataframes.keys())
    def run_job_desc_lengths_interact(df_name):
        run_job_desc_lengths(df_name, dataframes[df_name])


# Controls

## Sentence Level

### Controls all info at Sentence Level

In [None]:
# Control variables info by sentence
def run_descriptives_controls_sent(df_name, df, controls_=None):
    """Print a banner and hand ``df`` with the control variables to ``get_df_info``.

    ``controls_`` falls back to the module-level ``controls`` when it is None.
    """
    controls_ = controls if controls_ is None else controls_

    rule = '-' * 30
    print('+' * 20, df_name.upper(), '+' * 20 + '\n')
    print('=' * 30, f'Control varibales info at Sentence Level: {controls_}', rule, sep='\n')
    get_df_info(df, ivs_all=controls_)
    print(rule)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_descriptives_controls_sent(*list(dataframes.items())[0])
else:
    # Several dataframes: let `interact` choose which one to describe.
    @interact(df_name=dataframes.keys())
    def run_descriptives_controls_sent_interact(df_name):
        run_descriptives_controls_sent(df_name, dataframes[df_name])


### All info % Sector per Workforce at Sentence Level

In [None]:
def run_descriptives_sectors_all_job(df_name, df, controls_=None):
    """Print a banner and hand ``df`` with the '% Sector per Workforce' column to ``get_df_info``.

    ``controls_`` falls back to the module-level ``controls`` when it is None; the resolved
    value is not used further in this function.
    """
    controls_ = controls if controls_ is None else controls_

    rule = '-' * 30
    print('+' * 20, df_name.upper(), '+' * 20 + '\n')
    print('=' * 30, 'Sector info at Sentence Level', rule, sep='\n')
    get_df_info(df, ivs_all=['% Sector per Workforce'])
    print(rule)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_descriptives_sectors_all_job(*list(dataframes.items())[0])
else:
    # Several dataframes: let `interact` choose which one to describe.
    @interact(df_name=dataframes.keys())
    def run_descriptives_sectors_all_job_interact(df_name):
        run_descriptives_sectors_all_job(df_name, dataframes[df_name])


### % Sector per Workforce at Sentence Level

In [None]:
def run_descriptives_sectors_job(df_name, df, controls_=None):
    if controls_ is None:
        controls_ = controls

    print(f'{"+"*20} {df_name.upper()} {"+"*20}\n')

    min_sector = df['Sector'].loc[df['% Sector per Workforce'] == df['% Sector per Workforce'].min()].values[0]
    max_sector = df['Sector'].loc[df['% Sector per Workforce'] == df['% Sector per Workforce'].max()].values[0]
    mean = df['% Sector per Workforce'].mean().round(2).astype(float)
    std = df['% Sector per Workforce'].std().round(2).astype(float)
    print(f'"% Sector per Workforce":\nMin Sector: {df["% Sector per Workforce"].min():.1f}% in {min_sector}\nMax Sector: {df["% Sector per Workforce"].max():.1f}% in {max_sector}\nMean: {mean}\nStandard Deviation: {std}\n')
    print('-'*20)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_descriptives_sectors_job(*list(dataframes.items())[0])
else:
    # Several dataframes: let `interact` choose which one to describe.
    @interact(df_name=dataframes.keys())
    def run_descriptives_sectors_job_interact(df_name):
        run_descriptives_sectors_job(df_name, dataframes[df_name])


### IVs and Controls Correlation Matrix

In [None]:
def run_corr_ivs_controls_sent(df_name, df, ivs_=None, controls_=None):
    """Print the pairs among the first two controls and the IVs whose correlation exceeds 0.5.

    ``ivs_`` falls back to the module-level ``ivs_dummy_perc_and_perc_interactions`` and
    ``controls_`` to the module-level ``controls`` when they are None. Only coefficients
    above 0.5 and different from 1 are kept, sorted in descending order, with repeated
    coefficient values dropped.
    """
    ivs_ = ivs_dummy_perc_and_perc_interactions if ivs_ is None else ivs_
    controls_ = controls if controls_ is None else controls_

    print('+' * 20, df_name.upper(), '+' * 20 + '\n')

    considered_features = controls_[:2] + ivs_[:]
    corr_df = df[considered_features].corr()

    separator = '-' * 20
    # The full correlation matrix is not printed, only the strong pairs below.
    print(separator, separator, 'Highly correlated variables:\n', separator, sep='\n')
    is_strong = (corr_df > 0.5) & (corr_df != 1)
    strong_pairs = corr_df[is_strong].stack().sort_values(ascending=False).drop_duplicates()
    print(strong_pairs)
    print(separator)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_corr_ivs_controls_sent(*list(dataframes.items())[0], ivs_=ivs_dummy_perc_and_perc_interactions)
else:
    # Several dataframes: let `interact` choose which one to analyse.
    @interact(df_name=dataframes.keys())
    def run_corr_ivs_controls_sent_interact(df_name):
        run_corr_ivs_controls_sent(df_name, dataframes[df_name], ivs_=ivs_dummy_perc_and_perc_interactions)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_corr_ivs_controls_sent(*list(dataframes.items())[0], ivs_=ivs_dummy_and_perc)
else:
    # Several dataframes: let `interact` choose which one to analyse.
    @interact(df_name=dataframes.keys())
    def run_corr_ivs_controls_sent_interact(df_name):
        run_corr_ivs_controls_sent(df_name, dataframes[df_name], ivs_=ivs_dummy_and_perc)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_corr_ivs_controls_sent(*list(dataframes.items())[0], ivs_=ivs_perc_and_perc_interactions)
else:
    # Several dataframes: let `interact` choose which one to analyse.
    @interact(df_name=dataframes.keys())
    def run_corr_ivs_controls_sent_interact(df_name):
        run_corr_ivs_controls_sent(df_name, dataframes[df_name], ivs_=ivs_perc_and_perc_interactions)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_corr_ivs_controls_sent(*list(dataframes.items())[0], ivs_=ivs_perc_interactions)
else:
    # Several dataframes: let `interact` choose which one to analyse.
    @interact(df_name=dataframes.keys())
    def run_corr_ivs_controls_sent_interact(df_name):
        run_corr_ivs_controls_sent(df_name, dataframes[df_name], ivs_=ivs_perc_interactions)


## Job Ad Level

### All Controls info at Job Ad Level

In [None]:
# Control variables info by job ad
def run_descriptives_controls_job(df_name, df, controls_=None):
    """Print a banner, then hand the first row per 'Job ID' of ``df`` with the controls to ``get_df_info``.

    ``controls_`` falls back to the module-level ``controls`` when it is None.
    """
    controls_ = controls if controls_ is None else controls_

    rule = '-' * 30
    print('+' * 20, df_name.upper(), '+' * 20 + '\n')
    print('=' * 30, 'Control varibales info at Job Advertisement Level', rule, sep='\n')
    # One row per job ad: the first row of every 'Job ID' group.
    first_row_per_job = df.groupby(['Job ID']).first()
    get_df_info(first_row_per_job, ivs_all=controls_)
    print(rule)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_descriptives_controls_job(*list(dataframes.items())[0])
else:
    # Several dataframes: let `interact` choose which one to describe.
    @interact(df_name=dataframes.keys())
    def run_descriptives_controls_job_interact(df_name):
        run_descriptives_controls_job(df_name, dataframes[df_name])


In [None]:
def run_corr_ivs_controls_job(df_name, df, ivs_=None, controls_=None):
    """Job-ad-level variant: correlations are computed on the first row of every 'Job ID' group.

    ``ivs_`` falls back to the module-level ``ivs_dummy_perc_and_perc_interactions`` and
    ``controls_`` to the module-level ``controls`` when they are None. Only coefficients
    above 0.5 and different from 1 are kept, sorted in descending order, with repeated
    coefficient values dropped.
    """
    ivs_ = ivs_dummy_perc_and_perc_interactions if ivs_ is None else ivs_
    controls_ = controls if controls_ is None else controls_

    print('+' * 20, df_name.upper(), '+' * 20 + '\n')

    first_row_per_job = df.groupby(['Job ID']).first()

    considered_features = controls_[:2] + ivs_[:]
    corr_df = first_row_per_job[considered_features].corr()

    separator = '-' * 20
    # The full correlation matrix is not printed, only the strong pairs below.
    print(separator, separator, 'Highly correlated variables:\n', separator, sep='\n')
    is_strong = (corr_df > 0.5) & (corr_df != 1)
    strong_pairs = corr_df[is_strong].stack().sort_values(ascending=False).drop_duplicates()
    print(strong_pairs)
    print(separator)


In [None]:
%%time
# Several dataframes: let `interact` choose which one to analyse. Otherwise run on the only entry.
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_corr_ivs_controls_job_interact(df_name):
        run_corr_ivs_controls_job(df_name, dataframes[df_name], ivs_=ivs_dummy_perc_and_perc_interactions)
else:
    # Take the sole entry from `dataframes` explicitly instead of relying on a `df_name`
    # variable that happens to be left over from an earlier loop.
    run_corr_ivs_controls_job(list(dataframes.keys())[0], list(dataframes.values())[0], ivs_=ivs_dummy_perc_and_perc_interactions)


In [None]:
%%time
# Several dataframes: let `interact` choose which one to analyse. Otherwise run on the only entry.
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_corr_ivs_controls_job_interact(df_name):
        run_corr_ivs_controls_job(df_name, dataframes[df_name], ivs_=ivs_dummy_and_perc)
else:
    # Take the sole entry from `dataframes` explicitly instead of relying on a `df_name`
    # variable that happens to be left over from an earlier loop.
    run_corr_ivs_controls_job(list(dataframes.keys())[0], list(dataframes.values())[0], ivs_=ivs_dummy_and_perc)


In [None]:
%%time
# Several dataframes: let `interact` choose which one to analyse. Otherwise run on the only entry.
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_corr_ivs_controls_job_interact(df_name):
        run_corr_ivs_controls_job(df_name, dataframes[df_name], ivs_=ivs_perc_and_perc_interactions)
else:
    # Take the sole entry from `dataframes` explicitly instead of relying on a `df_name`
    # variable that happens to be left over from an earlier loop.
    run_corr_ivs_controls_job(list(dataframes.keys())[0], list(dataframes.values())[0], ivs_=ivs_perc_and_perc_interactions)


In [None]:
%%time
# Several dataframes: let `interact` choose which one to analyse. Otherwise run on the only entry.
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_corr_ivs_controls_job_interact(df_name):
        run_corr_ivs_controls_job(df_name, dataframes[df_name], ivs_=ivs_perc_interactions)
else:
    # Take the sole entry from `dataframes` explicitly instead of relying on a `df_name`
    # variable that happens to be left over from an earlier loop.
    run_corr_ivs_controls_job(list(dataframes.keys())[0], list(dataframes.values())[0], ivs_=ivs_perc_interactions)


## Imbalance Ratios

In [None]:
# Imbalance Ratio
# Ratios collected across all calls, keyed by '<df_name> Warmth' / '<df_name> Competence'.
all_imbalance_ratio_dict = {}
def run_imbalance_ratio(df_name, df):
    """Print the ratio of 1-rows to 0-rows for 'Warmth' and 'Competence', and save all ratios as JSON.

    The two ratios are stored in the module-level ``all_imbalance_ratio_dict``; the whole
    dictionary is then written to ``{data_dir}{df_name}_all_imbalance_ratio_dict.json``.
    """
    def ones_per_zero(label):
        # Count of rows equal to 1 divided by the count of rows equal to 0.
        column = df[label]
        return column.loc[column == 1].count() / column.loc[column == 0].count()

    print('+' * 20, df_name.upper(), '+' * 20 + '\n')

    warmth_imbalance_ratio = ones_per_zero('Warmth')
    competence_imbalance_ratio = ones_per_zero('Competence')

    all_imbalance_ratio_dict.update(
        {
            f'{df_name} Warmth': warmth_imbalance_ratio,
            f'{df_name} Competence': competence_imbalance_ratio,
        }
    )

    frame = '=' * 20
    print(frame, 'Imabalance Ratios', '-' * 10, sep='\n')
    print(f'Warmth IR: {warmth_imbalance_ratio:.2f}')
    print(f'Competence IR: {competence_imbalance_ratio:.2f}')
    print(frame)

    with open(f'{data_dir}{df_name}_all_imbalance_ratio_dict.json', 'w') as f:
        json.dump(all_imbalance_ratio_dict, f)


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: run directly on its (name, dataframe) pair.
    run_imbalance_ratio(*list(dataframes.items())[0])
else:
    # Several dataframes: let `interact` choose which one to analyse.
    @interact(df_name=dataframes.keys())
    def run_imbalance_ratio_interact(df_name):
        run_imbalance_ratio(df_name, dataframes[df_name])


# Tables

In [None]:
def save_desc_excel(
    df_desc,
    index_var,
    title_prefix,
    file_save_path,
    sheet_name=None,
    startrow=None,
    startcol=None,
):
    """Write a descriptives table to ``{file_save_path}.xlsx`` and format its cells.

    The index of ``df_desc`` is moved into a regular column, the frame is written with
    ``DataFrame.to_excel`` and every header and body cell is then rewritten with an
    explicit cell format (Times New Roman, size 12).

    Args:
        df_desc: Table with MultiIndex columns (``df_desc.columns.levels`` is read).
        index_var: Header label of the index column; that header cell is merged
            vertically down to the last header row.
        title_prefix: Used to build the top-level header '<title_prefix> Job Advertisements'
            for the column created from the index.
        file_save_path: Output path without the '.xlsx' extension.
        sheet_name: Worksheet name; defaults to 'All'.
        startrow: Zero-based row at which the table starts; defaults to 1.
        startcol: Zero-based column at which the table starts; defaults to 1.

    NOTE(review): the worksheet calls (``set_row``/``set_column`` with an options dict,
    ``merge_range``) look like the XlsxWriter API - confirm that engine is the one in use.
    """
    if sheet_name is None:
        sheet_name = 'All'
    if startrow is None:
        startrow = 1
    if startcol is None:
        startcol = 1

    df_desc = df_desc.reset_index(drop=False, col_level=1, col_fill=f'{title_prefix} Job Advertisements')

    # Number of header rows, and the sheet row occupied by the last of them.
    header_range = len(df_desc.columns.levels)
    last_header_row = startrow + header_range - 1

    # The context manager closes (and thereby saves) the workbook even if formatting fails.
    with pd.ExcelWriter(f'{file_save_path}.xlsx') as writer:
        df_desc.to_excel(writer, sheet_name=sheet_name, merge_cells=True, startrow=startrow, startcol=startcol)
        workbook  = writer.book
        worksheet = writer.sheets[sheet_name]
        worksheet.set_row(startrow + header_range, None, None, {'hidden': True}) # hide the empty row that appears after the headers
        # Hide the index column written by to_excel; it sits at `startcol`
        # (the defaults startrow == startcol == 1 previously masked the use of `startrow` here).
        worksheet.set_column(startcol, startcol, None, None, {'hidden': True})

        # MAIN BODY
        # Format column headers
        for i, (col_num, col_value) in tqdm_product(range(header_range), (enumerate(df_desc.columns.values))):
            row_to_write = startrow + i
            col_to_write = startcol + 1 + col_num # 1 is for index
            header_formats = {'bold': False, 'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'align': 'center', 'top': True, 'bottom': True, 'left': False, 'right': False}

            if col_value[i] in ['n', 'M', 'SD']:
                header_formats |= {'italic': True}

            if col_value[i] == '95% Conf.':
                worksheet.set_column(col_to_write, col_to_write, 8.5)

            if col_value[i] == index_var:
                worksheet.set_column(col_to_write, col_to_write, 10)
                header_formats['align'] = 'left'
                header_formats |= {'text_wrap': True}
                # Merge the index header down to the last header row; equals the previous
                # hard-coded `header_range` for the default startrow of 1.
                worksheet.merge_range(row_to_write, col_to_write, last_header_row, col_to_write, index_var, workbook.add_format(header_formats))
            else:
                worksheet.write(row_to_write, col_to_write, col_value[i], workbook.add_format(header_formats))

        # Format body columns
        num = [col_num for col_num, value in enumerate(df_desc.columns.values) if value[-1] == 'n']
        perc = [col_num for col_num, value in enumerate(df_desc.columns.values) if value[-1] == '%']
        body_max_row_idx, body_max_col_idx = df_desc.shape

        for c, r in tqdm_product(range(body_max_col_idx), range(body_max_row_idx)):
            row_to_write = startrow + header_range + 1 + r # 1 is for the hidden empty row under the header
            col_to_write = startcol + 1 + c # 1 is for index
            body_formats = {'num_format': '0.00', 'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'align': 'center', 'text_wrap': True, 'left': False, 'right': False}

            # Bottom border on the last body row.
            if r == body_max_row_idx-1:
                body_formats |= {'bottom': True}

            # First column is left-aligned.
            if c == 0:
                body_formats |= {'align': 'left'}

            # 'n' columns are shown without decimals, '%' columns with one decimal.
            if c in num:
                body_formats |= {'num_format': '0'}

            if c in perc:
                body_formats |= {'num_format': '0.0'}

            worksheet.write(row_to_write, col_to_write, df_desc.iloc[r, c], workbook.add_format(body_formats))


In [None]:
def _format_variable_label(var_name):
    """Turn a raw variable name into the label used in the 'Variable' column of the table."""
    if '_' not in var_name:
        return var_name

    has_mixed = 'Mixed' in var_name
    has_perc = '%' in var_name
    has_prob = 'Probability' in var_name

    if not has_mixed and not has_perc and not has_prob:
        # '<prefix>_<Category>' -> '<Category>-dominated'
        return f'{var_name.split("_")[1]}-dominated'.replace('_', ' ').strip()
    if has_mixed and not has_perc and not has_prob:
        # '<prefix>_Mixed...' -> 'Mixed... Gender'
        return f'{var_name.split("_")[1]} Gender'.replace('_', ' ')
    if not has_mixed and has_perc and not has_prob:
        # Percentage variables keep only the first word after the first underscore.
        return ' '.join(var_name.split('_')[1:]).split()[0]
    if not has_mixed and not has_perc and has_prob:
        # '<Name>_Probability...' -> '<Name> Probability'
        return f'{var_name.split("_")[0]} Probability'.replace('_', ' ')
    return var_name


def make_df_desc(df, df_name, vars_list, var_name, index_var, sentence_level=False, continous_var_names_list=None):
    """Build a descriptives table for ``vars_list`` at job-ad or sentence level.

    Args:
        df: Source dataframe; must contain 'Job ID' when ``sentence_level`` is falsy.
        df_name: Accepted for interface compatibility; not used in the computation.
        vars_list: Columns of ``df`` to summarise.
        var_name: Human-readable name of the variable group. If none of its words is in
            ``continous_var_names_list`` the group is treated as categorical and the
            ``rp.summary_cat`` output is merged with the ``rp.summary_cont`` output;
            otherwise only ``rp.summary_cont`` is used.
        index_var: Name given to the 'Variable' column, which becomes the index.
        sentence_level: If truthy, summarise every row; otherwise only the first row of
            each 'Job ID' group.
        continous_var_names_list: Words marking a variable group as continuous; defaults
            to ['Probabilities', 'Percentages'].

    Returns:
        DataFrame indexed by ``index_var`` with two-level columns
        (``'Job Advertisement'`` or ``'Sentence'``, statistic name).
    """
    if continous_var_names_list is None:
        continous_var_names_list = ['Probabilities', 'Percentages']

    # Evaluated once; it decides the summary type, the merge and the column clean-up below.
    is_categorical = not set(var_name.split()).intersection(continous_var_names_list)

    if sentence_level:
        level = 'Sentence'
    else:
        level = 'Job Advertisement'
        df = df.groupby('Job ID').first()

    # Warmth and Competence Categorical df
    if is_categorical:
        df_cat = rp.summary_cat(df[vars_list], ascending= True).round(2)
        # Blank variable names are filled with the last name above them;
        # .ffill() replaces the deprecated fillna(method='ffill').
        df_cat['Variable'] = df_cat['Variable'].replace('', np.nan).ffill()
        # Keep only the rows describing outcome 1, then append a 'Total' row.
        df_cat = df_cat.loc[df_cat['Outcome'] == 1].drop(columns=['Outcome'])
        totals = pd.DataFrame(df_cat.sum(numeric_only=True)).transpose()
        totals.insert(0, 'Variable', 'Total')
        df_cat = df_cat.fillna('')
        df_cat = pd.concat([df_cat, totals], axis='index', ignore_index=True)

    # Warmth and Competence Continuous df
    df_cont = rp.summary_cont(df[vars_list], conf = 0.95, decimals = 2)

    # Merged df
    if is_categorical:
        df_desc = df_cat.merge(df_cont, on='Variable', how='outer')
        df_desc = df_desc.fillna('')
    else:
        df_desc = df_cont

    # Rename variable columns
    df_desc['Variable'] = df_desc['Variable'].apply(_format_variable_label)

    # Clean up df and set index
    if is_categorical:
        drop_columns = ['N', 'SE', '95% Conf.', 'Interval']
        rename_dict = {'Variable': index_var, 'Count': 'n', 'Percent': '%', 'Mean': 'M'}
    else:
        drop_columns = ['N', 'SE']
        rename_dict = {'Variable': index_var, 'Mean': 'M', 'SD': 'SD', '95% Conf. Int.': '95% CI'}

    df_desc = df_desc.drop(columns=drop_columns)
    df_desc = df_desc.rename(columns=rename_dict)
    df_desc = df_desc.set_index(keys=[index_var], drop=True)

    # Make into MultiIndex
    df_desc.columns = pd.MultiIndex.from_product([[level], df_desc.columns])

    return df_desc


In [None]:
# Variable groups to describe: display name -> list of dataframe columns.
# The display names are not just labels: make_df_desc checks them for the words
# 'Probabilities' / 'Percentages', and make_desc_tables for 'Warmth and Competence'
# and 'Percentages'.
vars_dict = {
    'Gender Categorical Designation of Sector': ivs_gender_dummy,
    'Age Categorical Designation of Sector': ivs_age_dummy,
    'Gender Percentages per Sector (%)': ivs_gender_perc,
    'Age Percentages per Sector (%)': ivs_age_perc,
    'Warmth and Competence Categorical Coding': dvs,
    'Warmth and Competence Probabilities': dvs_prob,
}


In [None]:
def make_desc_tables(df_name, df, var_name, vars_list):
    """Build, save and print the descriptives table for one variable group of one dataframe.

    The job-ad-level and sentence-level tables from ``make_df_desc`` are merged on the
    index, a title level is added to the column headers and the result is saved as
    CSV, pickle, LaTeX, Markdown and Excel under ``table_save_path``.

    Args:
        df_name: 'df_manual' or 'df_jobs'; selects the title prefix.
        df: Dataframe to describe.
        var_name: Display name of the variable group (a key of ``vars_dict``).
        vars_list: Columns of ``df`` belonging to the group.

    Raises:
        ValueError: If ``df_name`` is not one of the known dataframe names.

    A KeyError raised while building or saving the table (e.g. a missing column) does
    not abort the run; the table is skipped and a message naming the key is printed.
    """
    if df_name == 'df_manual':
        title_prefix = 'Manually Annotated Dataset'
    elif df_name == 'df_jobs' and 'Warmth and Competence' not in var_name:
        title_prefix = 'Collected Dataset'
    elif df_name == 'df_jobs':
        title_prefix = 'Classifier Labeled'
    else:
        # Previously this surfaced later as an unbound `title_prefix`.
        raise ValueError(f'Unknown dataframe name: {df_name!r}')

    # Set index variable name
    if 'Warmth and Competence' in var_name:
        index_var = 'Stereotype-related frames'
    elif 'Percentages' in var_name:
        index_var = 'Percentages per Sector (PPS)'
    else:
        index_var = 'Sectors'

    try:
        # One table per level; make_df_desc itself decides between the categorical and
        # the continuous summary from `var_name`, so a single pair of calls is enough.
        df_desc_jobad = make_df_desc(df, df_name, vars_list=vars_list, var_name=var_name, index_var=index_var, sentence_level=False)
        df_desc_sent = make_df_desc(df, df_name, vars_list=vars_list, var_name=var_name, index_var=index_var, sentence_level=True)
        df_desc = df_desc_jobad.merge(df_desc_sent, on=index_var)

        # Add title prefix as the top header level
        levels_with_title = [[f'{title_prefix} Job Advertisements']]
        levels_with_title.extend(
            list(df_desc.columns.get_level_values(i).unique())
                for i in range(len(df_desc.columns.levels))
        )
        if 'Warmth and Competence' not in var_name:
            levels_with_title.insert(1, [var_name])

        # Make into MultiIndex
        df_desc.columns = pd.MultiIndex.from_product(levels_with_title)

        # Save Tables
        # File save path
        file_save_path = f'{table_save_path}descriptives {df_name} {title_prefix} {var_name} - Job Advertisement'
        # CSV
        df_desc.to_csv(f'{file_save_path}.csv', index=True)
        # PKL
        df_desc.to_pickle(f'{file_save_path}.pkl')
        # TEX
        with pd.option_context('max_colwidth', 10000000000):
            df_desc.style.to_latex(
                f'{file_save_path}.tex',
                convert_css=True,
                environment='longtable',
                hrules=True,
                multicol_align='c',
                position='H',
                caption=f'{var_name} Descriptives', label='Descriptives'
            )
        # MD
        df_desc.to_markdown(f'{file_save_path}.md', index=True)
        # EXCEL
        save_desc_excel(df_desc, index_var, title_prefix, file_save_path)

        print('\n')
        print(f'{"+"*20} {df_name.upper()} {"+"*20}\n')
        print(f'{var_name} Descriptives')
        print(df_desc)
        print('\n')
    except KeyError as err:
        # Keep the best-effort behaviour (no crash), but say what was skipped and why.
        print(f'Skipping "{var_name}" descriptives for {df_name}: missing key {err}')


In [None]:
%%time
if len(dataframes) <= 1:
    # Single registered dataframe: build the tables for every variable group in turn.
    for (df_name, df), (var_name, vars_list) in tqdm_product(dataframes.items(), vars_dict.items()):
        make_desc_tables(df_name, df, var_name, vars_list)
else:
    # Several dataframes: let `interact` choose the dataframe and the variable group.
    @interact(df_name=dataframes.keys(), var_name=vars_dict.keys())
    def make_desc_tables_interact(df_name, var_name):
        make_desc_tables(df_name, dataframes[df_name], var_name, vars_dict[var_name])
