In [1]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

if code_dir_name not in str(Path.cwd()).split('/')[-1]:
    for _ in range(5):

        parent_path = str(Path.cwd().parents[_]).split('/')[-1]

        if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

            code_dir = str(Path.cwd().parents[_])

            if code_dir is not None:
                break
else:
    code_dir = str(Path.cwd())
sys.path.append(code_dir)

# %load_ext autoreload
# %autoreload 2


In [2]:
from setup_module.imports import *  # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module import researchpy_fork as rp # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module import specification_curve_fork as specy # type:ignore # isort:skip # fmt:skip # noqa # nopep8


Using MPS


0it [00:00, ?it/s]

<Figure size 640x480 with 0 Axes>

# Functions

# Analysis plan:

1. ## [Descriptives and tables](./1.%20descriptives_and_tables.ipynb)
2. ## [Visualization](./2.%20visualization.ipynb)
3. ## [Frequencies and Normality tests](./2.%20frequencies_and_normality_test.ipynb)
   1. ### Frequencies, histograms, and QQ plots
      * Normal test
      * Kurtosis test
      * Shapiro
      * Anderson
      * Bartlett
   2. ### Correlation between independent variables (IVs) and control variables and Multicolinarity test
      * Pearson's R
      * VIF
     - ***ivs_dummy*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
     - ***ivs_perc*** (continous ratio) = Social category percentage per sector (0-100)
     - ***% Sector per Workforce*** (continous ratio) = Sector percentage per worksforce (0-100)
     - ***num_words*** (continous ratio) = Number of words in job description
     - ***English Requirement in Job Ad*** (binary nominal) = English requirement in job description (0 vs. 1)
     - ***Dutch Requirement in Job Ad*** (binary nominal) = Dutch requirement in job description (0 vs. 1)
     - ***Platform*** (binary dummy) = LinkedIn (0 vs. 1), Indeed (0 vs. 1), Glassdoor (0 vs. 1)

4. ## [ANOVA and Chi-square (Pearson's R)](./3.%20chisqt_and_anova.ipynb)

   1. ### Chi-square
      * **df_manual:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
      * **df_jobs:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)

   2. ### One-way ANOVA, interactions, and post-hoc test
      * **df_manual:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
          - If Levene's test is *not significant*, use classic ANOVA and Tukey's post hoc test
          - If Levene's test is *significant*, use Welch's and Kruskal-Wallis ANOVA and Games Howell's post hoc test
      * **df_jobs:**
         - ***dvs_prob*** (continous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
         - ***ivs*** (binary nominal) = Social category designation (Female, Male, Mixed Gender)
           - If Levene's test is *not significant*, use classic ANOVA and Tukey's post hoc test
           - If Levene's test is *significant*, use Welch's and Kruskal-Wallis ANOVA and Games Howell's post hoc test

5. ## [Regression Analysis](./3.%20regression_analysis.ipynb)
   1. ### Logistic Regression  with all interaction (smf):
      * **df_manual:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs_perc*** (continous ratio) = Social category percentage per sector (0-100)
      * **df_jobs:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs_perc*** (continous ratio) = Social category percentage per sector (0-100)
   2. ### OLS Regression with all interaction:
      * **df_jobs:**
        - ***dvs_prob*** (continous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
        - ***ivs_perc*** (continous ratio) = Social category percentage per sector (0-100)
   3. ### Multilevel OLS Regression with all interaction:
      * **df_jobs:**
        - ***dvs_prob*** (continous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
        - ***ivs_perc*** (continous ratio) = Social category percentage per sector (0-100)

6. ## [Specification Curve Analysis](./4.%20specification_curve_analysis.ipynb)

   1. ### Logistic Specification Curve Analysis:
      * **df_manual:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs_perc*** (continous ratio) = Social category percentage per sector (0-100)
      * **df_jobs:**
        - ***dvs*** (binary nominal) = 'Warmth' and 'Competence' (0 vs. 1)
        - ***ivs_perc*** (continous ratio) = Social category percentage per sector (0-100)
   2. ### OLS Specification Curve Analysis:
      * **df_jobs:**
        - ***dvs_prob*** (continous ratio) = 'Warmth' and 'Competence' probabilities (0-1)
        - ***ivs_perc*** (continous ratio) = Social category percentage per sector (0-100)


# READ DATA

In [3]:
with open(f'{data_dir}df_manual_for_analysis_len.txt', 'r') as f:
    df_manual_len = int(f.read())

df_manual = pd.read_pickle(f'{df_save_dir}df_manual_for_analysis.pkl')
assert len(df_manual) == df_manual_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_manual_len} BUT IS OF LENGTH {len(df_manual)}'
print(f'Dataframe loaded with shape: {df_manual.shape}')
df_manual = categorize_df_gender_age(df_manual)


Dataframe loaded with shape: (5653, 76)


In [4]:
with open(f'{data_dir}df_jobs_for_analysis_len.txt', 'r') as f:
    df_jobs_len = int(f.read())

df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
print(f'Dataframe loaded with shape: {df_jobs.shape}')
df_jobs = categorize_df_gender_age(df_jobs)


Dataframe loaded with shape: (309144, 79)


## Set dataframes

#### Dataframes dict

In [5]:
dataframes = {
    'df_jobs': df_jobs,
    # 'df_manual': df_manual,
}


# Frequencies


In [6]:
def run_freq_and_normality_tests(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    data_names = dvs[:]+ ivs_dummy_perc_and_perc_interactions[:]

    if df_name == 'df_manual':
        dvs_ = dvs[:]
    elif df_name == 'df_jobs':
        data_names.extend(dvs_prob[:])
        dvs_ = dvs_all[:]

    print('~'*20)
    # print(rp.codebook(df[data_names]))

    # Gender and Age
    print('-'*20)
    print(f'Categorical Summary {ivs}')
    freq_iv=rp.summary_cat(df[ivs]).round(3)
    print(freq_iv)
    freq_iv.to_csv(f'{table_save_path}frequencies {df_name} - Gender and Age.csv')
    freq_iv.style.to_latex(f'{table_save_path}frequencies {df_name} - Gender and Age.tex', hrules=True)
    print('-'*20)
    print('\n')

    # Gender and Age Percentages
    print('-'*20)
    print(f'Continuous Summary {ivs_perc_and_perc_interactions}')
    freq_iv_perc=rp.summarize(df[ivs_perc_and_perc_interactions], ci_level = 0.95, decimals = 3)
    print(freq_iv_perc)
    freq_iv_perc.to_csv(f'{table_save_path}frequencies {df_name} - Gender and Age Percentages.csv')
    freq_iv_perc.style.to_latex(f'{table_save_path}frequencies {df_name} - Gender and Age Percentages.tex', hrules=True)
    print('-'*20)
    print('\n')

    # Gender and Age Dummies
    print('-'*20)
    print(f'Continuous Summary {ivs_dummy}')
    freq_iv_dummy=rp.summarize(df[ivs_dummy], ci_level = 0.95, decimals = 3)
    print(freq_iv_dummy)
    freq_iv_dummy.to_csv(f'{table_save_path}frequencies {df_name} - Gender and Age Dummies.csv')
    freq_iv_dummy.style.to_latex(f'{table_save_path}frequencies {df_name} - Gender and Age Dummies.tex', hrules=True)
    print('-'*20)
    print('\n')

    # Gender and Age Counts
    print('-'*20)
    print(f'Continuous Summary {ivs_count}')
    freq_iv_count=rp.summarize(df[ivs_count], ci_level = 0.95, decimals = 3)
    print(freq_iv_count)
    freq_iv_count.to_csv(f'{table_save_path}frequencies {df_name} - Gender and Age Counts.csv')
    freq_iv_count.style.to_latex(f'{table_save_path}frequencies {df_name} - Gender and Age Counts.text')
    print('-'*20)
    print('\n')

    # Warmth and Competence
    print('-'*20)
    print(f'Categorical Summary {dvs}')
    freq_dv=rp.summary_cat(df[dvs]).round(3)
    print(freq_dv)
    freq_dv.to_csv(f'{table_save_path}frequencies {df_name} - Warmth and Competence.csv')
    freq_dv.style.to_latex(f'{table_save_path}frequencies {df_name} - Warmth and Competence.tex', hrules=True)
    print('-'*20)
    print('\n')

    if df_name == 'df_jobs':
        # Warmth and Competence Probabilities
        print('-'*20)
        print(f'Continuous Summary {dvs_prob}')
        freq_dv_prob=rp.summarize(df[dvs_prob], ci_level = 0.95, decimals = 3)
        print(freq_dv_prob)
        freq_dv_prob.to_csv(f'{table_save_path}frequencies {df_name} - Warmth and Competence Probabilities.csv')
        freq_dv_prob.style.to_latex(f'{table_save_path}frequencies {df_name} - Warmth and Competence Probabilities.tex', hrules=True)
        print('-'*20)
        print('\n')

        print('-'*20)
        print('Grouped Frequencies/ Summary ANOVAs Categorical Gender and Age')
        summary_aova = rp.summary_cont(df.groupby(ivs)[dvs_prob], conf=0.95, decimals=3)
        print(summary_aova)
        summary_aova.to_csv(f'{table_save_path}summary anova {df_name} - {ivs} x {dvs}.csv')
        summary_aova.style.to_latex(f'{table_save_path}summary anova {df_name} - {ivs} x {dvs}.tex', hrules=True)
        print('-'*20)
        print('\n')

    for iv, dv in tqdm_product(ivs, dvs):
        print('-'*20)
        print(f'Grouped Frequencies/ Summary ANOVAs {dv} with {iv} Dummies')
        summary_aova_probs_dummy = rp.summary_cont(df_jobs[dv].groupby(df_jobs[iv]), conf=0.95, decimals=3)
        print(summary_aova_probs_dummy)
        summary_aova_probs_dummy.to_csv(f'{table_save_path}summary anova probabilities dummy {df_name} - {iv} x {dv}.csv')
        summary_aova_probs_dummy.style.to_latex(f'{table_save_path}summary anova probabilities dummy {df_name} - {iv} x {dv}.tex', hrules=True)
        print('-'*20)
        print('\n')

    if df_name == 'df_jobs':
        print('-'*20)
        print('Grouped Frequencies/ Summary ANOVAs')
        summary_aova_probs = rp.summary_cont(df.groupby(ivs)[dvs_all], conf=0.95, decimals=3)
        print(summary_aova_probs)
        summary_aova_probs.to_csv(f'{table_save_path}summary anova probabilities {df_name} - {ivs} x {dvs_all}.csv')
        summary_aova_probs.style.to_latex(f'{table_save_path}summary anova probabilities {df_name} - {ivs} x {dvs_all}.tex', hrules=True)
        print('-'*20)
        print('\n')

        for iv, dv in tqdm_product(ivs, dvs_all):
            print('-'*20)
            print(f'Grouped Frequencies/ Summary ANOVAs {dv} with {iv} Dummies')
            summary_aova_probs_dummy = rp.summary_cont(df_jobs[dv].groupby(df_jobs[iv]), conf=0.95, decimals=3)
            print(summary_aova_probs_dummy)
            summary_aova_probs_dummy.to_csv(f'{table_save_path}summary anova probabilities dummy {df_name} - {iv} x {dv}.csv')
            summary_aova_probs_dummy.style.to_latex(f'{table_save_path}summary anova probabilities dummy {df_name} - {iv} x {dv}.tex', hrules=True)
            print('-'*20)
            print('\n')

    # # Histogram
    # df[ivs_perc].hist()
    # plt.show()
    # plt.clf()
    # plt.cla()
    # plt.close()
    # print('-'*20)
    # print('\n')

    # df[ivs_count].hist()
    # plt.show()
    # plt.clf()
    # plt.cla()
    # plt.close()
    # print('-'*20)
    # print('\n')

    # df[ivs_dummy].hist()
    # plt.show()
    # plt.clf()
    # plt.cla()
    # plt.close()
    # print('-'*20)
    # print('\n')

    # if df_name == 'df_jobs':
    #     # Histogram
    #     df[dvs_prob].hist()
    #     plt.show()
    #     plt.clf()
    #     plt.cla()
    #     plt.close()
    #     print('-'*20)
    #     print('\n')

    # # QQ plot
    # qq_plot = pg.qqplot(df[ivs_perc], dist='norm')
    # plt.show()
    # plt.clf()
    # plt.cla()
    # plt.close()
    # print('-'*20)
    # print('\n')

    # qq_plot = pg.qqplot(df[ivs_count], dist='norm')
    # plt.show()
    # plt.clf()
    # plt.cla()
    # plt.close()
    # print('-'*20)
    # print('\n')

    # qq_plot = pg.qqplot(df[ivs_dummy], dist='norm')
    # plt.show()
    # plt.clf()
    # plt.cla()
    # plt.close()
    # print('-'*20)
    # print('\n')

    # if df_name == 'df_jobs':
    #     # QQ plot dvs_prob
    #     qq_plot = pg.qqplot(df[dvs_prob], dist='norm')
    #     plt.show()
    #     plt.clf()
    #     plt.cla()
    #     plt.close()
    #     print('-'*20)
    #     print('\n')


In [7]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_freq_and_normality_tests_interactive(df_name):
        run_freq_and_normality_tests(df_name=df_name, df=dataframes[df_name])
else:
    run_freq_and_normality_tests(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
~~~~~~~~~~~~~~~~~~~~
--------------------
Categorical Summary ['Gender', 'Age']
  Variable    Outcome      Count  Percent
0  Gender   Mixed Gender  117735  38.08  
1                   Male  112991  36.55  
2                 Female   78418  25.37  
3     Age      Mixed Age  198190  64.11  
4                  Older   62959  20.37  
5                Younger   47995  15.53  
--------------------


--------------------
Continuous Summary ['Gender_Female_% per Sector', 'Gender_Male_% per Sector', 'Age_Older_% per Sector', 'Age_Younger_% per Sector', 'Interaction_Female_Older_% per Sector', 'Interaction_Female_Younger_% per Sector', 'Interaction_Male_Older_% per Sector', 'Interaction_Male_Younger_% per Sector']
                    Name                      N      Mean   Median  Variance     SD    SE             95% Conf. Interval           
0               Gender_Female_%

  0%|          | 0/4 [00:00<?, ?it/s]

--------------------
Grouped Frequencies/ Summary ANOVAs Warmth with Gender Dummies


                 N    Mean  SD   SE   95% Conf.  Interval
Gender                                                   
Female         78418  0.24 0.43 0.00    0.24       0.25  
Mixed Gender  117735  0.26 0.44 0.00    0.26       0.26  
Male          112991  0.22 0.41 0.00    0.21       0.22  
--------------------


--------------------
Grouped Frequencies/ Summary ANOVAs Competence with Gender Dummies


                 N    Mean  SD   SE   95% Conf.  Interval
Gender                                                   
Female         78418  0.44 0.50 0.00    0.44       0.44  
Mixed Gender  117735  0.48 0.50 0.00    0.47       0.48  
Male          112991  0.50 0.50 0.00    0.49       0.50  
--------------------


--------------------
Grouped Frequencies/ Summary ANOVAs Warmth with Age Dummies


              N    Mean  SD   SE   95% Conf.  Interval
Age                                                   
Older

  0%|          | 0/8 [00:00<?, ?it/s]

--------------------
Grouped Frequencies/ Summary ANOVAs Warmth with Gender Dummies


                 N    Mean  SD   SE   95% Conf.  Interval
Gender                                                   
Female         78418  0.24 0.43 0.00    0.24       0.25  
Mixed Gender  117735  0.26 0.44 0.00    0.26       0.26  
Male          112991  0.22 0.41 0.00    0.21       0.22  
--------------------


--------------------
Grouped Frequencies/ Summary ANOVAs Competence with Gender Dummies


                 N    Mean  SD   SE   95% Conf.  Interval
Gender                                                   
Female         78418  0.44 0.50 0.00    0.44       0.44  
Mixed Gender  117735  0.48 0.50 0.00    0.47       0.48  
Male          112991  0.50 0.50 0.00    0.49       0.50  
--------------------


--------------------
Grouped Frequencies/ Summary ANOVAs Warmth_Probability with Gender Dummies


                 N    Mean  SD   SE   95% Conf.  Interval
Gender                                    

# Normality Tests


In [8]:
def run_normality_tests(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    if df_name == 'df_manual':
        dvs_ = dvs
    elif df_name == 'df_jobs':
        dvs_ = dvs_all

    # Test of Normality for skew and kurtosis
    # if p < alpha, the null hypothesis is rejected,\nis not from a normal distribution
    norm = scipy.stats.normaltest(df[dvs_])
    print('\n')
    print('='*80)
    print(f'{dvs_} Test of Normality:')
    print('-'*80)
    for key, val in dict(zip(normality_tests_labels, norm)).items():
        print(key,': ', val) # not significant
    print('\n')

    # Shapir-Wilk Test of Normality
    # if p < alpha, the null hypothesis is rejected,\nis not from a normal distribution
    norm_res = scipy.stats.shapiro(df[dvs_])
    print('\n')
    print('='*80)
    print(f'{dvs_} Shapir-Wilk Test of Normality:')
    print('-'*80)
    for key, val in dict(zip(normality_tests_labels, norm_res)).items():
        print(key,': ', val) # significant
    print('\n')

    for dv, iv in tqdm_product(dvs_, ivs_dummy_perc_and_perc_interactions):
        print('\n')
        print('+'*120)
        print(f'Dependent Variable: {dv}\nIndependent Variable: {iv}')
        print('\n')
        print('~' * 20)

        # Anderson-Darling Test of Normality
        # if p < alpha, the null hypothesis is rejected, x is not from a normal distribution
        norm_and = scipy.stats.anderson(df[dv])
        print('\n')
        print('='*80)
        print('Anderson-Darling Test of Normality:')
        print('\n')
        print('~' * 20)
        print(f'{iv} x {dv}')
        for key, val in dict(zip(normality_tests_labels, norm_and)).items():
            print(key,': ', val) # not significant
        print('\n')
        if norm_and.fit_result.success:
            print('Anderson-Darling Test of Normality: The test was successful.')
        else:
            print('Anderson-Darling Test of Normality: The test was not successful.')
        print('~' * 20)
        print('\n')

        # NORMALITY TESTS
        print('\n')
        print('='*80)
        print('NORMALITY TEST')
        print('\n')
        print('~' * 20)
        print(f'{iv} x {dv}')
        norm = pg.normality(data=df, dv=dv, group=iv).round(3)
        if normal := all(norm.normal == True):
            print(f"{iv} x {dv} Normality test: All groups are normally distributed.")
        else:
            print(f"{iv} x {dv} Normality test: Not all groups are normally distributed.")
        print(f"{iv} x {dv} Normality test:\n{norm}")
        norm.to_csv(f'{table_save_path}normality {df_name} - {iv} x {dv}.csv')
        norm.style.to_latex(f'{table_save_path}normality {df_name} - {iv} x {dv}.tex', hrules=True)
        print('~' * 20)
        print('\n')

        # # ANOVA SPHERICITY TEST
        # print('\n')
        # print('='*80)
        # print('SPHERICITY TEST')
        # print('\n')
        # print('~' * 20)
        # print(f'{iv} x {dv}')
        # spher_all = pg.sphericity(data=df, dv=dv, within=iv, method='mauchly')
        # spher, test_stat, chisq, dof, pval = spher_all
        # print('-' * 20)
        # print(f"{iv} x {dv} Sphericity test:\n{spher} at p-value: {round(pval, 3)}, chi-square: {round(chisq, 3)}, degrees of freedom: {round(dof)}, Test statistic: {round(test_stat)}") # if p-value < 0.05, then the data are not spherically distributed = Multivariate analysis
        # # spher.to_csv(f'{table_save_path}sphericity {df_name} - {iv} x {dv}.csv')
        # # spher.style.to_latex(f'{table_save_path}sphericity {df_name} - {iv} x {dv}.tex', hrules=True)
        # print('~' * 20)
        # print('\n')

        # BARTLETTS TESTS
        print('\n')
        print('='*80)
        print("BARTLETT'S TEST")
        print('\n')
        print('~' * 20)
        print(f'{iv} x {dv}')
        bartlett = pg.homoscedasticity(data=df, dv=dv, group=iv, method='bartlett').round(3) #dv
        if equal_var_bartlett := eval(bartlett.equal_var.to_string(index=False)):
            print(f"{iv} x {dv} Bartlett's test: All groups have equal variances.")
        else:
            print(f"{iv} x {dv} Bartlett's test: Not all groups have equal variances.")
        print(f"{iv} x {dv} Bartlett's test:\n{bartlett}")
        bartlett.to_csv(f"{table_save_path}bartlett's {df_name} - {iv} x {dv}.csv")
        bartlett.style.to_latex(f"{table_save_path}bartlett's {df_name} - {iv} x {dv}.tex")
        print('~' * 20)
        print('\n')



In [9]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_normality_tests_interactive(df_name):
        run_normality_tests(df_name=df_name, df=dataframes[df_name])
else:
    run_normality_tests(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


['Warmth', 'Competence', 'Warmth_Probability', 'Competence_Probability'] Test of Normality:
--------------------------------------------------------------------------------
Statistic :  [  55181.4952479  1059724.20965701  241454.74598387 1219933.10659379]
p-value :  [0. 0. 0. 0.]




['Warmth', 'Competence', 'Warmth_Probability', 'Competence_Probability'] Shapir-Wilk Test of Normality:
--------------------------------------------------------------------------------
Statistic :  0.7312663197517395
p-value :  0.0




  0%|          | 0/56 [00:00<?, ?it/s]



++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dependent Variable: Warmth
Independent Variable: Gender_Female


~~~~~~~~~~~~~~~~~~~~


Anderson-Darling Test of Normality:


~~~~~~~~~~~~~~~~~~~~
Gender_Female x Warmth
Statistic :  75699.0008912711
p-value :  [0.576 0.656 0.787 0.918 1.092]


Anderson-Darling Test of Normality: The test was successful.
~~~~~~~~~~~~~~~~~~~~




NORMALITY TEST


~~~~~~~~~~~~~~~~~~~~
Gender_Female x Warmth
Gender_Female x Warmth Normality test: Not all groups are normally distributed.
Gender_Female x Warmth Normality test:
                W   pval  normal
Gender_Female                   
0.00          0.53  0.00   False
1.00          0.53  0.00   False
~~~~~~~~~~~~~~~~~~~~




BARTLETT'S TEST


~~~~~~~~~~~~~~~~~~~~
Gender_Female x Warmth
Gender_Female x Warmth Bartlett's test: All groups have equal variances.
Gender_Female x Warmth Bartlett's test:
           T   pval  equal_var
bar

## Correlation between IVs and Control Variables (Multicollinearity)

### Categorical Gender Sectors

In [10]:
def run_corr_cat_gender_sectors(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    # Full chi-square
    expected, observed, full_chisqt = pg.chi2_independence(data=df, x=ivs_dummy[0], y=ivs_dummy[2])
    print('\n')
    print('+'*120)
    print(f'IV 1: {ivs_dummy[0]}\nIV 2: {ivs_dummy[2]}')
    print('\n')
    print('~' * 20)
    print('FULL CHI-SQUARE TEST:')
    print('-'*20)
    print(f'Observed Count:\n{observed}\n')
    print('-'*20)
    print(f'Expected Count:\n{expected}\n')
    print('-'*20)
    print(f'Chi-square:\n{full_chisqt.round(3)}\n')
    print('~' * 20)
    chi_to_save = pd.concat([pd.concat([observed, pd.DataFrame(expected)], axis='index'), full_chisqt], axis='index')
    chi_to_save.to_csv(f'{table_save_path}chi-square {df_name} - {ivs_dummy[0]} x {ivs_dummy[2]}.csv')
    chi_to_save.style.to_latex(f'{table_save_path}chi-square {df_name} - {ivs_dummy[0]} x {ivs_dummy[2]}.tex', hrules=True)

    # Chi-square
    chisqt = pd.crosstab(df[ivs_dummy[0]], df[ivs_dummy[2]])
    pearson_r, p_value, dof, expected = scipy.stats.chi2_contingency(chisqt)
    reject_H0 = p_value > alpha
    reject_H = p_value < alpha

    # if not reject_H0 and reject_H:
    #     print('\n')
    #     print('+'*120)
    #     print(f'IV 1: {ivs_dummy[0]}\nIV 2: {ivs_dummy[2]}\nNOT SIGNIFICANT at p-value: {p_value:.3f}!')
    #     print('\n')
    #     print('~' * 20)
    #     print(f"Pearsons's R: {pearson_r}.\np-value: {p_value:.3f}.\nDegree of freedom: {dof}.\nH0 Rejected: {reject_H0}\nH Rejected: {reject_H}")
    #     print('\n')
    # elif reject_H0 and not reject_H:
    print('\n')
    print('+'*120)
    print(f'IV 1: {ivs_dummy[0]}\nIV 2: {ivs_dummy[2]}')
    print('\n')
    print('~' * 20)
    print(f"Pearsons's R: {pearson_r}.\np-value: {p_value:.3f}.\nDegree of freedom: {dof}.\nH0 Rejected: {reject_H0}\nH Rejected: {reject_H}")
    print('-'*20)
    print(f'Observed Count:\n{chisqt}\n')
    print('-'*20)
    print(f'Expected Count:\n{expected}\n')
    print('~' * 20)


### Categorical Age Sectors

In [11]:
def run_corr_cat_age_sectors(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    # Full chi-square
    expected, observed, full_chisqt = pg.chi2_independence(data=df, x=ivs_dummy[3], y=ivs_dummy[5])
    print('\n')
    print('+'*120)
    print(f'IV 1: {ivs_dummy[3]}\nIV 2: {ivs_dummy[5]}')
    print('\n')
    print('~' * 20)
    print('FULL CHI-SQUARE TEST:')
    print('-'*20)
    print(f'Observed Count:\n{observed}\n')
    print('-'*20)
    print(f'Expected Count:\n{expected}\n')
    print('-'*20)
    print(f'Chi-square:\n{full_chisqt.round(3)}\n')
    print('~' * 20)
    chi_to_save = pd.concat([pd.concat([observed, pd.DataFrame(expected)], axis='index'), full_chisqt], axis='index')
    chi_to_save.to_csv(f'{table_save_path}chi-square {df_name} - {ivs_dummy[3]} x {ivs_dummy[5]}.csv')
    chi_to_save.style.to_latex(f'{table_save_path}chi-square {df_name} - {ivs_dummy[3]} x {ivs_dummy[5]}.tex', hrules=True)

    # Chi-square
    chisqt = pd.crosstab(df[ivs_dummy[3]], df[ivs_dummy[5]])
    pearson_r, p_value, dof, expected = scipy.stats.chi2_contingency(chisqt)
    reject_H0 = p_value > alpha
    reject_H = p_value < alpha

    # if not reject_H0 and reject_H:
    #     print('\n')
    #     print('+'*120)
    #     print(f'IV 1: {ivs_dummy[3]}\nIV 2: {ivs_dummy[5]}\nNOT SIGNIFICANT at p-value: {p_value:.3f}!')
    #     print('\n')
    #     print('~' * 20)
    #     print(f"Pearsons's R: {pearson_r}.\np-value: {p_value:.3f}.\nDegree of freedom: {dof}.\nH0 Rejected: {reject_H0}\nH Rejected: {reject_H}")
    #     print('\n')
    # elif reject_H0 and not reject_H:
    print('\n')
    print('+'*120)
    print(f'IV 1: {ivs_dummy[3]}\nIV 2: {ivs_dummy[5]}')
    print('\n')
    print('~' * 20)
    print(f"Pearsons's R: {pearson_r}.\np-value: {p_value:.3f}.\nDegree of freedom: {dof}.\nH0 Rejected: {reject_H0}\nH Rejected: {reject_H}")
    print('-'*20)
    print(f'Observed Count:\n{chisqt}\n')
    print('-'*20)
    print(f'Expected Count:\n{expected}\n')
    print('~' * 20)


### Categorical Gender and Age Sectors

In [12]:
def run_corr_cat_gender_and_age_sectors(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    for iv_gender_dummy, iv_age_dummy in tqdm_product(ivs_gender_dummy, ivs_age_dummy):
        # Full chi-square
        expected, observed, full_chisqt = pg.chi2_independence(data=df, x=iv_gender_dummy, y=iv_age_dummy)
        print('\n')
        print('+'*120)
        print(f'IV 1: {iv_gender_dummy}\nIV 2: {iv_age_dummy}')
        print('\n')
        print('~' * 20)
        print('FULL CHI-SQUARE TEST:')
        print('-'*20)
        print(f'Observed Count:\n{observed}\n')
        print('-'*20)
        print(f'Expected Count:\n{expected}\n')
        print('-'*20)
        print(f'Chi-square:\n{full_chisqt.round(3)}\n')
        print('~' * 20)
        chi_to_save = pd.concat([pd.concat([observed, pd.DataFrame(expected)], axis='index'), full_chisqt], axis='index')
        chi_to_save.to_csv(f'{table_save_path}chi-square {df_name} - {iv_gender_dummy} x {iv_age_dummy}.csv')
        chi_to_save.style.to_latex(f'{table_save_path}chi-square {df_name} - {iv_gender_dummy} x {iv_age_dummy}.tex', hrules=True)

        # Chi-square
        chisqt = pd.crosstab(df[iv_gender_dummy], df[iv_age_dummy])
        pearson_r, p_value, dof, expected = scipy.stats.chi2_contingency(chisqt)
        reject_H0 = p_value > alpha
        reject_H = p_value < alpha

        # if not reject_H0 and reject_H:
        #     print('\n')
        #     print('+'*120)
        #     print(f'IV 1: {iv_gender_dummy}\nIV 2: {iv_age_dummy}\nNOT SIGNIFICANT at p-value: {p_value:.3f}!')
        #     print('\n')
        #     print('~' * 20)
        #     print(f"Pearsons's R: {pearson_r}.\np-value: {p_value:.3f}.\nDegree of freedom: {dof}.\nH0 Rejected: {reject_H0}\nH Rejected: {reject_H}")
        #     print('\n')
        # elif reject_H0 and not reject_H:
        print('\n')
        print('+'*120)
        print(f'IV 1: {iv_gender_dummy}\nIV 2: {iv_age_dummy}')
        print('\n')
        print('~' * 20)
        print(f"Pearsons's R: {pearson_r}.\np-value: {p_value:.3f}.\nDegree of freedom: {dof}.\nH0 Rejected: {reject_H0}\nH Rejected: {reject_H}")
        print('-'*20)
        print(f'Observed Count:\n{chisqt}\n')
        print('-'*20)
        print(f'Expected Count:\n{expected}\n')
        print('~' * 20)


### Binary Warmth and Competence

In [13]:
def run_corr_bi_warmth_and_competence(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    # Full chi-square
    expected, observed, full_chisqt = pg.chi2_independence(data=df, x=dvs[0], y=dvs[1])
    print('\n')
    print('+'*120)
    print(f'DV 1: {dvs[0]}\nDV 2: {dvs[1]}')
    print('\n')
    print('~' * 20)
    print('FULL CHI-SQUARE TEST:')
    print('-'*20)
    print(f'Observed Count:\n{observed}\n')
    print('-'*20)
    print(f'Expected Count:\n{expected}\n')
    print('-'*20)
    print(f'Chi-square:\n{full_chisqt.round(3)}\n')
    print('~' * 20)
    chi_to_save = pd.concat([pd.concat([observed, pd.DataFrame(expected)], axis='index'), full_chisqt], axis='index')
    chi_to_save.to_csv(f'{table_save_path}chi-square {df_name} - {dvs[0]} x {dvs[1]}.csv')
    chi_to_save.style.to_latex(f'{table_save_path}chi-square {df_name} - {dvs[0]} x {dvs[1]}.tex', hrules=True)

    # Chi-square
    chisqt = pd.crosstab(df[dvs[0]], df[dvs[1]])
    pearson_r, p_value, dof, expected = scipy.stats.chi2_contingency(chisqt)
    reject_H0 = p_value > alpha
    reject_H = p_value < alpha

    # if not reject_H0 and reject_H:
    #     print('\n')
    #     print('+'*120)
    #     print(f'IV 1: {dvs[0]}\nIV 2: {dvs[1]}\nNOT SIGNIFICANT at p-value: {p_value:.3f}!')
    #     print('\n')
    #     print('~' * 20)
    #     print(f"Pearsons's R: {pearson_r}.\np-value: {p_value:.3f}.\nDegree of freedom: {dof}.\nH0 Rejected: {reject_H0}\nH Rejected: {reject_H}")
    #     print('\n')
    # elif reject_H0 and not reject_H:
    print('\n')
    print('+'*120)
    print(f'DV 1: {dvs[0]}\nDV 2: {dvs[1]}')
    print('\n')
    print('~' * 20)
    print(f"Pearsons's R: {pearson_r}.\np-value: {p_value:.3f}.\nDegree of freedom: {dof}.\nH0 Rejected: {reject_H0}\nH Rejected: {reject_H}")
    print('-'*20)
    print(f'Observed Count:\n{chisqt}\n')
    print('-'*20)
    print(f'Expected Count:\n{expected}\n')
    print('~' * 20)


### Categorical Language Requirement

In [14]:
def run_corr_cat_lang_req(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    # Full chi-square
    expected, observed, full_chisqt = pg.chi2_independence(data=df, x=controls[2], y=controls[3])
    print('\n')
    print('+'*120)
    print(f'Control 1: {controls[2]}\nControl 2: {controls[3]}')
    print('\n')
    print('~' * 20)
    print('FULL CHI-SQUARE TEST:')
    print('-'*20)
    print(f'Observed Count:\n{observed}\n')
    print('-'*20)
    print(f'Expected Count:\n{expected}\n')
    print('-'*20)
    print(f'Chi-square:\n{full_chisqt.round(3)}\n')
    print('~' * 20)
    chi_to_save = pd.concat([pd.concat([observed, pd.DataFrame(expected)], axis='index'), full_chisqt], axis='index')
    chi_to_save.to_csv(f'{table_save_path}chi-square {df_name} - {controls[2]} x {controls[3]}.csv')
    chi_to_save.style.to_latex(f'{table_save_path}chi-square {df_name} - {controls[2]} x {controls[3]}.tex', hrules=True)

    # Chi-square
    chisqt = pd.crosstab(df[controls[2]], df[controls[3]])
    pearson_r, p_value, dof, expected = scipy.stats.chi2_contingency(chisqt)
    reject_H0 = p_value > alpha
    reject_H = p_value < alpha

    # if not reject_H0 and reject_H:
    #     print('\n')
    #     print('+'*120)
    #     print(f'Control 1: {controls[2]}\nControl 2: {controls[3]}\nNOT SIGNIFICANT at p-value: {p_value:.3f}!')
    #     print('\n')
    #     print('~' * 20)
    #     print(f"Pearsons's R: {pearson_r}.\np-value: {p_value:.3f}.\nDegree of freedom: {dof}.\nH0 Rejected: {reject_H0}\nH Rejected: {reject_H}")
    #     print('\n')
    # elif reject_H0 and not reject_H:
    print('\n')
    print('+'*120)
    print(f'Control 1: {controls[2]}\nControl 2: {controls[3]}')
    print('\n')
    print('~' * 20)
    print(f"Pearsons's R: {pearson_r}.\np-value: {p_value:.3f}.\nDegree of freedom: {dof}.\nH0 Rejected: {reject_H0}\nH Rejected: {reject_H}")
    print('-'*20)
    print(f'Observed Count:\n{chisqt}\n')
    print('-'*20)
    print(f'Expected Count:\n{expected}\n')
    print('~' * 20)


In [15]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_corr_interactive(df_name):
        run_corr_cat_gender_sectors(df_name=df_name, df=dataframes[df_name])
        run_corr_cat_age_sectors(df_name=df_name, df=dataframes[df_name])
        run_corr_cat_gender_and_age_sectors(df_name=df_name, df=dataframes[df_name])
        run_corr_bi_warmth_and_competence(df_name=df_name, df=dataframes[df_name])
        run_corr_cat_lang_req(df_name=df_name, df=dataframes[df_name])
else:
    run_corr_cat_gender_sectors(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])
    run_corr_cat_age_sectors(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])
    run_corr_cat_gender_and_age_sectors(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])
    run_corr_bi_warmth_and_competence(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])
    run_corr_cat_lang_req(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
IV 1: Gender_Female
IV 2: Gender_Male


~~~~~~~~~~~~~~~~~~~~
FULL CHI-SQUARE TEST:
--------------------
Observed Count:
Gender_Male      0.00      1.00  
Gender_Female                    
0.00          117735.50 112990.50
1.00           78417.50      0.50

--------------------
Expected Count:
Gender_Male      0.00     1.00  
Gender_Female                   
0.00          146396.49 84329.51
1.00           49756.51 28661.49

--------------------
Chi-square:
          test         lambda      chi2      dof  pval  cramer  power
0             pearson   1.00       60522.07 1.00  0.00   0.44   1.00 
1        cressie-read   0.67       65157.59 1.00  0.00   0.46   1.00 
2      log-likelihood   0.00       86145.93 1.00  0.00   0.53   1.00 
3       freema

  0%|          | 0/9 [00:00<?, ?it/s]



++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
IV 1: Gender_Female
IV 2: Age_Older


~~~~~~~~~~~~~~~~~~~~
FULL CHI-SQUARE TEST:
--------------------
Observed Count:
Age_Older        0.00     1.00  
Gender_Female                   
0.00          167767.50 62958.50
1.00           78417.50     0.50

--------------------
Expected Count:
Age_Older        0.00     1.00  
Gender_Female                   
0.00          183737.29 46988.71
1.00           62447.71 15970.29

--------------------
Chi-square:
          test         lambda     chi2      dof  pval  cramer  power
0             pearson   1.00      26868.85 1.00  0.00   0.29   1.00 
1        cressie-read   0.67      29784.19 1.00  0.00   0.31   1.00 
2      log-likelihood   0.00      42033.56 1.00  0.00   0.37   1.00 
3       freeman-tukey  -0.50      72916.74 1.00  0.00   0.49   1.00 
4  mod-log-likelihood  -1.00     308753.91 1.00  0.00   1.00   1.00 
5        

# VIF

In [16]:
# compute the vif for all given features
def compute_vif(df, considered_features):

    X = df[considered_features]
    # the calculation of variance inflation requires a constant
    X.insert(0, 'intercept', 1)

    # create dataframe to store vif values
    vif = pd.DataFrame()
    vif['Variable'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif = vif.loc[vif['Variable']!='intercept']

    return vif


### VIF for Percentage IVs

In [17]:
def run_vif_ivs_perc_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    for iv_gender_perc, iv_age_perc in tqdm_product(ivs_gender_perc, ivs_age_perc):
        considered_features = [iv_gender_perc, iv_age_perc] + controls[:2]
        vif = compute_vif(df, considered_features)
        print(vif.sort_values('VIF', ascending=False))


In [18]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_perc_controls_interactive(df_name):
        run_vif_ivs_perc_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_perc_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


  0%|          | 0/4 [00:00<?, ?it/s]

                    Variable                    VIF
3                       % Sector per Workforce 1.03
2                       Age_Older_% per Sector 1.02
1                   Gender_Female_% per Sector 1.01
4  Job Description spacy_sentencized_num_words 1.00
                    Variable                    VIF
3                       % Sector per Workforce 1.03
2                     Age_Younger_% per Sector 1.02
1                   Gender_Female_% per Sector 1.01
4  Job Description spacy_sentencized_num_words 1.00
                    Variable                    VIF
3                       % Sector per Workforce 1.03
2                       Age_Older_% per Sector 1.02
1                     Gender_Male_% per Sector 1.01
4  Job Description spacy_sentencized_num_words 1.00
                    Variable                    VIF
3                       % Sector per Workforce 1.03
2                     Age_Younger_% per Sector 1.02
1                     Gender_Male_% per Sector 1.01
4  Job Descr

### VIF for Categorical Dummy IVs

In [19]:
def run_vif_ivs_dummy(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = ivs_dummy[:]
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))


In [20]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_dummy_interactive(df_name):
        run_vif_ivs_dummy(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_dummy(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     Variable          VIF    
1  Gender_Female           inf
6    Age_Younger           inf
4      Age_Older 6263032691.75
3    Gender_Male  617549047.15
5      Age_Mixed    6289852.71
2   Gender_Mixed     739006.58
CPU times: user 4.98 s, sys: 732 ms, total: 5.71 s
Wall time: 1.87 s


In [21]:
def run_vif_ivs_dummy_no_mixed(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = ivs_dummy[:]
    considered_features.remove('Age_Mixed')
    considered_features.remove('Gender_Mixed')
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))


In [22]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_dummy_no_mixed_interactive(df_name):
        run_vif_ivs_dummy_no_mixed(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_dummy_no_mixed(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     Variable     VIF
2    Gender_Male 1.79
1  Gender_Female 1.64
4    Age_Younger 1.46
3      Age_Older 1.31
CPU times: user 1.76 s, sys: 590 ms, total: 2.35 s
Wall time: 637 ms


In [23]:
def run_vif_num(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = ['Gender_Num', 'Age_Num']
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))


In [24]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_num_interactive(df_name):
        run_vif_num(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_num(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    Variable   VIF
2     Age_Num 1.13
1  Gender_Num 1.13
CPU times: user 695 ms, sys: 97.4 ms, total: 792 ms
Wall time: 238 ms


In [25]:
def run_vif_ivs_dummy_genxage_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    for iv_gender_dummy, iv_age_dummy in tqdm_product(ivs_gender_dummy, ivs_age_dummy):
        print('-'*20)
        print(f'VIF for {iv_gender_dummy} x {iv_age_dummy}')
        considered_features = [iv_gender_dummy, iv_age_dummy] + controls[:2]
        vif = compute_vif(df, considered_features)
        print(vif.sort_values('VIF', ascending=False))


In [26]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_dummy_genxage_controls_interactive(df_name):
        run_vif_ivs_dummy_genxage_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_dummy_genxage_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


  0%|          | 0/9 [00:00<?, ?it/s]

--------------------
VIF for Gender_Female x Age_Older
                    Variable                    VIF
2                                    Age_Older 1.15
1                                Gender_Female 1.11
3                       % Sector per Workforce 1.05
4  Job Description spacy_sentencized_num_words 1.00
--------------------
VIF for Gender_Female x Age_Mixed
                    Variable                    VIF
2                                    Age_Mixed 1.36
1                                Gender_Female 1.28
3                       % Sector per Workforce 1.10
4  Job Description spacy_sentencized_num_words 1.00
--------------------
VIF for Gender_Female x Age_Younger
                    Variable                    VIF
2                                  Age_Younger 1.09
1                                Gender_Female 1.07
3                       % Sector per Workforce 1.02
4  Job Description spacy_sentencized_num_words 1.00
--------------------
VIF for Gender_Mixed x Age_Older

In [27]:
def run_vif_ivs_dummy_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = ivs_dummy[:] + controls[:2]
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))
    vif.to_csv(f'{table_save_path}vif {df_name} - {ivs_dummy} x Controls.csv')
    vif.style.to_latex(f'{table_save_path}vif {df_name} - {ivs_dummy} x Controls.tex', hrules=True)


In [28]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_dummy_controls_interactive(df_name):
        run_vif_ivs_dummy_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_dummy_controls(df_name=list(dataframes.keys())[0], df=dataframes[list(dataframes.keys())[0]])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                    Variable                           VIF      
4                                    Age_Older 21862134113449.01
6                                  Age_Younger  6148258876956.31
3                                  Gender_Male   159320761559.05
1                                Gender_Female   102743327075.65
2                                 Gender_Mixed     1272567080.58
5                                    Age_Mixed        9582381.88
7                       % Sector per Workforce              1.40
8  Job Description spacy_sentencized_num_words              1.00
CPU times: user 8.59 s, sys: 1.09 s, total: 9.68 s
Wall time: 2.7 s


In [29]:
def run_vif_ivs_dummy_no_mixed_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = ivs_dummy[:] + controls[:2]
    considered_features.remove('Age_Mixed')
    considered_features.remove('Gender_Mixed')
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))
    vif.to_csv(f'{table_save_path}vif {df_name} - MIXED REMOVED {ivs_dummy} x Controls.csv')
    vif.style.to_latex(f'{table_save_path}vif {df_name} - MIXED REMOVED {ivs_dummy} x Controls.tex', hrules=True)


In [30]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_dummy_no_mixed_controls_interactive(df_name):
        run_vif_ivs_dummy_no_mixed_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_dummy_no_mixed_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                    Variable                    VIF
2                                  Gender_Male 2.27
1                                Gender_Female 1.94
4                                  Age_Younger 1.77
5                       % Sector per Workforce 1.40
3                                    Age_Older 1.34
6  Job Description spacy_sentencized_num_words 1.00
CPU times: user 4.23 s, sys: 689 ms, total: 4.92 s
Wall time: 1.37 s


In [31]:
def run_vif_ivs_num_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    print('-'*20)
    print('VIF for Gender_Num x Age_Num')
    considered_features = ['Gender_Num', 'Age_Num'] + controls[:2]
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))


In [32]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_num_controls_interactive(df_name):
        run_vif_ivs_num_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_num_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
--------------------
VIF for Gender_Num x Age_Num
                    Variable                    VIF
1                                   Gender_Num 1.16
2                                      Age_Num 1.13
3                       % Sector per Workforce 1.03
4  Job Description spacy_sentencized_num_words 1.00
CPU times: user 1.82 s, sys: 275 ms, total: 2.1 s
Wall time: 598 ms


In [33]:
def run_vif_ivs_perc_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = ivs_perc[:] + controls[:2]
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))
    vif.to_csv(f'{table_save_path}vif {df_name} - {ivs_perc} x Controls.csv')
    vif.style.to_latex(f'{table_save_path}vif {df_name} - {ivs_perc} x Controls.tex', hrules=True)


In [34]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_perc_controls_interactive(df_name):
        run_vif_ivs_perc_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_perc_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                    Variable                      VIF  
1                   Gender_Female_% per Sector 73526.21
2                     Gender_Male_% per Sector 73516.07
4                     Age_Younger_% per Sector   211.64
3                       Age_Older_% per Sector   211.20
5                       % Sector per Workforce     1.16
6  Job Description spacy_sentencized_num_words     1.01
CPU times: user 4.58 s, sys: 751 ms, total: 5.33 s
Wall time: 1.49 s


In [35]:
def run_vif_ivs_dummy_and_perc_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = ivs_dummy_and_perc[:] + controls[:2]
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))
    vif.to_csv(f'{table_save_path}vif {df_name} - {ivs_dummy_and_perc} x Controls.csv')
    vif.style.to_latex(f'{table_save_path}vif {df_name} - {ivs_dummy_and_perc} x Controls.tex', hrules=True)


In [36]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_dummy_and_perc_controls_interactive(df_name):
        run_vif_ivs_dummy_and_perc_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_dummy_and_perc_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                     Variable                           VIF      
8                                   Age_Younger 57370695890070.02
6                                     Age_Older 56294995342131.20
3                                   Gender_Male   243286585493.91
1                                 Gender_Female    97777866181.15
7                                     Age_Mixed      102358782.55
2                                  Gender_Mixed       34350627.09
5                      Gender_Male_% per Sector         147271.28
4                    Gender_Female_% per Sector         129758.88
9                        Age_Older_% per Sector            256.79
10                     Age_Younger_% per Sector            237.06
11                       % Sector per Workforce              1.92
12  Job Description spacy_sentencized_num_words              1.01
CPU times: user 20.

In [37]:
def run_vif_ivs_dummy_perc_and_perc_interactions_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = ivs_dummy_perc_and_perc_interactions[:] + controls[:2]
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))
    # vif.to_csv(f'{table_save_path}vif {df_name} - {ivs_dummy_perc_and_perc_interactions} x Controls.csv')
    # vif.style.to_latex(f'{table_save_path}vif {df_name} - {ivs_dummy_perc_and_perc_interactions} x Controls.tex', hrules=True)


In [38]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_dummy_perc_and_perc_interactions_controls_interactive(df_name):
        run_vif_ivs_dummy_perc_and_perc_interactions_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_dummy_perc_and_perc_interactions_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                     Variable                          VIF     
1                                 Gender_Female             inf
2                                  Gender_Mixed             inf
3                                   Gender_Male             inf
6                                     Age_Older             inf
7                                     Age_Mixed             inf
8                                   Age_Younger             inf
5                      Gender_Male_% per Sector 414276481222.56
4                    Gender_Female_% per Sector 412984835155.48
14        Interaction_Male_Younger_% per Sector 158761928556.79
12      Interaction_Female_Younger_% per Sector 140915834958.95
13          Interaction_Male_Older_% per Sector 138104864378.12
9                        Age_Older_% per Sector 114694637278.32
10                     Age_Younger_% per Sect

In [39]:
def run_vif_ivs_dummy_and_perc_controls_no_mixed(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = ivs_dummy_and_perc[:] + controls[:2]
    considered_features.remove('Age_Mixed')
    considered_features.remove('Gender_Mixed')
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))
    vif.to_csv(f'{table_save_path}vif {df_name} - MIXED REMOVED {ivs_dummy_and_perc} x Controls.csv')
    vif.style.to_latex(f'{table_save_path}vif {df_name} - MIXED REMOVED {ivs_dummy_and_perc} x Controls.tex', hrules=True)


In [40]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_dummy_and_perc_controls_no_mixed_interactive(df_name):
        run_vif_ivs_dummy_and_perc_controls_no_mixed(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_dummy_and_perc_controls_no_mixed(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                     Variable                       VIF  
3                    Gender_Female_% per Sector 151518.58
4                      Gender_Male_% per Sector 149776.06
7                        Age_Older_% per Sector    256.86
8                      Age_Younger_% per Sector    237.21
1                                 Gender_Female     10.84
2                                   Gender_Male      7.13
5                                     Age_Older      4.36
6                                   Age_Younger      2.86
9                        % Sector per Workforce      1.92
10  Job Description spacy_sentencized_num_words      1.01
CPU times: user 13.5 s, sys: 990 ms, total: 14.5 s
Wall time: 4.29 s


In [41]:
def run_vif_ivs_dummy_perc_and_perc_interactions_controls_no_mixed(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = ivs_dummy_perc_and_perc_interactions[:] + controls[:2]
    considered_features.remove('Age_Mixed')
    considered_features.remove('Gender_Mixed')
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))
    # vif.to_csv(f'{table_save_path}vif {df_name} - MIXED REMOVED {ivs_dummy_perc_and_perc_interactions} x Controls.csv')
    # vif.style.to_latex(f'{table_save_path}vif {df_name} - MIXED REMOVED {ivs_dummy_perc_and_perc_interactions} x Controls.tex', hrules=True)


In [42]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_dummy_perc_and_perc_interactions_controls_no_mixed_interactive(df_name):
        run_vif_ivs_dummy_perc_and_perc_interactions_controls_no_mixed(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_dummy_perc_and_perc_interactions_controls_no_mixed(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                     Variable                          VIF     
4                      Gender_Male_% per Sector 414276481222.56
3                    Gender_Female_% per Sector 412984835155.48
12        Interaction_Male_Younger_% per Sector 158761928556.79
10      Interaction_Female_Younger_% per Sector 140915834958.95
11          Interaction_Male_Older_% per Sector 138104864378.12
7                        Age_Older_% per Sector 114694637278.32
8                      Age_Younger_% per Sector 108576723540.40
9         Interaction_Female_Older_% per Sector 106300960130.54
1                                 Gender_Female           70.48
2                                   Gender_Male           38.81
5                                     Age_Older            7.79
6                                   Age_Younger            7.73
13                       % Sector per Workfor

In [43]:
def run_vif_ivs_num_and_perc_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = ivs_num_and_perc[:] + controls[:2]
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))
    vif.to_csv(f'{table_save_path}vif {df_name} - {ivs_num_and_perc} x Controls.csv')
    vif.style.to_latex(f'{table_save_path}vif {df_name} - {ivs_num_and_perc} x Controls.tex', hrules=True)


In [44]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_ivs_num_and_perc_controls_interactive(df_name):
        run_vif_ivs_num_and_perc_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_ivs_num_and_perc_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                    Variable                      VIF  
2                   Gender_Female_% per Sector 92579.70
3                     Gender_Male_% per Sector 91684.09
5                       Age_Older_% per Sector   233.75
6                     Age_Younger_% per Sector   219.00
1                                   Gender_Num    15.21
4                                      Age_Num     5.54
7                       % Sector per Workforce     1.30
8  Job Description spacy_sentencized_num_words     1.01
CPU times: user 7.83 s, sys: 577 ms, total: 8.41 s
Wall time: 2.88 s


In [45]:
def run_vif_dvs(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = dvs

    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))


In [46]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_dvs_interactive(df_name):
        run_vif_dvs(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_dvs(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    Variable   VIF
2  Competence 1.00
1      Warmth 1.00
CPU times: user 611 ms, sys: 83.2 ms, total: 694 ms
Wall time: 260 ms


In [47]:
def run_vif_dvs_prob(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    if df_name == 'df_manual':
        considered_features = dvs_prob_predicted
    elif df_name == 'df_jobs':
        considered_features = dvs_prob

    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))


In [48]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_dvs_prob_interactive(df_name):
        run_vif_dvs_prob(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_dvs_prob(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
          Variable         VIF
1      Warmth_Probability 1.01
2  Competence_Probability 1.01
CPU times: user 1.02 s, sys: 115 ms, total: 1.14 s
Wall time: 351 ms


In [49]:
def run_vif_dvs_binary_and_prob(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    if df_name == 'df_manual':
        considered_features = dvs + dvs_prob_predicted
    elif df_name == 'df_jobs':
        considered_features = dvs + dvs_prob

    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))


In [50]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_dvs_binary_and_prob_interactive(df_name):
        run_vif_dvs_binary_and_prob(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_dvs_binary_and_prob(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
          Variable         VIF
4  Competence_Probability 9.46
2              Competence 9.42
3      Warmth_Probability 4.28
1                  Warmth 4.24
CPU times: user 2.12 s, sys: 234 ms, total: 2.35 s
Wall time: 702 ms


In [51]:
def run_vif_dvs_actual_and_predicted(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    df = df.dropna(subset=dvs_predicted).reset_index(drop=True)
    considered_features = dvs + dvs_predicted

    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))


In [52]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_dvs_actual_and_predicted_interactive(df_name):
        run_vif_dvs_actual_and_predicted(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_dvs_actual_and_predicted(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
         Variable        VIF
2            Competence 3.20
4  Competence_predicted 3.20
1                Warmth 2.31
3      Warmth_predicted 2.30
CPU times: user 596 ms, sys: 126 ms, total: 723 ms
Wall time: 204 ms


In [53]:
def run_vif_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    considered_features = controls[:2]
    vif = compute_vif(df, considered_features)
    print(vif.sort_values('VIF', ascending=False))
    vif.to_csv(f'{table_save_path}vif {df_name} - Controls.csv')
    vif.style.to_latex(f'{table_save_path}vif {df_name} - Controls.tex', hrules=True)


In [54]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_vif_controls_interactive(df_name):
        run_vif_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_vif_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
                    Variable                    VIF
2  Job Description spacy_sentencized_num_words 1.00
1                       % Sector per Workforce 1.00
CPU times: user 897 ms, sys: 282 ms, total: 1.18 s
Wall time: 381 ms


# Double LASSO Regression

### Double LASSO Regression for Controls x DVs

In [55]:
def run_double_lasso_dvs_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    endog_names = dvs
    exog_names = controls[:2]

    endog = df[endog_names]
    exog = df[exog_names]
    constant = sm.add_constant(exog)

    lasso = Lasso(alpha=0.1)
    lasso.fit(constant, endog)
    print('Estimate coefficients for Lasso')
    for i, name in enumerate(exog_names):
        print(f'{name}: {lasso.coef_[i]}')


In [56]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_double_lasso_dvs_controls_interactive(df_name):
        run_double_lasso_dvs_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_double_lasso_dvs_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Estimate coefficients for Lasso
% Sector per Workforce: [0.        0.        0.0067655]
Job Description spacy_sentencized_num_words: [0.         0.         0.00856056]
CPU times: user 268 ms, sys: 156 ms, total: 424 ms
Wall time: 72.9 ms


### Double LASSO Regression for Controls x IVs Percentage

In [57]:
def run_double_lasso_ivs_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    for iv in ivs_perc:
        print(f'{"-"*20} {iv} {"-"*20}')
        endog_names = iv
        exog_names = controls[:2]

        endog = df[endog_names]
        exog = df[exog_names]
        constant = sm.add_constant(exog)

        lasso = Lasso(alpha=0.1)
        lasso.fit(constant, endog)
        print('Estimate coefficients for Lasso')
        for i, name in enumerate(exog_names):
            print(f'{name}: {lasso.coef_[i]}')


In [58]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_double_lasso_ivs_controls_interactive(df_name):
        run_double_lasso_ivs_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_double_lasso_ivs_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-------------------- Gender_Female_% per Sector --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: 0.1840724058744032
-------------------- Gender_Male_% per Sector --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: -0.1821895224406096
-------------------- Age_Older_% per Sector --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: -0.1576793783711465
-------------------- Age_Younger_% per Sector --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: 0.1612556701870287
CPU times: user 3.16 s, sys: 227 ms, total: 3.38 s
Wall time: 1.02 s


### Double LASSO Regression for Controls x IVs Dummy

In [59]:
def run_double_lasso_ivs_dummy_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    for iv in ivs_dummy:
        print(f'{"-"*20} {iv} {"-"*20}')
        endog_names = iv
        exog_names = controls[:2]

        endog = df[endog_names]
        exog = df[exog_names]
        constant = sm.add_constant(exog)

        lasso = Lasso(alpha=0.1)
        lasso.fit(constant, endog)
        print('Estimate coefficients for Lasso')
        for i, name in enumerate(exog_names):
            print(f'{name}: {lasso.coef_[i]}')


In [60]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_double_lasso_ivs_dummy_controls_interactive(df_name):
        run_double_lasso_ivs_dummy_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_double_lasso_ivs_dummy_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-------------------- Gender_Female --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: -0.0007463151787829066
-------------------- Gender_Mixed --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: 0.017265140140321114
-------------------- Gender_Male --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: -0.015213069081486858
-------------------- Age_Older --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: -0.007465558084382717
-------------------- Age_Mixed --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_wo

### Double LASSO Regression for Controls x IVs Dummy and Percentage

In [61]:
def run_double_lasso_ivs_dummy_and_perc_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    for iv in ivs_dummy_and_perc:
        print(f'{"-"*20} {iv} {"-"*20}')
        endog_names = iv
        exog_names = controls[:2]

        endog = df[endog_names]
        exog = df[exog_names]
        constant = sm.add_constant(exog)

        lasso = Lasso(alpha=0.1)
        lasso.fit(constant, endog)
        print('Estimate coefficients for Lasso')
        for i, name in enumerate(exog_names):
            print(f'{name}: {lasso.coef_[i]}')


In [62]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_double_lasso_ivs_dummy_and_perc_controls_interactive(df_name):
        run_double_lasso_ivs_dummy_and_perc_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_double_lasso_ivs_dummy_and_perc_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-------------------- Gender_Female --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: -0.0007463151787829066
-------------------- Gender_Mixed --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: 0.017265140140321114
-------------------- Gender_Male --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: -0.015213069081486858
-------------------- Gender_Female_% per Sector --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: 0.1840724058744032
-------------------- Gender_Male_% per Sector --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Descrip

### Double LASSO Regression for Controls x IVs Dummy, Percentage, and Interactions

In [63]:
def run_double_lasso_ivs_dummy_perc_and_perc_interactions_controls(df_name, df):
    print('\n')
    print('+'*120)
    print(f'{"="*50} RESULTS FOR {df_name} {"="*50}')

    for iv in ivs_dummy_perc_and_perc_interactions:
        print(f'{"-"*20} {iv} {"-"*20}')
        endog_names = iv
        exog_names = controls[:2]

        endog = df[endog_names]
        exog = df[exog_names]
        constant = sm.add_constant(exog)

        lasso = Lasso(alpha=0.1)
        lasso.fit(constant, endog)
        print('Estimate coefficients for Lasso')
        for i, name in enumerate(exog_names):
            print(f'{name}: {lasso.coef_[i]}')


In [64]:
%%time
if len(dataframes) > 1:
    @interact(df_name=dataframes.keys())
    def run_double_lasso_ivs_dummy_perc_and_perc_interactions_controls_interactive(df_name):
        run_double_lasso_ivs_dummy_perc_and_perc_interactions_controls(df_name=df_name, df=dataframes[df_name])
else:
    run_double_lasso_ivs_dummy_perc_and_perc_interactions_controls(df_name=list(dataframes.keys())[0], df=list(dataframes.values())[0])




++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-------------------- Gender_Female --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: -0.0007463151787829066
-------------------- Gender_Mixed --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: 0.017265140140321114
-------------------- Gender_Male --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: -0.015213069081486858
-------------------- Gender_Female_% per Sector --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Description spacy_sentencized_num_words: 0.1840724058744032
-------------------- Gender_Male_% per Sector --------------------
Estimate coefficients for Lasso
% Sector per Workforce: 0.0
Job Descrip