In [1]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

# Handle to the current module object.
mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Locate the project code directory: the working directory itself when its name
# contains `code_dir_name`, otherwise the nearest of (at most) five ancestors whose
# name contains `code_dir_name` but not `unwanted_subdir_name`.
_cwd = Path.cwd()
if code_dir_name in _cwd.name:
    code_dir = str(_cwd)
else:
    # Bounded by the number of ancestors that actually exist, so a shallow
    # working directory cannot raise IndexError.
    for _ in range(min(5, len(_cwd.parents))):
        parent_path = _cwd.parents[_].name
        if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
            code_dir = str(_cwd.parents[_])
            break

# Only extend sys.path with a real directory, and only once, so that re-running
# the cell neither appends None nor piles up duplicate entries.
if code_dir is None:
    print(f'Could not find a "{code_dir_name}" directory from {_cwd}; sys.path left unchanged.')
elif code_dir not in sys.path:
    sys.path.append(code_dir)

# %load_ext autoreload
# %autoreload 2


In [2]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from estimators_get_pipe import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


Using MPS


0it [00:00, ?it/s]

Using MPS


<Figure size 640x480 with 0 Axes>

### Set variables

In [3]:
# Variables
method = 'Supervised'
classifiers_type = 'all'
# Pick the classifier pipeline for the requested family.
# 'all' keeps `classifiers_pipe` as it is currently bound, so after running this
# cell with another type the pipelines must be re-imported before switching back.
if classifiers_type == 'nonlinear':
    classifiers_pipe = classifiers_pipe_nonlinear
elif classifiers_type == 'linear':
    classifiers_pipe = classifiers_pipe_linear
elif classifiers_type == 'ensemble':
    classifiers_pipe = classifiers_pipe_ensemble
elif classifiers_type != 'all':
    # A misspelled type must not silently fall back to the full pipeline.
    raise ValueError(
        f'Unknown classifiers_type {classifiers_type!r}; '
        "expected one of 'nonlinear', 'linear', 'ensemble', 'all'."
    )

# Directory for this method's results; its location is also recorded in a text
# file under `data_dir`.
results_save_path = f'{models_save_path}{method} Results/'
with open(f'{data_dir}{method}_results_save_path.txt', 'w') as f:
    f.write(results_save_path)
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(results_save_path, exist_ok=True)

# Sub-directory for finished searches and their X/y data, recorded the same way.
done_xy_save_path = f'{results_save_path}Search+Xy/'
with open(f'{data_dir}{method}_done_xy_save_path.txt', 'w') as f:
    f.write(done_xy_save_path)
os.makedirs(done_xy_save_path, exist_ok=True)

# Timestamp taken when this configuration cell runs
t = time.time()
n_jobs = -1
n_splits = 10
n_repeats = 3
random_state = 42
refit = True
class_weight = 'balanced'
# Repeated stratified k-fold: n_splits folds, repeated n_repeats times, seeded
cv = RepeatedStratifiedKFold(
    n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
)
# Primary metric, followed by the full list of scorer names
scoring = 'recall'
scores = [
    'recall', 'accuracy', 'f1', 'roc_auc',
    'explained_variance', 'matthews_corrcoef'
]
scorers = {
    'precision_score': make_scorer(precision_score, zero_division=0),
    'recall_score': make_scorer(recall_score, zero_division=0),
    # accuracy_score takes no `zero_division` argument; forwarding one makes the
    # scorer raise TypeError as soon as it is called.
    'accuracy_score': make_scorer(accuracy_score),
}
# Label columns and the text column used in the analysis
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'
# Metric-name template: every entry starts out as NaN. The first group of keys
# is prefixed or suffixed with the title-cased primary `scoring` metric.
metrics_dict = dict.fromkeys(
    [
        f'{scoring.title()} Best Score',
        f'{scoring.title()} Best Threshold',
        'Train - Mean Cross Validation Score',
        f'Train - Mean Cross Validation - {scoring.title()}',
        f'Train - Mean Explained Variance - {scoring.title()}',
        'Test - Mean Cross Validation Score',
        f'Test - Mean Cross Validation - {scoring.title()}',
        f'Test - Mean Explained Variance - {scoring.title()}',
        'Explained Variance',
        'Accuracy',
        'Balanced Accuracy',
        'Precision',
        'Average Precision',
        'Recall',
        'F1-score',
        'Matthews Correlation Coefficient',
        'Brier Score',
        'Fowlkes–Mallows Index',
        'R2 Score',
        'ROC',
        'AUC',
        'Log Loss/Cross Entropy',
        'Cohen’s Kappa',
        'Geometric Mean',
        'Classification Report',
        'Imbalanced Classification Report',
        'Confusion Matrix',
        'Normalized Confusion Matrix',
    ],
    np.nan,
)

# Transformer variables
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()
# Device preference: MPS backend, then CUDA, then CPU.
# `torch.backends.mps` is queried directly; the `torch.has_mps` attribute is deprecated.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')
# Set random seed for the stdlib, NumPy and PyTorch generators
random_state = 42
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
cores = multiprocessing.cpu_count()


Using MPS


### Functions

In [4]:
def save_df_full_summary_excel(
    df_full_summary,
    title,
    text_to_add_list,
    file_save_path,
    sheet_name=None,
    startrow=None,
    startcol=None,
):
    '''
    Write a summary table to ``{file_save_path}.xlsx`` with a title row, a
    formatted body and a block of notes underneath.

    df_full_summary: DataFrame to write; missing values are written as ''.
    title: str, text of the merged title row.
    text_to_add_list: iterable of str, one merged note row each, below 'Note.'.
    file_save_path: str, output path without the '.xlsx' extension.
    sheet_name: str, worksheet name (default 'All').
    startrow: int, row of the DataFrame header / title row (default 1).
    startcol: int, column of the (hidden) DataFrame index (default 1).

    The worksheet calls used here (set_column, merge_range) are XlsxWriter
    API, so that engine is requested explicitly.
    '''
    if sheet_name is None:
        sheet_name = 'All'
    if startrow is None:
        startrow = 1
    if startcol is None:
        startcol = 1

    # Define last rows and cols locs
    header_range = 1
    body_max_row_idx, body_max_col_idx = df_full_summary.shape
    endrow = startrow + header_range + body_max_row_idx
    endcol = startcol + body_max_col_idx

    # Remove NAs
    df_full_summary = df_full_summary.fillna('')

    # The context manager closes the workbook even if formatting fails part-way.
    with pd.ExcelWriter(f'{file_save_path}.xlsx', engine='xlsxwriter') as writer:
        df_full_summary.to_excel(writer, sheet_name=sheet_name, merge_cells=True, startrow=startrow, startcol=startcol)
        workbook = writer.book
        worksheet = writer.sheets[sheet_name]
        # Hide the index column, which to_excel writes at `startcol`.
        worksheet.set_column(startcol, startcol, None, None, {'hidden': True})

        # Title: merged across the table on the header row (replaces the header cells).
        worksheet.merge_range(startrow, startcol, startrow, endcol, title, workbook.add_format({'italic': True, 'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'align': 'left', 'top': True, 'bottom': True, 'left': False, 'right': False}))

        # Main body: a cell's format depends only on whether it sits in the first
        # row, last row or first column, so each combination is created once.
        format_cache = {}
        for c, r in tqdm_product(range(body_max_col_idx), range(body_max_row_idx)):
            row_to_write = startrow + header_range + r
            col_to_write = startcol + 1 + c # 1 is for index
            is_first_row = r == 0
            is_last_row = r == body_max_row_idx - 1
            is_first_col = c == 0

            if is_first_row:
                # Column width is set once per column; the first column is wider.
                worksheet.set_column(col_to_write, col_to_write, 15 if is_first_col else 10)

            format_key = (is_first_row, is_last_row, is_first_col)
            if format_key not in format_cache:
                body_formats = {'num_format': '0.00', 'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'align': 'center', 'text_wrap': True, 'left': False, 'right': False}
                if is_first_row:
                    body_formats |= {'top': True, 'bottom': True, 'left': False, 'right': False}
                if is_last_row:
                    body_formats |= {'bottom': True}
                if is_first_col:
                    body_formats |= {'align': 'left'}
                format_cache[format_key] = workbook.add_format(body_formats)

            worksheet.write(row_to_write, col_to_write, df_full_summary.iloc[r, c], format_cache[format_key])

        # Add Note
        note_format = workbook.add_format({'italic': True, 'font_name': 'Times New Roman', 'font_size': 10, 'font_color': 'black', 'align': 'left', 'left': False, 'right': False})
        worksheet.merge_range(endrow, startcol, endrow, endcol, 'Note.', note_format)
        # Add text, one merged row per entry
        for i, text in enumerate(text_to_add_list):
            worksheet.merge_range(endrow + 1 + i, startcol, endrow + 1 + i, endcol, text, note_format)


In [5]:
def _default_regression_info_dict():
    '''Return the default ``info_dict`` for ``summary_col``: row label -> formatter taking a results object.'''
    return {
        'Model Name': lambda x: f'{x.model.__class__.__name__}',
        'N': lambda x: f'{int(x.nobs):d}',
        'R-squared': lambda x: f'{x.rsquared:.5f}',
        'R-squared Adj.': lambda x: f'{x.rsquared_adj:.5f}',
        'Log-Likelihood': lambda x: f'{x.llf:.5f}',
        'Pseudo R2': lambda x: f'{x.prsquared:.5f}',
        'F': lambda x: f'{x.fvalue:.5f}',
        'F (p-value)': lambda x: f'{x.f_pvalue:.5f}',
        'df_model': lambda x: f'{x.df_model:.0f}',
        'df_total': lambda x: f'{x.df_resid + x.df_model + 1:.0f}',
        'df_resid': lambda x: f'{x.df_resid:.0f}',
        'AIC': lambda x: f'{x.aic:.5f}',
        'BIC': lambda x: f'{x.bic:.5f}',
        'ICC': lambda x: f'{x.rsquared / (x.rsquared + (x.nobs - 1) * x.mse_resid):.5f}',
        'RMSE': lambda x: f'{x.mse_resid ** 0.5:.5f}',
        'RMSE (std)': lambda x: f'{x.mse_resid ** 0.5 / x.model.endog.std():.5f}',
        'Omnibus': lambda x: f'{sms.omni_normtest(x.resid).statistic:.5f}',
        'Omnibus (p-value)': lambda x: f'{sms.omni_normtest(x.resid).pvalue:.5f}',
        'Skew': lambda x: f'{sms.jarque_bera(x.resid)[-2]:.5f}',
        'Kurtosis': lambda x: f'{sms.jarque_bera(x.resid)[-1]:.5f}',
        'Jarque-Bera (JB)': lambda x: f'{sms.jarque_bera(x.resid)[0]:.5f}',
        'Jarque-Bera (p-value)': lambda x: f'{sms.jarque_bera(x.resid)[1]:.5f}',
        'Intercept': lambda x: f'{x.params["const"]:.5f}',
        'Intercept (std)': lambda x: f'{x.bse["const"]:.5f}',
        'Intercept t': lambda x: f'{x.tvalues["const"]:.5f}',
        'Intercept t (p-value)': lambda x: f'{x.pvalues["const"]:.5f}',
        'Intercept (95% CI)': lambda x: f'{x.conf_int().loc["const"][0]:.5f} - {x.conf_int().loc["const"][1]:.5f}',
        # The rows below report the FIRST parameter by position (.iloc keeps that
        # explicit instead of relying on positional fallback of Series[...]).
        'Unstandardized Coefficent B (b)': lambda x: f'{x.params.iloc[0]:.5f}',
        'Standard Error (SE)': lambda x: f'{x.bse.iloc[0]:.5f}',
        'Standardized Coefficient b* (β)': lambda x: f'{x.params.iloc[0] / x.model.endog.std():.5f}',
        't': lambda x: f'{x.tvalues.iloc[0]:.5f}',
        't (p-value)': lambda x: f'{x.pvalues.iloc[0]:.5f}',
        # Lower bound is column 0 of conf_int(), upper bound is column 1.
        '95% CI': lambda x: f'{x.conf_int().iloc[0, 0]:.5f} - {x.conf_int().iloc[0, 1]:.5f}',
    }


def _default_model_names(results):
    '''Return one column name per results object, prefixed with the dependent-variable name up to its first underscore.'''
    if isinstance(results, list):
        prefix = results[0].model.endog_names.split('_')[0]
        # One name per fitted model; the first one is labelled as the full model.
        return [
            f'{prefix} Full Model' if i == 0 else f'{prefix} Model {i}'
            for i in range(len(results))
        ]
    return [results.model.endog_names.split('_')[0]]


def make_full_report(
    results, dv, dvs_name, dv_type,
    regression_info_dict=None, regressor_order=None, text_to_add_list=None, title=None, model_names=None
):
    '''
    Make a full report for a regression analysis and save it as CSV, LaTeX and
    Excel under ``table_save_path``.

    results: statsmodels regression results object or list of results objects
    dv: str, dependent variable name
    dvs_name: not used by this function; kept so existing calls keep working
    dv_type: str, label used in the default title and in the output file name
    regression_info_dict: dict of row label -> callable(results); defaults to
        the statistics returned by ``_default_regression_info_dict``
    regressor_order: passed through to ``summary_col``
    text_to_add_list: list of str notes; the caller's list is not modified
    title: str, defaults to '{dv_type} OLS Regression {dv}'
    model_names: list of str column names, derived from ``results`` when omitted

    Returns the ``summary_col`` summary object, or None when an IndexError is
    raised while building or saving the report.
    '''
    from io import StringIO

    if regression_info_dict is None:
        regression_info_dict = _default_regression_info_dict()
    if model_names is None:
        model_names = _default_model_names(results)

    if text_to_add_list is None:
        if regressor_order is not None:
            text_to_add_list = ['Models are ordered by independent variable type.']
        else:
            text_to_add_list = ['Models are ordered by coefficient size, largest to smallest.']
    else:
        # Work on a copy so the caller's list is left untouched.
        text_to_add_list = list(text_to_add_list)

    if title is None:
        title = f'{dv_type} OLS Regression {dv}'

    try:
        # Statsmodels summary_col
        full_summary = summary_col(
            results,
            stars=True,
            info_dict=regression_info_dict,
            regressor_order=regressor_order,
            float_format='%0.3f',
            model_names=model_names,
        )
        if isinstance(results, list) and len(results) > 4:
            # Blank the single-coefficient rows in the full-model column. A single
            # .loc call writes into the table itself rather than a temporary copy.
            summary_table = full_summary.tables[0]
            full_model_col = summary_table.filter(regex='Full Model').columns[0]
            summary_table.loc['Unstandardized Coefficent B (b)': '95% CI', full_model_col] = ''

        # Add title and notes. The Excel notes are the caller's notes followed by
        # the notes summary_col generated; only the caller's notes are appended
        # to the summary, so the generated ones are not listed twice there.
        full_summary.add_title(title)
        excel_notes = text_to_add_list + list(full_summary.extra_txt)
        for text in text_to_add_list:
            full_summary.add_text(text)
        # Save
        save_name = f'{table_save_path}{dv_type} OLS Regression {dv}'
        print(f'Saving {save_name}...')
        # read_html is given a file-like object; passing literal HTML is deprecated.
        df_full_summary = pd.read_html(StringIO(full_summary.as_html()))[0]
        df_full_summary.to_csv(f'{save_name}.csv')
        df_full_summary.style.to_latex(f'{save_name}.tex', hrules=True)
        save_df_full_summary_excel(df_full_summary, title, excel_notes, save_name)

        return full_summary
    except IndexError as error:
        # Same None result as before, but the failure is no longer silent.
        print(f'Could not build the full report for {dv_type} {dv}: {error!r}')
        return None


In [6]:
def get_standardized_coefficients(results):
    '''
    Run a t-test on the coefficients scaled by each regressor's standard
    deviation and build a table of formatted coefficient statistics.

    results: fitted statsmodels results object whose params, bse, tvalues and
        pvalues are pandas Series.

    Returns (tt, df_std_coef): the t-test result and a DataFrame with one row
    per parameter; all statistics are strings formatted to 5 decimals.
    '''

    def _fmt(value):
        return f'{value:.5f}'

    # Standard deviation of every design-matrix column over all observations.
    # The first entry is set to 1 when a constant is present: its SD is 0, which
    # would otherwise zero it out in the t-test and divide by zero below.
    std = results.model.exog.std(0)
    if 'const' in results.params:
        std[0] = 1
    tt = results.t_test(np.diag(std))

    # For mixed models the last parameter is dropped from the table.
    # NOTE(review): presumably that entry is the 'Group Var' variance term — confirm.
    if results.model.__class__.__name__ == 'MixedLM' or 'Group Var' in results.model.exog_names:
        offset = slice(None, -1)
    else:
        offset = slice(None, None)
    tt.c_names = results.model.exog_names[offset]

    conf_int = results.conf_int()

    # NOTE(review): 'std coef' divides b by the regressor SD, while the t-test
    # above multiplies by it — confirm which definition is intended.
    df_std_coef = pd.DataFrame(
        {
            'Unstandardized Coefficent B (b)': results.params[offset].apply(_fmt),
            'Standard Error': results.bse[offset].apply(_fmt),
            'Standardized Coefficient b* (β)': (results.params[offset] / std).apply(_fmt),
            't-value': results.tvalues[offset].apply(_fmt),
            'p-value': results.pvalues[offset].apply(_fmt),
            '95% CI Lower': conf_int[0][offset].apply(_fmt),
            '95% CI Upper': conf_int[1][offset].apply(_fmt),
        }
    )
    # Move the parameter names from the index into a 'Variable' column.
    df_std_coef = df_std_coef.rename_axis('Variable').reset_index()
    # Reorder columns
    df_std_coef = df_std_coef[[
        'Variable',
        'Unstandardized Coefficent B (b)',
        'Standard Error',
        'Standardized Coefficient b* (β)',
        't-value',
        'p-value',
        '95% CI Lower',
        '95% CI Upper'
    ]]

    return tt, df_std_coef


### READ DATA

In [7]:
# Expected number of rows, stored as plain text under data_dir
df_jobs_len = int(Path(f'{data_dir}df_jobs_for_analysis_len.txt').read_text())

# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')

# Stop if the loaded frame does not have the recorded length
assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
print(f'Dataframe df_jobs_for_analysis loaded with shape: {df_jobs.shape}')


Dataframe df_jobs_for_analysis loaded with shape: (309438, 79)


In [8]:
# Check whether the 'Warmth' column is identical to the 'Warmth_predicted' column.
df_jobs['Warmth'].equals(df_jobs['Warmth_predicted'])


False

In [9]:
# Check whether the 'Competence' column is identical to the 'Competence_predicted' column.
df_jobs['Competence'].equals(df_jobs['Competence_predicted'])


False

## Check biased and unbiased regression models using human-annotated and classifier-predicted Warmth and Competence
Source: https://mochenyang.github.io/mochenyangblog/research/2022/01/10/ForestIV.html

### Unbiased and Biased Warmth and Competence OLS regression with human annotated actual values as DV and all IVs

In [10]:
def compare_actual_and_predicted(df, endog_names_dict=None):
    '''
    For every dependent variable in ``dvs``, fit one OLS model on the
    ``{dv}_actual`` column ('Unbiased') and one on the ``{dv}_predicted`` column
    ('Biased') with the same regressors, then print and save the results.

    df: DataFrame holding the regressors and the actual/predicted columns.
    endog_names_dict: optional mapping to fill; a new one is created when omitted.

    Returns a plain dict: dv -> dv_type -> {'endog_names', 'R-squared', 'Results'}.
    '''
    if endog_names_dict is None:
        endog_names_dict = defaultdict(lambda: defaultdict())
    exog_names = ivs_dummy_perc_and_perc_interactions + controls[:2]

    for dv in dvs:
        endog_names_dict[dv] = {
            'Unbiased': {'endog_names': f'{dv}_actual'},
            'Biased': {'endog_names': f'{dv}_predicted'}
        }
        actual_col = endog_names_dict[dv]['Unbiased']['endog_names']
        predicted_col = endog_names_dict[dv]['Biased']['endog_names']

        # Keep only rows that have both labels. A per-DV frame is used so that
        # filtering for one DV does not shrink the data seen by the next one.
        df_dv = df.loc[df[actual_col].notna() & df[predicted_col].notna()]
        print(f'Processing dataframe of length {len(df_dv)}')

        exog = df_dv[exog_names]
        constant = sm.add_constant(exog)

        for dv_type, endog_names in endog_names_dict[dv].items():
            endog = df_dv[endog_names['endog_names']]
            model = sm.OLS(endog=endog, exog=constant)
            results = model.fit()
            _, df_std_coef = get_standardized_coefficients(results)
            full_summary = make_full_report(
                results, dv, dvs_name=dv, dv_type=dv_type, title=f'{dv_type} OLS Regression {dv}'
            )
            endog_names_dict[dv][dv_type]['R-squared'] = results.rsquared
            endog_names_dict[dv][dv_type]['Results'] = results

            print('\n')
            print('-'*20)
            print(f'{dv_type.upper()} {dv}\n')
            print('-'*20)
            print('\n')
            print(f'{dv_type.upper()} SUMMARY RESULTS:')
            print(results.summary())
            print(full_summary)
            print('\n')
            print('-'*20)
            print(f'{dv_type.upper()} STANDARDIZED BETA REGRESSION COEFFICIENTS FOR {dv}:\n{df_std_coef}')
            print('\n')
            print('-'*20)

            # NOTE(review): make_full_report builds the same save_name, so the
            # .csv/.tex written here overwrite the ones it saved — confirm intended.
            save_name = f'{table_save_path}{dv_type} OLS Regression {dv}'
            df_summary_results = pd.DataFrame(csv.reader(results.summary().as_csv().split('\n'), delimiter=','))
            df_summary_results.to_csv(f'{save_name}.csv')
            df_summary_results.style.to_latex(f'{save_name}.tex', hrules=True)
            df_std_coef.to_csv(f'{save_name} - standardized coefficients.csv')
            df_std_coef.style.to_latex(f'{save_name} - standardized coefficients.tex', hrules=True)

        # Report when the first and last model types disagree on R-squared.
        dv_types = list(endog_names_dict[dv])
        first_type, last_type = dv_types[0], dv_types[-1]
        first_r2 = endog_names_dict[dv][first_type]['R-squared']
        last_r2 = endog_names_dict[dv][last_type]['R-squared']
        if first_r2 != last_r2:
            print('\n')
            print('-'*20)
            print(f'{dv} {first_type} R-Squared does not equal {last_type} R-Squared:')
            print(f'{dv} {first_type} = {first_r2:.3f}')
            print(f'{dv} {last_type} = {last_r2:.3f}')
            print('\n')
            print('-'*20)

    return dict(endog_names_dict)


In [11]:
# Fit the actual-vs-predicted OLS models on the loaded job data and keep the per-DV results.
endog_names_dict = compare_actual_and_predicted(df_jobs)


Processing dataframe of length 5944
Saving /Users/nyxinsane/Documents/Work - UvA/Automating Equity/Automating_Equity1/Automating_Equity1_Code/data/output tables/Unbiased OLS Regression Warmth...


  0%|          | 0/140 [00:00<?, ?it/s]



--------------------
UNBIASED Warmth

--------------------


UNBIASED SUMMARY RESULTS:
                            OLS Regression Results                            
Dep. Variable:          Warmth_actual   R-squared:                       0.078
Model:                            OLS   Adj. R-squared:                  0.076
Method:                 Least Squares   F-statistic:                     35.77
Date:                Mon, 30 Oct 2023   Prob (F-statistic):           8.51e-94
Time:                        19:45:14   Log-Likelihood:                -3377.1
No. Observations:                5944   AIC:                             6784.
Df Residuals:                    5929   BIC:                             6885.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
---------

  0%|          | 0/140 [00:00<?, ?it/s]



--------------------
BIASED Warmth

--------------------


BIASED SUMMARY RESULTS:
                            OLS Regression Results                            
Dep. Variable:       Warmth_predicted   R-squared:                       0.094
Model:                            OLS   Adj. R-squared:                  0.092
Method:                 Least Squares   F-statistic:                     43.80
Date:                Mon, 30 Oct 2023   Prob (F-statistic):          1.30e-115
Time:                        19:45:14   Log-Likelihood:                -3543.2
No. Observations:                5944   AIC:                             7116.
Df Residuals:                    5929   BIC:                             7217.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
-------------

  0%|          | 0/140 [00:00<?, ?it/s]



--------------------
UNBIASED Competence

--------------------


UNBIASED SUMMARY RESULTS:
                            OLS Regression Results                            
Dep. Variable:      Competence_actual   R-squared:                       0.102
Model:                            OLS   Adj. R-squared:                  0.100
Method:                 Least Squares   F-statistic:                     48.09
Date:                Mon, 30 Oct 2023   Prob (F-statistic):          3.64e-127
Time:                        19:45:14   Log-Likelihood:                -3980.0
No. Observations:                5944   AIC:                             7990.
Df Residuals:                    5929   BIC:                             8090.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
-----

  0%|          | 0/140 [00:00<?, ?it/s]



--------------------
BIASED Competence

--------------------


BIASED SUMMARY RESULTS:
                             OLS Regression Results                             
Dep. Variable:     Competence_predicted   R-squared:                       0.124
Model:                              OLS   Adj. R-squared:                  0.122
Method:                   Least Squares   F-statistic:                     60.18
Date:                  Mon, 30 Oct 2023   Prob (F-statistic):          3.00e-159
Time:                          19:45:15   Log-Likelihood:                -3917.1
No. Observations:                  5944   AIC:                             7864.
Df Residuals:                      5929   BIC:                             7965.
Df Model:                            14                                         
Covariance Type:              nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025  

In [12]:
# Display the endog column names, R-squared values and results objects collected per DV.
endog_names_dict


{'Warmth': {'Unbiased': {'endog_names': 'Warmth_actual',
   'R-squared': 0.07789376529661218,
   'Results': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2986bddb0>},
  'Biased': {'endog_names': 'Warmth_predicted',
   'R-squared': 0.0937257615180127,
   'Results': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2a0d577c0>}},
 'Competence': {'Unbiased': {'endog_names': 'Competence_actual',
   'R-squared': 0.10197425136976457,
   'Results': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x178768e20>},
  'Biased': {'endog_names': 'Competence_predicted',
   'R-squared': 0.1244277453998145,
   'Results': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1787bd930>}}}

# Make Instrumental Variables

In [13]:
# Text column and the label column to model
text_col = 'Job Description spacy_sentencized'
col = 'Warmth'

# Number of trees in the random forest
n_trees = 100

# Target share of the labeled data in each split
train_ratio = 0.75
test_ratio = 0.10
validation_ratio = 0.15

# Share held out from training (divided into validation and test afterwards)
test_size = 1 - train_ratio
test_split = test_size
# Share of the held-out rows that becomes the test set
validation_split = test_ratio / (test_ratio + validation_ratio)


In [14]:
# Rows where both the actual and the predicted label of `col` are missing;
# every column that still contains a missing value is dropped as well.
df_jobs_unlabeled = (
    df_jobs
    .loc[
        df_jobs[[
            endog_names_dict[col]['Unbiased']['endog_names'],
            endog_names_dict[col]['Biased']['endog_names'],
        ]].isna().all(axis='columns')
    ]
    .dropna(axis='columns')
    .reset_index(drop=True)
)


In [15]:
# Summarise the columns, non-null counts and dtypes of the unlabeled rows.
df_jobs_unlabeled.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303494 entries, 0 to 303493
Data columns (total 73 columns):
 #   Column                                                              Non-Null Count   Dtype   
---  ------                                                              --------------   -----   
 0   Search Keyword                                                      303494 non-null  object  
 1   Platform                                                            303494 non-null  object  
 2   Job ID                                                              303494 non-null  object  
 3   Job Title                                                           303494 non-null  object  
 4   Company Name                                                        303494 non-null  object  
 5   Location                                                            303494 non-null  object  
 6   Dutch Requirement in Job Ad                                         303494 non-null  object 

In [16]:
# Count the missing values per column of the unlabeled rows.
df_jobs_unlabeled.isna().sum()


Search Keyword                                                        0
Platform                                                              0
Job ID                                                                0
Job Title                                                             0
Company Name                                                          0
Location                                                              0
Dutch Requirement in Job Ad                                           0
English Requirement in Job Ad                                         0
Dutch Requirement in Job Ad_No                                        0
Dutch Requirement in Job Ad_Yes                                       0
English Requirement in Job Ad_No                                      0
English Requirement in Job Ad_Yes                                     0
Sector Code                                                           0
Sector                                                          

In [17]:
# Rows where both the actual and the predicted label of `col` are present.
df_jobs_labeled = (
    df_jobs
    .loc[
        df_jobs[[
            endog_names_dict[col]['Unbiased']['endog_names'],
            endog_names_dict[col]['Biased']['endog_names'],
        ]].notna().all(axis='columns')
    ]
    .reset_index(drop=True)
)


In [18]:
# Summarise the columns, non-null counts and dtypes of the labeled rows.
df_jobs_labeled.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5944 entries, 0 to 5943
Data columns (total 79 columns):
 #   Column                                                              Non-Null Count  Dtype   
---  ------                                                              --------------  -----   
 0   Search Keyword                                                      5944 non-null   object  
 1   Platform                                                            5944 non-null   object  
 2   Job ID                                                              5944 non-null   object  
 3   Job Title                                                           5944 non-null   object  
 4   Company Name                                                        5944 non-null   object  
 5   Location                                                            5944 non-null   object  
 6   Dutch Requirement in Job Ad                                         5944 non-null   object  
 7   Englis

In [39]:
# True only if the labeled frame has no missing value in any column.
not df_jobs_labeled.isna().any().any()


True

In [None]:
def get_instrumental_variable_estimator(df_jobs, text_col=None, n_trees=None):
    '''
    Prepare train/validation/test text and label arrays from the labeled rows
    of ``df_jobs``.

    df_jobs: DataFrame with the text column and the actual/predicted label columns.
    text_col: str, text column name (default 'Job Description spacy_sentencized').
    n_trees: int, number of trees (default 100); not used yet.

    Reads the module-level names ``col``, ``endog_names_dict``,
    ``analysis_columns`` and ``random_state``.

    Raises IndexError when the labeled rows contain any missing value.

    TODO: the function has no return statement yet, so the prepared splits are
    discarded when it finishes.
    '''

    if text_col is None:
        text_col = 'Job Description spacy_sentencized'
    if n_trees is None:
        n_trees = 100
    train_ratio = 0.75
    test_ratio = 0.10
    validation_ratio = 0.15
    # Share held out from training, and the share of that hold-out used as test set
    test_split = test_size = 1 - train_ratio
    validation_split = test_ratio / (test_ratio + validation_ratio)

    actual_col = endog_names_dict[col]['Unbiased']['endog_names']
    predicted_col = endog_names_dict[col]['Biased']['endog_names']

    # Make df_jobs_unlabeled: rows with neither label, without columns containing NaN
    df_jobs_unlabeled = df_jobs.loc[
        (df_jobs[actual_col].isna())
        & (df_jobs[predicted_col].isna())
    ].dropna(axis='columns').reset_index(drop=True)

    # Make df_jobs_labeled: rows with both labels
    df_jobs_labeled = df_jobs.loc[
        (~df_jobs[actual_col].isna())
        & (~df_jobs[predicted_col].isna())
    ].reset_index(drop=True)
    # Raise on ANY missing cell; testing all(per-column counts) == 0 would only
    # trigger when every single column contained a NaN.
    if df_jobs_labeled.isna().any().any():
        raise IndexError('Missing data in df_jobs_labeled')

    # Drop rows without analysis labels or whose text has fewer than 5 characters
    df_jobs_labeled = df_jobs_labeled.dropna(subset=analysis_columns, how='any')
    df_jobs_labeled = df_jobs_labeled.loc[df_jobs_labeled[text_col].apply(len) >= 5]
    print(f'DF length: {len(df_jobs_labeled)}')

    # Split data: train vs hold-out, then hold-out into validation and test
    train, test = train_test_split(
        df_jobs_labeled, train_size=1-test_split, test_size=test_split, random_state=random_state
    )
    val, test = train_test_split(
        test, test_size=validation_split, random_state=random_state
    )

    X_train = np.array(list(train[text_col].astype('str').values))
    y_train = column_or_1d(train[col].astype('int64').values.tolist(), warn=True)

    X_test = np.array(list(test[text_col].astype('str').values))
    y_test = column_or_1d(test[col].astype('int64').values.tolist(), warn=True)

    X_val = np.array(list(val[text_col].astype('str').values))
    y_val = column_or_1d(val[col].astype('int64').values.tolist(), warn=True)



In [20]:
# Keep labeled rows that have every analysis label and at least 5 characters of text
df_jobs_labeled = (
    df_jobs_labeled
    .dropna(subset=analysis_columns, how='any')
    .loc[lambda frame: frame[text_col].map(len) >= 5]
)
print(f'DF length: {len(df_jobs_labeled)}')


DF length: 5925


In [21]:
# Hold out `test_split` of the labeled rows, then divide the hold-out into
# validation and test parts (`validation_split` of it becomes the test set).
train, test = train_test_split(
    df_jobs_labeled,
    train_size=1 - test_split,
    test_size=test_split,
    random_state=random_state,
)
val, test = train_test_split(test, test_size=validation_split, random_state=random_state)

# Per split: the text as a string array and the labels as a 1-d integer vector
X_train = np.asarray(train[text_col].astype('str').tolist())
y_train = column_or_1d(train[col].astype('int64').tolist(), warn=True)

X_test = np.asarray(test[text_col].astype('str').tolist())
y_test = column_or_1d(test[col].astype('int64').tolist(), warn=True)

X_val = np.asarray(val[text_col].astype('str').tolist())
y_val = column_or_1d(val[col].astype('int64').tolist(), warn=True)


In [22]:
# Vectorize X_train, X_test and X_val: the vectorizer is fitted on X_train only
# and then reused to transform the other two splits.
# NOTE(review): assumes vectorizers_list[-1] is the TfidfVectorizer — confirm.
# The X_* names are overwritten with their vectorized form, so the split cell
# has to be re-run before this cell can be run again.
vectorizer = vectorizers_list[-1]
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
X_val = vectorizer.transform(X_val)


In [23]:
# Fit a random-forest regressor with n_trees trees on the vectorized training text.
estimator = RandomForestRegressor(n_estimators=n_trees, random_state=random_state, n_jobs=n_jobs)
estimator.fit(X_train, y_train)


In [28]:
# Score of the fitted forest on the test split (matches the r2_score value computed further down).
estimator.score(X_test, y_test)


0.07241026787265581

In [24]:
# Get predictions and probabilities
y_train_pred = estimator.predict(X_train)
y_test_pred = estimator.predict(X_test)
y_val_pred = estimator.predict(X_val)


In [None]:
# classification_report needs discrete class labels, but the forest is a regressor
# and returns continuous scores, so they are thresholded first.
# NOTE(review): the 0.5 cut-off assumes a binary 0/1 target — confirm.
print(metrics.classification_report(y_test, (y_test_pred >= 0.5).astype(int)))


In [34]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Absolute and squared error of the continuous test-split predictions
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)


Mean Absolute Error: 0.34918688419858657
Mean Squared Error: 0.1929571501841758


In [35]:
# Root of the test-split mean squared error computed in the previous cell
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)


Root Mean Squared Error: 0.43926888142022513


In [37]:
from sklearn.metrics import r2_score
# R^2 of the test-split predictions against the true labels
r2 = r2_score(y_test, y_test_pred)
print("R-squared (R^2) Score:", r2)


R-squared (R^2) Score: 0.07241026787265581


In [None]:
# Forest-level and per-tree predictions, plus the mean over the individual trees.
# NOTE(review): the *_unlabel names suggest unlabeled data, but every prediction
# here is made on X_train, and `actual` is taken from the validation labels —
# confirm which inputs are intended.
actual = y_val
pred_unlabel = estimator.predict(X_train)
indiv_pred_unlabel = [tree.predict(X_train) for tree in estimator.estimators_]
aggr_pred_unlabel = np.mean(indiv_pred_unlabel, axis=0)
