In [1]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

if code_dir_name not in str(Path.cwd()).split('/')[-1]:
    for _ in range(5):

        parent_path = str(Path.cwd().parents[_]).split('/')[-1]

        if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):

            code_dir = str(Path.cwd().parents[_])

            if code_dir is not None:
                break
else:
    code_dir = str(Path.cwd())
sys.path.append(code_dir)

# %load_ext autoreload
# %autoreload 2


In [2]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module.estimators_get_pipe import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module.forestIV import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


Using MPS


0it [00:00, ?it/s]

Using MPS


<Figure size 640x480 with 0 Axes>

### Set variables

In [3]:
# Variables
method = 'Supervised'
classifiers_type = 'all'
if classifiers_type == 'nonlinear':
    classifiers_pipe = classifiers_pipe_nonlinear
elif classifiers_type == 'linear':
    classifiers_pipe = classifiers_pipe_linear
elif classifiers_type == 'ensemble':
    classifiers_pipe = classifiers_pipe_ensemble
elif classifiers_type == 'all':
    classifiers_pipe = classifiers_pipe

results_save_path = f'{models_save_path}{method} Results/'
with open(f'{data_dir}{method}_results_save_path.txt', 'w') as f:
    f.write(results_save_path)
if not os.path.exists(results_save_path):
    os.makedirs(results_save_path)
done_xy_save_path = f'{results_save_path}Search+Xy/'
with open(f'{data_dir}{method}_done_xy_save_path.txt', 'w') as f:
    f.write(done_xy_save_path)
if not os.path.exists(done_xy_save_path):
    os.makedirs(done_xy_save_path)

t = time.time()
n_jobs = -1
n_splits = 10
n_repeats = 3
random_state = 42
refit = True
class_weight = 'balanced'
cv = RepeatedStratifiedKFold(
    n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
)
scoring = 'recall'
scores = [
    'recall', 'accuracy', 'f1', 'roc_auc',
    'explained_variance', 'matthews_corrcoef'
]
scorers = {
    'precision_score': make_scorer(precision_score, zero_division=0),
    'recall_score': make_scorer(recall_score, zero_division=0),
    'accuracy_score': make_scorer(accuracy_score, zero_division=0),
}
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'
metrics_dict = {
    f'{scoring.title()} Best Score': np.nan,
    f'{scoring.title()} Best Threshold': np.nan,
    'Train - Mean Cross Validation Score': np.nan,
    f'Train - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Train - Mean Explained Variance - {scoring.title()}': np.nan,
    'Test - Mean Cross Validation Score': np.nan,
    f'Test - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Test - Mean Explained Variance - {scoring.title()}': np.nan,
    'Explained Variance': np.nan,
    'Accuracy': np.nan,
    'Balanced Accuracy': np.nan,
    'Precision': np.nan,

    'Average Precision': np.nan,
    'Recall': np.nan,
    'F1-score': np.nan,
    'Matthews Correlation Coefficient': np.nan,
    'Brier Score': np.nan,
    'Fowlkes–Mallows Index': np.nan,
    'R2 Score': np.nan,
    'ROC': np.nan,
    'AUC': np.nan,
    'Log Loss/Cross Entropy': np.nan,
    'Cohen’s Kappa': np.nan,
    'Geometric Mean': np.nan,
    'Classification Report': np.nan,
    'Imbalanced Classification Report': np.nan,
    'Confusion Matrix': np.nan,
    'Normalized Confusion Matrix': np.nan,
}

# Transformer variables
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()
device = torch.device('mps') if torch.has_mps and torch.backends.mps.is_built() and torch.backends.mps.is_available(
) else torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')
# Set random seed
random_state = 42
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
cores = multiprocessing.cpu_count()


Using MPS


### Functions

In [4]:
def prob_confirmatory_tests(y_pred, y_pred_prob):

    # Confirmatory Regression
    print('+'*20)
    print('Confirmatory Tests validating the linear relationship between y_pred and y_pred_prob')
    print('-'*20)
    print('T-Test y_pred_prob ~ y_pred:')
    levene = scipy.stats.levene(y_pred_prob, y_pred)
    equal_var_levene = levene.pvalue < 0.05
    print(scipy.stats.ttest_ind(y_pred_prob, y_pred, equal_var=equal_var_levene))

    print('\n')
    print('-'*20)
    print('Logit y_pred ~ y_pred_prob:')
    try:
        logit_model = sm.Logit(endog=y_pred, exog=y_pred_prob)
        logit_results = logit_model.fit()
        std_coef = logit_results.params[0] / np.std(y_pred_prob)
        std_err = logit_results.bse[0]
        log_likelihood = logit_results.llf
        print(logit_results.summary())
        print('-'*20)
        print(f'Std Coef: {std_coef}')
        print(f'Std Err: {std_err}')
        print(f'Log Likelihood: {log_likelihood}')
    except Exception as e:
        print(type(e).__name__)

    print('-'*20)
    print('\n')
    print('-'*20)
    print('OLS y_pred_prob ~ y_pred:')
    try:
        ols_model = sm.OLS(endog=y_pred_prob, exog=y_pred)
        ols_results = ols_model.fit()
        std_coef = ols_results.params[0] / np.std(y_pred)
        std_err = ols_results.bse[0]
        print(ols_results.summary())
        print('-'*20)
        print(f'Std Coef: {std_coef}')
        print(f'Std Err: {std_err}')
    except Exception as e:
        print(type(e).__name__)

    print('-'*20)
    print('+'*20)
    print('\n')


In [5]:
def save_df_full_summary_excel(
    df_full_summary,
    title,
    text_to_add_list,
    file_save_path,
    sheet_name=None,
    startrow=None,
    startcol=None,
):
    if sheet_name is None:
        sheet_name = 'All'
    if startrow is None:
        startrow = 1
    if startcol is None:
        startcol = 1

    # Define last rows and cols locs
    header_range = 1
    endrow = startrow + header_range + df_full_summary.shape[0]
    endcol = startcol + df_full_summary.shape[1]

    # Remove NAs
    df_full_summary = df_full_summary.fillna('')

    # Write
    writer = pd.ExcelWriter(f'{file_save_path}.xlsx')
    df_full_summary.to_excel(writer, sheet_name=sheet_name, merge_cells=True, startrow=startrow, startcol=startcol)
    workbook  = writer.book
    worksheet = writer.sheets[sheet_name]
    worksheet.set_column(startrow, 1, None, None, {'hidden': True}) # hide the index column

    # Title
    worksheet.merge_range(1, startcol, 1, endcol, title, workbook.add_format({'italic': True, 'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'align': 'left', 'top': True, 'bottom': True, 'left': False, 'right': False}))

    # Main body
    body_max_row_idx, body_max_col_idx = df_full_summary.shape

    for c, r in tqdm_product(range(body_max_col_idx), range(body_max_row_idx)):
        row_to_write = startrow + header_range + r
        col_to_write = startcol + 1 + c # 1 is for index
        body_formats = {'num_format': '0.00', 'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'align': 'center', 'text_wrap': True, 'left': False, 'right': False}

        if r == 0:
            body_formats |= {'top': True, 'bottom': True, 'left': False, 'right': False}
            worksheet.set_column(col_to_write, col_to_write, 10)

        if r == body_max_row_idx-1:
            body_formats |= {'bottom': True}

        if c == 0:
            body_formats |= {'align': 'left'}
            worksheet.set_column(col_to_write, col_to_write, 15)

        worksheet.write(row_to_write, col_to_write, df_full_summary.iloc[r, c], workbook.add_format(body_formats))

    # Add Note
    note_format = {'italic': True, 'font_name': 'Times New Roman', 'font_size': 10, 'font_color': 'black', 'align': 'left', 'left': False, 'right': False}
    worksheet.merge_range(endrow, startcol, endrow, endcol, 'Note.', workbook.add_format(note_format))
    # Add text
    for i, text in enumerate(text_to_add_list):
        worksheet.merge_range(endrow + 1 + i , startcol, endrow + 1 + i, endcol, text, workbook.add_format(note_format))

    writer.close()


In [6]:
def make_full_report(
    results, dv, analysis_type, model_name, dvs_name, ivs_name, ivs_type, df_name,
    regression_info_dict=None, regressor_order=None, text_to_add_list=None, title=None, model_names=None
):
    '''
    Make a full report for a regression analysis.
    results: statsmodels regression results object or list of results objects
    dv: str, dependent variable name
    '''

    if regression_info_dict is None:
        # Regression info dict
        regression_info_dict = {
            'Model Name': lambda x: f'{x.model.__class__.__name__}',
            'N': lambda x: f'{int(x.nobs):d}',
            'R-squared': lambda x: f'{x.rsquared:.5f}',
            'R-squared Adj.': lambda x: f'{x.rsquared_adj:.5f}',
            'Log-Likelihood': lambda x: f'{x.llf:.5f}',
            'Pseudo R2': lambda x: f'{x.prsquared:.5f}',
            'F': lambda x: f'{x.fvalue:.5f}',
            'F (p-value)': lambda x: f'{x.f_pvalue:.5f}',
            'df_model': lambda x: f'{x.df_model:.0f}',
            'df_total': lambda x: f'{x.df_resid + x.df_model + 1:.0f}',
            'df_resid': lambda x: f'{x.df_resid:.0f}',
            'AIC': lambda x: f'{x.aic:.5f}',
            'BIC': lambda x: f'{x.bic:.5f}',
            'ICC': lambda x: f'{x.rsquared / (x.rsquared + (x.nobs - 1) * x.mse_resid):.5f}',
            'RMSE': lambda x: f'{x.mse_resid ** 0.5:.5f}',
            'RMSE (std)': lambda x: f'{x.mse_resid ** 0.5 / x.model.endog.std():.5f}',
            'Omnibus': lambda x: f'{sms.omni_normtest(x.resid).statistic:.5f}',
            'Omnibus (p-value)': lambda x: f'{sms.omni_normtest(x.resid).pvalue:.5f}',
            'Skew': lambda x: f'{sms.jarque_bera(x.resid)[-2]:.5f}',
            'Kurtosis': lambda x: f'{sms.jarque_bera(x.resid)[-1]:.5f}',
            'Jarque-Bera (JB)': lambda x: f'{sms.jarque_bera(x.resid)[0]:.5f}',
            'Jarque-Bera (p-value)': lambda x: f'{sms.jarque_bera(x.resid)[1]:.5f}',
            'Unstandardized Coefficent B (b)': lambda x: f'{x.params[0]:.5f}',
            'Standard Error (SE)': lambda x: f'{x.bse[0]:.5f}',
            'Standardized Coefficient b* (β)': lambda x: f'{x.params[0] / x.model.endog.std():.5f}',
            't': lambda x: f'{x.tvalues[0]:.5f}',
            't (p-value)': lambda x: f'{x.pvalues[0]:.5f}',
            '95% CI': lambda x: f'{x.conf_int().iloc[0, 1]:.5f} - {x.conf_int().iloc[0, 1]:.5f}',
            # 'Summary': lambda x: f'{x.summary()}',
            # 'F (p-value - FDR)': lambda x: f'{x.f_pvalue_fdr:.5f}',
            # 'F (p-value - Bonferroni)': lambda x: f'{x.f_pvalue_bonf:.5f}',
            # 't (p-value - FDR)': lambda x: f'{x.pvalues_fdr[1]:.5f}',
            # 't (p-value - Bonferroni)': lambda x: f'{x.pvalues_bonf[1]:.5f}',
        }
        if isinstance(results, list):
            results_to_check = results[0]
        else:
            results_to_check = results
        if all('const' in x for x in zip(results_to_check.params.index, results_to_check.bse.index, results_to_check.tvalues.index, results_to_check.pvalues.index)):
            regression_info_dict = regression_info_dict | {
                'Intercept': lambda x: f'{x.params["const"]:.5f}',
                'Intercept (std)': lambda x: f'{x.bse["const"]:.5f}',
                'Intercept t': lambda x: f'{x.tvalues["const"]:.5f}',
                'Intercept t (p-value)': lambda x: f'{x.pvalues["const"]:.5f}',
                'Intercept (95% CI)': lambda x: f'{x.conf_int().loc["const"][0]:.5f} - {x.conf_int().loc["const"][1]:.5f}',
            }
    if model_names is None:
        if isinstance(results, list):
            model_names = [
                f'{results_to_check.model.endog_names.split("_")[0] if "_" in results_to_check.model.endog_names else results_to_check.model.endog_names} Model {i}'
                for i in range(len(results))
            ]
            model_names[0] = model_names[0].replace('Model 0', 'Full Model')
        else:
            model_names = [
                f'{results.model.endog_names.split("_")[0] if "_" in results.model.endog_names else results.model.endog_names}'
            ]

    order_type = 'unordered' if regressor_order is None else 'ordered'
    if text_to_add_list is None:
        text_to_add_list = []
        if regressor_order is not None:
            text_to_add_list.append('Models are ordered by independent variable type.')

        else:
            text_to_add_list.append('Models are ordered by coefficient size, largest to smallest.')

    if title is None:
        title = f'{model_name} {analysis_type}: {dvs_name} x {ivs_name}'

    try:
        # Statsmodels summary_col
        full_summary = summary_col(
            results,
            stars=True,
            info_dict=regression_info_dict,
            regressor_order=regressor_order,
            float_format='%0.3f',
            model_names=model_names,
        )
        if isinstance(results, list) and len(results) > 4:
            full_summary.tables[0][full_summary.tables[0].filter(regex='Full Model').columns[0]].loc['Unstandardized Coefficent B (b)': '95% CI'] = ''

        # Add title and notes
        full_summary.add_title(title)
        text_to_add_list.extend(full_summary.extra_txt)
        for text in text_to_add_list:
            full_summary.add_text(text)
        # Save
        save_name = f'{table_save_path}{model_name} {df_name} - ALL {dv} {order_type} {analysis_type} on {ivs_type}'
        df_full_summary = pd.read_html(full_summary.as_html())[0]
        df_full_summary.to_csv(f'{save_name}.csv')
        df_full_summary.style.to_latex(f'{save_name}.tex', hrules=True)
        save_df_full_summary_excel(df_full_summary, title, text_to_add_list, save_name)

        return full_summary
    except IndexError as e:
        print(f'Making full report for {model_names[0]} due to the following error: {e}')
        return None


In [7]:
def get_standardized_coefficients(results):

    # # Get standardized regression coefficients
    # std = np.asarray(constant.std(0))

    # if 'const' in results.params and 'const' in constant:
    #     std[0] = 1
    # tt = results.t_test(np.diag(std))
    # tt.c_names = results.model.exog_names

    # t-test
    std = results.model.exog.std(0)
    if 'const' in results.params:
        std[0] = 1
    tt = results.t_test(np.diag(std))
    if results.model.__class__.__name__ == 'MixedLM' or 'Group Var' in results.model.exog_names:
        offset = slice(None, -1)
        tt.c_names = results.model.exog_names[offset]
    else:
        offset = slice(None, None)
        tt.c_names = results.model.exog_names

    # Make df with standardized and unstandardized coefficients
    df_std_coef = pd.DataFrame(
        {
            'coef': results.params[offset].apply(lambda x: f'{x:.5f}'),
            'std err': results.bse[offset].apply(lambda x: f'{x:.5f}'),
            'std coef': (results.params[offset] / results.model.exog[offset].std(axis=0)).apply(lambda x: f'{x:.5f}'),
            't': results.tvalues[offset].apply(lambda x: f'{x:.5f}'),
            'P>|t|': results.pvalues[offset].apply(lambda x: f'{x:.5f}'),
            '[0.025': results.conf_int()[0][offset].apply(lambda x: f'{x:.5f}'),
            '0.975]': results.conf_int()[1][offset].apply(lambda x: f'{x:.5f}'),
        }
    )
    # if 'Group Var' in df_std_coef.index:
    #     df_std_coef = df_std_coef.drop('Group Var', axis='index')
    # # Add standardized coefficients and other data from t-test
    # df_std_coef['std coef'] = tt.effect
    # df_std_coef['std err'] = tt.sd
    # df_std_coef['t'] = tt.statistic
    # df_std_coef['P>|t|'] = tt.pvalue
    # df_std_coef['[0.025'] = tt.conf_int()[:, 0]
    # df_std_coef['0.975]'] = tt.conf_int()[:, 1]
    # df_std_coef['var'] = [names[i] for i in range(len(results.model.exog_names))]
    # df_std_coef = df_std_coef.sort_values('std coef', ascending=False)
    df_std_coef = df_std_coef.reset_index().rename(columns={'index': 'var'})
    df_std_coef = df_std_coef.rename(
        columns={
            'var': 'Variable',
            'coef': 'Unstandardized Coefficent B (b)',
            'std err': 'Standard Error',
            'std coef':'Standardized Coefficient b* (β)',
            't': 't-value',
            'P>|t|': 'p-value',
            '[0.025': '95% CI Lower',
            '0.975]': '95% CI Upper'
        }
    )
    # Reorder columns
    df_std_coef = df_std_coef[[
        'Variable',
        'Unstandardized Coefficent B (b)',
        'Standard Error',
        'Standardized Coefficient b* (β)',
        't-value',
        'p-value',
        '95% CI Lower',
        '95% CI Upper'
    ]]

    return tt, df_std_coef


In [8]:
# Function to compare and produce Unbiased and Biased OLS Models
def compare_actual_and_predicted(df, analysis_type, iv_names=None, print_enabled=None):
    if print_enabled is None:
        print_enabled = True
    dv_names_dict = defaultdict(lambda: defaultdict())

    for dv in tqdm.tqdm(dvs):
        if analysis_type == 'pre_classification':
            if iv_names is None:
                iv_names = ivs_dummy_perc_and_perc_interactions + controls[:2]
            dv_names_dict[dv] = {
                'Unbiased': {'dv_names': f'{dv}_actual'},
                'Biased': {'dv_names': f'{dv}_predicted'}
            }
            df = df.loc[
                (~df[dv_names_dict[dv]['Unbiased']['dv_names']].isna())
                & (~df[dv_names_dict[dv]['Biased']['dv_names']].isna())
            ]
            print(f'Processing dataframe of length {len(df)}')

        elif analysis_type == 'post_classification':
            if iv_names is None:
                iv_names = ivs_dummy_perc_and_perc_interactions[0]
            if f'{dv}_aggr_unlabeled_predicted' in df.columns:
                dv_names_dict[dv] = {
                    'Biased': {'dv_names': f'{dv}_aggr_unlabeled_predicted'},
                }
                df = df.loc[
                    (~df[dv_names_dict[dv]['Biased']['dv_names']].isna())
                ]
                print(f'Processing dataframe of length {len(df)}')
            elif f'{dv}_actual' in df.columns:
                dv_names_dict[dv] = {
                    'Unbiased': {'dv_names': f'{dv}_actual'},
                }
                df = df.loc[
                    (~df[dv_names_dict[dv]['Unbiased']['dv_names']].isna())
                ]
                print(f'Processing dataframe of length {len(df)}')

        print(f'Analyzing {dv} {dv_names_dict[dv].keys()} Models')

        for dv_type, dv_names in tqdm.tqdm(dv_names_dict[dv].items()):
            if analysis_type == 'pre_classification':
                endog = df[dv_names['dv_names']]
                exog = df[iv_names]
            elif analysis_type == 'post_classification':
                endog = df[iv_names]
                exog = df[dv_names['dv_names']]

            model = sm.OLS(endog=endog, exog=exog, data=df)
            results = model.fit()
            tt, df_std_coef = get_standardized_coefficients(results)
            title = f'{analysis_type} {dv_type} OLS Regression {dv_names["dv_names"]} x {iv_names[:3]} etc.'
            full_summary = make_full_report(
                results=results, dv=dv, analysis_type=dv_names['dv_names'], model_name=analysis_type, df_name=dv_type,
                dvs_name=dv_names['dv_names'], ivs_name=iv_names[:3], ivs_type=iv_names[:3], title=title
            )

            dv_names_dict[dv][dv_type]['R-squared'] = results.rsquared
            dv_names_dict[dv][dv_type]['Results'] = results

            if print_enabled:
                print('\n')
                print('-'*20)
                print(f'{dv_type.upper()} {dv}\n')
                print('-'*20)
                print('\n')
                print(f'{dv_type.upper()} SUMMARY RESULTS:')
                print(results.summary())
                print(full_summary)
                print('\n')
                print('-'*20)
                print(f'{dv_type.upper()} STANDARDIZED BETA REGRESSION COEFFICIENTS FOR {dv}:\n{df_std_coef}')
                print('\n')
                print('-'*20)

            df_summary_results = pd.DataFrame(csv.reader(results.summary().as_csv().split('\n'), delimiter=','))
            save_name = f'{table_save_path}{title}'
            df_summary_results.to_csv(f'{save_name}.csv')
            df_summary_results.style.to_latex(f'{save_name}.tex', hrules=True)
            df_std_coef.to_csv(f'{save_name} - standardized coefficients.csv')
            df_std_coef.style.to_latex(f'{save_name} - standardized coefficients.tex', hrules=True)

        if dv_names_dict[dv][list(dv_names_dict[dv])[0]]['R-squared'] != dv_names_dict[dv][list(dv_names_dict[dv])[-1]]['R-squared']:
            print('\n')
            print('-'*20)
            print(f'{dv} {list(dv_names_dict[dv])[0]} R-Squared does not equal {list(dv_names_dict[dv])[-1]} R-Squared:')
            print(f'{dv} {list(dv_names_dict[dv])[0]} = {dv_names_dict[dv][list(dv_names_dict[dv])[0]]["R-squared"]:.3f}')
            print(f'{dv} {list(dv_names_dict[dv])[-1]} = {dv_names_dict[dv][list(dv_names_dict[dv])[-1]]["R-squared"]:.3f}')
            print('\n')
            print('-'*20)

    return dict(dv_names_dict)


### READ DATA

In [9]:
with open(f'{data_dir}df_jobs_for_analysis_len.txt', 'r') as f:
    df_jobs_len = int(f.read())

df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
assert len(df_jobs) == df_jobs_len, f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
print(f'Dataframe df_jobs_for_analysis loaded with shape: {df_jobs.shape}')


Dataframe df_jobs_for_analysis loaded with shape: (309144, 79)


In [10]:
df_jobs['Warmth'].equals(df_jobs['Warmth_predicted'])


False

In [11]:
df_jobs['Competence'].equals(df_jobs['Competence_predicted'])


False

In [12]:
prob_confirmatory_tests(
    df_jobs.dropna(subset=dvs_predicted)['Warmth_predicted'],
    df_jobs.dropna(subset=dvs_predicted)['Warmth'],
)


++++++++++++++++++++
Confirmatory Tests validating the linear relationship between y_pred and y_pred_prob
--------------------
T-Test y_pred_prob ~ y_pred:
TtestResult(statistic=2.4676218209820364, pvalue=0.013616102570478466, df=11298.0)


--------------------
Logit y_pred ~ y_pred_prob:
Optimization terminated successfully.
         Current function value: 0.646807
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:       Warmth_predicted   No. Observations:                 5650
Model:                          Logit   Df Residuals:                     5649
Method:                           MLE   Df Model:                            0
Date:                Wed, 15 Nov 2023   Pseudo R-squ.:                 -0.1512
Time:                        02:40:41   Log-Likelihood:                -3654.5
converged:                       True   LL-Null:                       -3174.4
Covariance Type:            nonrobust   LLR p-value:   

In [13]:
prob_confirmatory_tests(
    df_jobs.dropna(subset=dvs_predicted)['Competence_predicted'],
    df_jobs.dropna(subset=dvs_predicted)['Competence'],
)


++++++++++++++++++++
Confirmatory Tests validating the linear relationship between y_pred and y_pred_prob
--------------------
T-Test y_pred_prob ~ y_pred:
TtestResult(statistic=-0.5089219636602621, pvalue=0.6108168622758917, df=11297.996031865623)


--------------------
Logit y_pred ~ y_pred_prob:
Optimization terminated successfully.
         Current function value: 0.506983
         Iterations 6
                            Logit Regression Results                            
Dep. Variable:     Competence_predicted   No. Observations:                 5650
Model:                            Logit   Df Residuals:                     5649
Method:                             MLE   Df Model:                            0
Date:                  Wed, 15 Nov 2023   Pseudo R-squ.:                  0.2669
Time:                          02:40:41   Log-Likelihood:                -2864.5
converged:                         True   LL-Null:                       -3907.1
Covariance Type:              n

## Check biased and unbiased regressions models using human annotated and classifier predicted Warmth and Competence
Source: https://mochenyang.github.io/mochenyangblog/research/2022/01/10/ForestIV.html

### Unbiased and Biased Warmth and CompetenceOLS regression with human annotated actual values as DV and all IVs

In [14]:
dv_names_dict_pre_classification = compare_actual_and_predicted(df_jobs, analysis_type='pre_classification', print_enabled=False)


  0%|          | 0/2 [00:00<?, ?it/s]

Processing dataframe of length 5650
Analyzing Warmth dict_keys(['Unbiased', 'Biased']) Models




  0%|          | 0/126 [00:00<?, ?it/s]



  0%|          | 0/126 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00,  6.99it/s]
 50%|█████     | 1/2 [00:00<00:00,  3.41it/s]



--------------------
Warmth Unbiased R-Squared does not equal Biased R-Squared:
Warmth Unbiased = 0.077
Warmth Biased = 0.066


--------------------
Processing dataframe of length 5650
Analyzing Competence dict_keys(['Unbiased', 'Biased']) Models




  0%|          | 0/126 [00:00<?, ?it/s]



  0%|          | 0/126 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00,  7.23it/s]
100%|██████████| 2/2 [00:00<00:00,  3.18it/s]



--------------------
Competence Unbiased R-Squared does not equal Biased R-Squared:
Competence Unbiased = 0.101
Competence Biased = 0.115


--------------------





In [15]:
dv_names_dict_pre_classification


{'Warmth': {'Unbiased': {'dv_names': 'Warmth_actual',
   'R-squared': 0.07692323512582688,
   'Results': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x107097df0>},
  'Biased': {'dv_names': 'Warmth_predicted',
   'R-squared': 0.06597100861003291,
   'Results': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x29b18e380>}},
 'Competence': {'Unbiased': {'dv_names': 'Competence_actual',
   'R-squared': 0.10100927171509122,
   'Results': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x2a376cd90>},
  'Biased': {'dv_names': 'Competence_predicted',
   'R-squared': 0.11465842823221661,
   'Results': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x29d750d90>}}}

In [16]:
dv_names_dict_pre_classification['Warmth']['Unbiased']['Results'].summary()


0,1,2,3
Dep. Variable:,Warmth_actual,R-squared:,0.077
Model:,OLS,Adj. R-squared:,0.075
Method:,Least Squares,F-statistic:,33.54
Date:,"Wed, 15 Nov 2023",Prob (F-statistic):,1.7e-87
Time:,02:41:47,Log-Likelihood:,-3202.4
No. Observations:,5650,AIC:,6435.0
Df Residuals:,5635,BIC:,6534.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Gender_Female,-2.405e+04,2.37e+04,-1.016,0.310,-7.05e+04,2.24e+04
Gender_Mixed,-2.405e+04,2.37e+04,-1.016,0.310,-7.05e+04,2.24e+04
Gender_Male,-2.405e+04,2.37e+04,-1.016,0.310,-7.05e+04,2.24e+04
Gender_Female_% per Sector,481.6445,473.771,1.017,0.309,-447.130,1410.419
Gender_Male_% per Sector,480.7993,473.524,1.015,0.310,-447.491,1409.089
Age_Older,-2.405e+04,2.37e+04,-1.016,0.310,-7.05e+04,2.24e+04
Age_Mixed,-2.405e+04,2.37e+04,-1.016,0.310,-7.05e+04,2.24e+04
Age_Younger,-2.405e+04,2.37e+04,-1.016,0.310,-7.05e+04,2.24e+04
Age_Older_% per Sector,489.9778,480.877,1.019,0.308,-452.727,1432.682

0,1,2,3
Omnibus:,922.904,Durbin-Watson:,1.417
Prob(Omnibus):,0.0,Jarque-Bera (JB):,901.084
Skew:,0.907,Prob(JB):,2.15e-196
Kurtosis:,2.268,Cond. No.,5.5e+17


In [17]:
dv_names_dict_pre_classification['Warmth']['Biased']['Results'].summary()


0,1,2,3
Dep. Variable:,Warmth_predicted,R-squared:,0.066
Model:,OLS,Adj. R-squared:,0.064
Method:,Least Squares,F-statistic:,28.43
Date:,"Wed, 15 Nov 2023",Prob (F-statistic):,1.8400000000000002e-73
Time:,02:42:06,Log-Likelihood:,-3091.9
No. Observations:,5650,AIC:,6214.0
Df Residuals:,5635,BIC:,6313.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Gender_Female,-2.01e+04,2.32e+04,-0.866,0.387,-6.56e+04,2.54e+04
Gender_Mixed,-2.01e+04,2.32e+04,-0.866,0.387,-6.56e+04,2.54e+04
Gender_Male,-2.01e+04,2.32e+04,-0.866,0.387,-6.56e+04,2.54e+04
Gender_Female_% per Sector,402.9754,464.596,0.867,0.386,-507.812,1313.763
Gender_Male_% per Sector,401.7529,464.354,0.865,0.387,-508.560,1312.066
Age_Older,-2.01e+04,2.32e+04,-0.866,0.387,-6.56e+04,2.54e+04
Age_Mixed,-2.01e+04,2.32e+04,-0.866,0.387,-6.56e+04,2.54e+04
Age_Younger,-2.01e+04,2.32e+04,-0.866,0.387,-6.56e+04,2.54e+04
Age_Older_% per Sector,409.5583,471.564,0.869,0.385,-514.890,1334.006

0,1,2,3
Omnibus:,830.995,Durbin-Watson:,1.553
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1061.267
Skew:,1.027,Prob(JB):,3.54e-231
Kurtosis:,2.46,Cond. No.,5.5e+17


In [18]:
dv_names_dict_pre_classification['Competence']['Unbiased']['Results'].summary()


0,1,2,3
Dep. Variable:,Competence_actual,R-squared:,0.101
Model:,OLS,Adj. R-squared:,0.099
Method:,Least Squares,F-statistic:,45.22
Date:,"Wed, 15 Nov 2023",Prob (F-statistic):,3.84e-119
Time:,02:42:48,Log-Likelihood:,-3787.4
No. Observations:,5650,AIC:,7605.0
Df Residuals:,5635,BIC:,7704.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Gender_Female,-2.746e+04,2.63e+04,-1.045,0.296,-7.89e+04,2.4e+04
Gender_Mixed,-2.746e+04,2.63e+04,-1.046,0.296,-7.89e+04,2.4e+04
Gender_Male,-2.746e+04,2.63e+04,-1.046,0.296,-7.89e+04,2.4e+04
Gender_Female_% per Sector,549.1728,525.455,1.045,0.296,-480.921,1579.266
Gender_Male_% per Sector,549.1992,525.181,1.046,0.296,-480.357,1578.756
Age_Older,-2.746e+04,2.63e+04,-1.046,0.296,-7.89e+04,2.4e+04
Age_Mixed,-2.746e+04,2.63e+04,-1.046,0.296,-7.89e+04,2.4e+04
Age_Younger,-2.746e+04,2.63e+04,-1.046,0.296,-7.89e+04,2.4e+04
Age_Older_% per Sector,562.0209,533.336,1.054,0.292,-483.522,1607.564

0,1,2,3
Omnibus:,28580.445,Durbin-Watson:,1.142
Prob(Omnibus):,0.0,Jarque-Bera (JB):,611.852
Skew:,0.117,Prob(JB):,1.37e-133
Kurtosis:,1.405,Cond. No.,5.5e+17


In [19]:
dv_names_dict_pre_classification['Competence']['Biased']['Results'].summary()


0,1,2,3
Dep. Variable:,Competence_predicted,R-squared:,0.115
Model:,OLS,Adj. R-squared:,0.112
Method:,Least Squares,F-statistic:,52.13
Date:,"Wed, 15 Nov 2023",Prob (F-statistic):,1.56e-137
Time:,02:42:58,Log-Likelihood:,-3747.5
No. Observations:,5650,AIC:,7525.0
Df Residuals:,5635,BIC:,7625.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Gender_Female,1950.6825,2.61e+04,0.075,0.940,-4.92e+04,5.31e+04
Gender_Mixed,1950.4908,2.61e+04,0.075,0.940,-4.92e+04,5.31e+04
Gender_Male,1950.5378,2.61e+04,0.075,0.940,-4.92e+04,5.31e+04
Gender_Female_% per Sector,-38.7221,521.760,-0.074,0.941,-1061.572,984.128
Gender_Male_% per Sector,-39.1037,521.488,-0.075,0.940,-1061.420,983.213
Age_Older,1950.6213,2.61e+04,0.075,0.940,-4.92e+04,5.31e+04
Age_Mixed,1950.5198,2.61e+04,0.075,0.940,-4.92e+04,5.31e+04
Age_Younger,1950.5700,2.61e+04,0.075,0.940,-4.92e+04,5.31e+04
Age_Older_% per Sector,-38.2644,529.585,-0.072,0.942,-1076.455,999.926

0,1,2,3
Omnibus:,38708.802,Durbin-Watson:,1.259
Prob(Omnibus):,0.0,Jarque-Bera (JB):,513.002
Skew:,0.079,Prob(JB):,4.01e-112
Kurtosis:,1.532,Cond. No.,5.5e+17


## Make RandomForestRegressor Classifier


In [None]:
def make_final_indiv_and_aggr_preds(estimator, X):
    pred = estimator.predict(X)
    indiv_pred = [tree.predict(X) for tree in estimator.estimators_]
    aggr_pred = np.mean(indiv_pred, axis=0)

    return pred, indiv_pred, aggr_pred


In [None]:
def get_randomforest_instrumental_variable_estimator(df_jobs, cols_to_compare=None, text_col=None, n_trees=None):

    if cols_to_compare is None:
        cols_to_compare = ['Warmth_actual', 'Warmth_predicted', 'Competence_actual', 'Competence_predicted']
    if text_col is None:
        text_col = 'Job Description spacy_sentencized'
    if n_trees is None:
        n_trees = 100
    cols_dict = defaultdict()
    train_ratio = 0.75
    test_ratio = 0.10
    validation_ratio = 0.15
    test_split = test_size = 1 - train_ratio
    validation_split = test_ratio / (test_ratio + validation_ratio)

    # Make df_jobs_unlabeled
    df_jobs_unlabeled = df_jobs.loc[
        (df_jobs[cols_to_compare].isna()).all(axis='columns')
    ]

    if all(df_jobs_unlabeled.isna().sum()) != 0:
        raise IndexError('Missing data in df_jobs_labeled.')

    df_jobs_unlabeled = df_jobs_unlabeled.dropna(subset=analysis_columns, how='any')
    print(f'Dataframe df_jobs_unlabeled of length: {len(df_jobs_unlabeled)}')

    # Make df_jobs_labeled
    df_jobs_labeled = df_jobs.loc[
        (~df_jobs[cols_to_compare].isna()).all(axis='columns')
    ]

    if all(df_jobs_labeled.isna().sum()) != 0:
        raise IndexError('Missing data in df_jobs_labeled.')

    df_jobs_labeled = df_jobs_labeled.dropna(subset=analysis_columns, how='any')
    print(f'Dataframe df_jobs_labeled of length: {len(df_jobs_labeled)}')

    # Make df labels dict
    df_add_preds_dict = {
        'labeled': df_jobs_labeled,
        'unlabeled': df_jobs_unlabeled
    }

    # Split data
    print('Splitting data...')
    train, test = train_test_split(
        df_jobs_labeled, train_size=1-test_split, test_size=test_split, random_state=random_state
    )
    print(f'Length of train dataset: {len(train)}')
    print(f'Length of test dataset: {len(test)}')
    cols_dict = {
        'train': train, 'test': test,
    }

    for col in tqdm.tqdm(analysis_columns):
        assert col in df_jobs_labeled.columns, f'{col} column not found in df_jobs_labeled'
        print('='*20)
        print(f'Training on {col}...')

        X_train = np.array(list(train[text_col].astype('str').values))
        y_train = column_or_1d(train[col].astype('int64').values.tolist(), warn=True)

        X_test = np.array(list(test[text_col].astype('str').values))
        y_test = column_or_1d(test[col].astype('int64').values.tolist(), warn=True)

        X_labeled = np.array(list(df_jobs_labeled[text_col].astype('str').values))
        y_labeled = column_or_1d(df_jobs_labeled[col].astype('int64').values.tolist(), warn=True)

        X_unlabeled = np.array(list(df_jobs_unlabeled[text_col].astype('str').values))
        y_unlabeled = column_or_1d(df_jobs_unlabeled[col].astype('int64').values.tolist(), warn=True)

        # Vectorize using FeatueUnion
        print(f'Vectorizing using {vectorizers_list[-1].__class__.__name__}...')
        vectorizer = vectorizers_list[-1]
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        X_labeled = vectorizer.transform(X_labeled)
        X_unlabeled = vectorizer.transform(X_unlabeled)

        # Train using RandomForestRegressor
        print('Training using RandomForestRegressor...')
        estimator = RandomForestRegressor(n_estimators=n_trees, random_state=random_state, n_jobs=n_jobs)
        estimator.fit(X_train, y_train)

        # Get predictions
        print('Getting predictions...')
        y_train_pred, indiv_y_train_pred, aggr_y_train_pred = make_final_indiv_and_aggr_preds(estimator, X_train)
        y_test_pred, indiv_y_test_pred, aggr_y_test_pred = make_final_indiv_and_aggr_preds(estimator, X_test)
        y_labeled_pred, indiv_y_labeled_pred, aggr_y_labeled_pred = make_final_indiv_and_aggr_preds(estimator, X_labeled)
        y_unlabeled_pred, indiv_y_unlabeled_pred, aggr_y_unlabeled_pred = make_final_indiv_and_aggr_preds(estimator, X_unlabeled)

        # Make col dict
        cols_dict[col] = {
            'estimator': estimator, 'vectorizer': vectorizer,
            'X_train': X_train, 'y_train': y_train, 'y_train_pred': y_train_pred,
            'indiv_y_train_pred': indiv_y_train_pred, 'aggr_y_train_pred': aggr_y_train_pred,
            'X_test': X_test, 'y_test': y_test, 'y_test_pred': y_test_pred,
            'indiv_y_test_pred': indiv_y_test_pred, 'aggr_y_test_pred': aggr_y_test_pred,
            'X_labeled': X_labeled, 'y_labeled': y_labeled, 'y_labeled_pred': y_labeled_pred,
            'indiv_y_labeled_pred': indiv_y_labeled_pred, 'aggr_y_labeled_pred': aggr_y_labeled_pred,
            'X_unlabeled': X_unlabeled, 'y_unlabeled': y_unlabeled, 'y_unlabeled_pred': y_unlabeled_pred,
            'indiv_y_unlabeled_pred': indiv_y_unlabeled_pred, 'aggr_y_unlabeled_pred': aggr_y_unlabeled_pred,
        }

        # Add columns to df
        for df_lab, df in tqdm.tqdm(df_add_preds_dict.items()):
            df = pd.concat(
                [
                    df.reset_index(drop=True),
                    pd.DataFrame(
                        {
                            f'{col}_{df_lab}_predicted': cols_dict[col][f'y_{df_lab}_pred'],
                            f'{col}_aggr_{df_lab}_predicted': cols_dict[col][f'aggr_y_{df_lab}_pred'],
                        }
                    ).reset_index(drop=True),
                    pd.DataFrame(cols_dict[col][f'indiv_y_{df_lab}_pred']).transpose().add_prefix(f'{col}_tree_').reset_index(drop=True)
                ],
                axis='columns'
            )
            cols_dict[col][f'df_jobs_{df_lab}'] = df

        # Evaluate
        print('Evaluating...')
        score = estimator.score(X_test, y_test)
        mae = mean_absolute_error(y_test, y_test_pred)
        mse = mean_squared_error(y_test, y_test_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_test_pred)

        print('-'*20)
        print(f'Mean Absolute Error: {mae:3f}')
        print(f'Mean Squared Error: {mse:3f}')
        print(f'Root Mean Squared Error: {rmse:3f}')
        print(f'R-squared (R^2) Score: {r2:3f}')
        print('-'*20)

    return n_trees, dict(cols_dict)


In [None]:
n_trees, cols_dict = get_randomforest_instrumental_variable_estimator(df_jobs, n_trees=100)


In [None]:
cols_dict.keys()


In [None]:
list_columns = [c for c in df_jobs.columns if df_jobs[c].apply(lambda x: isinstance(x, list)).any()]
non_list_columns = [c for c in df_jobs.columns if not df_jobs[c].apply(lambda x: isinstance(x, list)).any()]


In [None]:
cols_dict['Warmth']['df_jobs_labeled'].info()


In [None]:
cols_dict['Competence']['df_jobs_labeled'].info()


In [None]:
df_jobs_labeled = cols_dict['Warmth']['df_jobs_labeled']\
    .drop(columns=list_columns)\
    .merge(
        cols_dict['Competence']['df_jobs_labeled'],
        how='outer',
        on=non_list_columns
    ).dropna(axis='columns', how='all')\
        .reset_index(drop=True)


In [None]:
df_jobs_labeled.info()


In [None]:
df_jobs_labeled.head()


In [None]:
cols_dict['Warmth']['df_jobs_unlabeled'].info()


In [None]:
cols_dict['Competence']['df_jobs_unlabeled'].info()


In [None]:
df_jobs_unlabeled = cols_dict['Warmth']['df_jobs_unlabeled']\
    .drop(columns=list_columns)\
        .merge(
            cols_dict['Competence']['df_jobs_unlabeled'],
            how='outer',
            on=non_list_columns
        ).dropna(axis='columns', how='all')\
            .reset_index(drop=True)


In [None]:
df_jobs_unlabeled.head()


In [None]:
df_jobs_unlabeled.info()


In [None]:
train = cols_dict['train']


In [None]:
train.head()


In [None]:
df_jobs_train = train.copy()


In [None]:
df_jobs_train.info()


In [None]:
test = cols_dict['test']


In [None]:
test.head()


In [None]:
df_jobs_test = test\
    .drop(columns=list_columns)\
    .merge(
        df_jobs_labeled,
        how='inner',
        on=non_list_columns
    ).reset_index(drop=True)



In [None]:
df_jobs_test.info()


In [None]:
df_jobs_test.head()


# Make instrumental Variable

### Make unbiased and biased models

In [None]:
df_jobs_for_correction = pd.concat([df_jobs_labeled, df_jobs_unlabeled], axis='index')


In [None]:
df_jobs_for_correction.info()


In [None]:
df_jobs_for_correction.head()


In [None]:
# Biased model
biased_post_classification_dict = defaultdict()
for iv in tqdm.tqdm(ivs_dummy_perc_and_perc_interactions):
    dv_names_dict_unlabeled_post_classification = compare_actual_and_predicted(
        df_jobs_unlabeled, analysis_type='post_classification', iv_names=iv, print_enabled=False
    )
    biased_post_classification_dict[iv] = dv_names_dict_unlabeled_post_classification


In [None]:
# Uniased model
unbiased_post_classification_dict = defaultdict()
for iv in tqdm.tqdm(ivs_dummy_perc_and_perc_interactions):
    dv_names_dict_labeled_post_classification = compare_actual_and_predicted(
        df_jobs_labeled, analysis_type='post_classification', iv_names=iv, print_enabled=False
    )
    unbiased_post_classification_dict[iv] = dv_names_dict_labeled_post_classification


In [None]:
unbiased_post_classification_dict.keys()


In [None]:
# Get forest_iv results
forest_iv_results_dict = defaultdict(lambda: defaultdict())
forest_iv_params = {
    # 'col': dv,
    # 'var': iv,
    # 'model_unbias': model_unbias,
    'data_test': df_jobs_test,
    'data_unlabel': df_jobs_for_correction,
    # 'control': controls[:2],
    'ntree': n_trees,
    'iterative': True
    # 'diagnostic': True,
    # 'family': sm.families.Gaussian(link=sm.families.links.Identity()),
    # 'select_method': 'optimal',
    # 'method': 'Lasso',
}

for dv, iv in tqdm_product(dvs, ivs_dummy_perc_and_perc_interactions):
    print('-'*20)
    print(f'Analyzing {dv} with {iv}...')
    forest_iv_params['col'] = dv
    forest_iv_params['var'] = iv
    forest_iv_params['model_unbias'] = unbiased_post_classification_dict[iv][dv]['Unbiased']['Results']

    forest_iv_results_dict[dv][iv] = defaultdict()

    results_IV, output, results  = forest_iv(**forest_iv_params)

    forest_iv_results_dict[dv][iv]['Results_IV'] = results_IV
    forest_iv_results_dict[dv][iv]['Output'] = output
    forest_iv_results_dict[dv][iv]['Results'] = results

# result = forest_iv(
#     col=dv,
#     data_test=df_jobs_test,
#     data_unlabel=df_jobs_unlabeled,
#     var=iv,
#     control=controls[:2],
#     ntree=n_trees,
#     model_unbias=model_unbias,
#     diagnostic=True,
#     family=sm.families.Gaussian(link=sm.families.links.Identity()),
#     select_method='optimal',
#     method='Lasso',
#     iterative=False
# )


In [None]:
# Calculate the critical value for a chi-squared distribution
H_critical = scipy.stats.chi2.ppf(0.95, df=4)

for dv, iv in tqdm_product(dvs, ivs_dummy_perc_and_perc_interactions):
    # Get unbiased model
    model_unbias = unbiased_post_classification_dict[iv][dv]['Unbiased']['Results']

    # Get the unbiased coefficients
    coef_unbiased = model_unbias.params

    # Get results
    results = forest_iv_results_dict[dv][iv]['Results']

    # Calculate the squared bias for each beta
    results['bias2'] = ((results[[beta for beta in results.columns if 'beta' in beta ]] - coef_unbiased) ** 2).sum(axis=1)

    # Calculate the total variance
    results['variance'] = (results[[se for se in results.columns if 'se' in se]] ** 2).sum(axis=1)

    # Calculate the mean squared error (MSE)
    results['mse'] = results['bias2'] + results['variance']

    # Sort the DataFrame by MSE
    results = results.sort_values(by='mse')

    # Filter rows where Hotelling is less than H_critical and only keep the first row
    filtered_results = results[(results['Hotelling'] < H_critical) & (results.index == results.index[0])]

    # Display the filtered results
    print(filtered_results[[beta for beta in results.columns if 'beta' in beta ]])


In [None]:
biased_post_classification_dict[iv][dv]['Biased']['Results'].summary()


In [None]:
model_unbias.summary()


In [None]:
results


In [None]:
# Get valid instrumental variables from forest_iv_results_dict
instrumental_variables = list(
    {
        instrument
        for dv, iv in forest_iv_results_dict.items()
        for k, v in iv.items()
        for instrument in v['Output']['IVs']
    }
)


In [None]:
instrumental_variables


In [None]:
df_jobs = df_jobs_for_correction.loc[:,
    (~df_jobs_for_correction.columns.str.contains('_tree_'))
    | (df_jobs_for_correction.columns.isin(instrumental_variables))
].reset_index(drop=True)


In [None]:
df_jobs.info()


In [None]:
df_jobs.head()


In [None]:
iv = ivs_perc[0]
col = dvs[0]
for dv in dvs:
    print(dv, iv)
    results_IV = forest_iv_results_dict[dv][iv]['Results_IV']
    print(results_IV.summary())
    corrected_var = results_IV.predict(df_jobs[col])
    print(corrected_var)


In [None]:
# assert len(df_jobs) > 0 and isinstance(df_jobs, pd.DataFrame), f'ERORR: LENGTH OF DF = {len(df_jobs)}'
# df_jobs.to_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')
# df_jobs.to_csv(f'{df_save_dir}df_jobs_for_analysis.csv', index=False)


In [None]:
# print(f'Saving corrected df_jobs length {len(df_jobs)} to txt file.')
# with open(f'{data_dir}df_jobs_for_analysis_len.txt', 'w') as f:
#     f.write(str(len(df_jobs)))


In [None]:
# # Calculate the critical value of Hotelling's T-squared test
# H_critical = chi2.ppf(0.95, df=4)

# # Get the unbiased coefficients
# coef_unbiased = model_unbias.coef

# # Calculate the bias squared, variance, and mean squared error (MSE)
# bias2 = np.sum((coef_unbiased - [beta_1, beta_2, beta_3, beta_4])**2)
# variance = se_1**2 + se_2**2 + se_3**2 + se_4**2
# mse = bias2 + variance

# # Add these columns to the `result` DataFrame
# result = result.assign(
#     bias2=bias2,
#     variance=variance,
#     mse=mse,
# )

# # Sort the DataFrame by MSE and filter to the top row
# result = result.sort_values("mse").iloc[:1]

# # Filter to the rows where Hotelling's T-squared test is less than the critical value
# result = result.query("Hotelling < {}".format(H_critical))

# # Print the results
# print(result)


In [None]:
# HACK
# def compute_embeddings(model, input_ids):
#     outputs = model(input_ids)
#     hidden_states = outputs.hidden_states
#     embeddings = hidden_states[-1]  # Extract embeddings from the last layer
#     return embeddings

# train_data = estimator.get_train_dataloader()
# eval_data = estimator.get_eval_dataloader()

# # Compute embeddings for your train and eval data
# train_embeddings = compute_embeddings(model, next(iter(train_data))[0])
# eval_embeddings = compute_embeddings(model, next(iter(eval_data))[0])

# TODO: get train, test, datasets from transformers save folder, X = np.concatenate((X_test, X_val), axis=0) and y = np.concatenate((y_test, y_val), axis=0) so X_test, y_test will be both of these. Get these to become df_jobs_test and df_jobs_train, then df_jobs_unlabeled will be the same.

# from transformers import BertModel, Trainer

# model = BertModel.from_pretrained("bert-base-uncased")
# trainer = Trainer(model)

# trainer.train()

# # Get the embeddings from the model
# embeddings = model.get_input_embeddings()

# print(embeddings.shape)

# from transformers import BertModel, Trainer

# model = BertModel.from_pretrained("bert-base-uncased")
# trainer = Trainer(model)

# trainer.train()

# # Get the hidden states from the model
# hidden_states = model.get_hidden_states()

# # Get the embeddings from the last layer
# embeddings = hidden_states[-1]

# print(embeddings.shape)
