In [None]:
import os # type:ignore # isort:skip # fmt:skip # noqa # nopep8
import sys # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from pathlib import Path # type:ignore # isort:skip # fmt:skip # noqa # nopep8

mod = sys.modules[__name__]

code_dir = None
code_dir_name = 'Code'
unwanted_subdir_name = 'Analysis'

# Locate the project's code directory so that project modules can be imported.
# If the current working directory's own name contains `code_dir_name` it is
# used directly; otherwise up to five ancestor directories are inspected and
# the nearest one whose name contains `code_dir_name` (but not
# `unwanted_subdir_name`) is selected.
cwd = Path.cwd()
if code_dir_name in cwd.name:
    code_dir = str(cwd)
else:
    # Slicing a list copy avoids the IndexError that `parents[i]` raises when
    # the working directory has fewer than five ancestors.
    for parent in list(cwd.parents)[:5]:
        parent_path = parent.name
        if (code_dir_name in parent_path) and (unwanted_subdir_name not in parent_path):
            code_dir = str(parent)
            break

if code_dir is None:
    # Never append None to sys.path; report the problem instead.
    print(
        f'Could not locate a "{code_dir_name}" directory at or above {cwd}; sys.path left unchanged.',
        file=sys.stderr,
    )
elif code_dir not in sys.path:
    # Guard against duplicate entries when the cell is re-run.
    sys.path.append(code_dir)

# %load_ext autoreload
# %autoreload 2


In [None]:
from setup_module.imports import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from estimators_get_pipe import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8
from setup_module.forestIV import * # type:ignore # isort:skip # fmt:skip # noqa # nopep8


### Set variables

In [None]:
# Variables
method = 'Supervised'
classifiers_type = 'all'
# Select which classifier pipelines to use. For 'all' (and any unrecognised
# value) the `classifiers_pipe` provided by the setup imports is kept as is.
if classifiers_type == 'nonlinear':
    classifiers_pipe = classifiers_pipe_nonlinear
elif classifiers_type == 'linear':
    classifiers_pipe = classifiers_pipe_linear
elif classifiers_type == 'ensemble':
    classifiers_pipe = classifiers_pipe_ensemble

# Output locations. Each path is also written to a small text file in
# `data_dir`, and the directory itself is created if it does not exist yet.
results_save_path = f'{models_save_path}{method} Results/'
with open(f'{data_dir}{method}_results_save_path.txt', 'w') as f:
    f.write(results_save_path)
os.makedirs(results_save_path, exist_ok=True)
done_xy_save_path = f'{results_save_path}Search+Xy/'
with open(f'{data_dir}{method}_done_xy_save_path.txt', 'w') as f:
    f.write(done_xy_save_path)
os.makedirs(done_xy_save_path, exist_ok=True)

# Cross-validation and scoring configuration
t = time.time()
n_jobs = -1
n_splits = 10
n_repeats = 3
random_state = 42
refit = True
class_weight = 'balanced'
cv = RepeatedStratifiedKFold(
    n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
)
scoring = 'recall'
scores = [
    'recall', 'accuracy', 'f1', 'roc_auc',
    'explained_variance', 'matthews_corrcoef'
]
scorers = {
    'precision_score': make_scorer(precision_score, zero_division=0),
    'recall_score': make_scorer(recall_score, zero_division=0),
    # accuracy_score has no `zero_division` parameter; forwarding it through
    # make_scorer would raise a TypeError as soon as the scorer is called.
    'accuracy_score': make_scorer(accuracy_score),
}
analysis_columns = ['Warmth', 'Competence']
text_col = 'Job Description spacy_sentencized'
# Template of metric names with placeholder (NaN) values
metrics_dict = {
    f'{scoring.title()} Best Score': np.nan,
    f'{scoring.title()} Best Threshold': np.nan,
    'Train - Mean Cross Validation Score': np.nan,
    f'Train - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Train - Mean Explained Variance - {scoring.title()}': np.nan,
    'Test - Mean Cross Validation Score': np.nan,
    f'Test - Mean Cross Validation - {scoring.title()}': np.nan,
    f'Test - Mean Explained Variance - {scoring.title()}': np.nan,
    'Explained Variance': np.nan,
    'Accuracy': np.nan,
    'Balanced Accuracy': np.nan,
    'Precision': np.nan,
    'Average Precision': np.nan,
    'Recall': np.nan,
    'F1-score': np.nan,
    'Matthews Correlation Coefficient': np.nan,
    'Brier Score': np.nan,
    'Fowlkes–Mallows Index': np.nan,
    'R2 Score': np.nan,
    'ROC': np.nan,
    'AUC': np.nan,
    'Log Loss/Cross Entropy': np.nan,
    'Cohen’s Kappa': np.nan,
    'Geometric Mean': np.nan,
    'Classification Report': np.nan,
    'Imbalanced Classification Report': np.nan,
    'Confusion Matrix': np.nan,
    'Normalized Confusion Matrix': np.nan,
}

# Transformer variables
max_length = 512
returned_tensor = 'pt'
cpu_counts = torch.multiprocessing.cpu_count()
# Prefer Apple MPS, then CUDA, then CPU. The deprecated `torch.has_mps` flag is
# not consulted; the `torch.backends.mps` checks cover the same condition.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device_name = str(device.type)
print(f'Using {device_name.upper()}')
# Set random seed (same `random_state` as the cross-validation splitter above)
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
cores = multiprocessing.cpu_count()


### Functions

In [None]:
def save_df_full_summary_excel(
    df_full_summary,
    title,
    text_to_add_list,
    file_save_path,
    sheet_name=None,
    startrow=None,
    startcol=None,
):
    '''
    Write a regression summary table to a formatted Excel workbook.

    df_full_summary: pandas DataFrame holding the summary table
    title: str, written into a merged title row above the table
    text_to_add_list: iterable of str, each written as a merged note row below
        the table (None is treated as no notes)
    file_save_path: str, output path without the '.xlsx' extension
    sheet_name: str, worksheet name (default 'All')
    startrow: int, zero-based row at which the table is written (default 1)
    startcol: int, zero-based column at which the table is written (default 1)
    '''
    if sheet_name is None:
        sheet_name = 'All'
    if startrow is None:
        startrow = 1
    if startcol is None:
        startcol = 1

    # Define last rows and cols locs
    header_range = 1
    endrow = startrow + header_range + df_full_summary.shape[0]
    endcol = startcol + df_full_summary.shape[1]

    # Remove NAs
    df_full_summary = df_full_summary.fillna('')

    # Write. The context manager closes the workbook even if formatting fails.
    # The worksheet calls below (set_column, merge_range, add_format) are
    # XlsxWriter APIs, so that engine is requested explicitly.
    with pd.ExcelWriter(f'{file_save_path}.xlsx', engine='xlsxwriter') as writer:
        df_full_summary.to_excel(writer, sheet_name=sheet_name, merge_cells=True, startrow=startrow, startcol=startcol)
        workbook = writer.book
        worksheet = writer.sheets[sheet_name]
        # Hide the index column, which pandas writes at `startcol`.
        worksheet.set_column(startcol, startcol, None, None, {'hidden': True})

        # Title
        # NOTE(review): the title row is hard-coded to row 1, which coincides
        # with the table header row when startrow is 1 - confirm this is intended.
        worksheet.merge_range(1, startcol, 1, endcol, title, workbook.add_format({'italic': True, 'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'align': 'left', 'top': True, 'bottom': True, 'left': False, 'right': False}))

        # Main body
        body_max_row_idx, body_max_col_idx = df_full_summary.shape

        for c, r in tqdm_product(range(body_max_col_idx), range(body_max_row_idx)):
            row_to_write = startrow + header_range + r
            col_to_write = startcol + 1 + c # 1 is for index
            body_formats = {'num_format': '0.00', 'font_name': 'Times New Roman', 'font_size': 12, 'font_color': 'black', 'align': 'center', 'text_wrap': True, 'left': False, 'right': False}

            # First body row gets top and bottom borders
            if r == 0:
                body_formats |= {'top': True, 'bottom': True, 'left': False, 'right': False}
                worksheet.set_column(col_to_write, col_to_write, 10)

            # Last body row closes the table with a bottom border
            if r == body_max_row_idx-1:
                body_formats |= {'bottom': True}

            # First data column is left-aligned and wider
            if c == 0:
                body_formats |= {'align': 'left'}
                worksheet.set_column(col_to_write, col_to_write, 15)

            worksheet.write(row_to_write, col_to_write, df_full_summary.iloc[r, c], workbook.add_format(body_formats))

        # Add Note
        note_format = {'italic': True, 'font_name': 'Times New Roman', 'font_size': 10, 'font_color': 'black', 'align': 'left', 'left': False, 'right': False}
        worksheet.merge_range(endrow, startcol, endrow, endcol, 'Note.', workbook.add_format(note_format))
        # Add text, one merged row per note directly below the 'Note.' row
        for i, text in enumerate(text_to_add_list or []):
            worksheet.merge_range(endrow + 1 + i , startcol, endrow + 1 + i, endcol, text, workbook.add_format(note_format))


In [None]:
def make_full_report(
    results, dv, dvs_name, dv_type,
    regression_info_dict=None, regressor_order=None, text_to_add_list=None, title=None, model_names=None
):
    '''
    Make a full report for a regression analysis and save it as CSV, LaTeX and Excel.

    results: statsmodels regression results object or list of results objects
    dv: str, dependent variable name (used in the default title)
    dvs_name: not used inside this function
    dv_type: str, label used in the default title
    regression_info_dict: dict mapping row label -> callable(result) -> str,
        forwarded to summary_col as info_dict; a default set is used when None
    regressor_order: forwarded to summary_col
    text_to_add_list: list of str notes added to the summary and the Excel file
    title: str, report title and file stem; defaults to '{dv_type} OLS Regression {dv}'
    model_names: list of str column names; derived from the endog name when None

    Returns the summary_col result, or None when an IndexError is raised while
    building or saving the report.
    '''

    if regression_info_dict is None:
        # Regression info dict
        regression_info_dict = {
            'Model Name': lambda x: f'{x.model.__class__.__name__}',
            'N': lambda x: f'{int(x.nobs):d}',
            'R-squared': lambda x: f'{x.rsquared:.5f}',
            'R-squared Adj.': lambda x: f'{x.rsquared_adj:.5f}',
            'Log-Likelihood': lambda x: f'{x.llf:.5f}',
            'Pseudo R2': lambda x: f'{x.prsquared:.5f}',
            'F': lambda x: f'{x.fvalue:.5f}',
            'F (p-value)': lambda x: f'{x.f_pvalue:.5f}',
            'df_model': lambda x: f'{x.df_model:.0f}',
            'df_total': lambda x: f'{x.df_resid + x.df_model + 1:.0f}',
            'df_resid': lambda x: f'{x.df_resid:.0f}',
            'AIC': lambda x: f'{x.aic:.5f}',
            'BIC': lambda x: f'{x.bic:.5f}',
            'ICC': lambda x: f'{x.rsquared / (x.rsquared + (x.nobs - 1) * x.mse_resid):.5f}',
            'RMSE': lambda x: f'{x.mse_resid ** 0.5:.5f}',
            'RMSE (std)': lambda x: f'{x.mse_resid ** 0.5 / x.model.endog.std():.5f}',
            'Omnibus': lambda x: f'{sms.omni_normtest(x.resid).statistic:.5f}',
            'Omnibus (p-value)': lambda x: f'{sms.omni_normtest(x.resid).pvalue:.5f}',
            'Skew': lambda x: f'{sms.jarque_bera(x.resid)[-2]:.5f}',
            'Kurtosis': lambda x: f'{sms.jarque_bera(x.resid)[-1]:.5f}',
            'Jarque-Bera (JB)': lambda x: f'{sms.jarque_bera(x.resid)[0]:.5f}',
            'Jarque-Bera (p-value)': lambda x: f'{sms.jarque_bera(x.resid)[1]:.5f}',
            'Intercept': lambda x: f'{x.params["const"]:.5f}',
            'Intercept (std)': lambda x: f'{x.bse["const"]:.5f}',
            'Intercept t': lambda x: f'{x.tvalues["const"]:.5f}',
            'Intercept t (p-value)': lambda x: f'{x.pvalues["const"]:.5f}',
            'Intercept (95% CI)': lambda x: f'{x.conf_int().loc["const"][0]:.5f} - {x.conf_int().loc["const"][1]:.5f}',
            'Unstandardized Coefficent B (b)': lambda x: f'{x.params[0]:.5f}',
            'Standard Error (SE)': lambda x: f'{x.bse[0]:.5f}',
            'Standardized Coefficient b* (β)': lambda x: f'{x.params[0] / x.model.endog.std():.5f}',
            't': lambda x: f'{x.tvalues[0]:.5f}',
            't (p-value)': lambda x: f'{x.pvalues[0]:.5f}',
            # Lower bound is column 0 and upper bound is column 1 of conf_int()
            '95% CI': lambda x: f'{x.conf_int().iloc[0, 0]:.5f} - {x.conf_int().iloc[0, 1]:.5f}',
        }
    if model_names is None:
        if isinstance(results, list):
            # One name per fitted model; the base name is the endog name of the
            # first model up to its first underscore.
            first_endog_name = results[0].model.endog_names
            base_name = first_endog_name.split('_')[0] if '_' in first_endog_name else first_endog_name
            model_names = [f'{base_name} Model {i}' for i in range(len(results))]
            model_names[0] = model_names[0].replace('Model 0', 'Full Model')
        else:
            model_names = [
                f'{results.model.endog_names.split("_")[0] if "_" in results.model.endog_names else results.model.endog_names}'
            ]

    if text_to_add_list is None:
        text_to_add_list = []
        if regressor_order is not None:
            text_to_add_list.append('Models are ordered by independent variable type.')

        else:
            text_to_add_list.append('Models are ordered by coefficient size, largest to smallest.')
    else:
        # Work on a copy so the caller's list is not extended in place below.
        text_to_add_list = list(text_to_add_list)

    if title is None:
        title = f'{dv_type} OLS Regression {dv}'

    try:
        # Statsmodels summary_col
        full_summary = summary_col(
            results,
            stars=True,
            info_dict=regression_info_dict,
            regressor_order=regressor_order,
            float_format='%0.3f',
            model_names=model_names,
        )
        if isinstance(results, list) and len(results) > 4:
            # Blank the single-coefficient rows in the 'Full Model' column.
            # A single .loc call avoids chained assignment on a possible copy.
            summary_table = full_summary.tables[0]
            full_model_col = summary_table.filter(regex='Full Model').columns[0]
            summary_table.loc['Unstandardized Coefficent B (b)': '95% CI', full_model_col] = ''

        # Add title and notes
        full_summary.add_title(title)
        text_to_add_list.extend(full_summary.extra_txt)
        for text in text_to_add_list:
            full_summary.add_text(text)
        # Save
        save_name = f'{table_save_path}{title}'
        print(f'Saving {save_name}...')
        # Wrap the HTML in a file-like object; recent pandas versions deprecate
        # passing literal HTML strings to read_html.
        from io import StringIO
        df_full_summary = pd.read_html(StringIO(full_summary.as_html()))[0]
        df_full_summary.to_csv(f'{save_name}.csv')
        df_full_summary.style.to_latex(f'{save_name}.tex', hrules=True)
        save_df_full_summary_excel(df_full_summary, title, text_to_add_list, save_name)

        return full_summary
    except IndexError:
        # NOTE(review): this silently hides IndexErrors raised anywhere above,
        # including while saving; callers only see None.
        return None


In [None]:
def get_standardized_coefficients(results):
    '''
    Build a coefficient table for a fitted statsmodels model.

    results: fitted results object exposing model.exog, model.exog_names,
        params, bse, tvalues, pvalues, conf_int() and t_test()

    Returns (tt, df_std_coef):
        tt: result of results.t_test on a diagonal matrix of the per-column
            standard deviations of exog (the first entry is set to 1 when a
            'const' parameter is present)
        df_std_coef: DataFrame of string-formatted coefficients, standard
            errors, coefficient / exog-std, t-values, p-values and CI bounds

    For MixedLM models (or when 'Group Var' is among the exog names) the last
    parameter is excluded from the table.
    '''
    # Per-column standard deviation of the design matrix, over ALL rows
    exog_std = results.model.exog.std(axis=0)

    # t-test
    tt_std = exog_std.copy()
    if 'const' in results.params:
        tt_std[0] = 1
    tt = results.t_test(np.diag(tt_std))
    if results.model.__class__.__name__ == 'MixedLM' or 'Group Var' in results.model.exog_names:
        offset = slice(None, -1)
        tt.c_names = results.model.exog_names[offset]
    else:
        offset = slice(None, None)
        tt.c_names = results.model.exog_names

    conf_int = results.conf_int()

    def _fmt(value):
        return f'{value:.5f}'

    # Make df with standardized and unstandardized coefficients.
    # `offset` trims parameters only; it must not be applied to the rows of
    # exog, which would drop an observation from the standard deviation.
    # NOTE(review): the 'standardized' column divides b by sd(x), whereas the
    # t-test above scales by sd(x); confirm which definition is intended. A
    # constant column has sd 0, so its entry is formatted as 'inf'.
    df_std_coef = pd.DataFrame(
        {
            'Unstandardized Coefficent B (b)': results.params[offset].apply(_fmt),
            'Standard Error': results.bse[offset].apply(_fmt),
            'Standardized Coefficient b* (β)': (results.params[offset] / exog_std).apply(_fmt),
            't-value': results.tvalues[offset].apply(_fmt),
            'p-value': results.pvalues[offset].apply(_fmt),
            '95% CI Lower': conf_int[0][offset].apply(_fmt),
            '95% CI Upper': conf_int[1][offset].apply(_fmt),
        }
    )
    # Move the parameter names out of the index into a 'Variable' column
    df_std_coef = df_std_coef.reset_index().rename(columns={'index': 'Variable'})

    # Reorder columns
    df_std_coef = df_std_coef[[
        'Variable',
        'Unstandardized Coefficent B (b)',
        'Standard Error',
        'Standardized Coefficient b* (β)',
        't-value',
        'p-value',
        '95% CI Lower',
        '95% CI Upper'
    ]]

    return tt, df_std_coef


### READ DATA

In [None]:
# Expected number of rows, read from the length file stored in `data_dir`.
with open(f'{data_dir}df_jobs_for_analysis_len.txt', 'r') as f:
    df_jobs_len = int(f.read())

# NOTE: unpickling executes arbitrary code; only load pickles from a trusted source.
df_jobs = pd.read_pickle(f'{df_save_dir}df_jobs_for_analysis.pkl')

# Fail early if the loaded frame does not have the expected number of rows.
assert df_jobs_len == len(df_jobs), (
    f'DATAFRAME MISSING DATA! DF SHOULD BE OF LENGTH {df_jobs_len} BUT IS OF LENGTH {len(df_jobs)}'
)
print(f'Dataframe df_jobs_for_analysis loaded with shape: {df_jobs.shape}')


In [None]:
# Check whether the `Warmth` column is identical to `Warmth_predicted`.
df_jobs['Warmth'].equals(df_jobs['Warmth_predicted'])


In [None]:
# Check whether the `Competence` column is identical to `Competence_predicted`.
df_jobs['Competence'].equals(df_jobs['Competence_predicted'])


In [None]:
# Count missing values in the `Warmth` and `Warmth_actual` columns.
df_jobs[['Warmth', 'Warmth_actual']].isna().sum()


## Check biased and unbiased regression models using human-annotated and classifier-predicted Warmth and Competence
Source: https://mochenyang.github.io/mochenyangblog/research/2022/01/10/ForestIV.html

### Unbiased and biased Warmth and Competence OLS regressions with human-annotated actual values as DV and all IVs

In [None]:
def compare_actual_and_predicted(df, endog_type):
    '''
    Fit OLS regressions per dependent variable and report/save the results.

    df: pandas DataFrame containing the regressors and the dependent-variable columns
    endog_type: 'pre_classification' fits an 'Unbiased' model on '{dv}_actual'
        and a 'Biased' model on '{dv}_predicted'; 'post_classification' fits a
        'Biased' model on '{dv}_aggr_unlabeled_predicted' when that column
        exists, otherwise an 'Unbiased' model on '{dv}_actual' when it exists

    For every model the statsmodels summary and the coefficient table are
    printed and written to CSV/LaTeX files under `table_save_path`.

    Returns a dict: dv -> model label -> {'endog_names', 'R-squared', 'Results'}.
    '''
    endog_names_dict = defaultdict(lambda: defaultdict())
    exog_names = ivs_dummy_perc_and_perc_interactions + controls[:2]

    for dv in dvs:
        if endog_type == 'pre_classification':
            endog_names_dict[dv] = {
                'Unbiased': {'endog_names': f'{dv}_actual'},
                'Biased': {'endog_names': f'{dv}_predicted'}
            }
        elif endog_type == 'post_classification':
            if f'{dv}_aggr_unlabeled_predicted' in df.columns:
                endog_names_dict[dv] = {
                    'Biased': {'endog_names': f'{dv}_aggr_unlabeled_predicted'}
                }
            elif f'{dv}_actual' in df.columns:
                endog_names_dict[dv] = {
                    'Unbiased': {'endog_names': f'{dv}_actual'},
                }

        # Nothing to fit for this DV (unknown endog_type or no matching column);
        # skip it instead of failing on the empty dict further down.
        if not endog_names_dict[dv]:
            print(f'No dependent variable columns found for {dv} with endog_type {endog_type}; skipping.')
            continue

        # Keep rows where every dependent variable used for this DV is present.
        # The filter is applied to a per-DV frame so that rows dropped for one
        # DV are not also removed from the data of the DVs processed after it.
        endog_cols = [spec['endog_names'] for spec in endog_names_dict[dv].values()]
        df_dv = df.dropna(subset=endog_cols, how='any')
        print(f'Processing dataframe of length {len(df_dv)}')

        print(f'Analyzing {dv} {endog_names_dict[dv].keys()} Models')

        exog = df_dv[exog_names]
        constant = sm.add_constant(exog)

        for dv_type, endog_names in endog_names_dict[dv].items():
            endog = df_dv[endog_names['endog_names']]
            # `data` is not an argument of the array interface of sm.OLS, so it
            # is no longer passed.
            model = sm.OLS(endog=endog, exog=constant)
            results = model.fit()
            _, df_std_coef = get_standardized_coefficients(results)
            title = f'{endog_type} {dv_type} OLS Regression {dv}'
            full_summary = make_full_report(
                results, dv, dvs_name=dv, dv_type=dv_type, title=title
            )
            endog_names_dict[dv][dv_type]['R-squared'] = results.rsquared
            endog_names_dict[dv][dv_type]['Results'] = results

            print('\n')
            print('-'*20)
            print(f'{dv_type.upper()} {dv}\n')
            print('-'*20)
            print('\n')
            print(f'{dv_type.upper()} SUMMARY RESULTS:')
            print(results.summary())
            print(full_summary)
            print('\n')
            print('-'*20)
            print(f'{dv_type.upper()} STANDARDIZED BETA REGRESSION COEFFICIENTS FOR {dv}:\n{df_std_coef}')
            print('\n')
            print('-'*20)

            # NOTE(review): make_full_report has already written
            # '{table_save_path}{title}.csv' and '.tex'; the two writes below
            # use the same paths and therefore overwrite those files.
            save_name = f'{table_save_path}{title}'
            df_summary_results = pd.DataFrame(csv.reader(results.summary().as_csv().split('\n'), delimiter=','))
            df_summary_results.to_csv(f'{save_name}.csv')
            df_summary_results.style.to_latex(f'{save_name}.tex', hrules=True)
            df_std_coef.to_csv(f'{save_name} - standardized coefficients.csv')
            df_std_coef.style.to_latex(f'{save_name} - standardized coefficients.tex', hrules=True)

        # Report when the first and last fitted models for this DV differ in R-squared
        model_labels = list(endog_names_dict[dv])
        first_label, last_label = model_labels[0], model_labels[-1]
        first_r2 = endog_names_dict[dv][first_label]['R-squared']
        last_r2 = endog_names_dict[dv][last_label]['R-squared']
        if first_r2 != last_r2:
            print('\n')
            print('-'*20)
            print(f'{dv} {first_label} R-Squared does not equal {last_label} R-Squared:')
            print(f'{dv} {first_label} = {first_r2:.3f}')
            print(f'{dv} {last_label} = {last_r2:.3f}')
            print('\n')
            print('-'*20)

    return dict(endog_names_dict)


In [None]:
# Fit the regressions on the full jobs dataframe with the '{dv}_actual' and '{dv}_predicted' columns as dependent variables.
endog_names_dict_pre_classification = compare_actual_and_predicted(df_jobs, endog_type='pre_classification')


In [None]:
# Display the stored endog names, R-squared values and results objects.
endog_names_dict_pre_classification


## Make RandomForestRegressor estimator


In [None]:
def make_final_indiv_and_aggr_preds(estimator, X):
    '''
    Predict with a fitted ensemble and with each of its member estimators.

    estimator: fitted ensemble exposing predict() and estimators_
    X: feature matrix accepted by the predict() methods

    Returns (pred, indiv_pred, aggr_pred): the ensemble prediction, a list with
    one prediction array per member estimator, and the element-wise mean of
    those member predictions.
    '''
    ensemble_prediction = estimator.predict(X)

    member_predictions = []
    for member in estimator.estimators_:
        member_predictions.append(member.predict(X))

    mean_member_prediction = np.mean(member_predictions, axis=0)

    return ensemble_prediction, member_predictions, mean_member_prediction


In [None]:
def get_randomforest_instrumental_variable_estimator(df_jobs, cols_to_compare=None, text_col=None, n_trees=None):
    '''
    Train one RandomForestRegressor per analysis column on the labeled rows and
    collect predictions for the train, test, labeled and unlabeled sets.

    df_jobs: pandas DataFrame with the text column, the analysis columns and
        the columns listed in cols_to_compare
    cols_to_compare: list of column names; rows where all of them are NaN are
        treated as unlabeled, rows where none of them is NaN as labeled
        (default: the Warmth/Competence '_actual' and '_predicted' columns)
    text_col: str, text column to vectorize (default 'Job Description spacy_sentencized')
    n_trees: int, number of trees in each forest (default 100)

    Returns (n_trees, cols_dict). cols_dict holds the 'train' and 'test' frames
    and, for every analysis column, the estimator, vectorizer, feature matrices,
    targets, predictions and the labeled/unlabeled frames with the prediction
    columns appended.
    '''

    if cols_to_compare is None:
        cols_to_compare = ['Warmth_actual', 'Warmth_predicted', 'Competence_actual', 'Competence_predicted']
    if text_col is None:
        text_col = 'Job Description spacy_sentencized'
    if n_trees is None:
        n_trees = 100
    train_ratio = 0.75
    test_split = 1 - train_ratio

    # Make df_jobs_unlabeled
    df_jobs_unlabeled = df_jobs.loc[
        (df_jobs[cols_to_compare].isna()).all(axis='columns')
    ]

    # NOTE(review): this only triggers when EVERY column contains at least one
    # missing value; confirm that `all` (rather than `any`) is intended.
    if all(df_jobs_unlabeled.isna().sum()):
        raise IndexError('Missing data in df_jobs_unlabeled.')

    df_jobs_unlabeled = df_jobs_unlabeled.dropna(subset=analysis_columns, how='any')
    print(f'Dataframe df_jobs_unlabeled of length: {len(df_jobs_unlabeled)}')

    # Make df_jobs_labeled
    df_jobs_labeled = df_jobs.loc[
        (~df_jobs[cols_to_compare].isna()).all(axis='columns')
    ]

    if all(df_jobs_labeled.isna().sum()):
        raise IndexError('Missing data in df_jobs_labeled.')

    df_jobs_labeled = df_jobs_labeled.dropna(subset=analysis_columns, how='any')
    print(f'Dataframe df_jobs_labeled of length: {len(df_jobs_labeled)}')

    # Make df labels dict
    df_add_preds_dict = {
        'labeled': df_jobs_labeled,
        'unlabeled': df_jobs_unlabeled
    }

    # Split data
    print('Splitting data...')
    train, test = train_test_split(
        df_jobs_labeled, train_size=1-test_split, test_size=test_split, random_state=random_state
    )
    print(f'Length of train dataset: {len(train)}')
    print(f'Length of test dataset: {len(test)}')
    cols_dict = {
        'train': train, 'test': test,
    }

    for col in analysis_columns:
        assert col in df_jobs_labeled.columns, f'{col} column not found in df_jobs_labeled'
        print('='*20)
        print(f'Training on {col}...')

        X_train = np.array(list(train[text_col].astype('str').values))
        y_train = column_or_1d(train[col].astype('int64').values.tolist(), warn=True)

        X_test = np.array(list(test[text_col].astype('str').values))
        y_test = column_or_1d(test[col].astype('int64').values.tolist(), warn=True)

        X_labeled = np.array(list(df_jobs_labeled[text_col].astype('str').values))
        y_labeled = column_or_1d(df_jobs_labeled[col].astype('int64').values.tolist(), warn=True)

        X_unlabeled = np.array(list(df_jobs_unlabeled[text_col].astype('str').values))
        y_unlabeled = column_or_1d(df_jobs_unlabeled[col].astype('int64').values.tolist(), warn=True)

        # Vectorize using the last vectorizer in `vectorizers_list`.
        # The same vectorizer object is re-fitted on the training text for every
        # column, so the 'vectorizer' entries of all columns share one object.
        print(f'Vectorizing using {vectorizers_list[-1].__class__.__name__}...')
        vectorizer = vectorizers_list[-1]
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        X_labeled = vectorizer.transform(X_labeled)
        X_unlabeled = vectorizer.transform(X_unlabeled)

        # Train using RandomForestRegressor
        print('Training using RandomForestRegressor...')
        estimator = RandomForestRegressor(n_estimators=n_trees, random_state=random_state, n_jobs=n_jobs)
        estimator.fit(X_train, y_train)

        # Get predictions (the labeled set contains the training rows as well)
        print('Getting predictions...')
        y_train_pred, indiv_y_train_pred, aggr_y_train_pred = make_final_indiv_and_aggr_preds(estimator, X_train)
        y_test_pred, indiv_y_test_pred, aggr_y_test_pred = make_final_indiv_and_aggr_preds(estimator, X_test)
        y_labeled_pred, indiv_y_labeled_pred, aggr_y_labeled_pred = make_final_indiv_and_aggr_preds(estimator, X_labeled)
        y_unlabeled_pred, indiv_y_unlabeled_pred, aggr_y_unlabeled_pred = make_final_indiv_and_aggr_preds(estimator, X_unlabeled)

        # Make col dict
        cols_dict[col] = {
            'estimator': estimator, 'vectorizer': vectorizer,
            'X_train': X_train, 'y_train': y_train, 'y_train_pred': y_train_pred,
            'indiv_y_train_pred': indiv_y_train_pred, 'aggr_y_train_pred': aggr_y_train_pred,
            'X_test': X_test, 'y_test': y_test, 'y_test_pred': y_test_pred,
            'indiv_y_test_pred': indiv_y_test_pred, 'aggr_y_test_pred': aggr_y_test_pred,
            'X_labeled': X_labeled, 'y_labeled': y_labeled, 'y_labeled_pred': y_labeled_pred,
            'indiv_y_labeled_pred': indiv_y_labeled_pred, 'aggr_y_labeled_pred': aggr_y_labeled_pred,
            'X_unlabeled': X_unlabeled, 'y_unlabeled': y_unlabeled, 'y_unlabeled_pred': y_unlabeled_pred,
            'indiv_y_unlabeled_pred': indiv_y_unlabeled_pred, 'aggr_y_unlabeled_pred': aggr_y_unlabeled_pred,
        }

        # Add columns to df: the forest prediction, the mean of the per-tree
        # predictions, and one '{col}_tree_{i}' column per tree
        for df_lab, df in df_add_preds_dict.items():
            df = pd.concat(
                [
                    df.reset_index(drop=True),
                    pd.DataFrame(
                        {
                            f'{col}_{df_lab}_predicted': cols_dict[col][f'y_{df_lab}_pred'],
                            f'{col}_aggr_{df_lab}_predicted': cols_dict[col][f'aggr_y_{df_lab}_pred'],
                        }
                    ).reset_index(drop=True),
                    pd.DataFrame(cols_dict[col][f'indiv_y_{df_lab}_pred']).transpose().add_prefix(f'{col}_tree_').reset_index(drop=True)
                ],
                axis='columns'
            )
            cols_dict[col][f'df_jobs_{df_lab}'] = df

        # Evaluate on the held-out test split
        print('Evaluating...')
        mae = mean_absolute_error(y_test, y_test_pred)
        mse = mean_squared_error(y_test, y_test_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_test_pred)

        # '.3f' prints three decimals ('3f' only set a minimum field width)
        print('-'*20)
        print(f'Mean Absolute Error: {mae:.3f}')
        print(f'Mean Squared Error: {mse:.3f}')
        print(f'Root Mean Squared Error: {rmse:.3f}')
        print(f'R-squared (R^2) Score: {r2:.3f}')
        print('-'*20)

    return n_trees, dict(cols_dict)


In [None]:
# Train the random forests with the default columns, text column and number of trees.
n_trees, cols_dict = get_randomforest_instrumental_variable_estimator(df_jobs)


In [None]:
# List the top-level entries of the returned dictionary.
cols_dict.keys()


In [None]:
# Columns in which at least one cell holds a Python list; these are dropped
# before the merges below and excluded from the merge keys.
list_columns = [c for c in df_jobs.columns if df_jobs[c].apply(lambda x: isinstance(x, list)).any()]
# Every remaining column; derived from `list_columns` rather than scanning
# every cell of the dataframe a second time.
_list_columns_set = set(list_columns)
non_list_columns = [c for c in df_jobs.columns if c not in _list_columns_set]


In [None]:
# Inspect the labeled frame with the Warmth prediction columns appended.
cols_dict['Warmth']['df_jobs_labeled'].info()


In [None]:
# Inspect the labeled frame with the Competence prediction columns appended.
cols_dict['Competence']['df_jobs_labeled'].info()


In [None]:
# Combine the Warmth and Competence labeled frames into one frame: drop the
# list-valued columns from the Warmth frame, outer-merge on the shared non-list
# columns, discard columns that are entirely empty and renumber the rows.
df_jobs_labeled = (
    cols_dict['Warmth']['df_jobs_labeled']
    .drop(columns=list_columns)
    .merge(
        cols_dict['Competence']['df_jobs_labeled'],
        how='outer',
        on=non_list_columns,
    )
    .dropna(axis='columns', how='all')
    .reset_index(drop=True)
)


In [None]:
# Inspect column dtypes and non-null counts of the merged labeled frame.
df_jobs_labeled.info()


In [None]:
# Preview the first rows of the merged labeled frame.
df_jobs_labeled.head()


In [None]:
# Inspect the unlabeled frame with the Warmth prediction columns appended.
cols_dict['Warmth']['df_jobs_unlabeled'].info()


In [None]:
# Inspect the unlabeled frame with the Competence prediction columns appended.
cols_dict['Competence']['df_jobs_unlabeled'].info()


In [None]:
# Combine the Warmth and Competence unlabeled frames into one frame: drop the
# list-valued columns from the Warmth frame, outer-merge on the shared non-list
# columns, discard columns that are entirely empty and renumber the rows.
df_jobs_unlabeled = (
    cols_dict['Warmth']['df_jobs_unlabeled']
    .drop(columns=list_columns)
    .merge(
        cols_dict['Competence']['df_jobs_unlabeled'],
        how='outer',
        on=non_list_columns,
    )
    .dropna(axis='columns', how='all')
    .reset_index(drop=True)
)


In [None]:
# Preview the first rows of the merged unlabeled frame.
df_jobs_unlabeled.head()


In [None]:
# Inspect column dtypes and non-null counts of the merged unlabeled frame.
df_jobs_unlabeled.info()


In [None]:
# Training split of the labeled rows, as stored by the estimator function.
train = cols_dict['train']


In [None]:
# Preview the first rows of the training split.
train.head()


In [None]:
# Independent copy of the training split, so later edits do not touch `train`.
df_jobs_train = train.copy()


In [None]:
# Inspect column dtypes and non-null counts of the training frame.
df_jobs_train.info()


In [None]:
# Test split of the labeled rows, as stored by the estimator function.
test = cols_dict['test']


In [None]:
# Preview the first rows of the test split.
test.head()


In [None]:
# Attach the prediction columns to the test split: drop the list-valued columns
# and inner-merge with the merged labeled frame on the shared non-list columns.
df_jobs_test = (
    test
    .drop(columns=list_columns)
    .merge(
        df_jobs_labeled,
        how='inner',
        on=non_list_columns,
    )
    .reset_index(drop=True)
)



In [None]:
# Inspect column dtypes and non-null counts of the merged test frame.
df_jobs_test.info()


In [None]:
# Preview the first rows of the merged test frame.
df_jobs_test.head()


### Make unbiased and biased models

In [None]:
# Unbiased model: fitted on the labeled frame (the '{dv}_actual' branch applies when
# no '{dv}_aggr_unlabeled_predicted' column is present).
endog_names_dict_post_labeled_classification = compare_actual_and_predicted(df_jobs_labeled, endog_type='post_classification')


In [None]:
# Biased model: fitted on the unlabeled frame, using '{dv}_aggr_unlabeled_predicted'
# as the dependent variable when that column is present.
endog_names_dict_post_unlabeled_classification = compare_actual_and_predicted(df_jobs_unlabeled, endog_type='post_classification')


In [None]:
# Display the results stored for the labeled (unbiased) models.
endog_names_dict_post_labeled_classification


In [None]:
# Display the results stored for the unlabeled (biased) models.
endog_names_dict_post_unlabeled_classification


# Make Instrumental Variable

In [None]:
# Stack the labeled and unlabeled frames row-wise; the original row labels of both frames are kept.
df_jobs_for_correction = pd.concat([df_jobs_labeled, df_jobs_unlabeled], axis='index')


In [None]:
# Inspect column dtypes and non-null counts of the stacked frame.
df_jobs_for_correction.info()


In [None]:
# Preview the first rows of the stacked frame.
df_jobs_for_correction.head()


In [None]:
def hotelling(beta_IV, vcov_IV, model_unbias):
    '''
    Quadratic-form distance between IV estimates and an unbiased model's estimates.

    beta_IV: coefficient vector of the IV estimation
    vcov_IV: covariance matrix of the IV coefficients
    model_unbias: fitted results object exposing params and cov_params()

    Returns d' * inv(V) * d as a float, where d = beta_IV - model_unbias.params
    and V = vcov_IV + model_unbias.cov_params().
    '''
    coefficient_gap = beta_IV - model_unbias.params
    pooled_covariance = vcov_IV + model_unbias.cov_params()
    precision = np.linalg.inv(pooled_covariance)
    weighted_gap = np.dot(precision, coefficient_gap)
    statistic = np.dot(coefficient_gap, weighted_gap)
    return float(statistic)


In [None]:
def get_corrs(lhs, rhs):
    '''
    Absolute value of the mean of all entries of the correlation matrix of lhs and rhs.

    The mean is taken over the full matrix returned by np.corrcoef(lhs, rhs),
    which includes the diagonal of ones.
    '''
    correlation_matrix = np.corrcoef(lhs, rhs)
    mean_correlation = correlation_matrix.mean()
    return np.abs(mean_correlation)


In [None]:
def make_formula_endog_exog_instrument(regressor, control, IVs, var, type, data):
    '''
    Build the formula string, data selections and OLS model for one regression stage.

    regressor: str, column name of the focal regressor
    control: list of str, control column names (None or empty for no controls)
    IVs: sequence of str, instrument column names
    var: str, column name of the outcome
    type: 'XZ' (regressor on the IVs), 'YX' (outcome on the regressor and
        controls) or 'all' (outcome on the regressor and controls, with the IVs
        and controls listed after a '|')
    data: pandas DataFrame containing all referenced columns

    Returns (formula_data, formula_str, ols_model, endog_names, endog,
    exog_names, exog, instrument_names, instrument, constant). instrument_names
    and instrument are None for the 'XZ' and 'YX' types.

    Raises ValueError for an unknown type.
    '''
    def _formula_safe(name):
        # Column names are made formula-friendly by removing '%' and replacing spaces
        return name.replace('%', '').replace(' ', '_')

    if control is None:
        control = []
    has_control = len(control) > 0

    regressor_ = _formula_safe(regressor)
    control_ = [_formula_safe(c) for c in control]
    IVs_ = [_formula_safe(i) for i in IVs]
    var_ = _formula_safe(var)

    if type == 'XZ':
        # Controls do not enter this stage
        formula_str = f'{regressor_} ~ {" + ".join(IVs_)}'
        endog_names = regressor
        exog_names = IVs
        instrument_names = None
    elif type == 'YX':
        endog_names = var
        instrument_names = None
        if has_control:
            formula_str = f'{var_} ~ {regressor_} + {" + ".join(control_)}'
            exog_names = [regressor] + list(control)
        else:
            formula_str = f'{var_} ~ {regressor_}'
            exog_names = regressor
    elif type == 'all':
        endog_names = var
        if has_control:
            formula_str = f'{var_} ~ {regressor_} + {" + ".join(control_)} | {" + ".join(IVs_)} + {" + ".join(control_)}'
            exog_names = [regressor] + list(control)
            # list() so that array-valued IVs are concatenated, not broadcast
            instrument_names = list(IVs) + list(control)
        else:
            formula_str = f'{var_} ~ {regressor_} | {" + ".join(IVs_)}'
            exog_names = regressor
            instrument_names = IVs
    else:
        raise ValueError(f"Unknown type {type!r}; expected 'XZ', 'YX' or 'all'.")

    endog = data[endog_names]
    exog = data[exog_names]
    # Stages without instruments have nothing to select
    instrument = data[instrument_names] if instrument_names is not None else None
    constant = sm.add_constant(exog)

    formula_data = data.copy()
    formula_data.columns = formula_data.columns.str.replace('%', '').str.replace(' ', '_')

    try:
        ols_model = smf.ols(formula=formula_str, data=formula_data)
    except Exception:
        # Fall back to the array interface when the formula cannot be parsed.
        # NOTE(review): this fallback uses `exog` without the added constant,
        # unlike the formula model, which includes an intercept - confirm.
        ols_model = sm.OLS(endog=endog, exog=exog)

    return formula_data, formula_str, ols_model, endog_names, endog, exog_names, exog, instrument_names, instrument, constant


In [None]:
# Function to select strong IVs using Lasso
def lasso_select_strong(data_unlabel, regressor, candidates):
    '''
    Keep the candidates that receive a non-zero Lasso coefficient when the
    regressor is regressed on them.

    data_unlabel: pandas DataFrame containing the regressor and candidate columns
    regressor: str, column name of the focal regressor
    candidates: sequence of str, candidate instrument column names

    Returns a numpy array with the selected candidate names, or `candidates`
    unchanged when it is empty.
    '''
    if len(candidates) == 0:
        return candidates

    # Select directly from the input frame. Indexing a copy whose columns were
    # renamed with the original names fails for names containing '%' or spaces.
    y = data_unlabel[regressor]
    X = data_unlabel[candidates]

    lasso = LassoCV(cv=5)
    lasso.fit(X, y)
    selection = lasso.coef_ != 0
    return np.array(candidates)[selection]


In [None]:
# Function to select valid IVs using Lasso
def lasso_select_valid(col, data_test, regressor, candidates):
    """Drop candidate instruments that predict the focal tree's prediction error.

    The focal error is ``data_test[regressor] - data_test[f'{col}_actual']``.
    It is regressed on the candidates' predictions with a cross-validated
    Lasso. A candidate with a non-zero coefficient helps explain that error,
    so it is treated as an invalid instrument and removed.

    Args:
        col: Prefix of the ground-truth column, read as ``f'{col}_actual'``.
        data_test: DataFrame with the ``regressor`` column, the candidate
            columns and the ``f'{col}_actual'`` column.
        regressor: Name of the focal tree's prediction column.
        candidates: Sequence of column names to screen.

    Returns:
        numpy.ndarray with the candidates whose Lasso coefficient is zero.
        When ``data_test`` or ``candidates`` is empty, all candidates are
        returned unchanged (as an array).
    """
    # Same return type on every path; callers call ``len``/``.tolist()`` on it.
    candidates = np.asarray(candidates)
    if len(data_test) == 0 or len(candidates) == 0:
        return candidates

    focal_error = data_test[regressor] - data_test[f'{col}_actual']
    others_pred = data_test[candidates.tolist()]

    lasso = LassoCV(cv=5)
    lasso.fit(others_pred, focal_error)
    # Non-zero coefficient => correlated with the focal error => invalid.
    invalid = lasso.coef_ != 0
    return candidates[~invalid]


In [None]:
# Function to perform Lasso selection for validity and strength
def lasso_select(col, data_test, data_unlabel, ntree, regressor, iterative):
    """Select instruments for one focal tree and report diagnostic correlations.

    Candidates are the columns ``f'{col}_tree_{i}'`` for ``i`` in
    ``range(ntree)``, excluding ``regressor``. They are passed through
    ``lasso_select_valid`` (on ``data_test``) and then ``lasso_select_strong``
    (on ``data_unlabel``). With ``iterative`` set, the two steps are repeated
    on the surviving set until a pass removes nothing.

    Args:
        col: Prefix used for the tree columns and for ``f'{col}_actual'``.
        data_test: DataFrame with tree predictions and ``f'{col}_actual'``.
        data_unlabel: DataFrame with tree predictions.
        ntree: Number of tree columns to consider.
        regressor: Name of the focal tree's prediction column.
        iterative: Whether to repeat the selection until it stabilises.

    Returns:
        dict with
          * ``"IVs"``: numpy.ndarray of the selected column names;
          * ``"correlations"``: ``[pp_abs_before, pe_abs_before, pp_abs_after,
            pe_abs_after]``, the mean absolute correlation of the candidates
            with the focal prediction (``pp``, on ``data_unlabel``) and with
            the focal error (``pe``, on ``data_test``), before and after
            selection. The ``after`` values are NaN when nothing is selected.
    """
    candidates = [f'{col}_tree_{i}' for i in range(ntree) if f'{col}_tree_{i}' != regressor]

    def get_corrs(lhs, rhs):
        # Mean of |corr(lhs, column)| over the columns of rhs.
        if rhs.shape[1] == 0:
            return np.nan
        corr_matrix = np.corrcoef(lhs.values, rhs.values.transpose())
        # Row 0 is lhs; entry [0, 0] is its self-correlation, and the rest of
        # the matrix is rhs-vs-rhs, neither of which belongs in the diagnostic.
        return np.abs(corr_matrix[0, 1:]).mean()

    def select_once(pool):
        # One validity pass followed by one strength pass.
        valid = lasso_select_valid(col, data_test, regressor, pool)
        return lasso_select_strong(data_unlabel, regressor, valid)

    focal_pred = data_unlabel[regressor]
    focal_error = data_test[regressor] - data_test[f'{col}_actual']

    pp_abs_before = get_corrs(focal_pred, data_unlabel[candidates])
    pe_abs_before = get_corrs(focal_error, data_test[candidates])

    IVs = select_once(candidates)
    if iterative:
        # Each pass can only shrink the set, so this terminates once a pass
        # leaves the set size unchanged (including when it becomes empty).
        while len(IVs) != len(candidates):
            candidates = IVs
            IVs = select_once(candidates)

    # Guarantee an ndarray regardless of what the helpers returned.
    IVs = np.asarray(IVs)

    if len(IVs) != 0:
        iv_cols = IVs.tolist()
        pp_abs_after = get_corrs(focal_pred, data_unlabel[iv_cols])
        pe_abs_after = get_corrs(focal_error, data_test[iv_cols])
    else:
        pp_abs_after = np.nan
        pe_abs_after = np.nan

    return {
        "IVs": IVs,
        "correlations": [pp_abs_before, pe_abs_before, pp_abs_after, pe_abs_after]
    }


In [None]:
# Function to perform 2SLS estimation
def perform_2sls_estimation(data_unlabel_new, regressor, var, control, IVs, family):
    """Fit a 2SLS model of ``var`` on ``regressor`` (plus controls), instrumented by ``IVs``.

    Args:
        data_unlabel_new: DataFrame holding all referenced columns.
        regressor: Name of the endogenous regressor column.
        var: Name of the dependent-variable column.
        control: List of control-variable column names (may be empty).
        IVs: List of instrument column names.
        family: Model family object; only a ``Gaussian`` family with an
            ``Identity`` link is supported (matched by class name).

    Returns:
        The fitted ``IV2SLS`` results object.

    Raises:
        NotImplementedError: If ``family`` is not Gaussian with identity link.
    """
    is_gaussian_identity = (
        family.__class__.__name__ == 'Gaussian'
        and family.link.__class__.__name__ == 'Identity'
    )
    if not is_gaussian_identity:
        # Previously this branch only printed and then failed on an unbound
        # local; fail explicitly with the same message instead.
        raise NotImplementedError('Only Gaussian family implemented.')

    # Pass the caller's ``control`` argument rather than a notebook-level
    # global, so the controls actually requested are the ones used.
    (
        _, _, _, _, endog, _, _, _, instrument, constant
    ) = make_formula_endog_exog_instrument(
        regressor, control, IVs, var, 'all', data_unlabel_new
    )
    # NOTE(review): ``constant`` is exog with an added constant while
    # ``instrument`` appears to have none; confirm this is the intended
    # instrument set for IV2SLS.
    return IV2SLS(endog=endog, exog=constant, instrument=instrument).fit()


In [None]:
# ForestIV Main Function (Python implementation)
def forest_iv(col, data_test, data_unlabel, var, control, method, ntree, model_unbias, diagnostic, select_method, family, iterative=True):
    """ForestIV main function.

    For each tree ``i`` in ``range(ntree)``, the column ``f'{col}_tree_{i}'``
    is used as the endogenous regressor. Instruments are selected among the
    other trees, a 2SLS model is fitted, and the estimates are compared with
    ``model_unbias`` through ``hotelling``. Trees for which no instrument
    survives selection are skipped.

    Args:
        col: Name of the classified variable; prefixes the tree columns and
            the ``f'{col}_actual'`` ground-truth column.
        data_test: Testing DataFrame; must contain ``f'{col}_actual'`` and all
            trees' predictions.
        data_unlabel: Unlabeled DataFrame; must contain all trees' predictions
            plus ``var`` and the control columns.
        var: Name of the dependent-variable column of the 2SLS model.
        control: List of control variable names; pass an empty list if there
            are none.
        method: ``"Lasso"`` for the ForestIV selection or ``"IIV"`` for the
            EnsembleIV selection.
        ntree: Number of trees in the random forest.
        model_unbias: Unbiased estimation, forwarded to ``hotelling`` and
            returned unchanged.
        diagnostic: Whether to keep the four diagnostic correlation columns
            in the result table.
        select_method: IV selection method, forwarded to ``iiv_select``; only
            used when ``method == "IIV"``.
        family: Model specification, forwarded to ``perform_2sls_estimation``.
        iterative: Whether to iterate the IV selection (default True); only
            used when ``method == "Lasso"``.

    Returns:
        Tuple ``(beta_IV, vcov_IV, model_unbias, result)``. ``beta_IV`` and
        ``vcov_IV`` come from the last tree that was estimated, or are
        ``None`` if no tree was. ``result`` is a DataFrame with one row per
        estimated tree and columns ``beta_*``, ``se_*``, ``Hotelling``,
        ``Convergence`` and, when ``diagnostic`` is true, ``pp_abs_before``,
        ``pe_abs_before``, ``pp_abs_after``, ``pe_abs_after``.

    Raises:
        ValueError: If ``method`` is neither ``"Lasso"`` nor ``"IIV"``.
    """
    if method not in ('Lasso', 'IIV'):
        raise ValueError(f'Unknown method {method!r}; expected "Lasso" or "IIV".')

    summary_cols = ['Hotelling', 'Convergence']
    diagnostic_cols = ['pp_abs_before', 'pe_abs_before', 'pp_abs_after', 'pe_abs_after']

    rows = []
    # Pre-bind so the function still returns cleanly when no tree is estimated.
    beta_IV = None
    vcov_IV = None
    se_IV = None

    for i in range(ntree):
        regressor = f'{col}_tree_{i}'
        print(f'Analyzing {regressor}/{ntree-1} trees')

        if method == 'Lasso':
            output = lasso_select(col, data_test, data_unlabel, ntree, regressor, iterative)
            data_unlabel_new = data_unlabel.copy()
        else:
            output = iiv_select(data_test, data_unlabel, ntree, regressor, select_method)
            data_unlabel_new = output['data_unlabel_new']

        # Accept either an ndarray or a plain list of names from the selector.
        IVs = np.asarray(output['IVs']).tolist()
        print(f'Candidates IVs length: {len(IVs)}')

        if len(IVs) == 0:
            continue

        model_IV = perform_2sls_estimation(data_unlabel_new, regressor, var, control, IVs, family)
        beta_IV = model_IV.params
        vcov_IV = model_IV.cov_params()
        se_IV = np.sqrt(np.diag(vcov_IV))
        convergence = 0

        H_stats = hotelling(beta_IV, vcov_IV, model_unbias)
        correlations = output['correlations']
        rows.append([*beta_IV, *se_IV, H_stats, convergence, *correlations])

    if rows:
        coef_cols = (
            [f'beta_{j}' for j in range(1, len(beta_IV) + 1)]
            + [f'se_{j}' for j in range(1, len(se_IV) + 1)]
        )
    else:
        # Nothing was estimated, so the number of coefficients is unknown.
        coef_cols = []

    result = pd.DataFrame(rows, columns=coef_cols + summary_cols + diagnostic_cols)
    if not diagnostic:
        result = result.iloc[:, :-len(diagnostic_cols)]

    return beta_IV, vcov_IV, model_unbias, result


In [None]:
# Run ForestIV for the 'Warmth' classifier: Lasso-based IV selection in a
# single (non-iterative) pass over 3 trees, with diagnostics kept.
beta_IV, vcov_IV, model_unbias, result = forest_iv(
    col='Warmth',
    data_test=df_jobs_test,
    data_unlabel=df_jobs_unlabeled,
    var=ivs_perc[0],
    control=controls[:2],
    method='Lasso',
    ntree=3,
    model_unbias=endog_names_dict_post_labeled_classification['Warmth']['Unbiased']['Results'],
    diagnostic=True,
    select_method='optimal',
    family=sm.families.Gaussian(link=sm.families.links.Identity()),
    iterative=False,
)


In [None]:
# Display the per-tree ForestIV results table returned by the forest_iv call above.
result


In [None]:
# import numpy as np
# import pandas as pd
# from statsmodels.formula.api import ivreg
# from statsmodels.formula.api import glm
# from OneSampleMR import tsri
# from ivtools import ivglm

# def forest_iv(data_test, data_unlabel, control, method, iterative=True, ntree, model_unbias, family, diagnostic, select_method):
    # """ForestIV Main Function

    # This function implements the main ForestIV approach.

    # Args:
    #     data_test: Testing dataframe for random forest, must have a column named "actual" that contains the ground truth, and all trees' predictions.
    #     data_unlabel: Unlabel dataframe for random forest, must have all trees' predictions.
    #     control: A character vector of control variable names. Pass an empty vector if there are no control variables
    #     method: "Lasso" for ForestIV method and "IIV" for EnsembleIV method.
    #     iterative: Whether to perform iterative IV selection or not, default to TRUE. Only relevant when method = "Lasso"
    #     ntree: Number of trees in the random forest.
    #     model_unbias: Unbiased estimation.
    #     family: Model specification, same as in the family parameter in glm.
    #     diagnostic: Whether to output diagnostic correlations for instrument validity and strength, default to TRUE.
    #     select_method: method of IV selection. One of "optimal" (LASSO based), "top3", and "PCA".

    # Returns:
    #     ForestIV estimation results
    # """

#     result = []
#     for i in range(1, ntree + 1):
#         # use i-th tree as the endogenous covariate
#         regressor = f'X{i}'

#         # IV selection
#         if method == "Lasso":
#             ivs = lasso_select(data_test, data_unlabel, iterative, ntree, regressor)
#             data_unlabel_new = data_unlabel.copy()
#         elif method == "IIV":
#             ivs = iiv_select(data_test, data_unlabel, ntree, regressor, select_method)
#             data_unlabel_new = ivs['data_unlabel_new']

#         # 2SLS estimation
#         if len(ivs) > 0:
#             if family.family == "gaussian" and family.link == "identity":
#                 model_IV = ivreg(f'{regressor} ~ {" + ".join(ivs + control)}', data=data_unlabel_new)
#                 beta_IV = model_IV.params
#                 vcov_IV = model_IV.cov_params()
#                 se_IV = np.sqrt(np.diag(vcov_IV))
#                 convergence = 0
#             else:
#                 link = 'logit' if family.family == 'binomial' else 'logadd'
#                 model_IV = tsri(f'{regressor} ~ {" + ".join(ivs + control)}', data=data_unlabel_new, link=link)
#                 beta_IV = model_IV.fit.params[1:-1]
#                 vcov_IV = model_IV.fit.cov_params()[1:-1, 1:-1]
#                 se_IV = np.sqrt(np.diag(vcov_IV))
#                 convergence = model_IV.fit.algoInfo['convergence']

#             H_stats = hotelling(beta_IV, vcov_IV, model_unbias)
#             correlations = ivs['correlations']
#             result.append([*beta_IV, *se_IV, H_stats, convergence, *correlations])

#     if diagnostic:
#         result = pd.DataFrame(result)
#         result.columns = [f'beta_{i}' for i in range(1, len(beta_IV) + 1)] + [f'se_{i}' for i in range(1, len(se_IV) + 1)] + ['Hotelling', 'Convergence', 'pp_abs_before', 'pe_abs_before', 'pp_abs_after', 'pe_abs_after']
#         return result
#     else:
#         result = pd.DataFrame(result)
#         result.columns = [f'beta_{i}' for i in range(1, len(beta_IV) + 1)] + [f'se_{i}' for i in range(1, len(se_IV) + 1)] + ['Hotelling', 'Convergence']
#         return result
