# Section 6.3: Simulation Study: Effect of Model Mis-Specification


### Import necessary packages

In [24]:
import os, sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
import numpy as np
import pandas as pd
import pickle as pkl
from ppi_py import logistic
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
from aae import *

### Define Utility Functions

In [25]:
def compute_bias_reduction(df, true_theta, baseline='Human-data-only',
                           methods=('logistic', 'nn-1', 'nn-2')):
    """Display the change in MAPE of each method relative to a baseline method.

    For every (method, m) group of ``df`` the mean absolute percentage error
    (MAPE, in percent) of the point estimates with respect to ``true_theta`` is
    computed, averaged over the rows of the group and over the coordinates of
    the estimate. The displayed table has one row per value of ``m`` and one
    column per method; each entry is ``MAPE(method) - MAPE(baseline)``, so a
    negative entry means the method has a smaller MAPE than the baseline.

    The values of ``m`` are taken from ``df`` itself, so the function does not
    depend on any notebook-level variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'method'``, ``'m'`` and ``'point_estimate'``.
    true_theta : float or numpy.ndarray
        Reference value the point estimates are compared with. Entries equal to
        zero make the percentage error infinite or undefined.
    baseline : str, optional
        Value of ``'method'`` that the other methods are compared against.
    methods : sequence of str, optional
        Values of ``'method'`` to report, one table column each.

    Returns
    -------
    None
        The table is shown with ``display``; nothing is returned.

    Raises
    ------
    KeyError
        If the baseline or one of ``methods`` has no rows for some ``m``.

    Notes
    -----
    Sets the global pandas option ``display.float_format`` to two decimals.
    """
    def _mape(estimates):
        # Percentage error of every estimate, averaged over rows, then coordinates.
        per_trial = [
            np.abs((est - true_theta) / (true_theta)) * 100
            for est in estimates
        ]
        return np.mean(np.mean(per_trial, axis=0))

    # MAPE of every (method, m) group, keyed by that tuple.
    mape = {
        key: _mape(group['point_estimate'].values)
        for key, group in df.groupby(['method', 'm'])
    }

    def _lookup(method, m):
        if (method, m) not in mape:
            raise KeyError(f"no results for method {method!r} with m={m}")
        return mape[(method, m)]

    diff_data = []
    for m in sorted(df['m'].unique()):
        mape_human = _lookup(baseline, m)
        for method in methods:
            diff_data.append({
                'method': method,
                'm': m,
                'mape_diff': _lookup(method, m) - mape_human,
            })

    df_diff = pd.DataFrame(diff_data)
    # Pivot to get the desired table: rows are m, columns are methods
    df_diff_table = df_diff.pivot(index='m', columns='method', values='mape_diff')

    # Display table with 2 decimal places
    pd.set_option('display.float_format', lambda x: '%.2f' % x)
    display(df_diff_table)
    return None


### Import the simulated data set

Load the data. The data set contains the true choice labels (`y`), the augmented choice labels (`y_aug`), and the feature vectors (`X`).

In [26]:
# Values that identify the simulated data file on disk.
dx = 10
n_samples = 1200

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open(f'./data/sim/train_simcluster_{dx}_{n_samples}.pkl', 'rb') as f:
    data = pkl.load(f)[0]

Y_total, Yhat_total, X_total1 = data["y"], data["y_aug"], data["X"]

# One feature row per sample: row 1 minus row 0 of the sample's matrix,
# with column 0 dropped.
X_total = np.array(
    [X_total1[k][1, 1:] - X_total1[k][0, 1:] for k in range(len(X_total1))]
)

### Problem setup

Specify the primary-set sizes to test (`ms`), the auxiliary-set size (`n`), and the number of trials (`num_trials`).

The ground-truth value of the estimand is computed later, in the "Loading results and ground truth parameters" step of the analysis.

In [27]:
# Sample sizes
n_total = Y_total.shape[0]                      # total number of labeled examples
ms = np.array([50, 100, 150, 200], dtype=int)   # primary-set sizes to test
n = 1000                                        # auxiliary-set size
num_trials = 50                                 # trials per primary-set size

# Optimizer settings
optimizer_options = dict(
    ftol=1e-5,
    gtol=1e-5,
    maxls=10000,
    maxiter=10000,
)

# Saving results
# WARNING: with save_results = True, previously saved results are OVERWRITTEN.
save_results = False  # set to True to write the results to a pickle file

### Running Experiments

In [28]:
# Run AAE with various first-stage models
X_total1_flat = flatten_full(X_total1)
Yhat_total1 = (Yhat_total > 0.5).astype(int)  # threshold the augmented labels at 0.5

# Array views of the inputs are loop-invariant: build them once instead of
# on every trial.
X_total1_arr = np.array(X_total1)
X_total1_flat_arr = np.array(X_total1_flat)

# Hidden-layer layout of the first-stage MLPClassifier, keyed by method name.
hidden_layers = {'logistic': (), 'nn-1': (5,), 'nn-2': (5, 2)}
methods = ['logistic', 'nn-1', 'nn-2']

results = []
for m in ms:
    for j in tqdm(range(num_trials)):
        # Seeding with the trial index makes the split of trial j reproducible.
        rng = np.random.RandomState(j)
        rand_idx = rng.permutation(n_total)
        primary_idx, aux_idx = rand_idx[:m], rand_idx[m:m+n]

        _X1, _X_unlabeled1 = X_total1_arr[primary_idx], X_total1_arr[aux_idx]
        _X_unlabeled1_flat = X_total1_flat_arr[aux_idx]
        _Y = Y_total[primary_idx]
        _Yhat1, _Yhat_unlabeled1 = Yhat_total1[primary_idx], Yhat_total1[aux_idx]

        # AAE point estimates, one per first-stage model
        for method in methods:
            # A fresh, unfitted model is created for every (trial, method) pair.
            g_model = MLPClassifier(
                solver='adam',
                alpha=1e-4,
                activation='logistic',
                hidden_layer_sizes=hidden_layers[method],
                random_state=1,
            )
            aae_pe = aae(_X1, _Y, _Yhat1, _X_unlabeled1, _Yhat_unlabeled1, _X_unlabeled1_flat, g_model, concat=1, n_epochs=1000, lr=1e-2)

            # One single-row frame per run; the frames are concatenated in the analysis.
            results.append(
                pd.DataFrame(
                    [
                        {
                            "method": method,
                            "m": m,
                            "trial": j,
                            "point_estimate": aae_pe,
                        }
                    ]
                )
            )


100%|██████████| 50/50 [01:02<00:00,  1.25s/it]
100%|██████████| 50/50 [01:50<00:00,  2.21s/it]
100%|██████████| 50/50 [01:46<00:00,  2.13s/it]
100%|██████████| 50/50 [02:06<00:00,  2.53s/it]


In [29]:
# Run human-data only estimation
# The seeds match the AAE cell (RandomState(j) + permutation(n_total)), so
# trial j uses the same primary-set indices as the corresponding AAE run.
# NOTE: this cell appends to `results`; re-running it adds duplicate rows.
for m in ms:
    for j in tqdm(range(num_trials)):
        rng = np.random.RandomState(j)
        rand_idx = rng.permutation(n_total)
        primary_idx = rand_idx[:m]
        # Only the primary set is needed here; the auxiliary split and the
        # augmented labels are not used by this estimator.
        _X, _Y = X_total[primary_idx], Y_total[primary_idx]

        # Classical point estimate
        human_data_only_pe = logistic(_X, _Y)

        # Append results
        results.append(
            pd.DataFrame(
                [
                    {
                        "method": "Human-data-only",
                        "m": m,
                        "trial": j,
                        "point_estimate": human_data_only_pe,
                    }
                ]
            )
        )

100%|██████████| 50/50 [00:00<00:00, 326.30it/s]
100%|██████████| 50/50 [00:00<00:00, 1129.82it/s]
100%|██████████| 50/50 [00:00<00:00, 1143.35it/s]
100%|██████████| 50/50 [00:00<00:00, 1150.75it/s]


In [None]:
# Save results to pickle file
if save_results:
    results_path = f'res/res_simcluster_{dx}_{n}_{num_trials}.pkl'
    # Create the output directory first so that open() does not fail when
    # it does not exist yet.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    with open(results_path, 'wb') as f:
        pkl.dump(results, f)

### Analyzing Results

#### Loading results and ground truth parameters

In [32]:
# Values that identify the saved results and the simulated data file.
dx, n_samples = 10, 1200
n, num_trials = 1000, 50

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# This replaces any in-memory `results` with the contents of the saved file.
with open(f'res/res_simcluster_{dx}_{n}_{num_trials}.pkl', 'rb') as f:
    results = pkl.load(f)

with open(f'./data/sim/train_simcluster_{dx}_{n_samples}.pkl', 'rb') as f:
    data = pkl.load(f)[0]

Y_total, Yhat_total, X_total1 = data["y"], data["y_aug"], data["X"]

# One feature row per sample: row 1 minus row 0 of the sample's matrix,
# with column 0 dropped.
X_total = np.array(
    [X_total1[k][1, 1:] - X_total1[k][0, 1:] for k in range(len(X_total1))]
)

# Compute the best-in-class estimator: an unpenalized logistic regression
# without intercept, fitted on the full data set.
best_in_class_model = LogisticRegression(
    penalty=None,
    solver="lbfgs",
    max_iter=10000,
    tol=1e-15,
    fit_intercept=False,
)
best_in_class_model.fit(X_total, Y_total)
true_theta = best_in_class_model.coef_.squeeze()

#### Compute the bias reductions

In [33]:
# Stack the single-row result frames into one table (rows are concatenated
# and re-indexed), then display the MAPE differences.
df = pd.concat(results, ignore_index=True)
compute_bias_reduction(df, true_theta)

method,logistic,nn-1,nn-2
m,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
50,-26612.9,-26792.41,-26825.15
100,-386.35,-551.31,-574.82
150,-251.16,-389.32,-406.86
200,-113.72,-236.97,-255.02
