# Replication Notebook for Applications: Sample Fit Reliability

Gabriel Okasa and Kenneth A. Younge

In [None]:
# get current working directory
# `%pwd` is an IPython magic, so this cell only runs inside IPython/Jupyter;
# every data, figure and result location below is built as `path + '/...'`
path = %pwd

## Libraries

In [None]:
# Python version 3.8.8
import statsmodels # version 0.12.2

import samplefit as sf # version 0.3.1
import statsmodels.api as sm
import pandas as pd # version 1.3.5
import numpy as np # version 1.22.0
import matplotlib.pyplot as plt # version 3.4.2

from scipy import stats # version 1.7.2
from sklearn.linear_model import RANSACRegressor # version 1.1.1

# turn off future warnings
import warnings
# ignore every FutureWarning raised from here on
warnings.simplefilter(action='ignore', category=FutureWarning)
# set pandas printing options: floats are rendered with four decimal places
pd.set_option('display.float_format', '{:.4f}'.format)

## Applications

### Labor Data

In [None]:
# read the LaLonde sample of the NSW RCT from the data folder
lalonde_csv = path+'/data/data_lalonde.csv'
data = pd.read_csv(lalonde_csv)
# keep an independent copy for the summary figure, because `data` is
# overwritten when the next application is loaded
data_lalonde = data.copy()

In [None]:
# descriptive statistics, rounded to two decimals and transposed so that
# each variable is one row, printed as a LaTeX table
descriptives = data.describe().round(2).T
print(descriptives.to_latex(caption='Descriptive Statistics for the Labor Data'))

In [None]:
# check plot: raw outcome against treatment status
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5), dpi=300)
# horizontal jitter keeps the points of each arm from overplotting;
# seed right before the draw so the figure is reproducible
np.random.seed(0)
jitter = np.random.uniform(-0.1, 0.1, len(data.re78))
ax.scatter(x=data.treatment + jitter, y=data.re78, color='grey', s=30)
ax.title.set_text('Real Earnings by Treatment Status')
ax.set_xlabel('Training Program')
ax.set_ylabel('Real Earnings')
# only the two arms get a label; the outer ticks carry empty labels
ax.set_xticks([-0.5, 0, 1, 1.5])
ax.set_xticklabels(['', 'Control', 'Treated', ''])
plt.show()
fig.savefig(path+'/figures/scatter_earnings.png', bbox_inches='tight')

In [None]:
# run OLS, HUBER, RANSAC, SFR


def _t_inference(param, se, dof, alpha=0.05):
    """Return t-statistic, two-sided p-value and confidence bounds.

    Parameters
    ----------
    param : point estimate of the coefficient.
    se : standard error of the estimate.
    dof : degrees of freedom of the reference t-distribution.
    alpha : significance level; the interval covers 1 - alpha.

    Returns
    -------
    Tuple ``(t, p, ci_down, ci_up)``.
    """
    t_stat = param / se
    p_value = stats.t.sf(np.abs(t_stat), dof) * 2
    margin = stats.t.ppf(1 - alpha / 2, dof) * se
    return t_stat, p_value, param - margin, param + margin


# design matrix (constant + treatment), built once and shared by OLS and Huber
exog = pd.DataFrame(sm.add_constant(data.treatment))
n_obs = data.shape[0]

# specify model
model = sm.OLS(endog=data.re78, exog=exog)
# degrees of freedom: observations minus the two estimated coefficients
df = n_obs - 2

# ols fit
ols_fit = model.fit()
# the treatment coefficient is the second entry; .iloc makes the positional
# access explicit instead of relying on [] falling back to positions on a
# label-indexed Series (deprecated in recent pandas)
ols_param = ols_fit.params.iloc[1]
ols_se = ols_fit.bse.iloc[1]
ols_t, ols_p, ols_ci_down, ols_ci_up = _t_inference(ols_param, ols_se, df)
# get predictions for ols
ols_pred = pd.DataFrame(ols_fit.predict(exog), index=data.index, columns=['predictions'])

# fit huber
huber_fit = sm.RLM(endog=data.re78, exog=exog, M=sm.robust.norms.HuberT()).fit()
huber_param = huber_fit.params.iloc[1]
huber_se = huber_fit.bse.iloc[1]
huber_t, huber_p, huber_ci_down, huber_ci_up = _t_inference(huber_param, huber_se, df)
# get predictions for huber
huber_pred = pd.DataFrame(huber_fit.predict(exog), index=data.index, columns=['predictions'])

# fit RANSAC
ransac_fit = RANSACRegressor(random_state=0)
ransac_fit.fit(X=pd.DataFrame(data.treatment), y=data.re78) # intercept is added by default in sklearn
ransac_param = ransac_fit.estimator_.coef_[0]
# get standard error for RANSAC via bootstrapping
ransac_param_boot = []
# loop over
for boot_idx in range(1000):
    # one seed per replication, shared by the resampling and the RANSAC fit
    np.random.seed(boot_idx)
    # get in bag indices (row positions drawn with replacement)
    in_idx = np.random.choice(np.arange(n_obs, dtype=int), size=n_obs, replace=True)
    # in bag observations; .iloc because in_idx holds positions, not index
    # labels, so the resample stays correct for any index of `data`
    endog_in = data.re78.iloc[in_idx]
    exog_in = data.treatment.iloc[in_idx]
    # estimate ransac
    ransac_fit_boot = RANSACRegressor(random_state=boot_idx)
    ransac_fit_boot.fit(X=pd.DataFrame(exog_in), y=endog_in) # intercept is added by default in sklearn
    ransac_param_boot.append(ransac_fit_boot.estimator_.coef_[0])
# inference: the bootstrap standard error is the standard deviation of the
# replicated coefficients (np.std default, i.e. ddof=0)
ransac_se = np.std(ransac_param_boot)
ransac_t, ransac_p, ransac_ci_down, ransac_ci_up = _t_inference(ransac_param, ransac_se, df)
# get predictions from ransac
ransac_pred = pd.DataFrame(ransac_fit.predict(X=pd.DataFrame(data.treatment)), index=data.index, columns=['predictions'])

# fit from the sfr
sfr = sf.SFR(linear_model=model, random_state=0)
sfr_scores = sfr.score()
sfr_fit = sfr.fit(n_boot=1000)
# samplefit result containers are indexed exactly as before
sfr_param = sfr_fit.params[1]
sfr_se = sfr_fit.stand_err[1]
sfr_t, sfr_p, sfr_ci_down, sfr_ci_up = _t_inference(sfr_param, sfr_se, df)
# get predictions for sfr
sfr_pred = pd.DataFrame(sfr_fit.fittedvalues, index=data.index, columns=['predictions'])

In [None]:
# save scores
# copy the scores of the labor application: `sfr_scores` is reassigned by the
# later applications, and the copy colours the points in the summary figure
scores_lalonde = sfr_scores.scores.copy()

In [None]:
# one single-row table per estimator, all with the same columns
result_columns = ['Coef.', 'Std.Err.', 't', 'P>|t|', '[0.025', '0.975]']
ols_results = pd.DataFrame(
    dict(zip(result_columns, [ols_param, ols_se, ols_t, ols_p, ols_ci_down, ols_ci_up])),
    index=['OLS'])
huber_results = pd.DataFrame(
    dict(zip(result_columns, [huber_param, huber_se, huber_t, huber_p, huber_ci_down, huber_ci_up])),
    index=['HUBER'])
ransac_results = pd.DataFrame(
    dict(zip(result_columns, [ransac_param, ransac_se, ransac_t, ransac_p, ransac_ci_down, ransac_ci_up])),
    index=['RANSAC'])
sfr_results = pd.DataFrame(
    dict(zip(result_columns, [sfr_param, sfr_se, sfr_t, sfr_p, sfr_ci_down, sfr_ci_up])),
    index=['SFR'])

In [None]:
# stack the four estimator rows into one comparison table
app1_all_results = pd.concat([ols_results, huber_results, ransac_results, sfr_results])
# print the table as LaTeX
app1_latex = app1_all_results.to_latex(caption='Comparison of The Effect of Training Program on Real Earnings')
print(app1_latex)
# write the table to the results folder
# NOTE(review): index=False leaves the estimator names (OLS/HUBER/RANSAC/SFR)
# out of the CSV, so rows are identifiable by order only - confirm intended
app1_all_results.to_csv(path+'/results/labor_results.csv', index=False)

In [None]:
# plot the scores
# reseed NumPy's global RNG before plotting with jitter=True
# (presumably samplefit draws the jitter from it - TODO confirm)
np.random.seed(0)
# output folder and file name are handed to samplefit through path/fname
sfr_scores.plot(xname='treatment', xlabel='Training Program', yname='Real Earnings', figsize=(10,5), dpi=300, s=30, jitter=True,
                path=path+'/figures/', fname='sfr_scores_earnings.png')

In [None]:
# format the plot
# figures['treatment'] is used as a pair: [1] receives the tick labels here,
# [0] is the object that is displayed and later saved
# NOTE(review): set_xticklabels is called without set_xticks, so it assumes the
# plot currently has exactly four x ticks - confirm against the samplefit plot
sfr_scores.figures['treatment'][1].set_xticklabels(['', 'Control', 'Treated', ''])
# bare expression: the notebook renders it as the cell output
sfr_scores.figures['treatment'][0]

In [None]:
# save the plot
# written under the same path and file name that were passed to
# sfr_scores.plot, now with the relabelled x ticks
sfr_scores.figures['treatment'][0].savefig(path+'/figures/sfr_scores_earnings.png')

In [None]:
# SFR annealing for the labor data with share=0.1 and n_boot=1000
# (presumably the number of bootstrap replications, as in sfr.fit - TODO confirm)
sfr_annealing = sfr.anneal(share=0.1, n_boot=1000)
# plot the annealing result; the y-axis range is fixed by hand through ylim
sfr_annealing.plot(xname='treatment', yname='Effect on Real Earnings', xlabel='Training Program', dpi=300, ylim=[-1500, 3900], title='SFR: Annealing',
                   path=path+'/figures/', fname='sfr_annealing_earnings.png')

### Microcredit

In [None]:
# load data from the microcredit RCT (including the PPP standardizer)
data = pd.read_csv(path+'/data/data_profit.csv')
# the conversion factor is the top-left cell of the PPP file
standardizer = pd.read_csv(path+'/data/data_profit_ppp.csv').iloc[0,0]
# standardize the profit data according to USD PPP per 2 weeks;
# bracket assignment always writes the column, whereas attribute assignment
# would silently create a plain attribute if the column were ever missing
data['profit'] = data['profit'] * standardizer
# keep an independent copy for the summary figure, because `data` is
# overwritten when the next application is loaded
data_microcredit = data.copy()

In [None]:
# descriptive statistics, rounded to two decimals and transposed so that
# each variable is one row, printed as a LaTeX table
descriptives = data.describe().round(2).T
print(descriptives.to_latex(caption='Descriptive Statistics for the Microcredit Data'))

In [None]:
# check plot: raw outcome against treatment status
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5), dpi=300)
# horizontal jitter keeps the points of each arm from overplotting;
# seed right before the draw so the figure is reproducible
np.random.seed(0)
jitter = np.random.uniform(-0.1, 0.1, len(data.profit))
ax.scatter(x=data.treatment + jitter, y=data.profit, color='grey', s=30)
ax.title.set_text('Profit by Treatment Status')
ax.set_xlabel('Microcredit Provision')
ax.set_ylabel('Household Profit')
# only the two arms get a label; the outer ticks carry empty labels
ax.set_xticks([-0.5, 0, 1, 1.5])
ax.set_xticklabels(['', 'Control', 'Treated', ''])
plt.show()
fig.savefig(path+'/figures/scatter_profit.png', bbox_inches='tight')

In [None]:
# run OLS, HUBER, RANSAC, SFR


def _t_inference(param, se, dof, alpha=0.05):
    """Return t-statistic, two-sided p-value and confidence bounds.

    Parameters
    ----------
    param : point estimate of the coefficient.
    se : standard error of the estimate.
    dof : degrees of freedom of the reference t-distribution.
    alpha : significance level; the interval covers 1 - alpha.

    Returns
    -------
    Tuple ``(t, p, ci_down, ci_up)``.
    """
    t_stat = param / se
    p_value = stats.t.sf(np.abs(t_stat), dof) * 2
    margin = stats.t.ppf(1 - alpha / 2, dof) * se
    return t_stat, p_value, param - margin, param + margin


# design matrix (constant + treatment), built once and shared by OLS and Huber
exog = pd.DataFrame(sm.add_constant(data.treatment))
n_obs = data.shape[0]

# specify model
model = sm.OLS(endog=data.profit, exog=exog)
# degrees of freedom: observations minus the two estimated coefficients
df = n_obs - 2

# ols fit
ols_fit = model.fit()
# the treatment coefficient is the second entry; .iloc makes the positional
# access explicit instead of relying on [] falling back to positions on a
# label-indexed Series (deprecated in recent pandas)
ols_param = ols_fit.params.iloc[1]
ols_se = ols_fit.bse.iloc[1]
ols_t, ols_p, ols_ci_down, ols_ci_up = _t_inference(ols_param, ols_se, df)
# get predictions for ols
ols_pred = pd.DataFrame(ols_fit.predict(exog), index=data.index, columns=['predictions'])

# fit huber
huber_fit = sm.RLM(endog=data.profit, exog=exog, M=sm.robust.norms.HuberT()).fit()
huber_param = huber_fit.params.iloc[1]
huber_se = huber_fit.bse.iloc[1]
huber_t, huber_p, huber_ci_down, huber_ci_up = _t_inference(huber_param, huber_se, df)
# get predictions for huber
huber_pred = pd.DataFrame(huber_fit.predict(exog), index=data.index, columns=['predictions'])

# fit RANSAC
ransac_fit = RANSACRegressor(random_state=0)
ransac_fit.fit(X=pd.DataFrame(data.treatment), y=data.profit) # intercept is added by default in sklearn
ransac_param = ransac_fit.estimator_.coef_[0]
# get standard error for RANSAC via bootstrapping
ransac_param_boot = []
# loop over
for boot_idx in range(1000):
    # one seed per replication, shared by the resampling and the RANSAC fit
    np.random.seed(boot_idx)
    # get in bag indices (row positions drawn with replacement)
    in_idx = np.random.choice(np.arange(n_obs, dtype=int), size=n_obs, replace=True)
    # in bag observations; .iloc because in_idx holds positions, not index
    # labels, so the resample stays correct for any index of `data`
    endog_in = data.profit.iloc[in_idx]
    exog_in = data.treatment.iloc[in_idx]
    # estimate ransac
    ransac_fit_boot = RANSACRegressor(random_state=boot_idx)
    ransac_fit_boot.fit(X=pd.DataFrame(exog_in), y=endog_in) # intercept is added by default in sklearn
    ransac_param_boot.append(ransac_fit_boot.estimator_.coef_[0])
# inference: the bootstrap standard error is the standard deviation of the
# replicated coefficients (np.std default, i.e. ddof=0)
ransac_se = np.std(ransac_param_boot)
ransac_t, ransac_p, ransac_ci_down, ransac_ci_up = _t_inference(ransac_param, ransac_se, df)
# get predictions from ransac
ransac_pred = pd.DataFrame(ransac_fit.predict(X=pd.DataFrame(data.treatment)), index=data.index, columns=['predictions'])

# fit from the sfr
sfr = sf.SFR(linear_model=model, random_state=0)
sfr_scores = sfr.score()
sfr_fit = sfr.fit(n_boot=1000)
# samplefit result containers are indexed exactly as before
sfr_param = sfr_fit.params[1]
sfr_se = sfr_fit.stand_err[1]
sfr_t, sfr_p, sfr_ci_down, sfr_ci_up = _t_inference(sfr_param, sfr_se, df)
# get predictions for sfr
sfr_pred = pd.DataFrame(sfr_fit.fittedvalues, index=data.index, columns=['predictions'])

In [None]:
# save scores for later plots
# copy the scores of the microcredit application: `sfr_scores` is reassigned
# by the next application, and the copy colours the points in the summary figure
scores_microcredit = sfr_scores.scores.copy()

In [None]:
# one single-row table per estimator, all with the same columns
result_columns = ['Coef.', 'Std.Err.', 't', 'P>|t|', '[0.025', '0.975]']
ols_results = pd.DataFrame(
    dict(zip(result_columns, [ols_param, ols_se, ols_t, ols_p, ols_ci_down, ols_ci_up])),
    index=['OLS'])
huber_results = pd.DataFrame(
    dict(zip(result_columns, [huber_param, huber_se, huber_t, huber_p, huber_ci_down, huber_ci_up])),
    index=['HUBER'])
ransac_results = pd.DataFrame(
    dict(zip(result_columns, [ransac_param, ransac_se, ransac_t, ransac_p, ransac_ci_down, ransac_ci_up])),
    index=['RANSAC'])
sfr_results = pd.DataFrame(
    dict(zip(result_columns, [sfr_param, sfr_se, sfr_t, sfr_p, sfr_ci_down, sfr_ci_up])),
    index=['SFR'])

In [None]:
# stack the four estimator rows into one comparison table
app2_all_results = pd.concat([ols_results, huber_results, ransac_results, sfr_results])
# print the table as LaTeX
app2_latex = app2_all_results.to_latex(caption='Comparison of The Effect of Microcredit Provision on Household Profit')
print(app2_latex)
# write the table to the results folder
# NOTE(review): index=False leaves the estimator names (OLS/HUBER/RANSAC/SFR)
# out of the CSV, so rows are identifiable by order only - confirm intended
app2_all_results.to_csv(path+'/results/microcredit_results.csv', index=False)

In [None]:
# plot the scores
# reseed NumPy's global RNG before plotting with jitter=True
# (presumably samplefit draws the jitter from it - TODO confirm)
np.random.seed(0)
# output folder and file name are handed to samplefit through path/fname
sfr_scores.plot(xname='treatment', xlabel='Microcredit Provision', yname='Household Profit', figsize=(10,5), dpi=300, s=30, jitter=True,
                path=path+'/figures/', fname='sfr_scores_profit.png')

In [None]:
# format the plot
# figures['treatment'] is used as a pair: [1] receives the tick labels here,
# [0] is the object that is displayed and later saved
# NOTE(review): set_xticklabels is called without set_xticks, so it assumes the
# plot currently has exactly four x ticks - confirm against the samplefit plot
sfr_scores.figures['treatment'][1].set_xticklabels(['', 'Control', 'Treated', ''])
# bare expression: the notebook renders it as the cell output
sfr_scores.figures['treatment'][0]

In [None]:
# save the plot
# written under the same path and file name that were passed to
# sfr_scores.plot, now with the relabelled x ticks
sfr_scores.figures['treatment'][0].savefig(path+'/figures/sfr_scores_profit.png')

In [None]:
# SFR annealing for the microcredit data with share=0.1 and n_boot=1000
# (presumably the number of bootstrap replications, as in sfr.fit - TODO confirm)
sfr_annealing = sfr.anneal(share=0.1, n_boot=1000)
# plot the annealing result; no ylim is passed for this application
sfr_annealing.plot(xname='treatment', yname='Effect on Household Profit', xlabel='Microcredit Provision', dpi=300, title='SFR: Annealing',
                   path=path+'/figures/', fname='sfr_annealing_profit.png')

### Charity

In [None]:
# read the charity RCT sample from the data folder
charity_csv = path+'/data/data_charity.csv'
data = pd.read_csv(charity_csv)
# keep an independent copy for the summary figure
data_charity = data.copy()

In [None]:
# descriptive statistics, rounded to two decimals and transposed so that
# each variable is one row, printed as a LaTeX table
descriptives = data.describe().round(2).T
print(descriptives.to_latex(caption='Descriptive Statistics for the Charity Data'))

In [None]:
# check plot: raw outcome against treatment status
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5), dpi=300)
# horizontal jitter keeps the points of each arm from overplotting;
# seed right before the draw so the figure is reproducible
np.random.seed(0)
jitter = np.random.uniform(-0.1, 0.1, len(data.amount))
ax.scatter(x=data.treatment + jitter, y=data.amount, color='grey', s=30)
ax.title.set_text('Donation Amount by Treatment Status')
ax.set_xlabel('Matching Grant')
ax.set_ylabel('Donation Amount')
# only the two arms get a label; the outer ticks carry empty labels
ax.set_xticks([-0.5, 0, 1, 1.5])
ax.set_xticklabels(['', 'Control', 'Treated', ''])
plt.show()
fig.savefig(path+'/figures/scatter_donation.png', bbox_inches='tight')

In [None]:
# run OLS, HUBER, RANSAC, SFR


def _t_inference(param, se, dof, alpha=0.05):
    """Return t-statistic, two-sided p-value and confidence bounds.

    Parameters
    ----------
    param : point estimate of the coefficient.
    se : standard error of the estimate.
    dof : degrees of freedom of the reference t-distribution.
    alpha : significance level; the interval covers 1 - alpha.

    Returns
    -------
    Tuple ``(t, p, ci_down, ci_up)``.
    """
    t_stat = param / se
    p_value = stats.t.sf(np.abs(t_stat), dof) * 2
    margin = stats.t.ppf(1 - alpha / 2, dof) * se
    return t_stat, p_value, param - margin, param + margin


# design matrix (constant + treatment), built once and shared by OLS and Huber
exog = pd.DataFrame(sm.add_constant(data.treatment))
n_obs = data.shape[0]

# specify model
model = sm.OLS(endog=data.amount, exog=exog)
# degrees of freedom: observations minus the two estimated coefficients
df = n_obs - 2

# ols fit
ols_fit = model.fit()
# the treatment coefficient is the second entry; .iloc makes the positional
# access explicit instead of relying on [] falling back to positions on a
# label-indexed Series (deprecated in recent pandas)
ols_param = ols_fit.params.iloc[1]
ols_se = ols_fit.bse.iloc[1]
ols_t, ols_p, ols_ci_down, ols_ci_up = _t_inference(ols_param, ols_se, df)
# get predictions for ols
ols_pred = pd.DataFrame(ols_fit.predict(exog), index=data.index, columns=['predictions'])

# fit huber
huber_fit = sm.RLM(endog=data.amount, exog=exog, M=sm.robust.norms.HuberT()).fit()
huber_param = huber_fit.params.iloc[1]
huber_se = huber_fit.bse.iloc[1]
huber_t, huber_p, huber_ci_down, huber_ci_up = _t_inference(huber_param, huber_se, df)
# get predictions for huber
huber_pred = pd.DataFrame(huber_fit.predict(exog), index=data.index, columns=['predictions'])

# fit RANSAC
ransac_fit = RANSACRegressor(random_state=0)
ransac_fit.fit(X=pd.DataFrame(data.treatment), y=data.amount) # intercept is added by default in sklearn
ransac_param = ransac_fit.estimator_.coef_[0]
# get standard error for RANSAC via bootstrapping
ransac_param_boot = []
# loop over
for boot_idx in range(1000):
    # one seed per replication, shared by the resampling and the RANSAC fit
    np.random.seed(boot_idx)
    # get in bag indices (row positions drawn with replacement)
    in_idx = np.random.choice(np.arange(n_obs, dtype=int), size=n_obs, replace=True)
    # in bag observations; .iloc because in_idx holds positions, not index
    # labels, so the resample stays correct for any index of `data`
    endog_in = data.amount.iloc[in_idx]
    exog_in = data.treatment.iloc[in_idx]
    # estimate ransac
    ransac_fit_boot = RANSACRegressor(random_state=boot_idx)
    ransac_fit_boot.fit(X=pd.DataFrame(exog_in), y=endog_in) # intercept is added by default in sklearn
    ransac_param_boot.append(ransac_fit_boot.estimator_.coef_[0])
# inference: the bootstrap standard error is the standard deviation of the
# replicated coefficients (np.std default, i.e. ddof=0)
ransac_se = np.std(ransac_param_boot)
ransac_t, ransac_p, ransac_ci_down, ransac_ci_up = _t_inference(ransac_param, ransac_se, df)
# get predictions from ransac
ransac_pred = pd.DataFrame(ransac_fit.predict(X=pd.DataFrame(data.treatment)), index=data.index, columns=['predictions'])

# fit from the sfr
sfr = sf.SFR(linear_model=model, random_state=0)
sfr_scores = sfr.score()
sfr_fit = sfr.fit(n_boot=1000)
# samplefit result containers are indexed exactly as before
sfr_param = sfr_fit.params[1]
sfr_se = sfr_fit.stand_err[1]
sfr_t, sfr_p, sfr_ci_down, sfr_ci_up = _t_inference(sfr_param, sfr_se, df)
# get predictions for sfr
sfr_pred = pd.DataFrame(sfr_fit.fittedvalues, index=data.index, columns=['predictions'])

In [None]:
# save scores
# copy the scores of the charity application; the copy colours the points
# in the summary figure
scores_charity = sfr_scores.scores.copy()

In [None]:
# one single-row table per estimator, all with the same columns
result_columns = ['Coef.', 'Std.Err.', 't', 'P>|t|', '[0.025', '0.975]']
ols_results = pd.DataFrame(
    dict(zip(result_columns, [ols_param, ols_se, ols_t, ols_p, ols_ci_down, ols_ci_up])),
    index=['OLS'])
huber_results = pd.DataFrame(
    dict(zip(result_columns, [huber_param, huber_se, huber_t, huber_p, huber_ci_down, huber_ci_up])),
    index=['HUBER'])
ransac_results = pd.DataFrame(
    dict(zip(result_columns, [ransac_param, ransac_se, ransac_t, ransac_p, ransac_ci_down, ransac_ci_up])),
    index=['RANSAC'])
sfr_results = pd.DataFrame(
    dict(zip(result_columns, [sfr_param, sfr_se, sfr_t, sfr_p, sfr_ci_down, sfr_ci_up])),
    index=['SFR'])

In [None]:
# stack the four estimator rows into one comparison table
app3_all_results = pd.concat([ols_results, huber_results, ransac_results, sfr_results])
# print the table as LaTeX
app3_latex = app3_all_results.to_latex(caption='Comparison of The Effect of Matching Grants on Donation Amount')
print(app3_latex)
# write the table to the results folder
# NOTE(review): index=False leaves the estimator names (OLS/HUBER/RANSAC/SFR)
# out of the CSV, so rows are identifiable by order only - confirm intended
app3_all_results.to_csv(path+'/results/charity_results.csv', index=False)

In [None]:
# plot the scores
# reseed NumPy's global RNG before plotting with jitter=True
# (presumably samplefit draws the jitter from it - TODO confirm)
np.random.seed(0)
# output folder and file name are handed to samplefit through path/fname
sfr_scores.plot(xname='treatment', xlabel='Matching Grant', yname='Donation Amount', figsize=(10,5), dpi=300, s=30, jitter=True,
                path=path+'/figures/', fname='sfr_scores_amount.png')

In [None]:
# format the plot
# figures['treatment'] is used as a pair: [1] receives the tick labels here,
# [0] is the object that is displayed and later saved
# NOTE(review): set_xticklabels is called without set_xticks, so it assumes the
# plot currently has exactly four x ticks - confirm against the samplefit plot
sfr_scores.figures['treatment'][1].set_xticklabels(['', 'Control', 'Treated', ''])
# bare expression: the notebook renders it as the cell output
sfr_scores.figures['treatment'][0]

In [None]:
# save the plot
# written under the same path and file name that were passed to
# sfr_scores.plot, now with the relabelled x ticks
sfr_scores.figures['treatment'][0].savefig(path+'/figures/sfr_scores_amount.png')

In [None]:
# SFR annealing for the charity data with share=0.1 and n_boot=1000
# (presumably the number of bootstrap replications, as in sfr.fit - TODO confirm)
sfr_annealing = sfr.anneal(share=0.1, n_boot=1000)
# plot the annealing result; the y-axis range is fixed by hand through ylim
sfr_annealing.plot(xname='treatment', yname='Effect on Donation Amount', xlabel='Matching Grant', dpi=300, ylim=[-0.5, 0.5], title='SFR: Annealing',
                   path=path+'/figures/', fname='sfr_annealing_amount.png')

## Summary

In [None]:
# summary figure: one panel per application, outcome against treatment status,
# points coloured by the saved scores
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize = (22.5, 5), dpi=300) # define the plot layout
fig.subplots_adjust(wspace=0.25)

# The RNG is reseeded once before EVERY panel so that each jitter draw starts
# from seed 0, exactly like the standalone scatter plots of each application.

# lalonde data
np.random.seed(0)
lalonde_plot = ax1.scatter(x=data_lalonde.treatment + np.random.uniform(-0.1, 0.1, len(data_lalonde.re78)),
                           y=data_lalonde.re78, c=scores_lalonde, cmap='RdYlGn', s=30)
ax1.title.set_text("Labor Data")
ax1.set_xlabel('Training Program')
ax1.set_ylabel('Real Earnings')
ax1.set_xticks([-0.5, 0, 1, 1.5])
ax1.set_xticklabels(['', 'Control', 'Treated', ''])
# microcredit data
np.random.seed(0)
microcredit_plot = ax2.scatter(x=data_microcredit.treatment + np.random.uniform(-0.1, 0.1, len(data_microcredit.profit)),
                               y=data_microcredit.profit, c=scores_microcredit, cmap='RdYlGn', s=30)
ax2.title.set_text("Microcredit Data")
ax2.set_xlabel('Microcredit Provision')
ax2.set_ylabel('Household Profit')
ax2.set_xticks([-0.5, 0, 1, 1.5])
ax2.set_xticklabels(['', 'Control', 'Treated', ''])
# add legend: one shared colour legend, anchored to the middle panel and
# stretched (mode="expand") over the box given in bbox_to_anchor
# NOTE(review): the entries come from the microcredit scatter only - confirm
# they cover the score values shown in the other two panels
legend = ax2.legend(*microcredit_plot.legend_elements(), title="Reliability Score",
                    bbox_to_anchor=(-1.25, -0.3, 3.5, .102), loc=3,
                    ncol=12, mode="expand", borderaxespad=0., fancybox=True, shadow=True)
ax2.add_artist(legend)
# charity data
np.random.seed(0)
charity_plot = ax3.scatter(x=data_charity.treatment + np.random.uniform(-0.1, 0.1, len(data_charity.amount)),
                           y=data_charity.amount, c=scores_charity, cmap='RdYlGn', s=30)
ax3.title.set_text("Charity Data")
ax3.set_xlabel('Matching Grant')
ax3.set_ylabel('Donation Amount')
ax3.set_xticks([-0.5, 0, 1, 1.5])
ax3.set_xticklabels(['', 'Control', 'Treated', ''])

plt.show()
fig.savefig(path+'/figures/scores_all.png', bbox_inches='tight')

In [None]:
# stack the OLS and SFR rows of the three applications, in the order
# labor, microcredit, charity
selected = ['OLS', 'SFR']
all_results = pd.concat([app1_all_results.loc[selected, :], app2_all_results.loc[selected, :], app3_all_results.loc[selected, :]])
# print the table as LaTeX
all_latex = all_results.to_latex(caption='Fitting: Comparison for Labor, Microcredit and Charity Data')
print(all_latex)
# write the table to the results folder
# NOTE(review): the row labels repeat (OLS, SFR three times) and index=False
# drops them from the CSV, so the application of each row is identifiable by
# order only - confirm intended
all_results.to_csv(path+'/results/all_emp_results.csv', index=False)