# Replication Notebook for Illustration: Sample Fit Reliability

Gabriel Okasa and Kenneth A. Younge

In [None]:
# get current working directory
# os.getcwd() yields the same string as the IPython-only `%pwd` line magic,
# but is plain Python, so this cell also runs outside an IPython kernel
import os
path = os.getcwd()

## Libraries

In [None]:
# Python version 3.8.8
import samplefit as sf # version 0.3.1
import statsmodels.api as sm # version 0.12.2
import pandas as pd # version 1.3.5
import numpy as np # version 1.22.0
import matplotlib as mpl # version 3.4.2
import matplotlib.pyplot as plt

from scipy import stats # version 1.7.2

# turn off future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Boston Housing Dataset

In [None]:
# obtain the "Boston" dataset of the R package "MASS" through statsmodels
boston = sm.datasets.get_rdataset("Boston", "MASS")
# outcome: per capita crime rate
Y = boston.data['crim']
# regressor: % lower status of the population, with a constant column added
X = sm.add_constant(boston.data['lstat'])

In [None]:
# descriptive statistics of both variables, rounded to two decimals and
# transposed so that each variable is one row, printed as a LaTeX table
descriptives = round(boston.data[['crim', 'lstat']].describe(), 2).T
print(descriptives.to_latex(caption='Descriptive Statistics for the Boston Dataset'))

In [None]:
# visual check of the raw data: crime rate against lower status share
fig, ax = plt.subplots(figsize=(10, 5), dpi=300)  # one axes; 1x1 is the default grid
ax.scatter(
    x=boston.data['lstat'],
    y=boston.data['crim'],
    color='grey',
    s=30,
)
# title and axis labels
ax.title.set_text('Crime Rate by Social Status')
ax.set(
    xlabel='% Lower Status of the Population',
    ylabel='Per Capita Crime Rate',
)
# display first, then write the figure to the figures folder below the working directory
plt.show()
fig.savefig(path + '/figures/scatter_boston.png', bbox_inches='tight')

### Assess Model Fit

In [None]:
# specify the linear model: Y regressed on X (X already contains the constant)
model = sm.OLS(Y, X)
# estimate the model
model_fit = model.fit()
# last expression of the cell, so the notebook renders the summary
model_fit.summary()

In [None]:
# show floats with four decimals in pandas output
pd.set_option('display.float_format', '{:.4f}'.format)
# second table of the alternative summary (index 1), printed as LaTeX
ols_coef_table = model_fit.summary2().tables[1]
print(ols_coef_table.to_latex())

### Assess Sample Reliability

In [None]:
# build the SFR object from the (unfitted) OLS specification with a fixed seed
sample = sf.SFR(linear_model=model, random_state=0)
# score the sample
sample_scores = sample.score()
# plot the scores; path/fname presumably control where the figure is written
sample_scores.plot(
    yname='Per Capita Crime Rate',
    xname='lstat',
    xlabel='% Lower Status of the Population',
    dpi=300,
    title='SFR: Scoring',
    path=path+'/figures/',
    fname='sfr_scores_boston.png',
)

### Assess Sample Sensitivity

In [None]:
# run the annealing step with share=0.1 and 1000 bootstrap replications
# (n_boot presumably is the number of bootstrap draws)
sample_annealing = sample.anneal(share=0.1, n_boot=1000)
# plot the annealing result; path/fname presumably control where the figure is written
sample_annealing.plot(
    xname='lstat',
    xlabel='% Lower Status of the Population',
    dpi=300,
    ylim=[0.125,0.75],
    title='SFR: Annealing',
    path=path+'/figures/',
    fname='sfr_annealing_boston.png',
)

### Assess Sample Fit

In [None]:
# SFR fit with n_boot=1000 (presumably the number of bootstrap replications)
sample_fit = sample.fit(
    n_boot=1000,
)
# request the summary as a table object so it can be exported below
sample_fit_summary = sample_fit.summary(
    get_table=True,
)

In [None]:
# get latex
pd.set_option('display.float_format', '{:.4f}'.format)
print(sample_fit_summary.to_latex(caption='SFR: Fitting'))
# save
# keep the index in the CSV: it carries the coefficient names (rows of this
# table are selected by the label 'lstat'), and index=False would write
# unlabeled rows
sample_fit_summary.to_csv(path+'/results/boston_results.csv', index=True)

## Summary

In [None]:
# get latex
pd.set_option('display.float_format', '{:.4f}'.format)
# OLS row: the 'lstat' line of the statsmodels coefficient table, relabeled 'OLS'
ols_row = model_fit.summary2().tables[1].loc[['lstat'], :].rename(index={'lstat': 'OLS'})
# SFR row: the 'lstat' line of the SFR table, relabeled 'SFR', with the
# coefficient and standard error columns renamed to the statsmodels headers
sfr_row = sample_fit_summary.loc[['lstat'], :].rename(
    columns={'coef': 'Coef.', 'std err': 'Std.Err.'},
    index={'lstat': 'SFR'},
)
# stack both estimates into one comparison table
summary_all = pd.concat([ols_row, sfr_row])
print(summary_all.to_latex(caption='Fitting - Boston Housing Data'))
# save
# keep the index in the CSV: it holds the 'OLS' / 'SFR' labels, without which
# the two rows cannot be told apart
summary_all.to_csv(path+'/results/all_boston_results.csv', index=True)