## Load in packages and set up environment

In [1]:
using Pkg
Pkg.activate(".")

[32m[1mActivating[22m[39m environment at `~/Documents/Stanford/Stefan/EBCrossFitPaper/Project.toml`


In [2]:
using EBayes
using EBayesDatasets

┌ Info: Recompiling stale cache file /Users/ignatiad/.julia/compiled/v1.2/EBayesDatasets/PGIhc.ji for EBayesDatasets [030c8bae-559e-11e9-37f6-d9ac2af9a810]
└ @ Base loading.jl:1240


In [6]:
using DataDeps
using CSV
using DataFrames
using Random
using Query

## Load dataset and preprocess

In [10]:
# Resolve the local directory of the "communities-and-crime" data dependency.
crime_path = datadep"communities-and-crime"
# The raw file is read with `header=false`, so columns get default names
# (Column1, Column2, ...) until they are renamed.
crime_df = DataFrame(
    CSV.File("$crime_path/CommViolPredUnnormalizedData.txt"; header=false)
)
first(crime_df, 6)

Unnamed: 0_level_0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8
Unnamed: 0_level_1,String,String,String,String,Int64,Int64,Float64,Float64
1,BerkeleyHeightstownship,NJ,39,5320,1,11980,3.1,1.37
2,Marpletownship,PA,45,47616,1,23123,2.82,0.8
3,Tigardcity,OR,?,?,1,29344,2.43,0.74
4,Gloversvillecity,NY,35,29443,1,16656,2.4,1.7
5,Bemidjicity,MN,7,5068,1,11245,2.76,0.53
6,Springfieldcity,MO,?,?,1,140494,2.45,2.51


In [16]:
# Read the column names (one per row, no header) from the path provided by
# EBayesDatasets and apply them, in order, to the columns of `crime_df`.
crime_colnames = DataFrame(
    CSV.File(EBayesDatasets.crime_colnames_path; header=false)
)
# NOTE(review): `names!` is deprecated in newer DataFrames releases in favour of
# `rename!`; kept here to match the environment this notebook was run in.
names!(crime_df, map(Symbol, crime_colnames[:, 1]))
first(crime_colnames, 6)

Unnamed: 0_level_0,Column1
Unnamed: 0_level_1,String
1,communityname
2,state
3,countyCode
4,communityCode
5,fold
6,population


In [27]:
# Step 1: keep only the rows whose `nonViolPerPop` entry is not the "?" placeholder
# string.
crime_df_filt = @from i in crime_df begin
                @where i.nonViolPerPop != "?"
                @select i
                @collect DataFrame
    end

# Step 2: `nonViolPerPop` is compared against a string above, i.e. it holds text;
# parse each remaining entry into a Float64 so it can be used numerically.
crime_df_filt = crime_df_filt |>
                @mutate(nonViolPerPop = parse.(Float64, _.nonViolPerPop)) |>
                DataFrame; 
# (rows, columns) remaining after the filter
size(crime_df_filt)

(2118, 147)

Non-violent crime rate is the quantity we will try to predict:

In [29]:
# `nonViolPerPop` is divided by 100_000 to obtain a per-person rate.
crime_rate = crime_df_filt.nonViolPerPop ./ 100_000
# (value, index) of the largest and of the smallest rate
(findmax(crime_rate), findmin(crime_rate))

((0.2711976, 1817), (0.0011679000000000001, 373))

Let us also record the population of every community and its (rounded) number of non-violent crimes.

In [34]:
# Population of every community (copied out of the data frame).
npopulation = crime_df_filt[:, :population]
# Integer crime count per community: per-person rate times population, rounded to
# the nearest integer.
ncrimes = map(crime_df_filt[:, :nonViolPerPop], npopulation) do rate_per_100k, pop
    return Int64(round(rate_per_100k / 100_000 * pop))
end;
extrema(ncrimes)

(14, 451432)

## Set up XGBoost and EBCF models

In [24]:
using MLJ
using MLJBase

In [25]:
# Base learner: XGBoost regressor with trees of depth at most 5.
MLJ.@load XGBoostRegressor
xgb_tree = XGBoostRegressor(; max_depth=5)

# Hyperparameter ranges searched during tuning: number of boosting rounds and the
# learning rate `eta`.
r_num_round = range(xgb_tree, :num_round; lower=2, upper=100)
r_eta = range(xgb_tree, :eta; lower=0.01, upper=1.0)
nested_ranges = [r_num_round, r_eta]

# Self-tuning wrapper: grid search (resolution 8) over the ranges above, scored by
# RMS error under 5-fold cross-validation.
tuned_XGBoost = TunedModel(;
    model=xgb_tree,
    ranges=nested_ranges,
    tuning=Grid(; resolution=8),
    resampling=CV(; nfolds=5),
    measure=rms,
);



In [98]:
# Wrap the self-tuning XGBoost model in an EBayesCrossFit estimator.
# NOTE(review): the displayed object carries a second field equal to 5 —
# presumably the default number of cross-fitting folds; confirm in EBayes.
ebcf_xgb = EBayesCrossFit(tuned_XGBoost)

EBayesCrossFit{MLJ.DeterministicTunedModel{Grid,XGBoostRegressor},Int64}([34mDeterministicTunedModel @ 1…93[39m, 5)

## Subsample dataset and apply methods

In [35]:
using Distributions

In [89]:
# Number of individuals drawn (without replacement) from each community when
# subsampling below.
B_subsample_200 = 200

200

In [58]:
# Hypergeometric(s, f, n): s successes, f failures, n draws without replacement.
# P(k successes) = C(s, k) * C(f, n - k) / C(s + f, n)
#
# Here, for every community:
#   s = ncrimes                 (number of non-violent crimes),
#   f = npopulation - ncrimes   (remainder of the population),
#   n = B_subsample_200         (subsample size),
# so each draw is the number of crimes seen in a subsample of size B_subsample_200.

# Fixed seed so the subsample is reproducible.
Random.seed!(1)
subsampled_crimes_200 = rand.( Hypergeometric.(ncrimes, npopulation .- ncrimes, B_subsample_200))
# Observed crime rate within each subsample.
subsampled_crime_rate_200 = subsampled_crimes_200 ./ B_subsample_200;

In [90]:
# Square-root (variance-stabilizing) transform of the subsampled rates, paired with
# a constant standard deviation sqrt(1 / (4 * B)) for every community.
vst_crimes_200 = NormalSamples(
    sqrt.(subsampled_crime_rate_200),
    fill(sqrt(1 / B_subsample_200 / 4), length(subsampled_crime_rate_200)),
);

In [91]:
# Sanity check of the variance stabilization: the variance of the transformed
# observations around sqrt(true rate), rescaled by 4 * B, should be close to 1.
var( vst_crimes_200.Z .- sqrt.(crime_rate ))*4*B_subsample_200

1.0485465098587923

## Apply methods that do not use covariates

In [92]:
# Baseline without shrinkage: use the transformed observations themselves as
# predictions (unbiased on the square-root scale).
unbiased_preds_200 = vst_crimes_200.Z
# Square the predictions to return to the rate scale; the factor 1_000_000 puts the
# squared error on the crimes-per-1_000-population scale.
unbiased_errors_200 = @. (unbiased_preds_200^2 - crime_rate)^2 * 1_000_000
mean(unbiased_errors_200)

223.8764867761662

In [93]:
# Covariate-free fit on the transformed observations using the SURE() and
# GrandMeanLocation() options from EBayes.
# NOTE(review): presumably shrinks every observation toward the grand mean with a
# SURE-tuned amount of shrinkage — confirm against the EBayes documentation.
sure_fit_200 = fit(Normal(), SURE(), GrandMeanLocation(), vst_crimes_200)
# Predictions from the fitted model (on the square-root scale, like the inputs).
sure_preds_200 = predict(sure_fit_200);

In [94]:
# Squared error of the back-transformed SURE predictions; the factor 1_000_000 puts
# it on the crimes-per-1_000-population scale.
sure_errors_200 = @. (sure_preds_200^2 - crime_rate)^2 * 1_000_000
mean(sure_errors_200)

184.19454193213878

## Apply covariate-powered methods

In [95]:
# Candidate covariates: columns 6 through 129 of the filtered data.
X = crime_df_filt[:, 6:129];
# Keep only the columns stored as plain Vector{Float64}; all other column types
# are dropped.
X_sub = X[:, findall([col isa Vector{Float64} for col in eachcol(X)])]
names(X_sub)

74-element Array{Symbol,1}:
 :householdsize        
 :racepctblack         
 :racePctWhite         
 :racePctAsian         
 :racePctHisp          
 :agePct12t21          
 :agePct12t29          
 :agePct16t24          
 :agePct65up           
 :pctUrban             
 :pctWWage             
 :pctWFarmSelf         
 :pctWInvInc           
 ⋮                     
 :MedRentPctHousInc    
 :MedOwnCostPctInc     
 :MedOwnCostPctIncNoMtg
 :PctForeignBorn       
 :PctBornSameState     
 :PctSameHouse85       
 :PctSameCity85        
 :PctSameState85       
 :LandArea             
 :PopDens              
 :PctUsePubTrans       
 :LemasPctOfficDrugUn  

In [96]:
# Sanity check on the selected covariates: the mean of `PctUsePubTrans` is expected
# to be roughly 3.04.
mean(X_sub[:, :PctUsePubTrans]) #~3.041

3.0503305004721435

In [None]:
# Seed the RNG before fitting.
# NOTE(review): presumably so that fold assignment / tuning randomness is
# reproducible — confirm.
Random.seed!(1)
# Fit the cross-fitted model with the tuned XGBoost learner on the covariates
# `X_sub` and the transformed observations; verbosity=1 prints training progress.
ebcf_fit_200 = fit(ebcf_xgb, X_sub, vst_crimes_200, verbosity=1);

┌ Info: Training [34mMachine{DeterministicTunedModel} @ 1…50[39m.
└ @ MLJ /Users/ignatiad/.julia/packages/MLJ/K371Q/src/machines.jl:141
┌ Info: Mimimizing rms. 
└ @ MLJ /Users/ignatiad/.julia/packages/MLJ/K371Q/src/tuning.jl:194

# Repeat analysis above with B=500 subsampling

In [None]:
# Subsample size for the second experiment.
B_subsample_500 = 500
# Same seed as in the B=200 experiment, so the draw is reproducible.
Random.seed!(1)
# For every community, draw the number of non-violent crimes observed among
# B_subsample_500 individuals sampled without replacement
# (successes = ncrimes, failures = npopulation - ncrimes).
subsampled_crimes_500 = rand.( Hypergeometric.(ncrimes, npopulation .- ncrimes, B_subsample_500))
# Observed crime rate within each subsample.
subsampled_crime_rate_500 = subsampled_crimes_500 ./ B_subsample_500;

In [None]:
# Square-root (variance-stabilizing) transform of the B=500 subsampled rates, paired
# with a constant standard deviation sqrt(1 / (4 * B)) for every community.
# Fix: the original read `subsampled_crime_rate`, which is not defined for this
# experiment; the B=500 rates live in `subsampled_crime_rate_500`.
vst_crimes_500 = NormalSamples(
    sqrt.(subsampled_crime_rate_500),
    fill(sqrt(1 / B_subsample_500 / 4), length(subsampled_crime_rate_500)),
);
# Sanity check of the variance stabilization: rescaled by 4 * B, the variance around
# sqrt(true rate) should be close to 1.
var(vst_crimes_500.Z .- sqrt.(crime_rate)) * 4 * B_subsample_500

In [None]:
# Baseline without shrinkage: the transformed observations themselves
# (unbiased on the square-root scale).
unbiased_preds_500 = vst_crimes_500.Z
# Back-transform by squaring; the factor 1_000_000 puts the squared error on the
# crimes-per-1_000-population scale.
unbiased_errors_500 = @. (unbiased_preds_500^2 - crime_rate)^2 * 1_000_000
@show mean(unbiased_errors_500)

# Covariate-free fit with the SURE() and GrandMeanLocation() options, then its
# predictions on the square-root scale.
sure_fit_500 = fit(Normal(), SURE(), GrandMeanLocation(), vst_crimes_500)
sure_preds_500 = predict(sure_fit_500);

# Same error metric for the SURE predictions.
sure_errors_500 = @. (sure_preds_500^2 - crime_rate)^2 * 1_000_000
mean(sure_errors_500)