In [2]:
# Activate the project environment located in the current directory, so that
# packages are resolved from this project's Project.toml.
# NOTE(review): requires `Pkg` to be in scope (e.g. `import Pkg` in an earlier
# cell that is not visible here) — confirm.
Pkg.activate(".")

[32m[1mActivating[22m[39m environment at `~/Documents/Stanford/Stefan/EBCrossFitPaper/Project.toml`


In [3]:
using EBayes
using EBayesDatasets
import JuliaDB
using StatsBase
import DataFrames

┌ Info: Recompiling stale cache file /Users/ignatiad/.julia/compiled/v1.2/EBayes/AL9IL.ji for EBayes [bad9efff-1a8e-41fb-9e7d-5d6f530fb0a3]
└ @ Base loading.jl:1240
┌ Info: Recompiling stale cache file /Users/ignatiad/.julia/compiled/v1.2/EBayesDatasets/PGIhc.ji for EBayesDatasets [030c8bae-559e-11e9-37f6-d9ac2af9a810]
└ @ Base loading.jl:1240


Load the MovieLens dataset. The data preprocessing is done in the [EBayesDatasets.jl](https://github.com/nignatiadis/EBayesDatasets.jl) package.

In [4]:
# Build the MovieLens dataset object; the preprocessing lives in EBayesDatasets.
# NOTE(review): presumably `n_min_train` is a minimum number of training ratings
# per movie and `prop_test` controls the train/test split — confirm in EBayesDatasets.
mv_lens = EBayesDatasets.MovieLens(n_min_train = 3, prop_test = 0.1);

│   caller = EBayesDatasets.MovieLens(::Int64, ::Int64, ::Int64, ::Float64) at movielens.jl:54
└ @ EBayesDatasets /Users/ignatiad/.julia/dev/EBayesDatasets/src/movielens.jl:54
│     df[!, col_ind] = v
│     df
│ end` instead.
│   caller = EBayesDatasets.MovieLens(::Int64, ::Int64, ::Int64, ::Float64) at movielens.jl:139
└ @ EBayesDatasets /Users/ignatiad/.julia/dev/EBayesDatasets/src/movielens.jl:139


## Pulp Fiction and Urban Justice

In the introduction of the paper we mention the average ratings of these two movies, so let us compute them:

In [5]:
# Extract the `:title` column of the movie table; it is searched by the
# title lookups in the following cells.
movie_titles = JuliaDB.select(mv_lens.movie_df, :title);
# Number of movies in the table (shown as the cell output).
length(movie_titles)

12481

In [6]:
# Index of the first movie whose title contains "Pulp".
# `findfirst` with an `occursin` predicate stops at the first hit and avoids
# comparing `match` results against `nothing` with `!=`.
pulp_fiction_idx = findfirst(title -> occursin(r"Pulp", title), movie_titles)
# Fail with a clear message if no title matches.
pulp_fiction_idx === nothing && error("no movie title matches \"Pulp\"")
# Row of the movie table for that movie (shown as the cell output).
pulp_fiction = mv_lens.movie_df[pulp_fiction_idx]

(movieId = 296,
 title = "Pulp Fiction (1994)",
 genres = "Comedy|Crime|Drama|Thriller",
 year = 1994,
 test_mean = 4.1734962f0,
 test_n = 60641,
 test_sd = 0.9773011f0,
 train_mean = 4.1809115f0,
 train_n = 6669,
 train_sd = 0.96609414f0,)

In [7]:
# Pool the test and train means, weighting each by its number of ratings,
# to obtain the overall average rating of the movie.
pulp_fiction_avg = mean(
    [pulp_fiction.test_mean, pulp_fiction.train_mean],
    Weights([pulp_fiction.test_n, pulp_fiction.train_n]),
)
# Report the average with one decimal digit.
round(Float64(pulp_fiction_avg), digits=1)

4.2

In [8]:
# Index of the first movie whose title contains "Urban Justice".
# `findfirst` with an `occursin` predicate stops at the first hit and avoids
# comparing `match` results against `nothing` with `!=`.
urban_justice_idx = findfirst(title -> occursin(r"Urban Justice", title), movie_titles)
# Fail with a clear message if no title matches.
urban_justice_idx === nothing && error("no movie title matches \"Urban Justice\"")
# Row of the movie table for that movie (shown as the cell output).
urban_justice = mv_lens.movie_df[urban_justice_idx]

(movieId = 66652,
 title = "Urban Justice (2007)",
 genres = "Action",
 year = 2007,
 test_mean = 1.5454545f0,
 test_n = 11,
 test_sd = 0.9862693f0,
 train_mean = 1.8333334f0,
 train_n = 3,
 train_sd = 0.28867513f0,)

In [9]:
# Pool the test and train means, weighting each by its number of ratings,
# to obtain the overall average rating of the movie.
urban_justice_avg = mean(
    [urban_justice.test_mean, urban_justice.train_mean],
    Weights([urban_justice.test_n, urban_justice.train_n]),
)
# Report the average with one decimal digit.
round(Float64(urban_justice_avg), digits=1)

1.6

# Set up EBCrossFit with XGBoost

In [10]:
using MLJ
using MLJBase

In [11]:
# Bring the XGBoost regressor into scope and create the base learner.
MLJ.@load XGBoostRegressor
xgb_tree = XGBoostRegressor(max_depth = 5)

# Hyperparameter ranges to tune over: `num_round` and `eta`.
r_num_round = range(xgb_tree, :num_round; lower = 2, upper = 100)
r_eta = range(xgb_tree, :eta; lower = 0.01, upper = 1.0)
nested_ranges = [r_num_round, r_eta]

# Self-tuning wrapper: grid search over the ranges above, evaluated with
# 5-fold cross-validation and the `rms` measure.
tuned_XGBoost = TunedModel(;
    model = xgb_tree,
    tuning = Grid(resolution = 8),
    resampling = CV(nfolds = 5),
    ranges = nested_ranges,
    measure = rms,
);



In [12]:
# Wrap the self-tuning XGBoost model in an EBayesCrossFit estimator.
# NOTE(review): the printed object carries a trailing Int64 field equal to 5 —
# presumably a default number of folds; confirm in EBayes.
ebcf_xgb = EBayesCrossFit(tuned_XGBoost)

EBayesCrossFit{MLJ.DeterministicTunedModel{Grid,XGBoostRegressor},Int64}([34mDeterministicTunedModel @ 1…71[39m, 5)

# Evaluation of methods

In [13]:
# Z_i and σ_i calculated based on 10% of users
ss = mv_lens.Zs_train
# average movie ratings based on the other 90% of users
ground_truth = mv_lens.Zs_test.Z;

In [14]:
# Display the training samples (one NormalSample per movie).
ss

12481-element StructArray(::Array{Float64,1}, ::Array{Float64,1}) with eltype NormalSample{Float64}:
 NormalSample{Float64}(3.915266275405884, 0.01341832039868122)  
 NormalSample{Float64}(3.2022109031677246, 0.0201176920943941)  
 NormalSample{Float64}(3.15059757232666, 0.02645977858585608)   
 NormalSample{Float64}(2.8026819229125977, 0.058021348063866655)
 NormalSample{Float64}(3.091531276702881, 0.027415793044111868) 
 NormalSample{Float64}(3.8189802169799805, 0.019402497795562867)
 NormalSample{Float64}(3.3526148796081543, 0.026386293639319304)
 NormalSample{Float64}(3.0565216541290283, 0.08740960314020435) 
 NormalSample{Float64}(3.0024330615997314, 0.046236734874957754)
 NormalSample{Float64}(3.431675910949707, 0.017478881086919393) 
 NormalSample{Float64}(3.651515245437622, 0.021805049915141168) 
 NormalSample{Float64}(2.6008522510528564, 0.04996164588421241) 
 NormalSample{Float64}(3.316793918609619, 0.08189783492788894)  
 ⋮                                                    

### Apply unbiased predictor (MLE)

In [26]:
# The unbiased predictor uses each movie's observed training mean as is.
unbiased_preds = ss.Z
# Per-movie squared error against the held-out average ratings.
unbiased_errors = abs2.(unbiased_preds .- ground_truth);

In [27]:
# Mean squared error of the unbiased predictor over all movies.
# The per-movie squared errors are stored in `unbiased_errors`.
unbiased_mse = mean(unbiased_errors)

0.09830188086050119

### Apply SURE predictor

In [28]:
# Fit the SURE estimator with a grand-mean location to the training samples.
sure_fit = fit(Normal(), SURE(), GrandMeanLocation(), ss)
# Predictions of the fitted estimator.
sure_preds = predict(sure_fit);
# Per-movie squared error against the held-out average ratings.
sure_errors = abs2.(sure_preds .- ground_truth);

In [29]:
# Mean squared error of the SURE predictions over all movies.
sure_mse = mean(sure_errors)

0.0609882885478651

### Apply EBCF and XGBoost predictors

In [21]:
# Covariate table: the first 20 columns of the movie features, with every
# column converted to a Vector{Float64}.
X = DataFrames.mapcols(
    col -> convert(Vector{Float64}, col),
    copy(mv_lens.X_df)[:, 1:20],
)
# List the covariate names (shown as the cell output).
names(X)

20-element Array{Symbol,1}:
 :year       
 :n          
 :Action     
 :Adventure  
 :Animation  
 :Children   
 :Comedy     
 :Crime      
 :Documentary
 :Drama      
 :Fantasy    
 :FilmNoir   
 :Horror     
 :Musical    
 :Mystery    
 :Romance    
 :SciFi      
 :Thriller   
 :War        
 :Western    

In [22]:
using Random

# Seed the global RNG before fitting.
Random.seed!(0)
# Fit EBCF with the tuned XGBoost learner on the covariates and training samples.
ebcf_fit = fit(ebcf_xgb, X, ss; verbosity = 1);

┌ Info: Training [34mMachine{DeterministicTunedModel} @ 3…60[39m.
└ @ MLJ /Users/ignatiad/.julia/packages/MLJ/K371Q/src/machines.jl:141
┌ Info: Mimimizing rms. 
└ @ MLJ /Users/ignatiad/.julia/packages/MLJ/K371Q/src/tuning.jl:194
┌ Info: Training best model on all supplied data.
└ @ MLJ /Users/ignatiad/.julia/packages/MLJ/K371Q/src/tuning.jl:287
┌ Info: Training [34mMachine{DeterministicTunedModel} @ 3…60[39m.
└ @ MLJ /Users/ignatiad/.julia/packages/MLJ/K371Q/src/machines.jl:141
┌ Info: Mimimizing rms. 
└ @ MLJ /Users/ignatiad/.julia/packages/MLJ/K371Q/src/tuning.jl:194
┌ Info: Training best model on all supplied data.
└ @ MLJ /Users/ignatiad/.julia/packages/MLJ/K371Q/src/tuning.jl:287
┌ Info: Training [34mMachine{DeterministicTunedModel} @ 3…60[39m.
└ @ MLJ /Users/ignatiad/.julia/packages/MLJ/K371Q/src/machines.jl:141
┌ Info: Mimimizing rms. 
└ @ MLJ /Users/ignatiad/.julia/packages/MLJ/K371Q/src/tuning.jl:194
┌ Info: Training best model on all supplied data.
└ @ MLJ /Users/ignati

In [23]:
# Regression predictions stored on the EBCF fit (the XGBoost-only predictor).
xgboost_preds = ebcf_fit.reg_preds
# Per-movie squared error against the held-out average ratings.
xgboost_errors = abs2.(xgboost_preds .- ground_truth)
# Mean squared error over all movies (shown as the cell output).
mean(xgboost_errors)

0.15046060397127844

In [24]:
# Predictions of the fitted EBCF estimator.
ebcf_preds = predict(ebcf_fit)
# Per-movie squared error against the held-out average ratings.
ebcf_errors = abs2.(ebcf_preds .- ground_truth);
# Mean squared error over all movies (shown as the cell output).
mean(ebcf_errors)

0.05472779360220277

## Calculate MSEs for all methods across all movies

In [36]:
# Number of movies, i.e. the number of training samples; used below as the
# sample size in the standard errors.
N_movies = length(ss)

12481

In [57]:
# One row per method: the mean squared error over all movies and twice its
# standard error (std of the squared errors divided by sqrt of the sample size).
all_mses_df = DataFrames.DataFrame([
    (method = label, mse = mean(errs), mse_2se = 2 * std(errs) / sqrt(N_movies))
    for (label, errs) in (
        ("Unbiased", unbiased_errors),
        ("XGBoost", xgboost_errors),
        ("SURE", sure_errors),
        ("EBCF", ebcf_errors),
    )
])

# Round both numeric columns to three digits for display.
for col in (:mse, :mse_2se)
    all_mses_df[!, col] = round.(all_mses_df[!, col], digits=3)
end

all_mses_df

Unnamed: 0_level_0,method,mse,mse_2se
Unnamed: 0_level_1,String,Float64,Float64
1,Unbiased,0.098,0.005
2,XGBoost,0.15,0.005
3,SURE,0.061,0.002
4,EBCF,0.055,0.002


## Calculate MSEs for horror & sci-fi movies

In [77]:
# Indices of the movies flagged as both Horror and SciFi in the covariates.
horror_plus_scifi_idx = let is_horror = X[:, :Horror] .== 1, is_scifi = X[:, :SciFi] .== 1
    findall(is_horror .& is_scifi)
end
# Number of such movies (shown as the cell output).
N_hs = length(horror_plus_scifi_idx)

253

In [76]:
# Show the first horror & sci-fi movie as an example.
mv_lens.movie_df[first(horror_plus_scifi_idx)]

(movieId = 196,
 title = "Species (1995)",
 genres = "Horror|Sci-Fi",
 year = 1995,
 test_mean = 2.8511827f0,
 test_n = 12260,
 test_sd = 0.97894526f0,
 train_mean = 2.8421052f0,
 train_n = 1368,
 train_sd = 0.9887387f0,)

In [78]:
# Restrict each method's per-movie squared errors to the horror & sci-fi movies.
unbiased_errors_hs = unbiased_errors[horror_plus_scifi_idx]
xgboost_errors_hs = xgboost_errors[horror_plus_scifi_idx]
sure_errors_hs = sure_errors[horror_plus_scifi_idx]
ebcf_errors_hs = ebcf_errors[horror_plus_scifi_idx]

# One row per method: the mean squared error on this subset and twice its
# standard error (std of the squared errors divided by sqrt of the subset size).
hs_mses_df = DataFrames.DataFrame([
    (method = label, mse = mean(errs), mse_2se = 2 * std(errs) / sqrt(N_hs))
    for (label, errs) in (
        ("Unbiased", unbiased_errors_hs),
        ("XGBoost", xgboost_errors_hs),
        ("SURE", sure_errors_hs),
        ("EBCF", ebcf_errors_hs),
    )
])

# Round both numeric columns to three digits for display.
for col in (:mse, :mse_2se)
    hs_mses_df[!, col] = round.(hs_mses_df[!, col], digits=3)
end

hs_mses_df

Unnamed: 0_level_0,method,mse,mse_2se
Unnamed: 0_level_1,String,Float64,Float64
1,Unbiased,0.098,0.032
2,XGBoost,0.21,0.036
3,SURE,0.064,0.018
4,EBCF,0.051,0.012
