In [3]:
%%capture
import stata_setup, os
if os.name == 'nt':
    stata_setup.config('C:/Program Files/Stata17/','mp')
else:
    stata_setup.config('/usr/local/stata17','mp')

## Resampling Methods

In [4]:
%%stata -qui

* Load the data and give the outcome variable a short name
use "../data/data", clear
rename log_flesch_kincaid_grade_level FKG

* One indicator per distinct value: y_1, y_2, ... for year; c_1, c_2, ... for cluster
quietly tabulate year,    generate(y_)
quietly tabulate cluster, generate(c_)

* Journal indicators (no AER term is listed, so AER is presumably the base category)
local journals ecm jpe qje res

* JEL indicators (no d_imp term is listed, so JEL code D is presumably the base category)
local jel_imp a_imp b_imp c_imp e_imp f_imp g_imp h_imp i_imp j_imp ///
              k_imp l_imp m_imp n_imp o_imp p_imp q_imp r_imp y_imp z_imp




### Cross-Validation
#### Validation Set Approach

In [5]:
%%stata -qui
* Random 80/20 split of the observations; rseed(42) fixes the random assignment
splitsample, generate(sample) split(0.8 0.2) rseed(42)

* Attach readable names to the two values of sample
label define slabel 1 "Training"
label define slabel 2 "Validation", add
label values sample slabel




In [6]:
# Frequency table of the split indicator `sample` (Training vs. Validation)
%stata tabulate sample


     sample |      Freq.     Percent        Cum.
------------+-----------------------------------
   Training |      3,990       79.99       79.99
 Validation |        998       20.01      100.00
------------+-----------------------------------
      Total |      4,988      100.00


In [7]:
%%stata -qui
* OLS fit restricted to sample==1, then saved under the name "ols"
quietly regress FKG log_num_authors log_num_pages both_genders prop_women ///
        `journals' `jel_imp' y_2-y_20  c_2-c_215  jel_flag                ///
        if sample==1
estimates store ols




In [8]:
# Goodness of fit of the stored estimates `ols`, reported separately for each value of `sample`
%stata lassogof ols, over(sample)


Penalized coefficients
-------------------------------------------------------------
Name             sample |         MSE    R-squared        Obs
------------------------+------------------------------------
ols                     |
               Training |    .0261068       0.0990      3,990
             Validation |    .0254966      -0.0304        998
-------------------------------------------------------------


#### Leave-One-Out Cross-Validation

One needs to install the user-written package ```cv_regress``` by issuing the command ```ssc install cv_regress``` before executing the following code:

In [9]:
%%stata -qui
* Same specification as before, but with no if-condition (all observations)
quietly regress FKG log_num_authors log_num_pages both_genders prop_women ///
        `journals' `jel_imp' y_2-y_20  c_2-c_215  jel_flag




In [10]:
# Run the user-written cv_regress command (presumably it uses the regression fitted in the previous cell -- confirm)
%stata cv_regress



Leave-One-Out Cross-Validation Results 
-----------------------------------------
         Method          |    Value
-------------------------+---------------
Root Mean Squared Errors |       0.1692
Log Mean Squared Errors  |      -3.5528
Mean Absolute Errors     |       0.1305
Pseudo-R2                |      0.02028
-----------------------------------------


Given the original sample $\{Y_1,\ldots,Y_n\}$ and the LOOCV predictions $\{\widehat{Y}_1,\ldots,\widehat{Y}_n\}$, the reported statistics are
$$
\begin{aligned}
\text{Root Mean Squared Errors}&=\sqrt{n^{-1}\sum_{i=1}^n(Y_i-\widehat{Y}_i)^2}\\
\text{Log Mean Squared Errors}&=\log\left(n^{-1}\sum_{i=1}^n(Y_i-\widehat{Y}_i)^2\right)\\
\text{Mean Absolute Errors}&=n^{-1}\sum_{i=1}^n|Y_i-\widehat{Y}_i|\\
\text{Pseudo-R2}&=\widehat{\text{corr}}(Y_i,\widehat{Y}_i)^2
\end{aligned}
$$

#### _k_-Fold Cross-Validation

One needs to install the user-written package ```crossfold``` by issuing the command ```ssc install crossfold``` before executing the following code:

In [9]:
%%stata

* 5-fold cross-validation: k(5) requests five folds and stub(fold) names them fold1, ..., fold5.
* This comment sits before "#delimit ;" on purpose: inside a semicolon-delimited region a
* star comment only ends at the next ";", so it would absorb the "#delimit cr" line below.
#delimit ;
set seed 42;
crossfold reg FKG log_num_authors log_num_pages both_genders prop_women
              `journals' `jel_imp' y_2-y_20  c_2-c_215  jel_flag,
              k(5) stub(fold);
#delimit cr


. 
. #delimit ;
delimiter now ;
. set seed 42;

. crossfold reg FKG log_num_authors log_num_pages both_genders prop_women
>               `journals' `jel_imp' y_2-y_20  c_2-c_215  jel_flag,
>               k(5) stub(fold);

             |      RMSE 
-------------+-----------
       fold1 |  .1721115 
       fold2 |  .1694547 
       fold3 |  .1751916 
       fold4 |  .1723403 
       fold5 |  .1597371 

. #delimit cr
delimiter now cr
. 


In [10]:
%%capture
import pandas as pd
from pystata import stata
from sfi import Scalar, Matrix
stata.run('''
        #delimit ;
        set seed 42;
        crossfold reg FKG log_num_authors log_num_pages both_genders prop_women
                      `journals' `jel_imp' y_2-y_20  c_2-c_215  jel_flag,
                      k(5) stub(fold);
        #delimit cr
        ''')
df_rmse = pd.DataFrame(sum(Matrix.get('r(fold)'),[]))
rows = Matrix.getRowNames('r(fold)')

stata.run('''
        #delimit ;
        set seed 42;
        crossfold reg FKG log_num_authors log_num_pages both_genders prop_women
                      `journals' `jel_imp' y_2-y_20  c_2-c_215  jel_flag,
                      k(5) stub(fold) mae;
        #delimit cr
        ''')
df_mae = pd.DataFrame(sum(Matrix.get('r(fold)'),[]))

stata.run('''
        #delimit ;
        set seed 42;
        crossfold reg FKG log_num_authors log_num_pages both_genders prop_women
                      `journals' `jel_imp' y_2-y_20  c_2-c_215  jel_flag,
                      k(5) stub(fold) r2;
        #delimit cr
        ''')
df_r2 = pd.DataFrame(sum(Matrix.get('r(fold)'),[]))

# Export to result with Dataframe format
result = pd.concat([df_rmse,df_mae,df_r2],axis=1)
result.columns = ['RMSE','MAE','pseudo R2']
result.index = rows

In [11]:
# Per-fold metrics table; a bare last expression lets the notebook render the DataFrame richly
result

           RMSE       MAE  pseudo R2
fold1  0.172111  0.132485   0.008257
fold2  0.169455  0.130320   0.020247
fold3  0.175192  0.138475   0.014250
fold4  0.172340  0.128805   0.032089
fold5  0.159737  0.124862   0.022078


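In this case $\sqrt{CV_{(5)}}$, the square root of the average of the five fold-specific mean squared errors (the squared RMSE values above), equals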

In [12]:
import math
import statistics as st

# Square each fold's RMSE to get its MSE, average the five values, then take the square root
fold_mse = result['RMSE'] ** 2
print(math.sqrt(st.mean(fold_mse)))

0.16985079515275103
