In [1]:
%%capture
import stata_setup, os
# Point pystata at the local Stata installation.
# The install directory and edition can be overridden with the STATA_HOME and
# STATA_EDITION environment variables, so the notebook also runs on machines
# where Stata is not in the default place. When they are unset (or empty) the
# Stata 17 MP locations below are used.
default_stata_home = 'C:/Program Files/Stata17/' if os.name == 'nt' else '/usr/local/stata17'
stata_setup.config(
    os.environ.get('STATA_HOME') or default_stata_home,
    os.environ.get('STATA_EDITION') or 'mp',
)

## Preparing the data

In [2]:
%%stata -qui

* Load the data set and give the outcome variable a short name.
use "../data/data", clear
rename log_flesch_kincaid_grade_level FKG

* Build one indicator variable per level of year (y_1, y_2, ...) and of
* cluster (c_1, c_2, ...); they are used as candidate controls later on.
quietly {
    tabulate year,    generate(y_)
    tabulate cluster, generate(c_)
}

* Variable lists reused by the estimation cells below.
* Journal indicators (original note: "AER based category" -- presumably AER is
* the omitted base journal; confirm against the data).
local journals ecm jpe qje res
* JEL indicators; d_imp is deliberately absent (original note: "D JEL based
* case" -- presumably JEL code D is the omitted base; confirm).
local jel_imp a_imp b_imp c_imp e_imp f_imp g_imp h_imp i_imp j_imp k_imp l_imp m_imp n_imp o_imp p_imp q_imp r_imp y_imp z_imp




## Double Selection Lasso

Double-selection lasso is implemented by Stata's `dsregress` command. By default it chooses the lasso penalty parameters ($\lambda$) with a plugin formula, but they can instead be chosen by standard cross-validation with the `selection(cv)` option.

In [3]:
%%stata -qui
* Double-selection lasso of FKG on the four covariates of interest, fitted once
* per penalty-selection method and stored as ds_plugin and ds_cv.
*   - selection(plugin) is dsregress's default, so the first pass is the same
*     model as calling dsregress with no selection() option.
*   - Looping keeps the two specifications identical by construction; only the
*     selection method differs.
*   - Candidate controls: the journal and JEL lists defined in the data
*     preparation cell, the year and cluster indicators from the second level
*     onwards (y_1 and c_1 are left out), and jel_flag.
*   - rseed(42) fixes the random-number seed so the cross-validation folds are
*     reproducible.
foreach sel in plugin cv {
    dsregress FKG log_num_authors log_num_pages both_genders prop_women,   ///
        controls(`journals' `jel_imp' y_2-y_20 c_2-c_215 jel_flag)         ///
        vce(cluster cluster) selection(`sel') rseed(42)
    estimates store ds_`sel'
}




In [7]:
%%stata
* Side-by-side table of the controls each lasso selected: the lassos for the
* outcome FKG and for the covariate prop_women, each under the plugin and the
* cross-validated fit (stored as ds_plugin and ds_cv).
lassocoef (ds_plugin, for(FKG))        (ds_cv, for(FKG))         ///
          (ds_plugin, for(prop_women)) (ds_cv, for(prop_women))


. #delimit ;
delimiter now ;
. lassocoef (ds_plugin, for(FKG)) (ds_cv, for(FKG))
>           (ds_plugin, for(prop_women)) (ds_cv, for(prop_women))  ;

-----------------------------------------------------------
             | ds_plugin   ds_cv     ds_plugin     ds_cv   
             |    FKG       FKG     prop_women  prop_women 
-------------+---------------------------------------------
         c_4 |     x    
        c_36 |     x    
        c_50 |     x    
       c_165 |     x    
       c_183 |     x    
        c_51 |                           x     
       c_100 |                           x     
       c_108 |                           x     
       c_174 |                           x     
       c_196 |                           x     
       c_imp |                                       x     
       i_imp |                                       x     
       k_imp |                                       x     
       l_imp |                                       x     
   

In [8]:
%%stata
* Make the stored cross-validated fit (ds_cv) the active estimation results.
estimates restore ds_cv
* Typed without arguments, dsregress redisplays the active results.
dsregress


. estimates restore ds_cv
(results ds_cv are active now)

. dsregress

Double-selection linear model         Number of obs               =      4,988
                                      Number of controls          =        257
                                      Number of selected controls =         37
                                      Wald chi2(4)                =      13.61
                                      Prob > chi2                 =     0.0087

                              (Std. err. adjusted for 215 clusters in cluster)
------------------------------------------------------------------------------
             |               Robust
         FKG | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
log_num_au~s |  -.0053689   .0042331    -1.27   0.205    -.0136657    .0029278
log_num_pa~s |   .0161836   .0052931     3.06   0.002     .0058093     .026558
both_genders |   .0005