In [1]:
# basic libraries
import pandas as pd
import numpy as np
import random

# sklearn functions
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_spd_matrix

# function to create splits for cross-fitting
def partition(list_in, n):
    """Randomly split the items of ``list_in`` into ``n`` interleaved folds.

    Parameters
    ----------
    list_in : sequence
        Items to distribute (e.g. row indices). It is copied before
        shuffling, so the caller's object is left untouched and any
        iterable that ``list()`` accepts (such as a ``range``) works.
    n : int
        Number of folds to create.

    Returns
    -------
    list of lists
        ``n`` lists; fold ``i`` holds every ``n``-th shuffled item starting
        at position ``i``, so fold sizes differ by at most one.
    """
    # shuffle a private copy: random.shuffle works in place and would
    # otherwise reorder the list owned by the caller
    shuffled = list(list_in)
    random.shuffle(shuffled)
    return [shuffled[i::n] for i in range(n)]

# function to use splits for orthogonalization
def orthog(ind, dep, indices, mod):
    """Out-of-fold residuals of ``dep`` after predicting it from ``ind``.

    The model is fitted on every row *except* ``indices`` and then used to
    predict the held-out rows; the held-out prediction errors are returned.

    Parameters
    ----------
    ind : array-like, one row per observation
        Predictors.
    dep : array-like, shape (n,) or (n, 1)
        Variable to residualize.
    indices : list of int
        Row positions of the held-out fold.
    mod : object
        Estimator with ``fit(X, y)`` (whose return value exposes
        ``predict``) in the style of scikit-learn estimators. It is
        fitted in place.

    Returns
    -------
    numpy.ndarray, shape (len(indices), 1)
        ``dep[indices]`` minus the model's prediction for those rows.
    """
    ind = np.asarray(ind)
    # force a column vector: a 1-D ``dep`` would otherwise broadcast
    # against the (k, 1) predictions below and yield a (k, k) matrix
    dep = np.asarray(dep).reshape(-1, 1)

    # fit the model on the rows outside the fold
    modfit = mod.fit(
        np.delete(ind, indices, 0),
        np.delete(dep, indices, 0).ravel()
    )

    # predict the held-out rows
    dephat = modfit.predict(
        ind[indices]
    ).reshape(-1, 1)

    # residualize
    return dep[indices] - dephat

def dml(X, y, d, ymod, dmod = None, splits = 2):
    """Cross-fitted DML1 and DML2 estimates of the coefficient of ``d`` on ``y``.

    ``y`` and ``d`` are each residualized on ``X`` out-of-fold (via
    ``orthog``), and the coefficient is then estimated from the residuals
    in two ways: as the average of per-fold estimates (DML1) and from one
    pooled least-squares regression with an intercept (DML2).

    Parameters
    ----------
    X : numpy.ndarray
        Controls, one row per observation.
    y : numpy.ndarray
        Response; a 1-D array is reshaped to a column.
    d : numpy.ndarray
        Treatment; a 1-D array is reshaped to a column.
    ymod : object
        Estimator with ``fit``/``predict`` used to predict ``y`` from ``X``.
    dmod : object, optional
        Estimator used to predict ``d`` from ``X``. When omitted, the very
        same ``ymod`` object is reused (it is refitted on every call).
    splits : int
        Number of cross-fitting folds; must be at least 2 so that every
        fold leaves some rows to train on.

    Returns
    -------
    dict
        ``'dml1'`` / ``'dml2'``: ``{'coef_se': array([coef, se])}``;
        ``'orth_data'``: residualized ``y`` and ``d`` side by side, with
        rows ordered fold by fold (not in the input order);
        ``'indices'``: the list of folds, each a list of row positions.

    Raises
    ------
    ValueError
        If ``splits`` is smaller than 2.
    """
    if splits < 2:
        raise ValueError("splits must be at least 2 for cross-fitting")

    # reshape the data if necessary
    if len(y.shape) == 1:
        y = y.reshape(-1, 1)
    if len(d.shape) == 1:
        d = d.reshape(-1, 1)

    # double model logic
    if dmod is None:
        dmod = ymod

    # split indices
    I = partition(list(range(len(y))), splits)

    # perform cross-fitting, collecting the per-fold pieces and stacking
    # them once at the end instead of re-allocating arrays on every fold
    d_parts, y_parts, fold_thetas = [], [], []
    for i in I:

        # get orthogonalized treatment
        dorth = orthog(X, d, i, dmod)
        d_parts.append(dorth)

        # get orthogonalized response
        yorth = orthog(X, y, i, ymod)
        y_parts.append(yorth)

        # calculate intermediate thetas
        fold_thetas.append(np.mean(dorth*yorth)/np.mean(dorth**2))

    dlist = np.vstack(d_parts)
    ylist = np.vstack(y_parts)
    thetas = np.asarray(fold_thetas, dtype = float)

    # prep post-orthogonalization regressors (intercept + treatment)
    D = np.hstack( (np.ones((len(dlist), 1)) , dlist) )

    # fit the DML2 model
    coefs = np.linalg.lstsq(D, ylist, rcond = None)[0]

    # get var-cov matrix for DML2
    # fitted values need every row of D (D @ coefs); indexing D[0] and D[1]
    # would pick the first two *rows* of D rather than its two columns
    res = ylist - np.dot(D, coefs)
    sigma2 = float(np.dot(res.T, res)[0, 0]) / (len(y) - 2)
    vcv = sigma2 * np.linalg.inv(np.dot(D.T, D))

    # get DML1 and DML2 coefficient
    theta1 = np.mean(thetas)
    theta2 = coefs[1, 0]

    # calculate the dml1 standard error
    se1 = np.sqrt(np.mean( (ylist - theta1*dlist)**2*dlist**2
            ) / (np.mean(dlist**2)**2)
        ) / np.sqrt(len(dlist) - 1)

    # calculate the dml2 standard error
    se2 = np.sqrt(np.diagonal(vcv))[1]

    # present the output
    return {
        'dml1':{
            'coef_se':np.hstack((theta1, se1))
        },
        'dml2':{
            'coef_se':np.hstack((theta2, se2))
        },
        'orth_data':np.hstack((ylist, dlist)),
        'indices':I
    }

def synth(
    N = 500, K = 10, theta = 0.5,
    seed = 1, g = None, m = None
):
    """Generate synthetic data ``y = theta*d + g(X b) + e2``, ``d = m(X b) + e1``.

    Parameters
    ----------
    N : int
        Number of observations.
    K : int
        Number of covariates (columns of ``X``).
    theta : float
        Coefficient of ``d`` in the equation for ``y``.
    seed : int
        Seed passed to ``np.random.seed``; note that this reseeds NumPy's
        global generator as a side effect.
    g, m : callable, optional
        Functions applied to the index ``X b`` in the ``y`` and ``d``
        equations. Defaults are ``sin(x)**2`` for ``g`` and
        ``0.5/pi * sinh(1) / (cosh(1) - cos(x))`` for ``m``.

    Returns
    -------
    tuple
        ``(y, d, X)`` with ``y`` and ``d`` of shape ``(N,)`` and ``X`` of
        shape ``(N, K)``.
    """

    # set the randomization seed
    np.random.seed(seed)

    # define the coefficient vector, the covariance matrix and the covariates
    b = np.sin([1/j for j in range(1, K + 1)]) + 0.01
    # random_state must be passed by keyword: it is keyword-only in current
    # scikit-learn releases. It is fixed at 1, so the covariance matrix
    # depends on K only, not on ``seed``.
    sigma = make_spd_matrix(K, random_state = 1)
    X = np.random.multivariate_normal(np.ones(K), sigma, size = [N,])

    # if no functions are supplied for g and m, fall back to the defaults
    if g is None:
        def g(x):
            return np.power(np.sin(x),2)
    if m is None:
        def m(x,nu=0.,gamma=1.):
            return 0.5/np.pi*(np.sinh(gamma))/(np.cosh(gamma)-np.cos(x-nu))

    # define error terms
    e1 = np.random.standard_normal(size=[N,])
    e2 = np.random.standard_normal(size=[N,])

    # compute the variables
    index = np.dot(X,b)
    G = g(index)
    M = m(index)
    d = M + e1
    y = np.dot(theta,d) + G + e2

    return y, d, X

In [2]:
%%capture

# set monte carlo parameters
obs_min = 100
obs_max = 2000

S_min = 2
S_max = 100

K_min = 1
K_max = 40

iters = 10000

# set the randomization
random.seed(0)

# monte carlo simulation
# results are collected as plain records and turned into a data frame once
# at the end: DataFrame.append was removed in pandas 2.0 and growing a frame
# row by row copies all previous rows on every iteration
records = []
for i in range(iters):

    # select random parameters
    n = random.randint(obs_min, obs_max)
    s = random.randint(S_min, S_max)
    k = random.randint(K_min, K_max)

    # create the synthetic data
    out = synth(N = n, K = k, seed = i)

    # fit the model
    check = dml(X = out[2], y = out[0], d = out[1], ymod = ElasticNetCV(), splits = s)

    # store the results of this iteration
    records.append(
        {
            'N':n,
            'splits':s,
            'K':k,
            'dml1_theta':check['dml1']['coef_se'][0],
            'dml1_se':check['dml1']['coef_se'][1],
            'dml2_theta':check['dml2']['coef_se'][0],
            'dml2_se':check['dml2']['coef_se'][1]
        }
    )

# one row per iteration, one column block per estimator
dmlDF_wide = pd.DataFrame(
    records,
    columns = ['N', 'splits', 'K',
               'dml1_theta', 'dml1_se', 'dml2_theta', 'dml2_se']
)

# split the dataframe columns and add a column specifying the estimation
# method; .assign returns a new frame, so no column is set on a slice
dml1DF = dmlDF_wide[['N', 'splits', 'K', 'dml1_theta', 'dml1_se']].assign(type = 'dml1')
dml2DF = dmlDF_wide[['N', 'splits', 'K', 'dml2_theta', 'dml2_se']].assign(type = 'dml2')

# rename the columns to be consistent
dml1DF.columns = ['N', 'splits', 'K', 'theta', 'se', 'type']
dml2DF.columns = ['N', 'splits', 'K', 'theta', 'se', 'type']

# stack the columns
dmlDF = pd.concat([dml1DF, dml2DF]).reset_index(drop = True)

In [4]:
# load in old dataframe; on a first run the file does not exist yet, in
# which case only the new results are written
try:
    dmlDF_sav = pd.read_csv('dml_monte_carlo.csv')
except FileNotFoundError:
    dmlDF_sav = None

# add new data to dataframe
# NOTE: dmlDF is overwritten with the combined data, so re-running this cell
# without re-running the simulation appends the same rows to the file again
if dmlDF_sav is not None:
    dmlDF = pd.concat([dmlDF_sav, dmlDF], ignore_index = True)

# write the dataframe to a csv
dmlDF.to_csv("dml_monte_carlo.csv", index=False)