In [23]:
# pip install cvxpy # install this on first try

import ot
import pandas as pd

import numpy as np
from numpy import random
import os
import glob

import matplotlib.pylab as plt
import ot.plot
import cvxpy as cp

import scipy.stats as stats
import seaborn as sns
import scipy.special as sps
import time as t

In [24]:
path = os.getcwd()
csv_files = glob.glob(os.path.join(path, "*.csv"))
files_list = glob.glob('*.{}'.format("csv"))

files_names = [s.replace(".csv", "") for s in files_list]
files_names

['SC',
 'FL',
 'TN',
 'full data',
 'NC',
 'WY',
 'GA',
 'MS',
 'AL',
 'WI',
 'example',
 'MT',
 'SD',
 'KS',
 'TX']

In [25]:
files_dict = {}

for k in range(len(csv_files)):
    
    key = files_names[k]
    
    # read the csv file
    df = pd.read_csv(csv_files[k])
    
    # display(df)
    
    # store in dictionary
    files_dict[key] = df

In [26]:
# defining target
target = files_dict["MT"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]

In [27]:
target # view the target df
target.shape

(25173, 4)

In [28]:
# controls
control1 = files_dict["SC"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]
control2 = files_dict["FL"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]
control3 = files_dict["TN"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]
control4 = files_dict["NC"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]
control5 = files_dict["WY"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]
control6 = files_dict["GA"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]
control7 = files_dict["MS"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]
control8 = files_dict["AL"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]
control9 = files_dict["WI"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]
control10 = files_dict["SD"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]
control11 = files_dict["KS"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]
control12 = files_dict["TX"][["HINSCAID", "EMPSTAT", "UHRSWORK", "INCWAGE"]]

In [29]:
# code for implementation

## barycentric projection
def baryc_proj(source, target):
    
    n1 = source.shape[0]
    n2 = target.shape[0]   
    p = source.shape[1]
    a_ones, b_ones = np.ones((n1,)) / n1, np.ones((n2,)) / n2
    
    M = ot.dist(source, target)
    M = M.astype('float64')
    M /= M.max()
    OTplan = ot.bregman.sinkhorn_stabilized(a_ones, b_ones, M, reg = 5*1e-3)# try emd
    
    # initialization
    OTmap = np.empty((0, p))

    for i in range(n1):
        
        # normalization
        OTplan[i,:] = OTplan[i,:] / sum(OTplan[i,:])
    
        # conditional expectation
        OTmap = np.vstack([OTmap, (target.T @ OTplan[i,:])])
    
    OTmap = np.array(OTmap)
    
    return(OTmap)

In [30]:
# optimization routine

def to_optimize(lambdas):
    
    ans = []
    for i in range(J):
        temp = lambdas[i] * (G_list[i] - globtarget)
        ans.append(sum(sum(temp**2)))
    
    return sum(ans) / n

In [31]:
# optimal weights calculations

def get_optimal_weights(Glist):
        
    mylambda = cp.Variable(J)

    objective = cp.Minimize(to_optimize(mylambda))
    constraints = [mylambda >= 0, mylambda <= 1, cp.sum(mylambda) == 1]

    prob = cp.Problem(objective, constraints)
    prob.solve()

    weights = mylambda.value
    
    return(weights)

In [32]:
# full implementation

def DSCreplication(target, controls):
    
    global n, d, J, globtarget
    n = target.shape[0]
    d = target.shape[1]
    J = len(controls)
    globtarget = target

    # Barycentric Projection
    global G_list
    G_list = []
    for i in range(len(controls)):
        G_list.append(baryc_proj(target, controls[i]))
    
    
    # Obtain optimal weights(to_optimze needs to be pre-defined)
    weights = get_optimal_weights(G_list)
    projection = weights[0]*G_list[0]
    for j in range(J-1):
        projection += weights[j+1]*G_list[j+1]
    
    
    return(weights, projection)

def DSCreplicationV2(target, controls):
    
    n = target.shape[0]
    d = target.shape[1]
    J = len(controls)
    
    
    # Barycentric Projection
    G_list = []
    for i in range(len(controls)):
        G_list.append(baryc_proj(target, controls[i]))
    
    
    # Function to optimize
    def to_optimize(lambdas):
                
        ans = []
        for i in range(J):
            temp = lambdas[i] * (G_list[i] - target)
            ans.append(sum(sum(temp**2)))
    
        return sum(ans) / n

    
    # Obtain optimal weights
    mylambda = cp.Variable(J)

    objective = cp.Minimize(to_optimize(mylambda))
    constraints = [mylambda >= 0, mylambda <= 1, cp.sum(mylambda) == 1]

    prob = cp.Problem(objective, constraints)
    prob.solve()

    
    weights = mylambda.value
    projection = weights[0]*G_list[0]
    for j in range(J-1):
        projection += weights[j+1]*G_list[j+1]
    
    
    return(weights, projection)

In [33]:
# full implementation and estimation

In [34]:
states_controls = [control1.to_numpy(), control2.to_numpy()] # .iloc[0:200,]

In [35]:
target = target.to_numpy() # iloc[0:100,].to_numpy()

In [36]:
type(target)

numpy.ndarray

In [None]:
weights_s, projection_s = DSCreplicationV2(target, states_controls)

In [None]:
weights_s

In [21]:
projection_s_rnd = projection_s[:,0:2].round(decimals = 0).astype('int64')
# projection4rnd1 = projection4[:,2:3].round(decimals = 0).astype('int64')

# type(projection4rnd)
# [projection4rnd[i] + projection4rnd1[i] for i in range(len(projection4rnd))]
np.concatenate([projection_s_rnd, projection_s[:,2:4]], axis = 1)

array([[1.00000000e+00, 1.00000000e+00, 3.39739183e+03, 2.67000705e+03],
       [1.00000000e+00, 1.00000000e+00, 3.29612757e+03, 2.44996025e+03],
       [1.00000000e+00, 1.00000000e+00, 3.36140618e+03, 2.71539506e+03],
       [1.00000000e+00, 1.00000000e+00, 3.80325858e+03, 5.12585527e+03],
       [1.00000000e+00, 1.00000000e+00, 2.21848368e+03, 1.26069054e+03],
       [1.00000000e+00, 1.00000000e+00, 2.13394883e+03, 1.36843832e+03],
       [1.00000000e+00, 1.00000000e+00, 3.60918317e+03, 2.36210248e+03],
       [1.00000000e+00, 1.00000000e+00, 1.91885960e+03, 1.20914645e+03],
       [1.00000000e+00, 1.00000000e+00, 2.76559643e+03, 2.37226432e+03],
       [1.00000000e+00, 1.00000000e+00, 4.10141158e+03, 6.76167005e+03],
       [1.00000000e+00, 1.00000000e+00, 2.47855385e+03, 1.40665033e+03],
       [1.00000000e+00, 1.00000000e+00, 2.96929473e+03, 1.49343883e+03],
       [1.00000000e+00, 1.00000000e+00, 2.60674228e+03, 1.60003173e+03],
       [1.00000000e+00, 1.00000000e+00, 1.04455349e

In [22]:
target

array([[1.0000e+00, 1.0000e+00, 3.8800e+03, 2.7160e+03],
       [1.0000e+00, 1.0000e+00, 3.7200e+03, 2.3250e+03],
       [1.0000e+00, 1.0000e+00, 3.7600e+03, 2.8200e+03],
       [1.0000e+00, 1.0000e+00, 4.3600e+03, 6.5400e+03],
       [1.0000e+00, 1.0000e+00, 1.6800e+03, 0.0000e+00],
       [1.0000e+00, 1.0000e+00, 1.0500e+03, 7.4550e+02],
       [1.0000e+00, 1.0000e+00, 4.4550e+03, 2.0250e+03],
       [1.0000e+00, 1.0000e+00, 4.4000e+02, 5.2800e+02],
       [1.0000e+00, 1.0000e+00, 1.9200e+03, 2.6880e+03],
       [1.0000e+00, 1.0000e+00, 4.2400e+03, 8.4800e+03],
       [1.0000e+00, 2.0000e+00, 2.3500e+03, 1.8800e+02],
       [1.0000e+00, 1.0000e+00, 3.5500e+03, 0.0000e+00],
       [1.0000e+00, 1.0000e+00, 2.4800e+03, 7.4400e+02],
       [1.0000e+00, 1.0000e+00, 1.2810e+04, 8.3265e+03],
       [1.0000e+00, 1.0000e+00, 4.9500e+03, 6.8200e+03],
       [1.0000e+00, 1.0000e+00, 3.2800e+03, 1.8860e+03],
       [1.0000e+00, 1.0000e+00, 1.2400e+03, 8.0600e+02],
       [2.0000e+00, 1.0000e+00,

In [None]:
# random testing

In [119]:
bin1 = np.column_stack((np.random.randint(2, size = 100), 
                        np.random.randint(2, size = 100), 
                        np.random.randint(2, size = 100)))
bin2 = np.column_stack((np.random.randint(2, size = 100), 
                        np.random.randint(2, size = 100),
                        np.random.randint(2, size = 100)))
bin3 = np.column_stack((np.random.randint(2, size = 100), 
                        np.random.randint(2, size = 100),
                        np.random.randint(2, size = 100)))
controls4 = [bin1, bin2, bin3]

In [None]:
TARGET4 = np.column_stack((np.random.randint(2, size = 100), 
                           np.random.randint(2, size = 100),
                           np.random.randint(2, size = 100)))
weights4, projection4 = DSCreplicationV2(TARGET4, controls4)

In [148]:
projection4rnd = projection4[:,0:2].round(decimals = 0).astype('int64')
projection4rnd1 = projection4[:,2:3].round(decimals = 0).astype('int64')

# type(projection4rnd)
# [projection4rnd[i] + projection4rnd1[i] for i in range(len(projection4rnd))]
# np.concatenate([projection4rnd, projection4rnd1], axis = 1)

array([[1, 1, 0],
       [0, 0, 1],
       [0, 1, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       [1, 0, 1],
       [1, 0, 0],
       [0, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 1],
       [1, 0, 0],
       [0, 1, 1],
       [1, 1, 1],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 1, 1],
       [0, 0, 0],
       [1, 1, 1],
       [1, 0, 0],
       [1, 0, 1],
       [0, 0, 1],
       [0, 1, 1],
       [1, 1, 0],
       [0, 0, 0],
       [0, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 1],
       [1, 1, 0],
       [1, 1, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 1],
       [0, 1, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 0],
       [1, 0, 1],
       [1, 1, 0],
       [1, 0, 0],
       [1, 0, 1],
       [0, 0, 0],
       [0, 1, 1],
       [0, 1, 1],
       [1, 0, 1],
       [0, 0, 0],
       [0, 0, 1],
       [1, 0, 1],
       [0, 0, 0],
       [1, 1, 1],
       [0,

In [150]:
# projection4.round(decimals = 0).astype('int64')

array([[1, 1, 0],
       [0, 0, 1],
       [0, 1, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       [1, 0, 1],
       [1, 0, 0],
       [0, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 1],
       [1, 0, 0],
       [0, 1, 1],
       [1, 1, 1],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 1, 1],
       [0, 0, 0],
       [1, 1, 1],
       [1, 0, 0],
       [1, 0, 1],
       [0, 0, 1],
       [0, 1, 1],
       [1, 1, 0],
       [0, 0, 0],
       [0, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 1],
       [1, 1, 0],
       [1, 1, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 1],
       [0, 1, 1],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 0],
       [1, 0, 1],
       [1, 1, 0],
       [1, 0, 0],
       [1, 0, 1],
       [0, 0, 0],
       [0, 1, 1],
       [0, 1, 1],
       [1, 0, 1],
       [0, 0, 0],
       [0, 0, 1],
       [1, 0, 1],
       [0, 0, 0],
       [1, 1, 1],
       [0,

In [152]:
# projection4[:,1:3].round(decimals = 0).astype('int64')

array([[1, 0],
       [0, 1],
       [1, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 1],
       [0, 1],
       [1, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [0, 1],
       [0, 0],
       [0, 1],
       [1, 0],
       [0, 0],
       [0, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 1],
       [0, 0],
       [0, 1],
       [0, 1],
       [0, 0],
       [1, 1],
       [0, 1],
       [1, 0],
       [0, 0],
       [1, 0],
       [0, 0],
       [0, 0],
       [0, 1],
       [0, 1],
       [0, 0],
       [1, 0],
       [1, 1],
       [0,

In [154]:
# below are remnants from testing/debugging session on 7/6

In [158]:
s = baryc_proj(target, states_controls[1]) # barycentric projections are fine
states_controls[1]



array([[1.00000000e+00, 1.00000000e+00, 3.58492603e-02, 3.04593366e-03],
       [1.00000000e+00, 1.00000000e+00, 2.21142425e-02, 1.15983789e-03],
       [1.00000000e+00, 1.00000000e+00, 3.54173415e-02, 1.26313596e-02],
       [1.00000000e+00, 1.00000000e+00, 2.98023971e-02, 8.12792650e-03],
       [1.00000000e+00, 1.00000000e+00, 3.36356765e-02, 1.20967468e-03],
       [1.00000000e+00, 1.00000000e+00, 3.93046107e-02, 9.89486704e-03],
       [1.00000000e+00, 1.00000000e+00, 1.68448332e-02, 2.82710487e-03],
       [1.00000000e+00, 1.00000000e+00, 3.94557823e-02, 9.40080328e-03],
       [1.00000000e+00, 1.00000000e+00, 4.13022352e-02, 1.21307264e-02],
       [1.00000000e+00, 1.00000000e+00, 4.31918799e-03, 9.06123355e-04],
       [1.00000000e+00, 1.00000000e+00, 2.93704784e-02, 3.54294232e-03],
       [1.00000000e+00, 1.00000000e+00, 3.28258287e-02, 8.09168156e-03],
       [1.00000000e+00, 2.00000000e+00, 4.80509664e-02, 9.07256009e-03],
       [1.00000000e+00, 1.00000000e+00, 2.43602203e

In [159]:
# Barycentric Projection
G_list = []
for i in range(len(states_controls)):
    G_list.append(baryc_proj(target, states_controls[i]))

# Function to optimize
def to_optimize(lambdas):
    ans = []
    for i in range(2):
        temp = lambdas[i] * (G_list[i] - target)
        ans.append(sum(sum(temp**2)))

    return sum(ans) / 100

# Obtain optimal weights
mylambda = cp.Variable(2)

objective = cp.Minimize(to_optimize(mylambda))
constraints = [mylambda >= 0, mylambda <= 1, cp.sum(mylambda) == 1]

prob = cp.Problem(objective, constraints)
prob.solve(verbose=True)

weights = mylambda.value
# projection = weights[0]*G_list[0]
# for j in range(J-1):
#     projection += weights[j+1]*G_list[j+1]

                                     CVXPY                                     
                                     v1.2.1                                    
(CVXPY) Jul 06 10:43:21 AM: Your problem has 2 variables, 3 constraints, and 0 parameters.
(CVXPY) Jul 06 10:43:21 AM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Jul 06 10:43:21 AM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Jul 06 10:43:21 AM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Jul 06 10:43:21 AM: Compiling problem (target solver=OSQP).
(CVXPY) Jul 06 10:43:21 AM: Reduction chain: CvxAttr2Constr -> Qp2SymbolicQp -> QpMatrixStuffing 

In [160]:
prob = cp.Problem(objective, constraints)
prob.solve()

0.005520373431650976

In [177]:
# testing below

In [46]:
n1 = n2 = 100
dim = 3

covmat = np.zeros((dim, dim))
np.fill_diagonal(covmat, 1)

covmat2 = np.full((dim, dim), 0.3)
np.fill_diagonal(covmat2, 1)

covmat3 = np.full((dim, dim), 0.5)
np.fill_diagonal(covmat3, 1)

covmat4 = np.full((dim, dim), 0.8)
np.fill_diagonal(covmat4, 1)

a_ones, b_ones = np.ones((n1,)) / n1, np.ones((n2,)) / n2

c1 = random.multivariate_normal(mean = [20, 20, 20], cov = covmat4, size = n1)
c2 = random.multivariate_normal(mean = [100, 100, 100], cov = covmat4, size = n1)
c3 = random.multivariate_normal(mean = [50, 50, 50], cov = covmat3, size = n1)
cs = [c1, c2, c3]

TARGET = random.multivariate_normal(mean = [25]*dim, cov = covmat, size = n1)

In [47]:
type(cs[1])

numpy.ndarray

In [48]:
type(TARGET)

numpy.ndarray

In [49]:
ts = t.time()

weights1, projection1 = DSCreplication(TARGET, cs)

print(t.time() - ts)
print(weights1)

3.0114848613739014
[0.95644033 0.00437226 0.03918741]
