In [1]:
import numpy as np
from numpy import random

import pandas as pd
import matplotlib.pylab as plt
import ot
import ot.plot
import cvxpy as cp

import scipy.stats as stats
import seaborn as sns
import scipy.special as sps
import time as t

# STEPS
1. [Setup](#Setup)
2. [Barycentric Projection](#(1)-Barycentric-Projection)
3. [Get Optimal Weights](#(2)-Optimal-Weights)
4. [Complete Function](#Complete-Function)
5. [Utils](#Utils)
6. [To-do](#TO-DO)

## Setup

In [2]:
n1 = n2 = 100
dim = 2

covmat = np.zeros((dim, dim))
np.fill_diagonal(covmat, 1)

covmat2 = np.full((dim, dim), 0.3)
np.fill_diagonal(covmat2, 1)

covmat3 = np.full((dim, dim), 0.5)
np.fill_diagonal(covmat3, 1)

covmat4 = np.full((dim, dim), 0.8)
np.fill_diagonal(covmat4, 1)

a_ones, b_ones = np.ones((n1,)) / n1, np.ones((n2,)) / n2

control1 = random.multivariate_normal(mean = [20, 20], cov = covmat4, size = n1)
control2 = random.multivariate_normal(mean = [100, 100], cov = covmat4, size = n1)
control3 = random.multivariate_normal(mean = [50, 50], cov = covmat3, size = n1)
controls = [control1, control2, control3]

TARGET = random.multivariate_normal(mean = [25]*dim, cov = covmat, size = n1)


a_ones, b_ones = np.ones((n1,)) / n1, np.ones((n2,)) / n2

## Functions

### (1) Barycentric Projection
`baryc_proj` calculates the conditional expectation of each column of the transport plan and outputs the transport map. 
<br>

NOTE: we may consider keeping the distance matrix calculation outside of this function for convenience

In [3]:
def baryc_proj(source, target):
    
    M = ot.dist(source, target)
    M /= M.max()
    OTplan = ot.bregman.sinkhorn_stabilized(a_ones, b_ones, M, reg = 5*1e-3)
    
    n = OTplan.shape[0]
    p = OTplan.shape[1]
    
    # initialization
    OTmap = np.empty((0, dim))

    for i in range(n):
        
        # normalization
        OTplan[i,:] = OTplan[i,:] / sum(OTplan[i,:])
    
        # conditional expectation
        OTmap = np.vstack([OTmap, (target.T @ OTplan[i,:])])
    
    OTmap = np.array(OTmap)
    
    return(OTmap)

### (2) Optimal Weights
`to_optimize` defines the function that we want to minimize; it is for use within optimization with CVXPY. Refer to (2.7) in DSC<br>
`get_optimal_weights` defines the actual optimization with CVXPY

In [4]:
def to_optimize(lambdas):
    
    ans = []
    for i in range(J):
        temp = lambdas[i] * (G_list[i] - TARGET)
        ans.append(sum(sum(temp**2)))
    
    return sum(ans) / n

In [5]:
def get_optimal_weights(Glist):
        
    mylambda = cp.Variable(J)

    objective = cp.Minimize(to_optimize(mylambda))
    constraints = [mylambda >= 0, mylambda <= 1, cp.sum(mylambda) == 1]

    prob = cp.Problem(objective, constraints)
    prob.solve()

    weights = mylambda.value
    
    return(weights)

In [None]:
get_optimal_weights(TARGET)

# Complete Function
The complete function takes in the target and controls and outputs the optimal weights and actual projection. It requires the globalization of variables n, d, J and G_list, which are useful outside of the function as well.

In [6]:
def DSCreplication(target, controls):
    
    global n, d, J
    n = target.shape[0]
    d = target.shape[1]
    J = len(controls)

    # Barycentric Projection
    global G_list
    G_list = []
    for i in range(len(controls)):
        G_list.append(baryc_proj(target, controls[i]))
    
    
    # Obtain optimal weights(to_optimze needs to be pre-defined)
    weights = get_optimal_weights(G_list)
    projection = weights[0]*G_list[0]
    for j in range(J-1):
        projection += weights[j+1]*G_list[j+1]
    
    
    return(weights, projection)
    

## Alternative Function

This different version does not require the globalization of variables. Also, only `get_condE` needs to be defined prior to this function. However, having key variables(i.e. individual transport maps, distance matrices) be only local may be incovenient when user wishes to observe or use them.

We may expect slightly improved efficiency due to the fact that this version calls one less function; however how efficient it will actually be is yet to be tested in higher dimensions - larger samples.

In [10]:
def DSCreplicationV2(target, controls):
    
    n = target.shape[0]
    d = target.shape[1]
    J = len(controls)
    
    
    # Barycentric Projection
    G_list = []
    for i in range(len(controls)):
        G_list.append(baryc_proj(target, controls[i]))
    
    
    # Function to optimize
    def to_optimize(lambdas):
                
        ans = []
        for i in range(J):
            temp = lambdas[i] * (G_list[i] - target)
            ans.append(sum(sum(temp**2)))
    
        return sum(ans) / n

    
    # Obtain optimal weights
    mylambda = cp.Variable(J)

    objective = cp.Minimize(to_optimize(mylambda))
    constraints = [mylambda >= 0, mylambda <= 1, cp.sum(mylambda) == 1]

    prob = cp.Problem(objective, constraints)
    prob.solve()

    
    weights = mylambda.value
    projection = weights[0]*G_list[0]
    for j in range(J-1):
        projection += weights[j+1]*G_list[j+1]
    
    
    return(weights, projection)

In [8]:
ts = t.time()

weights1, projection1 = DSCreplication(TARGET, controls)

print(t.time() - ts)
print(weights1)

0.7805440425872803
[0.95789009 0.00422008 0.03788983]


In [11]:
ts = t.time()

weights2, projection2 = DSCreplicationV2(TARGET, controls)

print(t.time() - ts)
print(weights2)

0.8278801441192627
[0.95789009 0.00422008 0.03788983]


# TESTING

## (1)

In [12]:
n1 = n2 = 100
dim = 2

covmat = np.zeros((dim, dim))
np.fill_diagonal(covmat, 1)

covmat2 = np.full((dim, dim), 0.3)
np.fill_diagonal(covmat2, 1)

covmat3 = np.full((dim, dim), 0.5)
np.fill_diagonal(covmat3, 1)

covmat4 = np.full((dim, dim), 0.8)
np.fill_diagonal(covmat4, 1)

a_ones, b_ones = np.ones((n1,)) / n1, np.ones((n2,)) / n2

control11 = random.multivariate_normal(mean = [-100, -100], cov = covmat4, size = n1)
control12 = random.multivariate_normal(mean = [20, 20], cov = covmat4, size = n1)
control13 = random.multivariate_normal(mean = [200]*dim, cov = covmat3, size = n1)
controls1 = [control11, control12, control13]

TARGET1 = random.multivariate_normal(mean = [0]*dim, cov = covmat, size = n1)


a_ones, b_ones = np.ones((n1,)) / n1, np.ones((n2,)) / n2

In [13]:
weights1, projection1 = DSCreplication(TARGET1, controls1)
weights1

array([1.71587074e-03, 9.97413574e-01, 8.70554808e-04])

## (2) - define mixed_multi_gauss in Utils section first

In [16]:
control21 = mixed_multi_gauss([10,10], [20,20], covmat3, covmat3, 100, 0.8)
control22 = mixed_multi_gauss([10,10], [100,100], covmat3, covmat3, 100, 0.8)
control23 = mixed_multi_gauss([80,80], [90,90], covmat3, covmat3, 100, 0.3)
controls2 = [control21, control22, control23]

TARGET2 = random.multivariate_normal(mean = [50]*dim, cov = covmat, size = 100)

In [17]:
weights2, projection2 = DSCreplicationV2(TARGET2, controls2)
weights2

array([0.33386639, 0.31570408, 0.35042952])

## (3) -- check this example; projections seem off

In [18]:
from sklearn.datasets import make_swiss_roll

roll, _ =  make_swiss_roll(n_samples=n1, noise=0.0, random_state=1)
roll = roll[:, 0:2]

roll2, _ =  make_swiss_roll(n_samples=n1, noise=3, random_state=10)
roll2 = roll2[:, 0:2]

TARGET3, _ = make_swiss_roll(n_samples=n1, noise=5, random_state=20)
TARGET3 = TARGET3[:, 0:2]


controls3 = [roll, roll2]

In [19]:
weights3, projection3 = DSCreplicationV2(TARGET3, controls3)
weights3

array([0.46744788, 0.53255212])

## Utils

In [15]:
def mixed_multi_gauss(mean1, mean2, cov1, cov2, samplesize, partition):
    
    size1 = int(samplesize * partition)
    size2 = int(samplesize - size1)
    
    gauss1 = random.multivariate_normal(mean = mean1, cov = cov1, size = size1)
    gauss2 = random.multivariate_normal(mean = mean2, cov = cov2, size = size2)
    
    mixed = np.concatenate((gauss1, gauss2), axis = 0)
    np.random.shuffle(mixed)
    
    return(mixed)


# TO DO

* Finalize naming of variables and functions
* Test for histograms and for different sizes of controls -- if needed, generalize
* Testing with real-life data(Medicaid, Image data)

### ARCHIVE : using variables defined in function for pre-defined functions outside of function(?)

In [None]:
def foo():
    b = a +100
    return(b)
    

In [None]:
def voo():
    a = 10
    
    #def foo():
    #    b = a +90
    #    return(b)
    
    c = foo()
    return(c)

In [None]:
voo()