In [None]:
# data_processing.py

def centre_data(data):
    '''
       Requires:
           data           - Object providing ".mean()" and subtraction,
                            e.g. a Pandas DataFrame or numpy array

       Returns:
           result         - "data" with the value of "data.mean()"
                            subtracted from it
    '''
    offset = data.mean()
    centred = data - offset
    return centred

def scale_data(data, target_bounds, data_bounds=None):
    '''
       Linearly rescale columns of 'data' from their data bounds onto
       target bounds (min-max scaling).

       Requires:
           data           - Pandas DataFrame of values convertible to float
           target_bounds  - Pair (lower, upper) applied to every column
                            (tuple, list or any two-item sequence), or
                            Dictionary: column label -> (lower, upper)
           data_bounds    - Dictionary: column label -> (lower, upper)
                            bounds of the unscaled column. Defaults to the
                            observed (min, max) of every column. A column
                            without an entry is copied through unscaled
                            and a warning is printed.

       Returns:
           result         - Copy of 'data' with the bounded columns scaled
                            to 'target_bounds'; 'data' is not modified

       Raises:
           KeyError       - 'target_bounds' is a dictionary with no entry
                            for a column that is being scaled
    '''
    per_column_targets = isinstance(target_bounds, dict)
    if not per_column_targets:
        # Unpack once up front so a malformed pair fails here rather than
        # as an unbound name deep inside the loop.
        min_target, max_target = target_bounds

    result = data.copy()

    if data_bounds is None:
        data_bounds = {key: (data[key].min(), data[key].max())
                       for key in data.columns}

    for key in data.columns:
        if key not in data_bounds:
            print('Warning: scale_data: column "{c}" is not present in '
                  'data_bounds with keys: \n{col}'.format(
                      c=key, col=list(data_bounds)))
            continue

        min_data, max_data = data_bounds[key]
        if per_column_targets:
            min_target, max_target = target_bounds[key]

        data_span = max_data - min_data
        if data_span == 0:
            # Zero-width data range: treat the width as 1 so a constant
            # column lands on the lower target bound instead of dividing
            # by zero.
            data_span = 1

        column = data[key].astype(float)
        result[key] = (min_target +
                       ((max_target - min_target) * (column - min_data)) /
                       data_span)

    return result




def centered_sample_covariance_matrix(X):
    '''
        Requires
            X        - dataset: 2-D array-like exposing ".shape", one
                       observation per row (assumed already zero mean;
                       no centring is done here)

        Returns      - zero mean covariance estimation query,
                       ie X.shape[0]^-1 * transpose(X)X

        Raises
            ValueError - X has no rows
    '''
    n_rows = X.shape[0]
    if n_rows == 0:
        raise ValueError(
            'centered_sample_covariance_matrix: X has no rows')

    X_t = numpy.transpose(X)
    try:
        gram = numpy.matmul(X_t, X)
    except (TypeError, ValueError):
        # Containers that numpy.matmul rejects may still provide their
        # own ".dot"; any error from that path is left to propagate.
        gram = X_t.dot(X)

    return pow(n_rows, -1) * gram





In [None]:
#  model_evaluation.py

# Metric for Covariance estimation: Residual Sum Squares of 'total variance captured'
def principle_component_RSS(true, pred):
    '''
        Requires
            true     - square matrix (reference covariance)
            pred     - square matrix of the same shape (estimated covariance)

        Returns      - sum of squared differences between the eigenvalues
                       of 'true' and 'pred', paired by rank after sorting
    '''
    # The eigen-solver gives no ordering guarantee, so both spectra are
    # sorted before being paired; otherwise two matrices with identical
    # eigenvalues could score a non-zero distance.
    true_eigenvals = numpy.sort(numpy.linalg.eigvals(true))
    pred_eigenvals = numpy.sort(numpy.linalg.eigvals(pred))
    return numpy.sum(pow(numpy.subtract(true_eigenvals, pred_eigenvals), 2))

In [None]:
# differential_privacy_mechanisms.py

def gaussian_mechanism(x, scale):
    '''
       Requires
           x       - scalar (Python or numpy) or array-like of numbers
           scale   - standard deviation of the zero mean Gaussian noise

       Returns     - 'x' plus Gaussian noise: one draw for a scalar 'x',
                     otherwise one independent draw per element, with
                     the noise array shaped like 'x'
    '''
    if numpy.ndim(x) == 0:
        # Covers Python numbers and numpy scalars alike.
        return x + numpy.random.normal(loc=0, scale=scale)

    # Use the full shape, not len(x), so multi-dimensional input gets its
    # own draw for every element rather than one draw per row.
    return x + numpy.random.normal(loc=0, scale=scale, size=numpy.shape(x))

def gaussian_mechanism_matrix_sample(data,
                                     epsilon,
                                     delta,
                                     sensitivity,
                                     symmetric=False,
                                     verbose=False):
    '''
       Add zero mean Gaussian noise to every entry of a 2-D matrix.

       Requires
           data         - 2-D numeric array exposing ".shape"
           epsilon      - total epsilon, divided evenly over the entries
                          that receive their own noise draw
           delta        - delta used in the noise scale
                          sqrt(2*ln(1.25/delta)) * sensitivity / epsilon
           sensitivity  - sensitivity used in the noise scale
           symmetric    - Bool: when True and 'data' is square, noise is
                          drawn for the upper triangle (diagonal included)
                          only and the noisy upper triangle is mirrored
                          into the lower triangle
           verbose      - Bool: print the noise scale and parameters

       Returns          - numpy array of floats shaped like 'data'
    '''
    # Double precision is enough for the scale, and a plain float can be
    # handed straight to numpy.
    import math

    n, m = data.shape
    values = numpy.asarray(data, dtype=float)

    # Mirroring only makes sense for a square matrix; otherwise every
    # entry gets its own draw and its own share of epsilon.
    mirror = bool(symmetric) and n == m

    if mirror:
        # n(n+1)/2 independently perturbed entries.
        adjusted_epsilon = 2 * epsilon / (pow(n, 2) + n)
    else:
        # n*m independently perturbed entries (n**2 when square).
        adjusted_epsilon = epsilon / (n * m)

    gaussian_mechanism_scale = (
        math.sqrt(2 * math.log(1.25 / delta)) * sensitivity
        ) / adjusted_epsilon

    if verbose:
        print('\nAdding Guassian N(0,{std}) noise to matrix shape {s}'.format(
            s=data.shape,
            std=round(gaussian_mechanism_scale, 4))
        )
        print('with differential privacy parameters')
        print('total epsilon\t{te}\nepsilon\t\t{e}\ndelta\t\t{d}\nsensitivity\t{s}\n'.format(
            te=epsilon,
            e=adjusted_epsilon,
            d=round(delta, 6),
            s=sensitivity)
        )

    if mirror:
        rows, cols = numpy.triu_indices(n)
        data_dp_sample = numpy.zeros(data.shape)
        data_dp_sample[rows, cols] = values[rows, cols] + numpy.random.normal(
            loc=0, scale=gaussian_mechanism_scale, size=rows.size)
        # Copy the noisy upper triangle into the lower triangle.
        data_dp_sample[cols, rows] = data_dp_sample[rows, cols]
    else:
        data_dp_sample = values + numpy.random.normal(
            loc=0, scale=gaussian_mechanism_scale, size=values.shape)

    return data_dp_sample


# Establish parameters: gaussian_sensitivity, MVG_sensitivity
def centered_covariance_query_sensitivity(n, m, c):
    '''
       Sensitivity calculation for the zero mean covariance estimation
       query, ie f(X) = n^-1 * transpose(X)X

       Source:
           'A Differential Privacy Mechanism Design Under Matrix-Valued Query'
           Chanyaswad, Dytso, Poor & Mittal 2018, p.18.:
           https://arxiv.org/abs/1802.10077 (accessed 16/12/2018)

       Requires
           n   - divisor of query function
           m   - number of unit values / observations to be varied under
                 adjacency definition
           c   - maximum possible value in range of single observation

       Returns - (2 * m * c^2) / n
    '''
    squared_bound = c ** 2
    numerator = 2 * m * squared_bound
    return numerator / n
