In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import pandas as pd
from scipy.optimize import linprog
%matplotlib inline

# Useful code from lecture 1

In [None]:
# Linear regression
def rec_lin_reg(Q, r):
    """Reconstruct a 0/1 vector from query answers via least squares.

    Solves ``Q x = r`` in the least-squares sense with ``np.linalg.lstsq``,
    clamps every coordinate of the solution to the interval [0, 1] and then
    rounds it with ``np.rint``.

    Parameters
    ----------
    Q : array_like
        Coefficient matrix handed to ``np.linalg.lstsq``.
    r : array_like
        Right-hand side handed to ``np.linalg.lstsq``.

    Returns
    -------
    numpy.ndarray
        The clipped and rounded least-squares solution.
    """
    # lstsq returns (solution, residuals, rank, singular values); only the
    # solution is needed here.
    solution, *_ = np.linalg.lstsq(Q, r, rcond=None)
    return np.rint(np.clip(solution, 0, 1))

In [None]:
# --- Experiment parameters ---
n = 100                # number of individuals in the data set
m = 5*n                # number of subset queries
num_iterations = 30    # simulations averaged per noise level
p = 0.5                # probability that a data point is 1

Error = []  # not used within this cell
lin_reg_error_list = []  # mean linear-regression error, one entry per noise level
noise_range = range(0, 10)
for noise_scale in noise_range:
    lin_reg_error = []
    lin_prog_error = []  # not filled in within this cell
    for sim in range(num_iterations):
        # Random binary data set: each entry is 1 with probability p
        data = np.random.choice([0, 1], size=(n, 1), p=[1-p, p])
        # Random 0/1 query matrix: row i selects the subset for query i
        Q = np.random.choice([0, 1], size=(m, n))
        # Integer noise drawn uniformly from -noise_scale..noise_scale
        noise = np.random.choice(range(-noise_scale, noise_scale+1), size=(m, 1))
        r = Q @ data + noise
        rec_reg = rec_lin_reg(Q, r)
        # Fraction of entries that were reconstructed incorrectly
        lin_reg_error.append(np.mean(np.abs(rec_reg - data)))
    lin_reg_error_list.append(np.mean(lin_reg_error))

# Average reconstruction error against the noise scale
plt.plot(noise_range, lin_reg_error_list, label='Linear regression')
plt.title('Reconstruction error as a function of noise range')
plt.xlabel('Noise range')
plt.ylabel('Reconstruction Error')
plt.legend()
plt.show()

# Linear programming example

Linear programming solves problems of the following form:

$~~~~~~~~~~~~~~~~~~~~ \underset{x}{\min} c^{T} x$

subject to: $A_{ub} x \le b_{ub}$

$~~~~~~~~~~~~~~~~~ l \le x \le u$

In [None]:
# Minimize 3x - 2y + 4z
# subject to:
#   x + y + z <= 5
#   x - 2y + 4z >= 2
#   0 <= x <= 4, 0 <= y <= 3, z >= 0
#
# linprog only accepts "<=" inequality constraints, so the ">=" constraint is
# multiplied by -1 and passed as  -x + 2y - 4z <= -2.
c = np.array([3, -2, 4])
A_ub = np.array([
    [1, 1, 1],
    [-1, 2, -4],
])
b_ub = np.array([5, -2])
# One (min, max) pair per variable; None means "no bound on that side".
bounds = [(0, 4), (0, 3), (0, None)]
res = linprog(c, A_ub=A_ub, b_ub=b_ub, bounds=bounds)
print(res)

# Linear programming reconstruction

In [None]:
def linear_programming(subsets, responses):
    """Placeholder for the linear-programming reconstruction.

    Not implemented yet: both arguments are ignored and the constant ``0`` is
    returned.

    NOTE(review): presumably ``subsets`` is the query matrix and ``responses``
    the (noisy) query answers, mirroring ``rec_lin_reg(Q, r)`` - confirm.
    """
    # TODO: replace this stub with the actual reconstruction.
    return 0


# Census data

In [None]:
# Load the census data from a CSV file (path is relative to the working directory)
df = pd.read_csv('Census data.csv')
# Preview the first rows. A bare `df.head()` is only rendered when it is the
# last expression of a cell, so it has to be printed explicitly here.
print(df.head())

# Get the list of column names
col_names = df.columns.tolist()
print(col_names)

# Histogram of the 'age' column
df.hist(column='age')
plt.show()


# Basic anonymization techniques

In [None]:
# Simulation parameters
num_trials = 10000   # number of independently drawn data sets
num_samples = 500    # number of data points in each data set

# Exact response of each trial: the mean of num_samples draws from Uniform(0, 1)
exact_response_arr = np.zeros(num_trials)
for i in range(num_trials):
    data = np.random.uniform(0, 1, num_samples)
    exact_response_arr[i] = np.mean(data)

# Count how many distinct values the exact response takes across all trials
exact_unique_values = np.unique(exact_response_arr)
print('Number of unique exact responses', exact_unique_values.size)

# Plot a histogram of the exact responses (no rounded output is plotted in this cell)
plt.hist(exact_response_arr, bins=100, alpha=0.5, label='Exact response')
plt.legend(loc='upper right')
plt.xlabel('Response')
plt.ylabel('frequency')
plt.show()
