In [1]:
import sys
# Add the parent directory to the module search path; presumably this is what
# lets the project packages imported in the next cell be found - verify the
# notebook is run from a sub-directory of the repository.
sys.path.append('..')

In [2]:
from similarity_check.SimilarityCheck import *
from synthetic_data_generation.generator import *
from utils import *
# Import libraries
from privacy_check import PrivacyCheck

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Path to the dataset you want to test
path_test_data = "../Subsample_training.csv"

# Bookkeeping columns (row index / record id) that are not described in the
# metadata below and must not be part of the privacy comparison.
id_cols = ["Unnamed: 0", "Id"]

# Indicate which columns are categorical, and which are sensitive
cat_cols = ['Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE', 'Risk_Flag']
# NOTE(review): none of these names match a column of the dataset loaded below
# (it has 'CITY', not 'city', and no name/email/ip columns) - confirm whether
# this list was carried over from another dataset and how Generator treats
# names that are absent from the data.
sensitive_cols = ["first_name", "last_name", "email", "gender", "ip_address", "nationality", "city"]

# Column metadata handed to PrivacyCheck: one entry per modelled column.
my_metadata = {
    'fields':
        {
            'Income': {'type': 'numerical', 'subtype': 'integer'},
            'Age': {'type': 'numerical', 'subtype': 'integer'},
            'Experience': {'type': 'numerical', 'subtype': 'integer'},
            'CURRENT_JOB_YRS': {'type': 'numerical', 'subtype': 'integer'},
            'CURRENT_HOUSE_YRS': {'type': 'numerical', 'subtype': 'integer'},
            'Married/Single': {'type': 'categorical'},
            'House_Ownership': {'type': 'categorical'},
            'Car_Ownership': {'type': 'categorical'},
            'Profession': {'type': 'categorical'},
            'CITY': {'type': 'categorical'},
            'STATE': {'type': 'categorical'},
            'Risk_Flag': {'type': 'boolean'}
        },
    'constraints': [],
    'model_kwargs': {},
    'name': None,
    'primary_key': None,
    'sequence_index': None,
    'entity_columns': [],
    'context_columns': []
}

data = get_data(path_test_data)

# Inject one missing value to check that the pipeline can deal with NaNs.
# The cell is addressed by column name rather than by a hard-coded column
# position, so it keeps hitting 'Income' even if the CSV column order changes.
data.iloc[3, data.columns.get_loc("Income")] = float("nan")
print(data.head())

# Create the generator object (a single epoch and 100 samples keep the demo fast)
generator = Generator(num_epochs = 1, n_samples=100, architecture='CTGAN',
                        data=data,
                        categorical_columns=cat_cols,
                        sensitive_columns=sensitive_cols)
print("Generating data")
# Drop the bookkeeping columns by name instead of slicing off the first two
# positions, so the result does not depend on column order.
# Assumes the generator output keeps the input column names - TODO confirm.
synth_data = generator.generate().drop(columns=id_cols)

   Unnamed: 0      Id     Income  Age  Experience Married/Single  \
0      189701  189702  2201763.0   70          15         single   
1      176485  176486  7159687.0   38           5         single   
2       27140   27141  4539480.0   46           3         single   
3       95936   95937        NaN   42          16         single   
4       82608   82609  6501844.0   79          14         single   

  House_Ownership Car_Ownership           Profession       CITY  \
0          rented            no              Dentist      Kulti   
1          rented            no    Computer_operator    Jalgaon   
2          rented           yes     Fashion_Designer  Eluru[25]   
3          rented            no  Biomedical_Engineer      Bhind   
4          rented           yes             Engineer     Bhopal   

            STATE  CURRENT_JOB_YRS  CURRENT_HOUSE_YRS  Risk_Flag  
0     West_Bengal               12                 14          0  
1     Maharashtra                3                 11 

In [15]:
# Preview the generated synthetic sample
synth_data

Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,3400262.0,56,16,married,norent_noown,yes,Physician,Gudivada,Mizoram,4,13,1
1,4302382.0,39,9,married,owned,yes,Physician,Mehsana,West_Bengal,7,10,1
2,4474086.0,35,14,single,owned,yes,Technician,Kurnool[18],Jammu_and_Kashmir,4,13,1
3,988883.0,46,16,single,rented,no,Computer_hardware_engineer,"Khora,_Ghaziabad",Uttar_Pradesh,2,10,0
4,8706997.0,74,10,married,owned,no,Design_Engineer,Vasai-Virar,Uttar_Pradesh,5,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,2137524.0,29,7,single,owned,no,Official,Ratlam,Chandigarh,10,11,1
96,724711.0,38,7,single,norent_noown,yes,Scientist,Bardhaman,Bihar,4,14,1
97,7931841.0,57,15,single,owned,no,Graphic_Designer,Bhubaneswar,Chandigarh,6,12,1
98,8669889.0,51,8,single,rented,yes,Web_designer,Alappuzha,Jharkhand,7,12,1


In [16]:
# Real data without the bookkeeping columns, so that it can be compared
# column-for-column with the synthetic data.
real_data = data.drop(columns=["Unnamed: 0", "Id"])

In [25]:
# Create the privacy checker from the real data, the synthetic data and the
# column metadata
privacy = PrivacyCheck(real_data, synth_data, my_metadata)

### SDV Diagnostic Report

In [26]:
# Build the diagnostic report and print its summary of successes and warnings
privacy.generate_report()

Creating report: 100%|██████████| 4/4 [00:00<00:00, 14.11it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data





In [27]:
# Show the "Coverage" visualization (presumably a property of the diagnostic
# report generated earlier - TODO confirm)
privacy.get_visualization("Coverage")

In [29]:
# Show the "Boundaries" visualization (presumably a property of the diagnostic
# report generated earlier - TODO confirm)
privacy.get_visualization("Boundaries")

In [28]:
# Show the "Synthesis" visualization (presumably a property of the diagnostic
# report generated earlier - TODO confirm)
privacy.get_visualization("Synthesis")

## Our privacy score

The Synthesis score above is **not satisfactory**: it only checks whether rows of the synthetic dataset appear **exactly** in the real data. For instance, even if the only difference between a synthetic row and a real row is an income that differs by 1 (a negligible amount), the synthetic row is still considered novel.

#### **Privacy score definition:** 

*Privacy score*

For every row in the synthetic dataset, we find its nearest neighbour in the original (real) dataset. We then take the synthetic row that lies closest to any real row and denote that distance by $D$. The privacy score is $S_{privacy} = 1 - D$. How the distance between rows is computed is crucial; it works as follows:
1. Normalize numerical columns with real mean/sd
2. For every synthetic row, we:
    - Compute the distance to every real row
    - Find the real row with the smallest distance -> found nearest neighbour for that row
3. Let $D$ be the smallest of these nearest-neighbour distances over all synthetic rows; the privacy score is then $1 - D$

*Distance measure*

For two rows $x$ and $y$ with identical lengths and columns, we compute the average element-wise distance. How we compute the distance between two elements of the same column depends on the column type: numerical or string.
- **Numerical columns**: Since the columns are normalized, for every element at position $j$ we compute the area under the standard normal pdf between the two points, i.e. the absolute difference of the standard normal CDF $\Phi$ evaluated at the two values.
$$
d_{num, j} = |\mathbf{\Phi}(x_j) - \mathbf{\Phi}(y_j)|
$$ 
- **Non-numerical columns**: If the two elements $x_k$ and $y_k$ are equal the distance is 0 and if they are not, the distance is 1.
$$
d_{cat, k} =
\begin{cases}
0, & \text{if } x_k = y_k \\
1, & \text{otherwise}
\end{cases}
$$

The distance $dist_{xy}$ between the two rows is:
$$
dist_{xy} = \frac{1}{n}\left(\sum_{j \in Num\_cols} d_{num, j} + \sum_{k \in NonNum\_cols} d_{cat, k}\right)
$$ 
where $n$ is the number of columns in a row.

In [30]:
# Compute the privacy score. The progress bar advances once per synthetic row;
# the recorded run took about one minute for the 100 synthetic rows.
privacy.generate_privacy_score()

Computing privacy score: 100%|██████████| 100/100 [01:02<00:00,  1.61it/s]


In [21]:
# Print the privacy score and the top 3 closest synthetic/real pairs, and keep
# the two returned values for further use (presumably the score and the pairs
# shown in the printout - TODO confirm their exact types)
score, pairs = privacy.get_privacy_score()

############ SCORE ############
Privacy score:  85.59%
############ TOP 3 CLOSEST PAIRS ############
1. Closest pair with distance:  0.1441


|                   | Synthetic obs. (idx: 3)    | Closest real obs. (idx: 141)   |
|:------------------|:---------------------------|:-------------------------------|
| Income            | 988883.0                   | 5028396.0                      |
| Age               | 46                         | 38                             |
| Experience        | 16                         | 16                             |
| Married/Single    | single                     | single                         |
| House_Ownership   | rented                     | rented                         |
| Car_Ownership     | no                         | no                             |
| Profession        | Computer_hardware_engineer | Flight_attendant               |
| CITY              | Khora,_Ghaziabad           | Khora,_Ghaziabad               |
| STATE             | Uttar_Pradesh              | Uttar_Pradesh                  |
| CURRENT_JOB_YRS   | 2                          | 4                              |
| CURRENT_HOUSE_YRS | 10                         | 10                             |
| Risk_Flag         | 0                          | 0                              |

2. Closest pair with distance:  0.1684


|                   | Synthetic obs. (idx: 87)   | Closest real obs. (idx: 687)   |
|:------------------|:---------------------------|:-------------------------------|
| Income            | 3357458.0                  | 3553290.0                      |
| Age               | 79                         | 50                             |
| Experience        | 0                          | 8                              |
| Married/Single    | single                     | single                         |
| House_Ownership   | rented                     | rented                         |
| Car_Ownership     | no                         | no                             |
| Profession        | Biomedical_Engineer        | Technician                     |
| CITY              | Phagwara                   | Phagwara                       |
| STATE             | Punjab                     | Punjab                         |
| CURRENT_JOB_YRS   | 6                          | 8                              |
| CURRENT_HOUSE_YRS | 11                         | 11                             |
| Risk_Flag         | 0                          | 0                              |

3. Closest pair with distance:  0.1911


|                   | Synthetic obs. (idx: 10)   | Closest real obs. (idx: 1989)   |
|:------------------|:---------------------------|:--------------------------------|
| Income            | 7217948.0                  | 8344782.0                       |
| Age               | 77                         | 77                              |
| Experience        | 14                         | 16                              |
| Married/Single    | single                     | single                          |
| House_Ownership   | rented                     | rented                          |
| Car_Ownership     | no                         | no                              |
| Profession        | Scientist                  | Scientist                       |
| CITY              | Madanapalle                | Rohtak                          |
| STATE             | Chandigarh                 | Haryana                         |
| CURRENT_JOB_YRS   | 10                         | 12                              |
| CURRENT_HOUSE_YRS | 10                         | 10                              |
| Risk_Flag         | 0                          | 0                               |