## Example analysis and anonymization of a sensitive dataset

In [1]:
from pyaaas import AaaS
from pyaaas.privacy_models import KAnonymity, LDiversityDistinct
from pyaaas import AttributeType
from pyaaas import Dataset
import pandas as pd

ImportError: cannot import name 'AaaS' from 'pyaaas' (/home/lord/GIthub/PyAaaS/pyaaas/__init__.py)

#### Create ARXaaS connection

In [None]:
# Configure the root logger at INFO level so that INFO-and-above log messages are shown.
import logging
logging.basicConfig(level=logging.INFO)

In [2]:
aaas = AaaS("http://localhost") # client for the ARXaaS service at this URL (here: localhost, not a remote host)

NameError: name 'AaaS' is not defined

#### Fetch sensitive data

In [3]:
# Load the raw data from a semicolon-separated CSV file into a DataFrame.
data_df = pd.read_csv("../data/data2.csv", sep=";")

NameError: name 'pd' is not defined

In [9]:
# Keep only the first six rows, then display the truncated frame.
data_df = data_df.head(6)
data_df

Unnamed: 0,zipcode,age,salary,disease
0,47677,29,3,gastric ulcer
1,47602,22,4,gastritis
2,47678,27,5,stomach cancer
3,47905,43,6,gastritis
4,47909,52,11,flu
5,47906,47,8,bronchitis


### Create Dataset

In [10]:
# Build a Dataset from the pandas DataFrame, then show its summary
# (headers, rows and the attribute type / hierarchy of every field).
dataset = Dataset.from_pandas(data_df)
dataset.describe()

data:
  headers:
    ['zipcode', 'age', 'salary', 'disease']
rows:
    [47677, 29, 3, 'gastric ulcer']
    [47602, 22, 4, 'gastritis']
    [47678, 27, 5, 'stomach cancer']
    [47905, 43, 6, 'gastritis']
    [47909, 52, 11, 'flu']
    ...
attributes:
  field_name=zipcode, type=QUASIIDENTIFYING, hierarchy=None
  field_name=age, type=QUASIIDENTIFYING, hierarchy=None
  field_name=salary, type=QUASIIDENTIFYING, hierarchy=None
  field_name=disease, type=QUASIIDENTIFYING, hierarchy=None



### Set the AttributeType for the dataset fields

In [7]:
# Mark the salary field as IDENTIFYING; no other field's type is changed here.
dataset.set_attribute("salary", AttributeType.IDENTIFYING)

### Set Generalization Hierarchies
Note: if the hierarchy CSV file has no header row, pass `header=None` to `read_csv()`. Otherwise the first row is interpreted as a header, and ARXaaS will throw an exception because of the missing hierarchy data.

In [8]:
# All hierarchy files share the same layout: semicolon-separated, no header row.
hierarchy_read_options = {"sep": ";", "header": None}

zipcode_hierarchy = pd.read_csv(
    "../data/data2_zipcode_hierarchy.csv", **hierarchy_read_options
)
age_hierarchy = pd.read_csv(
    "../data/data2_age_hierarchy.csv", **hierarchy_read_options
)
disease_hierarchy = pd.read_csv(
    "../data/data2_disease_hierarchy.csv", **hierarchy_read_options
)

In [9]:
# Display the zipcode hierarchy as read from its CSV file.
zipcode_hierarchy

Unnamed: 0,0,1,2,3,4,5
0,47677,4767*,476**,47***,4****,*****
1,47602,4760*,476**,47***,4****,*****
2,47678,4767*,476**,47***,4****,*****
3,47905,4790*,479**,47***,4****,*****
4,47909,4790*,479**,47***,4****,*****
5,47906,4790*,479**,47***,4****,*****
6,47605,4760*,476**,47***,4****,*****
7,47673,4767*,476**,47***,4****,*****
8,47607,4760*,476**,47***,4****,*****


In [10]:
# Attach each generalization hierarchy to the dataset field it belongs to,
# in the order age, zipcode, disease.
hierarchy_by_field = {
    "age": age_hierarchy,
    "zipcode": zipcode_hierarchy,
    "disease": disease_hierarchy,
}
for field_name, field_hierarchy in hierarchy_by_field.items():
    dataset.set_hierarchy(field_name, field_hierarchy)

### Create Privacy Models

In [11]:
# k-anonymity privacy model built with the value 2
# (presumably the k parameter — confirm against the KAnonymity signature).
kanon = KAnonymity(2)

### Create Risk Profile

In [12]:
# Ask the service for a risk profile of the dataset as it is now, before anonymization.
risk_profile = aaas.risk_profile(dataset)

INFO:pyaaas.aaas_connector:Connecting to ARXaaS service


TypeError: 'RequestDefinitionBuilder' object is not callable

In [15]:
# Re-identification risk metrics of the dataset before anonymization.
risk_profile.re_identification_risk

{'Prosecutor_attacker_success_rate': '100.0',
 'records_affected_by_highest_prosecutor_risk': '100.0',
 'sample_uniques': '100.0',
 'estimated_prosecutor_risk': '100.0',
 'population_model': 'ZAYATZ',
 'highest_journalist_risk': '100.0',
 'records_affected_by_lowest_risk': '100.0',
 'estimated_marketer_risk': '100.0',
 'Journalist_attacker_success_rate': '100.0',
 'highest_prosecutor_risk': '100.0',
 'estimated_journalist_risk': '100.0',
 'lowest_risk': '100.0',
 'Marketer_attacker_success_rate': '100.0',
 'average_prosecutor_risk': '100.0',
 'records_affected_by_highest_journalist_risk': '100.0',
 'population_uniques': '100.0',
 'quasi_identifiers': '[zipcode, disease, age]'}

In [19]:
# First rows of the risk-distribution table for the dataset before anonymization.
risk_profile.distribution_of_risk_dataframe().head()

Unnamed: 0,interval,recordsWithMaxmalRiskWithinInterval,recordsWithRiskWithinInteval
0,"]50,100]",1.0,1.0
1,"]33.4,50]",0.0,0.0
2,"]25,33.4]",0.0,0.0
3,"]20,25]",0.0,0.0
4,"]16.7,20]",0.0,0.0


## Anonymize

In [20]:
# Anonymize the dataset through the service, passing the `kanon` privacy model
# as the only entry in the list of models.
anon_result = aaas.anonymize(dataset, [kanon])

In [21]:
# Display the dataset held by the anonymization result as a DataFrame.
anon_result.dataset.to_dataframe()

Unnamed: 0,zipcode,age,salary,disease
0,476**,"[20,30[",*,respiratory&digestive disease
1,476**,"[20,30[",*,respiratory&digestive disease
2,476**,"[20,30[",*,respiratory&digestive disease
3,479**,>=40,*,respiratory&digestive disease
4,479**,>=40,*,respiratory&digestive disease
5,479**,>=40,*,respiratory&digestive disease
6,476**,"[30,40[",*,respiratory&digestive disease
7,476**,"[30,40[",*,respiratory&digestive disease
8,476**,"[30,40[",*,respiratory&digestive disease


#### Anonymization Status

In [22]:
# Status reported with the anonymization result.
anon_result.anonymization_status

'ANONYMOUS'

#### RiskProfile for the anonymized dataset

In [23]:
# Risk profile that comes with the anonymization result.
anon_rp = anon_result.risk_profile

In [24]:
# Re-identification risk metrics taken from the anonymization result's risk profile.
anon_rp.re_identification_risk

{'Prosecutor_attacker_success_rate': '33.33333333333333',
 'records_affected_by_highest_prosecutor_risk': '100.0',
 'sample_uniques': '0.0',
 'estimated_prosecutor_risk': '33.33333333333333',
 'population_model': 'DANKAR',
 'highest_journalist_risk': '33.33333333333333',
 'records_affected_by_lowest_risk': '100.0',
 'estimated_marketer_risk': '33.33333333333333',
 'Journalist_attacker_success_rate': '33.33333333333333',
 'highest_prosecutor_risk': '33.33333333333333',
 'estimated_journalist_risk': '33.33333333333333',
 'lowest_risk': '33.33333333333333',
 'Marketer_attacker_success_rate': '33.33333333333333',
 'average_prosecutor_risk': '33.33333333333333',
 'records_affected_by_highest_journalist_risk': '100.0',
 'population_uniques': '0.0',
 'quasi_identifiers': '[zipcode, disease, age]'}

In [25]:
# First ten rows of the risk-distribution table from the anonymization result's risk profile.
anon_rp.distribution_of_risk_dataframe().head(10)

Unnamed: 0,interval,recordsWithMaxmalRiskWithinInterval,recordsWithRiskWithinInteval
0,"]50,100]",1.0,0.0
1,"]33.4,50]",1.0,0.0
2,"]25,33.4]",1.0,1.0
3,"]20,25]",0.0,0.0
4,"]16.7,20]",0.0,0.0
5,"]14.3,16.7]",0.0,0.0
6,"]12.5,14.3]",0.0,0.0
7,"]10,12.5]",0.0,0.0
8,"]9,10]",0.0,0.0
9,"]8,9]",0.0,0.0
