## Example analyzation and anonymization of sensitive dataset

In [51]:
from pyaaas import ARXaaS
from pyaaas.privacy_models import KAnonymity, LDiversityDistinct
from pyaaas import AttributeType
from pyaaas import Dataset
from pyaaas.models.hierarchy import IntervalHierarchyBuilder, RedactionHierarchyBuilder, OrderHierarchyBuilder
import pandas as pd

#### Create ARXaaS connection

In [52]:
aaas = ARXaaS("http://localhost:8080/") # connecting to online service

#### fetch sensitive data

In [53]:
data_df = pd.read_csv("../data/data2.csv", sep=";")

In [54]:
data_df

Unnamed: 0,zipcode,age,salary,disease
0,47677,29,3,gastric ulcer
1,47602,22,4,gastritis
2,47678,27,5,stomach cancer
3,47905,43,6,gastritis
4,47909,52,11,flu
5,47906,47,8,bronchitis
6,47605,30,7,bronchitis
7,47673,36,9,pneumonia
8,47607,32,10,stomach cancer


### Create Dataset

In [55]:
dataset = Dataset.from_pandas(data_df)

### Set the AttributeType for the dataset fields

In [56]:
dataset.set_attribute_type(AttributeType.IDENTIFYING,'salary'  )

### Generate hierarchy

### Set Generalization Hierarchies
Note that if the hierarchy does not have a header row in the csv file, please set header=None in read_csv() or the first row will be interpreted as a header and ARXaaS will throw an exception for the missing hierarchy data.

In [57]:
zipcode_hierarchy = pd.read_csv("../data/data2_zipcode_hierarchy.csv", sep=";", header=None)
age_hierarchy = pd.read_csv("../data/data2_age_hierarchy.csv", sep=";", header=None)
disease_hierarchy = pd.read_csv("../data/data2_disease_hierarchy.csv", sep=";", header=None)

In [58]:
zipcode_hierarchy

Unnamed: 0,0,1,2,3,4,5
0,47677,4767*,476**,47***,4****,*****
1,47602,4760*,476**,47***,4****,*****
2,47678,4767*,476**,47***,4****,*****
3,47905,4790*,479**,47***,4****,*****
4,47909,4790*,479**,47***,4****,*****
5,47906,4790*,479**,47***,4****,*****
6,47605,4760*,476**,47***,4****,*****
7,47673,4767*,476**,47***,4****,*****
8,47607,4760*,476**,47***,4****,*****


In [59]:
dataset.set_hierarchy('age', age_hierarchy)
dataset.set_hierarchy("zipcode", zipcode_hierarchy)
dataset.set_hierarchy("disease", disease_hierarchy)

### Create Privacy Models

In [60]:
kanon = KAnonymity(4)

### Create Risk Profile

In [61]:
risk_profile = aaas.risk_profile(dataset)

In [62]:
risk_profile.re_identification_risk

{'Prosecutor_attacker_success_rate': '100.0',
 'records_affected_by_highest_prosecutor_risk': '100.0',
 'sample_uniques': '100.0',
 'estimated_prosecutor_risk': '100.0',
 'population_model': 'ZAYATZ',
 'highest_journalist_risk': '100.0',
 'records_affected_by_lowest_risk': '100.0',
 'estimated_marketer_risk': '100.0',
 'Journalist_attacker_success_rate': '100.0',
 'highest_prosecutor_risk': '100.0',
 'estimated_journalist_risk': '100.0',
 'lowest_risk': '100.0',
 'Marketer_attacker_success_rate': '100.0',
 'average_prosecutor_risk': '100.0',
 'records_affected_by_highest_journalist_risk': '100.0',
 'population_uniques': '100.0',
 'quasi_identifiers': '[zipcode, disease, age]'}

In [63]:
risk_profile.distribution_of_risk_dataframe().head()

Unnamed: 0,interval,recordsWithMaxmalRiskWithinInterval,recordsWithRiskWithinInteval
0,"]50,100]",1.0,1.0
1,"]33.4,50]",0.0,0.0
2,"]25,33.4]",0.0,0.0
3,"]20,25]",0.0,0.0
4,"]16.7,20]",0.0,0.0


## Anonymize

In [64]:
anon_result = aaas.anonymize(dataset, [kanon])

RequestException: {"timestamp":"2019-04-23T10:58:59.503+0000","message":"Attribute 'age': hierarchy misses some values or contains duplicates","details":"uri=/api/anonymize"}

In [65]:
anon_result.dataset.to_dataframe()

NameError: name 'anon_result' is not defined

#### Anonymization Status

In [66]:
anon_result.anonymization_status

NameError: name 'anon_result' is not defined

#### RiskProfile for the anonymized dataset

In [17]:
anon_rp = anon_result.risk_profile

NameError: name 'anon_result' is not defined

In [18]:
anon_rp.re_identification_risk

NameError: name 'anon_rp' is not defined

In [19]:
anon_rp.distribution_of_risk_dataframe().head(10)

NameError: name 'anon_rp' is not defined