## Example analysis and anonymization of a sensitive dataset

In [81]:
from pyaaas import ARXaaS
from pyaaas.privacy_models import KAnonymity, LDiversityDistinct
from pyaaas.models.attribute_type import AttributeType
from pyaaas.models.dataset import Dataset
import pandas as pd

#### Create ARXaaS connection

In [82]:
# Emit INFO-level log records so that library log messages (such as the
# pyaaas connection status) are shown in the notebook output.
import logging
logging.basicConfig(level=logging.INFO)

In [83]:
aaas = ARXaaS('http://localhost:8080/') # connect to the ARXaaS service listening on localhost, port 8080

INFO:pyaaas.aaas_connector:Connected to url=http://localhost:8080/ status=200


#### Fetch sensitive data

In [84]:
# Load the raw records; the file uses ';' as its column separator.
data_df = pd.read_csv("../data/data2.csv", sep=";")

In [85]:
# Keep only the first six records so the example stays small, then show them.
data_df = data_df.head(6)
data_df

Unnamed: 0,zipcode,age,salary,disease
0,47677,29,3,gastric ulcer
1,47602,22,4,gastritis
2,47678,27,5,stomach cancer
3,47905,43,6,gastritis
4,47909,52,11,flu
5,47906,47,8,bronchitis


### Create Dataset

In [86]:
# Wrap the DataFrame in a pyaaas Dataset. describe() prints the headers, the
# first rows, and the type/hierarchy of every attribute; as the output shows,
# each column starts out as QUASIIDENTIFYING with no hierarchy.
dataset = Dataset.from_pandas(data_df)
dataset.describe()

data:
  headers:
    ['zipcode', 'age', 'salary', 'disease']
rows:
    [47677, 29, 3, 'gastric ulcer']
    [47602, 22, 4, 'gastritis']
    [47678, 27, 5, 'stomach cancer']
    [47905, 43, 6, 'gastritis']
    [47909, 52, 11, 'flu']
    ...
attributes:
  field_name=zipcode, type=QUASIIDENTIFYING, hierarchy=None
  field_name=age, type=QUASIIDENTIFYING, hierarchy=None
  field_name=salary, type=QUASIIDENTIFYING, hierarchy=None
  field_name=disease, type=QUASIIDENTIFYING, hierarchy=None



### Set the AttributeType for the dataset fields

In [87]:
# Mark 'salary' as IDENTIFYING; the remaining columns keep their
# QUASIIDENTIFYING type.
dataset.set_attribute_type(AttributeType.IDENTIFYING, 'salary')

### Set Generalization Hierarchies
Note: if a hierarchy CSV file does not have a header row, pass `header=None` to `read_csv()`. Otherwise the first row is interpreted as a header, and ARXaaS will throw an exception because of the missing hierarchy data.

In [88]:
# The hierarchy files are ';'-separated and have no header row, so
# header=None keeps their first row as data.
hierarchy_read_options = {"sep": ";", "header": None}
zipcode_hierarchy = pd.read_csv("../data/data2_zipcode_hierarchy.csv", **hierarchy_read_options)
age_hierarchy = pd.read_csv("../data/data2_age_hierarchy.csv", **hierarchy_read_options)
disease_hierarchy = pd.read_csv("../data/data2_disease_hierarchy.csv", **hierarchy_read_options)

In [89]:
# Inspect the age hierarchy: column 0 holds the original value, column 1 its generalization.
age_hierarchy

Unnamed: 0,0,1
0,22,"[22, 53["
1,27,"[22, 53["
2,29,"[22, 53["
3,30,"[22, 53["
4,32,"[22, 53["
5,36,"[22, 53["
6,43,"[22, 53["
7,47,"[22, 53["
8,52,"[22, 53["


In [90]:
# Register one generalization hierarchy per quasi-identifying column.
hierarchies_by_field = (
    ("age", age_hierarchy),
    ("zipcode", zipcode_hierarchy),
    ("disease", disease_hierarchy),
)
for field_name, field_hierarchy in hierarchies_by_field:
    dataset.set_hierarchy(field_name, field_hierarchy)

### Create Privacy Models

In [91]:
# k-anonymity privacy model. NOTE(review): the argument 2 is presumably k,
# the minimum number of records sharing the same quasi-identifier values;
# confirm against the pyaaas documentation.
kanon = KAnonymity(2)

### Create Risk Profile

In [92]:
# Request a risk profile for the dataset before any anonymization, via the ARXaaS connection.
risk_profile = aaas.risk_profile(dataset)

In [93]:
# Re-identification risk metrics of the original data; note the values are returned as strings.
risk_profile.re_identification_risk

{'Prosecutor_attacker_success_rate': '100.0',
 'records_affected_by_highest_prosecutor_risk': '100.0',
 'sample_uniques': '100.0',
 'estimated_prosecutor_risk': '100.0',
 'population_model': 'ZAYATZ',
 'highest_journalist_risk': '100.0',
 'records_affected_by_lowest_risk': '100.0',
 'estimated_marketer_risk': '100.0',
 'Journalist_attacker_success_rate': '100.0',
 'highest_prosecutor_risk': '100.0',
 'estimated_journalist_risk': '100.0',
 'lowest_risk': '100.0',
 'Marketer_attacker_success_rate': '100.0',
 'average_prosecutor_risk': '100.0',
 'records_affected_by_highest_journalist_risk': '100.0',
 'population_uniques': '100.0',
 'quasi_identifiers': '[zipcode, disease, age]'}

In [94]:
# Risk distribution as a DataFrame; head() limits the display to the first five intervals.
risk_profile.distribution_of_risk_dataframe().head()

Unnamed: 0,interval,recordsWithMaxmalRiskWithinInterval,recordsWithRiskWithinInteval
0,"]50,100]",1.0,1.0
1,"]33.4,50]",0.0,0.0
2,"]25,33.4]",0.0,0.0
3,"]20,25]",0.0,0.0
4,"]16.7,20]",0.0,0.0


## Anonymize

In [95]:
# Anonymize the dataset with the k-anonymity model; privacy models are passed as a list.
anon_result = aaas.anonymize(dataset, [kanon])

In [96]:
# Convert the anonymized dataset to a pandas DataFrame for display.
anon_result.dataset.to_dataframe()

Unnamed: 0,zipcode,age,salary,disease
0,47***,"[22, 53[",*,stomach disease
1,47***,"[22, 53[",*,stomach disease
2,47***,"[22, 53[",*,stomach disease
3,47***,"[22, 53[",*,stomach disease
4,47***,"[22, 53[",*,respiratory infection
5,47***,"[22, 53[",*,respiratory infection


#### Anonymization Status

In [97]:
# Status string reported for the anonymization result.
anon_result.anonymization_status

'ANONYMOUS'

#### RiskProfile for the anonymized dataset

In [98]:
# The risk profile of the anonymized data is read from the result object
# instead of calling aaas.risk_profile() again.
anon_rp = anon_result.risk_profile

In [99]:
# Re-identification risk metrics after anonymization, for comparison with the original values.
anon_rp.re_identification_risk

{'Prosecutor_attacker_success_rate': '33.33333333333333',
 'records_affected_by_highest_prosecutor_risk': '33.33333333333333',
 'sample_uniques': '0.0',
 'estimated_prosecutor_risk': '50.0',
 'population_model': 'DANKAR',
 'highest_journalist_risk': '50.0',
 'records_affected_by_lowest_risk': '66.66666666666666',
 'estimated_marketer_risk': '33.33333333333333',
 'Journalist_attacker_success_rate': '33.33333333333333',
 'highest_prosecutor_risk': '50.0',
 'estimated_journalist_risk': '50.0',
 'lowest_risk': '25.0',
 'Marketer_attacker_success_rate': '33.33333333333333',
 'average_prosecutor_risk': '33.33333333333333',
 'records_affected_by_highest_journalist_risk': '33.33333333333333',
 'population_uniques': '0.0',
 'quasi_identifiers': '[zipcode, disease, age]'}

In [100]:
# Risk distribution after anonymization; head(10) shows the first ten intervals.
anon_rp.distribution_of_risk_dataframe().head(10)

Unnamed: 0,interval,recordsWithMaxmalRiskWithinInterval,recordsWithRiskWithinInteval
0,"]50,100]",1.0,0.0
1,"]33.4,50]",1.0,0.333333
2,"]25,33.4]",0.666667,0.0
3,"]20,25]",0.666667,0.666667
4,"]16.7,20]",0.0,0.0
5,"]14.3,16.7]",0.0,0.0
6,"]12.5,14.3]",0.0,0.0
7,"]10,12.5]",0.0,0.0
8,"]9,10]",0.0,0.0
9,"]8,9]",0.0,0.0
