## Example analysis and anonymization of a sensitive dataset

In [2]:
import logging

import pandas as pd
from pyarxaas import ARXaaS, Dataset
from pyarxaas import AttributeType
from pyarxaas.privacy_models import KAnonymity, LDiversityDistinct

#### Create ARXaaS connection

In [5]:
# Set the logging level to INFO so that pyARXaaS log messages are shown.
# `logging` is imported in the import cell at the top of the notebook, so that
# all dependencies are declared in one place.
logging.basicConfig(level=logging.INFO)

In [15]:
arxaas = ARXaaS('http://localhost:8080/') # connect to the ARXaaS service; this URL targets an instance on localhost, port 8080

INFO:pyarxaas.arxaas_connector:Connected to url=http://localhost:8080/ status=200


#### Fetch sensitive data

In [16]:
# Load the example data from a ';'-separated CSV file into a DataFrame.
data_df = pd.read_csv("../data/data2.csv", sep=";")

In [17]:
# Keep only the first six records so the example stays small, then display them.
data_df = data_df.head(6)
data_df

Unnamed: 0,zipcode,age,salary,disease
0,47677,29,3,gastric ulcer
1,47602,22,4,gastritis
2,47678,27,5,stomach cancer
3,47905,43,6,gastritis
4,47909,52,11,flu
5,47906,47,8,bronchitis


### Create Dataset

In [18]:
# Wrap the DataFrame in a pyARXaaS Dataset and print a summary of it.
# As the output shows, every field starts out as QUASIIDENTIFYING with no hierarchy.
dataset = Dataset.from_pandas(data_df)
dataset.describe()

data:
  headers:
    ['zipcode', 'age', 'salary', 'disease']
rows:
    [47677, 29, 3, 'gastric ulcer']
    [47602, 22, 4, 'gastritis']
    [47678, 27, 5, 'stomach cancer']
    [47905, 43, 6, 'gastritis']
    [47909, 52, 11, 'flu']
    ...
attributes:
  field_name=zipcode, type=QUASIIDENTIFYING, hierarchy=None
  field_name=age, type=QUASIIDENTIFYING, hierarchy=None
  field_name=salary, type=QUASIIDENTIFYING, hierarchy=None
  field_name=disease, type=QUASIIDENTIFYING, hierarchy=None



### Set the AttributeType for the dataset fields

In [19]:
# Mark 'salary' as IDENTIFYING; the other fields keep the QUASIIDENTIFYING type
# shown by dataset.describe(). In the anonymized result further down, the salary
# column is displayed as '*'.
dataset.set_attribute_type(AttributeType.IDENTIFYING, 'salary')

### Set Generalization Hierarchies
Note: if the hierarchy CSV file does not have a header row, pass `header=None` to `read_csv()`. Otherwise the first row will be interpreted as a header, and ARXaaS will throw an exception because of the missing hierarchy data.

In [20]:
# All three hierarchy files are ';'-separated and are read without a header row,
# so the parsing options are defined once and shared by every read.
hierarchy_read_options = {"sep": ";", "header": None}
zipcode_hierarchy = pd.read_csv("../data/data2_zipcode_hierarchy.csv", **hierarchy_read_options)
age_hierarchy = pd.read_csv("../data/data2_age_hierarchy.csv", **hierarchy_read_options)
disease_hierarchy = pd.read_csv("../data/data2_disease_hierarchy.csv", **hierarchy_read_options)

In [21]:
# Inspect the age hierarchy: column 0 holds the original value, column 1 its generalization.
age_hierarchy

Unnamed: 0,0,1
0,22,"[22, 53["
1,27,"[22, 53["
2,29,"[22, 53["
3,30,"[22, 53["
4,32,"[22, 53["
5,36,"[22, 53["
6,43,"[22, 53["
7,47,"[22, 53["
8,52,"[22, 53["


In [22]:
# Attach each generalization hierarchy to the dataset field it belongs to.
field_hierarchies = (
    ("age", age_hierarchy),
    ("zipcode", zipcode_hierarchy),
    ("disease", disease_hierarchy),
)
for field_name, field_hierarchy in field_hierarchies:
    dataset.set_hierarchy(field_name, field_hierarchy)

### Create Privacy Models

In [23]:
# k-anonymity with k=2 is the only privacy model used in this example.
# NOTE(review): LDiversityDistinct is imported but never used — confirm whether
# an l-diversity model was also intended here.
kanon = KAnonymity(2)

### Create Risk Profile

In [26]:
# Ask the ARXaaS service for a risk profile of the dataset before anonymization.
risk_profile = arxaas.risk_profile(dataset)

In [27]:
# Re-identification risk metrics for the original data; every metric is 1.0 in the output below.
risk_profile.re_identification_risk

{'estimated_journalist_risk': 1.0,
 'records_affected_by_highest_prosecutor_risk': 1.0,
 'sample_uniques': 1.0,
 'lowest_risk': 1.0,
 'estimated_prosecutor_risk': 1.0,
 'highest_journalist_risk': 1.0,
 'records_affected_by_lowest_risk': 1.0,
 'average_prosecutor_risk': 1.0,
 'estimated_marketer_risk': 1.0,
 'highest_prosecutor_risk': 1.0,
 'records_affected_by_highest_journalist_risk': 1.0,
 'population_uniques': 1.0}

In [28]:
# Distribution of risk across risk intervals, as a DataFrame (first five rows only).
risk_profile.distribution_of_risk_dataframe().head()

Unnamed: 0,interval,recordsWithMaxmalRiskWithinInterval,recordsWithRiskWithinInteval
0,"]50,100]",1.0,1.0
1,"]33.4,50]",0.0,0.0
2,"]25,33.4]",0.0,0.0
3,"]20,25]",0.0,0.0
4,"]16.7,20]",0.0,0.0


## Anonymize

In [35]:
# Anonymize the dataset via the ARXaaS service, passing the privacy models as a list
# (here only the k-anonymity model).
anon_result = arxaas.anonymize(dataset, [kanon])

In [36]:
# Display the anonymized dataset as a pandas DataFrame.
anon_result.dataset.to_dataframe()

Unnamed: 0,zipcode,age,salary,disease
0,47***,"[22, 53[",*,stomach disease
1,47***,"[22, 53[",*,stomach disease
2,47***,"[22, 53[",*,stomach disease
3,47***,"[22, 53[",*,stomach disease
4,47***,"[22, 53[",*,respiratory infection
5,47***,"[22, 53[",*,respiratory infection


#### Anonymization Status

In [37]:
# Status reported for the anonymization run ('ANONYMOUS' in the output below).
anon_result.anonymization_status

'ANONYMOUS'

#### RiskProfile for the anonymized dataset

In [38]:
# The anonymization result carries its own risk profile for the anonymized data.
anon_rp = anon_result.risk_profile

In [39]:
# Re-identification risk metrics after anonymization; compare with the all-1.0 values
# reported for the original dataset.
anon_rp.re_identification_risk

{'estimated_journalist_risk': 0.5,
 'records_affected_by_highest_prosecutor_risk': 0.3333333333333333,
 'sample_uniques': 0.0,
 'lowest_risk': 0.25,
 'estimated_prosecutor_risk': 0.5,
 'highest_journalist_risk': 0.5,
 'records_affected_by_lowest_risk': 0.6666666666666666,
 'average_prosecutor_risk': 0.3333333333333333,
 'estimated_marketer_risk': 0.3333333333333333,
 'highest_prosecutor_risk': 0.5,
 'records_affected_by_highest_journalist_risk': 0.3333333333333333,
 'population_uniques': 0.0}

In [40]:
# Distribution of risk for the anonymized data (first ten rows).
anon_rp.distribution_of_risk_dataframe().head(10)

Unnamed: 0,interval,recordsWithMaxmalRiskWithinInterval,recordsWithRiskWithinInteval
0,"]50,100]",1.0,0.0
1,"]33.4,50]",1.0,0.333333
2,"]25,33.4]",0.666667,0.0
3,"]20,25]",0.666667,0.666667
4,"]16.7,20]",0.0,0.0
5,"]14.3,16.7]",0.0,0.0
6,"]12.5,14.3]",0.0,0.0
7,"]10,12.5]",0.0,0.0
8,"]9,10]",0.0,0.0
9,"]8,9]",0.0,0.0
