# Hierachy generation using PyARXaaS

In [1]:
from pyaaas import ARXaaS
from pyaaas.privacy_models import KAnonymity, LDiversityDistinct
from pyaaas import AttributeType
from pyaaas import Dataset
from pyaaas.models.hierarchy import IntervalHierarchyBuilder, RedactionHierarchyBuilder, OrderHierarchyBuilder
import pandas as pd

#### Create connection to ARXaaS

In [2]:
arxaas = ARXaaS("http://localhost:8080")

#### Fetch data 

In [3]:
data_df = pd.read_csv("../data/data2.csv", sep=";")

In [4]:
data_df

Unnamed: 0,zipcode,age,salary,disease
0,47677,29,3,gastric ulcer
1,47602,22,4,gastritis
2,47678,27,5,stomach cancer
3,47905,43,6,gastritis
4,47909,52,11,flu
5,47906,47,8,bronchitis
6,47605,30,7,bronchitis
7,47673,36,9,pneumonia
8,47607,32,10,stomach cancer


### Create Redaction based hierarchy

#### 1. Extract column to create hierarchy from

In [5]:
column = data_df["zipcode"].tolist()
column

[47677, 47602, 47678, 47905, 47909, 47906, 47605, 47673, 47607]

#### 2. Create hierarchy builder to use

In [6]:
redaction_based = RedactionHierarchyBuilder()

#### 3. Call the ARXaaS service to create the hierarchy

In [7]:
redaction_hierarchy = arxaas.hierarchy(redaction_based, column)

In [8]:
redaction_hierarchy

[['47677', '4767*', '476**', '47***', '4****', '*****'],
 ['47602', '4760*', '476**', '47***', '4****', '*****'],
 ['47678', '4767*', '476**', '47***', '4****', '*****'],
 ['47905', '4790*', '479**', '47***', '4****', '*****'],
 ['47909', '4790*', '479**', '47***', '4****', '*****'],
 ['47906', '4790*', '479**', '47***', '4****', '*****'],
 ['47605', '4760*', '476**', '47***', '4****', '*****'],
 ['47673', '4767*', '476**', '47***', '4****', '*****'],
 ['47607', '4760*', '476**', '47***', '4****', '*****']]

### Create interval based hierarchy

#### 1. Extract column to create hierarchy from

In [9]:
column = data_df["age"].tolist()
column

[29, 22, 27, 43, 52, 47, 30, 36, 32]

#### 2. Create hierarchy builder to use

In [10]:
interval_based = IntervalHierarchyBuilder()
interval_based.add_interval(0,18, "child")
interval_based.add_interval(18,30, "young-adult")
interval_based.add_interval(30,60, "adult")
interval_based.add_interval(60,120, "old")
interval_based.level(0).add_group(2, "young")
interval_based.level(0).add_group(2, "adult")

#### 3. Call the ARXaaS service to create the hierarchy

In [11]:
interval_hierarchy = arxaas.hierarchy(interval_based, column)

In [12]:
interval_hierarchy

[['29', 'young-adult', 'young', '*'],
 ['22', 'young-adult', 'young', '*'],
 ['27', 'young-adult', 'young', '*'],
 ['43', 'adult', 'adult', '*'],
 ['52', 'adult', 'adult', '*'],
 ['47', 'adult', 'adult', '*'],
 ['30', 'adult', 'adult', '*'],
 ['36', 'adult', 'adult', '*'],
 ['32', 'adult', 'adult', '*']]

### Create Order based hierarchy

#### 1. Extract column to create hierarchy from


In [15]:
column = data_df["disease"].tolist()
column

['gastric ulcer',
 'gastritis',
 'stomach cancer',
 'gastritis',
 'flu',
 'bronchitis',
 'bronchitis',
 'pneumonia',
 'stomach cancer']

#### 2. Strip to uniques and order column values

In [16]:
column = set(column)
column = list(column)
column[1], column[3] = column[3], column[1]
column

['flu',
 'gastritis',
 'bronchitis',
 'pneumonia',
 'gastric ulcer',
 'stomach cancer']

#### 2. Create hierarchy builder to use

In [17]:
order_based = OrderHierarchyBuilder()
order_based.level(0).add_group(3, "stomach-related")
order_based.level(0).add_group(3, "lung-related")

#### 3. Call the ARXaaS service to create the hierarchy*

In [18]:
order_hierarchy = arxaas.hierarchy(order_based, column)

In [19]:
order_hierarchy

[['flu', 'stomach-related', '*'],
 ['gastritis', 'stomach-related', '*'],
 ['bronchitis', 'stomach-related', '*'],
 ['pneumonia', 'lung-related', '*'],
 ['gastric ulcer', 'lung-related', '*'],
 ['stomach cancer', 'lung-related', '*']]

### Example anonymization

In [20]:
dataset = Dataset.from_pandas(data_df)

In [21]:
dataset.set_attribute_type(AttributeType.IDENTIFYING, "salary")

In [22]:
dataset.describe()

data:
  headers:
    ['zipcode', 'age', 'salary', 'disease']
rows:
    [47677, 29, 3, 'gastric ulcer']
    [47602, 22, 4, 'gastritis']
    [47678, 27, 5, 'stomach cancer']
    [47905, 43, 6, 'gastritis']
    [47909, 52, 11, 'flu']
    ...
attributes:
  field_name=zipcode, type=QUASIIDENTIFYING, hierarchy=None
  field_name=age, type=QUASIIDENTIFYING, hierarchy=None
  field_name=salary, type=IDENTIFYING, hierarchy=None
  field_name=disease, type=QUASIIDENTIFYING, hierarchy=None



In [23]:
dataset.set_hierarchy("age", interval_hierarchy)

In [24]:
dataset.set_hierarchy("zipcode", redaction_hierarchy)

In [25]:
dataset.set_hierarchy("disease", order_hierarchy)

In [26]:
anon_result = arxaas.anonymize(dataset=dataset, privacy_models=[KAnonymity(2)])

In [27]:
anon_result.dataset.to_dataframe()

Unnamed: 0,zipcode,age,salary,disease
0,476**,*,*,lung-related
1,476**,*,*,stomach-related
2,476**,*,*,lung-related
3,479**,*,*,stomach-related
4,479**,*,*,stomach-related
5,479**,*,*,stomach-related
6,476**,*,*,stomach-related
7,476**,*,*,lung-related
8,476**,*,*,lung-related
