In [4]:
from pyaaas.aaas import AaaS
from pyaaas.models.privacy_models import KAnonymity
from pyaaas.attribute_type import AttributeType
from pyaaas.dataset import Dataset
import pandas as pd

In [5]:
aaas = AaaS("http://localhost:8080/") # connecting to online service

In [6]:
data_df = pd.read_csv("../data/dummy-dataset-260219.csv", sep=";")

In [7]:
data_df

Unnamed: 0,Dummy_tag,ID,Navn,Alder,Sivilstatus,Barn,Innsatsgruppe,Innvandrerbakgrunn,Ledighetsstatus,Utdanning,Ytelse,Medisinsk forhold
0,dummy_data,PY827,Eirik,47,Ukjent,1,Situasjonsbestemt innsats,Togo,Delvis ledig,Høyere utd inntil 4 år,Tiltakspenger,Ingen
1,dummy_data,CX244,Ella,30,Gift,2,Varig tilpasset,Surinam,Ordinær tiltaksdeltaker,Grunnskole,Dagpenger,Ingen
2,dummy_data,ZE697,Solveig,37,Ukjent,1,Standard innsats,Malta,Delvis ledig,Grunnskole,Dagpenger,Ingen
3,dummy_data,AR215,Eirik,52,Ugift,1,Standard innsats,Norge,Andre arbeidssøkende,Høyere utd inntil 4 år,Uføretrygd,Ingen
4,dummy_data,CZ546,Heidi,37,Ukjent,3,Spesielt tilpasset,Surinam,Andre arbeidssøkende,Høyere utd over 4 år,Dagpenger,Ingen
5,dummy_data,XF859,Nora,30,Gift,2,Spesielt tilpasset,Malta,Andre arbeidssøkende,VGS,Uføretrygd,Ingen
6,dummy_data,LN430,Sara,42,Enke,2,Situasjonsbestemt innsats,Norge,Andre arbeidssøkende,Høyere utd inntil 4 år,Tiltakspenger,Ingen
7,dummy_data,UX876,Johan,39,Enke,1,Varig tilpasset,Norge,Helt ledig,VGS,Tiltakspenger,Ingen
8,dummy_data,TO107,Tobias,37,Enke,2,Standard innsats,Norge,Delvis ledig,Grunnskole,Dagpenger,Ingen
9,dummy_data,OY439,Ella,42,Gift,2,Standard innsats,Malta,Helt ledig,Ukjent,Tiltakspenger,Ingen


## Create Dataset

In [8]:
dataset = Dataset.from_pandas(data_df)

In [9]:
headers = data_df.columns.values.tolist()
headers

['Dummy_tag',
 'ID',
 'Navn',
 'Alder',
 'Sivilstatus',
 'Barn',
 'Innsatsgruppe',
 'Innvandrerbakgrunn',
 'Ledighetsstatus',
 'Utdanning',
 'Ytelse',
 'Medisinsk forhold']

### Set the AttributeType for the dataset fields

In [13]:
dataset.set_attributes(['ID',
                        'Navn',
                        'Medisinsk forhold'],
                       AttributeType.IDENTIFYING)

In [14]:
dataset.set_attributes(['Innsatsgruppe',
                        'Innvandrerbakgrunn',
                        'Ledighetsstatus',
                           'Ytelse'],
                        AttributeType.QUASIIDENTIFYING)

dataset.set_attributes(['Dummy_tag',
                        'Alder',
                        'Sivilstatus',
                        'Barn',
                        'Utdanning'],
                       AttributeType.INSENSITIVE)

### Set Generalization Hierarchies
Note that if the hierarchy does not have a header row in the csv file, please set header=None in read_csv() or the first row will be interpreted as a header and ARXaaS will throw an exception for the missing hierarchy data.

In [15]:
ytelse_hierarchy = pd.read_csv("../hierarchies/Ytelse_hierarchy.csv", sep=";", header=None)
innsatsgruppe_hierarchy = pd.read_csv("../hierarchies/innsatsgruppe_hierarchy.csv", sep=";", header=None)
innvandrerbakgrunn_hierarchy = pd.read_csv("../hierarchies/innvandrerbakgrunn_hierarchy.csv", sep=";", header=None)
ledighetsstatus_hierarchy = pd.read_csv("../hierarchies/ledighetsstatus_hierarchy.csv", sep=";", header=None)

In [16]:
dataset.set_hierarchy('Ytelse', ytelse_hierarchy)
dataset.set_hierarchy("Innsatsgruppe", innsatsgruppe_hierarchy)
dataset.set_hierarchy("Innvandrerbakgrunn", innvandrerbakgrunn_hierarchy)
dataset.set_hierarchy("Ledighetsstatus", ledighetsstatus_hierarchy)

In [17]:
ledighetsstatus_hierarchy

Unnamed: 0,0,1,2
0,Andre arbeidssøkende,Delvis ledig,*
1,Delvis ledig,Delvis ledig,*
2,Helt ledig,ledig,*
3,Ordinær tiltaksdeltaker,ledig,*


### Create Privacy Models

In [18]:
kanon = KAnonymity(4)

### Create Risk Profile

In [26]:
risk_profile = aaas.risk_profile(dataset)

In [27]:
risk_profile.re_identification_risk_dataframe()

Unnamed: 0,Journalist_attacker_success_rate,Marketer_attacker_success_rate,Prosecutor_attacker_success_rate,average_prosecutor_risk,estimated_journalist_risk,estimated_marketer_risk,estimated_prosecutor_risk,highest_journalist_risk,highest_prosecutor_risk,lowest_risk,population_model,population_uniques,quasi_identifiers,records_affected_by_highest_journalist_risk,records_affected_by_highest_prosecutor_risk,records_affected_by_lowest_risk,sample_uniques
0,12.06,12.06,12.06,12.06,100.0,12.06,100.0,100.0,100.0,0.4878048780487805,PITMAN,0.042243729241281,"[Innvandrerbakgrunn, Ytelse, Innsatsgruppe, Le...",2.8000000000000003,2.8000000000000003,15.62,2.8000000000000003


In [28]:
risk_profile.distribution_of_risk_dataframe()

Unnamed: 0,interval,recordsWithMaxmalRiskWithinInterval,recordsWithRiskWithinInteval
0,"]50,100]",1.0,0.028
1,"]33.4,50]",0.972,0.0448
2,"]25,33.4]",0.9272,0.0456
3,"]20,25]",0.8816,0.032
4,"]16.7,20]",0.8496,0.036
5,"]14.3,16.7]",0.8136,0.0396
6,"]12.5,14.3]",0.774,0.0294
7,"]10,12.5]",0.7446,0.0574
8,"]9,10]",0.6872,0.036
9,"]8,9]",0.6512,0.0216


### Raw profile data

In [29]:
risk_profile.re_identification_risk

{'Prosecutor_attacker_success_rate': '12.06',
 'records_affected_by_highest_prosecutor_risk': '2.8000000000000003',
 'sample_uniques': '2.8000000000000003',
 'estimated_prosecutor_risk': '100.0',
 'population_model': 'PITMAN',
 'highest_journalist_risk': '100.0',
 'records_affected_by_lowest_risk': '15.620000000000001',
 'estimated_marketer_risk': '12.06',
 'Journalist_attacker_success_rate': '12.06',
 'highest_prosecutor_risk': '100.0',
 'estimated_journalist_risk': '100.0',
 'lowest_risk': '0.4878048780487805',
 'Marketer_attacker_success_rate': '12.06',
 'average_prosecutor_risk': '12.06',
 'records_affected_by_highest_journalist_risk': '2.8000000000000003',
 'population_uniques': '0.042243729241281044',
 'quasi_identifiers': '[Innvandrerbakgrunn, Ytelse, Innsatsgruppe, Ledighetsstatus]'}

## Anonymize

In [30]:
anon_ds = aaas.anonymize(dataset, [kanon])

In [31]:
anon_ds.to_dataframe()

Unnamed: 0,Dummy_tag,ID,Navn,Alder,Sivilstatus,Barn,Innsatsgruppe,Innvandrerbakgrunn,Ledighetsstatus,Utdanning,Ytelse,Medisinsk forhold
0,dummy_data,*,*,47,Ukjent,1,Spesielt tilpasset,Togo,Delvis ledig,Høyere utd inntil 4 år,Tiltakspenger,*
1,dummy_data,*,*,30,Gift,2,Varig tilpasset,Surinam,ledig,Grunnskole,Dagpenger,*
2,dummy_data,*,*,37,Ukjent,1,Varig tilpasset,Malta,Delvis ledig,Grunnskole,Dagpenger,*
3,dummy_data,*,*,52,Ugift,1,Varig tilpasset,Norge,Delvis ledig,Høyere utd inntil 4 år,Uføretrygd,*
4,dummy_data,*,*,37,Ukjent,3,Spesielt tilpasset,Surinam,Delvis ledig,Høyere utd over 4 år,Dagpenger,*
5,dummy_data,*,*,30,Gift,2,Spesielt tilpasset,Malta,Delvis ledig,VGS,Uføretrygd,*
6,dummy_data,*,*,42,Enke,2,Spesielt tilpasset,Norge,Delvis ledig,Høyere utd inntil 4 år,Tiltakspenger,*
7,dummy_data,*,*,39,Enke,1,Varig tilpasset,Norge,ledig,VGS,Tiltakspenger,*
8,dummy_data,*,*,37,Enke,2,Varig tilpasset,Norge,Delvis ledig,Grunnskole,Dagpenger,*
9,dummy_data,*,*,42,Gift,2,Varig tilpasset,Malta,ledig,Ukjent,Tiltakspenger,*


## Create RiskProfile for the anonymized dataset

In [32]:
anon_rp = aaas.risk_profile(anon_ds)

In [35]:
anon_rp.re_identification_risk_dataframe()

Unnamed: 0,Journalist_attacker_success_rate,Marketer_attacker_success_rate,Prosecutor_attacker_success_rate,average_prosecutor_risk,estimated_journalist_risk,estimated_marketer_risk,estimated_prosecutor_risk,highest_journalist_risk,highest_prosecutor_risk,lowest_risk,population_model,population_uniques,quasi_identifiers,records_affected_by_highest_journalist_risk,records_affected_by_highest_prosecutor_risk,records_affected_by_lowest_risk,sample_uniques
0,96.8,96.8,96.8,96.8,100.0,96.8,100.0,100.0,100.0,25.0,PITMAN,30.589150757853677,"[Alder, Innvandrerbakgrunn, Medisinsk forhold,...",93.82,93.82,0.08,93.82


In [34]:
anon_rp.distribution_of_risk_dataframe()


Unnamed: 0,interval,recordsWithMaxmalRiskWithinInterval,recordsWithRiskWithinInteval
0,"]50,100]",1.0,0.9382
1,"]33.4,50]",0.0618,0.0556
2,"]25,33.4]",0.0062,0.0054
3,"]20,25]",0.0008,0.0008
4,"]16.7,20]",0.0,0.0
5,"]14.3,16.7]",0.0,0.0
6,"]12.5,14.3]",0.0,0.0
7,"]10,12.5]",0.0,0.0
8,"]9,10]",0.0,0.0
9,"]8,9]",0.0,0.0
