# Tutorial 7: Batch avatarization

In this tutorial, we will perform the avatarization on batches of data. This can be useful if you have too much data to be avatarized in one shot.

**TODO:** add a diagram of the process


## Connection

In [1]:
import warnings

warnings.filterwarnings("ignore")

from avatars.client import ApiClient


In [2]:
import os

# Connection settings come from the environment; each value is None when the
# corresponding variable is not set.
url, username, password = (
    os.environ.get(variable)
    for variable in ("AVATAR_BASE_URL", "AVATAR_USERNAME", "AVATAR_PASSWORD")
)

In [3]:
# This is the client that you'll be using for all of your requests
from avatars.models import (
    AvatarizationJobCreate,
    AvatarizationParameters,
    ImputationParameters,
)
from avatars.lib.split import get_split_for_batch
from typing import Any, Dict, List, Tuple
import math

import time

import numpy as np

from avatars.client import ApiClient

from avatars.models import (
    AvatarizationBatchJobCreate,
    AvatarizationBatchParameters,
    AvatarizationBatchResult,
    AvatarizationJob,
    PrivacyMetricsBaseParameters,
    PrivacyMetricsBatchJobCreate,
    PrivacyMetricsBatchParameters,
    PrivacyMetricsJob,
    PrivacyMetricsJobCreate,
    PrivacyMetricsParameters,
    PrivacyBatchDatasetMapping,
    SignalBatchDatasetMapping,
    SignalMetricsBaseParameters,
    SignalMetricsBatchJobCreate,
    SignalMetricsBatchParameters,
)

from avatars.lib.split import get_split_for_batch

# The following are not necessary to 
# run avatar but are used in this tutorial
import pandas as pd
# from sklearn.model_selection import train_test_split

# Create the API client against the endpoint held in `url`, which is read from
# the AVATAR_BASE_URL environment variable. Point that variable at your actual
# server endpoint, e.g. "https://avatar.company.com".
client = ApiClient(base_url=url)
# Authenticate with the credentials read from AVATAR_USERNAME / AVATAR_PASSWORD.
client.authenticate(username=username, password=password)

# Verify that we can connect to the API server
# (last expression of the cell, so its result is displayed).
client.health.get_health()

{'message': 'ok'}

## Load the data
We will use the dataset `adult`.

In [4]:

# Read the adult dataset (variant with missing values) from the fixtures folder
# and report how many rows it contains.
df = pd.read_csv("../fixtures/adult_with_missing.csv")
print(df.shape[0])


48842


In [5]:
# Create some batches from the dataframe.

# Row limit handed to `get_split_for_batch` (presumably the maximum number of
# rows per batch — confirm against the helper's documentation).
RowLimit = 5000

# A fixed seed keeps the split identical across "Restart Kernel -> Run All",
# so the outputs stored in this notebook stay reproducible.
training, splits = get_split_for_batch(
    df,
    row_limit=RowLimit,
    seed=42,
)
# Shape of the training batch, then the number of remaining splits.
print(training.shape)
print(len(splits))

(4885, 15)
9


## Launch batch avatarization


In [6]:
from avatars.models import ImputeMethod


# Upload the training batch, then every remaining split; only the ids of the
# uploaded splits are kept because that is what the job parameters reference.
dataset_ref = client.pandas_integration.upload_dataframe(training, timeout=10)
dataset_splited_ids = [
    client.pandas_integration.upload_dataframe(split, timeout=10).id
    for split in splits
]

# Describe the batch avatarization: the training dataset, the other batches,
# k=20 and imputation with ImputeMethod.mean.
batch_parameters = AvatarizationBatchParameters(
    training_dataset_id=dataset_ref.id,
    dataset_ids=dataset_splited_ids,
    k=20,
    imputation=ImputationParameters(method=ImputeMethod.mean),
)

# Submit the job, then fetch it back once with a short timeout.
batch_job = client.jobs.create_avatarization_batch_job(
    AvatarizationBatchJobCreate(parameters=batch_parameters)
)
batch_job = client.jobs.get_avatarization_batch_job(batch_job.id, timeout=10)


In [7]:
# Fetch the batch job again, this time with a much larger timeout (10000;
# presumably seconds — TODO confirm) than the initial fetch.
batch_job = client.jobs.get_avatarization_batch_job(batch_job.id, timeout=10000)
# Last expression of the cell: the notebook displays the job, including its status.
batch_job

AvatarizationBatchJob(id=UUID('b1edcadc-d9c7-4b30-b1b7-367a7c40582c'), status=<JobStatus.success: 'success'>, error_message=None, traceback=None, result=AvatarizationBatchResult(privacy_metrics=None, signal_metrics=None, training_result=AvatarizationPerBatchResult(privacy_metrics=None, signal_metrics=None, avatars_dataset=Dataset(id=UUID('da631a2a-1198-44a6-be30-e75d8c3264f3'), hash='b87090d5ca82da243cc42497bf15eff22f300cccdf0472cc639e31170f158c7e', name=None, columns=[ColumnDetail(type=<ColumnType.float: 'float'>, label='age'), ColumnDetail(type=<ColumnType.category: 'category'>, label='workclass'), ColumnDetail(type=<ColumnType.float: 'float'>, label='fnlwgt'), ColumnDetail(type=<ColumnType.category: 'category'>, label='education'), ColumnDetail(type=<ColumnType.float: 'float'>, label='educational-num'), ColumnDetail(type=<ColumnType.category: 'category'>, label='marital-status'), ColumnDetail(type=<ColumnType.category: 'category'>, label='occupation'), ColumnDetail(type=<ColumnType.

## Launch privacy metrics per batch

In [8]:
# Privacy metrics are computed for the avatarization batch job held in
# `batch_job`, using ImputeMethod.mean as the common imputation setting.
privacy_parameters = PrivacyMetricsBatchParameters(
    avatarization_batch_job_id=batch_job.id,
    common_parameters=PrivacyMetricsBaseParameters(
        imputation=ImputationParameters(method=ImputeMethod.mean)
    ),
)
privacy_job_ref = client.jobs.create_privacy_metrics_batch_job(
    PrivacyMetricsBatchJobCreate(parameters=privacy_parameters)
)
# Fetch the job with a very large timeout before reading its result.
privacy_job = client.jobs.get_privacy_metrics_batch_job(
    privacy_job_ref.id, timeout=100000
)

print("Mean metrics")
# NOTE(review): the mean-metrics print below is disabled, so only the header
# above is shown — confirm whether it should be re-enabled.
# print(privacy_job.result.mean_metrics)

print("Worst metrics")
print(privacy_job.result.worst_metrics)

Mean metrics
Worst metrics
hidden_rate=98.78186098884225 local_cloaking=53.0 distance_to_closest=3.6061315536499023 closest_distances_ratio=0.8707497535858303 column_direct_match_protection=98.12479411680786 categorical_hidden_rate=99.53092934623278 row_direct_match_protection=100.0 correlation_protection_rate=None inference_continuous=None inference_categorical=None closest_rate=None


## Launch signal metrics per batch 


In [9]:
# Signal metrics are computed for the avatarization batch job held in
# `batch_job`, with default common parameters.
signal_parameters = SignalMetricsBatchParameters(
    avatarization_batch_job_id=batch_job.id,
    common_parameters=SignalMetricsBaseParameters(),
)
signal_job_ref = client.jobs.create_signal_metrics_batch_job(
    SignalMetricsBatchJobCreate(parameters=signal_parameters)
)
# No explicit timeout here: the client's default is used.
signal_job = client.jobs.get_signal_metrics_batch_job(signal_job_ref.id)

print("Mean metrics")
print(signal_job.result.mean_metrics)



Mean metrics
hellinger_mean=0.10215207881001524 hellinger_std=0.11593553089369964 correlation_difference_ratio=1.8754241800610578


## Build the anonymized dataset



In [8]:
# Build the avatar dataset from the batch job result (presumably one dataframe
# covering the batches — confirm against the helper).
# NOTE(review): `get_avatar_dataset_from_batch_result` is not imported or defined
# in any cell visible in this notebook, so this line raises NameError on a fresh
# kernel unless the name is provided elsewhere — confirm and add the import.
avatars = get_avatar_dataset_from_batch_result(batch_job, client=client)

In [9]:
# Display the avatar dataset (bare last expression, rendered by the notebook).
avatars

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,26.0,Private,202801.0,Some-college,10.0,Never-married,,Not-in-family,White,Male,0.0,0.0,37.0,United-States,<=50K
1,42.0,,206354.0,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,0.0,41.0,United-States,<=50K
2,22.0,Local-gov,209281.0,Assoc-voc,11.0,Never-married,Adm-clerical,,White,Female,0.0,0.0,36.0,United-States,<=50K
3,35.0,,226918.0,,,Married-civ-spouse,Craft-repair,,Black,Male,0.0,0.0,38.0,United-States,<=50K
4,49.0,Self-emp-not-inc,76017.0,HS-grad,9.0,Never-married,Farming-fishing,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4879,21.0,?,394152.0,,,Never-married,,Own-child,,Female,1.0,0.0,40.0,United-States,<=50K
4880,36.0,Private,115347.0,,11.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,2.0,0.0,46.0,United-States,<=50K
4881,53.0,Private,124768.0,12th,8.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K
4882,25.0,,97607.0,Some-college,,Never-married,Sales,Own-child,White,Male,0.0,0.0,42.0,United-States,<=50K


In [4]:
# Small 12-row toy frame: the same three (a, b) pairs repeated four times.
# Note that this rebinds `df`, which previously held the adult dataset.
df = pd.DataFrame({"a": [1, 3, 5] * 4, "b": ["a", "b", "a"] * 4})

In [6]:
from avatars.api import upload_batch_and_get_order

# Split the 12-row toy frame with row_limit=6 and a fixed seed (42), so the
# split is the same on every run.
train, splits = get_split_for_batch(df, row_limit=6, seed=42)

# Upload the training batch and the splits through the client. Besides the
# uploaded references, the helper returns `order`, which is inspected in the
# next cell (presumably the original row indexes per uploaded dataset — confirm).
ref, dataset_splits, order = upload_batch_and_get_order(train, splits, client=client)

In [7]:
# Display the `order` value returned by `upload_batch_and_get_order`.
order

{UUID('630bddae-3b24-4ce4-b274-d5bcee97f027'): Int64Index([0, 1, 10, 3, 7, 2], dtype='int64'),
 UUID('3e83251e-2236-4a86-ba27-a1fef14e9642'): Int64Index([9, 4, 11, 6, 5, 8], dtype='int64')}