# Tutorial 7: Batch avatarization

In this tutorial, we will perform the avatarization on batches of data. This can be useful if you have too much data to be avatarized in one shot.

# TODO: add a schema of the process


## Connection

In [20]:
import warnings

# Silence all warnings so that they do not clutter the notebook output.
warnings.filterwarnings("ignore")

In [21]:
import os

# Connection settings are taken from the environment; each value is None
# when the corresponding variable is not set.
url = os.getenv("AVATAR_BASE_URL")
username = os.getenv("AVATAR_USERNAME")
password = os.getenv("AVATAR_PASSWORD")

In [22]:
# This is the client that you'll be using for all of your requests
from avatars.client import ApiClient
from avatars.models import (
    AvatarizationJobCreate,
    AvatarizationParameters,
    ImputationParameters,
)
from avatars.lib.split import get_split_for_batch

# The following are not necessary to run avatar but are used in this tutorial
import pandas as pd
from sklearn.model_selection import train_test_split

# The server endpoint comes from the AVATAR_BASE_URL environment variable read above.
# You can also pass your actual endpoint directly, e.g. base_url="https://avatar.company.com"
client = ApiClient(base_url=url)
client.authenticate(username=username, password=password)

# Verify that we can connect to the API server
client.health.get_health()

ConnectError: [Errno 61] Connection refused

In [None]:


# Load the fixture once, then stack four copies of it to get a larger dataset.
df_1 = pd.read_csv("../fixtures/adult_with_cities.csv")
df = pd.concat([df_1] * 4)


In [23]:
import math
import time
from typing import Any, Dict, List, Tuple

import numpy as np

from avatars.lib.split import get_split_for_batch
from avatars.models import (
    AvatarizationBatchJob,
    AvatarizationBatchJobCreate,
    AvatarizationBatchParameters,
    AvatarizationBatchResult,
    AvatarizationJob,
    PrivacyMetricsBatchJob,
    PrivacyMetricsBatchJobCreate,
    PrivacyMetricsBatchParameters,
    PrivacyMetricsJob,
    PrivacyMetricsJobCreate,
    PrivacyMetricsParameters,
    PrivacyMetricsPerBatchParameters,
    PrivacyMetricsReferenceParameters,
    SignalMetricsBatchJob,
    SignalMetricsBatchJobCreate,
    SignalMetricsBatchParameters,
    SignalMetricsReferenceParameters,
)

def get_avatar_using_batch(
    reference_df: pd.DataFrame, splits: List[pd.DataFrame], parameters: Dict[str, Any]
) -> AvatarizationBatchJob:
    """Upload the reference and split dataframes and run a batch avatarization.

    Args:
        reference_df: Dataframe uploaded as the reference dataset of the batch.
        splits: Other dataframes of the batch; each one is uploaded as its
            own dataset.
        parameters: Extra keyword arguments forwarded to
            ``AvatarizationBatchParameters`` (e.g. ``k``, ``imputation``).

    Returns:
        The job object returned by ``client.jobs.get_avatarization_batch_job``
        (not the bare result: callers read ``.id`` from it).

    Also prints the elapsed wall-clock time in seconds, prefixed with ``time``.
    """
    start = time.time()

    reference_dataset = client.pandas_integration.upload_dataframe(
        reference_df, timeout=10
    )
    split_dataset_ids = [
        client.pandas_integration.upload_dataframe(split, timeout=10).id
        for split in splits
    ]

    job_create = AvatarizationBatchJobCreate(
        parameters=AvatarizationBatchParameters(
            reference_dataset_id=reference_dataset.id,
            dataset_ids=split_dataset_ids,
            **parameters,
        )
    )
    batch_job = client.jobs.create_avatarization_batch_job(job_create)

    # Fetch the job with a generous timeout; presumably this waits until the
    # job is finished or the timeout elapses -- confirm against the client docs.
    batch_job = client.jobs.get_avatarization_batch_job(batch_job.id, timeout=10000)
    print("time", time.time() - start)

    return batch_job





def get_privacy_metrics_with_batch(
    batch_job: AvatarizationBatchJob, parameters: Dict[str, Any]
) -> PrivacyMetricsBatchJob:
    """Compute the privacy metrics of a batch avatarization job.

    Args:
        batch_job: The avatarization batch job; only its ``id`` is used.
        parameters: Keyword arguments forwarded to
            ``PrivacyMetricsReferenceParameters``.

    Returns:
        The job object returned by
        ``client.jobs.get_privacy_metrics_batch_job``.

    Also prints the id of the created job, then the elapsed wall-clock time
    in seconds.
    """
    start = time.time()

    job_create = PrivacyMetricsBatchJobCreate(
        parameters=PrivacyMetricsBatchParameters(
            avatarization_batch_job_id=batch_job.id,
            reference_parameters=PrivacyMetricsReferenceParameters(**parameters),
        ),
    )
    privacy_job_ref = client.jobs.create_privacy_metrics_batch_job(job_create)
    print(privacy_job_ref.id)

    # Batch metrics can take several minutes, hence the very large timeout.
    privacy_job = client.jobs.get_privacy_metrics_batch_job(
        privacy_job_ref.id, timeout=100000
    )
    print(time.time() - start)

    return privacy_job


def get_signal_metrics_with_batch(
    batch_job: AvatarizationBatchJob, parameters: Dict[str, Any]
) -> SignalMetricsBatchJob:
    """Compute the signal metrics of a batch avatarization job.

    Args:
        batch_job: The avatarization batch job; only its ``id`` is used.
        parameters: Keyword arguments forwarded to
            ``SignalMetricsReferenceParameters``.

    Returns:
        The job object returned by
        ``client.jobs.get_signal_metrics_batch_job``.

    Also prints the elapsed wall-clock time in seconds.
    """
    start = time.time()

    job_create = SignalMetricsBatchJobCreate(
        parameters=SignalMetricsBatchParameters(
            avatarization_batch_job_id=batch_job.id,
            reference_parameters=SignalMetricsReferenceParameters(**parameters),
        ),
    )
    signal_job_ref = client.jobs.create_signal_metrics_batch_job(job_create)

    # Use the same generous timeout as the privacy metrics helper: batch
    # metrics can run for minutes, which the default timeout may not allow.
    signal_job = client.jobs.get_signal_metrics_batch_job(
        signal_job_ref.id, timeout=100000
    )
    print(time.time() - start)

    return signal_job


In [24]:
# Display the dtype pandas inferred for each column of the dataset.
df.dtypes

age                float64
workclass           object
fnlwgt             float64
education           object
educational-num    float64
marital-status      object
occupation          object
relationship        object
race                object
gender              object
capital-gain       float64
capital-loss       float64
hours-per-week     float64
native-country      object
income              object
city                object
dtype: object

In [25]:
# Value handed to get_split_for_batch as its row_limit.
RowLimit = 50000

# `ref` is later uploaded as the reference dataset and `splits` as the
# other datasets of the batch.
ref, splits = get_split_for_batch(df, row_limit=RowLimit)

# Number of rows in the first split.
print(len(splits[0]))

# 44875 -> 35974

48842


In [7]:
# Run the batch avatarization on the reference dataset and the splits.
# NOTE: `splits` is rebound here. It held the list of dataframes to
# avatarize; from now on it holds the object returned by
# get_avatar_using_batch, which the following cells pass as `batch_job`.
splits = get_avatar_using_batch(
    reference_df=ref,
    splits=splits,
    parameters={"k": 20, "imputation": ImputationParameters(method="mean")},
)
splits

time 69.68974208831787


AvatarizationBatchJob(id=UUID('11578ef0-73da-464d-a605-e77b011d6a28'), status=<JobStatus.success: 'success'>, error_message=None, traceback=None, result=AvatarizationBatchResult(privacy_metrics=None, signal_metrics=None, reference_result=AvatarizationPerBatchResult(privacy_metrics=None, signal_metrics=None, avatars_dataset=Dataset(id=UUID('8a6619fe-9dcc-4f4f-83e6-2a7d055f8d3a'), hash='372f020864eac5f55962ca6b9d7a8175fd5e6d851e69d3cdbd961941a2f58060', name=None, columns=[ColumnDetail(type=<ColumnType.float: 'float'>, label='age'), ColumnDetail(type=<ColumnType.category: 'category'>, label='workclass'), ColumnDetail(type=<ColumnType.float: 'float'>, label='fnlwgt'), ColumnDetail(type=<ColumnType.category: 'category'>, label='education'), ColumnDetail(type=<ColumnType.float: 'float'>, label='educational-num'), ColumnDetail(type=<ColumnType.category: 'category'>, label='marital-status'), ColumnDetail(type=<ColumnType.category: 'category'>, label='occupation'), ColumnDetail(type=<ColumnType

In [8]:
# List jobs via the API; presumably those launched by the authenticated
# user during the last day (nb_days=1) -- confirm against the client docs.
client.jobs.find_all_jobs_by_user(nb_days=1)


[GenericJob(id=UUID('11578ef0-73da-464d-a605-e77b011d6a28'), status=<JobStatus.success: 'success'>, error_message=None, traceback=None, result=None, parameters=GenericParameters(), current_progress=None),
 GenericJob(id=UUID('2aa81d1a-e2b3-4221-b930-4b64723e1976'), status=<JobStatus.success: 'success'>, error_message=None, traceback=None, result=None, parameters=GenericParameters(), current_progress=None),
 GenericJob(id=UUID('473dea96-cec3-44ab-8d77-ba6af12c122a'), status=<JobStatus.pending: 'pending'>, error_message=None, traceback=None, result=None, parameters=GenericParameters(), current_progress=None),
 GenericJob(id=UUID('31303790-1d94-4067-aefa-c69704325943'), status=<JobStatus.pending: 'pending'>, error_message=None, traceback=None, result=None, parameters=GenericParameters(), current_progress=None),
 GenericJob(id=UUID('68def5e3-a910-4f6b-a29d-0b850927b589'), status=<JobStatus.pending: 'pending'>, error_message=None, traceback=None, result=None, parameters=GenericParameters(),

In [9]:
# Compute the privacy metrics of the batch. `splits` holds the job
# returned by get_avatar_using_batch in the cell above.
privacy_results = get_privacy_metrics_with_batch(
    batch_job=splits,
    parameters=dict(
        closest_rate_percentage_threshold=0.3,
        closest_rate_ratio_threshold=0.3,
        known_variables=["age", "workclass"],
        target="income",
        imputation=ImputationParameters(method="mean"),
        seed=42,
    ),
)

privacy_results

774aa7df-bdc7-435a-be7d-1bb651bbc385
565.377733707428


PrivacyMetricsBatchJob(id=UUID('774aa7df-bdc7-435a-be7d-1bb651bbc385'), status=<JobStatus.success: 'success'>, error_message=None, traceback=None, result=PrivacyMetricsBatchResult(worst_metrics=PrivacyMetrics(hidden_rate=98.04676303181688, local_cloaking=53.0, distance_to_closest=4.5530009269714355, closest_distances_ratio=0.8474568118897553, column_direct_match_protection=98.4778423616288, categorical_hidden_rate=99.65390981592014, row_direct_match_protection=99.99795258179435, correlation_protection_rate=100.0, inference_continuous=None, inference_categorical=39.69124933458909, closest_rate=99.96513893161078), reference_metrics=PrivacyMetricsPerBatchResult(hidden_rate=98.18598746980058, local_cloaking=54.0, distance_to_closest=4.588808059692383, closest_distances_ratio=0.8481453541926993, column_direct_match_protection=98.48025145880497, categorical_hidden_rate=99.66268942254443, row_direct_match_protection=100.0, correlation_protection_rate=100.0, inference_continuous=None, inferenc

In [10]:
# A job can be fetched again later from its id. Use the id of the privacy
# metrics job computed above rather than a hard-coded UUID, which only
# exists on the server where it was created.
privacy_job = client.jobs.get_privacy_metrics_batch_job(privacy_results.id, timeout=100000)


In [None]:
# Compute the signal metrics of the batch. `parameters` is a required
# argument of get_signal_metrics_with_batch; an empty dict builds
# SignalMetricsReferenceParameters with no explicit settings.
# NOTE(review): assumes that model has no required field -- confirm.
signal_results = get_signal_metrics_with_batch(
    batch_job=splits,
    parameters={},
)

signal_results