# Anonymize Case Data

Demonstrates use of the Intelligence Toolkit library to anonymize a case dataset.

See [readme](https://github.com/microsoft/intelligence-toolkit/blob/main/app/workflows/anonymize_case_data/README.md) for more details.

In [16]:
import sys

# Extend the module search path with the parent directory so the `toolkit`
# import below can resolve — presumably the package lives one level above this
# notebook; confirm against the repository layout.
sys.path.append("..")
from toolkit.anonymize_case_data import (
    AnonymizeCaseData,
    SynthesizabilityStatistics,
    color_schemes,
)
import pandas as pd

In [17]:
# Instantiate the anonymization workflow
acd = AnonymizeCaseData()
# Read the sensitive case records from the prepared example CSV into a DataFrame
data_path = "../example_outputs/customer_complaints_prepared.csv"
sensitive_data = pd.read_csv(filepath_or_buffer=data_path)
print("Loaded data")

Loaded data


In [18]:
# Analyze the synthesizability of the sensitive data and print the resulting statistics
synthesizability_stats: SynthesizabilityStatistics
synthesizability_stats = acd.analyze_synthesizability(sensitive_data)
print(synthesizability_stats)

SynthesizabilityStatistics(num_cols=11, overall_att_count=208, possible_combinations=176947200, possible_combinations_per_row=59041.4, mean_vals_per_record=10.916583249916583, max_combinations_per_record=1932.9428262222075, excess_combinations_ratio=30.54482481274007)


In [19]:
# Run the anonymization workflow over the sensitive records; the results are
# read back from attributes of `acd` in the cells that follow.
# NOTE(review): epsilon is presumably the privacy budget — confirm in the workflow README.
acd.anonymize_case_data(df=sensitive_data, epsilon=12.0)
print("Anonymized data")

Anonymized data


In [20]:
# Preview the first rows of the anonymous aggregate data
aggregate_preview = acd.aggregate_df.head()
print(aggregate_preview)

                    selections  protected_count
25355             record_count             3010
15237  description_issue:False             2277
3885      delivery_issue:False             2104
1731       service_issue:False             2067
21245        price_issue:False             2056


In [21]:
# Show the error report for the aggregate data. Report columns:
#   Length       - the length of the attribute value combination being counted
#   Error        - the mean absolute error in the count of the attribute value
#                  combination, calculated as the absolute difference between the
#                  actual count and the anonymized/protected count divided by the
#                  actual count
#   Suppressed % - the percentage of attribute value combination counts that were
#                  suppressed, out of the total count of attribute value combinations
#   Fabricated % - the percentage of attribute value combination counts that were
#                  fabricated, out of the total count of attribute value combinations
aggregate_report = acd.aggregate_error_report
print(aggregate_report)

    Length  Count +/- Error Suppressed % Fabricated %
0        1  147.06 +/- 8.50       9.99 %       0.00 %
1        2   20.97 +/- 7.61      21.00 %       0.50 %
2        3    6.66 +/- 6.60      34.40 %       2.30 %
3        4    3.23 +/- 5.26      51.55 %       1.84 %
4  Overall    4.46 +/- 5.79      41.84 %       1.73 %


In [22]:
# Show the error report for the synthetic data. Report columns:
#   Length       - the length of the attribute value combination being counted
#   Error        - the mean absolute error in the count of the attribute value
#                  combination, calculated as the absolute difference between the
#                  actual count and the anonymized/protected count divided by the
#                  actual count
#   Suppressed % - the percentage of attribute value combination counts that were
#                  suppressed, out of the total count of attribute value combinations
#   Fabricated % - the percentage of attribute value combination counts that were
#                  fabricated, out of the total count of attribute value combinations
# NOTE(review): this legend mirrors the aggregate report; for this report the
# compared count is presumably the synthetic count — confirm.
synthetic_report = acd.synthetic_error_report
print(synthetic_report)

    Length  Count +/- Error Suppressed % Fabricated %
0        1  147.06 +/- 8.50       9.99 %       0.00 %
1        2  20.97 +/- 29.91      21.03 %       0.24 %
2        3   6.66 +/- 21.26      35.79 %       0.56 %
3        4   3.23 +/- 16.47      54.05 %       0.42 %
4  Overall   4.46 +/- 18.84      43.66 %       0.40 %


In [23]:
# Build an example bar chart of top attributes, then render it
bar_scheme = color_schemes["Alphabet"]
bar_chart, bar_chart_df = acd.get_bar_chart_fig(
    selection=[],  # Add attribute values here to prefilter the dataset
    show_attributes=[],
    unit="Customer",
    width=1000,
    height=600,
    scheme=bar_scheme,
    num_values=10,
)
bar_chart.show()

In [24]:
# Build an example time series line chart, then render it
# Attributes to draw as separate series over the "period" time attribute
issue_series = [
    "quality_issue",
    "price_issue",
    "service_issue",
    "delivery_issue",
    "description_issue",
]
line_chart, line_chart_df = acd.get_line_chart_fig(
    selection=[],  # Add attribute values here to prefilter the dataset
    series_attributes=issue_series,
    time_attribute="period",
    unit="Customer",
    width=1000,
    height=600,
    scheme=color_schemes["Alphabet"],
)
line_chart.show()

In [25]:
# Build an example alluvial/flow chart relating city (source) to product_code
# (target), highlighting "price_issue:True", then render it
flow_scheme = color_schemes["Alphabet"]
flow_chart, flow_chart_df = acd.get_flow_chart_fig(
    selection=[],  # Add attribute values here to prefilter the dataset
    source_attribute="city",
    target_attribute="product_code",
    highlight_attribute="price_issue:True",
    unit="Customer",
    width=1000,
    height=600,
    scheme=flow_scheme,
)
flow_chart.show()