# Anonymize Case Data

Demonstrates use of the Intelligence Toolkit library to anonymize a case dataset.

See [readme](https://github.com/microsoft/intelligence-toolkit/blob/main/app/workflows/anonymize_case_data/README.md) for more details.

In [12]:
import sys

import pandas as pd

# Make the repository root importable before loading the toolkit modules
sys.path.append("..")
from intelligence_toolkit.anonymize_case_data.api import (
    AnonymizeCaseData,
    SynthesizabilityStatistics,
)
from intelligence_toolkit.anonymize_case_data.visuals import color_schemes
from intelligence_toolkit.helpers import df_functions

In [13]:
# Create the workflow object
# (df_functions is imported with the other dependencies in the imports cell)
acd = AnonymizeCaseData()

# Load the sensitive data, keeping the raw frame under its own name so that
# `sensitive_data` always refers to the prepared version
data_path = "../example_outputs/anonymize_case_data/customer_complaints/customer_complaints_prepared.csv"
raw_data = pd.read_csv(data_path)

# Map missing values and binary False to empty strings, since we only care about the presence of attributes
sensitive_data = df_functions.supress_boolean_binary(raw_data)
print("Loaded data")

Loaded data


In [14]:
# Check the synthesizability of the data before anonymizing it
synthesizability_stats: SynthesizabilityStatistics
synthesizability_stats = acd.analyze_synthesizability(sensitive_data)
print(synthesizability_stats)

SynthesizabilityStatistics(num_cols=9, overall_att_count=101, possible_combinations=27648, possible_combinations_per_row=9.2, mean_vals_per_record=5.409, max_combinations_per_record=42.488485068238006, excess_combinations_ratio=0.21652925457861052)


In [15]:
# Run the anonymization over the prepared sensitive data with epsilon set to 12.0
acd.anonymize_case_data(df=sensitive_data, epsilon=12.0)
print("Anonymized data")

Anonymized data


In [16]:
# Preview the first five rows of the anonymous aggregate data
acd.aggregate_df.head(n=5)

Unnamed: 0,selections,protected_count
6261,record_count,3002
6200,age_range:(30-40],1280
3645,period:2023-H2,1079
1472,period:2023-H1,1063
981,quality_issue:True,958


In [17]:
# Inspect the error report for the aggregate data. Column legend:
# - Length: the length of the attribute value combination being counted
# - Error: the mean absolute error in the count of the attribute value combination,
#   calculated as the absolute difference between the actual count and the
#   anonymized/protected count divided by the actual count
# - Suppressed %: the percentage of attribute value combination counts that were
#   suppressed, out of the total count of attribute value combinations
# - Fabricated %: the percentage of attribute value combination counts that were
#   fabricated, out of the total count of attribute value combinations
print(acd.aggregate_error_report)

    Length  Count +/- Error Suppressed % Fabricated %
0        1  160.66 +/- 5.55       4.47 %       0.00 %
1        2   23.85 +/- 5.77       9.66 %       0.33 %
2        3    6.85 +/- 4.99      19.73 %       3.58 %
3        4    2.85 +/- 3.06      43.98 %       8.66 %
4  Overall    6.88 +/- 4.15      20.49 %       3.01 %


In [18]:
# Build the example bar chart of top attributes
bar_chart, bar_chart_df = acd.get_bar_chart_fig(
    # Data: add attribute values to `selection` to prefilter the dataset
    selection=[],
    show_attributes=[],
    num_values=10,
    unit="Customer",
    # Appearance
    scheme=color_schemes["Alphabet"],
    width=1000,
    height=600,
)
bar_chart.show()

In [19]:
# Build the example time series line chart, one series per issue attribute
line_chart, line_chart_df = acd.get_line_chart_fig(
    # Data: add attribute values to `selection` to prefilter the dataset
    selection=[],
    time_attribute="period",
    series_attributes=[
        "quality_issue",
        "price_issue",
        "service_issue",
        "delivery_issue",
        "description_issue",
    ],
    unit="Customer",
    # Appearance
    scheme=color_schemes["Alphabet"],
    width=1000,
    height=600,
)
line_chart.show()

In [20]:
# Build the example alluvial/flow chart of city-product relationships
flow_chart, flow_chart_df = acd.get_flow_chart_fig(
    # Data: add attribute values to `selection` to prefilter the dataset
    selection=[],
    source_attribute="city",
    target_attribute="product_code",
    highlight_attribute="price_issue:True",
    unit="Customer",
    # Appearance
    scheme=color_schemes["Alphabet"],
    width=1000,
    height=600,
)
flow_chart.show()