# Anonymize Case Data

Demonstrates use of the Intelligence Toolkit library to anonymize a case dataset.

See [readme](https://github.com/microsoft/intelligence-toolkit/blob/main/app/workflows/anonymize_case_data/README.md) for more details.

In [1]:
import sys
sys.path.append("..")
from toolkit.anonymize_case_data import AnonymizeCaseData, SynthesizabilityStatistics, color_schemes
import pandas as pd

In [2]:
# Instantiate the anonymization workflow
acd = AnonymizeCaseData()

# Read the sensitive case records and normalise every cell to a string.
# Only the presence of an attribute matters, so missing values (rendered as "nan"
# by the string cast) and binary False are both mapped to the empty string.
data_path = "../example_outputs/anonymize_case_data/customer_complaints/customer_complaints_prepared.csv"
sensitive_data = (
    pd.read_csv(data_path)
    .astype(str)
    .replace({"False": "", "nan": ""})
)
print("Loaded data")

Loaded data


In [3]:
# Check the synthesizability of the data and print the resulting statistics object
synthesizability_stats: SynthesizabilityStatistics = acd.analyze_synthesizability(sensitive_data)
print(synthesizability_stats)

SynthesizabilityStatistics(num_cols=9, overall_att_count=101, possible_combinations=27648, possible_combinations_per_row=9.2, mean_vals_per_record=5.409, max_combinations_per_record=42.488485068238006, excess_combinations_ratio=0.21652925457861052)


In [4]:
# Anonymize the data; the results are read back from attributes of `acd` afterwards.
# NOTE(review): epsilon is presumably the privacy budget for the run — confirm against the readme
acd.anonymize_case_data(df=sensitive_data, epsilon=12.0)
print("Anonymized data")

Anonymized data


In [5]:
# Inspect the anonymous aggregate data: print the first rows of the aggregate table,
# which pairs each `selections` entry with its `protected_count`
print(acd.aggregate_df.head())

              selections  protected_count
7161        record_count             3115
7072   age_range:(30-40]             1285
4221      period:2023-H2             1085
1689      period:2023-H1             1069
1149  quality_issue:True              957


In [6]:
# Inspect the anonymous synthetic data: print the first rows of the synthetic table,
# whose columns are the case attributes (city, age_range, the *_issue flags, product_code, period)
print(acd.synthetic_df.head())

           city age_range price_issue quality_issue service_issue  \
0  Mountainview   (40-50]                                    True   
1  Mountainview   (40-50]                                    True   
2                 (20-30]                                    True   
3                 (20-30]                                    True   
4                 (20-30]        True                               

  delivery_issue description_issue product_code   period  
0           True              True            C  2023-H1  
1           True              True            C  2023-H2  
2           True              True            C  2023-H2  
3           True              True            C  2023-H2  
4           True              True            C  2023-H1  


In [7]:
# Inspect the error report for the aggregate data.
# How to read the printed table:
#   Length       - the length of the attribute value combination being counted
#   Error        - the mean absolute error in the count of the attribute value combination,
#                  calculated as the absolute difference between the actual count and the
#                  anonymized/protected count divided by the actual count
#                  NOTE(review): the column is printed as "Count +/- Error"; confirm whether
#                  Error is an absolute count difference or is relative to the actual count
#   Suppressed % - the percentage of attribute value combination counts that were suppressed,
#                  out of the total count of attribute value combinations
#   Fabricated % - the percentage of attribute value combination counts that were fabricated,
#                  out of the total count of attribute value combinations

print(acd.aggregate_error_report)

    Length  Count +/- Error Suppressed % Fabricated %
0        1  160.66 +/- 5.72       4.07 %       0.00 %
1        2   23.85 +/- 5.19       9.19 %       0.21 %
2        3    6.85 +/- 3.66      17.78 %       2.61 %
3        4    2.85 +/- 2.22      36.81 %       7.84 %
4  Overall    6.88 +/- 3.07      17.95 %       2.53 %


In [8]:
# Inspect the error report for the synthetic data.
# How to read the printed table:
#   Length       - the length of the attribute value combination being counted
#   Error        - the mean absolute error in the count of the attribute value combination,
#                  calculated as the absolute difference between the actual count and the
#                  anonymized/protected count divided by the actual count
#                  NOTE(review): this wording matches the aggregate report; presumably the
#                  comparison here is against counts in the synthetic records — verify
#   Suppressed % - the percentage of attribute value combination counts that were suppressed,
#                  out of the total count of attribute value combinations
#   Fabricated % - the percentage of attribute value combination counts that were fabricated,
#                  out of the total count of attribute value combinations

print(acd.synthetic_error_report)

    Length  Count +/- Error Suppressed % Fabricated %
0        1  160.66 +/- 5.72       4.07 %       0.00 %
1        2  23.85 +/- 11.05       9.19 %       0.11 %
2        3    6.85 +/- 5.19      18.58 %       1.29 %
3        4    2.85 +/- 2.65      40.03 %       4.69 %
4  Overall    6.88 +/- 4.54      18.98 %       1.14 %


In [9]:
# Example chart: top attributes as a bar chart.
# The call returns both the figure and the dataframe behind it.
bar_chart, bar_chart_df = acd.get_bar_chart_fig(
    # What to plot
    selection=[],  # Prefilter the dataset by adding attribute values here
    show_attributes=[],
    num_values=10,
    unit="Customer",
    # How to draw it
    scheme=color_schemes["Alphabet"],
    width=1000,
    height=600,
)
bar_chart.show()

In [10]:
# Example chart: time series line chart, one series per issue attribute over `period`.
# The call returns both the figure and the dataframe behind it.
line_chart, line_chart_df = acd.get_line_chart_fig(
    # What to plot
    selection=[],  # Prefilter the dataset by adding attribute values here
    series_attributes=[
        "quality_issue",
        "price_issue",
        "service_issue",
        "delivery_issue",
        "description_issue",
    ],
    time_attribute="period",
    unit="Customer",
    # How to draw it
    scheme=color_schemes["Alphabet"],
    width=1000,
    height=600,
)
line_chart.show()

In [11]:
# Example chart: alluvial/flow chart of city-product relationships,
# with the "price_issue:True" attribute value highlighted.
# The call returns both the figure and the dataframe behind it.
flow_chart, flow_chart_df = acd.get_flow_chart_fig(
    # What to plot
    selection=[],  # Prefilter the dataset by adding attribute values here
    source_attribute="city",
    target_attribute="product_code",
    highlight_attribute="price_issue:True",
    unit="Customer",
    # How to draw it
    scheme=color_schemes["Alphabet"],
    width=1000,
    height=600,
)
flow_chart.show()