## Module adverse - analyse de qualité de l'anonymisation

### Import

In [None]:
# Import modules

# Classic modules
import pandas as pd
import matplotlib.pyplot as plt
import time as time
import numpy as np

#from anonympy.pandas import dfAnonymizer

from pycanon import anonymity, report

# Functions
from utils.exploration import explo, clean, drop
from utils.correlation import categorical_comparison, p_vals_correction, numerical_correlation
from utils.tools import col_set
from utils.outliers import identify_outliers, explore_outliers, identify_num_outliers, cluster
from utils.stats import categorical_loss, numerical_loss, plot_info_loss, target_loss
from utils.inference import Infer_Model, compare_models

from utils.ano_correc import all_local_aggregation, get_diversities, less_diverse_groups, get_l

In [None]:
# Record the notebook start time; the last cell subtracts it from the end time to report the total run time
t0 = time.time()

In [None]:
# Path to your dataset
# (the names suggest the anonymized, original and control/validation files — confirm the file contents)
path_to_ano = "data/ano.csv"
path_to_original = "data/ori.csv"
path_to_val = "data/control.csv"

# Columns you want to study
# The same list is passed as `usecols` to every read_csv call below; fill it in before running
cols = []

In [None]:
# Import data

# Load the original dataset, keeping only the studied columns.
# Optional parser settings are listed below; uncomment the ones you need.
cols_df = pd.read_csv(
    path_to_original,
    sep=",",
    usecols=cols,
    # encoding="utf-8",
    # lineterminator="\n",
    # header=0,
)

In [None]:
# Load the anonymized dataset and the validation set with the same column subset
ano_df = pd.read_csv(path_to_ano, usecols=cols)
val_set = pd.read_csv(path_to_val, usecols=cols)

## User variables

In [None]:
# Target column name, quasi-identifier columns (QI) and sensitive attributes (SA) — to be filled in by the user
# SA is built from `target` at this point, so set `target` before this line runs
target =''
QI = []
SA = [target]

# The following must include the target column name in the appropriate list
num_cols = []
cat_cols = []

# This value represents 0.1 percent of the total dataframe size
# (rounded down; it is later passed as `k` to all_local_aggregation)
# NOTE(review): evaluates to 0 when cols_df has fewer than 1000 rows — confirm that k=0 is acceptable downstream
n_1_perc = int((len(cols_df)*0.001)//1)

### Approche adverse

#### Non individualisation :

In [None]:
# Identify outliers

# Summary statistics of the anonymized data, as a first look at the value ranges
ano_df.describe()

Identification des outliers en termes de données catégorielles

In [None]:
# Project the one-hot encoded QI columns onto two principal components so that
# potential outliers based on categorical features can be spotted on a 2-D scatter plot
# NOTE(review): the encoder is applied to QI, not cat_cols — confirm every QI column is categorical

from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder

# Encode the QI columns and densify the sparse result
enc = OneHotEncoder()
X = enc.fit_transform(ano_df[QI]).toarray()

# Keep the first two principal components only
pca = PCA(n_components=2)
components = pca.fit_transform(X)
res = pd.DataFrame(components, columns=['PC1', 'PC2'])

plt.scatter(x=res['PC1'], y=res['PC2'], c="tomato")
plt.title("Principal Components repartition(n=2)")
plt.show()

Identification des outliers en termes de données numériques

In [None]:
# Using a scatter plot, this allows us to visualize the potential outliers based on numerical features
# (helper imported from utils.outliers; it receives the anonymized data, the numerical columns and the target)
identify_num_outliers(ano_df, num_cols, target)

Identification des doubles outliers

In [None]:
# Using a cluster analysis, this allows us to visualize the potential outliers based on both the numerical and categorical features
# This is time consuming and therefore commented out for faster execution time
# NOTE(review): `temp` is not defined in the visible cells — pass the intended dataframe before re-enabling this call

#cluster(temp, n_clusters = 100, cat_cols = cat_cols)


#### Non inférence :

Premier scénario d'attaque : prédire un attribut d'un individu

In [None]:
# This studies whether or not an attacker with the anonymized dataset is more or less capable of predicting an individual's
# attribute than if they had the original dataset

# For each variable, this considers it as the target variable. By training a prediction model (classification or regression)
# on both the anonymized and original data, this allows us to compare both of these models' performance

# One [target, score, delta] row per column; the result dataframe is built once after the loop
inference_rows = []

# The loop uses its own name (`inferred_target`) instead of assigning to `target`:
# the user-defined `target` is read again by later cells and must not be overwritten here
for inferred_target in map(str, ano_df.columns):
    # Same model wrapper on the anonymized, original and validation data
    model = Infer_Model(ano_df, cat_cols=cat_cols, num_cols=num_cols, target=inferred_target)
    before_model = Infer_Model(cols_df, cat_cols=cat_cols, num_cols=num_cols, target=inferred_target)
    val_model = Infer_Model(val_set, cat_cols=cat_cols, num_cols=num_cols, target=inferred_target)

    model.prep_data()
    before_model.prep_data()
    val_model.prep_data()

    # Give the anonymized and validation frames the column layout of the original frame;
    # columns they do not have are filled with 0
    model.df = model.df.align(before_model.df, join='right', axis=1, fill_value=0)[0]
    val_model.df = val_model.df.align(before_model.df, join='right', axis=1, fill_value=0)[0]

    # Train one predictor on the anonymized data and one on the original data
    x_train, x_test, y_train, y_test = model.split()
    pred = model.train_model(x_train, x_test, y_train, y_test)
    x_train, x_test, y_train, y_test = before_model.split()
    pred_original = before_model.train_model(x_train, x_test, y_train, y_test)

    # Compare both predictors on the validation set (every column except the current target is a feature)
    new_cols = list(val_model.df.columns)
    new_cols.remove(inferred_target)
    _, model_score, delta = compare_models(pred, pred_original, val_model.df[new_cols], val_model.df[inferred_target], print_bool=False)
    inference_rows.append([inferred_target, model_score, delta])

res = pd.DataFrame(inference_rows, columns=["Target", "Score de performance de la prédiction", "Delta in score"])


In [None]:
# Stacked bar chart with one bar per target column; the two colors are applied to the remaining columns of `res` in order
res.plot.bar(x='Target', stacked=True, color=['tomato','lightseagreen'], figsize=(7,5))

Deuxième scénario d'attaque : réentraîner un modèle en connaissant un certain nombre de lignes

In [None]:
# Second attack scenario: the attacker knows a subset of the original rows and tries to
# train a high performing prediction model from it

# Rows known to the attacker: a random 30 % sample of the original data (studied columns only).
# No random_state is given, so the sample changes from one run to the next.
attacker_set = cols_df[cols].sample(frac=0.3)

# Every remaining row of the original data, i.e. the rows that were not sampled
attack_df = cols_df[cols].drop(index=attacker_set.index)

In [None]:
# This creates and trains the prediction model

# The model is fitted on the attacker's sample only.
# It relies on the module-level `target`: verify it still holds the user-defined value when this cell runs.
attack_model = Infer_Model(attacker_set, cat_cols=cat_cols, num_cols = num_cols, target = target)
attack_model.prep_data()
x_train, x_test, y_train, y_test =  attack_model.split()
attack_pred = attack_model.train_model(x_train, x_test, y_train, y_test)

In [None]:
# Score the attacker's model on the rows it was not trained on, once after local
# aggregation of the QI columns and once on the same rows left as they are

attacker_ano = all_local_aggregation(
    attack_df.copy(),
    k=n_1_perc,
    variables=QI,
    method='regroup_with_smallest',
)

# Aggregated rows first, raw rows second: the two printed scores follow that order
for x in (attacker_ano, attack_df):
    val_model = Infer_Model(x, cat_cols=cat_cols, num_cols=num_cols, target=target)
    val_model.prep_data()

    # Give the evaluation frame the column layout of the attacker's training frame (missing columns are filled with 0)
    aligned_pair = val_model.df.align(attack_model.df, join='right', axis=1, fill_value=0)
    val_model.df = aligned_pair[0]

    # Every column except the target is used as a feature
    new_cols = list(val_model.df.columns)
    new_cols.remove(target)

    features = val_model.df[new_cols]
    labels = val_model.df[target]
    print(attack_pred.score(features, labels))

#### Qualité de la donnée statistique

Perte d'information - donnée catégorielle

In [None]:
# Information-loss helpers from utils.stats, each given the original and the anonymized data
# NOTE(review): categorical_loss receives QI while plot_info_loss receives cat_cols — confirm this difference is intended
categorical_loss(cols_df, ano_df, QI)

plot_info_loss(cols_df,ano_df, cat_cols)

# Relies on the module-level `target`: verify it still holds the user-defined value when this cell runs
target_loss(cols_df, ano_df, target)

Perte d'information - donnée numérique

In [None]:
# Information-loss helper from utils.stats for the numerical columns, given the original and the anonymized data
numerical_loss(cols_df, ano_df, num_cols)

Qualité et évolution de l'anonymisation

In [None]:
# Calculate k for k-anonymity of the anonymized data with respect to the QI columns
# (the equivalent computation for the original data is in the next cell, commented out)
k = anonymity.k_anonymity(ano_df, QI)

print("According to the anonymity pycanon module, the k-anonymity post-anonymization is {}".format(k))

In [None]:
# Optional: k-anonymity of the original data, left commented out; uncomment both lines to compare with the value above
#k_og = anonymity.k_anonymity(cols_df, QI)

#print("According to the anonymity pycanon module, the k-anonymity pre-anonymization is {}".format(k_og))

In [None]:
# Calculate l for l-diversity on the anonymized data:
# each column is taken in turn as the target, all the other columns acting as quasi-identifiers.
# Cell-local names are used on purpose: assigning to `cols` or `QI` here would overwrite the
# user-defined lists, and `QI` is read again by the anonymity report further down.

ano_columns = list(ano_df.columns)
for sensitive_col in ano_columns:
    other_cols = [c for c in ano_columns if c != sensitive_col]
    # Average number of non-null target values per group of identical quasi-identifier values
    avg_group_size = np.mean(ano_df.groupby(other_cols)[sensitive_col].count())
    l_value = get_l(ano_df, other_cols, sensitive_col)
    print("For {} as QIs and {} as the target, the l-diversity is of {} throughout {} values on average.".format(other_cols, sensitive_col, l_value, avg_group_size))

In [None]:
# Calculate l for l-diversity on the original data:
# each column is taken in turn as the target, all the other columns acting as quasi-identifiers.
# Cell-local names are used on purpose: assigning to `cols` or `QI` here would overwrite the
# user-defined lists, and `QI` is read again by the anonymity report further down.

original_columns = list(cols_df.columns)
for sensitive_col in original_columns:
    other_cols = [c for c in original_columns if c != sensitive_col]
    # Average number of non-null target values per group of identical quasi-identifier values
    avg_group_size = np.mean(cols_df.groupby(other_cols)[sensitive_col].count())
    l_value = get_l(cols_df, other_cols, sensitive_col)
    print("For {} as QIs and {} as the target, the l-diversity is of {} throughout {} values on average.".format(other_cols, sensitive_col, l_value, avg_group_size))

In [None]:
# Print the anonymity report:

# The index is reset in place (old index dropped) before the report is computed.
# The call relies on the module-level QI and SA: verify they still hold the user-defined values when this cell runs.
ano_df.reset_index(inplace=True, drop=True)
report.print_report(ano_df, QI, SA)

In [None]:
# Print the computational time

# Wall-clock seconds elapsed since t0 was recorded at the top of the notebook
t1 = time.time()
elapsed_time = t1-t0
print("The computational time for the adversary module is {}".format(elapsed_time))