## Anonymisation du jeu de données des équidés

### Import

In [None]:
# Import modules

# Classic modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Anonymization modules
from anonymizer.anonymity import get_k
from anonymizer.anonymity import local_aggregation

#from anonympy.pandas import dfAnonymizer

from pycanon import anonymity, report

# Functions
from utils.exploration import explo, clean, drop
from utils.correlation import categorical_comparison, p_vals_correction, numerical_correlation
from utils.tools import col_set
from utils.outliers import identify_outliers, explore_outliers, identify_num_outliers, cluster
from utils.stats import info_loss, categorical_loss, numerical_loss, plot_info_loss
from utils.inference import Infer_Model, compare_models

## User variables

In [None]:
# Location of the CSV file to anonymise; fill it in before running the notebook.
path: str = ""

# Names of the columns to load from that file (handed to read_csv as `usecols`).
cols: list = []

In [None]:
# Import data
# Only the columns listed in `cols` are loaded. An empty `cols` list (the
# template default) is turned into `usecols=None` so that the whole file is
# read instead of an empty column selection.

df = pd.read_csv(
    path,
    usecols=cols or None,

    # Uncomment the following lines as needed

    #encoding="utf-8",
    #sep=",",
    #lineterminator="\n",
    #header=0,

    # The following is useful if your dataset is large and you wish to test this notebook.
    #nrows=100000
    )

In [None]:
# Display the column labels that were actually loaded into `df`.
df.columns

## User variables

In [None]:
# Column roles, to be filled in by the user (all empty by default).
id_cols: list = []   # identifier fields
num_cols: list = []  # numerical columns
cat_cols: list = []  # categorical columns
dat_cols: list = []  # date columns

### Clean-up and exploration

In [None]:
# Clean data
# `id_cols` is forwarded as the identifier columns. The return value is not
# captured, so any clean-up is presumably applied to `df` in place — confirm
# against utils.exploration.clean.

clean(df, id_cols= id_cols)

In [None]:
# Explore data
# The categorical, date and numerical column lists defined above are passed
# in that order.

explo(df, cat_cols, dat_cols, num_cols)

## User Variables

In [None]:
# Delete missing values and columns

# List here any extra column that should be removed from `df`.
columns_to_drop: list = []
drop(df, columns_to_drop)

# Name of the target column. It has to be one of the columns left in `df`:
# a later cell removes it from the list of `df` columns to build the
# quasi-identifiers.
target: str = ''

In [None]:
# Preview the first ten rows of `df` after the clean-up and drop steps.
df.head(10)

In [None]:
# Pairwise comparison of the categorical columns.
# `col_set` yields column combinations; only the pairs are kept here.
combines = col_set(cat_cols)
combines_2 = list(filter(lambda subset: len(subset) == 2, combines))

# p-values are collected in the same order as the pairs in `combines_2`.
pvals = []
for first_col, second_col in combines_2:
    # The second return value is only used by the optional plot below
    # (presumably a contingency table — confirm in utils.correlation).
    p_value, table = categorical_comparison(df, first_col, second_col)
    pvals.append(p_value)
    print("The p-value of the chi2 test between {} and {} is {}".format(first_col, second_col, p_value))
    #table.plot.bar(figsize=(7,4), rot=0)

In [None]:
# TODO: call numerical_correlation here to study the numerical columns

### Anonymisation

In [None]:
# Choose target variables: every column of `df` takes part in the anonymisation.
target_variables = list(df.columns)

# Group-size threshold k handed to the local aggregation: 0.1 % of the rows.
# NOTE: despite its name, `n_1_perc` has always used a factor of 0.001, not 0.01.
# Integer division expresses the same floor without float arithmetic, and the
# lower bound of 2 keeps k meaningful on datasets under 2000 rows (k < 2 does
# not provide any k-anonymity).
n_1_perc = max(2, len(df) // 1000)
print(n_1_perc)

# Sensitive attribute(s) and quasi-identifiers (every other column).
SA = [target]
QI = target_variables.copy()
QI.remove(target)  # raises ValueError if `target` is not a column of `df`

# Work on a copy so that `df` keeps the original values; later cells compare
# against it.
cols_df = df[target_variables].copy()

In [None]:
# Creation of validation set
# 5 % of the rows are held out. The fixed seed makes the split reproducible
# across kernel restarts.
val_set = df[target_variables].sample(frac=0.05, random_state=42)

# The working set is rebuilt from `df` instead of being trimmed in place, so
# re-running this cell does not shrink `cols_df` a little more each time.
cols_df = df[target_variables].drop(index=val_set.index)

In [None]:
from utils.ano_correc import all_local_aggregation, get_diversities, less_diverse_groups, get_l

In [None]:
# Anonymise a copy of the working set with k = n_1_perc. Every column in
# `target_variables` is passed as a variable to aggregate, using the
# 'regroup_with_smallest' method.
ano_df = all_local_aggregation(cols_df.copy(),k=n_1_perc, variables = target_variables, method = 'regroup_with_smallest')

In [None]:
# l-diversity check: each column in turn plays the sensitive attribute while
# all the other columns act as quasi-identifiers.
# Loop-local names are used on purpose. The notebook-level `cols` (columns to
# load from the CSV) and `QI` (quasi-identifiers of the chosen target) must
# keep their values, because the cells that follow still read them.
ano_columns = list(ano_df.columns)
for sensitive_col in ano_columns:
    loop_qi = ano_columns.copy()
    loop_qi.remove(sensitive_col)

    # Average number of rows per group of identical quasi-identifier values.
    mean_group_size = np.mean(ano_df.groupby(loop_qi)[sensitive_col].count())
    l_value = get_l(ano_df, loop_qi, sensitive_col)
    print("For {} as QIs and {} as the target, the l-diversity is of {} throughout {} values on average.".format(loop_qi, sensitive_col, l_value, mean_group_size))

    # Only inspected through the optional print below.
    diversities = get_diversities(ano_df, loop_qi, sensitive_col)
    #print(diversities.head())
    #print(trial.head())

### Approche adverse

In [None]:
# Work on an independent copy: the cells below reset the index of `temp` and
# drop rows from it in place, which would otherwise also modify `ano_df`.
temp = ano_df.copy()

In [None]:
# Calculate k for k-anonymity:
#k = anonymity.k_anonymity(temp, QI)

#print("According to the anonymity pycanon module, the k-anonymity is {}".format(k))

In [None]:
# Print the anonymity report:
# The rows are first renumbered 0..n-1 in place (the previous index is discarded).
temp.reset_index(inplace=True, drop=True)
# `QI` holds the quasi-identifiers and `SA` the sensitive attribute(s) passed to pycanon.
report.print_report(temp, QI, SA)

# Analyze results

#### Non individualisation :

In [None]:
# Identify outliers
# Start with the summary statistics of the anonymised frame.

temp.describe()

Identification des outliers en termes de données catégorielles

Raisonnement suivi :  \
Pour chaque combinaison de colonnes, on regarde le compte de chevaux qui ont la même combinaison de ces variables.\
S'il y a moins de n individus pour une même combinaison, on la stocke dans outliers.\
On ajoute ce dataframe à un dictionnaire qui a pour clé le nom des colonnes dont il est question.\
\
Limites de cette méthode :
- On n'a pas de méthode prédéfinie d'analyse de ces outliers.

In [None]:
# A methodology is still needed: given the total size of the dataset, how should an acceptable value of n be chosen?
# Here, 10 for a dataset of 10,000 rows, i.e. a ratio of 0.001
n = 10
# NOTE(review): this rebinds the notebook-level `cat_cols` to every column of
# `temp`, replacing the user-supplied list of categorical columns.
cat_cols = temp.columns

In [None]:
# Collect the outliers using the threshold `n`. The result is consumed below
# as a mapping whose values expose an `.index`.
dic = identify_outliers(temp, target, cat_cols, n)

In [None]:
# Union of the index labels of every entry in `dic`: the rows to discard later.
to_remove = {label for outliers in dic.values() for label in outliers.index.values}

In [None]:
# explore_outliers(df, {"RACE": "SELLE ETRANGER", "PAYS_DE_NAISSANCE":"PAYS-BAS", "ROBE":"NOIR PANGARE"})

In [None]:
# Project the one-hot encoded quasi-identifiers onto two principal components
# and look at the resulting scatter plot.

from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot matrix of the quasi-identifier columns.
encoder = OneHotEncoder()
encoded_qi = encoder.fit_transform(temp[QI]).toarray()

pca = PCA(n_components=2)

# Named `pca_coords` rather than `res`: `res` is used later in the notebook
# for the inference scores, which are unrelated to this projection.
pca_coords = pd.DataFrame(pca.fit_transform(encoded_qi), columns=['PC1', 'PC2'])

# Explicit figure/axes with labels so that the plot can be read on its own.
fig, ax = plt.subplots()
ax.scatter(pca_coords['PC1'], pca_coords['PC2'])
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('PCA projection of the one-hot encoded quasi-identifiers')
plt.show()

Identification des outliers en termes de données numériques

In [None]:
# TODO: use identify_num_outliers here for the numerical columns

Identification des doubles outliers

In [None]:
# Cluster analysis

# cluster(temp, n_clusters = 100, cat_cols = cat_cols)


Traitement des outliers

In [None]:
# Renumber the rows of the pre-anonymisation frame 0..n-1, discarding the old
# index, so that the labels in `to_remove` can be applied to it.
# NOTE(review): assumes its rows are in the same order as those of `temp` — confirm.
cols_df = cols_df.reset_index(drop=True)

In [None]:
# Outliers work against non-singling-out: we are allowed to set that data aside

# The same labels are removed, in place, from the pre-anonymisation frame and
# from the anonymised one.
cols_df.drop(index = to_remove, inplace=True)
temp.drop(index=to_remove, inplace=True)

#### Non inférence :

Premier scénario d'attaque : prédire un attribut d'un individu

In [None]:
# Compare the inference for every choice of target column.
# For each column, one model is trained on the anonymised data (`temp`) and one
# on the data before anonymisation (`cols_df`); both are then compared on the
# held-out validation set.
# Loop-local names (`attr`, `predictors`) are used so that the notebook-level
# `target` and `QI` are left intact for the cells that follow.

score_rows = []

for attr in temp.columns:
    predictors = list(temp.columns)
    predictors.remove(attr)

    model = Infer_Model(temp, cat_cols=predictors, num_cols=[], target=attr)
    before_model = Infer_Model(cols_df, cat_cols=predictors, num_cols=[], target=attr)
    model.prep_data()
    before_model.prep_data()
    # Give the anonymised frame the columns of the original one; columns
    # missing on the anonymised side are filled with 0.
    model.df = model.df.align(before_model.df, join='right', axis=1, fill_value=0)[0]

    x_train, x_test, y_train, y_test = model.split()
    pred = model.train_model(x_train, x_test, y_train, y_test)
    x_train, x_test, y_train, y_test = before_model.split()
    pred_original = before_model.train_model(x_train, x_test, y_train, y_test)

    # The validation frame is aligned on the same columns before scoring.
    val_model = Infer_Model(val_set, cat_cols=predictors, num_cols=[], target=attr)
    val_model.prep_data()
    val_model.df = val_model.df.align(before_model.df, join='right', axis=1, fill_value=0)[0]
    feature_cols = list(val_model.df.columns)
    feature_cols.remove(attr)
    _, model_score, delta = compare_models(pred, pred_original, val_model.df[feature_cols], val_model.df[attr], print=False)
    score_rows.append([attr, model_score, delta])

# Build the result frame once instead of appending row by row.
res = pd.DataFrame(score_rows, columns=["Target", "Score de performance de la prédiction", "Delta in score"])



In [None]:
# One bar per target column, stacking the two score columns of `res`.
res.plot.bar(x='Target', stacked=True, color=['tomato','lightseagreen'], figsize=(7,5))

In [None]:
# Presumably plots the information loss between the original `df` and the
# anonymised `temp` for the columns in `cat_cols` — confirm in utils.stats.
plot_info_loss(df, temp, cat_cols)

Deuxième scénario d'attaque : réentraîner un modèle en connaissant un certain nombre de lignes

In [None]:
# Train on a subset: the attacker is assumed to know 30 % of the original rows.
attack_df = df[target_variables].copy()

# Fixed seed so that the attacker/evaluation split is identical on every run.
attacker_set = attack_df.sample(frac=0.3, random_state=42)
# The rows the attacker has not seen stay in `attack_df` for the evaluation.
attack_df = attack_df.drop(index=attacker_set.index)

In [None]:
# Train the attacker's model on the rows the attacker knows.
attack_model = Infer_Model(attacker_set, cat_cols=QI, num_cols=[], target=target)
attack_model.prep_data()
atk_x_train, atk_x_test, atk_y_train, atk_y_test = attack_model.split()
attack_pred = attack_model.train_model(atk_x_train, atk_x_test, atk_y_train, atk_y_test)

In [None]:
# Score the attacker's model on the anonymised rows, then on the untouched
# rows, to see whether it now under-performs.
attacker_ano = all_local_aggregation(attack_df.copy(), k=n_1_perc, variables=QI, method='regroup_with_smallest')

for eval_frame in (attacker_ano, attack_df):
    val_model = Infer_Model(eval_frame, cat_cols=QI, num_cols=[], target=target)
    val_model.prep_data()
    # Align on the columns the attacker's model was prepared with; missing ones become 0.
    aligned = val_model.df.align(attack_model.df, join='right', axis=1, fill_value=0)[0]
    val_model.df = aligned
    feature_cols = list(val_model.df.columns)
    feature_cols.remove(target)
    score = attack_pred.score(val_model.df[feature_cols], val_model.df[target])
    print(score)

#### Qualité de la donnée statistique

In [None]:
# Information loss - categorical data
info_loss(df, temp, QI)

# TODO: also report the information loss on the target variable, since that is ultimately the point

In [None]:
# TODO: add a comparison on numerical data (mean, standard deviation, per column, correlations?)