## Anonymisation du jeu de données des équidés

### Import

In [None]:
# Import modules

# Standard library
from pathlib import Path

# Classic modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Anonymization modules
from anonymizer.anonymity import get_k
from anonymizer.anonymity import local_aggregation

#from anonympy.pandas import dfAnonymizer

from pycanon import anonymity, report

# Functions
from utils.exploration import explo, clean, drop
from utils.correlation import categorical_comparison, p_vals_correction, numerical_correlation
from utils.tools import col_set
from utils.outliers import identify_outliers, identify_num_outliers

## User variables: dataset path and columns to load

In [None]:
# Path to your dataset (handed to pd.read_csv in the next cell, so it must point to a CSV file)
path = ""

# Columns you want to study (handed to pd.read_csv through its `usecols` argument)
cols = []

In [None]:
# Import data
#
# Load the CSV located at `path` into `df`, restricted to the columns in `cols`.
# An empty `cols` list/tuple is treated as "keep every column": passed as-is,
# an empty `usecols` makes pandas load no columns at all.

df = pd.read_csv(
    path,
    usecols=(None if isinstance(cols, (list, tuple)) and len(cols) == 0 else cols),

    # Uncomment the following lines as needed

    #encoding="utf-8",
    #sep=",",
    #lineterminator="\n",
    #header=0,

    # The following is useful if your dataset is large and you wish to test this notebook.
    #nrows=100000
    )

In [None]:
# Display the names of the columns that were loaded.
df.columns

## User variables: column roles

In [None]:
# Column roles, to be filled in with column names of df.

# The identifier fields (passed to clean())
id_cols = []
# The numerical columns (passed to explo() and identify_num_outliers())
num_cols = []
# The categorical columns (passed to explo() and used for the pairwise chi2 tests)
cat_cols = []
# The date columns (passed to explo())
dat_cols = []

### Clean-up and exploration

In [None]:
# Clean data
# NOTE(review): the result of clean() is not assigned — presumably it modifies
# df in place; confirm against utils.exploration.

clean(df, id_cols= id_cols)

In [None]:
# Explore data
# explo() receives df together with the categorical, date and numerical column
# lists defined above; its result (if any) is shown as the cell output.

explo(df, cat_cols, dat_cols, num_cols)

## User variables: columns to drop and target

In [None]:
# Delete missing values and columns
# NOTE(review): the result of drop() is not assigned — presumably df is
# modified in place; confirm against utils.exploration.

# Add fields to the columns_to_drop argument if needed
columns_to_drop = []
drop(df, columns_to_drop)

# Choose the target column name
# It must name a column of df: the anonymisation section removes it from the
# list of df columns to build the quasi-identifiers.
target = ''

In [None]:
# Preview the first 10 rows of df.
df.head(10)

In [None]:
# Pairwise chi2 tests between categorical columns: collect and print one
# p-value per pair.
pvals = []

# Keep only the 2-element entries produced by col_set (presumably column
# combinations — see utils.tools).
column_pairs = [combo for combo in col_set(cat_cols) if len(combo) == 2]

for first_col, second_col in column_pairs:
    # categorical_comparison returns two values; only the first (the p-value) is used.
    p_value, _table = categorical_comparison(df, first_col, second_col)
    pvals.append(p_value)
    print("The p-value of the chi2 test between {} and {} is {}".format(first_col, second_col, p_value))
    #_table.plot.bar(figsize=(7,4), rot=0)

In [None]:
# TODO: numerical correlation analysis (numerical_correlation is imported but not called yet)

### Anonymisation

In [None]:
# Choose target variables

# Every column of df takes part in the anonymisation.
target_variables = df.columns.tolist()

# 0.001 x the number of rows, truncated to an integer; passed as `k` to
# all_local_aggregation further down.
# NOTE(review): the name suggests 1 %, the factor is 0.1 % — confirm which is
# intended. The value is 0 for fewer than 1000 rows.
n_1_perc = int(len(df) * 0.001)
print(n_1_perc)

# Sensitive attribute = the target column; quasi-identifiers = all other columns.
SA = [target]
QI = list(target_variables)
QI.remove(target)  # ValueError if `target` is not one of df's columns

# Create df copy

cols_df = df.loc[:, target_variables].copy()

In [None]:
# Creation of validation set
#
# Hold out 5 % of the rows of cols_df as val_set and remove them from cols_df.
# The sample is seeded (arbitrary fixed seed) so that a fresh Restart & Run All
# reproduces the same split, and therefore the same exported files.

val_set = cols_df.sample(frac=0.05, random_state=42)

cols_df = cols_df.drop(index=val_set.index)

In [None]:
from utils.ano_correc import all_local_aggregation, get_diversities, less_diverse_groups, get_l

In [None]:
# Anonymise a copy of cols_df (cols_df itself is not passed to the helper).
# k is the threshold computed above; all columns are used as variables.
ano_df = all_local_aggregation(
    cols_df.copy(),
    k=n_1_perc,
    variables=target_variables,
    method='regroup_with_smallest',
)

In [None]:
# l-diversity report: each column of ano_df is taken in turn as the sensitive
# attribute, with all the other columns as quasi-identifiers.
#
# Loop-local names are used on purpose: this cell used to rebind `cols`, `QI`
# and `n`, which are also notebook-level variables set in other cells.
for sensitive_col in list(ano_df.columns):
    quasi_ids = list(ano_df.columns)
    quasi_ids.remove(sensitive_col)
    # Average number of non-null `sensitive_col` values per quasi-identifier group.
    mean_group_size = ano_df.groupby(quasi_ids)[sensitive_col].count().mean()
    l_div = get_l(ano_df, quasi_ids, sensitive_col)
    print("For {} as QIs and {} as the target, the l-diversity is of {} throughout {} values on average.".format(quasi_ids, sensitive_col, l_div, mean_group_size))
    # Call kept from the original cell; its result is not used yet (uncomment to inspect).
    diversities = get_diversities(ano_df, quasi_ids, sensitive_col)
    #print(diversities.head())

### Protection des outliers

In [None]:
# Work on a copy of the anonymised frame: a plain `temp = ano_df` is only an
# alias, so any in-place change made to temp would also alter ano_df.
temp = ano_df.copy()

In [None]:
# A methodology is still needed: given the total size of the dataset, how should an acceptable value of n be chosen?
# Here, 10 for n = 10 000 (the original note gives the ratio as 0.0001; 10 / 10 000 is 0.001 — TODO confirm which was meant)
n = 10
# NOTE(review): this rebinds cat_cols (until now the user-supplied list of
# categorical columns) to every column of temp.
cat_cols = temp.columns

Identification des outliers en termes de données catégorielles

In [None]:
# Outlier detection on the anonymised frame, using the target column, the
# columns in cat_cols and the threshold n.
# The result is consumed below through dic.values(), each value exposing an
# .index — see utils.outliers for the exact meaning of n (TODO confirm).
dic = identify_outliers(temp, target, cat_cols, n)

In [None]:
# Union of the index labels of every value stored in dic: the rows to discard.
to_remove = {label for frame in dic.values() for label in frame.index.values}

Identification des outliers en termes de données numériques

In [None]:
# Numerical outlier check, run on cols_df (not on the anonymised frame) for the
# columns in num_cols. The result is not stored; whatever the helper returns is
# shown as the cell output.
identify_num_outliers(cols_df, num_cols, target)

Traitement des outliers

In [None]:
# Renumber the rows of cols_df from 0, discarding the previous index labels.
# NOTE(review): presumably done so that the labels in to_remove (taken from
# temp) line up with cols_df in the drop that follows — confirm that temp uses
# the same 0-based index, otherwise the wrong rows get removed.
cols_df = cols_df.reset_index(drop=True)

In [None]:
# Outliers run counter to non-individualisation (they can be singled out): we are allowed to discard those records
# The same index labels are removed, in place, from both frames.
# NOTE(review): re-running this cell raises KeyError, the labels being already gone.

cols_df.drop(index = to_remove, inplace=True)
temp.drop(index=to_remove, inplace=True)

In [None]:
# Export the three frames: cols_df (rows kept, not anonymised), val_set (the
# held-out sample) and temp (anonymised rows). The index is written as the
# first CSV column (pandas default).
#
# Create the output folder first: to_csv does not create missing directories.
Path("data").mkdir(parents=True, exist_ok=True)

cols_df.to_csv("data/ori.csv")
val_set.to_csv("data/control.csv")
temp.to_csv("data/ano.csv")