## Anonymisation du jeu de données des équidés

### Import

In [None]:
# Import modules

# Classic modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time

# Anonymization modules
from anonymizer.anonymity import get_k
from anonymizer.anonymity import local_aggregation

#from anonympy.pandas import dfAnonymizer

from pycanon import anonymity, report

# Functions
from utils.exploration import explo, clean, drop
from utils.correlation import categorical_comparison, p_vals_correction, numerical_correlation
from utils.tools import col_set
from utils.outliers import identify_outliers, identify_num_outliers

In [None]:
# Record the start time; the last cell subtracts it from the end time
# to report how long the whole notebook took.
t0 = time.time()

## User variables

In [None]:
# Location of the CSV file to anonymise; fill it in before running the notebook.
path = ""

# Names of the columns to load from the file.
cols = []

# Upper bound on the number of rows read from the file.
nrows = 1_000_000

In [None]:
# Load the dataset from the CSV file at `path` into `df`.

df = pd.read_csv(
    path,

    # Adjust the following options to match your file

    # An empty `cols` list means "no selection": every column is loaded
    # instead of asking pandas for an empty column subset.
    usecols=cols or None,
    encoding="utf-8",
    sep=";",
    lineterminator="\n",
    header=0,

    # The following is useful if your dataset is large and you wish to test this notebook.
    nrows=nrows,
)

In [None]:
# Display the names of the loaded columns.
df.columns

In [None]:
# Fill in each list with column names from the dataset.
# The identifier fields (passed to clean())
id_cols = []
# The numerical columns (explored, perturbed with noise, checked for outliers)
num_cols = []
# The categorical columns (explored, compared pairwise, checked for outliers)
cat_cols = []
# The date columns (explored)
dat_cols = []

In [None]:
# Delete missing values and columns
# NOTE(review): only the column list is visible here; whether drop() also
# removes missing values must be confirmed in utils.exploration.

# Add fields to the columns_to_drop list if needed
columns_to_drop = []
df = drop(df, columns_to_drop)

# Choose the target column name.
# It must be an existing column of df: the anonymisation cell calls
# QI.remove(target), which raises ValueError otherwise.
target = ''

### Clean-up and exploration

In [None]:
# Clean data
# NOTE(review): the return value is discarded, so clean() presumably
# modifies df in place — confirm in utils.exploration.

clean(df, id_cols= id_cols)

In [None]:
# Explore data: run the project's exploration helper on the categorical,
# date and numerical columns declared above.

explo(df, cat_cols, dat_cols, num_cols)

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Chi2 comparison of every pair of categorical columns; the p-values are
# collected in `pvals` and printed as they are computed.
pvals = []

combines = col_set(cat_cols)
# Only the two-column combinations are compared.
combines_2 = [pair for pair in combines if len(pair) == 2]

for first, second in combines_2:
    p_value, detail = categorical_comparison(df, first, second)
    pvals.append(p_value)
    print("The p-value of the chi2 test between {} and {} is {}".format(first, second, p_value))
    # detail.plot.bar(figsize=(7, 4), rot=0)

In [None]:
# TODO: placeholder cell; numerical_correlation is imported above but not called yet.

### Anonymisation

In [None]:
# Anonymisation settings: columns to process, aggregation size k, column roles.

# Every column of df takes part in the anonymisation.
target_variables = list(df.columns)

# The aggregation is made with k = 0.1% of the dataset size.
# Integer division avoids float rounding, and k is floored at 2 because
# groups of fewer than two records give no protection (the previous formula
# yielded k = 0 below 1000 rows and k = 1 below 2000 rows).
# The name n_1_perc is kept because later cells read it.
n_1_perc = max(2, len(df) // 1000)
print(n_1_perc)

# SA / QI: presumably the sensitive attribute (the target) and the
# quasi-identifiers (all other columns) — confirm against their consumers.
SA = [target]
QI = target_variables.copy()
QI.remove(target)  # raises ValueError if `target` is not a column of df

# Work on a copy so that df itself is left untouched.
cols_df = df[target_variables].copy()

In [None]:
# Hold out a random 5% of the rows as the validation (control) set ...
val_set = cols_df.sample(frac=0.05)

# ... and keep only the remaining rows for the anonymisation.
cols_df = cols_df.drop(index=val_set.index)

In [None]:
# Local aggregation with group size k = n_1_perc, applied to a copy of cols_df
# so that cols_df keeps the original values.
# NOTE(review): the original comment says "on categorical columns", but every
# column in target_variables is passed as `variables` — confirm which columns
# all_local_aggregation actually aggregates.

from utils.ano_correc import all_local_aggregation, get_diversities, less_diverse_groups, get_l

ano_df = all_local_aggregation(cols_df.copy(),k=n_1_perc, variables = target_variables, method = 'regroup_with_smallest')

In [None]:
# Noise addition on numerical columns: the aggregated frame is passed through
# numerical_perturbation and the result replaces ano_df.

from utils.perturbation import numerical_perturbation

ano_df = numerical_perturbation(ano_df, num_cols)

### Protection des outliers

In [None]:
# Outlier threshold passed to identify_outliers: the minimal number of
# duplicated attributes a record needs in order not to be considered an outlier.
# It is one tenth of the aggregation size (integer division), so it is 0
# whenever n_1_perc < 10.

n = n_1_perc//10

Identification des outliers en termes de données catégorielles

In [None]:
# Look for outliers among the categorical columns of the anonymised data,
# using threshold n. The next cell reads `.index` on each value of the
# returned mapping.
dic = identify_outliers(ano_df, target, cat_cols, n)

In [None]:
# Merge the index labels of every outlier group into one set of rows to delete.
to_remove = {label for group in dic.values() for label in group.index.values}

Identification des outliers en termes de données numériques

In [None]:
# Inspect numerical outliers. The return value is not stored, so nothing
# found here is added to to_remove.
# NOTE(review): this runs on cols_df (original values), not on ano_df — confirm
# that this is intended.
identify_num_outliers(cols_df, num_cols, target)

Traitement des outliers

In [None]:
# Renumber cols_df's rows 0..n-1, discarding the previous index labels.
# NOTE(review): to_remove was built from the output of
# identify_outliers(ano_df, ...) and is applied to cols_df in the next cell.
# This reset keeps the two frames aligned only if ano_df is itself indexed
# 0..n-1 in the same row order — confirm.
cols_df.reset_index(inplace=True, drop=True)

### Sauvegarde des jeux de données

In [None]:
# Outliers are by definition a threat to non-individualisation, so the rows
# listed in to_remove are deleted from both the original and anonymised data.
for frame in (cols_df, ano_df):
    frame.drop(index=to_remove, inplace=True)

In [None]:
# Save the three datasets: original (outliers removed), control and anonymised.

# to_csv does not create missing directories, so make sure "data/" exists
# before writing; otherwise the first save raises OSError.
from pathlib import Path

Path("data").mkdir(parents=True, exist_ok=True)

cols_df.to_csv("data/ori.csv")
val_set.to_csv("data/control.csv")
ano_df.to_csv("data/ano.csv")

In [None]:
# Report the wall-clock time, in seconds, elapsed since t0 was recorded.
# `nrows` is the requested row limit set in the user variables.

t1 = time.time()
elapsed_time = t1 - t0
template = "The computational time for {} rows is {}"
print(template.format(nrows, elapsed_time))