# Lab 6

### Pedro Otero García & Alexandre Sousa Cajide

In [9]:
import numpy as np
import pandas as pd


def generate_dataset(n: int=200):
    '''
    Generates a toy dataset containing n distinct samples.

    - n: number of samples to generate

    Returns:
    - A tuple containing:
        * Dataset as a Pandas Dataframe
        * List of quasi-identifiers
        * Sensitive column name
    '''
    diseases = np.array(["Angine", "Appendicite", "Chlamydia", "Cataracte", "Dengue", 
                         "Eczéma", "Grippe", "Hépatite B", "Hépatite C", "Rhino-pharyngite", 
                         "Otite", "Rougeole", "Scarlatine", "Urticaire", "Varicelle", "Zona"])
    zipcodes = np.array([35000, 35200, 37000, 40000, 40500, 50000, 52000, 60000, 62000, 68000, 
                         75000, 75001, 75002, 75005])

    rows = []
    for _ in range(n):
        row = {'Age':np.random.randint(7, 77), 'ZipCode':np.random.choice(zipcodes), 'Disease':np.random.choice(diseases)}
        while row in rows:
            row = {'Age':np.random.randint(7, 77), 'ZipCode':np.random.choice(zipcodes), 'Disease':np.random.choice(diseases)}
        rows.append(row)
        
        
    dataset = pd.DataFrame(rows)
    dataset.sort_values(by = ['Age', 'ZipCode'], inplace=True)

    return dataset, ['Age', 'ZipCode'], 'Disease'

syn_df, qids, sensitive_data = generate_dataset()
dtypes = {
    'Age': int,
    'ZipCode': int,
    'Disease': str,
}

print(syn_df.head(5))
print()
print(syn_df.dtypes)
print('\n\n')

adult_df = pd.read_csv('adult.csv')
dtypes = {
    'age': int,
    'workclass': str,
    'education': str,
    'marital.status': str,
    'occupation': str,
    'race': str,
    'sex': str,
    'native.country': str,
    'income': str
}

print(adult_df.head(5))
print()
print(adult_df.dtypes)

     Age  ZipCode      Disease
181    7    35000       Dengue
20     7    35200    Cataracte
104    7    35200       Grippe
13     7    37000  Appendicite
82     9    35000       Dengue

Age         int64
ZipCode     int64
Disease    object
dtype: object



   age workclass     education marital.status         occupation   race  \
0   90         ?       HS-grad        Widowed                  ?  White   
1   82   Private       HS-grad        Widowed    Exec-managerial  White   
2   66         ?  Some-college        Widowed                  ?  Black   
3   54   Private       7th-8th       Divorced  Machine-op-inspct  White   
4   41   Private  Some-college      Separated     Prof-specialty  White   

      sex native.country income  
0  Female  United-States  <=50K  
1  Female  United-States  <=50K  
2  Female  United-States  <=50K  
3  Female  United-States  <=50K  
4  Female  United-States  <=50K  

age                int64
workclass         object
education         object
marital.sta

### Exercise 1

In [10]:
def order_qids(data: pd.DataFrame, qids: list) -> list:
    '''
        'order_qids': Function to order the qids by the number of unique values (descending).

        @param data: Dataset to order the qids.
        @param qids: List with the name of qids columns.

        @return qids_sorted: List with the name of qids columns sorted.
    '''
    # Computing the quantity of unique values per column.
    unique_counts = data[qids].apply(lambda x: len(x.unique()))

    # Sorting qids by unique values.
    qids_sorted = unique_counts.sort_values(ascending=False).index

    return qids_sorted


def anonymize(data: pd.DataFrame, qids: list):
    '''
        'anonymize': Function to anonymize the data.

        @param data: Dataset to anonymize.
        @param qids: List with the name of qids columns.

        @return data: Data anonymized.
    '''
    data = data.copy()
    for qid in qids:
        # If the qid is a number the anonymization is a range.
        if data[qid].dtype == int:
            d_min = min(data[qid])
            d_max = max(data[qid])
            gen=f"[{d_min} - {d_max}]"
            data[qid] = gen
        # If the qid is a str the anonymization is a list with all
        # different values of the qid.
        else:
            data[qid] = str(data[qid].unique())

    return data


def mondrian_step(data: pd.DataFrame, qids: list, k: int):
    '''
        'mondrian_step': Function to do a step (split) of the dataset.

        @param data: Dataset to split.
        @param qids: List with the name of qids columns.
        @param k: k-anonymity parameter.

        @return [lhs, rha]: Split of the original dataset.
    '''
    # Check if there is a valid partition.
    # At least we need 2k items to do two partitions of k items.
    if len(data) < 2*k:
        return [data] 
    
    # QID with more different values.
    dim = order_qids(data=data, qids=qids)[0]
    
    data = data.sort_values(by=dim)
    
    # Computing the median.
    m = len(data)//2

    # Split.
    lhs = data[:m]
    rhs = data[m:]    

    return [lhs, rhs]


def mondrian(data: pd.DataFrame, qids: list, k: int):
    ''' 
        'mondrian' function to do the mondrian algorithm.

        @param data: Dataset to split.
        @param qids: List with the name of qids columns.
        @param k: k-anonymity parameter.

        @return anonymized_data: Result of the mondrian algorithm.
    '''
    partitions = [data]
    finish = False
    while not finish:
        parts = []
        for p in partitions:
            pts = mondrian_step(data=p, qids=qids, k=k)
            parts += pts
        partitions = parts
        for p in partitions:
            finish = True
            if len(p) > 2*k:
                finish = False
                break
    
    anonymized_partitions = []
    for p in partitions:
        anonymized_partitions.append(anonymize(data=p, qids=qids))
    anonymized_data = pd.concat(anonymized_partitions)

    return anonymized_data




### Exercise 2

* a) Toy dataset

In [11]:
k = 3
data = syn_df
qids = list(data.columns)
qids.remove('Disease')
anonymized_data = mondrian(data=data, qids=qids, k=k)
anonymized_data.to_csv('synthetic_anonymized_data_ex2.csv', index=False)
print(anonymized_data.head(10))

           Age          ZipCode      Disease
181    [7 - 9]  [35000 - 35200]       Dengue
82     [7 - 9]  [35000 - 35200]       Dengue
20     [7 - 9]  [35000 - 35200]    Cataracte
104    [7 - 9]  [35200 - 37000]       Grippe
44     [7 - 9]  [35200 - 37000]       Eczéma
13     [7 - 9]  [35200 - 37000]  Appendicite
50     [9 - 9]  [35200 - 40000]       Dengue
155    [9 - 9]  [35200 - 40000]    Urticaire
177    [9 - 9]  [35200 - 40000]        Otite
122  [11 - 14]  [35000 - 40000]    Varicelle


* b) Adult dataset

In [12]:
k = 2
data = adult_df
qids = list(data.columns)
qids.remove('income')
anonymized_data = mondrian(data=data, qids=qids, k=k)
anonymized_data.to_csv('adult_anonymized_data_ex4.csv', index=False)
print(anonymized_data.head(10))

             age                workclass  \
28306  [17 - 18]              ['Private']   
24996  [17 - 18]              ['Private']   
19034  [17 - 18]              ['Private']   
24435  [19 - 21]              ['Private']   
18980  [19 - 21]              ['Private']   
1004   [19 - 21]              ['Private']   
16708  [19 - 21]              ['Private']   
30565  [22 - 23]  ['Private' 'Local-gov']   
15799  [22 - 23]  ['Private' 'Local-gov']   
18     [22 - 23]  ['Private' 'Local-gov']   

                                  education  \
28306                       ['11th' '12th']   
24996                       ['11th' '12th']   
19034                       ['11th' '12th']   
24435                ['12th' '10th' '11th']   
18980                ['12th' '10th' '11th']   
1004                 ['12th' '10th' '11th']   
16708                ['12th' '10th' '11th']   
30565  ['7th-8th' 'Assoc-acdm' 'Assoc-voc']   
15799  ['7th-8th' 'Assoc-acdm' 'Assoc-voc']   
18     ['7th-8th' 'Assoc-acdm' 'As

### Exercise 3

For this exercise, it will be modifed the mondrian function of the exercise 1. The workflow are going to be basicly the same, however, two new functions are going to be added before the anonimization of data to fix data in order to achive *l-diversity* and *t-closeness*.

> We will assume that there will be only one column of sensitive  data.

In [13]:

def computeProbs(df: pd.DataFrame, sen: str):
    ''' 
        'computeProbs': Function to compute the probability of each possible in the sensitibity data.

        @param df: Dataset to compute the probabilities.
        @param sen: Sensivity column.

        @return probs_sen: Computed probabilities.
    '''
    probs_sen = {}
    for cat in df[sen].unique():
        probs_sen[cat] = len(df[df[sen] == cat])/len(df)

    return probs_sen

def checkValidPartition(p: pd.DataFrame, sen: str, probs_sen: list, k: int, l: int, t: float):
    ''' 
        'checkValidPartition': Function to check if a partition can be splited.

        @param p: Partition to check.
        @param sen: Sensitivity column.
        @param probs_sen: Probabilities of the values of the sensitivity column in the complete dataset.
        @param k: k-anonynity parameter.
        @param l: l-diversity paramater.
        @param t: t-closness parameter.

        @return bool: Indicates if the partition can be splited or not.
    '''
    # Check k-nonimity.
    if len(p) < 2*k:
        return False
    
    # Check l-diversity.
    l_p = len(p[sen].unique())
    if l_p < l:
        return False
    
    # Check t-clossness.
    probs_p = computeProbs(p, sen)
    t_p = 0
    # If sensitivity is categorical.
    if np.issubdtype(p[sen], np.object_):
        for k in probs_sen.keys():
            try:
                t_p += abs(probs_p[k] - probs_sen[k])
            except:
                t_p += abs(0 - probs_sen[k])
        t_p = t_p/2
    else:
        keys = probs_sen.keys()
        for i, k in enumerate(keys):
            for j in range(0,i+1):
                tp += abs(probs_p[keys[i]]-probs_sen[keys[i]])
        t_p = tp/(len(keys)-1)
    if t_p > t:
        return False
    
    return True

def mondrian_step(data: pd.DataFrame, qids: list, sen: str, probs_sen: list, k: int, l: int, t: float):
    '''
        'mondrian_step': Function to do a step (split) of the dataset.

        @param data: Dataset to split.
        @param qids: List with the name of qids columns.
        @param sen: Sensitivity column.
        @param probs_sen: Probabilities of the values of the sensitivity column in the complete dataset.
        @param k: k-anonynity parameter.
        @param l: l-diversity paramater.
        @param t: t-closness parameter.

        @return [lhs, rha]: Split of the original dataset.
    '''
    # Check if there is a valid partition.
    # At least we need 2k items to do two partitions of k items.
    if not checkValidPartition(data, sen, probs_sen, k, l, t):
        return [data] 
    
    # Widest dimension.
    dim = order_qids(data=data, qids=qids)[0]
    
    data = data.sort_values(by=dim)
    
    # Median.
    m = len(data)//2

    lhs = data[:m]
    rhs = data[m:]    

    return [lhs, rhs]             
        
def mondrian(data: pd.DataFrame, qids: list, sen: str, k: int, l: int, t: float):
    ''' 
        'mondrian' function to do the mondrian algorithm.

        @param data: Dataset to split.
        @param sen: Sensitivity column.
        @param k: k-anonynity parameter.
        @param l: l-diversity paramater.
        @param t: t-closness parameter.

        @return anonymized_data: Result of the mondrian algorithm.
    '''
    probs_sen = computeProbs(data, sen)
    partitions = [data]
    finish = False
    while not finish:
        parts = []
        for p in partitions:
            pts = mondrian_step(data=p, qids=qids, sen=sen, probs_sen=probs_sen, k=k, l=l, t=t)
            parts += pts
        partitions = parts
        for p in partitions:
            finish = True
            if checkValidPartition(p, sen, probs_sen, k, l, t):
                finish = False
                break

    
    anonymized_partitions = []
    for p in partitions:
        anonymized_partitions.append(anonymize(data=p, qids=qids))
    anonymized_data = pd.concat(anonymized_partitions)

    return anonymized_data

### Exercise 4

* a) Toy dataset

In [14]:
k = 3
l = 2
t = 0.3
data = syn_df
sen = 'Disease'
qids = list(data.columns)
qids.remove(sen)
anonymized_data = mondrian(data=data, qids=qids, sen=sen, k=k, l=l, t=t)
anonymized_data.to_csv('synthetic_anonymized_data_ex4.csv', index=False)
print(anonymized_data.head(10))

          Age          ZipCode      Disease
181  [7 - 14]  [35000 - 40000]       Dengue
82   [7 - 14]  [35000 - 40000]       Dengue
96   [7 - 14]  [35000 - 40000]       Dengue
20   [7 - 14]  [35000 - 40000]    Cataracte
104  [7 - 14]  [35000 - 40000]       Grippe
44   [7 - 14]  [35000 - 40000]       Eczéma
50   [7 - 14]  [35000 - 40000]       Dengue
155  [7 - 14]  [35000 - 40000]    Urticaire
30   [7 - 14]  [35000 - 40000]   Hépatite C
13   [7 - 14]  [35000 - 40000]  Appendicite


* b) Adult dataset

In [15]:
k = 2
l = 2
t = 0.4
data = adult_df
sen = 'income'
qids = list(data.columns)
qids.remove(sen)
anonymized_data = mondrian(data=data, qids=qids, sen=sen, k=k, l=l, t=t)
anonymized_data.to_csv('adult_anonymized_data_ex4.csv', index=False)
print(anonymized_data.head(10))

             age                workclass                   education  \
18980  [17 - 21]              ['Private']      ['10th' '11th' '12th']   
28306  [17 - 21]              ['Private']      ['10th' '11th' '12th']   
24996  [17 - 21]              ['Private']      ['10th' '11th' '12th']   
1004   [17 - 21]              ['Private']      ['10th' '11th' '12th']   
19034  [17 - 21]              ['Private']      ['10th' '11th' '12th']   
24435  [17 - 21]              ['Private']      ['10th' '11th' '12th']   
16708  [17 - 21]              ['Private']      ['10th' '11th' '12th']   
30565  [22 - 23]              ['Private']    ['7th-8th' 'Assoc-acdm']   
15799  [22 - 23]              ['Private']    ['7th-8th' 'Assoc-acdm']   
18     [22 - 22]  ['Private' 'Local-gov']  ['Assoc-acdm' 'Assoc-voc']   

                                  marital.status  \
18980  ['Never-married' 'Married-spouse-absent']   
28306  ['Never-married' 'Married-spouse-absent']   
24996  ['Never-married' 'Married-spouse-