# Lab 6

### Pedro Otero García & Alexandre Sousa Cajide

In [1]:
import numpy as np
import pandas as pd


def generate_dataset(n: int=200):
    '''
    Generates a toy dataset containing n distinct samples.

    - n: number of samples to generate

    Returns:
    - A tuple containing:
        * Dataset as a Pandas Dataframe
        * List of quasi-identifiers
        * Sensitive column name
    '''
    diseases = np.array(["Angine", "Appendicite", "Chlamydia", "Cataracte", "Dengue", 
                         "Eczéma", "Grippe", "Hépatite B", "Hépatite C", "Rhino-pharyngite", 
                         "Otite", "Rougeole", "Scarlatine", "Urticaire", "Varicelle", "Zona"])
    zipcodes = np.array([35000, 35200, 37000, 40000, 40500, 50000, 52000, 60000, 62000, 68000, 
                         75000, 75001, 75002, 75005])

    rows = []
    for _ in range(n):
        row = {'Age':np.random.randint(7, 77), 'ZipCode':np.random.choice(zipcodes), 'Disease':np.random.choice(diseases)}
        while row in rows:
            row = {'Age':np.random.randint(7, 77), 'ZipCode':np.random.choice(zipcodes), 'Disease':np.random.choice(diseases)}
        rows.append(row)
        
        
    dataset = pd.DataFrame(rows)
    dataset.sort_values(by = ['Age', 'ZipCode'], inplace=True)

    return dataset, ['Age', 'ZipCode'], 'Disease'

syn_df, qids, sensitive_data = generate_dataset()
dtypes = {
    'Age': int,
    'ZipCode': int,
    'Disease': str,
}

print(syn_df.head(5))
print(syn_df.dtypes)
print('\n\n')

adult_df = pd.read_csv('adult.csv')
dtypes = {
    'age': int,
    'workclass': str,
    'education': str,
    'marital.status': str,
    'occupation': str,
    'race': str,
    'sex': str,
    'native.country': str,
    'income': str
}

print(adult_df.head(5))
print(adult_df.dtypes)

     Age  ZipCode    Disease
66     7    40500     Angine
67     7    40500      Otite
65     8    75000  Urticaire
151    8    75001  Varicelle
10     8    75002     Eczéma
Age         int64
ZipCode     int64
Disease    object
dtype: object



   age workclass     education marital.status         occupation   race  \
0   90         ?       HS-grad        Widowed                  ?  White   
1   82   Private       HS-grad        Widowed    Exec-managerial  White   
2   66         ?  Some-college        Widowed                  ?  Black   
3   54   Private       7th-8th       Divorced  Machine-op-inspct  White   
4   41   Private  Some-college      Separated     Prof-specialty  White   

      sex native.country income  
0  Female  United-States  <=50K  
1  Female  United-States  <=50K  
2  Female  United-States  <=50K  
3  Female  United-States  <=50K  
4  Female  United-States  <=50K  
age                int64
workclass         object
education         object
marital.status    object


### Exercise 1

In [2]:
def order_qids(data: pd.DataFrame, qids: list) -> list:
    '''
        order_qids: Order the qids by the number of unique values (descending).
    '''
    # Computing the quantity of unique values per column.
    unique_counts = data[qids].apply(lambda x: len(x.unique()))

    # Sorting qids by unique values
    qids_sorted = unique_counts.sort_values(ascending=False).index

    return qids_sorted


def anonymize(data: pd.DataFrame, qids: list):
    data = data.copy()
    for qid in qids:
        if data[qid].dtype == int:
            d_min = min(data[qid])
            d_max = max(data[qid])
            gen=f"[{d_min} - {d_max}]"
            data[qid] = gen
        else:
            data[qid] = str(data[qid].unique())
            # data[qid] = '*'

    return data


def mondrian_step(data: pd.DataFrame, qids: list, k: int):
    # Check if there is a valid partition.
    # At least we need 2k items to do two partitions of k items.
    if len(data) < 2*k:
        return [data] 
    
    # Widest dimension.
    dim = order_qids(data=data, qids=qids)[0]
    
    data = data.sort_values(by=dim)
    
    # Median.
    m = len(data)//2

    lhs = data[:m]
    rhs = data[m:]    

    return [lhs, rhs]


def mondrian(data: pd.DataFrame, qids: list, k: int):
    partitions = [data]
    finish = False
    while not finish:
        parts = []
        for p in partitions:
            pts = mondrian_step(data=p, qids=qids, k=k)
            #print(pts)
            parts += pts
        partitions = parts
        for p in partitions:
            finish = True
            if len(p) > 2*k:
                finish = False
                break
            # if len(p) >= 2*k:
            #     finish = False
            #     continue
            # else:
            #     finish = True
    
    anonymized_partitions = []
    for p in partitions:
        anonymized_partitions.append(anonymize(data=p, qids=qids))
    anonymized_data = pd.concat(anonymized_partitions)

    return anonymized_data




### Exercise 2

* a) Toy dataset

In [3]:
k = 3
data = syn_df
qids = list(data.columns)
qids.remove('Disease')
anonymized_data = mondrian(data=data, qids=qids, k=k)
anonymized_data.to_csv('generated.csv', index=False)
print(anonymized_data.head(10))

          Age          ZipCode           Disease
23   [7 - 10]  [35200 - 40500]            Grippe
95   [7 - 10]  [35200 - 40500]  Rhino-pharyngite
66   [7 - 10]  [35200 - 40500]            Angine
67   [7 - 10]  [40500 - 60000]             Otite
84   [7 - 10]  [40500 - 60000]         Chlamydia
135  [7 - 10]  [40500 - 60000]              Zona
139  [8 - 10]  [62000 - 75001]            Dengue
65   [8 - 10]  [62000 - 75001]         Urticaire
151  [8 - 10]  [62000 - 75001]         Varicelle
10    [8 - 9]  [75002 - 75005]            Eczéma


* b) Adult dataset

In [4]:
k = 2
data = adult_df
qids = list(data.columns)
qids.remove('income')
anonymized_data = mondrian(data=data, qids=qids, k=k)
anonymized_data.to_csv('generated_adult.csv', index=False)
print(anonymized_data.head(10))

             age                workclass  \
28306  [17 - 18]              ['Private']   
24996  [17 - 18]              ['Private']   
19034  [17 - 18]              ['Private']   
24435  [19 - 21]              ['Private']   
18980  [19 - 21]              ['Private']   
1004   [19 - 21]              ['Private']   
16708  [19 - 21]              ['Private']   
30565  [22 - 23]  ['Private' 'Local-gov']   
15799  [22 - 23]  ['Private' 'Local-gov']   
18     [22 - 23]  ['Private' 'Local-gov']   

                                  education  \
28306                       ['11th' '12th']   
24996                       ['11th' '12th']   
19034                       ['11th' '12th']   
24435                ['12th' '10th' '11th']   
18980                ['12th' '10th' '11th']   
1004                 ['12th' '10th' '11th']   
16708                ['12th' '10th' '11th']   
30565  ['7th-8th' 'Assoc-acdm' 'Assoc-voc']   
15799  ['7th-8th' 'Assoc-acdm' 'Assoc-voc']   
18     ['7th-8th' 'Assoc-acdm' 'As

### Exercise 3

For this exercise, it will be modifed the mondrian function of the exercise 1. The workflow are going to be basicly the same, however, two new functions are going to be added before the anonimization of data to fix data in order to achive *l-diversity* and *t-closeness*.

In [None]:
# def checkLDiversity(p: pd.DataFrame, sens: list, l: int):
#     return len(p[sens].drop_duplicates()) < l

# def achiveLDiversity(partitions: list, sens: list, l: int):
#     for p in partitions:
#         if not checkLDiversity(p, sens, l):
#             for p in partitions:
                

        
def mondrian(data: pd.DataFrame, qids: list, k: int, l: int):
    partitions = [data]
    finish = False
    while not finish:
        parts = []
        for p in partitions:
            pts = mondrian_step(data=p, qids=qids, k=k)
            parts += pts
        partitions = parts
        for p in partitions:
            finish = True
            if len(p) > 2*k:
                finish = False
                break

    
    anonymized_partitions = []
    for p in partitions:
        anonymized_partitions.append(anonymize(data=p, qids=qids))
    anonymized_data = pd.concat(anonymized_partitions)

    return anonymized_data

In [49]:
x = [1/5]*5
y = [1/5,1/5,1/5,1/5,1/5]
y = [0,2/5,1/5,1/5,1/5]
y = [1/10,4/10,2/10,2/10,1/10]

y = [1/5,4/5,0,0,0]
y = [1/5,1/5,1/5,2/5,0]

m = len(x)

tcloss = 0
for i in range(0,m):
    suma = 0
    for j in range(0,i+1):
        suma += x[j] - y[j]
    print(round(suma,2))
    tcloss += abs(round(suma,2))

tcloss *= 1/(m-1)
tcloss

0.0
0.0
0.0
-0.2
0.0


0.05

In [32]:
for i in range(0,0):
    print(1233)

In [15]:
import pandas as pd

# Example DataFrame
data = {
    'A': [1, 2, 2, 3, 4],
    'B': [1, 2, 2, 3, 5],
    'C': [10, 20, 20, 30, 40],
    'D': [11, 22, 22, 33, 44],
    'E': [12, 23, 23, 34, 45]
}
df = pd.DataFrame(data)

# Counting unique combinations of rows in columns A and B
count_unique_AB = df.duplicated(subset=['A', 'B']).sum()

print(f"Number of different rows between columns A and B: {count_unique_AB}")


Number of different rows between columns A and B: 1


In [27]:
print(df[['A','B']])
len(df[['A','B']].drop_duplicates())

   A  B
0  1  1
1  2  2
2  2  2
3  3  3
4  4  5


4