## easy dataset, iris (from sklearn)

In [65]:
import numpy as np
import pandas as pd
from sklearn import datasets

# Anonymize the Iris dataset: generic column names, letter-coded species,
# rescaled measurements, shuffled rows, and a random 10% of records removed.

# Seeded generator so that re-running this cell reproduces the same shuffle
# and the same dropped records (the global NumPy RNG was previously unseeded).
rng = np.random.default_rng(42)

# Load the Iris dataset
iris = datasets.load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df["species"] = pd.Categorical.from_codes(iris.target, iris.target_names)

# Rename columns (the four measurements, then the species label)
iris_df.columns = ["att_1", "att_2", "att_3", "att_4", "att_5"]

# Encode species
species_encoding = {"setosa": "x", "versicolor": "y", "virginica": "z"}
iris_df["att_5"] = iris_df["att_5"].map(species_encoding)

scalling_factor = 0.9

# Apply scaling to every column except the last one (the encoded species)
measurement_columns = iris_df.columns[:-1]
iris_df[measurement_columns] = iris_df[measurement_columns] * scalling_factor

iris_df = iris_df.round(2)

# Shuffle the dataset
iris_df = iris_df.sample(frac=1, random_state=rng).reset_index(drop=True)

# Drop 10% of the records
drop_indices = rng.choice(iris_df.index, int(len(iris_df) * 0.1), replace=False)
iris_df = iris_df.drop(drop_indices).reset_index(drop=True)

iris_df.to_csv("datasets/anonymized/iris.csv", index=False)

iris_df

Unnamed: 0,att_1,att_2,att_3,att_4,att_5
0,5.49,2.70,4.41,1.62,z
1,6.21,2.88,5.13,2.07,z
2,7.11,3.42,5.76,1.80,z
3,5.22,2.52,4.59,2.16,z
4,4.59,2.25,2.70,0.99,y
...,...,...,...,...,...
130,5.85,2.70,5.22,1.98,z
131,4.14,2.88,1.26,0.18,x
132,6.03,2.70,4.50,1.53,y
133,5.67,2.52,4.59,1.35,z


## intermediate dataset, adult (https://www.kaggle.com/datasets/wenruliu/adult-income-dataset)

In [66]:
def generate_letter_codes(n):
    """Return the first ``n`` lowercase letter codes used to encode categories.

    Codes are produced in the order ``a, b, ..., z, aa, ab, ..., zz``, so at
    most 26 + 26 * 26 = 702 distinct codes are available.

    Parameters
    ----------
    n : int
        Number of codes required (typically the number of unique values in a
        categorical column).

    Returns
    -------
    list of str
        Exactly ``n`` codes; an empty list when ``n`` is 0.

    Raises
    ------
    ValueError
        If ``n`` is negative, or larger than the 702 codes that single and
        double letters can provide.
    """
    if n < 0:
        raise ValueError("Number of categories must be non-negative.")

    letters = [chr(i) for i in range(ord("a"), ord("z") + 1)]
    max_codes = len(letters) + len(letters) ** 2
    if n > max_codes:
        raise ValueError("Too many categories to encode with double letters.")

    # Single letters first, then every two-letter combination in order.
    # Slicing also covers n == 0, which previously fell through both loops
    # and was reported as "too many categories".
    codes = letters + [first + second for first in letters for second in letters]
    return codes[:n]

In [67]:
# Anonymize the Adult income dataset: drop records containing '?', replace
# categorical values with letter codes, use generic column names, shuffle the
# rows, and remove a random 10% of the records.

# Seeded generator so that re-running this cell reproduces the same shuffle
# and the same dropped records (the global NumPy RNG was previously unseeded).
rng = np.random.default_rng(42)

adult_df = pd.read_csv("datasets/adult.csv")

# Drop records with '?' in any categorical (object-dtype) column.
# The explicit .copy() makes the filtered frame independent of the one that
# was read, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
categorical_columns = adult_df.select_dtypes(include=["object"]).columns
has_unknown = (adult_df[categorical_columns] == "?").any(axis=1)
adult_df = adult_df[~has_unknown].copy()

# Replace unique values in each categorical column with letter codes.
# Codes are assigned in order of first appearance in the column.
for column in categorical_columns:
    unique_values = adult_df[column].unique()
    letter_codes = generate_letter_codes(len(unique_values))
    replacement_values = dict(zip(unique_values, letter_codes))
    adult_df[column] = adult_df[column].map(replacement_values)

# Renaming columns to att_1 ... att_N in their existing order
new_column_names = {col: f"att_{i}" for i, col in enumerate(adult_df.columns, 1)}
adult_df = adult_df.rename(columns=new_column_names)

# Shuffling the dataset
adult_df = adult_df.sample(frac=1, random_state=rng).reset_index(drop=True)

# Drop 10% of the records
drop_indices = rng.choice(adult_df.index, int(len(adult_df) * 0.1), replace=False)
adult_df = adult_df.drop(drop_indices).reset_index(drop=True)

# Save the transformed dataset to a CSV file
adult_df.to_csv("datasets/anonymized/adults.csv", index=False)

adult_df

Unnamed: 0,att_1,att_2,att_3,att_4,att_5,att_6,att_7,att_8,att_9,att_10,att_11,att_12,att_13,att_14,att_15
0,27,b,289039,d,10,a,c,d,e,b,0,0,40,a,a
1,30,a,393965,c,12,e,g,c,b,b,0,0,25,a,a
2,42,c,326083,k,11,b,e,b,b,a,0,0,40,a,a
3,34,a,178841,h,13,b,h,e,b,b,0,0,40,a,b
4,60,a,317083,h,13,b,h,b,b,a,0,0,45,a,b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40695,32,a,181665,b,9,a,g,f,b,b,0,0,50,a,a
40696,31,a,19302,h,13,b,e,b,b,a,0,0,40,a,b
40697,62,a,207443,a,7,a,a,f,b,a,0,0,50,a,a
40698,26,a,171114,h,13,a,h,a,b,b,0,0,40,a,a


## hard dataset, student (https://archive.ics.uci.edu/dataset/320/student+performance)

In [92]:
# Anonymize the student performance dataset: generic column names, letter-coded
# categorical values, shuffled rows, a random 10% of records removed, and
# random missing values injected into every column.

# One seeded generator drives every random step so that re-running this cell
# reproduces the same output. It is passed as an object (not as an integer
# seed) so that each column below gets a different set of missing rows.
rng = np.random.default_rng(42)

student_df = pd.read_csv('datasets/student.csv', delimiter=';')

# Renaming columns to att_1 ... att_N in their existing order
new_column_names = {col: f"att_{i}" for i, col in enumerate(student_df.columns, 1)}
student_df = student_df.rename(columns=new_column_names)

# Replace unique values in each categorical column with letter codes.
# Codes are assigned in order of first appearance in the column.
for column in student_df.select_dtypes(include=["object"]).columns:
    unique_values = student_df[column].unique()
    letter_codes = generate_letter_codes(len(unique_values))
    replacement_values = dict(zip(unique_values, letter_codes))
    student_df[column] = student_df[column].map(replacement_values)

# Shuffling the dataset
student_df = student_df.sample(frac=1, random_state=rng).reset_index(drop=True)

# Drop 10% of the records
drop_indices = rng.choice(student_df.index, int(len(student_df) * 0.1), replace=False)
student_df = student_df.drop(drop_indices).reset_index(drop=True)

# Integer columns cannot hold NaN. Cast them to float64 explicitly instead of
# relying on an implicit upcast during assignment, which newer pandas versions
# deprecate. The resulting dtypes are the same as before (float64).
integer_columns = student_df.select_dtypes(include=[np.integer]).columns
student_df = student_df.astype({col: "float64" for col in integer_columns})

for col in student_df.columns:
    # Random missing rate between 1% and 10% for each column
    missing_rate = rng.uniform(0.01, 0.1)
    missing_rows = student_df.sample(frac=missing_rate, random_state=rng).index
    student_df.loc[missing_rows, col] = np.nan

student_df.to_csv("datasets/anonymized/student.csv", index=False)

# Display the DataFrame with missing values
student_df.head()

Unnamed: 0,att_1,att_2,att_3,att_4,att_5,att_6,att_7,att_8,att_9,att_10,...,att_24,att_25,att_26,att_27,att_28,att_29,att_30,att_31,att_32,att_33
0,a,a,16.0,a,b,b,2.0,2.0,d,c,...,4.0,,3.0,2.0,3.0,4.0,2.0,12.0,13.0,13.0
1,a,b,17.0,a,b,b,4.0,4.0,c,a,...,4.0,1.0,1.0,2.0,2.0,5.0,0.0,11.0,11.0,10.0
2,a,a,15.0,a,a,b,3.0,2.0,d,b,...,4.0,3.0,5.0,1.0,1.0,2.0,26.0,7.0,6.0,6.0
3,a,a,16.0,a,a,b,3.0,3.0,c,b,...,4.0,3.0,2.0,1.0,2.0,5.0,4.0,6.0,10.0,10.0
4,a,b,16.0,a,a,b,4.0,4.0,d,a,...,4.0,4.0,3.0,1.0,1.0,4.0,0.0,16.0,17.0,17.0


In [91]:
# Summarize each column's non-null count and dtype to check the injected missing values.
student_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356 entries, 0 to 355
Data columns (total 33 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   att_1   343 non-null    object 
 1   att_2   325 non-null    object 
 2   att_3   320 non-null    float64
 3   att_4   348 non-null    object 
 4   att_5   321 non-null    object 
 5   att_6   321 non-null    object 
 6   att_7   346 non-null    float64
 7   att_8   340 non-null    float64
 8   att_9   321 non-null    object 
 9   att_10  351 non-null    object 
 10  att_11  348 non-null    object 
 11  att_12  321 non-null    object 
 12  att_13  337 non-null    float64
 13  att_14  330 non-null    float64
 14  att_15  325 non-null    float64
 15  att_16  321 non-null    object 
 16  att_17  345 non-null    object 
 17  att_18  349 non-null    object 
 18  att_19  321 non-null    object 
 19  att_20  344 non-null    object 
 20  att_21  340 non-null    object 
 21  att_22  352 non-null    object 
 22  at