In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier

from random_sampling import random_deletion
import tqdm

In [2]:
SA_NAMES = ['hypertension', 'heart_disease', 'smoking_history', 'HbA1c_level', 'blood_glucose_level', 'diabetes']
QID_NAMES = ['gender', 'age', 'bmi']

In [3]:
def interval_transform(x):
    try:
        return float(x)
    except ValueError:
        if x == '*': return 0
        interval_vals = x.split('-')
        return (float(interval_vals[0]) + float(interval_vals[1])) * 0.5
        
def get_metrics(y_test, y_pred):
    f1 = f1_score(y_test, y_pred, average="weighted")*100
    prec = precision_score(y_test, y_pred, average="weighted")*100
    rec = recall_score(y_test, y_pred, average="weighted")*100
    acc = accuracy_score(y_test, y_pred)*100
    return acc, prec, rec, f1

In [4]:
def eval_classifiers(df, test_size, seed):
    # Label Encode
    gender_encoder = [['Female','Male','*','Other'], np.arange(4)]
    smoking_encoder = [['never','No Info','current','not current','ever','former'], np.arange(6)]
    df['gender'] = df['gender'].replace(gender_encoder[0], gender_encoder[1])
    df['smoking_history'] = df['smoking_history'].replace(smoking_encoder[0], smoking_encoder[1])
    numerical_encode = ['age','bmi']
    for attr in numerical_encode:
        df[attr] = df[attr].apply(interval_transform)

    # Split Train/Test
    x = df.drop(columns=['diabetes'])
    y = df['diabetes']
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=test_size, random_state=seed)
    res_df = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1 Score'])

    # Naive Bayes
    gnb = GaussianNB()
    y_pred = gnb.fit(x_train, y_train).predict(x_test)
    res_df.loc['Naive Bayes'] = get_metrics(y_test, y_pred)
    
    # Decision Tree
    clftree = tree.DecisionTreeClassifier()
    y_pred = clftree.fit(x_train,y_train).predict(x_test)
    res_df.loc['Decision Tree'] = get_metrics(y_test, y_pred)
    
    # KNN    
    knn = KNeighborsClassifier(n_neighbors = 5)
    y_pred = knn.fit(x_train,y_train).predict(x_test)
    res_df.loc['KNN'] = get_metrics(y_test, y_pred)
    
    # SVM
    #rbf kernel
    rbfsvm = svm.SVC(kernel = "rbf")
    y_pred = rbfsvm.fit(x_train,y_train).predict(x_test)
    res_df.loc['SVM'] = get_metrics(y_test, y_pred)
    
    # RF
    rftree=RandomForestClassifier(n_estimators=100)
    y_pred = rftree.fit(x_train,y_train).predict(x_test)
    res_df.loc['Random Forest'] = get_metrics(y_test, y_pred)
    
    # LR
    logReg = LogisticRegression(max_iter=25)
    y_pred = logReg.fit(x_train,y_train).predict(x_test)
    res_df.loc['Logistic Regression'] = get_metrics(y_test, y_pred)
    
    # AdaBoost
    adaboost= AdaBoostClassifier()
    y_pred = adaboost.fit(x_train,y_train).predict(x_test)
    res_df.loc['AdaBoost'] = get_metrics(y_test, y_pred)
    
    # Bagging
    bg = BaggingClassifier()
    y_pred = bg.fit(x_train,y_train).predict(x_test)
    res_df.loc['Bagging'] = get_metrics(y_test, y_pred)

    return res_df

In [5]:
#load_path = os.path.join('kanonymity', 'results_part1_260923', 'arx', '250', 'diabetes-anonymized.csv')
#load_path = os.path.join('kanonymity','datasets','diabetes','preprocessed_diabetes.csv')
k = 50
p = 0
# load_path = os.path.join('all_results', 'raw_data', f'k{k}', f'{p}', f'diabetes-anonymized.csv')
# save_path = os.path.join('all_results', 'raw_data', 'classifiers_raw', f'k{k}', f'{p}',f'diabetes-anonymized_classifiers.csv')
load_path1 = os.path.join('datasets', 'diabetes_anonymized_arx250.csv')
load_path2 = os.path.join('datasets', 'preprocessed_diabetes.csv')
dfo = pd.read_csv(load_path1)
dfo = dfo.drop(columns='RID')
df = pd.read_csv(load_path2)
df = df.drop(columns='RID')

test_size = 0.25 # Split train/test 75/25
seed = 35

In [6]:
accuracy = pd.DataFrame()
recall = pd.DataFrame()
precision = pd.DataFrame()
f1 = pd.DataFrame()
metrics = {
    'Accuracy': pd.DataFrame(),
    'Recall': pd.DataFrame(),
    'Precision': pd.DataFrame(),
    'F1 Score': pd.DataFrame(),
}

In [None]:
results = []
dfs = {
    'Original Dataset': eval_classifiers(dfo, test_size, seed),
    'Only K-Anonymity': eval_classifiers(df, test_size, seed)
}
# Sampling
p = 0.5
qids = ['gender', 'age', 'bmi']
print("--Sampling the dataset 10 times--")
for i in tqdm.tqdm(range(10)):
    sampled_df = random_deletion(df, p, qids)
    dfs[f'Random Deletion {i}'] = eval_classifiers(sampled_df, test_size, seed)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


--Sampling the dataset 10 times--


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
 20%|████████████████▍                                                                 | 2/10 [06:10<24:40, 185.06s/it]

In [None]:
for k, v in dfs.items():
    for n, m in metrics.items():
        m.insert(0, k, v[n])
m['Random Deletion (average)'] = m.iloc[:, 2:].mean(axis=1)