In [35]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [36]:
# Column groups used throughout the notebook.
# NOTE(review): "SA" / "QID" presumably stand for sensitive attributes and
# quasi-identifiers (k-anonymity terminology) - confirm.
# The last SA entry, 'diabetes', is the column used as the prediction target.
SA_NAMES = [
    'hypertension',
    'heart_disease',
    'smoking_history',
    'HbA1c_level',
    'blood_glucose_level',
    'diabetes',
]
QID_NAMES = [
    'gender',
    'age',
    'bmi',
]

# Load Dataset

In [37]:
# Legacy input locations, kept for reference only:
#load_path = os.path.join('kanonymity', 'results_part1_260923', 'arx', '250', 'diabetes-anonymized.csv')
#load_path = os.path.join('kanonymity','datasets','diabetes','preprocessed_diabetes.csv')

# k and p select the anonymized run directory (all_results/raw_data/k{k}/{p}/).
# They are only used when USE_ORIGINAL_DATASET is False.
k = 50
p = 0

# True  -> evaluate classifiers on the original (non-anonymized) dataset.
# False -> evaluate on the anonymized dataset identified by k and p.
USE_ORIGINAL_DATASET = True

raw_data_dir = os.path.join('all_results', 'raw_data')
if USE_ORIGINAL_DATASET:
    load_path = os.path.join(raw_data_dir, 'original_dataset', 'preprocessed_diabetes.csv')
    save_path = os.path.join(raw_data_dir, 'classifiers_raw', 'original_dataset',
                             'preprocessed_diabetes_classifiers.csv')
else:
    load_path = os.path.join(raw_data_dir, f'k{k}', f'{p}', 'diabetes-anonymized.csv')
    save_path = os.path.join(raw_data_dir, 'classifiers_raw', f'k{k}', f'{p}',
                             'diabetes-anonymized_classifiers.csv')

df = pd.read_csv(load_path)
# 'RID' is removed so it is not used as a model feature.
df = df.drop(columns='RID')
# Variant that also drops every SA column except the target:
#df = df.drop(columns=SA_NAMES[:-1] + ['RID'])


# Label Encode Dataset

In [4]:
# to_encode = ['gender', 'age', 'bmi']
# #to_encode = ['gender']
# for attr in to_encode:
#     #uniques = df[attr].unique()
#     #df[attr] = df[attr].replace(uniques, np.arange(len(uniques)))
    


In [5]:
def interval_transform(x):
    """Convert a plain or generalized numeric cell into a single float.

    Parameters
    ----------
    x : int, float or str
        A number, a numeric string, the marker ``'*'``, or a range written
        as ``'low-high'``.

    Returns
    -------
    float
        ``float(x)`` when ``x`` is directly convertible, ``0.0`` for ``'*'``,
        and the midpoint ``(low + high) / 2`` for a ``'low-high'`` range.

    Raises
    ------
    ValueError
        If ``x`` is a string that is not a number, ``'*'`` or a well-formed
        ``'low-high'`` range.
    """
    try:
        return float(x)
    except ValueError:
        pass

    text = x.strip()
    if text == '*':
        return 0.0

    # Search for the separator from index 1 so that a leading minus sign on
    # the lower bound (e.g. '-10-20') is not mistaken for the separator.
    sep = text.find('-', 1)
    if sep == -1:
        raise ValueError(f"cannot interpret {x!r} as a number, '*' or 'low-high' interval")
    try:
        low = float(text[:sep])
        high = float(text[sep + 1:])
    except ValueError:
        raise ValueError(
            f"cannot interpret {x!r} as a number, '*' or 'low-high' interval"
        ) from None
    return (low + high) * 0.5
        
        

In [6]:
# Each encoder is a [labels, codes] pair: the label at position i is replaced
# by the integer code i in the corresponding column.
gender_labels = ['Female', 'Male', '*', 'Other']
smoking_labels = ['never', 'No Info', 'current', 'not current', 'ever', 'former']
gender_encoder = [gender_labels, np.arange(len(gender_labels))]
smoking_encoder = [smoking_labels, np.arange(len(smoking_labels))]

for column, (labels, codes) in (
    ('gender', gender_encoder),
    ('smoking_history', smoking_encoder),
):
    df[column] = df[column].replace(labels, codes)

In [7]:
# Columns whose cells may be plain numbers, '*' or 'low-high' ranges; each
# cell is converted to a single number by interval_transform.
numerical_encode = ['age', 'bmi']
df = df.assign(
    **{attr: df[attr].map(interval_transform) for attr in numerical_encode}
)

In [8]:
# Class distribution of the target column (number of rows per label).
df['diabetes'].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

# Split Train/Test

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Split configuration: `seed` is passed as random_state to train_test_split,
# `test_size` is the fraction of rows held out for evaluation.
seed, test_size = 35, 0.25

In [11]:
# Separate the target column from the features, then hold out a test set.
target_column = 'diabetes'
y = df[target_column]
x = df.drop(columns=[target_column])
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=test_size,
    random_state=seed,
)

In [12]:
# Empty results table: one column per metric; each classifier later adds its
# own row via res_df.loc[<name>] = get_metrics(...).
metric_names = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
res_df = pd.DataFrame(columns=list(metric_names))

In [13]:
def get_metrics(y_test, y_pred, average="weighted"):
    """Score predictions and return four metrics as percentages.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.
    average : str, default "weighted"
        Averaging mode forwarded to ``precision_score``, ``recall_score`` and
        ``f1_score``. With the default "weighted" mode, recall is
        mathematically equal to accuracy (per-class recall weighted by class
        support sums to the overall hit rate), so the Recall column repeats
        Accuracy. Pass e.g. "macro" or "binary" for scores that reflect the
        minority class.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, f1)``, each multiplied by 100.
    """
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred, average=average) * 100
    rec = recall_score(y_test, y_pred, average=average) * 100
    f1 = f1_score(y_test, y_pred, average=average) * 100
    return acc, prec, rec, f1

In [14]:
# List the distinct class labels present in the held-out targets.
np.unique(y_test)

array([0, 1], dtype=int64)

# Naive Bayes

In [15]:
from sklearn.naive_bayes import GaussianNB

In [16]:
# Gaussian Naive Bayes: fit on the training split, score on the test split.
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
res_df.loc['Naive Bayes'] = get_metrics(y_test, y_pred)

In [17]:
# Show the results table after the first classifier has been scored.
display(res_df)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Naive Bayes,89.932,92.185213,89.932,90.83117


# Decision Tree


In [18]:
from sklearn import tree

In [19]:
# Decision tree: fit on the training split, score on the test split.
# random_state is fixed so the fitted tree, and therefore the recorded
# scores, are the same on every run.
clftree = tree.DecisionTreeClassifier(random_state=seed)
y_pred = clftree.fit(x_train, y_train).predict(x_test)
res_df.loc['Decision Tree'] = get_metrics(y_test, y_pred)

In [20]:
# Show the scores collected so far.
res_df

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Naive Bayes,89.932,92.185213,89.932,90.83117
Decision Tree,95.044,95.204398,95.044,95.117153


In [21]:
# List the distinct labels in the most recent predictions, to check that the
# model does not predict a single class only.
np.unique(y_pred)


array([0, 1], dtype=int64)

# K-Nearest Neighbors (KNN)

In [22]:
from sklearn.neighbors import KNeighborsClassifier

In [23]:
# k-nearest neighbours with k = 5.
# NOTE(review): the features are not scaled before fitting, so columns with
# larger numeric ranges weigh more in the distance - consider standardizing.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
res_df.loc['KNN'] = get_metrics(y_test, y_pred)

# Support Vector Machine (SVM)

In [24]:
from sklearn import svm

In [25]:
# Support vector classifier with an RBF kernel.
# NOTE(review): the features are not scaled before fitting - consider
# standardizing, since the RBF kernel is distance-based.
rbfsvm = svm.SVC(kernel="rbf")
rbfsvm.fit(x_train, y_train)
y_pred = rbfsvm.predict(x_test)
res_df.loc['SVM'] = get_metrics(y_test, y_pred)

# Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
# Random forest with 100 trees: fit on the training split, score on the test
# split. random_state is fixed so the bootstrap samples and feature draws,
# and therefore the recorded scores, are the same on every run.
rftree = RandomForestClassifier(n_estimators=100, random_state=seed)
y_pred = rftree.fit(x_train, y_train).predict(x_test)
res_df.loc['Random Forest'] = get_metrics(y_test, y_pred)

# Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
# Logistic regression: fit on the training split, score on the test split.
# With the default iteration cap the solver stops before converging on these
# unscaled features (ConvergenceWarning), so the cap is raised.
# NOTE(review): if the warning persists, raise max_iter further or standardize
# the features before fitting.
logReg = LogisticRegression(max_iter=1000)
y_pred = logReg.fit(x_train, y_train).predict(x_test)
res_df.loc['Logistic Regression'] = get_metrics(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# AdaBoost

In [30]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [31]:
# AdaBoost with default settings: fit on the training split, score on the
# test split. random_state is fixed so repeated runs give the same scores.
adaboost = AdaBoostClassifier(random_state=seed)
y_pred = adaboost.fit(x_train, y_train).predict(x_test)
res_df.loc['AdaBoost'] = get_metrics(y_test, y_pred)

# Bagging

In [32]:
from sklearn.ensemble import BaggingClassifier

In [33]:
# Bagging ensemble with default settings: fit on the training split, score on
# the test split. random_state is fixed so the bootstrap samples, and
# therefore the recorded scores, are the same on every run.
bg = BaggingClassifier(random_state=seed)
y_pred = bg.fit(x_train, y_train).predict(x_test)
res_df.loc['Bagging'] = get_metrics(y_test, y_pred)

In [34]:
# Write the per-classifier metrics table to disk. The nested output directory
# is created first because to_csv does not create missing parent directories.
save_dir = os.path.dirname(save_path)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)
res_df.to_csv(save_path)