In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn 
from sklearn.model_selection import train_test_split, cross_val_predict
from imblearn.over_sampling import SMOTENC 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix  
import imblearn
import warnings
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')


In [2]:
data = pd.read_csv('input/diabetes_dataset__2019.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 952 entries, 0 to 951
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               952 non-null    object 
 1   Gender            952 non-null    object 
 2   Family_Diabetes   952 non-null    object 
 3   highBP            952 non-null    object 
 4   PhysicallyActive  952 non-null    object 
 5   BMI               948 non-null    float64
 6   Smoking           952 non-null    object 
 7   Alcohol           952 non-null    object 
 8   Sleep             952 non-null    int64  
 9   SoundSleep        952 non-null    int64  
 10  RegularMedicine   952 non-null    object 
 11  JunkFood          952 non-null    object 
 12  Stress            952 non-null    object 
 13  BPLevel           952 non-null    object 
 14  Pregancies        910 non-null    float64
 15  Pdiabetes         951 non-null    object 
 16  UriationFreq      952 non-null    object 
 1

In [3]:
for col in data.columns:
    print(col)
    print(data[col].value_counts())
    print(f'-'*20)

Age
Age
less than 40    488
40-49           164
50-59           156
60 or older     144
Name: count, dtype: int64
--------------------
Gender
Gender
Male      580
Female    372
Name: count, dtype: int64
--------------------
Family_Diabetes
Family_Diabetes
no     498
yes    454
Name: count, dtype: int64
--------------------
highBP
highBP
no     724
yes    228
Name: count, dtype: int64
--------------------
PhysicallyActive
PhysicallyActive
less than half an hr    336
more than half an hr    272
one hr or more          212
none                    132
Name: count, dtype: int64
--------------------
BMI
BMI
24.0    111
21.0     88
23.0     76
28.0     71
26.0     66
33.0     64
27.0     63
22.0     58
20.0     48
19.0     36
25.0     34
30.0     33
18.0     32
29.0     28
38.0     28
36.0     20
17.0     16
32.0     16
31.0     16
34.0     12
35.0     12
15.0      8
39.0      4
40.0      4
42.0      3
45.0      1
Name: count, dtype: int64
--------------------
Smoking
Smoking
no     844
yes  

In [4]:
data['RegularMedicine'].replace('o','no', inplace=True)
data['BPLevel'] = data['BPLevel'].str.lower().str.strip()
data['Pdiabetes'].replace('0', 'no', inplace=True)
data['Diabetic'] = data['Diabetic'].str.strip()

data[data['Gender']=='Male']['Pregancies'].isna().sum()

16

In [5]:
data['Pregancies'].replace(np.nan, 0, inplace=True)
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 947 entries, 0 to 951
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               947 non-null    object 
 1   Gender            947 non-null    object 
 2   Family_Diabetes   947 non-null    object 
 3   highBP            947 non-null    object 
 4   PhysicallyActive  947 non-null    object 
 5   BMI               947 non-null    float64
 6   Smoking           947 non-null    object 
 7   Alcohol           947 non-null    object 
 8   Sleep             947 non-null    int64  
 9   SoundSleep        947 non-null    int64  
 10  RegularMedicine   947 non-null    object 
 11  JunkFood          947 non-null    object 
 12  Stress            947 non-null    object 
 13  BPLevel           947 non-null    object 
 14  Pregancies        947 non-null    float64
 15  Pdiabetes         947 non-null    object 
 16  UriationFreq      947 non-null    object 
 17  Di

In [6]:
num_cols = ['BMI', 'Sleep', 'SoundSleep', 'Pregancies']
category_cols = list(set(data.columns).difference(set(num_cols)))

data_clean = pd.DataFrame()
for col in num_cols: 
    data_clean[col] = data[col].astype('int')

for col in category_cols: 
    data_clean[col] = data[col].astype('category')

 
data_clean['Age'] = pd.Categorical(data['Age'], ordered=True, 
                                   categories=['less than 40', '40-49', '50-59', '60 or older'])
data_clean['PhysicallyActive'] = pd.Categorical(data['PhysicallyActive'], ordered=True, 
                                                categories=['one hr or more', 'more than half an hr', 'less than half an hr', 'none'])
data_clean['JunkFood'] = pd.Categorical(data['JunkFood'], ordered=True, categories=['occasionally', 'often', 'very often', 'always'])
data_clean['BPLevel'] = pd.Categorical(data['BPLevel'], ordered=True, 
                                       categories=['low', 'normal', 'high'])
data_clean['Stress'] = pd.Categorical(data['Stress'], ordered=True, 
                                      categories=['not at all', 'sometimes', 'very often', 'always'])
category_mapping = {
    'Age':{'less than 40':0, '40-49':1, '50-59':2, '60 or older':3},
    'Family_Diabetes':{'no':0, 'yes':1},
    'Gender':{'Female':0, 'Male':1},
    'Smoking':{'no':0, 'yes':1},
    'Pdiabetes':{'no':0, 'yes':1},
    'RegularMedicine':{'no':0, 'yes':1},
    'PhysicallyActive':{'one hr or more':0, 'more than half an hr':1, 'less than half an hr':2, 'none':3},
    'JunkFood':{'occasionally':0, 'often':1, 'very often':2, 'always':3},
    'BPLevel':{'low':0, 'normal':1, 'high':2},
    'highBP':{'no':0, 'yes':1},
    'Alcohol':{'no':0, 'yes':1},
    'UriationFreq':{'not much':0, 'quite often':1},
    'Stress':{'not at all':0, 'sometimes':1, 'very often':2, 'always':3},
    'Diabetic':{'no':0, 'yes':1},
}
for col in category_cols:
    data_clean[col] = data_clean[col].map(category_mapping[col])

In [7]:
data_clean.head()

Unnamed: 0,BMI,Sleep,SoundSleep,Pregancies,BPLevel,Smoking,UriationFreq,Family_Diabetes,RegularMedicine,PhysicallyActive,Diabetic,highBP,Pdiabetes,Gender,JunkFood,Alcohol,Stress,Age
0,39,8,6,0,2,0,0,0,0,0,0,1,0,1,0,0,1,2
1,28,8,6,0,1,0,0,0,1,2,0,1,0,1,2,0,1,2
2,24,6,6,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1
3,23,8,6,0,1,0,0,0,0,0,0,0,0,1,0,0,1,2
4,27,8,8,0,1,0,0,0,0,2,0,0,0,1,0,0,1,1


In [8]:
def smote_data(X, y):
    smotenc = SMOTENC(random_state = 123, categorical_features = list(range(4, 17)), n_jobs=-1)
    X_smote, y_smote = smotenc.fit_resample(X, y)
    return X_smote, y_smote 

In [9]:
X = data_clean.drop('Diabetic', axis=1)
y = data_clean['Diabetic']

X_smote, y_smote = smote_data(X, y)



In [10]:
# Future selection 
correlation = data_clean.corr()['Diabetic'].abs().sort_values(ascending=False)

# Belirlenen eşik değer
threshold = 0.3

# Eşik değerden düşük olan özellikleri seçelim
selected_features = correlation[correlation > threshold].index.tolist()

# Seçilen özellikleri görüntüleyelim
print("Seçilen Özellikler:")
print(selected_features)

df_selected_features = data_clean[selected_features]
df_selected_features.head()

Seçilen Özellikler:
['Diabetic', 'RegularMedicine', 'Age', 'BPLevel', 'highBP']


Unnamed: 0,Diabetic,RegularMedicine,Age,BPLevel,highBP
0,0,0,2,2,1
1,0,1,2,1,1
2,0,0,1,1,0
3,0,0,2,1,0
4,0,0,1,1,0


In [11]:
from sklearn.metrics import f1_score


def cross_validate(X, y, model, scoring='recall', num_folds=10):
    num_instances = len(X)
    fold_size = num_instances // num_folds

    indices = np.arange(num_instances)
    np.random.seed(123)
    np.random.shuffle(indices)

    X_train_shuffled = X.iloc[indices]
    y_train_shuffled = y.iloc[indices]

    best_model = None  
    scores = []
    predictions = []
    precision_scores = []
    recall_scores = []
    accuracy_scores = []
    f1_scores = []
    
    # Her bir kat için çapraz doğrulama gerçekleştir
    for i in range(num_folds):
        start = i * fold_size
        end = (i + 1) * fold_size if i < num_folds - 1 else num_instances

        X_fold = X_train_shuffled.iloc[start:end]
        y_fold = y_train_shuffled.iloc[start:end]

        # SMOTE işlemini gerçekleştir
        X_fold_smote, y_fold_smote = smote_data(X_fold, y_fold)

        # Eğitim veri kümelerini oluştur
        X_train_fold = pd.concat([X_fold, X_train_shuffled.iloc[:start], X_train_shuffled.iloc[end:]], axis=0)
        y_train_fold = pd.concat([y_fold, y_train_shuffled.iloc[:start], y_train_shuffled.iloc[end:]], axis=0)

        # Modeli eğit
        model.fit(X_train_fold, y_train_fold)  

        # Test veri kümelerini oluştur
        X_test_fold = X_train_shuffled.iloc[start:end]
        y_test_fold = y_train_shuffled.iloc[start:end]
        
        prediction = model.predict(X_test_fold)
        predictions.append(prediction)

        score = model.score(X_test_fold, y_test_fold)
        scores.append(score)

        precision = precision_score(y_test_fold, prediction)
        precision_scores.append(precision)
        recall = recall_score(y_test_fold, prediction)
        recall_scores.append(recall)

        accuracy = accuracy_score(y_test_fold, prediction)
        accuracy_scores.append(accuracy)

        f1 = f1_score(y_test_fold, prediction)
        f1_scores.append(f1)

        if score == max(scores):
            best_model = model

    print("Best Accuracy Skoru:", np.max(scores))
    print("Accuracy Scores:", accuracy_scores)
    print("Average Accuracy Score:", np.mean(accuracy_scores))
    print("Average Precision Score:", np.mean(precision_scores))
    print("Average Recall Score:", np.mean(recall_scores))
    print("Average F1 Score:", np.mean(f1_scores))
    



In [12]:
# Modellerin tanımlanması
models = {
    "LSVM": LinearSVC(max_iter=5000),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "RBF SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "MLP": MLPClassifier(),
    #"XGBoost": xgb_model,
    "AdaBoost": AdaBoostClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=5000),
}
# Her bir model için cross-validation işleminin çalıştırılması
for model_name, model in models.items():
    print(f"{model_name} modeli için cross-validation sonuçları:")
    cross_validate(X_smote, y_smote, model)
    print("\n----------------------------------------------------------------------------\n")


LSVM modeli için cross-validation sonuçları:
Best Accuracy Skoru: 0.8897058823529411
Accuracy Scores: [0.8602941176470589, 0.8235294117647058, 0.8014705882352942, 0.8676470588235294, 0.8529411764705882, 0.8382352941176471, 0.8823529411764706, 0.8676470588235294, 0.8897058823529411, 0.8857142857142857]
Average Accuracy Score: 0.856953781512605
Average Precision Score: 0.8535732418811705
Average Recall Score: 0.8605512122209248
Average F1 Score: 0.8567729914055018

----------------------------------------------------------------------------

KNN modeli için cross-validation sonuçları:
Best Accuracy Skoru: 1.0
Accuracy Scores: [0.9852941176470589, 0.9779411764705882, 0.9852941176470589, 0.9779411764705882, 0.9632352941176471, 0.9632352941176471, 0.9558823529411765, 0.9779411764705882, 0.9705882352941176, 1.0]
Average Accuracy Score: 0.9757352941176471
Average Precision Score: 0.9857085974883291
Average Recall Score: 0.9658465987548798
Average F1 Score: 0.9754146128547789

----------------

In [13]:

# future selection la model sonuçları 

X_fs = data_clean.drop('Diabetic', axis=1)
y_fs = data_clean['Diabetic']

X_smote_fs, y_smote_fs = smote_data(X_fs, y_fs)


for model_name, model in models.items():
    print(f"{model_name} modeli için cross-validation sonuçları:")
    cross_validate(X_smote_fs, y_smote_fs, model)
    print("\n----------------------------------------------------------------------------\n")

LSVM modeli için cross-validation sonuçları:
Best Accuracy Skoru: 0.8897058823529411
Accuracy Scores: [0.8602941176470589, 0.8235294117647058, 0.8014705882352942, 0.8676470588235294, 0.8529411764705882, 0.8382352941176471, 0.8823529411764706, 0.8676470588235294, 0.8897058823529411, 0.8857142857142857]
Average Accuracy Score: 0.856953781512605
Average Precision Score: 0.8535732418811705
Average Recall Score: 0.8605512122209248
Average F1 Score: 0.8567729914055018

----------------------------------------------------------------------------

KNN modeli için cross-validation sonuçları:
Best Accuracy Skoru: 1.0
Accuracy Scores: [0.9852941176470589, 0.9779411764705882, 0.9852941176470589, 0.9779411764705882, 0.9632352941176471, 0.9632352941176471, 0.9558823529411765, 0.9779411764705882, 0.9705882352941176, 1.0]
Average Accuracy Score: 0.9757352941176471
Average Precision Score: 0.9857085974883291
Average Recall Score: 0.9658465987548798
Average F1 Score: 0.9754146128547789

----------------