# 👨‍⚕️ Early Stage Diabetes Risk Prediction
---

Given data about *patient symptoms*, let's try to predict if a given patient is **at risk for diabetes or not**.

# Getting Started

In [None]:
# Tools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Disable warnings
# NOTE: no category or module filter is given, so this hides *every* warning
# raised for the rest of the notebook, not only cosmetic ones.
# NOTE(review): this presumably also hides solver convergence warnings from
# the models trained below; confirm that is intended.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the raw CSV into a DataFrame and display it.
csv_path = '../input/early-stage-diabetes-risk-prediction-dataset/diabetes_data_upload.csv'
data = pd.read_csv(csv_path)
data

In [None]:
# Count the missing entries in each column.
data.isnull().sum()

There are no missing values.

In [None]:
# Number of distinct values per column (a missing value would count as one value).
{column: data[column].nunique(dropna=False) for column in data.columns}

Most of the columns contain only two distinct values; the exception is the age column.

The target is **Positive** or **Negative**.

# Preprocessing

In [None]:
def preprocess_inputs(df, train_size=0.7, random_state=42):
    """Encode, split and standardize the raw data for modelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing a 'class' target column. The caller's frame is
        not modified.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split``; the default reproduces the
        original 70/30 split.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the shuffle is repeatable.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features that keep the original row index and column
        names.
    y_train, y_test : pandas.Series
        The 'class' labels for the matching rows, left unencoded.

    Raises
    ------
    KeyError
        If ``df`` has no 'class' column.
    """
    df = df.copy()

    # Split X and y
    X = df.drop('class', axis=1)
    y = df['class']

    # Binary encode X. The keys are disjoint and the outputs are never keys,
    # so a single mapping is equivalent to replacing in two passes.
    binary_codes = {'No': 0, 'Yes': 1, 'Female': 0, 'Male': 1}
    X = X.replace(binary_codes)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Scale X. The scaler is fitted on the training rows only and then reused
    # on the test rows, so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Run the preprocessing once, unpack the four splits, then summarise the
# scaled training features.
splits = preprocess_inputs(data)
X_train, X_test, y_train, y_test = splits
X_train.describe()

For the training set, the mean is close to 0 and the standard deviation is close to 1 (standard scaling).

# Training

In [None]:
# Candidate classifiers, all with default hyperparameters. The display names
# are left-padded to a common width so the printed results line up.
# Estimators that draw random numbers while fitting get a fixed seed (the same
# value used for the train/test split) so a re-run reproduces the same scores.
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(random_state=42),
    "Support Vector Machine (Linear Kernel)": LinearSVC(random_state=42),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(random_state=42),
    "                         Random Forest": RandomForestClassifier(random_state=42),
    "                     Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Fit every model on the scaled training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    # Leading space keeps the model name and the status word apart.
    print(name + " trained.")

# Cross Validation

In [None]:
# Evaluate the model
# 5-fold cross-validated accuracy on the training split, printed as a percentage.
# NOTE(review): X_train was standardized before this loop, so the scaling
# statistics were computed over all folds at once.
for name, model in models.items():
    fold_accuracies = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    mean_pct = fold_accuracies.mean() * 100
    print(name, "average score: {:.2f} %".format(mean_pct))

# Results

In [None]:
# Predict on test set
# Report each fitted model's score on the held-out test split as a percentage.
for name, model in models.items():
    test_score = model.score(X_test, y_test)
    # Leading space keeps the model name and the message apart.
    print(name + " score on test set: {:.2f} %".format(test_score * 100))

# Conclusion

According to the cross-validation, the best models are Decision Tree and Random Forest. It seems that bagging (as used by Random Forest) was not especially helpful in this case.

Note that other models, such as the Neural Network, achieve very good accuracy on the test set but do not score as well during cross-validation.

Because the dataset is small, there is some randomness in the results. Cross-validation seems to be the best way to compare the models.



**Thank you for reading, have a nice day!**