In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the 'arezalo/diabetes' dataset; the value returned by kagglehub is kept
# in arezalo_diabetes_path (presumably the local download location — confirm
# against the kagglehub documentation).
arezalo_diabetes_path = kagglehub.dataset_download('arezalo/diabetes')

print('Data source import complete.')


# <font color="orange"> Logistic Regression Project: Diabetes Prediction </font>

In [None]:
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import itertools

# workaround so that plotly plots are shown inside the notebook
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

# suppress all warnings for the rest of the notebook
from warnings import filterwarnings
filterwarnings('ignore')
# apply matplotlib's '_mpl-gallery' style sheet to the figures below
plt.style.use('_mpl-gallery')

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Importing it unconditionally raises ImportError on current releases and stops
# the whole notebook, so keep the name available only where it still exists.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None


In [None]:
# import diabetes dataset
# On Kaggle the dataset is mounted under ../input. Elsewhere that path does not
# exist, so fall back to the location returned by kagglehub in the first cell
# (assumes the download directory contains diabetes.csv at its top level).
try:
    data = pd.read_csv("../input/diabetes/diabetes.csv")
except FileNotFoundError:
    data = pd.read_csv(f"{arezalo_diabetes_path}/diabetes.csv")
data

# <font color='orange'> 1. Dataset Overview </font>

In [None]:
# Build the working DataFrame from the loaded data and print its summary
df = pd.DataFrame(data)
df.info()

# <font color='orange'> 2. Cleaning and Preparing Data </font>

In [None]:
# Count the null entries in each column
df.isnull().sum()
# alternative spelling: df.isna().sum()

We have 9 columns, none of which seem to have missing values

In [None]:
# Descriptive statistics, transposed so that each feature is one row
df.describe().T

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

OK, there are no duplicate rows.

In [None]:
# Now draw plots and check the noisy data:
# first we check the data distribution:
%matplotlib inline
fig, ax = plt.subplots(3,3,figsize=(15,9))
for i, col in enumerate(df):
    sns.histplot(df[col], kde=True, ax=ax[i//3, i%3])
    # plt.title(f"{col} ditribution", fontsize=14, fontweight='bold', c='r')
plt.show()

In [None]:
# Columns in which a value of 0 is treated as a missing measurement
zero_col = ['Glucose','Insulin','SkinThickness','BloodPressure','BMI']

# Work on a copy so that df keeps the values as loaded
df1 = df.copy()
zeros_as_missing = df1[zero_col].replace(to_replace=0, value=np.nan)
df1[zero_col] = zeros_as_missing

In [None]:
# NaN count per column after the zeros were replaced
df1.isna().sum()

In [None]:
%matplotlib inline
sns.heatmap(data.corr(), annot=True, cmap='Reds')
plt.show()

- I have no medical expertise, but I think that glucose, insulin, skin thickness, blood pressure, and BMI cannot have a value of zero. Since I cannot ask the owner of the dataset about these values, I treat the zeros as missing data that must be either filled in or dropped.
- The features are not strongly correlated with each other, so the missing values cannot be filled in from the other features. Also, because the number of false zeros is high, removing those rows is not a good solution, so I fill them according to the distribution of each feature.
- The mean is most useful when the original data is not skewed, while the median is more robust (it is not sensitive to outliers) and is therefore used when the data is skewed.
- As can be seen in the distribution charts, insulin, glucose and skin thickness are skewed (to the right or left), so we fill them with the median. BMI and blood pressure, on the other hand, are approximately normally distributed, so we fill them with the mean.

In [None]:
# Fill strategy per column: median for the first three, mean for the last two
fill_strategy = {
    'Glucose': 'median',
    'Insulin': 'median',
    'SkinThickness': 'median',
    'BMI': 'mean',
    'BloodPressure': 'mean',
}
for col, how in fill_strategy.items():
    # the statistic is computed on the observed (non-NaN) values only
    observed = df1[col].dropna()
    fill_value = observed.median() if how == 'median' else observed.mean()
    df1[col] = df1[col].fillna(fill_value)

In [None]:
# Check the NaN count per column again after filling
df1.isna().sum()

In [None]:
# Summary of the prepared frame
df1.info()

# <font color="orange"> 3.Exploratory Data Analysis (EDA) </font>

### <font color="orange"> 3.1.Univariate Analysis: </font>

In [None]:
# Descriptive statistics of the prepared frame, one feature per row
df1.describe().T

- According to the table above, most people in this dataset do not have diabetes.

#### <font color="orange"> 3.2.Numeric - Numeric Bivariate Analysis: </font>

In [None]:
%matplotlib inline
# Pairwise scatter plots with KDE curves on the diagonal, coloured by 'outcome'
sns.pairplot(data=df1, diag_kind='kde', hue='outcome')
plt.show()

In [None]:
# One box trace per feature, all drawn in a single plotly figure
my_cols = ['Pregnancies','Glucose','BloodPressure','SkinThickness',
           'Insulin','BMI','DiabetesPedigreeFunction','Age']
box_traces = [go.Box(y=df1[col], name=f'{col}') for col in my_cols]
fig = go.Figure(data=box_traces)
fig.show()

# <font color='orange'> 4.Model </font>

In [None]:
# Display the prepared frame
df1

In [None]:
# target column and feature matrix used for modeling
y = df1['outcome']
x = df1.drop(columns='outcome')

In [None]:
# Standardize every feature column.
# NOTE(review): the scaler is fit on the full data set, before any train/test
# split, so rows that later land in the test set contribute to the scaling
# statistics.
scaler = StandardScaler()
x_norm = scaler.fit_transform(x)
x_norm

In [None]:
# convert the scaled array x_norm back to a DataFrame with the original
# index and column names (this replaces the unscaled x)
x = pd.DataFrame(x_norm, index=x.index, columns=x.columns)
x

In [None]:
# split data to train and test (20% test, fixed random_state for repeatability)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
# Module-level result lists: modeling() appends one entry per fitted model to each
# of them, and plot_results() draws all collected entries.
# ROC curve points (false / true positive rates) collected for class 1
FPR1 = []
TPR1 = []
# ROC curve points collected for class 0
FPR0 = []
TPR0 = []
# scalar test-set scores, one per model
ACC = []
Recall = []
Precision = []
F1 = []

def plot_confusion_matrix2(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated image.
        cm(array): confusion matrix holding integer counts
        classes(sequence): tick labels for the rows and columns of cm
        title(string): title shown above the figure
        cmap: matplotlib colormap used for the cells
    """
    plt.figure(figsize=(10,7))
    plt.grid(False)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # cells whose count exceeds half of the maximum get white text, others black
    cutoff = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            count = cm[row, col]
            text_color = "black"
            if count > cutoff:
                text_color = "white"
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()
    plt.show()

# --------------------------------------------------------------------------------------

def Perform_cross_val(model, k, x, y, scoring):
    """
    Run k-fold cross validation, print a summary and return the fold scores.
        model: estimator to evaluate (a logistic regression in this notebook)
        k(scalar): the value for n_splits in KFold()
        x(DataFrame or array): x_train
        y(DataFrame or array): y_train
        scoring(string): scoring name passed on to cross_val_score
    Returns:
        array with one score per fold. Earlier versions returned None although
        callers assign the result; returning the scores makes that assignment
        meaningful.
    """

    kf = KFold(n_splits=k)
    cv_results = cross_val_score(model, x, y, cv=kf, scoring=scoring)
    cv_mean = np.mean(cv_results)
    print('-'*20, f"CV for k={k}, scoring={scoring}", '-'*20)
    print(f"CV mean: {cv_mean}")
    print(f"CV results: {cv_results}\n")
    return cv_results

# --------------------------------------------------------------------------------------

def find_fold_index(k, x):
    """
    Print the first and last row position of the test part of every fold.
        k(scalar): the value used for n_splits in KFold()
        x(DataFrame or array): x_train
    """

    for fold_no, (_, held_out) in enumerate(KFold(k).split(x), start=1):
        first, last = held_out[0], held_out[-1]
        print(f"fold {fold_no}: [{first},{last}]")
        print(20*'-')

# --------------------------------------------------------------------------------------

def features_importance(model, x_test):
    """
    Plot the fitted weights as a horizontal bar chart and return them as a table.
        model: fitted logistic regression (coef_ and intercept_ are read)
        x_test(DataFrame): only its column names are used, as feature labels
    Returns:
        DataFrame indexed by 'Features' (the feature names plus 'Intercept'),
        sorted by ascending weight, with columns 'Importance' and 'Positive'
    """
    print('-'*20 , 'Feature', '-'*20)

    names = list(x_test.columns) + ['Intercept']
    weights = list(model.coef_[0]) + [model.intercept_[0]]

    table = pd.DataFrame({'Features': names, 'Importance': weights})
    table = table.sort_values(by=['Importance'], ascending=True)
    table['Positive'] = table['Importance'] > 0
    table = table.set_index('Features')

    # green bars for positive weights, orange for the rest
    bar_colors = table.Positive.map({True:'green', False:'orange'})
    table.Importance.plot(kind='barh', figsize=(5,3), color=bar_colors)
    plt.xlabel('Importance')
    plt.show()
    return table

# --------------------------------------------------------------------------------------

def _plot_roc_family(axis, fprs, tprs, title, linestyle='-'):
    """Draw one ROC curve per (fpr, tpr) pair on `axis`, numbering models from 1."""
    axis.set_title(title, fontsize=14, fontweight='bold')
    # label the axes once, outside the loop, so they are set even without curves
    axis.set_xlabel('FPR')
    axis.set_ylabel('TPR')
    for model_no, (fpr, tpr) in enumerate(zip(fprs, tprs), start=1):
        axis.plot(fpr, tpr, linestyle=linestyle,
                  label=f"ROC curve of model{model_no} (AUC = {round(metrics.auc(fpr, tpr),3)})")
    axis.legend(loc='upper center', bbox_to_anchor=(0.5, -0.12),
                fancybox=True, shadow=True)


def plot_results(FPR0, TPR0, FPR1, TPR1, ACC, Recall, Precision, F1):
    """
    draw ROC curves and a line plot of accuracy, recall, precision and F1 score.
        FPR0(list): list of False Positive Rate arrays for class 0
        TPR0(list): list of True Positive Rate arrays for class 0
        FPR1(list): list of False Positive Rate arrays for class 1
        TPR1(list): list of True Positive Rate arrays for class 1
        ACC(list): list of accuracy of models
        Recall(list): list of recall score of models
        Precision(list): list of Precision score of models
        F1(list): list of F1 score of models
    Every list holds one entry per model, in the order the models were fitted.
    """
    fig, ax = plt.subplots(1,3,figsize=(15,4))

    # plot model evaluation; the index starts at 1 so that the x position of a
    # model matches the "model1, model2, ..." numbering in the ROC legends
    scores = pd.DataFrame({'accuracy': ACC, 'Recall': Recall,
                           'Precision': Precision, 'F1 score': F1},
                          index=range(1, len(ACC) + 1))
    ax[0].set_title('Model Evaluation Results', fontsize=14, fontweight='bold')
    sns.lineplot(data=scores, markers=True, ax=ax[0])
    ax[0].set_xlabel('M')
    ax[0].set_ylabel('Evaluation')
    ax[0].legend(loc='upper center', bbox_to_anchor=(0.5, -0.12),
          fancybox=True, shadow=True)

    # ROC curves: solid lines for class 1, dashed lines for class 0
    _plot_roc_family(ax[1], FPR1, TPR1, 'ROC Curve of Class 1')
    _plot_roc_family(ax[2], FPR0, TPR0, 'ROC Curve of Class 0', linestyle='--')

    plt.show()

# --------------------------------------------------------------------------------------

def modeling(x, y, test_size, classes, parameters, is_add=1 ):
    """
    Split the data, fit a logistic regression, then report and plot its results.
        x(DataFrame): feature matrix
        y(Series or array): binary target with labels 0 and 1
        test_size(float): share of the rows held out for testing
        classes(list): display names for the labels, in the order [1, 0]
        parameters(dict): must provide 'solver', 'penalty' and 'C'
        is_add(int): when 1, the scores and ROC points of this model are appended
            to the module-level lists FPR0, TPR0, FPR1, TPR1, ACC, Recall,
            Precision and F1; the collected results are plotted in either case
    Returns:
        (fitted model, test accuracy, DataFrame returned by features_importance)
    """
    # label order shared by the confusion matrix and the classification report,
    # so that `classes` names the same label in both outputs
    label_order = [1, 0]

    # split data to train and test
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=0)

    # define model and fit model
    logreg = LogisticRegression(solver=parameters['solver'], penalty=parameters['penalty'],
                                C=parameters['C'], n_jobs=-1)
    logreg.fit(x_train, y_train)

    # prediction and results
    y_pred = logreg.predict(x_test)
    y_pred_proba = logreg.predict_proba(x_test)
    cm = confusion_matrix(y_test, y_pred, labels=label_order)
    # predict_proba columns follow logreg.classes_ (0 then 1 for 0/1 labels).
    # The class-0 curve must treat label 0 as the positive class; without
    # pos_label=0 the class-0 probability is scored against label 1 and the
    # curve comes out inverted.
    fpr1, tpr1, _ = metrics.roc_curve(y_test, y_pred_proba[:,1], pos_label=1)
    fpr0, tpr0, _ = metrics.roc_curve(y_test, y_pred_proba[:,0], pos_label=0)
    acc = metrics.accuracy_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    pre = metrics.precision_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)

    # append results
    if is_add == 1:
        FPR0.append(fpr0)
        TPR0.append(tpr0)
        FPR1.append(fpr1)
        TPR1.append(tpr1)
        ACC.append(acc)
        Recall.append(rec)
        Precision.append(pre)
        F1.append(f1)

    plot_results(FPR0, TPR0, FPR1, TPR1, ACC, Recall, Precision, F1)

    # Evaluation model
    print('-'*20 , 'Confusion Matrix', '-'*20)
    print(cm)
    plot_confusion_matrix2(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues)
    print('-'*20 , 'Classification Report', '-'*20)
    # pass the label order explicitly: by default the report sorts the labels
    # (0, 1), which would attach the class names to the wrong rows
    print(classification_report(y_test, y_pred, labels=label_order, target_names=classes), '\n')
    print(f"Jaccard Score: {metrics.jaccard_score(y_test, y_pred)}", '\n')

    # print other result about weight and predicted data
    return logreg, acc, features_importance(logreg, x_test)

In [None]:
# baseline estimator used for the cross-validation check
logreg = LogisticRegression(solver='liblinear')

# accuracy-scored cross validation on the training split, first with 5 and then with 10 folds
cross5_acc, cross10_acc = (
    Perform_cross_val(logreg, n_folds, x_train, y_train, 'accuracy')
    for n_folds in (5, 10)
)

The scores of the folds are close to each other, so there is no problem.

Now create model for test_size = 0.2

In [None]:
%matplotlib inline
parameters = dict(solver='liblinear' ,penalty='l2',C=1)
model, acc, features = modeling(x, y, 0.2, ['Diabetes disease=1', 'Not Diabetes disease=0'], parameters, is_add=1)
features

The obtained results are relatively good, but we will continue to improve the performance of the model

In [None]:
# Display the current feature matrix
x

# <font color='orange'> 5.Improve Model </font>

In [None]:
# we try to improve the model by raising some features to the power of 3
# (the scaled features contain negative numbers, so dim=2 is not used)
dim = 3
parameters = dict(solver='liblinear' ,penalty='l2',C=1)
# `features` also contains an 'Intercept' row, which is not a column of x.
# Filter it out by name instead of relying on it being the first row after sorting.
candidate_cols = [col for col in features.index.tolist() if col != 'Intercept']
for col in candidate_cols:
    new_col = f"{col}_{dim}"
    x[new_col] = x[col] ** dim
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
    logreg = LogisticRegression(solver='liblinear')
    logreg.fit(x_train, y_train)
    accuracy = metrics.accuracy_score(y_test, logreg.predict(x_test))
    if accuracy <= acc:
        # no gain in test accuracy: discard the candidate column again
        x.drop(new_col, axis=1, inplace=True)
    else:
        acc = accuracy
        modeling(x, y, 0.2, ['Diabetes disease=1', 'Not Diabetes disease=0'], parameters, is_add=1)

# The split made inside the loop may still contain a column that was dropped
# afterwards; split once more so x_train/x_test have exactly the columns kept in x.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


In [None]:
# Display the feature matrix after the feature search
x

Since the performance of the model did not change much, exponentiation has no effect.

Now we tune the hyperparameters of the logistic regression to improve the model.

In [None]:
# estimator and the candidate values for each hyperparameter
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# grid search over every combination, scored by accuracy
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}
cv = RepeatedStratifiedKFold(n_splits=10, random_state=0)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(x_train, y_train)

# best combination first, then mean and standard deviation for every candidate
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for idx in range(len(params)):
    print(f"{means[idx]} ({stds[idx]}) with: {params[idx]}")

In [None]:
# Refit and evaluate with the best parameter combination found by the grid search
parameters = dict(grid_result.best_params_)
model, _, features = modeling(x, y, 0.2, ['Diabetes disease=1', 'Not Diabetes disease=0'], parameters, is_add=1)
features

Try to improve the model by multiplying some features together (an interaction term).

In [None]:
# Add the element-wise product of BloodPressure and Insulin as a new feature,
# then refit and evaluate with the same parameters
x['p_BloodPressure_Insulin'] = x.BloodPressure * x.Insulin
model, _, features = modeling(x, y, 0.2, ['Diabetes disease=1', 'Not Diabetes disease=0'], parameters, is_add=1)

Now test the model with test_size = 0.3 and 0.25.

In [None]:
# Same features and parameters, with 30% of the rows held out for testing
model, _, features = modeling(x, y, 0.3, ['Diabetes disease=1', 'Not Diabetes disease=0'], parameters, is_add=1)

In [None]:
# Same features and parameters, with 25% of the rows held out for testing
# (this call also overwrites acc with the accuracy of this model)
model, acc, features = modeling(x, y, 0.25, ['Diabetes disease=1', 'Not Diabetes disease=0'], parameters, is_add=1)

As seen, the best model was the one obtained with a test size = 0.2

In [None]:
# The final model is fitted again below, so to avoid showing it twice the third
# entry (index 2) is removed from every result list first.
for history in (FPR1, TPR1, FPR0, TPR0, ACC, Recall, Precision, F1):
    del history[2]

In [None]:
# fit the final model again (test_size=0.2 with the current x and parameters)
model, _, features = modeling(x, y, 0.2, ['Diabetes disease=1', 'Not Diabetes disease=0'], parameters, is_add=1)
features

So model 5 is our best model

# <font color='orange'> 6. Visualization Final Model </font>

In [None]:
%matplotlib inline
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
x_test.insert(9,'outcome', y_test)
cols = ['Age','BloodPressure','Insulin','SkinThickness','DiabetesPedigreeFunction','Pregnancies','BMI']
for col in cols:

    fig = px.scatter_3d(
        data_frame= x_test,
        x=x_test.Glucose,
        y=x_test[col],
        z=x_test.outcome,
        color=model.predict(x_test.drop('outcome', axis=1)).astype(str),
        color_discrete_sequence={0:'red', 1:'green'},
        template='ggplot2',
        hover_name='Age',
        # hover_data=
        opacity=0.6,
        # symbol='Transmission',
        # symbol_map=
        # log_x=True,
        # log_z=True,
        height=700,
        title=f'Visualization Performance of Model in Predicting')

    pio.show(fig)

As can be seen in the above 3D scatter plots and the confusion matrix, the model does not perform as well on class zero as it does on class one.

#### THE END

Thanks for your attention :)