In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,precision_score,recall_score,classification_report,confusion_matrix,roc_curve,auc

In [3]:
diabetes=load_diabetes()
diabetes

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [6]:
diabetes.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

In [7]:
df=pd.DataFrame(data=diabetes.data,columns=diabetes.feature_names)
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [8]:
df.isnull().sum()

age    0
sex    0
bmi    0
bp     0
s1     0
s2     0
s3     0
s4     0
s5     0
s6     0
dtype: int64

In [9]:
df.shape

(442, 10)

In [10]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
437    False
438    False
439    False
440    False
441    False
Length: 442, dtype: bool

In [11]:
diabetes.target

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [12]:
X,y=diabetes.data,diabetes.target

# convert target variable to binary

In [13]:
y_binary=(y>np.median(y)).astype(int)
y_binary

array([1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,

# split data

In [14]:
X_train,X_test,y_train,y_test=train_test_split(X,y_binary,test_size=0.2,random_state=42)

In [15]:
X_train.shape

(353, 10)

In [16]:
X_test.shape


(89, 10)

# feature scaling 

In [17]:
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [18]:
X_train.shape

(353, 10)

# train the model

In [19]:
model=LogisticRegression()
model.fit(X_train,y_train)

# model predict

In [20]:
y_predict = model.predict(X_test)

In [21]:
y_predict

array([0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1])

# Model Evaluation

In [22]:
# Actual value versus predicted value
accuracy_metric = accuracy_score(y_test,y_predict)
accuracy_metric

0.7303370786516854

In [39]:
accuracy_lr=accuracy_metric

In [23]:
confusion_matrix_metric = confusion_matrix(y_test,y_predict)
confusion_matrix_metric

array([[36, 13],
       [11, 29]], dtype=int64)

In [24]:
classification_report_metrics = classification_report(y_test,y_predict)
print(classification_report_metrics)

              precision    recall  f1-score   support

           0       0.77      0.73      0.75        49
           1       0.69      0.72      0.71        40

    accuracy                           0.73        89
   macro avg       0.73      0.73      0.73        89
weighted avg       0.73      0.73      0.73        89



In [None]:
#Naive Bayes
##Model Architecture:
Naive Bayes is a probabilistic model based on Bayes' theorem, assuming independence between features. It's simple and efficient.

#Mathematics:
The core formula is Bayes' theorem: 
P(A∣B)=P(B∣A)⋅P(A)/ P(B)

 , where A is the event and B is the evidence.

Comparison Metrics:

In [26]:
#Accuracy: 
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_nb_predict = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_nb_predict)

from sklearn.metrics import classification_report

classification_report_nb = classification_report(y_test, y_nb_predict)
print("Naive Bayes Classification Report:\n", classification_report_nb)



Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.76      0.75        49
           1       0.69      0.68      0.68        40

    accuracy                           0.72        89
   macro avg       0.72      0.72      0.72        89
weighted avg       0.72      0.72      0.72        89



In [30]:
#SVM
#Model Architecture:
#Support Vector Machines (SVM) is a powerful classification algorithm that finds the hyperplane that best separates data into classes.

#Mathematics:
#SVM aims to find the hyperplane that maximizes the margin between classes. It involves solving a convex optimization problem.

#Comparison Metrics:
    
    

In [33]:
#ACCURACY:

from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(X_train, y_train)
y_svm_predict = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_svm_predict)

#Recall, F1-score, Precision:
from sklearn.metrics import classification_report

classification_report_svm = classification_report(y_test, y_svm_predict)
print("SVM Classification Report:\n", classification_report_svm)
    




SVM Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.71      0.75        49
           1       0.69      0.78      0.73        40

    accuracy                           0.74        89
   macro avg       0.74      0.74      0.74        89
weighted avg       0.75      0.74      0.74        89



In [None]:
#Summary
#Here's a summary of the accuracy comparison:

In [40]:
#print(f"Decision Tree Accuracy: {accuracy_dt}")
#print(f"Random Forest Accuracy: {accuracy_rf}")
print(f"Naive Bayes Accuracy: {accuracy_nb}")
print(f"SVM Accuracy: {accuracy_svm}")
print(f"Logistic Regression Accuracy: {accuracy_lr}")


Naive Bayes Accuracy: 0.7191011235955056
SVM Accuracy: 0.7415730337078652
Logistic Regression Accuracy: 0.7303370786516854


Certainly! Here's a summary of the additional metrics (Recall, F1-score, Precision) for each model:




### Naive Bayes
| Metric      | Precision | Recall | F1-Score |
|-------------|-----------|--------|----------|
| Class 0     | 0.74      | 0.76   | 0.75     |
| Class 1     | 0.69      | 0.68   | 0.68     |
| Accuracy    | 0.719     |        |          |

### SVM
| Metric      | Precision | Recall | F1-Score |
|-------------|-----------|--------|----------|
| Class 0     | 0.800     | 0.71   | 0.75     |
| Class 1     | 0.691     | 0.78   | 0.73     |
| Accuracy    | 0.741     |        |          |

### Logistic Regression (for Comparison)
| Metric      | Precision | Recall | F1-Score |
|-------------|-----------|--------|----------|
| Class 0     | 0.767     | 0.735  | 0.751    |
| Class 1     | 0.691     | 0.720  | 0.705    |
| Accuracy    | 0.730     |        |          |

These tables provide a concise summary of the precision, recall, and F1-score for each class and overall accuracy for each model. 