In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# dataset stored under MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Confirm the dataset is where the loading cell expects it.
# os.walk() only descends into directories, so pointing it at the CSV file
# itself yields nothing and the cell never printed anything. Check the file
# path directly instead.
csv_path = '/content/drive/MyDrive/diabetes.csv'
if os.path.isfile(csv_path):
    print(csv_path)
else:
    print(f"Dataset not found: {csv_path}")

In [None]:
# Load the diabetes dataset from the mounted Drive into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/diabetes.csv",
)

In [None]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
# Preview the first five rows to sanity-check column names and values.
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Count missing entries per column (isna is the same check as isnull).
data.isna().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [None]:
# Show the dtype of every column.
data.dtypes

Unnamed: 0,0
Pregnancies,int64
Glucose,int64
BloodPressure,int64
SkinThickness,int64
Insulin,int64
BMI,float64
DiabetesPedigreeFunction,float64
Age,int64
Outcome,int64


In [None]:
# Separate the target column ("Outcome") from the predictor columns.
y = data["Outcome"]
X = data.drop(columns=["Outcome"])

In [None]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Chain feature standardisation with a degree-3 polynomial-kernel SVC so that
# scaling and classification are fitted and applied together as one estimator.
model_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="poly", degree=3, coef0=1, C=5)),
    ]
)

In [None]:
# Fit the scaler + SVC pipeline on the training split only.
model_pipeline.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split with the fitted pipeline.
y_pred = model_pipeline.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label equals the true label, shown as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(
    "Accuracy of the SVC model on test data: {:.2f}%".format(accuracy * 100)
)

Accuracy of the SVC model on test data: 72.73%


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with the library's default hyperparameters; only the seed is
# fixed so repeated fits give the same model.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the training split (raw, unscaled feature values).
rf_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split with the fitted random forest.
rf_pred = rf_model.predict(X_test)

In [None]:
# Test-set accuracy of the random forest, reported as a percentage.
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_pred)
print(
    "Accuracy of the Random Forest model on test data: {:.2f}%".format(rf_accuracy * 100)
)

Accuracy of the Random Forest model on test data: 68.83%


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression allowed up to 200 solver iterations, with a fixed seed.
# Unlike the SVC, this estimator is not wrapped in a scaling pipeline, so it is
# fitted on the raw feature values.
log_reg_model = LogisticRegression(max_iter=200, random_state=42)

In [None]:
# Fit the logistic regression on the training split.
log_reg_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split with the fitted logistic regression.
log_reg_pred = log_reg_model.predict(X_test)

In [None]:
# Test-set accuracy of the logistic regression, reported as a percentage.
log_reg_accuracy = accuracy_score(y_true=y_test, y_pred=log_reg_pred)
print(
    "Accuracy of the Logistic Regression model on test data: {:.2f}%".format(log_reg_accuracy * 100)
)

Accuracy of the Logistic Regression model on test data: 70.13%


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with default hyperparameters and a fixed seed.
gb_model = GradientBoostingClassifier(random_state=42)

# fit() returns the estimator itself, so training and test-set prediction chain.
gb_pred = gb_model.fit(X_train, y_train).predict(X_test)

# Report test-set accuracy as a percentage.
gb_accuracy = accuracy_score(y_true=y_test, y_pred=gb_pred)
print(
    "Accuracy of the Gradient Boosting model on test data: {:.2f}%".format(gb_accuracy * 100)
)


Accuracy of the Gradient Boosting model on test data: 72.73%


In [None]:
from xgboost import XGBClassifier

# XGBoost classifier evaluated with log-loss and seeded for repeatability;
# use_label_encoder is intentionally not passed.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

# fit() returns the estimator itself, so training and test-set prediction chain.
xgb_pred = xgb_model.fit(X_train, y_train).predict(X_test)

# Report test-set accuracy as a percentage.
xgb_accuracy = accuracy_score(y_true=y_test, y_pred=xgb_pred)
print(
    "Accuracy of the XGBoost model on test data: {:.2f}%".format(xgb_accuracy * 100)
)


Accuracy of the XGBoost model on test data: 71.43%


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost using the SAMME boosting variant, seeded for repeatability.
# NOTE(review): support for the `algorithm` argument differs between
# scikit-learn releases — confirm it is accepted by the installed version.
ada_model = AdaBoostClassifier(algorithm='SAMME', random_state=42)

# fit() returns the estimator itself, so training and test-set prediction chain.
ada_pred = ada_model.fit(X_train, y_train).predict(X_test)

# Report test-set accuracy as a percentage.
ada_accuracy = accuracy_score(y_true=y_test, y_pred=ada_pred)
print(
    "Accuracy of the AdaBoost model on test data: {:.2f}%".format(ada_accuracy * 100)
)


Accuracy of the AdaBoost model on test data: 75.32%


In [None]:
import joblib

# Persist the fitted AdaBoost classifier so it can be reloaded without retraining.
joblib_filename = 'ada_model.joblib'
joblib.dump(value=ada_model, filename=joblib_filename)

print(f"Model saved as {joblib_filename}")

Model saved as ada_model.joblib


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

# Small fully connected binary classifier trained directly on the Outcome
# labels (it is an independent model, not a surrogate fitted to the SVC).
# The input width is declared with an explicit Input layer; passing
# `input_shape=` to the first Dense layer is discouraged by Keras for
# Sequential models and emits a UserWarning.
keras_model = Sequential([
    Input(shape=(X_train.shape[1],)),   # one input unit per feature column
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),     # single sigmoid unit: P(Outcome == 1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss for the sigmoid
# output, and accuracy tracked as a metric.
keras_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs in mini-batches of 32. The held-out test split is passed
# as validation data, so the per-epoch val_* figures are test-set figures.
keras_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)


Epoch 1/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 91ms/step - accuracy: 0.5135 - loss: 2.5894 - val_accuracy: 0.6364 - val_loss: 0.9719
Epoch 2/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5502 - loss: 1.1158 - val_accuracy: 0.5195 - val_loss: 0.8532
Epoch 3/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6051 - loss: 0.7895 - val_accuracy: 0.6364 - val_loss: 0.6936
Epoch 4/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6527 - loss: 0.6635 - val_accuracy: 0.7013 - val_loss: 0.5858
Epoch 5/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6369 - loss: 0.7773 - val_accuracy: 0.6883 - val_loss: 0.6444
Epoch 6/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6530 - loss: 0.7758 - val_accuracy: 0.6104 - val_loss: 0.7020
Epoch 7/10
[1m22/22[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x78c08a9d7be0>

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

# Score the Keras network on the test split.
# predict() returns one sigmoid probability per row as a column vector of
# shape (n_samples, 1) because the output layer has a single unit.
y_pred_prob = keras_model.predict(X_test)

# Threshold at 0.5 and flatten to 1-D. Leaving the (n_samples, 1) shape in
# `y_pred` breaks later element-wise comparisons against the y_test Series.
# NOTE: this rebinds `y_pred` and `accuracy`, which previously held the SVC results.
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
Test Accuracy: 72.73%


In [None]:
# Write the trained Keras network to disk as model_keras.h5 (HDF5 format).
keras_model.save("model_keras.h5")



In [None]:
import joblib

# Serialize the SVC pipeline (scaler + classifier) to model_pipeline.joblib.
# As the last expression of the cell, the call's return value is echoed as output.
joblib.dump(model_pipeline, 'model_pipeline.joblib')

['model_pipeline.joblib']

In [None]:
import io

import h5py
import joblib

# Serialize the fitted pipeline with joblib into an in-memory buffer. Writing
# to a temporary .joblib file first left that file behind on disk.
pipeline_buffer = io.BytesIO()
joblib.dump(model_pipeline, pipeline_buffer)

# Store the serialized bytes as one opaque blob in an HDF5 dataset named
# 'model'. np.void makes h5py treat the payload as raw binary, not a string.
# This is a joblib payload inside an HDF5 container, not a Keras-style .h5
# model; to restore it, read the dataset's bytes and pass them to joblib.load.
with h5py.File('model_pipeline.h5', 'w') as h5f:
    h5f.create_dataset('model', data=np.void(pipeline_buffer.getvalue()))

print("Model saved to model_pipeline.h5")


Model saved to model_pipeline.h5


In [None]:
import numpy as np

# Inspect which test rows the most recent `y_pred` got right or wrong.
# Both sides are reduced to plain 1-D arrays first so the comparison is
# strictly positional: `y_pred` may be a column vector (e.g. Keras output of
# shape (n, 1)) and `y_test` is a pandas Series carrying the original row
# index, which do not compare element-wise reliably as-is.
actual_labels = np.asarray(y_test).ravel()
predicted_labels = np.asarray(y_pred).ravel()

# Positions (0-based, within the test split) of hits and misses.
correct_predictions = np.flatnonzero(predicted_labels == actual_labels)
incorrect_predictions = np.flatnonzero(predicted_labels != actual_labels)

print("Correct Predictions Indices:", correct_predictions)
print("Incorrect Predictions Indices:", incorrect_predictions)

# Show the actual and predicted labels side by side for each group.
print("Actual values for correct predictions:", actual_labels[correct_predictions])
print("Predicted values for correct predictions:", predicted_labels[correct_predictions])
print("Actual values for incorrect predictions:", actual_labels[incorrect_predictions])
print("Predicted values for incorrect predictions:", predicted_labels[incorrect_predictions])


Correct Predictions Indices: [ 0  1  2  3  4  6  7 11 14 16 17 18 20 21 22 23 24 25 26 27 29 30 31 32
 33 34 35 38 39 40 41 43 44 45 46 47 48 50 51 52 54 55 57 58 59 60 61 64
 66 67 68 69 71 72 74 76]
Incorrect Predictions Indices: [ 5  8  9 10 12 13 15 19 28 36 37 42 49 53 56 62 63 65 70 73 75]
Actual values for correct predictions: [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 0 1 0 0 1 0 1 1 0 0
 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0]
Predicted values for correct predictions: [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 0 1 0 0 1 0 1 1 0 0
 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0]
Actual values for incorrect predictions: [0 0 0 1 1 0 1 1 1 1 1 0 0 1 0 0 1 0 0 1 0]
Predicted values for incorrect predictions: [1 1 1 0 0 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1]


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

label_column = 'Outcome'

# The interactive prompt only needs the feature names. Rebinding X_train /
# y_train to the full dataset here would silently destroy the train/test
# split, so any fit cell re-run afterwards would train on the test rows.
# Derive the names separately and check they match the training frame.
feature_columns = data.columns.drop(label_column)
assert list(X_train.columns) == list(feature_columns), \
    "training features do not match the dataset's feature columns"

# Function to input features and check prediction
def predict_interactively(model=None, feature_names=None):
    """Prompt for one value per feature and print the model's predicted label.

    Args:
        model: Fitted estimator exposing ``predict``. Defaults to the
            notebook's ``model_pipeline`` (the scaler + SVC pipeline). The
            previously referenced ``svc_model`` is never defined in this
            notebook and raised NameError on a fresh kernel.
        feature_names: Ordered feature names to prompt for. Defaults to
            ``X_train.columns`` so the input frame matches the training data.

    Raises:
        ValueError: If an entered value cannot be parsed as a float.
    """
    # Resolve notebook-level defaults lazily so the function can also be
    # called with an explicit model / feature list.
    if model is None:
        model = model_pipeline
    if feature_names is None:
        feature_names = X_train.columns
    feature_names = list(feature_names)

    # Take input features from the user, one prompt per column name.
    print("Enter feature values for prediction:")
    input_features = [float(input(f"{column}: ")) for column in feature_names]

    # Single-row DataFrame with the training column names, which is what the
    # fitted estimator expects at predict time.
    input_features_df = pd.DataFrame([input_features], columns=feature_names)

    # Predict and display results
    predicted_label = model.predict(input_features_df)[0]
    print("Predicted Label:", predicted_label)

# Prompt on stdin for one set of feature values and print the resulting
# prediction. This cell blocks until every value has been entered.
predict_interactively()


Enter feature values for prediction:
Pregnancies: 6
Glucose: 148
BloodPressure: 72
SkinThickness: 35
Insulin: 0
BMI: 33.6
DiabetesPedigreeFunction: 0.627
Age: 50
Predicted Label: 1
