<h1>Diabetes Dataset for Beginners | Machine Learning KAGGLE</h1>

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the dataset from the CSV that sits next to this notebook.
diabetes = pd.read_csv('diabetes.csv')

# Preview five random rows. The draw is seeded (same seed as the train/test
# split) so the preview is identical after Restart Kernel -> Run All.
diabetes.sample(5, random_state = 101)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
129,0,105,84,0,0,27.9,0.741,62,1
143,10,108,66,0,0,32.4,0.272,42,1
45,0,180,66,39,0,42.0,1.893,25,1
750,4,136,70,0,0,31.2,1.182,22,1
640,0,102,86,17,105,29.3,0.695,27,0


In [3]:
# Count NaN entries per column.
# NOTE(review): a zero NaN count does not prove the data is complete. Rows
# previewed earlier in this notebook show 0 for SkinThickness and Insulin, which
# presumably stands in for "not measured"; isna() cannot detect that. Confirm
# against the dataset description.
diabetes.isna().sum() # counts NaN only; 0-valued placeholders are not detected

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
# Preprocessing: build the feature matrix and target vector, hold out a test
# split, then fit a baseline SVM with default hyperparameters.

# Features are every column except the label; the label is 'Outcome'
X = diabetes.drop(columns = 'Outcome')
y = diabetes['Outcome']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 101,
)

# Baseline SVM (default settings) trained on the unscaled features
svm_model = SVC()
svm_model.fit(X_train, y_train)

In [5]:
# Score the baseline SVM on the held-out split

y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)

print(f"Accuracy for Support Vector Machine is: {accuracy:.2f}")

Accuracy for Support Vector Machine is: 0.81


In [6]:
# Hyperparameter search for the SVM: exhaustive grid, 5-fold CV, all cores.
# NOTE(review): per the SVC docs gamma only applies to the 'rbf', 'poly' and
# 'sigmoid' kernels, so the 'linear' rows of this grid repeat the same fit for
# every gamma value.

param_grid = dict(
    C = [0.1, 1, 10, 100],
    kernel = ['linear', 'rbf', 'sigmoid'],
    gamma = [0.001, 0.01, 0.1, 1],
)

grid_search_cv = GridSearchCV(
    estimator = svm_model,
    param_grid = param_grid,
    cv = 5,
    n_jobs = -1,
)
grid_search_cv.fit(X_train, y_train)

# Keep both the winning settings and the estimator refit with them
best_params, best_model = grid_search_cv.best_params_, grid_search_cv.best_estimator_

print("Best parameters:", best_params)

Best parameters: {'C': 10, 'gamma': 0.001, 'kernel': 'linear'}


In [7]:
# Refit an SVM with the hyperparameters reported by the grid search and score
# that tuned model on the held-out split.
# gamma is kept to mirror the reported best params; per the SVC docs it only
# applies to the 'rbf', 'poly' and 'sigmoid' kernels.
svm_model_best_params = SVC(C = 10, gamma = 0.001, kernel = 'linear')
svm_model_best_params.fit(X_train, y_train)

# Predict with the tuned model, not the default `svm_model`, and score those
# predictions rather than the stale `y_pred` left by the baseline cell.
y_pred_best_params = svm_model_best_params.predict(X_test)
accuracy_best_params = accuracy_score(y_test, y_pred_best_params)

# Report the tuned model's accuracy, not the baseline `accuracy` variable.
print(f"Accuracy for Support Vector Machine with Best Params is: {accuracy_best_params:.2f}")

Accuracy for Support Vector Machine with Best Params is: 0.81


In [15]:
# Scale the values, then train and evaluate the tuned SVM on the scaled features

# NOTE(review): this process-wide filter is kept so later cells behave as
# before. It most likely hid the feature-name warning raised when the
# DataFrame-fitted `svm_model` was asked to predict on a NumPy array.
# Consider removing it now that the correct model is used below.
warnings.filterwarnings('ignore', category=UserWarning)

scaler = StandardScaler()

# Fit the scaler on the training split only, then apply the same transform to
# the test split so no test-set statistics leak into training.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm_model_best_params_scaled = SVC(C = 10, gamma = 0.001, kernel = 'linear')
svm_model_best_params_scaled.fit(X_train_scaled, y_train)

# Predict with the model trained on scaled data. The original used
# `svm_model`, which was fitted on unscaled features.
y_pred_best_params_scaled = svm_model_best_params_scaled.predict(X_test_scaled)
accuracy_best_params_scaled = accuracy_score(y_test, y_pred_best_params_scaled)

# Report this cell's own accuracy, not the baseline `accuracy` variable.
print(f"Accuracy for Support Vector Machine with Best Params & Scaled Values is: {accuracy_best_params_scaled:.2f}")

Accuracy for Support Vector Machine with Best Params & Scaled Values is: 0.81


In [20]:
# Baseline K-Nearest Neighbours (k = 2) on the scaled features

knn_model = KNeighborsClassifier(n_neighbors = 2)
knn_model = knn_model.fit(X_train_scaled, y_train)  # fit() returns the estimator itself

# These assignments overwrite the `y_pred` / `accuracy` left by the SVM cells
y_pred = knn_model.predict(X_test_scaled)
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)

print(f"Accuracy for K-Nearest Neighbours is: {accuracy:.2f}")

Accuracy for K-Nearest Neighbours is: 0.72


In [22]:
# Hyperparameter search for KNN on the scaled features (exhaustive grid, 5-fold CV)
# NOTE(review): 'minkowski' with the estimator's default p looks equivalent to
# 'euclidean'; confirm against the KNeighborsClassifier docs before trimming.

param_grid = dict(
    n_neighbors = [1, 3, 5, 7, 9, 11, 15, 20],
    weights = ['uniform', 'distance'],
    metric = ['euclidean', 'manhattan', 'minkowski'],
    algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size = [10, 30, 50],
)

grid_search_cv_knn = GridSearchCV(estimator = knn_model, param_grid = param_grid, cv = 5)
grid_search_cv_knn.fit(X_train_scaled, y_train)

print("Best parameters:", grid_search_cv_knn.best_params_)


Best parameters: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}


In [26]:

# Refit KNN with the hyperparameters reported by the grid search and score
# that tuned model on the held-out (scaled) split.
knn_model_best_params = KNeighborsClassifier(algorithm = 'auto', leaf_size = 10, metric = 'euclidean', n_neighbors = 15, weights = 'distance')

knn_model_best_params.fit(X_train_scaled, y_train)

# Score the tuned model's own predictions. The original passed `y_pred`, which
# still held the baseline KNN predictions, so the baseline accuracy was
# reported a second time.
y_pred_best_params = knn_model_best_params.predict(X_test_scaled)
accuracy_best_params = accuracy_score(y_test, y_pred_best_params)

print(f"Accuracy for K-Nearest Neighbours with Best Params is: {accuracy_best_params:.2f}")

Accuracy for K-Nearest Neighbours with Best Params is: 0.72


In [31]:
# Baseline Logistic Regression (default settings) on the scaled features

log_reg_model = LogisticRegression()
log_reg_model = log_reg_model.fit(X_train_scaled, y_train)  # fit() returns the estimator itself

# These assignments overwrite the `y_pred` / `accuracy` left by the KNN cell
y_pred = log_reg_model.predict(X_test_scaled)
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)

print(f"Accuracy for Logistic Regression is: {accuracy:.2f}")

Accuracy for Logistic Regression is: 0.80


In [36]:
# Hyperparameter search for Logistic Regression on the scaled features
# (exhaustive grid, 5-fold CV, all cores)
# NOTE(review): recent scikit-learn releases deprecate `multi_class`; confirm
# against the installed version before re-running.

param_grid = dict(
    C = [0.01, 0.1, 1, 10, 100],
    solver = ['newton-cg', 'lbfgs', 'saga'],
    multi_class = ['ovr', 'multinomial'],
    max_iter = [100, 200, 300],
)

grid_search_cv_log_reg = GridSearchCV(
    estimator = log_reg_model,
    param_grid = param_grid,
    cv = 5,
    n_jobs = -1,
)
grid_search_cv_log_reg.fit(X_train_scaled, y_train)

print("Best Hyperparameters:", grid_search_cv_log_reg.best_params_)


Best Hyperparameters: {'C': 1, 'max_iter': 100, 'multi_class': 'multinomial', 'solver': 'newton-cg'}


In [37]:
# Refit Logistic Regression with the hyperparameters reported by the grid
# search and score that tuned model on the held-out (scaled) split.
# NOTE(review): recent scikit-learn releases deprecate `multi_class`; confirm
# against the installed version before re-running.
log_reg_model_best_params = LogisticRegression(C = 1, max_iter = 100, multi_class = 'multinomial', solver = 'newton-cg')
log_reg_model_best_params.fit(X_train_scaled, y_train)

y_pred = log_reg_model_best_params.predict(X_test_scaled)
accuracy_best_params = accuracy_score(y_test, y_pred)

# The label now says "with Best Params", matching the tuned SVM and KNN cells.
# The original label was identical to the untuned cell's, so the two results
# could not be told apart.
print(f"Accuracy for Logistic Regression with Best Params is: {accuracy_best_params:.2f}")

Accuracy for Logistic Regression is: 0.80
