In [1]:
import pandas as pd

df = pd.read_csv('encoded_diabetes_data.csv')
print(df.keys())

Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level', 'diabetes', 'smoking_history_encoded',
       'gender_encoded'],
      dtype='object')


In [4]:
# Extract features (X) – everything except the target column
df_feat = df.drop(columns=['diabetes'])

# Extract target (y) – the column we want to predict
df_target = df[['diabetes']]

print("Feature Variables: ")
print(df_feat.info())  

print("\nTarget Variable: ")
print(df_target.value_counts()) 

Feature Variables: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14618 entries, 0 to 14617
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      14618 non-null  float64
 1   hypertension             14618 non-null  int64  
 2   heart_disease            14618 non-null  int64  
 3   bmi                      14618 non-null  float64
 4   HbA1c_level              14618 non-null  float64
 5   blood_glucose_level      14618 non-null  int64  
 6   smoking_history_encoded  14618 non-null  int64  
 7   gender_encoded           14618 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 913.8 KB
None

Target Variable: 
diabetes
0           10000
1            4618
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split 
import numpy as np 
 
X_train, X_test, y_train, y_test = train_test_split( 
                        df_feat, np.ravel(df_target), 
                test_size = 0.30, random_state = 101) 

In [6]:
from sklearn.svm import SVC 
from sklearn.metrics import classification_report, confusion_matrix
# train the model on train set 
model = SVC() 
model.fit(X_train, y_train) 
  
# print prediction results 
predictions = model.predict(X_test) 
print(classification_report(y_test, predictions)) 

              precision    recall  f1-score   support

           0       0.86      0.89      0.87      3002
           1       0.74      0.68      0.71      1384

    accuracy                           0.82      4386
   macro avg       0.80      0.78      0.79      4386
weighted avg       0.82      0.82      0.82      4386



In [8]:

# apply GridSearchCV
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score

# common hyperparameters to tune
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 
  
# fitting the model for grid search 
grid.fit(X_train, y_train)

# print best parameter after tuning 
print(grid.best_params_) 
  
# print how our model looks after hyper-parameter tuning 
print(grid.best_estimator_) 

grid_predictions = grid.predict(X_test) 
  
# print classification report 
print(classification_report(y_test, grid_predictions)) 


Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.684 total time=   7.1s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.684 total time=   4.1s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.684 total time=   4.4s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.684 total time=   3.8s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.684 total time=   3.7s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.749 total time=   2.6s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.744 total time=   2.6s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.746 total time=   2.6s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.750 total time=   2.7s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.742 total time=   2.7s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.823 total time=   1.4s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf