<a href="https://colab.research.google.com/github/manhili/ML-Assignments/blob/main/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Support Vector Machines**

# **Importing necessary libraries**

In [2]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# **Importing dataset**

In [3]:
# Remote CSV holding the diabetes dataset; read straight into a DataFrame.
url = "https://raw.githubusercontent.com/manhili/Machine-Learning/main/diabetes.csv"
data = pd.read_csv(filepath_or_buffer=url)

# **Exploring Data**


In [4]:
# (number of rows, number of columns) of the loaded frame
data.shape

(768, 9)

In [5]:
# Per-column dtypes, to check how each column was parsed
data.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [6]:
# Preview the first ten rows
data.head(n=10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [7]:
# Count NaN entries per column.
# NOTE(review): this only detects NaN; the preview above shows 0 values in
# columns such as BloodPressure, SkinThickness, Insulin and BMI, which are
# presumably placeholders for missing measurements - confirm and handle if so.
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
# Preview the first five rows (the default for head())
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
# Labels: the 'Outcome' column
y = data['Outcome']
# Features: every remaining column
X = data.drop(columns=['Outcome'])

# **Splitting Data**

In [10]:
# Hold out 30% of the rows for testing (70% for training);
# the fixed seed keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# **Generating Models using default Hyperparameters**

**Linear Kernel**

In [11]:
# Unfitted SVC with all-default settings
clf = svm.SVC()

# Linear-kernel SVM fitted on the training split (fit() returns the estimator)
clf_linear = svm.SVC(kernel='linear').fit(X_train, y_train)

# Predicted labels for the held-out test split
y_pred1 = clf_linear.predict(X_test)

**Polynomial Kernel**

In [12]:
# Polynomial-kernel SVM; only the kernel is overridden.
# Fit on the training split, then predict the test split.
clf_poly = svm.SVC(kernel='poly').fit(X_train, y_train)
y_pred2 = clf_poly.predict(X_test)

**RBF Kernel**

In [13]:
# RBF-kernel SVM with explicit C and gamma.
# NOTE(review): gamma=0.1 is not scikit-learn's default ('scale') even though
# this section is labelled "default Hyperparameters", and the classification
# report further down shows this model never predicting class 1 - confirm the
# value is intended.
clf_rbf = svm.SVC(kernel='rbf', C=1, gamma=0.1).fit(X_train, y_train)
y_pred3 = clf_rbf.predict(X_test)

# **Evaluating Models**

In [14]:
# Test-set predictions of each kernel, paired positionally with a display name
predictions = [y_pred1, y_pred2, y_pred3]
kernels = ['linear','polynomial', 'RBF']

# Print the test accuracy of every kernel
for y_hat, kernel_name in zip(predictions, kernels):
  kernel_accuracy = metrics.accuracy_score(y_test, y_hat)
  print("Accuracy for {} kernel:".format(kernel_name), kernel_accuracy)

Accuracy for linear kernel: 0.7445887445887446
Accuracy for polynomial kernel: 0.7489177489177489
Accuracy for RBF kernel: 0.6536796536796536


In [15]:
# Per-class precision / recall / F1 for every kernel.
# zero_division=0 reports 0.0 for a class that is never predicted (the RBF
# model predicts no positives) instead of emitting an UndefinedMetricWarning
# for each affected metric; the reported numbers are unchanged.
for y_hat, kernel_name in zip(predictions, kernels):
  report = classification_report(y_test, y_hat, zero_division=0)
  print("Classification report for {} kernel:\n\n".format(kernel_name), report)

Classification report for linear kernel:

               precision    recall  f1-score   support

           0       0.80      0.81      0.81       151
           1       0.63      0.62      0.63        80

    accuracy                           0.74       231
   macro avg       0.72      0.72      0.72       231
weighted avg       0.74      0.74      0.74       231

Classification report for polynomial kernel:

               precision    recall  f1-score   support

           0       0.77      0.88      0.82       151
           1       0.69      0.50      0.58        80

    accuracy                           0.75       231
   macro avg       0.73      0.69      0.70       231
weighted avg       0.74      0.75      0.74       231

Classification report for RBF kernel:

               precision    recall  f1-score   support

           0       0.65      1.00      0.79       151
           1       0.00      0.00      0.00        80

    accuracy                           0.65       23

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


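Based on the test-set results above, the SVM with the polynomial kernel has the highest accuracy (0.749), narrowly ahead of the linear kernel (0.745); the RBF kernel trails at 0.654 and never predicts class 1.

Let's do some hyperparameter tuning to improve the models.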

# **Hyperparameters Tuning**

In [16]:
# Repeated stratified 10-fold cross-validation (3 repeats, fixed seed) used to
# score every hyperparameter combination.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Candidate values for the regularisation strength C and the kernel coefficient gamma
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}

# Search over a fresh default SVC instead of the notebook-level `clf`: a later
# cell rebinds `clf` to a linear-kernel model, so re-running this cell after it
# would silently tune a different estimator (and gamma has no effect on a
# linear kernel). A fresh SVC() keeps this cell independent of execution order.
grid = GridSearchCV(svm.SVC(), param_grid, scoring='accuracy', n_jobs=-1, cv=cv)

# Fit every candidate on the training split only; the test split stays untouched
grid.fit(X_train, y_train)

# Mean cross-validated accuracy of the best candidate and its settings
print('Best Score: %s' % grid.best_score_)
print('Best Hyperparameters: %s' % grid.best_params_)

Best Score: 0.7692289774050779
Best Hyperparameters: {'C': 10, 'gamma': 0.0001}


In [None]:
# Repeated stratified 10-fold cross-validation (3 repeats, fixed seed)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Compare the three kernels, leaving every other SVC setting at its default
param_grid = {'kernel': ['poly', 'linear', 'rbf']}

# Use a fresh default SVC rather than the notebook-level `clf`, which a later
# cell rebinds with C=10 and a linear kernel; otherwise re-running this cell
# out of order would compare kernels at a different C than on a clean run.
grid = GridSearchCV(svm.SVC(), param_grid, scoring='accuracy', n_jobs=-1, cv=cv)

# Fit every candidate on the training split only
grid.fit(X_train, y_train)

# Mean cross-validated accuracy of the best kernel and its name
print('Best Score: %s' % grid.best_score_)
print('Best Hyperparameters: %s' % grid.best_params_)

# **After Tuning Hyperparameters**

In [None]:
# Final model: linear kernel with C=10.
# `gamma` is intentionally not passed: SVC only uses it for the 'rbf', 'poly'
# and 'sigmoid' kernels, so supplying it with kernel='linear' had no effect on
# the fitted model and wrongly suggested it had been tuned for this kernel.
# NOTE(review): C=10 comes from the C/gamma grid search, which on a clean
# top-to-bottom run is performed on the default (RBF) SVC - confirm it is also
# the intended value for the linear kernel.
clf = svm.SVC(kernel='linear', C=10)

# Train on the training split
clf.fit(X_train, y_train)

# Predicted labels for the held-out test split
y_pred = clf.predict(X_test)

In [None]:
 # Fraction of test-set rows whose predicted label matches the true label
 test_accuracy = metrics.accuracy_score(y_test, y_pred)
 print("Accuracy: ", test_accuracy)

Accuracy:  0.7359307359307359
