Linear SVM over diabetic dataset


In [1]:
# importing required libraries
import numpy as np #nd-arrays
import pandas as pd #read data from datasources
from sklearn.preprocessing import LabelEncoder #Encode numerical varible into categorical variables
from sklearn.preprocessing import StandardScaler #Standardize the data using mean and std
from sklearn.model_selection import train_test_split #split the data into train and test
from sklearn.metrics import accuracy_score,confusion_matrix #evaluate a model
from sklearn.svm import LinearSVC, SVC #develop svm based clssification models
from sklearn.model_selection import GridSearchCV #tune the hyperparamters
from sklearn.metrics import average_precision_score, make_scorer, recall_score #design custom scoring functions

In [13]:
#Reading the data into dataframe
diabetes_df=pd.read_excel('data_file.xlsx')
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2703 entries, 0 to 2702
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   GENDER    2703 non-null   int64  
 1   AGE       2703 non-null   int64  
 2   Height    2703 non-null   int64  
 3   Weight    2703 non-null   float64
 4   BMI       2703 non-null   float64
 5   BAI       2703 non-null   float64
 6   HBA1C1    2703 non-null   float64
 7   OGTT1FBS  2703 non-null   int64  
 8   NDD       2703 non-null   int64  
dtypes: float64(4), int64(5)
memory usage: 190.1 KB


In [4]:
#work on the copy of the dataset
clean_df=diabetes_df.copy()
#Pruning the data
clean_df.drop_duplicates(keep = 'first',inplace=True)
print(clean_df.shape)

(1065, 9)


In [9]:
clean_df.head()


Unnamed: 0,GENDER,AGE,Height,Weight,BMI,BAI,HBA1C1,OGTT1FBS,NDD
0,1,37,156,88.0,36.160421,41.53,5.1,102,0
1,0,35,146,56.0,26.271345,34.15,5.0,91,0
2,1,54,160,76.0,29.6875,28.45,5.4,73,0
3,0,46,154,64.0,26.986001,33.28,6.0,151,0
4,0,70,156,55.0,22.600263,21.52,5.6,142,0


In [10]:
#Assigning labels with appropriate numerics
nondia=0
diabetic=1
#create a dataframe and assign a column to indicate the diabetic status
Ynew = pd.DataFrame(nondia, index=clean_df.index, columns=['diabetic'])
#Identify the diabetic status of each record using the blood test results of FBS or HBA1C1
Ynew.iloc[list(np.where((clean_df.OGTT1FBS>=126) | (clean_df.HBA1C1>=6.5))[0])]=diabetic
#Concatenate the diabetic status with the anthroprometric features of the dataset
data_df=pd.concat([clean_df.iloc[:,:6],Ynew],axis=1)


In [19]:
data_df.head()


Unnamed: 0,GENDER,AGE,Height,Weight,BMI,BAI,diabetic
0,1,37,156,88.0,36.160421,41.53,0
1,0,35,146,56.0,26.271345,34.15,0
2,1,54,160,76.0,29.6875,28.45,0
3,0,46,154,64.0,26.986001,33.28,1
4,0,70,156,55.0,22.600263,21.52,1


In [20]:
#Find the diabetic and non-diabetic patients
diabetic_yes=data_df.iloc[list(np.where(data_df.diabetic==diabetic)[0])]
diabetic_no=data_df.iloc[list(np.where(data_df.diabetic==nondia)[0])]

In [23]:
diabetic_no.describe()

Unnamed: 0,GENDER,AGE,Height,Weight,BMI,BAI,diabetic
count,537.0,537.0,537.0,537.0,537.0,537.0,537.0
mean,0.450652,44.463687,158.217877,68.325885,27.261189,31.687058,0.0
std,0.498023,11.890164,7.930377,13.118822,4.770046,8.276824,0.0
min,0.0,20.0,139.0,33.5,13.671875,14.08,0.0
25%,0.0,35.0,153.0,61.0,24.508946,25.65,0.0
50%,0.0,44.0,158.0,68.0,26.95984,30.14,0.0
75%,1.0,52.0,162.0,76.0,29.757785,38.26,0.0
max,1.0,84.0,186.0,110.0,50.219138,59.76,0.0


In [24]:
diabetic_yes.describe()

Unnamed: 0,GENDER,AGE,Height,Weight,BMI,BAI,diabetic
count,528.0,528.0,528.0,528.0,528.0,528.0,528.0
mean,0.57197,51.8125,160.280303,68.404356,26.640045,29.618864,1.0
std,0.495262,10.921528,7.56233,12.759664,4.809005,7.753363,0.0
min,0.0,25.0,138.0,36.5,15.390454,8.3,1.0
25%,0.0,43.75,156.0,59.0,23.290154,24.79,1.0
50%,1.0,50.0,159.5,68.0,26.527004,28.145,1.0
75%,1.0,59.0,165.0,76.0,29.585799,33.8475,1.0
max,1.0,80.0,186.0,102.0,41.207076,58.25,1.0


In [25]:
train_x, test_x, train_y, test_y = train_test_split(data_df.iloc[:,:6],
 data_df.diabetic,
test_size=0.3, random_state=43)


Normalizing the data using StandardScaler

In [26]:
sc=StandardScaler() # creating an instance for StandardScaler class
train_x=sc.fit_transform(train_x) # estimate mu and sigma for train set and transform
test_x=sc.transform(test_x) # transform the test set

In [29]:
# fetch the diabetic records from train set
diabetic_yes_train=train_x[list(np.where(train_y==diabetic)[0])]
# fetch the non-diabetic records from train set
diabetic_no_train=train_x[list(np.where(train_y==nondia)[0])]
# display the counts for each class
print('non-diabetic=',diabetic_no_train.shape,'diabetic=',diabetic_yes_train.shape)

non-diabetic= (369, 6) diabetic= (376, 6)


In [30]:
# fetch the diabetic records from test set
diabetic_yes_test=test_x[list(np.where(test_y==diabetic)[0])]
# fetch the non-diabetic records from test set
diabetic_no_test=test_x[list(np.where(test_y==nondia)[0])]
# display the counts for each class from test set
print('non-diabetic=',diabetic_no_test.shape,'diabetic=',diabetic_yes_test.shape)

non-diabetic= (168, 6) diabetic= (152, 6)


Evaluation metrics


In [32]:
#evaluate a model using confusion matrix and accuracy score between true and actual
def evaluate(yt,yp):
 cf=confusion_matrix(yt,yp) #estimate confusion matrix
 acc=accuracy_score(yt,yp) # estimate accuracy of the model
 return cf,acc
# Display metrics
def display(yt,yp,model):
 cf,acc = evaluate(yt,yp)
 print('Model=',model,'\ncf=',cf,'\n')#,'\nacc=',acc,'\n')


Linear SVM


In [33]:
#Perform Classification using Linear SVM
lsvc = LinearSVC(random_state=0,C=10,max_iter=100000) # create a LinearSVC instance
lsvc.fit(train_x, train_y) # fit the model for trainset
train_yp=lsvc.predict(train_x) # predict the y for train set
test_yp=lsvc.predict(test_x) # predict the y for test set

In [34]:
# display the results
display(train_y,train_yp,'Linear SVC: Validation')
display(test_y,test_yp,'Linear SVC: Testing')

Model= Linear SVC: Validation 
cf= [[232 137]
 [140 236]] 

Model= Linear SVC: Testing 
cf= [[106  62]
 [ 61  91]] 



In [36]:
lsvc.coef_

array([[ 0.03428288,  0.30612863,  0.47749737, -0.81865936,  0.75756727,
        -0.04616451]])

In [37]:
lsvc.intercept_

array([0.00939524])

In [40]:
# rescaling the coefficients to original scale of the features of X
rescaled_coef=lsvc.coef_/np.sqrt(sc.var_)
rescaled_coef

array([[ 0.03428288,  0.30612863,  0.47749737, -0.81865936,  0.75756727,
        -0.04616451]])

In [39]:
# the intercept in the original feature space
rescaled_intercept=rescaled_coef.dot(sc.mean_.T)+lsvc.intercept_
rescaled_intercept

array([0.00939524])

In [41]:
# display the counts for each class
print('non-diabetic=',diabetic_no_train.shape,'diabetic=',diabetic_yes_train.shape)
display(train_y,train_yp,'Linear SVC: Validation')

non-diabetic= (369, 6) diabetic= (376, 6)
Model= Linear SVC: Validation 
cf= [[232 137]
 [140 236]] 



In [42]:
# identifiy the slacks for each class
non_dia_slacks=(lsvc.coef_.dot(diabetic_yes_train.T)+lsvc.intercept_)
np.sum(non_dia_slacks<0)

140

In [43]:
dia_slacks=(lsvc.coef_.dot(diabetic_no_train.T)+lsvc.intercept_)
np.sum(dia_slacks>0)

137

Hyperparamter tuning using Gridsearchcv


In [44]:
# recall = tp / (tp + fn) = Sensitivity or True Positive Rate / True Negative Rate
# precision = tp / (tp + fp) = Positive predictive value
custom_scorer = {'recall':make_scorer(recall_score, pos_label=diabetic),
 'precision':make_scorer(average_precision_score, pos_label=diabetic)}

In [45]:
custom_scorer

{'recall': make_scorer(recall_score, pos_label=1),
 'precision': make_scorer(average_precision_score, pos_label=1)}

In [None]:
gscv = GridSearchCV(LinearSVC(max_iter=1e7), {'C':[1e-5,1e-4,1e-3,1e-2,1e-1,1,10,100,1000]},
 cv=5,verbose=False,scoring=custom_scorer,refit='recall')
gscv.fit(train_x,train_y)
gscv.best_params_


In [None]:
# display the results
display(train_y,train_yp,'For C=10: Training')
#display(test_y,test_yp,'For C=10: Testing')
#Perform Classification using Linear SVM
lsvc = LinearSVC(random_state=0,C=0.001,max_iter=100000) # create a LinearSVC instance
lsvc.fit(train_x, train_y) # fit the model for trainset
train_yp=lsvc.predict(train_x) # predict the y for train set
test_yp=lsvc.predict(test_x) # predict the y for test set
# display the results
display(train_y,train_yp,'For C=0.001: Training')
#display(test_y,test_yp,'For C=0.001: Testing')