##### imports

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC, OneClassSVM
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# Support Vector Machines
## Classification

##### read data

In [2]:
diabetes_data = pd.read_csv('data/diabetic_data.csv')
diabetes_data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


##### check all dtypes

In [3]:
diabetes_data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

##### separate attributes and targets from the dataset

In [4]:
diabetes_data_attr = diabetes_data.drop('readmitted', axis=1)
diabetes_data_trg = diabetes_data['readmitted']

##### make attr One-Hot Encoding for Categorical Data (pandas.get_dummies)

In [5]:
diabetes_data_attr =pd.get_dummies(diabetes_data_attr)

##### check the new shape of the data

In [6]:
diabetes_data_attr.shape

(101766, 2472)

##### scaling the attributes

In [7]:
scalar = MinMaxScaler()
diabetes_data_attr_scaled = scalar.fit_transform(diabetes_data_attr)

##### check the scaling is ok -> min must be 0 and max must be 1

In [8]:
diabetes_data_attr_scaled.min(axis=0), diabetes_data_attr_scaled.max(axis=0)

(array([0., 0., 0., ..., 0., 0., 0.]), array([1., 1., 1., ..., 1., 1., 1.]))

##### split the data to use just a part of it, couse SVM is slow function

In [9]:
all_data, _, all_targets, _ = train_test_split(diabetes_data_attr_scaled, diabetes_data_trg, train_size=0.1)

##### this time split for train and test set

In [10]:
diab_train, diab_test, targ_train, targ_test = train_test_split(all_data, all_targets, train_size=0.2, stratify=all_targets)

##### create SVM (Linear SVC) and train it

In [11]:
svm = LinearSVC(C = 10, max_iter=1e5)
svm.fit(diab_train, targ_train)

LinearSVC(C=10, max_iter=100000.0)

##### check the coef -> similar to logistic regression

In [12]:
svm.coef_

array([[-0.08527889,  0.22850809, -0.22344474, ...,  0.17837784,
         0.1572933 , -0.16443653],
       [-0.75167839,  0.60097078,  0.04597542, ..., -0.12826211,
        -0.24078219,  0.16922543],
       [ 0.76821091, -0.57289241,  0.05787692, ...,  0.0153979 ,
         0.14200224, -0.13018017]])

##### create gaussian svm

In [13]:
gaussian_svm = SVC(kernel='rbf', C=1e5)
gaussian_svm.fit(diab_train, targ_train)

SVC(C=100000.0)

##### classification_report with train set

In [14]:
print(classification_report(targ_train, svm.predict(diab_train)))

              precision    recall  f1-score   support

         <30       0.84      0.59      0.69       233
         >30       0.79      0.75      0.77       704
          NO       0.82      0.90      0.86      1098

    accuracy                           0.81      2035
   macro avg       0.82      0.75      0.77      2035
weighted avg       0.81      0.81      0.81      2035



In [15]:
print(classification_report(targ_train, gaussian_svm.predict(diab_train)))

              precision    recall  f1-score   support

         <30       1.00      1.00      1.00       233
         >30       1.00      1.00      1.00       704
          NO       1.00      1.00      1.00      1098

    accuracy                           1.00      2035
   macro avg       1.00      1.00      1.00      2035
weighted avg       1.00      1.00      1.00      2035



##### classification_report with test set

In [16]:
print(classification_report(targ_test, svm.predict(diab_test)))

              precision    recall  f1-score   support

         <30       0.14      0.10      0.12       932
         >30       0.41      0.39      0.40      2816
          NO       0.59      0.63      0.61      4393

    accuracy                           0.49      8141
   macro avg       0.38      0.38      0.38      8141
weighted avg       0.48      0.49      0.48      8141



In [17]:
print(classification_report(targ_test, gaussian_svm.predict(diab_test)))

              precision    recall  f1-score   support

         <30       0.17      0.13      0.15       932
         >30       0.41      0.41      0.41      2816
          NO       0.59      0.63      0.61      4393

    accuracy                           0.49      8141
   macro avg       0.39      0.39      0.39      8141
weighted avg       0.48      0.49      0.49      8141



##### create Kneighbors

In [18]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(diab_train, targ_train)

KNeighborsClassifier()

##### predict Kneighbors

In [19]:
knn.predict(diab_train)

array(['>30', 'NO', '<30', ..., 'NO', 'NO', 'NO'], dtype=object)

##### classification_report with train set

In [20]:
print(classification_report(targ_train, knn.predict(diab_train)))

              precision    recall  f1-score   support

         <30       0.48      0.33      0.39       233
         >30       0.57      0.63      0.60       704
          NO       0.74      0.74      0.74      1098

    accuracy                           0.66      2035
   macro avg       0.60      0.57      0.58      2035
weighted avg       0.65      0.66      0.65      2035



##### classification_report with test set

In [21]:
print(classification_report(targ_test, knn.predict(diab_test)))

              precision    recall  f1-score   support

         <30       0.13      0.08      0.10       932
         >30       0.37      0.40      0.38      2816
          NO       0.57      0.59      0.58      4393

    accuracy                           0.47      8141
   macro avg       0.36      0.36      0.36      8141
weighted avg       0.45      0.47      0.46      8141



##### let see how to detect abnormal data

In [22]:
anomaly_detector = OneClassSVM(nu=0.02)
anomaly_detector.fit(diab_train)

OneClassSVM(nu=0.02)

In [23]:
predictions = anomaly_detector.predict(diab_train)
predictions.sum() / len(predictions)

0.9488943488943489