# Simple classification of diabetes data with scikit-learn

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# load PIMA indiants diabetes dataset
diabetes = pd.read_csv('pima-indians-diabetes.csv')

In [3]:
diabetes

Unnamed: 0,1. Number of times pregnant,2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test,3. Diastolic blood pressure (mm Hg),4. Triceps skin fold thickness (mm),5. 2-Hour serum insulin (mu U/ml),6. Body mass index (weight in kg/(height in m)^2),7. Diabetes pedigree function,8. Age (years),9. Class variable (0 or 1)
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
# rename columns to short names
diabetes.columns = [
    "NumTimesPrg", "PlGlcConc", "BloodP",
    "SkinThick", "TwoHourSerIns", "BMI",
    "DiPedFunc", "Age", "HasDiabetes"]

In [5]:
# fill zero values with the mean of the column values
for col in ['PlGlcConc', 'BloodP', 'SkinThick', 'TwoHourSerIns', 'BMI']:
    median_val = diabetes[col].mean()
    diabetes[col] = diabetes[col].replace(to_replace=0, value=median_val)

In [6]:
# split to train/test set
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(diabetes, test_size=0.05, random_state=42)

In [7]:
train_set

Unnamed: 0,NumTimesPrg,PlGlcConc,BloodP,SkinThick,TwoHourSerIns,BMI,DiPedFunc,Age,HasDiabetes
78,0,131.0,69.105469,20.536458,79.799479,43.2,0.270,26,1
482,4,85.0,58.000000,22.000000,49.000000,27.8,0.306,28,0
456,1,135.0,54.000000,20.536458,79.799479,26.7,0.687,62,0
30,5,109.0,75.000000,26.000000,79.799479,36.0,0.546,60,0
635,13,104.0,72.000000,20.536458,79.799479,31.2,0.465,38,1
...,...,...,...,...,...,...,...,...,...
71,5,139.0,64.000000,35.000000,140.000000,28.6,0.411,26,0
106,1,96.0,122.000000,20.536458,79.799479,22.4,0.207,27,0
270,10,101.0,86.000000,37.000000,79.799479,45.6,1.136,38,1
435,0,141.0,69.105469,20.536458,79.799479,42.4,0.205,29,1


In [8]:
# split to train/test inputs/labels
train_set_labels = train_set["HasDiabetes"].copy()
train_set = train_set.drop("HasDiabetes", axis=1)
test_set_labels = test_set["HasDiabetes"].copy()
test_set = test_set.drop("HasDiabetes", axis=1)

In [9]:
# scale input data, learn scaling parameters from train set
from sklearn.preprocessing import MinMaxScaler as Scaler
scaler = Scaler()
scaler.fit(train_set)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [10]:
train_set_scaled = scaler.transform(train_set)
test_set_scaled = scaler.transform(test_set)

In [11]:
# convert to arrays
X_train = train_set_scaled
X_test = test_set_scaled

y_train = np.asarray(train_set_labels)
y_test = np.asarray(test_set_labels)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score

In [13]:
# Prepare an array with all the algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVC', SVC()))
models.append(('LSVC', LinearSVC()))
models.append(('RFC', RandomForestClassifier()))
models.append(('DTR', DecisionTreeRegressor()))
models

[('LR',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False)),
 ('KNN',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                       metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                       weights='uniform')),
 ('NB', GaussianNB(priors=None, var_smoothing=1e-09)),
 ('SVC',
  SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
      max_iter=-1, probability=False, random_state=None, shrinking=True,
      tol=0.001, verbose=False)),
 ('LSVC', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
            intercept_scaling=1, loss='squared_hi

In [14]:
names = []
scores = []
for name, model_classic in models:
    model_classic.fit(X_train, y_train)
    y_pred = model_classic.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)

tr_split = pd.DataFrame({'Name': names, 'Score': scores})
tr_split

Unnamed: 0,Name,Score
0,LR,0.717949
1,KNN,0.794872
2,NB,0.692308
3,SVC,0.717949
4,LSVC,0.769231
5,RFC,0.692308
6,DTR,0.692308
