# Simple classification of diabetes data with scikit-learn

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the Pima Indians diabetes dataset from the CSV file in the working directory.
# NOTE(review): read_csv treats the first row as a header by default; if this
# file has no header row, the first record is consumed as column names — confirm.
diabetes = pd.read_csv(filepath_or_buffer='pima-indians-diabetes.csv')

In [None]:
# show the freshly loaded table as this cell's output
diabetes

In [None]:
# Replace the loaded column labels with compact names (positional, nine columns;
# the last one, "HasDiabetes", is the target used further down).
diabetes.columns = [
    "NumTimesPrg",
    "PlGlcConc",
    "BloodP",
    "SkinThick",
    "TwoHourSerIns",
    "BMI",
    "DiPedFunc",
    "Age",
    "HasDiabetes",
]

In [None]:
# Fill zero entries in the measurement columns with the column mean.
# The mean is taken over the non-zero entries only: the zeros are the values
# being replaced (presumably placeholders for missing measurements), so
# including them would drag the fill value towards zero.
# If a column contains only zeros the mean is NaN and the zeros become NaN.
# NOTE(review): the mean is computed on the full table, before the train/test
# split, so test rows contribute to the fill value — confirm this is acceptable.
for col in ['PlGlcConc', 'BloodP', 'SkinThick', 'TwoHourSerIns', 'BMI']:
    nonzero_values = diabetes.loc[diabetes[col] != 0, col]
    mean_val = nonzero_values.mean()
    diabetes[col] = diabetes[col].replace(to_replace=0, value=mean_val)

In [None]:
# Hold out 5% of the rows as a test set; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(
    diabetes,
    test_size=0.05,
    random_state=42,
)

In [None]:
# inspect the training split (the "HasDiabetes" label column is still included here)
train_set

In [None]:
# Separate the target column from the feature columns in both splits.
# The labels are copied out first, then the column is dropped from the inputs.
train_set_labels = train_set.loc[:, "HasDiabetes"].copy()
train_set = train_set.drop(columns=["HasDiabetes"])
test_set_labels = test_set.loc[:, "HasDiabetes"].copy()
test_set = test_set.drop(columns=["HasDiabetes"])

In [None]:
# Min-max scaling of the inputs: the scaling parameters are learned from the
# training inputs only, so nothing from the test set influences them.
from sklearn.preprocessing import MinMaxScaler as Scaler

scaler = Scaler()
scaler.fit(X=train_set)

In [None]:
# apply the scaling fitted on the training inputs to both splits
train_set_scaled, test_set_scaled = (
    scaler.transform(split) for split in (train_set, test_set)
)

In [None]:
# Conventional X/y names for the model code: the scaled inputs are used as they
# are, the label Series are converted to NumPy arrays.
X_train, X_test = train_set_scaled, test_set_scaled

y_train, y_test = (
    np.asarray(labels) for labels in (train_set_labels, test_set_labels)
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Candidate classifiers to compare, as (short label, unfitted estimator) pairs.
# All use their default hyper-parameters.
# The decision tree entry is a DecisionTreeClassifier: the task is binary
# classification scored with accuracy_score, and a regressor returns continuous
# predictions, which accuracy_score does not accept as class labels.
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('SVC', SVC()),
    ('LSVC', LinearSVC()),
    ('RFC', RandomForestClassifier()),
    ('DTC', DecisionTreeClassifier()),
]
models

In [None]:
# Fit every candidate on the training data and record its accuracy on the
# held-out test set; the results are gathered into a table, one row per model.
names, scores = [], []
for name, model_classic in models:
    # fit() returns the estimator itself, so training and prediction can be chained
    y_pred = model_classic.fit(X_train, y_train).predict(X_test)
    scores += [accuracy_score(y_test, y_pred)]
    names += [name]

tr_split = pd.DataFrame(data={'Name': names, 'Score': scores})
tr_split