In [2]:
# Load libraries
import pandas
import numpy as np
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC



# Load the diabetes dataset
 

In [3]:
# Load dataset
url = "C:\\HU\DSML Workshop\\Classification\\Hands-on\\pima-indians-diabetes_data.txt"
names = ['timespregnant', 'glucose', 'bp', 'skinfold','seruminsolin','bmi','pedigree','age','class']
dataset = pandas.read_csv(url, names=names)

# print the dimension of dataset
print(dataset.shape)

(768, 9)


# Look into your data

In [None]:
# head
print(dataset.head(20))

# descriptions
print(dataset.describe())

# class distribution
print(dataset.groupby('class').size())


## plot scatter plot matrix

In [None]:
# scatter plot matrix
scatter_matrix(dataset)
plt.show()

## Patition the dataset into train (66%) and test (33%)

In [6]:
# Split-out validation dataset
array = dataset.values
X = array[:,0:dataset.shape[1]-1]
Y = array[:,dataset.shape[1]-1]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

print(X_train.shape)
print(Y_train.shape)

print(X_validation.shape)
print(Y_validation.shape)

(614, 8)
(614,)
(154, 8)
(154,)


## Learn a decision tree classifier from training dataset

In [7]:
# with train-test split
DT = DecisionTreeClassifier()
DT.fit(X_train, Y_train)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

## Make predictions on test dataset

In [8]:
predictions = DT.predict(X_validation)
print(predictions)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  1.  0.  0.  0.  1.
  0.  0.  1.  0.  1.  0.  0.  1.  0.  1.  1.  0.  0.  0.  0.  1.  0.  1.
  1.  0.  1.  0.  1.  1.  1.  0.  0.  0.  1.  1.  0.  1.  0.  0.  1.  0.
  1.  1.  1.  0.  0.  0.  1.  1.  0.  1.  1.  1.  0.  1.  1.  0.  1.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  1.  0.
  1.  0.  1.  0.  1.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  1.  0.  0.
  0.  1.  0.  0.  0.  0.  1.  1.  0.  0.  1.  1.  0.  0.  0.  0.  0.  1.
  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.
  0.  1.  1.  0.  1.  0.  0.  1.  0.  0.]


## Evalutate Classifier accuracy

In [9]:
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

0.746753246753
[[79 18]
 [21 36]]
             precision    recall  f1-score   support

        0.0       0.79      0.81      0.80        97
        1.0       0.67      0.63      0.65        57

avg / total       0.74      0.75      0.75       154



## Perform 10-fold cross validation and compute accuracy 

In [10]:
# Cross Validation
seed = 7
scoring = 'accuracy'
kfold = model_selection.KFold(n_splits=20, random_state=seed)
cv_results = model_selection.cross_val_score(DT, X_train, Y_train, cv=kfold, scoring=scoring)
print(cv_results)
print(sum(cv_results)/20)

[ 0.67741935  0.5483871   0.67741935  0.74193548  0.61290323  0.70967742
  0.67741935  0.74193548  0.74193548  0.74193548  0.58064516  0.70967742
  0.61290323  0.80645161  0.66666667  0.8         0.66666667  0.63333333
  0.66666667  0.8       ]
0.690698924731


In [11]:
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
kfold = model_selection.KFold(n_splits=10, random_state=seed)
cv_results = model_selection.cross_val_score(DT, X_train, Y_train, cv=kfold, scoring=scoring)


# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    
   

LR: 0.776864 (0.060738)
LDA: 0.773559 (0.058283)
KNN: 0.710153 (0.064599)
CART: 0.680883 (0.054520)
NB: 0.750820 (0.050575)
SVM: 0.656293 (0.044581)
