In [4]:
# read the data into a pandas DataFrame
import pandas as pd
path = 'data/pima-indians-diabetes.data'
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']

pima = pd.read_csv(path, header=None, names=col_names)
pima

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [5]:
feature_cols = ['pregnant', 'insulin', 'bmi', 'age']
X = pima[feature_cols]
y = pima.label

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
logreg = LogisticRegression()
logreg.fit(X_train,y_train)
answers= logreg.predict(X_test)
scores= cross_val_score(logreg, cv=10,X=X_test,y= y_test)

In [34]:
scores.mean()

0.6928947368421052

##  How to know how much we can trust this score?

1 : **Classification accuracy:** percentage of correct predictions

In [37]:
from sklearn import metrics
print(metrics.accuracy_score(y_test, answers))

0.6770833333333334


2: **Null accuracy:** accuracy that could be achieved by always predicting the most frequent class

In [38]:
y_test.value_counts()

0    130
1     62
Name: label, dtype: int64

In [39]:
y_test.mean()

0.3229166666666667

In [40]:
1 - y_test.mean()

0.6770833333333333

In [41]:
# calculate null accuracy (for binary classification problems coded as 0/1)
max(y_test.mean(), 1 - y_test.mean())

0.6770833333333333

In [42]:
# calculate null accuracy (for multi-class classification problems)
y_test.value_counts().head(1) / len(y_test)

0    0.677083
Name: label, dtype: float64

3: Comparing the **true** and **predicted** response values

In [44]:
print('True:', y_test.values[0:25])
print('Pred:', answers[0:25])

True: [1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0]
Pred: [0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


**Conclusion:**

- Classification accuracy is the **easiest classification metric to understand**
- But, it does not tell you the **underlying distribution** of response values
- And, it does not tell you what **"types" of errors** your classifier is making

## Confusion matrix

Table that describes the performance of a classification model

  **Basic terminology**

- **True Positives (TP):** we *correctly* predicted that they *do* have diabetes
- **True Negatives (TN):** we *correctly* predicted that they *don't* have diabetes
- **False Positives (FP):** we *incorrectly* predicted that they *do* have diabetes (a "Type I error")
- **False Negatives (FN):** we *incorrectly* predicted that they *don't* have diabetes (a "Type II error")

In [85]:
# IMPORTANT: first argument is true values, second argument is predicted values
from sklearn import metrics

tn,fp,fn,tp= metrics.confusion_matrix(y_test, answers).ravel()
print('\t  predicted 0\t predicted 1')
print('actual 0  ',z[0][0],'\t \t ',z[0][1])
print('actual 1  ',z[1][0],'\t \t ',z[1][1])


	  predicted 0	 predicted 1
actual 0   114 	 	  16
actual 1   46 	 	  16
