In [18]:
########################################
#Classification Metrics
#
#The choice of metrics influences how the
#performance of machine learning algorithms
#is measured and compared. It influences
#how you weight the importance of different
#characteristics in the results and your
#ultimate choice of algorithm.
#
#http://machinelearningmastery.com/metrics-evaluate-machine-learning-algorithms-python/
########################################

# 1. Classification Accuracy

# Classification accuracy is the number of 
# correct predictions made as a ratio of 
# all predictions made.

# This is the most common evaluation metric
# for classification problems; it is also the
# most misused. It is really only suitable when
# there are an equal number of observations in
# each class (which is rarely the case) and when
# all predictions and prediction errors are
# equally important (which is often not the case).

# Cross Validation Classification Accuracy
# Loads the Pima Indians diabetes CSV, runs 10-fold cross validation of a
# logistic regression model and prints the mean and standard deviation of
# the per-fold accuracy.
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
# NOTE(review): confirm this URL is still served; if it is not, point it at
# a local copy of the CSV (the file has no header row, hence `names`).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
# The first eight columns are the input features; the ninth ('class') is the label.
X = array[:,0:8]
Y = array[:,8]
# KFold without shuffling splits the rows in order and is fully deterministic.
# random_state only matters when shuffle=True, and scikit-learn >= 0.24 raises
# a ValueError if it is set while shuffle is False, so it is not passed here.
kfold = model_selection.KFold(n_splits=10)
# Pin the solver: the default changed from 'liblinear' to 'lbfgs' in
# scikit-learn 0.22, which alters the scores on this unscaled data.
model = LogisticRegression(solver='liblinear')
scoring = 'accuracy'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("\n1. Classification Accuracy\n")
# Mean accuracy over the folds, with the standard deviation in parentheses.
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))


# 2. Logarithmic Loss

# Logarithmic loss (or logloss) is a performance 
# metric for evaluating the predictions of 
# probabilities of membership to a given class.

# Cross Validation Classification LogLoss
# Loads the Pima Indians diabetes CSV, runs 10-fold cross validation of a
# logistic regression model and prints the mean and standard deviation of
# the per-fold (negated) logarithmic loss.
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
# NOTE(review): confirm this URL is still served; if it is not, point it at
# a local copy of the CSV (the file has no header row, hence `names`).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
# The first eight columns are the input features; the ninth ('class') is the label.
X = array[:,0:8]
Y = array[:,8]
# KFold without shuffling splits the rows in order and is fully deterministic.
# random_state only matters when shuffle=True, and scikit-learn >= 0.24 raises
# a ValueError if it is set while shuffle is False, so it is not passed here.
kfold = model_selection.KFold(n_splits=10)
# Pin the solver: the default changed from 'liblinear' to 'lbfgs' in
# scikit-learn 0.22, which alters the scores on this unscaled data.
model = LogisticRegression(solver='liblinear')
# scikit-learn scorers follow a "greater is better" convention, so log loss
# is reported negated: the printed value is <= 0 and closer to 0 is better.
scoring = 'neg_log_loss'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("\n2. Logarthmic Loss\n")
# Mean negated log loss over the folds, with the standard deviation in parentheses.
print("Logloss: %.3f (%.3f)" % (results.mean(), results.std()))


# 3. Area under ROC curve

# for binary classification problems

# The AUC represents a model’s ability to discriminate 
# between positive and negative classes. An area of 
# 1.0 represents a model that made all predictions 
# perfectly. An area of 0.5 represents a model as good as random.

# Cross Validation Classification ROC AUC
# Loads the Pima Indians diabetes CSV, runs 10-fold cross validation of a
# logistic regression model and prints the mean and standard deviation of
# the per-fold area under the ROC curve.
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
# NOTE(review): confirm this URL is still served; if it is not, point it at
# a local copy of the CSV (the file has no header row, hence `names`).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
# The first eight columns are the input features; the ninth ('class') is the label.
X = array[:,0:8]
Y = array[:,8]
# KFold without shuffling splits the rows in order and is fully deterministic.
# random_state only matters when shuffle=True, and scikit-learn >= 0.24 raises
# a ValueError if it is set while shuffle is False, so it is not passed here.
kfold = model_selection.KFold(n_splits=10)
# Pin the solver: the default changed from 'liblinear' to 'lbfgs' in
# scikit-learn 0.22, which alters the scores on this unscaled data.
model = LogisticRegression(solver='liblinear')
scoring = 'roc_auc'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("\n3. Area under ROC curve\n")
# Mean AUC over the folds, with the standard deviation in parentheses.
print("AUC: %.3f (%.3f)" % (results.mean(), results.std()))


# 4. Confusion Matrix

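# A handy presentation of a classifier's accuracy: each row is an
# actual class and each column a predicted class (scikit-learn's
# confusion_matrix convention), so the diagonal counts correct predictions.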

# Train/Test Split Classification Confusion Matrix
# Loads the Pima Indians diabetes CSV, holds out 33% of the rows as a test
# set, fits a logistic regression model on the rest and prints the confusion
# matrix of its test-set predictions. (No cross validation is used here.)
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# NOTE(review): confirm this URL is still served; if it is not, point it at
# a local copy of the CSV (the file has no header row, hence `names`).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
# The first eight columns are the input features; the ninth ('class') is the label.
X = array[:,0:8]
Y = array[:,8]
test_size = 0.33
# Fixed seed so the random train/test partition is the same on every run.
seed = 7
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)
# Pin the solver: the default changed from 'liblinear' to 'lbfgs' in
# scikit-learn 0.22, which alters the fitted model on this unscaled data.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
# Arguments are (true labels, predicted labels), in that order.
matrix = confusion_matrix(Y_test, predicted)
print("\n4. Confusion Matrix\n")
print(matrix)


# 5. Classification Report

# The classification_report() function displays
# the precision, recall, f1-score and support for each class.

# Train/Test Split Classification Report
# Loads the Pima Indians diabetes CSV, holds out 33% of the rows as a test
# set, fits a logistic regression model on the rest and prints the
# classification report of its test-set predictions. (No cross validation
# is used here.)
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# NOTE(review): confirm this URL is still served; if it is not, point it at
# a local copy of the CSV (the file has no header row, hence `names`).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
# The first eight columns are the input features; the ninth ('class') is the label.
X = array[:,0:8]
Y = array[:,8]
test_size = 0.33
# Fixed seed so the random train/test partition is the same on every run.
seed = 7
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)
# Pin the solver: the default changed from 'liblinear' to 'lbfgs' in
# scikit-learn 0.22, which alters the fitted model on this unscaled data.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
# Arguments are (true labels, predicted labels), in that order.
report = classification_report(Y_test, predicted)
print("\n5. Classification Report\n")
print(report)


1. Classification Accuracy

Accuracy: 0.770 (0.048)

2. Logarthmic Loss

Logloss: -0.493 (0.047)

3. Area under ROC curve

AUC: 0.824 (0.041)

4. Confusion Matrix

[[141  21]
 [ 41  51]]

5. Classification Report

             precision    recall  f1-score   support

        0.0       0.77      0.87      0.82       162
        1.0       0.71      0.55      0.62        92

avg / total       0.75      0.76      0.75       254

