# Lesson 14: Evaluation Metrics

## 5. Accuracy

In [7]:
#
# In this and the following exercises, you'll be adding train test splits to the data
# to see how it changes the performance of each classifier
#
# The code provided will load the Titanic dataset like you did in project 0, then train
# a decision tree (the method you used in your project) and a Bayesian classifier (as
# discussed in the introduction videos). You don't need to worry about how these work for
# now. 
#
# What you do need to do is import a train/test split, train the classifiers on the
# training data, and store the resulting accuracy scores in the dictionary provided.

import numpy as np
import pandas as pd

# Read the Titanic CSV and keep only its numeric columns
X = pd.read_csv('datasets/titanic_data.csv')._get_numeric_data()
# Lift the label column out of the inputs (pop returns it and removes it in place)
y = X.pop('Survived')
# Age is dropped from the inputs as well, due to missing data
del X['Age']

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.

# Hold out a test set using train_test_split's default settings.
train, test, train_lb, test_lb = train_test_split(X, y)

# The decision tree classifier: fit on the training split, score on the held-out split.
clf1 = DecisionTreeClassifier()
clf1.fit(train, train_lb)
tree_accuracy = accuracy_score(test_lb, clf1.predict(test))
# Concatenate instead of passing two arguments: under a Python 2 kernel
# print("a", b) prints a tuple rather than a plain line.
print("Decision Tree has accuracy: " + str(tree_accuracy))

# The naive Bayes classifier, trained and scored on the same split.
clf2 = GaussianNB()
clf2.fit(train, train_lb)
nb_accuracy = accuracy_score(test_lb, clf2.predict(test))
print("GaussianNB has accuracy: " + str(nb_accuracy))

# Store the accuracies measured on the test split.
answer = {
 "Naive Bayes Score": nb_accuracy,
 "Decision Tree Score": tree_accuracy
}

('Decision Tree has accuracy: ', '0.650224215247')
('GaussianNB has accuracy: ', '0.659192825112')


## 15. Build a confusion matrix

In [15]:
# In this exercise, we'll use the Titanic dataset as before, train two classifiers and
# look at their confusion matrices. Your job is to create a train/test split in the data
# and report the results in the dictionary at the bottom.

import numpy as np
import pandas as pd

# Load the dataset
from sklearn import datasets

# Numeric columns of the Titanic data only
X = pd.read_csv('datasets/titanic_data.csv')._get_numeric_data()

# Separate the labels from the inputs, then drop Age from the inputs too
y = X.pop('Survived')
del X['Age']


from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (already used by the accuracy exercise).
from sklearn.model_selection import train_test_split

# TODO: split the data into training and testing sets,
# using the default settings for train_test_split (or test_size = 0.25 if specified).
# Then, train and test the classifiers with your newly split data instead of X and y.
train, test, train_lb, test_lb = train_test_split(X, y)


# Decision tree: compute the test-set confusion matrix once and reuse it below.
clf1 = DecisionTreeClassifier()
clf1.fit(train, train_lb)
tree_confusion = confusion_matrix(test_lb, clf1.predict(test))
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("Confusion matrix for this Decision Tree:\n{}".format(tree_confusion))

# Gaussian naive Bayes, evaluated on the same held-out split.
clf2 = GaussianNB()
clf2.fit(train, train_lb)
nb_confusion = confusion_matrix(test_lb, clf2.predict(test))
print("GaussianNB confusion matrix:\n{}".format(nb_confusion))

#TODO: store the confusion matrices on the test sets below

# Each key maps to the matrix of the classifier it names
# (clf2 is naive Bayes, clf1 is the decision tree).
confusions = {
 "Naive Bayes": nb_confusion,
 "Decision Tree": tree_confusion
}

Confusion matrix for this Decision Tree:
[[97 41]
 [39 46]]
GaussianNB confusion matrix:
[[121  17]
 [ 49  36]]


## 26. Precision vs. Recall 

In [18]:
# As with the previous exercises, let's look at the performance of a couple of classifiers
# on the familiar Titanic dataset. Add a train/test split, then store the results in the
# dictionary provided.

import numpy as np
import pandas as pd

# Load the dataset, restricted to its numeric columns
X = pd.read_csv('datasets/titanic_data.csv')._get_numeric_data()

# Labels come out of the input frame; Age is removed from the inputs as well
y = X.pop('Survived')
del X['Age']


from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (already used by the accuracy exercise).
from sklearn.model_selection import train_test_split


# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
train, test, train_lb, test_lb = train_test_split(X, y)

# Decision tree: predict once on the held-out split and reuse the predictions
# for both metrics, the printout and the results dictionary.
clf1 = DecisionTreeClassifier()
clf1.fit(train, train_lb)
tree_pred = clf1.predict(test)
tree_recall = recall(test_lb, tree_pred)
tree_precision = precision(test_lb, tree_pred)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("Decision Tree recall: {:.2f} and precision: {:.2f}".format(tree_recall, tree_precision))

# Gaussian naive Bayes, evaluated on the same split.
clf2 = GaussianNB()
clf2.fit(train, train_lb)
nb_pred = clf2.predict(test)
nb_recall = recall(test_lb, nb_pred)
nb_precision = precision(test_lb, nb_pred)
print("GaussianNB recall: {:.2f} and precision: {:.2f}".format(nb_recall, nb_precision))

# Test-set recall and precision for each classifier.
results = {
  "Naive Bayes Recall": nb_recall,
  "Naive Bayes Precision": nb_precision,
  "Decision Tree Recall": tree_recall,
  "Decision Tree Precision": tree_precision
}

Decision Tree recall: 0.54 and precision: 0.55
GaussianNB recall: 0.40 and precision: 0.67


## 29. Compute F1 Scores

In [19]:
# As usual, use a train/test split to get a reliable F1 score from two classifiers, and
# save the scores in the provided dictionary.

import numpy as np
import pandas as pd

# Load the dataset and keep the numeric columns
X = pd.read_csv('datasets/titanic_data.csv')._get_numeric_data()

# Take the labels out of the inputs, and remove Age from the inputs too
y = X.pop('Survived')
del X['Age']

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (already used by the accuracy exercise).
from sklearn.model_selection import train_test_split

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.

x_train, x_test, y_train, y_test = train_test_split(X, y)

# Decision tree: compute the test-set F1 once and reuse it below.
clf1 = DecisionTreeClassifier()
clf1.fit(x_train, y_train)
tree_f1 = f1_score(y_test, clf1.predict(x_test))
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("Decision Tree F1 score: {:.2f}".format(tree_f1))

# Gaussian naive Bayes, evaluated on the same split.
clf2 = GaussianNB()
clf2.fit(x_train, y_train)
nb_f1 = f1_score(y_test, clf2.predict(x_test))
print("GaussianNB F1 score: {:.2f}".format(nb_f1))

# Each key maps to the score of the classifier it names
# (clf2 is naive Bayes, clf1 is the decision tree).
F1_scores = {
 "Naive Bayes": nb_f1,
 "Decision Tree": tree_f1
}

Decision Tree F1 score: 0.49
GaussianNB F1 score: 0.47


## 32. Compute Mean Absolute Error 

In [20]:
import numpy as np
import pandas as pd

# Load the dataset
from sklearn.datasets import load_linnerud

# Fetch the Linnerud bundle, then unpack its data and target arrays together
linnerud_data = load_linnerud()
X, y = linnerud_data.data, linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (already used by the accuracy exercise).
from sklearn.model_selection import train_test_split

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the regressors with your newly split data instead of X and y.
x_train, x_test, y_train, y_test = train_test_split(X, y)

# Decision tree regressor: compute the test-set error once and reuse it below.
reg1 = DecisionTreeRegressor()
reg1.fit(x_train, y_train)
tree_mae = mae(y_test, reg1.predict(x_test))
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("Decision Tree mean absolute error: {:.2f}".format(tree_mae))

# Linear regression, evaluated on the same split.
reg2 = LinearRegression()
reg2.fit(x_train, y_train)
linear_mae = mae(y_test, reg2.predict(x_test))
print("Linear regression mean absolute error: {:.2f}".format(linear_mae))

# Mean absolute error of each regressor on the test split.
results = {
 "Linear Regression": linear_mae,
 "Decision Tree": tree_mae
}

Decision Tree mean absolute error: 7.87
Linear regression mean absolute error: 10.78


## 34. Compute Mean Squared Error

In [21]:
import numpy as np
import pandas as pd

# Load the dataset
from sklearn.datasets import load_linnerud

# Load the Linnerud bundle; its data and target arrays become X and y
linnerud_data = load_linnerud()
X, y = linnerud_data.data, linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (already used by the accuracy exercise).
from sklearn.model_selection import train_test_split

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the regressors with your newly split data instead of X and y.
x_train, x_test, y_train, y_test = train_test_split(X, y)

# Decision tree regressor: compute the test-set error once and reuse it below.
reg1 = DecisionTreeRegressor()
reg1.fit(x_train, y_train)
tree_mse = mse(y_test, reg1.predict(x_test))
# The metric here is mean squared error, so the label says so.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("Decision Tree mean squared error: {:.2f}".format(tree_mse))

# Linear regression, evaluated on the same split.
reg2 = LinearRegression()
reg2.fit(x_train, y_train)
linear_mse = mse(y_test, reg2.predict(x_test))
print("Linear regression mean squared error: {:.2f}".format(linear_mse))

# Mean squared error of each regressor on the test split.
results = {
 "Linear Regression": linear_mse,
 "Decision Tree": tree_mse
}

Decision Tree mean absolute error: 1167.47
Linear regression mean absolute error: 339.76
