In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import graphviz
from graphviz import Graph

df = data('iris')
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [2]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test


In [3]:
train, validate, test = train_validate_test_split(df, target='Species', seed=123)

X_train = train.drop(columns=['Species'])
y_train = train.Species

X_validate = validate.drop(columns=['Species'])
y_validate = validate.Species

X_test = test.drop(columns=['Species'])
y_test = test.Species

In [4]:
# for classification you can change the algorithm to gini or entropy (information gain).  
# Default is gini.

clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [5]:
# model.fit(X, y)

clf = clf.fit(X_train, y_train)

In [7]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('iris_decision_tree', view=True)

ExecutableNotFound: failed to execute ['dot', '-Kdot', '-Tpdf', '-O', 'iris_decision_tree'], make sure the Graphviz executables are on your systems' PATH

In [38]:
# make prediction on train obeservations
y_pred = clf.predict(X_train)
y_pred[0:5]

array(['versicolor', 'setosa', 'virginica', 'versicolor', 'setosa'],
      dtype=object)

In [14]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.03703704, 0.96296296],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ]])

In [15]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.96


Create a confusion matrix

    True Positive: number of occurrences where y is true and y is predicted true.
    True Negative: number of occurrences where y is false and y is predicted false.
    False Positive: number of occurrences where y is false and y is predicted true.
    False Negative: number of occurrences where y is true and y is predicted false.


In [16]:
# confusion matrix

confusion_matrix(y_train, y_pred)


array([[28,  0,  0],
       [ 0, 27,  1],
       [ 0,  2, 26]])

In [17]:
y_train.value_counts()

setosa        28
virginica     28
versicolor    28
Name: Species, dtype: int64

In [18]:
import pandas as pd

labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,setosa,versicolor,virginica
setosa,28,0,0
versicolor,0,27,1
virginica,0,2,26


In [19]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        28
  versicolor       0.93      0.96      0.95        28
   virginica       0.96      0.93      0.95        28

    accuracy                           0.96        84
   macro avg       0.96      0.96      0.96        84
weighted avg       0.96      0.96      0.96        84



# Test with out-of-sample data

In [20]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.94


In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [8]:
# Exercises
# Using the titanic data, in your classification-exercises repository, create a notebook, 
# model.ipynb where you will do the following:
titanic = data('titanic')
titanic.head()

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [14]:
train, validate, test = train_validate_test_split(titanic, target= 'sex', seed=123)

In [16]:
titanic.value_counts()

class      age     sex    survived
3rd class  adults  man    no          387
2nd class  adults  man    no          154
1st class  adults  women  yes         140
                   man    no          118
3rd class  adults  women  no           89
2nd class  adults  women  yes          80
3rd class  adults  women  yes          76
                   man    yes          75
1st class  adults  man    yes          57
3rd class  child   man    no           35
                   women  no           17
2nd class  adults  man    yes          14
3rd class  child   women  yes          14
2nd class  child   women  yes          13
           adults  women  no           13
3rd class  child   man    yes          13
2nd class  child   man    yes          11
1st class  child   man    yes           5
           adults  women  no            4
           child   women  yes           1
dtype: int64

In [None]:
# 1) What is your baseline prediction? What is your baseline accuracy? remember: your baseline 
# prediction for a classification problem is predicting the most prevelant class in the training 
# dataset (the mode). When you make those predictions, what is your accuracy? This is your 
# baseline accuracy.

In [11]:
X_train = train.drop(columns=['sex'])
y_train = train.sex

X_validate = validate.drop(columns=['sex'])
y_validate = validate.sex

X_test = test.drop(columns=['sex'])
y_test = test.sex

In [12]:
# for classification you can change the algorithm to gini or entropy (information gain).  
# Default is gini.

clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
# 2) Fit the decision tree classifier to your training sample and transform 
# (i.e. make predictions on the training sample)


In [13]:
# model.fit(X, y)

clf = clf.fit(X_train, y_train)

ValueError: could not convert string to float: '3rd class'

In [None]:
# 3) Evaluate your in-sample results using the model score, confusion matrix, 
# and classification report.


In [None]:
# 4) Compute: Accuracy, true positive rate, false positive rate, true negative rate, 
# false negative rate, precision, recall, f1-score, and support.

In [22]:
# 5) Run through steps 2-4 using a different max_depth value.

In [None]:
# 6) Which model performs better on your in-sample data?


In [None]:
# 7) 
Which model performs best on your out-of-sample data, the validate set?
