In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import graphviz
from graphviz import Graph

df = data('iris')
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [2]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test


In [3]:
train, validate, test = train_validate_test_split(df, target='Species', seed=123)

X_train = train.drop(columns=['Species'])
y_train = train.Species

X_validate = validate.drop(columns=['Species'])
y_validate = validate.Species

X_test = test.drop(columns=['Species'])
y_test = test.Species

In [4]:
# for classification you can change the algorithm to gini or entropy (information gain).  
# Default is gini.

clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [5]:
# model.fit(X, y)

clf = clf.fit(X_train, y_train)

In [6]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('iris_decision_tree', view=True)

'iris_decision_tree.pdf'

In [7]:
# make prediction on train obeservations
y_pred = clf.predict(X_train)
y_pred[0:5]

array(['versicolor', 'setosa', 'virginica', 'versicolor', 'setosa'],
      dtype=object)

In [8]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.03703704, 0.96296296],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ]])

In [9]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.96


Create a confusion matrix

    True Positive: number of occurrences where y is true and y is predicted true.
    True Negative: number of occurrences where y is false and y is predicted false.
    False Positive: number of occurrences where y is false and y is predicted true.
    False Negative: number of occurrences where y is true and y is predicted false.


In [10]:
# confusion matrix

confusion_matrix(y_train, y_pred)


array([[28,  0,  0],
       [ 0, 27,  1],
       [ 0,  2, 26]])

In [11]:
y_train.value_counts()

versicolor    28
setosa        28
virginica     28
Name: Species, dtype: int64

In [12]:
import pandas as pd

labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,setosa,versicolor,virginica
setosa,28,0,0
versicolor,0,27,1
virginica,0,2,26


In [13]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        28
  versicolor       0.93      0.96      0.95        28
   virginica       0.96      0.93      0.95        28

    accuracy                           0.96        84
   macro avg       0.96      0.96      0.96        84
weighted avg       0.96      0.96      0.96        84



# Test with out-of-sample data

In [14]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.94


In [15]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [130]:
# Exercises
# Using the titanic data, in your classification-exercises repository, create a notebook, 
# model.ipynb where you will do the following:
from aquire import get_titanic_data
from prepare import train_validate_test_split
df = get_titanic_data()
df = df.drop(columns='deck')
df = df[~ df.age.isna()]
df = df[~ df.embarked.isna()]
df = df.drop(columns=['embarked', 'class', 'passenger_id'])
df.head() 

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,3,male,22.0,1,0,7.25,Southampton,0
1,1,1,female,38.0,1,0,71.2833,Cherbourg,0
2,1,3,female,26.0,0,0,7.925,Southampton,1
3,1,1,female,35.0,1,0,53.1,Southampton,0
4,0,3,male,35.0,0,0,8.05,Southampton,1


In [131]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     712 non-null    int64  
 1   pclass       712 non-null    int64  
 2   sex          712 non-null    object 
 3   age          712 non-null    float64
 4   sibsp        712 non-null    int64  
 5   parch        712 non-null    int64  
 6   fare         712 non-null    float64
 7   embark_town  712 non-null    object 
 8   alone        712 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 55.6+ KB


In [132]:
df.pclass.value_counts()

3    355
1    184
2    173
Name: pclass, dtype: int64

In [133]:
def split(df, stratify_by= 'pclass'):
    """
    Crude train, validate, test split
    To stratify, send in a column name
    """
    
    if stratify_by == 'pclass':
        train, test = train_test_split(df, test_size=.3, random_state=123)
        train, validate = train_test_split(df, test_size=.3, random_state=123)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
        train, validate = train_test_split(df, test_size=.3, random_state=123, stratify=train[stratify_by])
    
    return train, validate, test

In [134]:
# 1) What is your baseline prediction? What is your baseline accuracy? remember: your baseline 
# prediction for a classification problem is predicting the most prevelant class in the training 
# dataset (the mode). When you make those predictions, what is your accuracy? This is your 
# baseline accuracy.

In [135]:
# aquire
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,3,male,22.0,1,0,7.25,Southampton,0
1,1,1,female,38.0,1,0,71.2833,Cherbourg,0
2,1,3,female,26.0,0,0,7.925,Southampton,1
3,1,1,female,35.0,1,0,53.1,Southampton,0
4,0,3,male,35.0,0,0,8.05,Southampton,1


In [136]:
df["is_female"] = df.sex == "Female"

In [137]:
dummy_df = pd.get_dummies(df[["embark_town"]], drop_first=True)
dummy_df

Unnamed: 0,embark_town_Queenstown,embark_town_Southampton
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
885,1,0
886,0,1
887,0,1
889,0,0


In [138]:
df = pd.concat([df, dummy_df], axis=1)

In [139]:
df = df.drop(columns=['sex', 'embark_town'])
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,0,False,0,1
1,1,1,38.0,1,0,71.2833,0,False,0,0
2,1,3,26.0,0,0,7.925,1,False,0,1
3,1,1,35.0,1,0,53.1,0,False,0,1
4,0,3,35.0,0,0,8.05,1,False,0,1


In [142]:
train, validate, test = split(df, stratify_by="pclass")
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton
605,0,3,36.0,1,0,15.55,0,False,0,1
197,0,3,42.0,0,1,8.4042,0,False,0,1
56,1,2,21.0,0,0,10.5,1,False,0,1
645,1,1,48.0,1,0,76.7292,0,False,0,0
356,1,1,22.0,0,1,55.0,0,False,0,1


In [141]:
# We will be making a tree model that will predict survival on the titanic that prefroms better on the baseline 

In [143]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['pclass'])
y_train = train.pclass # labeled data == supervise algorithm

X_validate = validate.drop(columns=['pclass'])
y_validate = validate.pclass

X_test = test.drop(columns=['pclass'])
y_test = test.pclass

In [123]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton
605,0,3,36.0,1,0,15.55,0,False,0,1
197,0,3,42.0,0,1,8.4042,0,False,0,1
56,1,2,21.0,0,0,10.5,1,False,0,1
645,1,1,48.0,1,0,76.7292,0,False,0,0
356,1,1,22.0,0,1,55.0,0,False,0,1


In [124]:
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [125]:
# Now let's train our model on the training data
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=3, random_state=123)

In [144]:
# Visualize the model so iut can explain itself!
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

In [90]:
# Now we'll make a set of predictions using this trained model
y_pred = clf.predict(X_train)
y_pred[0:3]

array([3, 3, 2])

In [91]:
# Estimate the probabilities for each class
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:3]

array([[0.        , 0.18333333, 0.81666667],
       [0.        , 0.        , 1.        ],
       [0.        , 0.88888889, 0.11111111]])

In [92]:
y_train.head(3)

605    3
197    3
56     2
Name: pclass, dtype: int64

In [95]:
# Baseline prediction = 3
# Baseline accuracy = 3
train["most_frequent"] = 3
baseline_accuracy = (train.pclass == train.most_frequent).mean()
baseline_accuracy 

0.4919678714859438

In [96]:
# 2) Fit the decision tree classifier to your training sample and transform 
# (i.e. make predictions on the training sample)
# model.fit(X, y)


In [97]:
# 3) Evaluate your in-sample results using the model score, confusion matrix, 
# and classification report.

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

print(classification_report(y_train, y_pred))

Accuracy of Decision Tree classifier on training set: 0.86
              precision    recall  f1-score   support

           1       0.88      0.98      0.93       134
           2       0.71      0.81      0.75       119
           3       0.95      0.82      0.88       245

    accuracy                           0.86       498
   macro avg       0.84      0.87      0.85       498
weighted avg       0.87      0.86      0.86       498



In [98]:
# Out of sample data
clf.score(X_validate, y_validate)

0.8644859813084113

In [99]:
# Let's evaluate this model on out-of-sample data
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.86


In [100]:
# Use the classification model trained on train data to make predictions on validate data
y_pred = clf.predict(X_validate)
y_pred[0:3]

array([1, 3, 1])

In [101]:
y_validate.head(3)

509    3
640    3
537    1
Name: pclass, dtype: int64

In [102]:
# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           1       0.91      0.98      0.94        50
           2       0.76      0.76      0.76        54
           3       0.90      0.86      0.88       110

    accuracy                           0.86       214
   macro avg       0.85      0.87      0.86       214
weighted avg       0.86      0.86      0.86       214



In [None]:
# 4) Compute: Accuracy, true positive rate, false positive rate, true negative rate, 
# false negative rate, precision, recall, f1-score, and support.

In [22]:
# 5) Run through steps 2-4 using a different max_depth value.

In [70]:
clf = DecisionTreeClassifier(max_depth=4, random_state=123)

In [71]:
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=4, random_state=123)

In [72]:
# Visualize the model so iut can explain itself!
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

In [73]:
# Now we'll make a set of predictions using this trained model
y_pred = clf.predict(X_train)
y_pred[0:3]

array([3, 3, 2])

In [74]:
# Estimate the probabilities for each class
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:3]

array([[0.        , 0.12244898, 0.87755102],
       [0.        , 0.        , 1.        ],
       [0.        , 0.96296296, 0.03703704]])

In [75]:
y_train.head(3)

605    3
197    3
56     2
Name: pclass, dtype: int64

In [76]:
# Baseline prediction = 3
# Baseline accuracy = 3
train["most_frequent"] = 3
baseline_accuracy = (train.pclass == train.most_frequent).mean()
baseline_accuracy 

0.4919678714859438

In [79]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

print(classification_report(y_train, y_pred))
# This model is overfit.

Accuracy of Decision Tree classifier on training set: 0.92
              precision    recall  f1-score   support

           1       0.96      0.96      0.96       134
           2       0.86      0.86      0.86       119
           3       0.94      0.94      0.94       245

    accuracy                           0.92       498
   macro avg       0.92      0.92      0.92       498
weighted avg       0.92      0.92      0.92       498



In [80]:
# Out off sample data 
clf.score(X_validate, y_validate)

0.9065420560747663

In [81]:
# Let's evaluate this model on out-of-sample data
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.91


In [84]:
# Use the classification model trained on train data to make predictions on validate data
y_pred = clf.predict(X_validate)
y_pred[0:3]

array([1, 3, 1])

In [85]:
y_validate.head(3)

509    3
640    3
537    1
Name: pclass, dtype: int64

In [3]:
# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

NameError: name 'classification_report' is not defined

In [103]:
# 6) Which model performs better on your in-sample data?

In [105]:
# The Model with a max_depth = 3 does better with in sample data 

In [None]:
# 7) Which model performs best on your out-of-sample data, the validate set?
# The Model with a max_depth = 4 does better on out of sample data 