# Acquire Titanic Data and Prep

In [7]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import env
import acquire

In [8]:
def get_connection(db, user=env.user, host=env.host, password=env.password):
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'

def get_titanic():
    my_query="SELECT * FROM passengers"
    df = pd.read_sql(my_query, get_connection('titanic_db'))
    return df

In [9]:
df = get_titanic()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [10]:
# drop rows where age or embarked is null, drop column 'deck', drop passenger_id
def prep_titanic(df):
    '''
    take in titanc dataframe, remove all rows where age or embarked is null, 
    get dummy variables for sex and embark_town, 
    and drop sex, deck, passenger_id, class, and embark_town. 
    '''
    df = (
        df[(df.age.notna()) & (df.embarked.notna())].
        drop(columns=['deck', 'passenger_id', 'class']))
    dummy_df = (
        pd.get_dummies(df[['sex', 'embark_town']], prefix=['sex', 'embark']))
    df = (
        pd.concat([df, dummy_df.drop(columns=['sex_male'])], axis=1).
        drop(columns=['sex', 'embark_town']))
    return df

In [23]:
df = prep_titanic(df)
df.head()

KeyError: "['deck' 'passenger_id' 'class'] not found in axis"

In [24]:
# dropped embarked column, because we can see where people embarked using the dummy columns.
df = df.drop(columns='embarked')
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_female,embark_Cherbourg,embark_Queenstown,embark_Southampton
0,0,3,22.0,1,0,7.25,0,0,0,0,1
1,1,1,38.0,1,0,71.2833,0,1,1,0,0
2,1,3,26.0,0,0,7.925,1,1,0,0,1
3,1,1,35.0,1,0,53.1,0,1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,0,0,1


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   survived             712 non-null    int64  
 1   pclass               712 non-null    int64  
 2   age                  712 non-null    float64
 3   sibsp                712 non-null    int64  
 4   parch                712 non-null    int64  
 5   fare                 712 non-null    float64
 6   alone                712 non-null    int64  
 7   sex_female           712 non-null    uint8  
 8   embark_Cherbourg     712 non-null    uint8  
 9   embark_Queenstown    712 non-null    uint8  
 10  embark_Southampton   712 non-null    uint8  
 11  baseline_prediction  712 non-null    int64  
dtypes: float64(2), int64(6), uint8(4)
memory usage: 52.8 KB


In [27]:
def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [28]:
train, validate, test = train_validate_test_split(df, target='survived')
train.head()
# What are our observations?
# Each row is an observation, and each observation is a person on the titanic. 
# In tidy data, every row is an observation

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_female,embark_Cherbourg,embark_Queenstown,embark_Southampton
450,0,2,36.0,1,2,27.75,0,0,0,0,1
543,1,2,32.0,1,0,26.0,0,0,0,0,1
157,0,3,30.0,0,0,8.05,1,0,0,0,1
462,0,1,47.0,0,0,38.5,1,0,0,0,1
397,0,2,46.0,0,0,26.0,1,0,0,0,1


In [29]:
print(train.shape, validate.shape, test.shape)

(398, 11) (171, 11) (143, 11)


# Model Exercises

### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [30]:
# since survived is our target variable, we will be looking at its value counts to create a baseline. 
df.survived.value_counts()
# 0 is most frequent, so that will be our baseline. 

0    424
1    288
Name: survived, dtype: int64

In [31]:
df.isnull().sum()

survived              0
pclass                0
age                   0
sibsp                 0
parch                 0
fare                  0
alone                 0
sex_female            0
embark_Cherbourg      0
embark_Queenstown     0
embark_Southampton    0
dtype: int64

In [34]:
df['baseline_prediction'] = 0
df.head(2)


Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_female,embark_Cherbourg,embark_Queenstown,embark_Southampton,baseline_prediction
0,0,3,22.0,1,0,7.25,0,0,0,0,1,0
1,1,1,38.0,1,0,71.2833,0,1,1,0,0,0


In [36]:
# baseline accuracy
baseline_accuracy = (df.baseline_prediction == df.survived).mean()
print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline accuracy: 59.55%


### 2.Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [40]:
test.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_female,embark_Cherbourg,embark_Queenstown,embark_Southampton
178,0,2,30.0,0,0,13.0,1,0,0,0,1
722,0,2,34.0,0,0,13.0,1,0,0,0,1
200,0,3,28.0,0,0,9.5,1,0,0,0,1
539,1,1,22.0,0,2,49.5,0,1,1,0,0
79,1,3,30.0,0,0,12.475,1,1,0,0,1


In [41]:
# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

### Train Model

In [51]:
clf = DecisionTreeClassifier(max_depth=5, random_state=123)

### Fit The Model

In [52]:
clf = clf.fit(X_train, y_train)

### Visualize a Decision Tree

In [53]:
import graphviz
from graphviz import Graph

dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree2', view=True)

'titanic_decision_tree2.pdf'

### Make Predictions

In [54]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 0, 0])

In [55]:
### Estimate Probability

In [56]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.85714286, 0.14285714],
       [0.85714286, 0.14285714],
       [0.94230769, 0.05769231],
       [0.52173913, 0.47826087],
       [0.85714286, 0.14285714]])

In [57]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_female,embark_Cherbourg,embark_Queenstown,embark_Southampton
450,0,2,36.0,1,2,27.75,0,0,0,0,1
543,1,2,32.0,1,0,26.0,0,0,0,0,1
157,0,3,30.0,0,0,8.05,1,0,0,0,1
462,0,1,47.0,0,0,38.5,1,0,0,0,1
397,0,2,46.0,0,0,26.0,1,0,0,0,1


### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report

### Evaluate the model

In [58]:
# Compute Accuracy
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.85


In [59]:
# Confusion Matrix
confusion_matrix(y_train, y_pred)

array([[233,   4],
       [ 56, 105]])

In [60]:
y_train.value_counts()

0    237
1    161
Name: survived, dtype: int64

In [63]:
y_train.head()

450    0
543    1
157    0
462    0
397    0
Name: survived, dtype: int64

In [62]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,233,4
1,56,105


### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [65]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.98      0.89       237
           1       0.96      0.65      0.78       161

    accuracy                           0.85       398
   macro avg       0.88      0.82      0.83       398
weighted avg       0.87      0.85      0.84       398



In [66]:
# Accuracy of model when run on the validate data
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.78


In [67]:
# And since accuracy isn't everything

# Produce y_predictions that come from the X_validate
y_pred = clf.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.96      0.84       102
           1       0.90      0.52      0.66        69

    accuracy                           0.78       171
   macro avg       0.82      0.74      0.75       171
weighted avg       0.81      0.78      0.77       171



### 6. Which model performs better on your in-sample data?



Model with a depth of 5

### 7. Which model performs best on your out-of-sample data, the validate set?



Model with a depth of 5

# Random Forest Exercises

### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [69]:
from sklearn.ensemble import RandomForestClassifier

In [70]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [71]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

In [72]:
print(rf.feature_importances_)

[0.12594762 0.24461643 0.03325895 0.03532853 0.23544138 0.01988453
 0.25891885 0.02498201 0.0067361  0.01488559]


In [74]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_female,embark_Cherbourg,embark_Queenstown,embark_Southampton
450,0,2,36.0,1,2,27.75,0,0,0,0,1
543,1,2,32.0,1,0,26.0,0,0,0,0,1
157,0,3,30.0,0,0,8.05,1,0,0,0,1
462,0,1,47.0,0,0,38.5,1,0,0,0,1
397,0,2,46.0,0,0,26.0,1,0,0,0,1


In [79]:
y_pred = rf.predict(X_train)
y_pred[:5]

array([0, 1, 0, 0, 0])

In [78]:
y_pred_proba = rf.predict_proba(X_train)
y_pred_proba[:5]

array([[0.88721905, 0.11278095],
       [0.49432273, 0.50567727],
       [0.88497635, 0.11502365],
       [0.86302561, 0.13697439],
       [0.94245517, 0.05754483]])

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [80]:
print('Accuracy of random forest classifier on training set: {:.2f}' .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.96


In [81]:
print(confusion_matrix(y_train, y_pred))

[[237   0]
 [ 15 146]]


In [82]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       237
           1       1.00      0.91      0.95       161

    accuracy                           0.96       398
   macro avg       0.97      0.95      0.96       398
weighted avg       0.96      0.96      0.96       398



Accuracy of random forest classifier on test set: 0.78


### 3.Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [85]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.78
