In [37]:
import numpy as np
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

In [None]:
import acquire
import prepare

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Modeling Exercises:

1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

5. Run through steps 2-4 using a different max_depth value.

6. Which model performs better on your in-sample data?

7. Which model performs best on your out-of-sample data, the validate set?

____________________________

In [None]:
train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())
train.head()

In [None]:
#get value counts of survived (1) vs did not survive (0)
train.survived.value_counts()

### #1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [None]:
#create the baseline
train["baseline"] = 0
baseline_accuracy = (train.survived == train.baseline).mean()
print(f'Baseline accuracy is: {baseline_accuracy:.2%}')

_________________________

### #2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
#identify columns that you want to use

#only using the following columns
X_cols = ['pclass', 'fare', 'alone', 'Q', 'S']

#only trying to see who survived or died
y_col = 'survived'

In [None]:
#split your data - train, validate, test

X_train, y_train = train[X_cols], train[y_col]
X_validate, y_validate = validate[X_cols], validate[y_col]
X_test, y_test = test[X_cols], test[y_col]

In [None]:
#identify model 1 
model1 = DecisionTreeClassifier()

In [None]:
#fit model 1 using train data
model1.fit(X_train, y_train)

In [None]:
#get scores for train and validate, not using test yet
print(f'training score: {model1.score(X_train, y_train):.2%}')
print(f'validate score: {model1.score(X_validate, y_validate):.2%}')

In [None]:
#model 1 is the prediction
train['model1'] = model1.predict(X_train)
train.head()

_________________

### #3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
#model 1 score
# code == model1.score(X_train, y_train)
print(f'model1 score: {model1.score(X_train, y_train):.2%}')

In [None]:
# confusion matrix
pd.crosstab(train.survived, train.model1)

In [None]:
#classification report
print(classification_report(train.survived, train.model1, zero_division =True))

__________________

### #4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
#to get percentage, normalize=true
pd.crosstab(train.survived, train.model1, normalize=True)

positive- died (0)
<br>
negative - survived (1)

- **True positive** - 59.96%
- **False Postive** - 11.27% (predict they died, but they lived)
- **True Negative** - 25.96%
- **False Negative** - 1.81% (predict they lived, but they died)

In [None]:
pd.DataFrame(classification_report(train.survived, train.model1, zero_division =True, output_dict=True))

In [None]:
#Precision = TP / (TP+FP)
precision= 298 / (298+61)
precision

In [None]:
#computer is using the (1- survived) as a positive
precision_score(train.survived, train.model1)

In [None]:
#Recall = TP/ (TP+FN)
recall= 298/ (298+9)
recall

In [None]:
#computer is using the (1- survived) as a positive
recall_score(train.survived, train.model1)

In [None]:
#Accuracy = TP+TN/(TP +TN+FN+FP)
accuracy= (298+129) / (298+129+61+9)
accuracy

In [None]:
accuracy_score(train.survived, train.model1)

In [None]:
#OR make classification_report its own value
cr= pd.DataFrame(classification_report(train.survived, train.model1, zero_division =True, output_dict=True))

In [None]:
#call classification variable- column 1
cr['0']

_________________________

### #5. Run through steps 2-4 using a different max_depth value.

In [None]:
model2 = DecisionTreeClassifier(max_depth=1)
model2.fit(X_train, y_train)

plt.figure(figsize=(24,12))

plot_tree(model2, feature_names=X_train.columns.tolist(), class_names=['died', 'survived'])
plt.show()

In [None]:
#max_depth = 1 score
# code == model2.score(X_train, y_train)
print(f'model2 score: {model2.score(X_train, y_train):.2%}')

##### Model 2 gives a score of 63.18% accuracy

In [None]:
model3 = DecisionTreeClassifier(max_depth=3)
model3.fit(X_train, y_train)

plt.figure(figsize=(24,12))

plot_tree(model3, feature_names=X_train.columns.tolist(), class_names=['died', 'survived'])
plt.show()

In [None]:
#max_depth = 3 score
# code == model3.score(X_train, y_train)
print(f'model3 score: {model3.score(X_train, y_train):.2%}')

##### Model 3 gives a score of 69.01% accuracy

____________________

### #6. Which model performs better on your in-sample data?

Model 1 (with default max_depth=0) did the best of all three models with an accuracy of 85.92%

___________________________

### #7 Which model performs best on your out-of-sample data, the validate set?

In [None]:
model1.fit(X_validate, y_validate)

In [None]:
print(f'model1 score: {model1.score(X_validate, y_validate):.2%}')

In [None]:
model2.fit(X_validate, y_validate)

In [None]:
print(f'model2 score: {model2.score(X_validate, y_validate):.2%}')

In [None]:
model3.fit(X_validate, y_validate)

In [None]:
print(f'model3 score: {model3.score(X_validate, y_validate):.2%}')

#### Model 3 has the highest accuracy in validate set

<hr style="border:2px solid blue"> </hr>

# Random Forest Exercises:

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.
<br>

2. Evaluate your results using the model score, confusion matrix, and classification report.
<br>

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
<br>

4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.
<br>

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [31]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report

from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

In [2]:
from pydataset import data
from sklearn.model_selection import train_test_split
import pandas as pd
import acquire
import prepare

In [3]:
train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())

In [4]:
#get to know data
train.shape, validate.shape, test.shape

((497, 12), (214, 12), (178, 12))

In [5]:
#only using the following columns
X_cols = ['pclass', 'fare', 'alone', 'Q', 'S']

#only trying to see who survived or died
y_col = 'survived'

X_train, y_train = train[X_cols], train[y_col]
X_validate, y_validate = validate[X_cols], validate[y_col]
X_test, y_test = test[X_cols], test[y_col]

In [6]:
#look to see if we have nulls or columns to drop
train.info()

#data looks to be ready

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497 entries, 583 to 553
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     497 non-null    int64  
 1   pclass       497 non-null    int64  
 2   sex          497 non-null    object 
 3   age          497 non-null    float64
 4   sibsp        497 non-null    int64  
 5   parch        497 non-null    int64  
 6   fare         497 non-null    float64
 7   class        497 non-null    object 
 8   embark_town  497 non-null    object 
 9   alone        497 non-null    int64  
 10  Q            497 non-null    uint8  
 11  S            497 non-null    uint8  
dtypes: float64(2), int64(5), object(3), uint8(2)
memory usage: 43.7+ KB


In [7]:
#take a look at X_train
X_train.head()

Unnamed: 0,pclass,fare,alone,Q,S
583,1,40.125,1,0,0
337,1,134.5,1,0,0
50,3,39.6875,0,0,1
218,1,76.2917,1,0,0
31,1,146.5208,0,0,0


In [8]:
y_train.head()

583    0
337    1
50     0
218    1
31     1
Name: survived, dtype: int64

#### #1 Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [9]:
#make our thing
clf= RandomForestClassifier(min_samples_leaf = 1, max_depth = 10, random_state= 123)

In [10]:
#fit the thing (ONLY on train set!!)
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

In [11]:
#use the thing (on training set)
y_pred = clf.predict(X_train)

In [None]:
#shows an array of y_predictions
#y_pred

#### #2 Evaluate your results using the model score, confusion matrix, and classification report.

In [12]:
#model score
clf.score(X_train, y_train)

0.8551307847082495

In [14]:
train['model5'] = clf.predict(X_train)
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,Q,S,model5
583,0,1,male,36.0,0,0,40.125,First,Cherbourg,1,0,0,0
337,1,1,female,41.0,0,0,134.5,First,Cherbourg,1,0,0,1
50,0,3,male,7.0,4,1,39.6875,Third,Southampton,0,0,1,0
218,1,1,female,32.0,0,0,76.2917,First,Cherbourg,1,0,0,1
31,1,1,female,29.916875,1,0,146.5208,First,Cherbourg,0,0,0,1


In [19]:
#confusion matrix- created manually with crosstab
pd.crosstab(train.survived, train.model5)

model5,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,288,19
1,53,137


In [20]:
#get percentage to answer question
pd.crosstab(train.survived, train.model5, normalize=True)

model5,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.579477,0.038229
1,0.10664,0.275654


In [22]:
#use confusion matrix code
[tn, fn], [fp,  tp] = confusion_matrix(y_train, y_pred)

In [23]:
tp, tn, fp, fn

(137, 288, 53, 19)

In [13]:
#classification report
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.844575,0.878205,0.855131,0.86139,0.857431
recall,0.938111,0.721053,0.855131,0.829582,0.855131
f1-score,0.888889,0.791908,0.855131,0.840398,0.851814
support,307.0,190.0,0.855131,497.0,497.0


#### #3 Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [53]:
print(f'Random Forest Model 5 Accuracy score is: {clf.score(X_train, y_train):.2%}')
print(f'Random Forest Model 5 Precision score is: {precision_score(train.survived, train.model5):.2%}')
print(f'Random Forest Model 5 Recall score is: {recall_score(train.survived, train.model5):.2%}')
print(f'Random Forest Model 5 F1 score is: {f1_score(train.survived, train.model5):.2%}')


Random Forest Model 5 Accuracy score is: 79.07%
Random Forest Model 5 Precision score is: 87.82%
Random Forest Model 5 Recall score is: 72.11%
Random Forest Model 5 F1 score is: 79.19%


In [34]:
#OR 
rf= pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
rf['1']

precision      0.878205
recall         0.721053
f1-score       0.791908
support      190.000000
Name: 1, dtype: float64

- positive- died (1) 
- negative - survived (0)
<br>

- True positive - 27.57%
- False Postive - 10.66% (predict they lived, but they actually died)
- True Negative - 57.95%
- False Negative - 3.82% (predict they died, but they actually lived)

_________________________

#### #4 Run through steps increasing your min_samples_leaf and decreasing your max_depth

In [43]:
#second RF model
clf= RandomForestClassifier(min_samples_leaf = 3, max_depth = 15, random_state= 123)

In [44]:
#second RF model fit
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=15, min_samples_leaf=3, random_state=123)

In [45]:
#second RF model fit
y_pred = clf.predict(X_train)

In [46]:
#second RF model score
clf.score(X_train, y_train)

0.7907444668008048

In [64]:
#second RF model- insert column
train['model6'] = clf.predict(X_train)
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,Q,S,model5,model6,model7
583,0,1,male,36.0,0,0,40.125,First,Cherbourg,1,0,0,0,0,0
337,1,1,female,41.0,0,0,134.5,First,Cherbourg,1,0,0,1,1,1
50,0,3,male,7.0,4,1,39.6875,Third,Southampton,0,0,1,0,0,0
218,1,1,female,32.0,0,0,76.2917,First,Cherbourg,1,0,0,1,1,1
31,1,1,female,29.916875,1,0,146.5208,First,Cherbourg,0,0,0,1,1,1


In [65]:
#second RF model confusion matrix
pd.crosstab(train.survived, train.model6)

model6,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,280,27
1,64,126


In [67]:
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.813953,0.823529,0.816901,0.818741,0.817614
recall,0.912052,0.663158,0.816901,0.787605,0.816901
f1-score,0.860215,0.734694,0.816901,0.797454,0.812229
support,307.0,190.0,0.816901,497.0,497.0


In [49]:
print(f'Random Forest Model 6 Accuracy score is: {clf.score(X_train, y_train):.2%}')
print(f'Random Forest Model 6 Precision score is: {precision_score(train.survived, train.model6):.2%}')
print(f'Random Forest Model 6 Recall score is: {recall_score(train.survived, train.model6):.2%}')
print(f'Random Forest Model 6 F1 score is: {f1_score(train.survived, train.model6):.2%}')

Random Forest Model 6 Accuracy score is: 79.07%
Random Forest Model 6 Precision score is: 77.92%
Random Forest Model 6 Recall score is: 63.16%
Random Forest Model 6 F1 score is: 69.77%


__________________________________

In [68]:
#third RF model
clf= RandomForestClassifier(min_samples_leaf = 2, max_depth = 20, random_state= 123)

In [69]:
#third RF model fit
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=20, min_samples_leaf=2, random_state=123)

In [70]:
#third RF model fit
y_pred = clf.predict(X_train)

In [71]:
#second RF model score
clf.score(X_train, y_train)

0.8169014084507042

In [72]:
#third RF model- insert column
train['model7'] = clf.predict(X_train)
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,Q,S,model5,model6,model7
583,0,1,male,36.0,0,0,40.125,First,Cherbourg,1,0,0,0,0,0
337,1,1,female,41.0,0,0,134.5,First,Cherbourg,1,0,0,1,1,1
50,0,3,male,7.0,4,1,39.6875,Third,Southampton,0,0,1,0,0,0
218,1,1,female,32.0,0,0,76.2917,First,Cherbourg,1,0,0,1,1,1
31,1,1,female,29.916875,1,0,146.5208,First,Cherbourg,0,0,0,1,1,1


In [73]:
#third RF model confusion matrix
pd.crosstab(train.survived, train.model7)

model7,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,280,27
1,64,126


In [74]:
print(f'Random Forest Model 6 Accuracy score is: {clf.score(X_train, y_train):.2%}')
print(f'Random Forest Model 6 Precision score is: {precision_score(train.survived, train.model6):.2%}')
print(f'Random Forest Model 6 Recall score is: {recall_score(train.survived, train.model6):.2%}')
print(f'Random Forest Model 6 F1 score is: {f1_score(train.survived, train.model6):.2%}')

Random Forest Model 6 Accuracy score is: 81.69%
Random Forest Model 6 Precision score is: 82.35%
Random Forest Model 6 Recall score is: 66.32%
Random Forest Model 6 F1 score is: 73.47%


In [162]:
#third RF model
clf= RandomForestClassifier(min_samples_leaf = 1, max_depth = 13, random_state= 123)

In [163]:
#third RF model fit
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=13, random_state=123)

In [164]:
#third RF model fit
y_pred = clf.predict(X_train)

In [165]:
#second RF model score
clf.score(X_train, y_train)

0.8591549295774648

In [148]:
#third RF model- insert column
train['model7'] = clf.predict(X_train)
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,Q,S,model5,model6,model7
583,0,1,male,36.0,0,0,40.125,First,Cherbourg,1,0,0,0,0,0
337,1,1,female,41.0,0,0,134.5,First,Cherbourg,1,0,0,1,1,1
50,0,3,male,7.0,4,1,39.6875,Third,Southampton,0,0,1,0,0,0
218,1,1,female,32.0,0,0,76.2917,First,Cherbourg,1,0,0,1,1,1
31,1,1,female,29.916875,1,0,146.5208,First,Cherbourg,0,0,0,1,1,1


In [149]:
#third RF model confusion matrix
pd.crosstab(train.survived, train.model7)

model7,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,289,18
1,52,138


In [None]:
print(f'Random Forest Model 6 Accuracy score is: {clf.score(X_train, y_train):.2%}')
print(f'Random Forest Model 6 Precision score is: {precision_score(train.survived, train.model6):.2%}')
print(f'Random Forest Model 6 Recall score is: {recall_score(train.survived, train.model6):.2%}')
print(f'Random Forest Model 6 F1 score is: {f1_score(train.survived, train.model6):.2%}')

________________________

#### #5 What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

- Having run through multiple max_depth and min_sample_leaf:

    - it appears that max_depth=13 is the optimal level, any max_depth above that gives SAME score.
    - min_sample_leaf=1 (defaul) gives the optimal level, any min_sample_leaf above that, will give a lower score.
    
<br>

- clf= RandomForestClassifier(min_samples_leaf = 1, max_depth = 13, random_state= 123) <--- this gives highest score of **85.92%** accuracy in train set