# Classification - Model Exercise

In [1]:
from acquire import get_titanic_data, get_iris_data
from prepare import prep_titanic, split_titanic, min_max_scale_titanic, prep_iris

from collections import namedtuple
from copy import deepcopy

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

import adalib

In [2]:
class DataSet(object):
    """Mutable container bundling one modeling experiment's data and results.

    Every attribute starts as ``None``; the notebook cells fill them in as the
    data is loaded, split, fitted, and evaluated.
    """

    def __init__(self):
        # --- data ---
        self.df = None        # full prepared dataframe
        self.train = None     # training split
        self.test = None      # test split
        self.xcols = None     # list of feature column names fed to the model

        # --- fitted model and its predictions ---
        self.model = None
        # Value returned alongside the predictions by adalib's decision-tree
        # and KNN fit helpers and handed back to their evaluate helpers.
        # Declared here so every instance has the attribute even before a fit.
        self.classes = None
        self.pred_train = None
        self.pred_test = None

        # --- evaluation slots (not populated anywhere in the visible cells) ---
        self.score = None
        self.confmatrix = None
        self.classrep = None
        self.precision = None
        self.recall = None
        self.f1 = None
        self.support = None

## Logistic Regression

In [3]:
# Logistic-regression experiment: fetch the Titanic data and run it through
# the project's prep step in a single expression.
lr_titanic = DataSet()
lr_titanic.df = prep_titanic(get_titanic_data())

In [4]:
# Partition the prepared frame into train and test splits using the project's
# split helper (imported from prepare).
lr_titanic.train, lr_titanic.test = split_titanic(lr_titanic.df)



In [5]:
# Replace both splits with the frames returned by the scaling helper
# (min-max scaling, per the helper's name — see prepare.py for details).
lr_titanic.train, lr_titanic.test = min_max_scale_titanic(lr_titanic.train, lr_titanic.test)

1. Fit the logistic regression classifier to your training sample and transform, i.e. make predictions on the training sample

In [6]:
# Feature columns used as model inputs (the target column is `survived`).
lr_titanic.xcols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "alone",
    "embarked_encode",
    "sex_encode",
]

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [7]:
# Fit logistic regression with balanced class weights and a fixed seed, then
# predict on both splits. The third return value (bound as the fitted model in
# the newton-cg run further down) is not needed here and is discarded.
logreg_preds_train, logreg_preds_test, _ = adalib.logreg_fit_and_predict(
    lr_titanic.train[lr_titanic.xcols],
    lr_titanic.train.survived,
    lr_titanic.test[lr_titanic.xcols],
    lr_titanic.test.survived,
    class_weight="balanced",
    random_state=123,
)

print("TRAIN EVALUATION")
adalib.logreg_evaluate_model(lr_titanic.train.survived, logreg_preds_train)

print("TEST EVALUATION")
adalib.logreg_evaluate_model(lr_titanic.test.survived, logreg_preds_test)

TRAIN EVALUATION
Accuracy: 0.7875751503006012

Confusion matrix:
          Pred -  Pred +
Actual -     247      49
Actual +      57     146

Classification report:
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       296
           1       0.75      0.72      0.73       203

   micro avg       0.79      0.79      0.79       499
   macro avg       0.78      0.78      0.78       499
weighted avg       0.79      0.79      0.79       499


Rates:
True positive rate: 0.7192118226600985
False positive rate: 0.2512820512820513

True negative rate: 0.8344594594594594
False negative rate: 0.1875

TEST EVALUATION
Accuracy: 0.7581395348837209

Confusion matrix:
          Pred -  Pred +
Actual -      96      32
Actual +      20      67

Classification report:
              precision    recall  f1-score   support

           0       0.83      0.75      0.79       128
           1       0.68      0.77      0.72        87

   micro avg       0.76   



4. Look in the scikit-learn documentation to research the solver parameter. What is your best option(s) for the particular problem you are trying to solve and the data to be used?

liblinear because this is a small dataset.

5. Run through steps 2-4 using another solver (from question 4)

In [8]:
# Same features and target as the first logistic-regression run, but with the
# newton-cg solver; the fitted model is kept as `newton_model`.
# NOTE(review): this call passes neither class_weight="balanced" nor
# random_state=123, both of which the first run used, so the two runs differ
# in more than the solver — confirm that is intended before comparing them.
newton_preds_train, newton_preds_test, newton_model = adalib.logreg_fit_and_predict(
    lr_titanic.train[lr_titanic.xcols],
    lr_titanic.train.survived,
    lr_titanic.test[lr_titanic.xcols],
    lr_titanic.test.survived,
    solver="newton-cg",
)

print("TRAIN EVALUATION")
adalib.logreg_evaluate_model(lr_titanic.train.survived, newton_preds_train)

print("TEST EVALUATION")
adalib.logreg_evaluate_model(lr_titanic.test.survived, newton_preds_test)

TRAIN EVALUATION
Accuracy: 0.8016032064128257

Confusion matrix:
          Pred -  Pred +
Actual -     258      38
Actual +      61     142

Classification report:
              precision    recall  f1-score   support

           0       0.81      0.87      0.84       296
           1       0.79      0.70      0.74       203

   micro avg       0.80      0.80      0.80       499
   macro avg       0.80      0.79      0.79       499
weighted avg       0.80      0.80      0.80       499


Rates:
True positive rate: 0.6995073891625616
False positive rate: 0.2111111111111111

True negative rate: 0.8716216216216216
False negative rate: 0.19122257053291536

TEST EVALUATION
Accuracy: 0.7767441860465116

Confusion matrix:
          Pred -  Pred +
Actual -     101      27
Actual +      21      66

Classification report:
              precision    recall  f1-score   support

           0       0.83      0.79      0.81       128
           1       0.71      0.76      0.73        87

   micro avg 

6. Which performs better on your in-sample data?

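newton-cg, but not by much (about 0.01 more accurate on the training data: 0.802 vs 0.788). Note that the first model was also fit with class_weight="balanced", so the two runs differ in more than the solver.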

7. Save the best model in logit_fit

In [9]:
# Keep the newton-cg model as the saved logistic-regression fit.
logit_fit = newton_model

## Decision Tree

### Iris Data

In [10]:
# Empty container for the iris decision-tree experiment (gini run).
iris = DataSet()

In [11]:
# Load and prep iris, then take a 70/30 split stratified on species with a
# fixed seed so the partition is repeatable.
iris.df = prep_iris(get_iris_data())
iris.train, iris.test = train_test_split(
    iris.df,
    test_size=0.3,
    random_state=123,
    stratify=iris.df[["species"]],
)
iris.xcols = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [12]:
# Fit a decision tree on iris with the helper's default impurity criterion and
# predict on both splits. random_state is pinned (same seed the Titanic tree
# uses) so the fitted tree, and therefore the reported test metrics, are
# reproducible across reruns.
iris.pred_train, iris.pred_test, iris.classes, iris.model = adalib.dectree_fit_and_predict(
    iris.train[iris.xcols],
    iris.train.species,
    iris.test[iris.xcols],
    iris.test.species,
    random_state=123,
)

print("TRAIN EVALUATION")
adalib.dectree_evaluate_model(iris.train.species, iris.pred_train, iris.classes)

print("TEST EVALUATION")
adalib.dectree_evaluate_model(iris.test.species, iris.pred_test, iris.classes)

TRAIN EVALUATION
Accuracy: 1.0

Confusion matrix:
                   Pred setosa  Pred versicolor  Pred virginica
Actual setosa               35                0               0
Actual versicolor            0               35               0
Actual virginica             0                0              35

Classification report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       1.00      1.00      1.00        35
   virginica       1.00      1.00      1.00        35

   micro avg       1.00      1.00      1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105


True setosa rate:  1.000
False setosa rate:  0.000

True versicolor rate:  1.000
False versicolor rate:  0.000

True virginica rate:  1.000
False virginica rate:  0.000

TEST EVALUATION
Accuracy: 0.8666666666666667

Confusion matrix:
                   Pred setosa  Pred versicolor  Pred 

4. Run through steps 2-4 using entropy as your measure of impurity.

In [13]:
# Second iris container for the entropy run. It points at the very same
# dataframe, split, and column-list objects as `iris` (no copies are made).
iris_entropy = DataSet()
iris_entropy.df, iris_entropy.train, iris_entropy.test = iris.df, iris.train, iris.test
iris_entropy.xcols = iris.xcols

In [14]:
# Fit a decision tree on iris using entropy as the impurity measure and
# predict on both splits. random_state is pinned (same seed the Titanic tree
# uses) so the fitted tree, and therefore the reported test metrics, are
# reproducible across reruns.
(
    iris_entropy.pred_train,
    iris_entropy.pred_test,
    iris_entropy.classes,
    iris_entropy.model,
) = adalib.dectree_fit_and_predict(
    iris_entropy.train[iris_entropy.xcols],
    iris_entropy.train.species,
    iris_entropy.test[iris_entropy.xcols],
    iris_entropy.test.species,
    criterion="entropy",
    random_state=123,
)

print("TRAIN EVALUATION")
adalib.dectree_evaluate_model(iris_entropy.train.species, iris_entropy.pred_train, iris_entropy.classes)

print("TEST EVALUATION")
adalib.dectree_evaluate_model(iris_entropy.test.species, iris_entropy.pred_test, iris_entropy.classes)

TRAIN EVALUATION
Accuracy: 1.0

Confusion matrix:
                   Pred setosa  Pred versicolor  Pred virginica
Actual setosa               35                0               0
Actual versicolor            0               35               0
Actual virginica             0                0              35

Classification report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       1.00      1.00      1.00        35
   virginica       1.00      1.00      1.00        35

   micro avg       1.00      1.00      1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105


True setosa rate:  1.000
False setosa rate:  0.000

True versicolor rate:  1.000
False versicolor rate:  0.000

True virginica rate:  1.000
False virginica rate:  0.000

TEST EVALUATION
Accuracy: 0.9111111111111111

Confusion matrix:
                   Pred setosa  Pred versicolor  Pred 

5. Which performs better on your in-sample data?

They both perform equally well on the training data. 100% accuracy

TODO: I need to run this on the Titanic data and then, if I have time, create a function to do the decision tree and analysis, then run the test data through the model.

6. Save the best model in tree_fit

In [15]:
# Keep the entropy-criterion iris tree as the saved decision-tree fit.
tree_fit = iris_entropy.model

### Titanic Data

In [16]:
# Decision-tree (gini) experiment: fetch the Titanic data and run it through
# the project's prep step in a single expression.
dt_titanic = DataSet()
dt_titanic.df = prep_titanic(get_titanic_data())

In [17]:
# Split into train/test and feed the pair straight into the scaling helper;
# only the frames it returns are stored.
dt_titanic.train, dt_titanic.test = min_max_scale_titanic(*split_titanic(dt_titanic.df))
dt_titanic.xcols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "alone",
    "embarked_encode",
    "sex_encode",
]



In [18]:
# Depth-limited decision tree on Titanic with a fixed seed; predictions for
# both splits, the class labels, and the fitted model are stored on dt_titanic.
(
    dt_titanic.pred_train,
    dt_titanic.pred_test,
    dt_titanic.classes,
    dt_titanic.model,
) = adalib.dectree_fit_and_predict(
    dt_titanic.train[dt_titanic.xcols],
    dt_titanic.train.survived,
    dt_titanic.test[dt_titanic.xcols],
    dt_titanic.test.survived,
    max_depth=4,
    random_state=123,
)

print("TRAIN EVALUATION")
adalib.dectree_evaluate_model(dt_titanic.train.survived, dt_titanic.pred_train, dt_titanic.classes)

print("TEST EVALUATION")
adalib.dectree_evaluate_model(dt_titanic.test.survived, dt_titanic.pred_test, dt_titanic.classes)

# Last expression: shown as the cell output, one importance per entry of xcols.
dt_titanic.model.feature_importances_

TRAIN EVALUATION
Accuracy: 0.843687374749499

Confusion matrix:
          Pred 0  Pred 1
Actual 0     261      35
Actual 1      43     160

Classification report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       296
           1       0.82      0.79      0.80       203

   micro avg       0.84      0.84      0.84       499
   macro avg       0.84      0.83      0.84       499
weighted avg       0.84      0.84      0.84       499


True 0 rate:  0.882
False 0 rate:  0.141

True 1 rate:  0.788
False 1 rate:  0.179

TEST EVALUATION
Accuracy: 0.813953488372093

Confusion matrix:
          Pred 0  Pred 1
Actual 0     105      23
Actual 1      17      70

Classification report:
              precision    recall  f1-score   support

           0       0.86      0.82      0.84       128
           1       0.75      0.80      0.78        87

   micro avg       0.81      0.81      0.81       215
   macro avg       0.81      0.81      0.81  

array([0.20168897, 0.11980658, 0.05662803, 0.00798466, 0.02327734,
       0.        , 0.        , 0.59061443])

In [19]:
# Decision-tree (entropy) experiment: fetch the Titanic data and run it
# through the project's prep step in a single expression.
dt_titanic_entropy = DataSet()
dt_titanic_entropy.df = prep_titanic(get_titanic_data())

In [20]:
# Split into train/test and feed the pair straight into the scaling helper;
# only the frames it returns are stored.
dt_titanic_entropy.train, dt_titanic_entropy.test = min_max_scale_titanic(
    *split_titanic(dt_titanic_entropy.df)
)
dt_titanic_entropy.xcols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "alone",
    "embarked_encode",
    "sex_encode",
]



In [21]:
# Entropy-criterion decision tree on Titanic. random_state is pinned to the
# same seed as the gini Titanic tree so reruns reproduce the same fit.
# NOTE(review): this tree uses max_depth=5 while the gini Titanic tree uses
# max_depth=4, so the two differ in depth as well as criterion.
(
    dt_titanic_entropy.pred_train,
    dt_titanic_entropy.pred_test,
    dt_titanic_entropy.classes,
    dt_titanic_entropy.model,
) = adalib.dectree_fit_and_predict(
    dt_titanic_entropy.train[dt_titanic_entropy.xcols],
    dt_titanic_entropy.train.survived,
    dt_titanic_entropy.test[dt_titanic_entropy.xcols],
    dt_titanic_entropy.test.survived,
    criterion="entropy",
    max_depth=5,
    random_state=123,
)

print("TRAIN EVALUATION")
adalib.dectree_evaluate_model(dt_titanic_entropy.train.survived, dt_titanic_entropy.pred_train, dt_titanic_entropy.classes)

print("TEST EVALUATION")
adalib.dectree_evaluate_model(dt_titanic_entropy.test.survived, dt_titanic_entropy.pred_test, dt_titanic_entropy.classes)

TRAIN EVALUATION
Accuracy: 0.8557114228456913

Confusion matrix:
          Pred 0  Pred 1
Actual 0     289       7
Actual 1      65     138

Classification report:
              precision    recall  f1-score   support

           0       0.82      0.98      0.89       296
           1       0.95      0.68      0.79       203

   micro avg       0.86      0.86      0.86       499
   macro avg       0.88      0.83      0.84       499
weighted avg       0.87      0.86      0.85       499


True 0 rate:  0.976
False 0 rate:  0.184

True 1 rate:  0.680
False 1 rate:  0.048

TEST EVALUATION
Accuracy: 0.8

Confusion matrix:
          Pred 0  Pred 1
Actual 0     123       5
Actual 1      38      49

Classification report:
              precision    recall  f1-score   support

           0       0.76      0.96      0.85       128
           1       0.91      0.56      0.70        87

   micro avg       0.80      0.80      0.80       215
   macro avg       0.84      0.76      0.77       215
weig

## KNN

### Titanic Data

1. Fit the K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [22]:
# KNN experiment: fetch the Titanic data and run it through the project's
# prep step in a single expression.
knn_titanic = DataSet()
knn_titanic.df = prep_titanic(get_titanic_data())

In [23]:
# Split into train/test and feed the pair straight into the scaling helper;
# only the frames it returns are stored.
knn_titanic.train, knn_titanic.test = min_max_scale_titanic(*split_titanic(knn_titanic.df))
knn_titanic.xcols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "alone",
    "embarked_encode",
    "sex_encode",
]



2. Evaluate your results using the model score, confusion matrix, and classification report.

Better than logistic regression, but not as good as random forest w/ gini

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [24]:
# KNN on Titanic with k=4 and uniform weights; results land on knn_titanic.
(
    knn_titanic.pred_train,
    knn_titanic.pred_test,
    knn_titanic.classes,
    knn_titanic.model,
) = adalib.knn_fit_and_predict(
    knn_titanic.train[knn_titanic.xcols],
    knn_titanic.train.survived,
    knn_titanic.test[knn_titanic.xcols],
    knn_titanic.test.survived,
    n_neighbors=4,
    weights="uniform",
)

# Hold on to this model: knn_titanic.model is overwritten by the later k runs.
knn_4 = knn_titanic.model

print("TRAIN EVALUATION")
adalib.knn_evaluate_model(
    knn_titanic.train.survived, knn_titanic.pred_train, ("Not Survive", "Survive")
)

print("TEST EVALUATION")
adalib.knn_evaluate_model(
    knn_titanic.test.survived, knn_titanic.pred_test, ("Not Survive", "Survive")
)

TRAIN EVALUATION
Accuracy: 0.8416833667334669

Confusion matrix:
                    Pred Not Survive  Pred Survive
Actual Not Survive               284            12
Actual Survive                    67           136

Classification report:
              precision    recall  f1-score   support

           0       0.81      0.96      0.88       296
           1       0.92      0.67      0.77       203

   micro avg       0.84      0.84      0.84       499
   macro avg       0.86      0.81      0.83       499
weighted avg       0.85      0.84      0.84       499


True Not Survive rate:  0.959
False Not Survive rate:  0.191

True Survive rate:  0.670
False Survive rate:  0.081

TEST EVALUATION
Accuracy: 0.7906976744186046

Confusion matrix:
                    Pred Not Survive  Pred Survive
Actual Not Survive               114            14
Actual Survive                    31            56

Classification report:
              precision    recall  f1-score   support

           0      

4. Run through steps 2-4 setting k to 10

In [25]:
# KNN on Titanic with k=10 and uniform weights; overwrites the previous run's
# predictions and model on knn_titanic.
(
    knn_titanic.pred_train,
    knn_titanic.pred_test,
    knn_titanic.classes,
    knn_titanic.model,
) = adalib.knn_fit_and_predict(
    knn_titanic.train[knn_titanic.xcols],
    knn_titanic.train.survived,
    knn_titanic.test[knn_titanic.xcols],
    knn_titanic.test.survived,
    n_neighbors=10,
    weights="uniform",
)

# Separate handle for the k=10 model.
knn_10 = knn_titanic.model

print("TRAIN EVALUATION")
adalib.knn_evaluate_model(
    knn_titanic.train.survived, knn_titanic.pred_train, ("Not Survive", "Survive")
)

print("TEST EVALUATION")
adalib.knn_evaluate_model(
    knn_titanic.test.survived, knn_titanic.pred_test, ("Not Survive", "Survive")
)

TRAIN EVALUATION
Accuracy: 0.8216432865731463

Confusion matrix:
                    Pred Not Survive  Pred Survive
Actual Not Survive               277            19
Actual Survive                    70           133

Classification report:
              precision    recall  f1-score   support

           0       0.80      0.94      0.86       296
           1       0.88      0.66      0.75       203

   micro avg       0.82      0.82      0.82       499
   macro avg       0.84      0.80      0.81       499
weighted avg       0.83      0.82      0.82       499


True Not Survive rate:  0.936
False Not Survive rate:  0.202

True Survive rate:  0.655
False Survive rate:  0.125

TEST EVALUATION
Accuracy: 0.7906976744186046

Confusion matrix:
                    Pred Not Survive  Pred Survive
Actual Not Survive               114            14
Actual Survive                    31            56

Classification report:
              precision    recall  f1-score   support

           0      

Still not as good as random forest w/ gini and about the same as k=4

5. Run through steps 2-4 setting k to 20

In [26]:
# KNN on Titanic with k=20 and uniform weights; overwrites the previous run's
# predictions and model on knn_titanic.
(
    knn_titanic.pred_train,
    knn_titanic.pred_test,
    knn_titanic.classes,
    knn_titanic.model,
) = adalib.knn_fit_and_predict(
    knn_titanic.train[knn_titanic.xcols],
    knn_titanic.train.survived,
    knn_titanic.test[knn_titanic.xcols],
    knn_titanic.test.survived,
    n_neighbors=20,
    weights="uniform",
)

# Separate handle for the k=20 model.
knn_20 = knn_titanic.model

print("TRAIN EVALUATION")
adalib.knn_evaluate_model(
    knn_titanic.train.survived, knn_titanic.pred_train, ("Not Survive", "Survive")
)

print("TEST EVALUATION")
adalib.knn_evaluate_model(
    knn_titanic.test.survived, knn_titanic.pred_test, ("Not Survive", "Survive")
)

TRAIN EVALUATION
Accuracy: 0.8096192384769539

Confusion matrix:
                    Pred Not Survive  Pred Survive
Actual Not Survive               271            25
Actual Survive                    70           133

Classification report:
              precision    recall  f1-score   support

           0       0.79      0.92      0.85       296
           1       0.84      0.66      0.74       203

   micro avg       0.81      0.81      0.81       499
   macro avg       0.82      0.79      0.79       499
weighted avg       0.81      0.81      0.80       499


True Not Survive rate:  0.916
False Not Survive rate:  0.205

True Survive rate:  0.655
False Survive rate:  0.158

TEST EVALUATION
Accuracy: 0.7488372093023256

Confusion matrix:
                    Pred Not Survive  Pred Survive
Actual Not Survive               108            20
Actual Survive                    34            53

Classification report:
              precision    recall  f1-score   support

           0      

k = 20 is worse than k = 4 or 10

6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

Measured on TEST data
Accuracy (highest to lowest): 4, 10, 20
True rates (highest to lowest): 4 & 10 (tie), 20
False rates (lowest to highest): 4 & 10 (tie), 20

I'd go with 4 because it is less complex given that it calculates only the 4 nearest neighbors, not 10.

7. Save the best model in knn_fit

In [27]:
# Keep the k=4 Titanic model as the saved KNN fit.
knn_fit = knn_4

### Iris Data

In [28]:
# Empty container for the iris KNN experiment.
knn_iris = DataSet()

In [29]:
# Load and prep iris, then take a 70/30 split stratified on species with a
# fixed seed so the partition is repeatable.
knn_iris.df = prep_iris(get_iris_data())
knn_iris.train, knn_iris.test = train_test_split(
    knn_iris.df,
    test_size=0.3,
    random_state=123,
    stratify=knn_iris.df[["species"]],
)
knn_iris.xcols = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

1. Fit the K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

2. Evaluate your results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [30]:
# KNN on iris with k=5 and uniform weights; results land on knn_iris.
(
    knn_iris.pred_train,
    knn_iris.pred_test,
    knn_iris.classes,
    knn_iris.model,
) = adalib.knn_fit_and_predict(
    knn_iris.train[knn_iris.xcols],
    knn_iris.train.species,
    knn_iris.test[knn_iris.xcols],
    knn_iris.test.species,
    n_neighbors=5,
    weights="uniform",
)

# Hold on to this model: knn_iris.model is overwritten by the later k runs.
knn_5 = knn_iris.model

print("TRAIN EVALUATION")
adalib.knn_evaluate_model(knn_iris.train.species, knn_iris.pred_train, knn_iris.classes)

print("TEST EVALUATION")
adalib.knn_evaluate_model(knn_iris.test.species, knn_iris.pred_test, knn_iris.classes)

TRAIN EVALUATION
Accuracy: 0.9809523809523809

Confusion matrix:
                   Pred setosa  Pred versicolor  Pred virginica
Actual setosa               35                0               0
Actual versicolor            0               34               1
Actual virginica             0                1              34

Classification report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       0.97      0.97      0.97        35
   virginica       0.97      0.97      0.97        35

   micro avg       0.98      0.98      0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105


True setosa rate:  1.000
False setosa rate:  0.000

True versicolor rate:  0.971
False versicolor rate:  0.029

True virginica rate:  0.971
False virginica rate:  0.029

TEST EVALUATION
Accuracy: 0.9555555555555556

Confusion matrix:
                   Pred setosa  Pred ve

4. Run through steps 2-4 setting k to 10

In [31]:
# KNN on iris with k=10 and uniform weights; overwrites the previous run's
# predictions and model on knn_iris.
(
    knn_iris.pred_train,
    knn_iris.pred_test,
    knn_iris.classes,
    knn_iris.model,
) = adalib.knn_fit_and_predict(
    knn_iris.train[knn_iris.xcols],
    knn_iris.train.species,
    knn_iris.test[knn_iris.xcols],
    knn_iris.test.species,
    n_neighbors=10,
    weights="uniform",
)

# NOTE(review): rebinds `knn_10`, which the Titanic KNN section also assigned.
knn_10 = knn_iris.model

print("TRAIN EVALUATION")
adalib.knn_evaluate_model(knn_iris.train.species, knn_iris.pred_train, knn_iris.classes)

print("TEST EVALUATION")
adalib.knn_evaluate_model(knn_iris.test.species, knn_iris.pred_test, knn_iris.classes)

TRAIN EVALUATION
Accuracy: 0.9809523809523809

Confusion matrix:
                   Pred setosa  Pred versicolor  Pred virginica
Actual setosa               35                0               0
Actual versicolor            0               34               1
Actual virginica             0                1              34

Classification report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       0.97      0.97      0.97        35
   virginica       0.97      0.97      0.97        35

   micro avg       0.98      0.98      0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105


True setosa rate:  1.000
False setosa rate:  0.000

True versicolor rate:  0.971
False versicolor rate:  0.029

True virginica rate:  0.971
False virginica rate:  0.029

TEST EVALUATION
Accuracy: 0.9777777777777777

Confusion matrix:
                   Pred setosa  Pred ve

5. Run through steps 2-4 setting k to 20

In [32]:
# KNN on iris with k=20 and uniform weights; overwrites the previous run's
# predictions and model on knn_iris.
(
    knn_iris.pred_train,
    knn_iris.pred_test,
    knn_iris.classes,
    knn_iris.model,
) = adalib.knn_fit_and_predict(
    knn_iris.train[knn_iris.xcols],
    knn_iris.train.species,
    knn_iris.test[knn_iris.xcols],
    knn_iris.test.species,
    n_neighbors=20,
    weights="uniform",
)

# NOTE(review): rebinds `knn_20`, which the Titanic KNN section also assigned.
knn_20 = knn_iris.model

print("TRAIN EVALUATION")
adalib.knn_evaluate_model(knn_iris.train.species, knn_iris.pred_train, knn_iris.classes)

print("TEST EVALUATION")
adalib.knn_evaluate_model(knn_iris.test.species, knn_iris.pred_test, knn_iris.classes)

TRAIN EVALUATION
Accuracy: 0.9619047619047619

Confusion matrix:
                   Pred setosa  Pred versicolor  Pred virginica
Actual setosa               35                0               0
Actual versicolor            0               33               2
Actual virginica             0                2              33

Classification report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       0.94      0.94      0.94        35
   virginica       0.94      0.94      0.94        35

   micro avg       0.96      0.96      0.96       105
   macro avg       0.96      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105


True setosa rate:  1.000
False setosa rate:  0.000

True versicolor rate:  0.943
False versicolor rate:  0.057

True virginica rate:  0.943
False virginica rate:  0.057

TEST EVALUATION
Accuracy: 0.9555555555555556

Confusion matrix:
                   Pred setosa  Pred ve

6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

7. Save the best model in knn_fit

## Random Forest

### Titanic Data

In [33]:
# Random-forest experiment: fetch the Titanic data and run it through the
# project's prep step in a single expression.
rf_titanic = DataSet()
rf_titanic.df = prep_titanic(get_titanic_data())

In [34]:
# Split into train/test and feed the pair straight into the scaling helper;
# only the frames it returns are stored.
rf_titanic.train, rf_titanic.test = min_max_scale_titanic(*split_titanic(rf_titanic.df))
rf_titanic.xcols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "alone",
    "embarked_encode",
    "sex_encode",
]



1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

In [35]:
# Random forest on Titanic: 100 trees, fixed seed.
# NOTE(review): the exercise prompt asks for min_samples_leaf=1 and
# max_depth=20, but this fit uses min_samples_leaf=3 and passes no max_depth —
# confirm which configuration is intended.
rf_titanic.pred_train, rf_titanic.pred_test, rf_titanic.model = adalib.random_forest_fit_and_predict(
    rf_titanic.train[rf_titanic.xcols],
    rf_titanic.train.survived,
    rf_titanic.test[rf_titanic.xcols],
    rf_titanic.test.survived,
    min_samples_leaf=3,
    n_estimators=100,
    random_state=123,
)

# The fitted forest is available as rf_titanic.model. It is deliberately not
# also bound to `knn_20`: that was a copy-paste leftover from the KNN section
# which replaced the KNN model held under that name with a random forest.

# adalib's knn_evaluate_model is reused to report on the forest's predictions.
print("TRAIN EVALUATION")
adalib.knn_evaluate_model(rf_titanic.train.survived, rf_titanic.pred_train, ("Not Survive", "Survive"))

print("TEST EVALUATION")
adalib.knn_evaluate_model(rf_titanic.test.survived, rf_titanic.pred_test, ("Not Survive", "Survive"))

TRAIN EVALUATION
Accuracy: 0.8837675350701403

Confusion matrix:
                    Pred Not Survive  Pred Survive
Actual Not Survive               281            15
Actual Survive                    43           160

Classification report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       296
           1       0.91      0.79      0.85       203

   micro avg       0.88      0.88      0.88       499
   macro avg       0.89      0.87      0.88       499
weighted avg       0.89      0.88      0.88       499


True Not Survive rate:  0.949
False Not Survive rate:  0.133

True Survive rate:  0.788
False Survive rate:  0.086

TEST EVALUATION
Accuracy: 0.8232558139534883

Confusion matrix:
                    Pred Not Survive  Pred Survive
Actual Not Survive               113            15
Actual Survive                    23            64

Classification report:
              precision    recall  f1-score   support

           0      

2. Evaluate your results using the model score, confusion matrix, and classification report.

These are not the best results I have achieved. The decision tree has lower confusion matrix numbers across the board. 

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [36]:
# Second random-forest experiment: fetch the Titanic data and run it through
# the project's prep step in a single expression.
rf_titanic2 = DataSet()
rf_titanic2.df = prep_titanic(get_titanic_data())

In [37]:
# Split into train/test and feed the pair straight into the scaling helper;
# only the frames it returns are stored.
rf_titanic2.train, rf_titanic2.test = min_max_scale_titanic(*split_titanic(rf_titanic2.df))
rf_titanic2.xcols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "alone",
    "embarked_encode",
    "sex_encode",
]



In [38]:
# More constrained random forest on Titanic: min_samples_leaf=5, max_depth=3,
# 100 trees, fixed seed.
rf_titanic2.pred_train, rf_titanic2.pred_test, rf_titanic2.model = adalib.random_forest_fit_and_predict(
    rf_titanic2.train[rf_titanic2.xcols],
    rf_titanic2.train.survived,
    rf_titanic2.test[rf_titanic2.xcols],
    rf_titanic2.test.survived,
    min_samples_leaf=5,
    n_estimators=100,
    random_state=123,
    max_depth=3,
)

# The fitted forest is available as rf_titanic2.model. It is deliberately not
# also bound to `knn_20`: that was a copy-paste leftover from the KNN section
# which replaced the KNN model held under that name with a random forest.

# adalib's knn_evaluate_model is reused to report on the forest's predictions.
print("TRAIN EVALUATION")
adalib.knn_evaluate_model(rf_titanic2.train.survived, rf_titanic2.pred_train, ("Not Survive", "Survive"))

print("TEST EVALUATION")
adalib.knn_evaluate_model(rf_titanic2.test.survived, rf_titanic2.pred_test, ("Not Survive", "Survive"))

TRAIN EVALUATION
Accuracy: 0.8296593186372746

Confusion matrix:
                    Pred Not Survive  Pred Survive
Actual Not Survive               276            20
Actual Survive                    65           138

Classification report:
              precision    recall  f1-score   support

           0       0.81      0.93      0.87       296
           1       0.87      0.68      0.76       203

   micro avg       0.83      0.83      0.83       499
   macro avg       0.84      0.81      0.82       499
weighted avg       0.84      0.83      0.83       499


True Not Survive rate:  0.932
False Not Survive rate:  0.191

True Survive rate:  0.680
False Survive rate:  0.127

TEST EVALUATION
Accuracy: 0.786046511627907

Confusion matrix:
                    Pred Not Survive  Pred Survive
Actual Not Survive               113            15
Actual Survive                    31            56

Classification report:
              precision    recall  f1-score   support

           0       

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

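The first random forest performs better than the second (min_samples_leaf = 5, max_depth = 3): 0.88 vs 0.83 accuracy on the training data. Note that the first forest was actually fit with min_samples_leaf = 3 and no max_depth limit, rather than the min_samples_leaf = 1 and max_depth = 20 the exercise asks for.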

6. Save the best model in forest_fit

In [39]:
# Keep the first random forest (rf_titanic) as the saved forest fit.
forest_fit = rf_titanic.model

### Test
Once you have determined which algorithm (with metaparameters) performs the best, try reducing the number of features to the top 4 features in terms of information gained for each feature individually. That is, how close do we get to predicting accurately the species with each feature?

1. Compute the information gained.
1. Create a new dataframe with top 4 features (train_df_reduced).
1. Use the top performing algorithm with the metaparameters used in that model. Create the object, fit, transform on in-sample data, and evaluate the results. Compare your evaluation metrics with those from the original model (with all the features). Select the best model.
1. Run your final model on your out-of-sample dataframe (test_df). Evaluate the results.

In [40]:
# Third random-forest container (used to inspect feature importances): fetch
# the Titanic data and run it through the project's prep step.
rf_titanic3 = DataSet()
rf_titanic3.df = prep_titanic(get_titanic_data())

In [41]:
# Split into train/test and feed the pair straight into the scaling helper;
# only the frames it returns are stored.
rf_titanic3.train, rf_titanic3.test = min_max_scale_titanic(*split_titanic(rf_titanic3.df))
rf_titanic3.xcols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "alone",
    "embarked_encode",
    "sex_encode",
]



In [42]:
# Refit the random forest with the same settings as the first forest
# (min_samples_leaf=3, 100 trees, seed 123) in order to inspect its
# feature importances.
rf_titanic3.pred_train, rf_titanic3.pred_test, rf_titanic3.model = adalib.random_forest_fit_and_predict(
    rf_titanic3.train[rf_titanic3.xcols],
    rf_titanic3.train.survived,
    rf_titanic3.test[rf_titanic3.xcols],
    rf_titanic3.test.survived,
    min_samples_leaf=3,
    n_estimators=100,
    random_state=123,
)

# The fitted forest is available as rf_titanic3.model. It is deliberately not
# also bound to `knn_20`: that was a copy-paste leftover from the KNN section
# which replaced the KNN model held under that name with a random forest.

# adalib's knn_evaluate_model is reused to report on the forest's predictions.
print("TRAIN EVALUATION")
adalib.knn_evaluate_model(rf_titanic3.train.survived, rf_titanic3.pred_train, ("Not Survive", "Survive"))

print("TEST EVALUATION")
adalib.knn_evaluate_model(rf_titanic3.test.survived, rf_titanic3.pred_test, ("Not Survive", "Survive"))

# Last expression: shown as the cell output, one importance per entry of xcols.
rf_titanic3.model.feature_importances_

TRAIN EVALUATION
Accuracy: 0.8837675350701403

Confusion matrix:
                    Pred Not Survive  Pred Survive
Actual Not Survive               281            15
Actual Survive                    43           160

Classification report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       296
           1       0.91      0.79      0.85       203

   micro avg       0.88      0.88      0.88       499
   macro avg       0.89      0.87      0.88       499
weighted avg       0.89      0.88      0.88       499


True Not Survive rate:  0.949
False Not Survive rate:  0.133

True Survive rate:  0.788
False Survive rate:  0.086

TEST EVALUATION
Accuracy: 0.8232558139534883

Confusion matrix:
                    Pred Not Survive  Pred Survive
Actual Not Survive               113            15
Actual Survive                    23            64

Classification report:
              precision    recall  f1-score   support

           0      

array([0.11961799, 0.17416357, 0.03863058, 0.03285185, 0.23370067,
       0.02352536, 0.03430046, 0.34320953])

In [43]:
# Reduced-feature decision tree: keep only the four features with the highest
# importance in the depth-4 gini Titanic tree (dt_titanic):
#   sex_encode, pclass, age, sibsp
# For reference, the four highest importances reported by the random forest
# (rf_titanic3) belong to:
#   sex_encode, fare, age, pclass
#
# NOTE(review): this fit adds class_weight="balanced" and max_leaf_nodes=3 and
# uses max_depth=5, whereas dt_titanic was fit with max_depth=4 only, so the
# comparison with the full-feature tree is not like-for-like. Assuming these
# keyword arguments are forwarded to DecisionTreeClassifier, max_leaf_nodes=3
# caps the tree at three leaves, so max_depth=5 never becomes the limit.

# `deepcopy` comes from the notebook's import cell; the copy is independent,
# so changing xcols and refitting here leaves dt_titanic untouched.
titanic5 = deepcopy(dt_titanic)

titanic5.xcols = ["sex_encode", "pclass", "age", "sibsp"]
titanic5.pred_train, titanic5.pred_test, titanic5.classes, titanic5.model = adalib.dectree_fit_and_predict(
    titanic5.train[titanic5.xcols],
    titanic5.train.survived,
    titanic5.test[titanic5.xcols],
    titanic5.test.survived,
    max_depth=5,
    random_state=123,
    class_weight="balanced",
    max_leaf_nodes=3,
)

print("TRAIN EVALUATION")
adalib.dectree_evaluate_model(titanic5.train.survived, titanic5.pred_train, titanic5.classes)

print("TEST EVALUATION")
adalib.dectree_evaluate_model(titanic5.test.survived, titanic5.pred_test, titanic5.classes)

TRAIN EVALUATION
Accuracy: 0.7875751503006012

Confusion matrix:
          Pred 0  Pred 1
Actual 0     256      40
Actual 1      66     137

Classification report:
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       296
           1       0.77      0.67      0.72       203

   micro avg       0.79      0.79      0.79       499
   macro avg       0.78      0.77      0.77       499
weighted avg       0.79      0.79      0.78       499


True 0 rate:  0.865
False 0 rate:  0.205

True 1 rate:  0.675
False 1 rate:  0.226

TEST EVALUATION
Accuracy: 0.7627906976744186

Confusion matrix:
          Pred 0  Pred 1
Actual 0     104      24
Actual 1      27      60

Classification report:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       128
           1       0.71      0.69      0.70        87

   micro avg       0.76      0.76      0.76       215
   macro avg       0.75      0.75      0.75