# Classification - Model Exercise

In [1]:
from acquire import get_titanic_data, get_iris_data
from prepare import prep_titanic, split_titanic, min_max_scale_titanic, prep_iris

from collections import namedtuple
from copy import deepcopy

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier

import adalib

In [2]:
class DataSet(object):
    """Mutable container bundling one dataset with its splits, model and results.

    Every attribute starts as ``None``; notebook cells fill them in as the
    analysis proceeds. Declaring all attributes here (including ``classes``,
    which the decision-tree cells assign) means any of them can be read
    safely before the corresponding cell has run.
    """

    def __init__(self):
        # Full prepared frame and its train/test partitions.
        self.df = None
        self.train = None
        self.test = None
        # Names of the feature columns passed to the model.
        self.xcols = None
        # Fitted estimator, plus the `classes` value returned alongside it
        # by the decision-tree helper (assigned by later cells).
        self.model = None
        self.classes = None
        # Predictions on the train and test partitions.
        self.pred_train = None
        self.pred_test = None
        # Evaluation artefacts.
        self.score = None
        self.confmatrix = None
        self.classrep = None
        self.precision = None
        self.recall = None
        self.f1 = None
        self.support = None

## Logistic Regression

In [3]:
# Fresh container for the logistic-regression experiment, holding the
# acquired-then-prepared Titanic frame.
lr_titanic = DataSet()
lr_titanic.df = prep_titanic(get_titanic_data())

In [4]:
# Partition the prepared Titanic frame into train and test sets.
lr_titanic.train, lr_titanic.test = split_titanic(lr_titanic.df)



In [5]:
# Scale both partitions with the project's min-max helper.
# NOTE(review): this overwrites the unscaled splits, so re-running the cell
# feeds already-scaled data back into the scaler.
lr_titanic.train, lr_titanic.test = min_max_scale_titanic(lr_titanic.train, lr_titanic.test)

1. Fit the logistic regression classifier to your training sample and transform, i.e. make predictions on the training sample.

In [6]:
# Feature columns used as model inputs.
lr_titanic.xcols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "alone",
    "embarked_encode",
    "sex_encode",
]

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [7]:
# Run the project's logistic-regression helper on the Titanic splits; it
# prints the train and test evaluation shown in the cell output.
adalib.logreg_model(
    lr_titanic.train[lr_titanic.xcols],
    lr_titanic.train.survived,
    lr_titanic.test[lr_titanic.xcols],
    lr_titanic.test.survived,
)

TRAIN EVALUATION
Accuracy: 0.7915831663326653

Confusion matrix:
          Pred -  Pred +
Actual -     254      42
Actual +      62     141

Classification report:
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       296
           1       0.77      0.69      0.73       203

   micro avg       0.79      0.79      0.79       499
   macro avg       0.79      0.78      0.78       499
weighted avg       0.79      0.79      0.79       499


Rates:
True positive rate: 0.6945812807881774
False positive rate: 0.22950819672131148
True negative rate: 0.8581081081081081
False negative rate: 0.1962025316455696

TEST EVALUATION
Accuracy: 0.7674418604651163

Confusion matrix:
          Pred -  Pred +
Actual -     105      23
Actual +      27      60

Classification report:
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       128
           1       0.72      0.69      0.71        87

   micro avg  

4. Look in the scikit-learn documentation to research the solver parameter. What is your best option(s) for the particular problem you are trying to solve and the data to be used?

liblinear, because this is a small dataset and the scikit-learn documentation recommends it for small datasets.

5. Run through steps 2-4 using another solver (from question 4)

In [8]:
# Refit with the newton-cg solver, keeping the predictions and the fitted
# model so they can be compared with the earlier run.
newton_preds_train, newton_preds_test, newton_model = adalib.logreg_fit_and_predict(
    lr_titanic.train[lr_titanic.xcols],
    lr_titanic.train.survived,
    lr_titanic.test[lr_titanic.xcols],
    lr_titanic.test.survived,
    solver="newton-cg",
)

# In-sample evaluation.
print("TRAIN EVALUATION")
adalib.logreg_evaluate_model(lr_titanic.train.survived, newton_preds_train)

# Out-of-sample evaluation.
print("TEST EVALUATION")
adalib.logreg_evaluate_model(lr_titanic.test.survived, newton_preds_test)

TRAIN EVALUATION
Accuracy: 0.8016032064128257

Confusion matrix:
          Pred -  Pred +
Actual -     258      38
Actual +      61     142

Classification report:
              precision    recall  f1-score   support

           0       0.81      0.87      0.84       296
           1       0.79      0.70      0.74       203

   micro avg       0.80      0.80      0.80       499
   macro avg       0.80      0.79      0.79       499
weighted avg       0.80      0.80      0.80       499


Rates:
True positive rate: 0.6995073891625616
False positive rate: 0.2111111111111111
True negative rate: 0.8716216216216216
False negative rate: 0.19122257053291536

TEST EVALUATION
Accuracy: 0.7767441860465116

Confusion matrix:
          Pred -  Pred +
Actual -     101      27
Actual +      21      66

Classification report:
              precision    recall  f1-score   support

           0       0.83      0.79      0.81       128
           1       0.71      0.76      0.73        87

   micro avg  

6. Which performs better on your in-sample data?

newton-cg, but not by much: about 0.01 more accurate on the training data (0.802 vs. 0.792).

7. Save the best model in logit_fit

In [9]:
# Keep the newton-cg model as the chosen logistic-regression fit.
logit_fit = newton_model

## Decision Tree

### Iris Data

In [10]:
# Fresh container for the iris decision-tree experiment.
iris = DataSet()

In [11]:
# Acquire and prepare the iris frame in one step.
iris.df = prep_iris(get_iris_data())

# 70/30 split, stratified on species, with a fixed seed for reproducibility.
iris.train, iris.test = train_test_split(
    iris.df,
    test_size=0.3,
    random_state=123,
    stratify=iris.df[["species"]],
)

# Feature columns used as model inputs.
iris.xcols = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [12]:
# Fit the decision-tree helper (no criterion argument passed) and store the
# predictions, class labels and fitted model on the container.
iris.pred_train, iris.pred_test, iris.classes, iris.model = adalib.dectree_fit_and_predict(
    iris.train[iris.xcols],
    iris.train.species,
    iris.test[iris.xcols],
    iris.test.species,
)

# In-sample evaluation.
print("TRAIN EVALUATION")
adalib.dectree_evaluate_model(iris.train.species, iris.pred_train, iris.classes)

# Out-of-sample evaluation.
print("TEST EVALUATION")
adalib.dectree_evaluate_model(iris.test.species, iris.pred_test, iris.classes)

TRAIN EVALUATION
Accuracy: 1.0

Confusion matrix:
                   Pred setosa  Pred versicolor  Pred virginica
Actual setosa               35                0               0
Actual versicolor            0               35               0
Actual virginica             0                0              35

Classification report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       1.00      1.00      1.00        35
   virginica       1.00      1.00      1.00        35

   micro avg       1.00      1.00      1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105


True setosa rate:  1.000
False setosa rate:  0.000

True versicolor rate:  1.000
False versicolor rate:  0.000

True virginica rate:  1.000
False virginica rate:  0.000

TEST EVALUATION
Accuracy: 0.8666666666666667

Confusion matrix:
                   Pred setosa  Pred versicolor  Pred 

4. Run through steps 2-4 using entropy as your measure of impurity.

In [13]:
# Second container for the entropy run. It references the very same frame,
# splits and feature list as `iris` (plain assignment, no copies).
iris_entropy = DataSet()
iris_entropy.df, iris_entropy.train, iris_entropy.test, iris_entropy.xcols = (
    iris.df,
    iris.train,
    iris.test,
    iris.xcols,
)

In [14]:
# Same fit as the previous iris run, but with criterion="entropy" passed to
# the helper.
iris_entropy.pred_train, iris_entropy.pred_test, iris_entropy.classes, iris_entropy.model = adalib.dectree_fit_and_predict(
    iris_entropy.train[iris_entropy.xcols],
    iris_entropy.train.species,
    iris_entropy.test[iris_entropy.xcols],
    iris_entropy.test.species,
    criterion="entropy",
)

# In-sample evaluation.
print("TRAIN EVALUATION")
adalib.dectree_evaluate_model(iris_entropy.train.species, iris_entropy.pred_train, iris_entropy.classes)

# Out-of-sample evaluation.
print("TEST EVALUATION")
adalib.dectree_evaluate_model(iris_entropy.test.species, iris_entropy.pred_test, iris_entropy.classes)

TRAIN EVALUATION
Accuracy: 1.0

Confusion matrix:
                   Pred setosa  Pred versicolor  Pred virginica
Actual setosa               35                0               0
Actual versicolor            0               35               0
Actual virginica             0                0              35

Classification report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       1.00      1.00      1.00        35
   virginica       1.00      1.00      1.00        35

   micro avg       1.00      1.00      1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105


True setosa rate:  1.000
False setosa rate:  0.000

True versicolor rate:  1.000
False versicolor rate:  0.000

True virginica rate:  1.000
False virginica rate:  0.000

TEST EVALUATION
Accuracy: 0.9111111111111111

Confusion matrix:
                   Pred setosa  Pred versicolor  Pred 

5. Which performs better on your in-sample data?

They both perform equally well on the training data: 100% accuracy.

TODO: I need to run this on the Titanic data and then, if I have time, create a function to do the decision tree and analysis, then run the test data through the model.

6. Save the best model in tree_fit

In [15]:
# Keep the entropy-criterion iris tree as the chosen decision-tree fit.
tree_fit = iris_entropy.model

### Titanic Data

In [16]:
# Fresh container for the Titanic decision-tree experiment, holding the
# acquired-then-prepared frame.
dt_titanic = DataSet()
dt_titanic.df = prep_titanic(get_titanic_data())

In [17]:
# Feature columns used as model inputs.
dt_titanic.xcols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "alone",
    "embarked_encode",
    "sex_encode",
]

# Split first, then replace both partitions with their min-max-scaled versions.
dt_titanic.train, dt_titanic.test = split_titanic(dt_titanic.df)
dt_titanic.train, dt_titanic.test = min_max_scale_titanic(
    dt_titanic.train,
    dt_titanic.test,
)



In [18]:
# Fit the decision-tree helper with max_depth=4 (no criterion argument
# passed) and store its outputs on the container.
dt_titanic.pred_train, dt_titanic.pred_test, dt_titanic.classes, dt_titanic.model = adalib.dectree_fit_and_predict(
    dt_titanic.train[dt_titanic.xcols],
    dt_titanic.train.survived,
    dt_titanic.test[dt_titanic.xcols],
    dt_titanic.test.survived,
    max_depth=4,
)

# In-sample evaluation.
print("TRAIN EVALUATION")
adalib.dectree_evaluate_model(dt_titanic.train.survived, dt_titanic.pred_train, dt_titanic.classes)

# Out-of-sample evaluation.
print("TEST EVALUATION")
adalib.dectree_evaluate_model(dt_titanic.test.survived, dt_titanic.pred_test, dt_titanic.classes)

TRAIN EVALUATION
Accuracy: 0.843687374749499

Confusion matrix:
          Pred 0  Pred 1
Actual 0     261      35
Actual 1      43     160

Classification report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       296
           1       0.82      0.79      0.80       203

   micro avg       0.84      0.84      0.84       499
   macro avg       0.84      0.83      0.84       499
weighted avg       0.84      0.84      0.84       499


True 0 rate:  0.882
False 0 rate:  0.141

True 1 rate:  0.788
False 1 rate:  0.179

TEST EVALUATION
Accuracy: 0.813953488372093

Confusion matrix:
          Pred 0  Pred 1
Actual 0     105      23
Actual 1      17      70

Classification report:
              precision    recall  f1-score   support

           0       0.86      0.82      0.84       128
           1       0.75      0.80      0.78        87

   micro avg       0.81      0.81      0.81       215
   macro avg       0.81      0.81      0.81  

In [19]:
# Separate container for the entropy-criterion Titanic tree; the data is
# acquired and prepared again from scratch.
dt_titanic_entropy = DataSet()
dt_titanic_entropy.df = prep_titanic(get_titanic_data())

In [20]:
# Feature columns used as model inputs.
dt_titanic_entropy.xcols = [
    "pclass",
    "age",
    "sibsp",
    "parch",
    "fare",
    "alone",
    "embarked_encode",
    "sex_encode",
]

# Split first, then replace both partitions with their min-max-scaled versions.
dt_titanic_entropy.train, dt_titanic_entropy.test = split_titanic(dt_titanic_entropy.df)
dt_titanic_entropy.train, dt_titanic_entropy.test = min_max_scale_titanic(
    dt_titanic_entropy.train,
    dt_titanic_entropy.test,
)



In [21]:
# Fit the decision-tree helper with criterion="entropy" and max_depth=5.
# NOTE(review): the earlier Titanic tree run used max_depth=4, so the two
# runs differ in depth as well as criterion.
dt_titanic_entropy.pred_train, dt_titanic_entropy.pred_test, dt_titanic_entropy.classes, dt_titanic_entropy.model = adalib.dectree_fit_and_predict(
    dt_titanic_entropy.train[dt_titanic_entropy.xcols],
    dt_titanic_entropy.train.survived,
    dt_titanic_entropy.test[dt_titanic_entropy.xcols],
    dt_titanic_entropy.test.survived,
    criterion="entropy",
    max_depth=5,
)

# In-sample evaluation.
print("TRAIN EVALUATION")
adalib.dectree_evaluate_model(dt_titanic_entropy.train.survived, dt_titanic_entropy.pred_train, dt_titanic_entropy.classes)

# Out-of-sample evaluation.
print("TEST EVALUATION")
adalib.dectree_evaluate_model(dt_titanic_entropy.test.survived, dt_titanic_entropy.pred_test, dt_titanic_entropy.classes)

TRAIN EVALUATION
Accuracy: 0.8557114228456913

Confusion matrix:
          Pred 0  Pred 1
Actual 0     289       7
Actual 1      65     138

Classification report:
              precision    recall  f1-score   support

           0       0.82      0.98      0.89       296
           1       0.95      0.68      0.79       203

   micro avg       0.86      0.86      0.86       499
   macro avg       0.88      0.83      0.84       499
weighted avg       0.87      0.86      0.85       499


True 0 rate:  0.976
False 0 rate:  0.184

True 1 rate:  0.680
False 1 rate:  0.048

TEST EVALUATION
Accuracy: 0.8

Confusion matrix:
          Pred 0  Pred 1
Actual 0     123       5
Actual 1      38      49

Classification report:
              precision    recall  f1-score   support

           0       0.76      0.96      0.85       128
           1       0.91      0.56      0.70        87

   micro avg       0.80      0.80      0.80       215
   macro avg       0.84      0.76      0.77       215
weig