In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from graphviz import Source

import aquire
import prepare
import split_scale

import warnings
warnings.filterwarnings("ignore")

#### Fit the logistic regression classifier to your training sample and transform, i.e. make predictions on the training sample

In [2]:
df, encoder = prepare.prep_iris(aquire.get_iris_data())
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
train, test = split_scale.split_my_data(df, .8)

In [4]:
X_train = train.drop(columns="species")
X_test = test.drop(columns="species")
y_train = train[["species"]]
y_test = test[["species"]]

In [5]:
# create object 
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')
#fit object
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={1: 2}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[ 0.31190115  1.44915589 -2.33359923 -1.02353246]
 [ 0.44119976 -1.70023706  0.60904436 -1.4215582 ]
 [-1.64209783 -1.73568864  2.69199299  2.48328471]]
Intercept: 
 [ 0.58799989  1.95748209 -2.58666863]


#### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [7]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.94


In [8]:
confusion_matrix(y_train, y_pred)

array([[37,  0,  0],
       [ 0, 43,  1],
       [ 0,  6, 33]])

In [9]:
multilabel_confusion_matrix(y_train, y_pred)

array([[[83,  0],
        [ 0, 37]],

       [[70,  6],
        [ 1, 43]],

       [[80,  1],
        [ 6, 33]]])

In [10]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       0.88      0.98      0.92        44
           2       0.97      0.85      0.90        39

    accuracy                           0.94       120
   macro avg       0.95      0.94      0.94       120
weighted avg       0.95      0.94      0.94       120



#### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [11]:
cm = confusion_matrix(y_train, y_pred)
fp = cm.sum(axis=0) - np.diag(cm)
fn = cm.sum(axis=1) - np.diag(cm)
tp = np.diag(cm)
tn = pd.DataFrame(cm).values.sum() - (fp + fn + tp)
print(fp, fn, tp, tn)

[0 6 1] [0 1 6] [37 43 33] [83 70 80]


#### Run through steps using another solver

In [12]:
# create object 
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='liblinear')

#fit object
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={1: 2}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[ 0.37830241  1.39373078 -2.16975423 -0.95877223]
 [ 0.53325407 -1.61393357  0.61682878 -1.48608802]
 [-1.58600296 -1.57640704  2.41071353  2.2050944 ]]
Intercept: 
 [ 0.24819862  1.20458773 -1.05526796]


In [14]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.94


In [15]:
print(confusion_matrix(y_train, y_pred))

[[37  0  0]
 [ 0 41  3]
 [ 0  4 35]]


In [16]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       0.91      0.93      0.92        44
           2       0.92      0.90      0.91        39

    accuracy                           0.94       120
   macro avg       0.94      0.94      0.94       120
weighted avg       0.94      0.94      0.94       120



#### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [17]:
clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=123)

In [18]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [19]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([2, 1, 0, 0, 1])

In [20]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.        , 0.        , 1.        ],
       [0.        , 0.95555556, 0.04444444],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.95555556, 0.04444444],
       [0.        , 0.95555556, 0.04444444],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.95555556, 0.04444444],
       [0.        , 0.95555556, 0.04444444],
       [1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.95555556, 0.04444444],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [1.

In [21]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.97


In [22]:
confusion_matrix(y_train, y_pred)

array([[37,  0,  0],
       [ 0, 44,  0],
       [ 0,  3, 36]])

In [23]:
y_train.species.value_counts()

1    44
2    39
0    37
Name: species, dtype: int64

In [24]:
labels = sorted(y_train.species.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1,2
0,37,0,0
1,0,44,0
2,0,3,36


In [25]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       0.94      1.00      0.97        44
           2       1.00      0.92      0.96        39

    accuracy                           0.97       120
   macro avg       0.98      0.97      0.98       120
weighted avg       0.98      0.97      0.97       120



#### Run through steps 2-4 using entropy as your measure of impurity.

In [26]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [27]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([2, 2, 0, 0, 1])

In [28]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.        , 0.        , 1.        ],
       [0.        , 0.4       , 0.6       ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.97619048, 0.02380952],
       [0.        , 0.97619048, 0.02380952],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.97619048, 0.02380952],
       [0.        , 0.97619048, 0.02380952],
       [1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.97619048, 0.02380952],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [1.

In [29]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.97


In [30]:
labels = sorted(y_train.species.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1,2
0,37,0,0
1,0,42,2
2,0,2,37


In [31]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       0.95      0.95      0.95        44
           2       0.95      0.95      0.95        39

    accuracy                           0.97       120
   macro avg       0.97      0.97      0.97       120
weighted avg       0.97      0.97      0.97       120



#### visualize

In [32]:
dot_data = export_graphviz(clf, out_file=None) 
graph = Source(dot_data) 

graph.render('iris_decision_tree', view=True)

'iris_decision_tree.pdf'

#### Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

In [79]:
# create random forest classifier
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', \
                       min_samples_leaf=1, n_estimators=100, max_depth=20, random_state=123)
# fit the object
rf.fit(X_train, y_train)
# print features
rf.feature_importances_

array([0.09289778, 0.02081047, 0.46263682, 0.42365493])

In [71]:
y_pred = rf.predict(X_train)
y_pred_proba = rf.predict_proba(X_train)

#### Evaluate your results using the model score, confusion matrix, and classification report.

In [72]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 1.00


#### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [77]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[37  0  0]
 [ 0 44  0]
 [ 0  0 39]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       1.00      1.00      1.00        44
           2       1.00      1.00      1.00        39

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120



#### Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [84]:
# create random forest classifier
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', \
                       min_samples_leaf=5, n_estimators=100, max_depth=3, random_state=123)
# fit the object
rf.fit(X_train, y_train)
# print features
rf.feature_importances_

array([0.08851614, 0.00438857, 0.47155058, 0.43554471])

In [85]:
y_pred = rf.predict(X_train)
y_pred_proba = rf.predict_proba(X_train)

In [86]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [87]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[37  0  0]
 [ 0 42  2]
 [ 0  2 37]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       0.95      0.95      0.95        44
           2       0.95      0.95      0.95        39

    accuracy                           0.97       120
   macro avg       0.97      0.97      0.97       120
weighted avg       0.97      0.97      0.97       120



#### What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?