# Logistic Regression (titanic)

In [112]:
import acquire
import prepare
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
import classy
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [113]:
df = prepare.prep_titanic(acquire.get_titanic_data())

Use this as dbc to help get your SQL data: Engine(mysql+pymysql://ada_665:***@157.230.209.171/titanic_db)
Don't forget to assign to a df
fill
drop
encode


In [114]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,3
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,3
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,3
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,3


In [115]:
df.isnull().sum()

passenger_id         0
survived             0
pclass               0
sex                  0
age                177
sibsp                0
parch                0
fare                 0
embarked             0
class                0
embark_town          0
alone                0
embarked_encode      0
dtype: int64

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
passenger_id       891 non-null int64
survived           891 non-null int64
pclass             891 non-null int64
sex                891 non-null object
age                714 non-null float64
sibsp              891 non-null int64
parch              891 non-null int64
fare               891 non-null float64
embarked           891 non-null object
class              891 non-null object
embark_town        891 non-null object
alone              891 non-null int64
embarked_encode    891 non-null int64
dtypes: float64(2), int64(7), object(4)
memory usage: 90.6+ KB


Getting rid of null values.

In [117]:
df.dropna(inplace=True)

In [118]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
passenger_id       714 non-null int64
survived           714 non-null int64
pclass             714 non-null int64
sex                714 non-null object
age                714 non-null float64
sibsp              714 non-null int64
parch              714 non-null int64
fare               714 non-null float64
embarked           714 non-null object
class              714 non-null object
embark_town        714 non-null object
alone              714 non-null int64
embarked_encode    714 non-null int64
dtypes: float64(2), int64(7), object(4)
memory usage: 78.1+ KB


In [119]:
df.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,embarked_encode
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,447.582633,0.406162,2.236695,29.699118,0.512605,0.431373,34.694514,0.565826,2.408964
std,259.119524,0.49146,0.83825,14.526497,0.929783,0.853289,52.91893,0.495995,1.158288
min,0.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0
25%,221.25,0.0,1.0,20.125,0.0,0.0,8.05,0.0,3.0
50%,444.0,0.0,2.0,28.0,0.0,0.0,15.7417,1.0,3.0
75%,676.75,1.0,3.0,38.0,1.0,1.0,33.375,1.0,3.0
max,890.0,1.0,3.0,80.0,5.0,6.0,512.3292,1.0,3.0


Splitting the data into train and test sets. Random state is `123` and  training size is `0.7`.

In [120]:
train, test = prepare.split_it(df, strat=df[['survived']])

Parameters are df, train_size, random_state, and stratify
Returns train, test




Making sure everything matches up.

In [121]:
train.shape

(499, 13)

In [122]:
test.shape

(215, 13)

Min-max normalization for `age` and `fare`.

In [123]:
train, scaler = prepare.min_max(train, ['age', 'fare'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train[list_of_what_to_standardize] = scaler.transform(train[list_of_what_to_standardize])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Parameters are a training df and a list of what columns to standardize.
Returns the normalized training df and the scaler for scaling the test set.


In [124]:
scaler

MinMaxScaler(copy=True, feature_range=(0, 1))

In [125]:
train.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,embarked_encode
count,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0
mean,459.09018,0.406814,2.238477,0.36928,0.53507,0.430862,0.068261,0.561122,2.402806
std,254.343216,0.491733,0.841472,0.18811,0.931528,0.855369,0.102274,0.496748,1.163394
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,246.5,0.0,1.0,0.258608,0.0,0.0,0.015713,0.0,3.0
50%,455.0,0.0,3.0,0.346569,0.0,0.0,0.030937,1.0,3.0
75%,683.5,1.0,3.0,0.472229,1.0,1.0,0.067096,1.0,3.0
max,890.0,1.0,3.0,1.0,5.0,6.0,1.0,1.0,3.0


Normalizing the test data set using the scaler from the train data set.

In [126]:
test[['age', 'fare']] = scaler.transform(test[['age', 'fare']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [127]:
test.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,embarked_encode
count,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0
mean,420.874419,0.404651,2.232558,0.364766,0.460465,0.432558,0.066462,0.576744,2.423256
std,268.58584,0.49197,0.832663,0.169286,0.925783,0.850432,0.105844,0.495228,1.14892
min,1.0,0.0,1.0,0.005152,0.0,0.0,0.0,0.0,0.0
25%,161.5,0.0,1.0,0.246042,0.0,0.0,0.01812,0.0,3.0
50%,419.0,0.0,2.0,0.346569,0.0,0.0,0.029278,1.0,3.0
75%,658.5,1.0,3.0,0.472229,1.0,1.0,0.061045,1.0,3.0
max,889.0,1.0,3.0,0.798944,5.0,5.0,1.0,1.0,3.0


## Model 1

Modeling using default parameters with random state set to `123`. 

In [19]:
model = LogisticRegression(random_state=123)

In [20]:
model.fit(train[['pclass', 'age', 'sibsp', 'parch', 'fare']], train.survived)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

Creating a new column for predictions.

In [21]:
train['predictions'] = model.predict(train[['pclass', 'age', 'sibsp', 'parch', 'fare']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
train.sample(10)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode,predictions
546,546,1,2,female,0.233476,1,0,0.050749,S,Second,Southampton,0,3,0
852,852,0,3,female,0.107816,1,1,0.029758,C,Third,Cherbourg,0,0,0
427,427,1,2,female,0.233476,0,0,0.050749,S,Second,Southampton,1,3,1
834,834,0,3,male,0.22091,0,0,0.016201,S,Third,Southampton,1,3,0
781,781,1,1,female,0.208344,1,0,0.111257,S,First,Southampton,0,3,1
699,699,0,3,male,0.522493,0,0,0.014932,S,Third,Southampton,1,3,0
237,237,1,2,female,0.09525,0,2,0.051237,S,Second,Southampton,0,3,1
88,88,1,1,female,0.28374,3,2,0.513342,S,First,Southampton,0,3,1
850,850,0,3,male,0.044986,4,2,0.061045,S,Third,Southampton,0,3,0
224,224,1,1,male,0.472229,1,0,0.175668,S,First,Southampton,0,3,1


Showing the confusion matrix and assigning the numbers to variables (true positive, false negative, etc).

In [23]:
cm = confusion_matrix(train.survived, train.predictions)
cm
TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]

cm

array([[248,  48],
       [ 97, 106]])

Printing out the classification report.

In [24]:
names = ['did not survive (as +)', 'survived (as +)']
print(classification_report(train.survived, train.predictions, target_names=names))

                        precision    recall  f1-score   support

did not survive (as +)       0.72      0.84      0.77       296
       survived (as +)       0.69      0.52      0.59       203

             micro avg       0.71      0.71      0.71       499
             macro avg       0.70      0.68      0.68       499
          weighted avg       0.71      0.71      0.70       499



In [25]:
print(f'accuracy: {(TP + TN)/(TP + TN + FP + FN)}')
print(f'true positive rate: {TP/(TP + FN)}')
print(f'false positive rate: {FP/(FP + TN)}')
print(f'true negative rate: {TN/(TN + FP)}')
print(f'false negative rate: {FN/(FN + TP)}')
print(f'precision: {TP/(TP + FP)}')
print(f'recall: {TP/(TP + FN)}')
print(f'f1-score: {((TP/(TP + FP)) + (TP/(TP + FN)))/2}')
print(f'support:\n    did not survive: {TN + FP}\n    survivied: {TP + FN}')

accuracy: 0.7094188376753507
true positive rate: 0.5221674876847291
false positive rate: 0.16216216216216217
true negative rate: 0.8378378378378378
false negative rate: 0.47783251231527096
precision: 0.6883116883116883
recall: 0.5221674876847291
f1-score: 0.6052395879982087
support:
    did not survive: 296
    survivied: 203


## Model 2 Using a Different Solver

`liblinear` is used for binary models (only 2 outcomes/classes) and small datasets.. Other solver options allow for multilinear logistic regressions. `sag` and `saga` are faster for larger datasets. For the sake of this model, the default of `liblinear` works fine. `liblinear` and `saga` handle L1 penalty, while `newton-cg`, `lbfgs` and `sag` handle L2 penalty. L1 might work better when lots of features are present.

In [26]:
train.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,embarked_encode,predictions
count,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0
mean,459.09018,0.406814,2.238477,0.36928,0.53507,0.430862,0.068261,0.561122,2.402806,0.308617
std,254.343216,0.491733,0.841472,0.18811,0.931528,0.855369,0.102274,0.496748,1.163394,0.462386
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,246.5,0.0,1.0,0.258608,0.0,0.0,0.015713,0.0,3.0,0.0
50%,455.0,0.0,3.0,0.346569,0.0,0.0,0.030937,1.0,3.0,0.0
75%,683.5,1.0,3.0,0.472229,1.0,1.0,0.067096,1.0,3.0,1.0
max,890.0,1.0,3.0,1.0,5.0,6.0,1.0,1.0,3.0,1.0


Creating a second model with random state set to `123`, but solver is `saga`.

In [27]:
model2 = LogisticRegression(solver='saga', random_state=123)
model2.fit(train[['pclass', 'age', 'sibsp', 'parch', 'fare']], train.survived)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [28]:
train['more_predictions'] = model2.predict(train[['pclass', 'age', 'sibsp', 'parch', 'fare']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [29]:
model2.score(train[['pclass', 'age', 'sibsp', 'parch', 'fare']], train.survived)

0.7134268537074149

In [30]:
classy.TwoByTwo_cm_printouts(confusion_matrix(train.survived, train.more_predictions))

These stats are for 1 being what is considered positive.
accuracy: 0.7134268537074149
true positive rate: 0.5320197044334976
false positive rate: 0.16216216216216217
true negative rate: 0.8378378378378378
false negative rate: 0.46798029556650245
precision: 0.6923076923076923
recall: 0.5320197044334976
f1-score: 0.6121636983705949
support:
    did not survive: 296
    survivied: 203


The second model, where `solver='saga'`, worked better than the first model, where `solver='liblinear'`.

## Testing The Model

In [31]:
test['predictions'] = model2.predict(test[['pclass', 'age', 'sibsp', 'parch', 'fare']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
model2.score(test[['pclass', 'age', 'sibsp', 'parch', 'fare']], test.survived)

0.6976744186046512

In [33]:
confusion_matrix(test.survived, test.predictions)

array([[102,  26],
       [ 39,  48]])

In [34]:
print(classification_report(test.survived, test.predictions))

              precision    recall  f1-score   support

           0       0.72      0.80      0.76       128
           1       0.65      0.55      0.60        87

   micro avg       0.70      0.70      0.70       215
   macro avg       0.69      0.67      0.68       215
weighted avg       0.69      0.70      0.69       215



In [35]:
classy.TwoByTwo_cm_printouts(confusion_matrix(test.survived, test.predictions))

These stats are for 1 being what is considered positive.
accuracy: 0.6976744186046512
true positive rate: 0.5517241379310345
false positive rate: 0.203125
true negative rate: 0.796875
false negative rate: 0.4482758620689655
precision: 0.6486486486486487
recall: 0.5517241379310345
f1-score: 0.6001863932898416
support:
    did not survive: 128
    survivied: 87


In [36]:
logit_fit = model2

# Logistic Regression (iris)

In [128]:
df = prepare.prep_iris(acquire.get_iris_data())

Use this as dbc to help get your SQL data: Engine(mysql+pymysql://ada_665:***@157.230.209.171/iris_db)
Don't forget to assign to a df


In [129]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_encode
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [130]:
train, test = prepare.split_it(df, strat=df[['species']])
print(train.shape)
print(test.shape)

Parameters are df, train_size, random_state, and stratify
Returns train, test
(105, 6)
(45, 6)




In [132]:
lr = LogisticRegression(random_state=123, solver='sag')
lr.fit(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)
lr_predict = lr.predict(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])



In [137]:
lr_score = lr.score(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)
lr_score

0.9619047619047619

In [134]:
confusion_matrix(train.species, lr_predict)

array([[35,  0,  0],
       [ 0, 31,  4],
       [ 0,  0, 35]])

In [136]:
print(classification_report(train.species, lr_predict))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       1.00      0.89      0.94        35
   virginica       0.90      1.00      0.95        35

   micro avg       0.96      0.96      0.96       105
   macro avg       0.97      0.96      0.96       105
weighted avg       0.97      0.96      0.96       105



# Decision Tree (iris)

In [37]:
iris = prepare.prep_iris(acquire.get_iris_data())
iris.head()

Use this as dbc to help get your SQL data: Engine(mysql+pymysql://ada_665:***@157.230.209.171/iris_db)
Don't forget to assign to a df


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_encode
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [38]:
train, test = prepare.split_it(iris, training_size=0.8, strat=iris[['species']])

Parameters are df, train_size, random_state, and stratify
Returns train, test




In [39]:
train.shape

(120, 6)

In [40]:
test.shape

(30, 6)

Creating the decision tree object. Random state is `123` and max depth is 3.

In [41]:
tree = DecisionTreeClassifier(random_state=123, max_depth=3)

In [42]:
tree.fit(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [43]:
train['prediction'] = tree.predict(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [44]:
train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_encode,prediction
99,5.7,2.8,4.1,1.3,versicolor,1,versicolor
35,5.0,3.2,1.2,0.2,setosa,0,setosa
87,6.3,2.3,4.4,1.3,versicolor,1,versicolor
5,5.4,3.9,1.7,0.4,setosa,0,setosa
10,5.4,3.7,1.5,0.2,setosa,0,setosa


In [45]:
pred_prob = tree.predict_proba(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])

Model can be 100% accurate but it is not due to setting a limit to depth so overfitting is avoided. Still pretty good.

In [46]:
tree.score(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)

0.975

In [47]:
confusion_matrix(train.species, train.prediction)

array([[40,  0,  0],
       [ 0, 37,  3],
       [ 0,  0, 40]])

In [48]:
# print('Stats using virginica')
# print(f'accuracy: {(40+37+40)/(40+37+3+40)}')
# print(f'true positive rate: {40/40}')
# print(f'false positive rate: {3/80}')
# print(f'true negative rate: {}')
# print(f'false negative rate: {}')
# print(f'precision: {}')
# print(f'recall: {40/40}')
# print(f'f1-score: {}')
# print(f'support: {}')

In [49]:
print(classification_report(train.species, train.prediction))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        40
  versicolor       1.00      0.93      0.96        40
   virginica       0.93      1.00      0.96        40

   micro avg       0.97      0.97      0.97       120
   macro avg       0.98      0.97      0.97       120
weighted avg       0.98      0.97      0.97       120



## Using a Different Criterion (entropy)

Same parameters as previous model, but criterion is set to `entropy` (information gain).

In [50]:
model2 = DecisionTreeClassifier(random_state=123, max_depth=3, criterion='entropy')
model2.fit(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)
train['more_predictions'] = model2.predict(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [51]:
train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_encode,prediction,more_predictions
99,5.7,2.8,4.1,1.3,versicolor,1,versicolor,versicolor
35,5.0,3.2,1.2,0.2,setosa,0,setosa,setosa
87,6.3,2.3,4.4,1.3,versicolor,1,versicolor,versicolor
5,5.4,3.9,1.7,0.4,setosa,0,setosa,setosa
10,5.4,3.7,1.5,0.2,setosa,0,setosa,setosa


In [52]:
model2.score(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)

0.975

In [53]:
confusion_matrix(train.species, train.more_predictions)

array([[40,  0,  0],
       [ 0, 37,  3],
       [ 0,  0, 40]])

In [54]:
print(classification_report(train.species, train.more_predictions))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        40
  versicolor       1.00      0.93      0.96        40
   virginica       0.93      1.00      0.96        40

   micro avg       0.97      0.97      0.97       120
   macro avg       0.98      0.97      0.97       120
weighted avg       0.98      0.97      0.97       120



Both models performed the same.

In [55]:
tree_fit = tree

## Testing The Model

In [56]:
test['predictions'] = tree.predict(test[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [57]:
test.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_encode,predictions
80,5.5,2.4,3.8,1.1,versicolor,1,versicolor
45,4.8,3.0,1.4,0.3,setosa,0,setosa
144,6.7,3.3,5.7,2.5,virginica,2,virginica
110,6.5,3.2,5.1,2.0,virginica,2,virginica
38,4.4,3.0,1.3,0.2,setosa,0,setosa


In [58]:
tree.score(test[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], test.species)

0.9

In [59]:
labels = sorted(test.species.unique())

pd.DataFrame(confusion_matrix(test.species, test.predictions), index=labels, columns=labels)



Unnamed: 0,setosa,versicolor,virginica
setosa,10,0,0
versicolor,0,7,3
virginica,0,0,10


In [60]:
print(classification_report(test.species, test.predictions))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.70      0.82        10
   virginica       0.77      1.00      0.87        10

   micro avg       0.90      0.90      0.90        30
   macro avg       0.92      0.90      0.90        30
weighted avg       0.92      0.90      0.90        30



In [61]:
from sklearn.datasets import load_iris
from sklearn import tree

iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

import graphviz

dot_data = tree.export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render('iris_decision_tree', view=True)


'iris_decision_tree.pdf'

# K Nearest Neighbors (titanic)

In [3]:
df = prepare.prep_titanic(acquire.get_titanic_data())

Use this as dbc to help get your SQL data: Engine(mysql+pymysql://ada_665:***@157.230.209.171/titanic_db)
Don't forget to assign to a df
fill
drop
encode


In [4]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,3
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,3
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,3
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,3


In [28]:
df.isnull().sum()

passenger_id       0
survived           0
pclass             0
sex                0
age                0
sibsp              0
parch              0
fare               0
embarked           0
class              0
embark_town        0
alone              0
embarked_encode    0
dtype: int64

In [26]:
df.dropna(inplace=True)

In [29]:
train, test = prepare.split_it(df, strat=df[['survived']])

Parameters are df, train_size, random_state, and stratify
Returns train, test




In [30]:
print(train.shape)
print(test.shape)

(499, 13)
(215, 13)


In [31]:
train, scaler = prepare.min_max(train, ['age', 'fare'])

Parameters are a training df and a list of what columns to standardize.
Returns the normalized training df and the scaler for scaling the test set.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  train[list_of_what_to_standardize] = scaler.transform(train[list_of_what_to_standardize])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [32]:
train.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,embarked_encode
count,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0,499.0
mean,459.09018,0.406814,2.238477,0.36928,0.53507,0.430862,0.068261,0.561122,2.402806
std,254.343216,0.491733,0.841472,0.18811,0.931528,0.855369,0.102274,0.496748,1.163394
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,246.5,0.0,1.0,0.258608,0.0,0.0,0.015713,0.0,3.0
50%,455.0,0.0,3.0,0.346569,0.0,0.0,0.030937,1.0,3.0
75%,683.5,1.0,3.0,0.472229,1.0,1.0,0.067096,1.0,3.0
max,890.0,1.0,3.0,1.0,5.0,6.0,1.0,1.0,3.0


In [33]:
test[['age', 'fare']] = scaler.transform(test[['age', 'fare']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [34]:
test.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,embarked_encode
count,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0
mean,420.874419,0.404651,2.232558,0.364766,0.460465,0.432558,0.066462,0.576744,2.423256
std,268.58584,0.49197,0.832663,0.169286,0.925783,0.850432,0.105844,0.495228,1.14892
min,1.0,0.0,1.0,0.005152,0.0,0.0,0.0,0.0,0.0
25%,161.5,0.0,1.0,0.246042,0.0,0.0,0.01812,0.0,3.0
50%,419.0,0.0,2.0,0.346569,0.0,0.0,0.029278,1.0,3.0
75%,658.5,1.0,3.0,0.472229,1.0,1.0,0.061045,1.0,3.0
max,889.0,1.0,3.0,0.798944,5.0,5.0,1.0,1.0,3.0


In [35]:
train.columns

Index(['passenger_id', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
       'fare', 'embarked', 'class', 'embark_town', 'alone', 'embarked_encode'],
      dtype='object')

### KNN with K = 5

In [36]:
knn = KNeighborsClassifier()
knn.fit(train[['pclass', 'age', 'sibsp', 'parch', 'fare']], train.survived)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [37]:
knn_predictions = knn.predict(train[['pclass', 'age', 'sibsp', 'parch', 'fare']])

In [43]:
confusion_matrix(train.survived, knn_predictions)

array([[249,  47],
       [ 61, 142]])

In [39]:
classy.TwoByTwo_cm_printouts(confusion_matrix(train.survived, knn_predictions))

These stats are for 1 being what is considered positive.
accuracy: 0.7835671342685371
true positive rate: 0.6995073891625616
false positive rate: 0.15878378378378377
true negative rate: 0.8412162162162162
false negative rate: 0.30049261083743845
precision: 0.7513227513227513
recall: 0.6995073891625616
f1-score: 0.7254150702426565
support:
    did not survive: 296
    survivied: 203


In [61]:
score = knn.score(train[['pclass', 'age', 'sibsp', 'parch', 'fare']], train.survived)
score

0.7835671342685371

In [42]:
print(classification_report(train.survived, knn_predictions))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82       296
           1       0.75      0.70      0.72       203

   micro avg       0.78      0.78      0.78       499
   macro avg       0.78      0.77      0.77       499
weighted avg       0.78      0.78      0.78       499



### KNN with K = 10

In [48]:
knn_10 = KNeighborsClassifier(n_neighbors=10)
knn_10.fit(train[['pclass', 'age', 'sibsp', 'parch', 'fare']], train.survived)
knn_predictions_10 = knn_10.predict(train[['pclass', 'age', 'sibsp', 'parch', 'fare']])

In [62]:
score10 = knn_10.score(train[['pclass', 'age', 'sibsp', 'parch', 'fare']], train.survived)
score10

0.7535070140280561

In [50]:
confusion_matrix(train.survived, knn_predictions_10)

array([[261,  35],
       [ 88, 115]])

In [51]:
classy.TwoByTwo_cm_printouts(confusion_matrix(train.survived, knn_predictions_10))

These stats are for 1 being what is considered positive.
accuracy: 0.7535070140280561
true positive rate: 0.5665024630541872
false positive rate: 0.11824324324324324
true negative rate: 0.8817567567567568
false negative rate: 0.43349753694581283
precision: 0.7666666666666667
recall: 0.5665024630541872
f1-score: 0.6665845648604269
support:
    did not survive: 296
    survivied: 203


In [54]:
print(classification_report(train.survived, knn_predictions_10))

              precision    recall  f1-score   support

           0       0.75      0.88      0.81       296
           1       0.77      0.57      0.65       203

   micro avg       0.75      0.75      0.75       499
   macro avg       0.76      0.72      0.73       499
weighted avg       0.76      0.75      0.75       499



### KNN with K = 20

In [55]:
knn_20 = KNeighborsClassifier(n_neighbors=20)
knn_20.fit(train[['pclass', 'age', 'sibsp', 'parch', 'fare']], train.survived)
knn_predictions_20 = knn_20.predict(train[['pclass', 'age', 'sibsp', 'parch', 'fare']])

In [63]:
score20 = knn_20.score(train[['pclass', 'age', 'sibsp', 'parch', 'fare']], train.survived)
score20

0.751503006012024

In [57]:
confusion_matrix(train.survived, knn_predictions_20)

array([[262,  34],
       [ 90, 113]])

In [58]:
classy.TwoByTwo_cm_printouts(confusion_matrix(train.survived, knn_predictions_20))

These stats are for 1 being what is considered positive.
accuracy: 0.751503006012024
true positive rate: 0.5566502463054187
false positive rate: 0.11486486486486487
true negative rate: 0.8851351351351351
false negative rate: 0.4433497536945813
precision: 0.7687074829931972
recall: 0.5566502463054187
f1-score: 0.662678864649308
support:
    did not survive: 296
    survivied: 203


In [60]:
print(classification_report(train.survived, knn_predictions_20))

              precision    recall  f1-score   support

           0       0.74      0.89      0.81       296
           1       0.77      0.56      0.65       203

   micro avg       0.75      0.75      0.75       499
   macro avg       0.76      0.72      0.73       499
weighted avg       0.75      0.75      0.74       499



In [65]:
print('default: {}'.format(score))
print('k = 10: {}'.format(score10))
print('k = 20: {}'.format(score20))

default: 0.7835671342685371
k = 10: 0.7535070140280561
k = 20: 0.751503006012024


Accuracy for the model is going down as k increases. This can be due to n(did_not_survive) being higher than n(survived). For example, when k = 5 more instances were correctly classified. As more neighbors were considered, this caused misclassifications, probably due to more neighbors being those who died when the particular instance survived.

In [66]:
knn_fit = knn

# K Nearest Neighbors (iris)

In [88]:
df = prepare.prep_iris(acquire.get_iris_data())
df.head()

Use this as dbc to help get your SQL data: Engine(mysql+pymysql://ada_665:***@157.230.209.171/iris_db)
Don't forget to assign to a df


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_encode
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [89]:
df.isna().sum()

sepal_length      0
sepal_width       0
petal_length      0
petal_width       0
species           0
species_encode    0
dtype: int64

In [92]:
train, test = prepare.split_it(df, strat=df[['species']])

Parameters are df, train_size, random_state, and stratify
Returns train, test




In [93]:
print(train.shape)
print(test.shape)

(105, 6)
(45, 6)


### KNN with K = 5

In [94]:
knn_iris = KNeighborsClassifier()
knn_iris.fit(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)
knn_iris_predict = knn_iris.predict(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])

In [95]:
knn_iris.score(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)

0.9809523809523809

In [96]:
confusion_matrix(train.species, knn_iris_predict)

array([[35,  0,  0],
       [ 0, 34,  1],
       [ 0,  1, 34]])

In [98]:
print(classification_report(train.species, knn_iris_predict))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       0.97      0.97      0.97        35
   virginica       0.97      0.97      0.97        35

   micro avg       0.98      0.98      0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



### KNN with K = 10

In [107]:
knn_iris_10 = KNeighborsClassifier(n_neighbors=10)
knn_iris_10.fit(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)
knn_iris_predict_10 = knn_iris_10.predict(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])

In [108]:
knn_iris_10.score(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)

0.9809523809523809

In [109]:
confusion_matrix(train.species, knn_iris_predict_10)

array([[35,  0,  0],
       [ 0, 34,  1],
       [ 0,  1, 34]])

In [111]:
print(classification_report(train.species, knn_iris_predict_10))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       0.97      0.97      0.97        35
   virginica       0.97      0.97      0.97        35

   micro avg       0.98      0.98      0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



# Random Forest (iris)

In [67]:
df = prepare.prep_iris(acquire.get_iris_data())
df.head()

Use this as dbc to help get your SQL data: Engine(mysql+pymysql://ada_665:***@157.230.209.171/iris_db)
Don't forget to assign to a df


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_encode
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [72]:
df.isnull().sum()

sepal_length      0
sepal_width       0
petal_length      0
petal_width       0
species           0
species_encode    0
dtype: int64

In [69]:
train, test = prepare.split_it(df, strat=df[['species']])

Parameters are df, train_size, random_state, and stratify
Returns train, test




In [71]:
print(train.shape)
print(test.shape)

(105, 6)
(45, 6)


### Random Forest Model with max depth = 20 and min samples leaf = 1

In [74]:
rf = RandomForestClassifier(random_state=123, max_depth=20, min_samples_leaf=1)
rf.fit(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)
rf_predictions = rf.predict(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])



In [75]:
print(rf.feature_importances_)

[0.08374317 0.01660151 0.58612522 0.3135301 ]


In [76]:
rf.score(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)

1.0

In [77]:
confusion_matrix(train.species, rf_predictions)

array([[35,  0,  0],
       [ 0, 35,  0],
       [ 0,  0, 35]])

In [80]:
print(classification_report(train.species, rf_predictions))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       1.00      1.00      1.00        35
   virginica       1.00      1.00      1.00        35

   micro avg       1.00      1.00      1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105



### Random Forest with max depth = 3 and min samples leaf = 5

In [81]:
rf_short = RandomForestClassifier(random_state=123, max_depth=3, min_samples_leaf=5)
rf_short.fit(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)
rf_predictions_short = rf_short.predict(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])



In [82]:
rf_short.score(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train.species)

0.9809523809523809

In [83]:
confusion_matrix(train.species, rf_predictions_short)

array([[35,  0,  0],
       [ 0, 34,  1],
       [ 0,  1, 34]])

In [86]:
print(classification_report(train.species, rf_predictions_short))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       0.97      0.97      0.97        35
   virginica       0.97      0.97      0.97        35

   micro avg       0.98      0.98      0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



The model with max depth of 20 is more accurate than the model with a shorter depth. The depth of 20 allows it to perfectly fit the data to the training sample. Normally this would lead to overfitting, but I believe the model's accuracy will still be 100% when used on the out of sample data.

In [87]:
forest_fit = rf