In [1]:
# We'll use sklearn's DummyClassifier as a stand-in for other classification algorithms.
# It behaves the same way, and we use it the same way, as the "real" models we'll use later.

import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
import acquire
import prepare

### Data Split

In [2]:
# Acquire the raw Titanic data and pass it to the project's prep step, which
# returns the three splits unpacked below.
train, validate, test = prepare.prep_titanic(
    acquire.get_titanic_data()
)
# Cell output: the shape of each split, in train / validate / test order.
tuple(split.shape for split in (train, validate, test))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['age'] = imputer.transform(test[['age']])


((497, 14), (214, 14), (178, 14))

In [4]:
# Separate each split into its features (every column except the target)
# and its target (the `survived` column).
X_train = train.drop(columns=['survived'])
y_train = train['survived']

X_validate = validate.drop(columns=['survived'])
y_validate = validate['survived']

X_test = test.drop(columns=['survived'])
y_test = test['survived']

### Create your first model
- step 1: create the object
- step 2: fit the object on training data
- step 3: use the object (score, predict, transform)

### Ways we use sklearn classification models:
- .score gives us accuracy
- .predict lets us make predictions given a set of independent variables
- .predict_proba gives us the probability that each observation falls into each label
- some specific model types have additional properties

In [5]:
# Step 1: create the estimator object. With strategy='constant' the classifier
# always predicts the label passed as `constant` (here, class 1).
model = DummyClassifier(constant=1, strategy='constant')
# Step 2: fit the object on the training split only.
# The fitted estimator is the last expression, so it is shown as the cell output.
model.fit(X=X_train, y=y_train)

DummyClassifier(constant=1, strategy='constant')

In [6]:
# Step 3: use the fitted object -- .score returns accuracy, printed to 4 decimals.
print(f'Training accuracy: {model.score(X_train, y_train):.4f}')

Training accuracy: 0.3823


In [7]:
# Accuracy on the validate split, to compare against the training accuracy.
model.score(X=X_validate, y=y_validate)

0.38317757009345793

In [8]:
# Predicted label for every row of the validate split.
model.predict(X=X_validate)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [15]:
# Create a new column on the train DataFrame that contains the model's predictions.
# NOTE: this adds the column to `train` in place. The earlier X/y split cell only
# drops `survived`, so re-running that cell after this one would leave
# `prediction` inside X_train.
train['prediction'] = model.predict(X_train)

In [16]:
# Preview the first five rows, including the newly added `prediction` column.
train.head(n=5)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S,prediction
583,583,0,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0,1
337,337,1,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0,1
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1,1
218,218,1,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0,1
31,31,1,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0,1


In [17]:
# Use the `prediction` column created above and the actual values in the
# `survived` column to generate a classification report.
# zero_division=1 (the documented numeric form of the original `True`) makes a
# metric whose denominator is zero -- e.g. precision for a class the model never
# predicts -- show as 1.00 instead of triggering a warning.
print(classification_report(train.survived, train.prediction, zero_division=1))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00       307
           1       0.38      1.00      0.55       190

    accuracy                           0.38       497
   macro avg       0.69      0.50      0.28       497
weighted avg       0.76      0.38      0.21       497



In [18]:
# Build the same report as a DataFrame and transpose it so that each
# class / average is a row and each metric is a column (easier to read).
# zero_division=0 is passed explicitly: when a metric's denominator is zero
# (e.g. precision for a class the model never predicts) scikit-learn's default
# reports 0 *and* emits an UndefinedMetricWarning. Setting it to 0 keeps the
# same values without the warning.
pd.DataFrame(
    classification_report(
        train.survived,
        train.prediction,
        output_dict=True,
        zero_division=0,
    )
).transpose()

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,307.0
1,0.382294,1.0,0.55313,190.0
accuracy,0.382294,0.382294,0.382294,0.382294
macro avg,0.191147,0.5,0.276565,497.0
weighted avg,0.146149,0.382294,0.211458,497.0


### More Models
Now we'll make more models. Each model is a unique combination of:
- algorithm
- hyperparameters
- training data

In [20]:
# Model 1: a constant classifier that always predicts class 0.
model1 = DummyClassifier(constant=0, strategy='constant')
# Fit on the training split only.
model1.fit(X=X_train, y=y_train)
# Cell output: (train accuracy, validate accuracy).
(
    model1.score(X_train, y_train),
    model1.score(X_validate, y_validate),
)

(0.6177062374245473, 0.616822429906542)

In [23]:
# Model 2: strategy='uniform' (predictions drawn uniformly at random from the
# observed classes), with random_state fixed at 0 so results are repeatable.
model2 = DummyClassifier(random_state=0, strategy='uniform')
# Fit on the training split only.
model2.fit(X=X_train, y=y_train)
# Cell output: (train accuracy, validate accuracy).
(
    model2.score(X_train, y_train),
    model2.score(X_validate, y_validate),
)

(0.45875251509054327, 0.5)

In [24]:
# Following the pattern above, this is the first of two additional models that
# vary the hyperparameters: strategy='stratified' with random_state fixed at 123.
model3 = DummyClassifier(random_state=123, strategy='stratified')
# Fit on the training split only.
model3.fit(X=X_train, y=y_train)
# Cell output: (train accuracy, validate accuracy).
(
    model3.score(X_train, y_train),
    model3.score(X_validate, y_validate),
)

(0.5633802816901409, 0.4953271028037383)

In [25]:
# Model 4: same 'uniform' strategy as model2 but with a different seed (13).
model4 = DummyClassifier(random_state=13, strategy='uniform')
# Fit on the training split only.
model4.fit(X=X_train, y=y_train)
# Cell output: (train accuracy, validate accuracy).
(
    model4.score(X_train, y_train),
    model4.score(X_validate, y_validate),
)

(0.49899396378269617, 0.5)

### What are we looking for when evaluating model performance?
- Is the model overfit? I.e., does it perform drastically better on the training data than on the validate split?
- How good or bad is the model, i.e. how does it perform
    - compared to the other models
    - compared to the baseline model

## Compare and Finalize

TODO: compare the performance of your models on the validate split
- Model1 is our best model, with about 62% accuracy on validate (0.617)

In [26]:
# TODO: find the performance of your best model on the test split

In [27]:
# Accuracy of the chosen model (model1) on the held-out test split.
model1.score(X=X_test, y=y_test)

0.6179775280898876