# In these exercises, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluating, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

In [13]:
import pydataset
from env import get_db_url

import pandas as pd
import numpy as np

from prepare import prep_titanic
from prepare import titanic_split

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the prepared titanic data and attach a constant baseline prediction of 0,
# so every passenger is guessed to be in class 0 of `survived`.
df = prep_titanic().assign(baseline_prediction=0)
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S,baseline_prediction
0,0,0,3,male,1,0,7.25,S,0,1,0,1,0
1,1,1,1,female,1,0,71.2833,C,0,0,0,0,0
2,2,1,3,female,0,0,7.925,S,1,0,0,1,0
3,3,1,1,female,1,0,53.1,S,0,0,0,1,0
4,4,0,3,male,0,0,8.05,S,1,1,0,1,0


In [3]:
# Class balance of the target: how many rows fall in each `survived` value
df['survived'].value_counts()

0    549
1    342
Name: survived, dtype: int64

## Baseline Accuracy

In [4]:
# Fraction of rows where the actual outcome equals the constant baseline guess
baseline_accuracy = df['survived'].eq(df['baseline_prediction']).mean()
baseline_accuracy

0.6161616161616161

In [5]:
# Store fare as float64 before it is used as a model feature
df = df.astype({'fare': 'float64'})

In [6]:
# One-hot encode passenger class into indicator columns named pclass_<value>
dummy_df = pd.get_dummies(df['pclass'], prefix='pclass')

# Append the indicator columns to the right of the existing columns.
# Note: concat does not check for existing names, so re-running this cell
# would append the same pclass_* columns a second time.
df = pd.concat((df, dummy_df), axis='columns')

In [7]:
# Confirm the pclass_* dummy columns are now part of the frame
df.columns

Index(['passenger_id', 'survived', 'pclass', 'sex', 'sibsp', 'parch', 'fare',
       'embarked', 'alone', 'sex_male', 'embarked_Q', 'embarked_S',
       'baseline_prediction', 'pclass_1', 'pclass_2', 'pclass_3'],
      dtype='object')

## Train, Validate, Test

In [8]:
# Split with the project's titanic_split helper. It hands back four objects,
# unpacked here as df (re-bound) plus the train, validate and test frames.
# NOTE(review): split proportions and any stratification are defined in
# prepare.py, not here — confirm there before relying on them.
df, train, validate, test = titanic_split(df)

In [9]:
# inspect
# Summarise each split. DataFrame.info() prints its report and returns None,
# so each call is made as its own statement instead of being collected into a
# tuple expression that the notebook would echo back as (None, None, None).
for split_name, split_df in (('train', train), ('validate', validate), ('test', test)):
    print(f'--- {split_name} ---')
    split_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 534 entries, 455 to 496
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   passenger_id         534 non-null    int64  
 1   survived             534 non-null    int64  
 2   pclass               534 non-null    int64  
 3   sex                  534 non-null    object 
 4   sibsp                534 non-null    int64  
 5   parch                534 non-null    int64  
 6   fare                 534 non-null    float64
 7   embarked             534 non-null    object 
 8   alone                534 non-null    int64  
 9   sex_male             534 non-null    uint8  
 10  embarked_Q           534 non-null    uint8  
 11  embarked_S           534 non-null    uint8  
 12  baseline_prediction  534 non-null    int64  
 13  pclass_1             534 non-null    uint8  
 14  pclass_2             534 non-null    uint8  
 15  pclass_3             534 non-null    u

(None, None, None)

## * For all of the models you create, choose a threshold that optimizes for accuracy.

# Create a new notebook, logistic_regression, and use it to answer the following questions:

## 1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

## Keeping only the feature columns for the model

In [10]:
# Build X (feature matrix) and y (target series) for each of train/validate/test.
# The features are named once and selected explicitly, instead of repeating the
# same 13-column drop list for every split, so the three X frames cannot drift
# apart and columns added upstream cannot slip into the model unnoticed.
# NOTE(review): question 1 asks for age, fare and pclass, but the frame has no
# age column and the original drop list left sex_male in. The same three columns
# are kept here; confirm that is the intended feature set.
features = ['pclass', 'fare', 'sex_male']

X_train = train[features]
y_train = train.survived
X_validate = validate[features]
y_validate = validate.survived
X_test = test[features]
y_test = test.survived

In [None]:
# Logistic regression for question 1, trained on the features held in X_train.
# No class_weight is passed: the classes are only mildly imbalanced (549 vs 342
# in the full data) and the exercise asks for accuracy, whereas weighting class 1
# at 99x class 0 would heavily favour predicting class 1.
logit = LogisticRegression(C=1, random_state=123, intercept_scaling=1, solver='lbfgs')
logit.fit(X_train, y_train)

# Mean accuracy on train and validate, for comparison with baseline_accuracy
logit.score(X_train, y_train), logit.score(X_validate, y_validate)


In [12]:
# Check which feature columns ended up in X_train
X_train.columns

Index(['pclass', 'fare', 'sex_male'], dtype='object')

## 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

## 3. Try out other combinations of features and models.

## 4. Use your best 3 models to predict and evaluate on your validate sample.

## 5. Choose your best model from the validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?