In [111]:
import pandas as pd
import numpy as np
import acquire as aq
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [112]:
# Use the function you defined in acquire.py to load the titanic dataset.

In [113]:
df = aq.get_titanic_data()

In [114]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [115]:
# check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  891 non-null    int64  
 1   survived      891 non-null    int64  
 2   pclass        891 non-null    int64  
 3   sex           891 non-null    object 
 4   age           714 non-null    float64
 5   sibsp         891 non-null    int64  
 6   parch         891 non-null    int64  
 7   fare          891 non-null    float64
 8   embarked      889 non-null    object 
 9   class         891 non-null    object 
 10  deck          203 non-null    object 
 11  embark_town   889 non-null    object 
 12  alone         891 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


- age, embarked, deck and embark_town have missing values

In [116]:
# all the columns
df.columns

Index(['passenger_id', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
       'fare', 'embarked', 'class', 'deck', 'embark_town', 'alone'],
      dtype='object')

In [117]:
# only look at object cols
obj_cols = df.columns[[df[col].dtype == 'O' for col in df.columns]]
obj_cols

Index(['sex', 'embarked', 'class', 'deck', 'embark_town'], dtype='object')

In [118]:
# describe object columns 
for col in obj_cols:
    print (df[col].value_counts())
    print (df[col].value_counts(normalize = True, dropna = False))
    print ('------------------')

male      577
female    314
Name: sex, dtype: int64
male      0.647587
female    0.352413
Name: sex, dtype: float64
------------------
S    644
C    168
Q     77
Name: embarked, dtype: int64
S      0.722783
C      0.188552
Q      0.086420
NaN    0.002245
Name: embarked, dtype: float64
------------------
Third     491
First     216
Second    184
Name: class, dtype: int64
Third     0.551066
First     0.242424
Second    0.206510
Name: class, dtype: float64
------------------
C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: deck, dtype: int64
NaN    0.772166
C      0.066218
B      0.052750
D      0.037037
E      0.035915
A      0.016835
F      0.014590
G      0.004489
Name: deck, dtype: float64
------------------
Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64
Southampton    0.722783
Cherbourg      0.188552
Queenstown     0.086420
NaN            0.002245
Name: embark_town, dtype: float64
------------------


-  72% of values in embark_town are Southampton or S in embarked column
- we will replace missing values in embark_town and embarked with the most frequently occurring value

In [119]:
# Handle the missing values in the embark_town and embarked columns.
imputer = SimpleImputer(strategy='most_frequent', missing_values=None)
titanic[['embark_town']] = imputer.fit_transform(titanic[['embark_town']])

In [120]:
imputer = SimpleImputer(strategy='most_frequent', missing_values=None)
titanic[['embarked']] = imputer.fit_transform(titanic[['embarked']])

In [121]:
# after replacing missing values with most frequently occurring value
titanic[['embark_town', 'embarked']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   embark_town  891 non-null    object
 1   embarked     891 non-null    object
dtypes: object(2)
memory usage: 14.0+ KB


In [122]:
# Remove the deck column
df = df.drop(['deck'], axis =1)

In [123]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [124]:
df.columns

Index(['passenger_id', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
       'fare', 'embarked', 'class', 'embark_town', 'alone'],
      dtype='object')

In [125]:
# Create a dummy variable of the embarked column
dummy_df = pd.get_dummies(df[['embarked']], dummy_na=False, drop_first=True)
dummy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   embarked_Q  891 non-null    uint8
 1   embarked_S  891 non-null    uint8
dtypes: uint8(2)
memory usage: 1.9 KB


In [126]:
# Split Data
# train, test = train_test_split(df, test_size=0.2, random_state=1349, stratify=df.survived)
# train, validate = train_test_split(train, train_size=0.7, random_state=1349, stratify=train.survived)

In [127]:
# train, test, validate

In [139]:
# Create a helper function to split titanic data into train, validate, test datasets
def titanic_split(df):
    '''
    This function performs split on titanic data, stratifying on survived.
    Returns train, validate, and test dfs.
    '''
    train_validate, test = train_test_split(df, test_size=.2, 
                                        random_state=123, 
                                        stratify=df.survived)
    train, validate = train_test_split(train_validate, test_size=.3, 
                                   random_state=123, 
                                   stratify=train_validate.survived)
    return train, validate, test

In [129]:
train, validate, test = titanic_split(df)
train

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
583,583,0,1,male,36.0,0,0,40.1250,C,First,Cherbourg,1
165,165,1,3,male,9.0,0,2,20.5250,S,Third,Southampton,0
50,50,0,3,male,7.0,4,1,39.6875,S,Third,Southampton,0
259,259,1,2,female,50.0,0,1,26.0000,S,Second,Southampton,0
306,306,1,1,female,,0,0,110.8833,C,First,Cherbourg,1
...,...,...,...,...,...,...,...,...,...,...,...,...
313,313,0,3,male,28.0,0,0,7.8958,S,Third,Southampton,1
636,636,0,3,male,32.0,0,0,7.9250,S,Third,Southampton,1
222,222,0,3,male,51.0,0,0,8.0500,S,Third,Southampton,1
485,485,0,3,female,,3,1,25.4667,S,Third,Southampton,0


In [131]:
# Fill or impute the missing values in age
# imputer = SimpleImputer(strategy='median', missing_values=np.nan)
# df[['age']] = imputer.fit_transform(df[['age']])

In [132]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  891 non-null    int64  
 1   survived      891 non-null    int64  
 2   pclass        891 non-null    int64  
 3   sex           891 non-null    object 
 4   age           714 non-null    float64
 5   sibsp         891 non-null    int64  
 6   parch         891 non-null    int64  
 7   fare          891 non-null    float64
 8   embarked      889 non-null    object 
 9   class         891 non-null    object 
 10  embark_town   889 non-null    object 
 11  alone         891 non-null    int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 83.7+ KB


In [133]:
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

train -> (498, 12)
validate -> (214, 12)
test -> (179, 12)


In [135]:
# Build a helper function for imputing
def impute_age(df):
    imputer = SimpleImputer(strategy='median', missing_values=np.nan)
    df[['age']] = imputer.fit_transform(df[['age']])
    return df

In [137]:
impute_age(df).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  891 non-null    int64  
 1   survived      891 non-null    int64  
 2   pclass        891 non-null    int64  
 3   sex           891 non-null    object 
 4   age           891 non-null    float64
 5   sibsp         891 non-null    int64  
 6   parch         891 non-null    int64  
 7   fare          891 non-null    float64
 8   embarked      889 non-null    object 
 9   class         891 non-null    object 
 10  embark_town   889 non-null    object 
 11  alone         891 non-null    int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 83.7+ KB


In [138]:
# Create a prep_titanic function that accepts the untransformed titanic data and returns the data with the transformations above applied.