# Import libraries

In [174]:
from pathlib import Path
import os

import pandas as pd
import numpy as np
import my_functions as my_func

import seaborn as sns
import matplotlib.pyplot as plt

# ML models 
from autogluon.tabular import TabularPredictor
#from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor


# Data

In [146]:
# Project root: the parent of the directory the notebook is run from.
path = Path.cwd().parent
path

PosixPath('/Users/work/Desktop/github/data-science-projects/titanic')

In [147]:
# Load both splits relative to the project root (`path`) rather than through a
# machine-specific absolute path, so the notebook runs from any checkout.
data_dir = path / 'data'
train_df = pd.read_csv(data_dir / 'titanic_train.csv')
test_df = pd.read_csv(data_dir / 'titanic_test.csv')

## Data dictionary 
- PassengerId: unique identifier for each passenger
- Survived: whether the passenger survived or not (0 = No, 1 = Yes)
- Pclass: passenger class (1 = 1st class, 2 = 2nd class, 3 = 3rd class)
- Name: name of the passenger
- Sex: gender of the passenger
- Age: age of the passenger (in years)
- SibSp: number of siblings/spouses aboard the Titanic
- Parch: number of parents/children aboard the Titanic
- Ticket: ticket number
- Fare: fare paid by the passenger
- Cabin: cabin number
- Embarked: port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

# Data prep 

In [148]:
# Peek at the first five raw training rows.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Change data type 

In [149]:
# Show the dtypes as loaded, before any conversion.
print(train_df.dtypes)

# Integer-coded columns that are really labels/identifiers are converted to str.
# The test split has no 'Survived' column in its list.
# NOTE(review): my_func.change_dtype is defined outside this notebook; presumably
# it casts each listed column to `type` and returns the frame - confirm.
train_str_cols = ['PassengerId', 'Survived', 'Pclass']
test_str_cols = ['PassengerId', 'Pclass']

train_df = my_func.change_dtype(
    dataset=train_df,
    dtype_list=train_str_cols,
    type=str,
)
test_df = my_func.change_dtype(
    dataset=test_df,
    dtype_list=test_str_cols,
    type=str,
)

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


## Check variables for number/percentage of null and unique values 

In [150]:
# Prints unique/null counts per column and hands back two lists of column names:
# those that are mostly null and those with a very high share of unique values.
many_nulls, many_unique_vals = my_func.descriptive_stats(train_df)

PassengerId
Number of unique values: 891 (100.0%)
Number of null values: 0 (0.0%)
-----------------------------------------
Survived
Number of unique values: 2 (0.22%)
Number of null values: 0 (0.0%)
-----------------------------------------
Pclass
Number of unique values: 3 (0.34%)
Number of null values: 0 (0.0%)
-----------------------------------------
Name
Number of unique values: 891 (100.0%)
Number of null values: 0 (0.0%)
-----------------------------------------
Sex
Number of unique values: 2 (0.22%)
Number of null values: 0 (0.0%)
-----------------------------------------
Age
Number of unique values: 88 (9.88%)
Number of null values: 177 (19.87%)
-----------------------------------------
SibSp
Number of unique values: 7 (0.79%)
Number of null values: 0 (0.0%)
-----------------------------------------
Parch
Number of unique values: 7 (0.79%)
Number of null values: 0 (0.0%)
-----------------------------------------
Ticket
Number of unique values: 681 (76.43%)
Number of null valu

In [151]:
# Report the columns flagged by the descriptive-stats pass, one line per criterion.
print(
    f"Variables with many unique values (>75%): {many_unique_vals}",
    f"Variables that are over 50% null: {many_nulls}",
    sep="\n",
)

Variables with many unique values (>75%): ['PassengerId', 'Name', 'Ticket']
Variables that are over 50% null: ['Cabin']


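## Drop variables that have too many nulls and/or too many unique values
- Too many unique values are a problem for categorical variables
- `Name` is also flagged as having many unique values, but it is kept for now so that the passenger's title can be extracted from it in the feature-engineering section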

In [152]:
# Remove the columns that have either many nulls or many unique values;
# the same set is removed from both splits.
cols_to_drop = ['PassengerId', 'Ticket', 'Cabin']
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop)

# Preliminary EDA

In [172]:
# First five rows of the training frame at this point.
# NOTE(review): the saved output already contains the 'Title' column, which is
# only created in the "Feature engineering" section further down - this cell was
# executed out of order and will look different after Restart & Run All.
train_df.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,Mrs
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,Mrs
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,Mr


In [186]:
# Survival: share of passengers per outcome
survival_share = train_df['Survived'].value_counts(normalize=True)
survival_share
# Most passengers did not survive

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [204]:
# Did survival differ by passenger class, sex, number of parents/children
# aboard, or port of embarkation?
for group_col in ('Pclass', 'Sex', 'Parch', 'Embarked'):
    survival_by_group = train_df.groupby(group_col)['Survived'].value_counts(normalize=True)
    print(survival_by_group, '---------------------------------------', sep='\n')

# Notes:
# Pclass: class 1 passengers survived more often, class 3 passengers less often
# Sex: females were more likely to survive
# Parch: no clear pattern
# Embarked: passengers who embarked at Q and S were less likely to survive

Pclass  Survived
1       1           0.629630
        0           0.370370
2       0           0.527174
        1           0.472826
3       0           0.757637
        1           0.242363
Name: Survived, dtype: float64
---------------------------------------
Sex     Survived
female  1           0.742038
        0           0.257962
male    0           0.811092
        1           0.188908
Name: Survived, dtype: float64
---------------------------------------
Parch  Survived
0      0           0.656342
       1           0.343658
1      1           0.550847
       0           0.449153
2      0           0.500000
       1           0.500000
3      1           0.600000
       0           0.400000
4      0           1.000000
5      0           0.800000
       1           0.200000
6      0           1.000000
Name: Survived, dtype: float64
---------------------------------------
Embarked  Survived
C         1           0.553571
          0           0.446429
Q         0           0.610390

# Feature engineering

## Name variable

In [153]:
# The title sits right after the first comma and ends at the next period,
# e.g. "Braund, Mr. Owen Harris" -> "Mr".
def extract_title(names):
    """Return the title embedded in each passenger name.

    Parameters
    ----------
    names : pandas.Series of str
        Names containing a comma followed by a title and a period.

    Returns
    -------
    pandas.Series
        The text between the first comma and the following period, with
        leading whitespace removed.
    """
    after_comma = names.str.split(',').str[1]
    return after_comma.str.split('.').str[0].str.lstrip()


# Same extraction for both splits so the feature is defined identically.
train_df['Title'] = extract_title(train_df['Name'])
test_df['Title'] = extract_title(test_df['Name'])

In [154]:
# Frequency of each extracted title in the training split.
train_df.loc[:, 'Title'].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: Title, dtype: int64

In [155]:
def group_titles(df):
    """Collapse rare titles into coarse, sex-specific buckets.

    Steps (applied to a copy; the input frame is not modified):
    1. 'Ms' is merged into 'Miss'.
    2. Female rows whose title is not 'Miss' or 'Mrs' become 'upper-class-female'.
    3. Male rows whose title is not 'Mr' become 'upper-class-male'.

    Re-applying the function to its own output changes nothing, so re-running
    the cell is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Sex' and 'Title' columns.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the grouped 'Title' column.
    """
    grouped = df.copy()

    # replace Ms with Miss (treated as the same category)
    grouped['Title'] = grouped['Title'].replace({'Ms': 'Miss'})

    # Titles other than Mrs, Miss, or Mr seem to be upper-class titles.
    # upper-class females
    rare_female = (grouped['Sex'] == 'female') & ~grouped['Title'].isin(['Miss', 'Mrs'])
    grouped.loc[rare_female, 'Title'] = 'upper-class-female'

    # upper-class males
    # NOTE(review): 'Master' (40 training rows) also lands in this bucket under
    # the rule below - confirm that is intended.
    rare_male = (grouped['Sex'] == 'male') & ~grouped['Title'].isin(['Mr'])
    grouped.loc[rare_male, 'Title'] = 'upper-class-male'

    return grouped


# Apply the identical grouping to both splits; otherwise the test split keeps
# raw titles that never occur in the training data.
train_df = group_titles(train_df)
test_df = group_titles(test_df)

In [156]:
# Title frequencies after grouping the rare titles.
train_df.loc[:, 'Title'].value_counts()

Mr                    517
Miss                  183
Mrs                   125
upper-class-male       60
upper-class-female      6
Name: Title, dtype: int64

In [207]:
# 'Name' is no longer needed now that 'Title' has been derived from it.
train_df = train_df.drop(columns='Name')