# Kaggle Competition: Titanic
Predicting whether an individual survived or not

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns',100)

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

_________________

# Titanic Feature Engineering

In [2]:
# Load the cleaned Titanic data into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='titanic_cleaned.csv')

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(891, 14)

In [4]:
# Peek at the last three rows to sanity-check the load.
df.tail(n=3)

Unnamed: 0,survived,pclass,isFemale,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,alone,age_missing
888,0,3,1,0.0,1,2,23.45,S,3,woman,0,Missing,0,1
889,1,1,0,26.0,0,0,30.0,C,1,man,1,C,1,0
890,0,3,0,32.0,0,0,7.75,Q,3,man,1,Missing,1,0


_________________

## Domain Knowledge 

In [8]:
# Re-display the last three rows as a reference point before engineering features.
df.tail(n=3)

Unnamed: 0,survived,pclass,isFemale,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,alone,age_missing
888,0,3,1,0.0,1,2,23.45,S,3,woman,0,Missing,0,1
889,1,1,0,26.0,0,0,30.0,C,1,man,1,C,1,0
890,0,3,0,32.0,0,0,7.75,Q,3,man,1,Missing,1,0


### Boolean masks

Boolean masks allow you to easily create indicator variables based on conditionals. They empower
you to specifically isolate information that you suspect might be important for the model.

### Interaction Features

We will create a `group_size` feature: the total size of the party an individual travelled with, computed from `sibsp` and `parch`. The count includes the individual themselves, so the smallest possible group size is 1.

In [10]:
# Party size = relatives counted in sibsp and parch, plus the passenger themselves.
df['group_size'] = df['sibsp'] + df['parch'] + 1

# Show the last three rows to confirm the new column.
df.tail(n=3)

##  Sparse Classes

In [15]:
# Preview the object-dtype (categorical) columns only.
df.select_dtypes(include='object').head(n=3)

Unnamed: 0,embarked,who,deck
0,S,man,Missing
1,C,woman,C
2,S,woman,Missing


### Similar Classes
There are only a few classes, and they are distinct enough that there is no need to group several of them together.

### Other Classes

In [17]:
# Frequency of each deck label, most common first.
df['deck'].value_counts()

Missing    688
C           59
B           47
D           33
E           32
A           15
F           13
G            4
Name: deck, dtype: int64

The sparse 'A', 'F', and 'G' values of the 'deck' feature could be grouped together, but for now we will leave them as is.

## Dummy Variables

In [19]:
# Show the object-dtype columns that still need to be encoded.
df.select_dtypes(include='object').head(n=3)

Unnamed: 0,embarked,who,deck
0,S,man,Missing
1,C,woman,C
2,S,woman,Missing


In [25]:
# Collect the names of all object-dtype columns as a plain Python list.
categorical_colums = df.select_dtypes(include=[object]).columns.tolist()
print(categorical_colums)

['embarked', 'who', 'deck']


In [28]:
# One-hot encode every column listed in categorical_colums.
# dtype=int keeps the indicator columns as 0/1 integers on every pandas
# version; pandas >= 2.0 would otherwise produce booleans, which would be
# written to the saved table as True/False instead of 0/1.
df = pd.get_dummies(df, columns=categorical_colums, dtype=int)

In [29]:
# Inspect the first five rows after dummy encoding.
df.head(n=5)

Unnamed: 0,survived,pclass,isFemale,age,sibsp,parch,fare,class,adult_male,alone,age_missing,group_size,embarked_C,embarked_Q,embarked_S,who_child,who_man,who_woman,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_Missing
0,0,3,0,22.0,1,0,7.25,3,1,0,0,2,0,0,1,0,1,0,0,0,0,0,0,0,0,1
1,1,1,1,38.0,1,0,71.2833,1,0,0,0,2,1,0,0,0,0,1,0,0,1,0,0,0,0,0
2,1,3,1,26.0,0,0,7.925,3,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1
3,1,1,1,35.0,1,0,53.1,1,0,0,0,2,0,0,1,0,0,1,0,0,1,0,0,0,0,0
4,0,3,0,35.0,0,0,8.05,3,1,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1


## Remove Unused/Redundant

There are no unused/redundant features here. 

# Save the analytic base table

In [31]:
# Count missing values per column before saving the table.
df.isna().sum()

survived        0
pclass          0
isFemale        0
age             0
sibsp           0
parch           0
fare            0
class           0
adult_male      0
alone           0
age_missing     0
group_size      0
embarked_C      0
embarked_Q      0
embarked_S      0
who_child       0
who_man         0
who_woman       0
deck_A          0
deck_B          0
deck_C          0
deck_D          0
deck_E          0
deck_F          0
deck_G          0
deck_Missing    0
dtype: int64

In [30]:
# Save the analytic base table. index=False (the documented boolean form)
# leaves the row index out of the file.
df.to_csv('titanic_base_table.csv', index=False)