# Kaggle Competition: Titanic
Predicting whether an individual survived or not

In [19]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns',100)

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

_________________

# Titanic Data Cleaning 

In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.shape

(891, 15)

In [4]:
df.tail(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


_________________

## Unwanted Observations

### Duplicates

I am assuming that this dataset has kept all necessary records. There are duplicate rows, but since the dataset from Kaggle also contains 891 records, I will not remove any duplicates. 

### Irrelevant

In [5]:
df.groupby('survived').alive.value_counts()

survived  alive
0         no       549
1         yes      342
Name: alive, dtype: int64

In [6]:
# Cross-check 'survived' against 'alive': count the rows where both columns
# agree on "survived" (case1) and where both agree on "did not survive" (case2).
# Summing a boolean mask counts its True entries directly.
case1 = (df['survived'].eq(1) & df['alive'].eq('yes')).sum()
case2 = (df['survived'].eq(0) & df['alive'].eq('no')).sum()
print(case1)
print(case2)
print('both should add up to 891')
print('{} + {} = {}'.format(case1, case2, case1+case2))

342
549
both should add up to 891
342 + 549 = 891


The feature '*alive*' is the EXACT same as '*survived*', will drop 'alive'.

In [7]:
del df['alive']

In [8]:
df.shape

(891, 14)

In [9]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alone'],
      dtype='object')

'*Alive*' feature is gone.

In [10]:
df[['embarked','embark_town']].drop_duplicates()

Unnamed: 0,embarked,embark_town
0,S,Southampton
1,C,Cherbourg
5,Q,Queenstown
61,,


'embarked' feature is the **EXACT** same as 'embark_town'. Remove 'embark_town' feature.

In [11]:
df.iloc[61,:]

survived            1
pclass              1
sex            female
age                38
sibsp               0
parch               0
fare               80
embarked          NaN
class           First
who             woman
adult_male      False
deck                B
embark_town       NaN
alone            True
Name: 61, dtype: object

In [12]:
del df['embark_town']

In [13]:
df.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,False


_________________

## Structural Errors

### Indicator Variables

In [14]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,True


In [15]:
indicator_features = ['survived', 'sex', 'adult_male', 'alone']

In [16]:
df.dtypes

survived        int64
pclass          int64
sex            object
age           float64
sibsp           int64
parch           int64
fare          float64
embarked       object
class          object
who            object
adult_male       bool
deck           object
alone            bool
dtype: object

In [17]:
df.survived.unique()

array([0, 1])

In [18]:
del indicator_features[0]
print(indicator_features)

['sex', 'adult_male', 'alone']


In [20]:
# Rename 'sex' to 'isFemale' ahead of encoding it as a 0/1 indicator.
df = df.rename(columns={'sex': 'isFemale'})

In [21]:
df.isFemale.unique()

array(['male', 'female'], dtype=object)

In [22]:
# Encode 'female' as 1. The result is assigned back to the column explicitly:
# replace(..., inplace=True) on the attribute-accessed Series (df.isFemale)
# is chained assignment, which pandas does not guarantee will update df
# (and which never updates it under Copy-on-Write).
df['isFemale'] = df['isFemale'].replace('female', 1)
print(df.head(3))

   survived  pclass isFemale   age  sibsp  parch     fare embarked  class  \
0         0       3     male  22.0      1      0   7.2500        S  Third   
1         1       1        1  38.0      1      0  71.2833        C  First   
2         1       3        1  26.0      0      0   7.9250        S  Third   

     who adult_male deck  alone  
0    man       True  NaN  False  
1  woman      False    C  False  
2  woman      False  NaN   True  


In [23]:
# Encode 'male' as 0 and assign the result back to the column (an in-place
# replace on df.isFemale is chained assignment and may not update df).
# astype(int) pins the indicator to an integer dtype instead of relying on
# replace() to infer it, and fails loudly if any non-numeric label remains.
df['isFemale'] = df['isFemale'].replace('male', 0).astype(int)
print(df.head(3))

   survived  pclass  isFemale   age  sibsp  parch     fare embarked  class  \
0         0       3         0  22.0      1      0   7.2500        S  Third   
1         1       1         1  38.0      1      0  71.2833        C  First   
2         1       3         1  26.0      0      0   7.9250        S  Third   

     who adult_male deck  alone  
0    man       True  NaN  False  
1  woman      False    C  False  
2  woman      False  NaN   True  


In [24]:
df.isFemale.unique()

array([0, 1])

In [25]:
del indicator_features[0]
print(indicator_features)

['adult_male', 'alone']


In [26]:
df.adult_male.unique()

array([True, False], dtype=object)

In [27]:
df.adult_male.head()

0     True
1    False
2    False
3    False
4     True
Name: adult_male, dtype: bool

In [28]:
# Convert the boolean 'adult_male' indicator to 0/1 integers.
df['adult_male'] = df['adult_male'].astype(int)

In [29]:
df.adult_male.head()

0    1
1    0
2    0
3    0
4    1
Name: adult_male, dtype: int64

In [30]:
del indicator_features[0]
print(indicator_features)

['alone']


In [31]:
df.alone.unique()

array([False, True], dtype=object)

In [32]:
df.alone.head()

0    False
1    False
2     True
3    False
4     True
Name: alone, dtype: bool

In [33]:
# Convert the boolean 'alone' indicator to 0/1 integers.
df['alone'] = df['alone'].astype(int)

In [34]:
df.alone.head()

0    0
1    0
2    1
3    0
4    1
Name: alone, dtype: int64

In [35]:
del indicator_features[0]
print(indicator_features)

[]


### Typos and Capitalizations

In [46]:
df.head()

Unnamed: 0,survived,pclass,isFemale,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,alone
0,0,3,0,22.0,1,0,7.25,S,Third,man,1,,0
1,1,1,1,38.0,1,0,71.2833,C,First,woman,0,C,0
2,1,3,1,26.0,0,0,7.925,S,Third,woman,0,,1
3,1,1,1,35.0,1,0,53.1,S,First,woman,0,C,0
4,0,3,0,35.0,0,0,8.05,S,Third,man,1,,1


In [47]:
# Worklist of the object-dtype (string) columns to inspect for typos and
# inconsistent capitalisation.
categorical_features = df.select_dtypes(include=[object]).columns.tolist()
print(categorical_features)

['embarked', 'class', 'who', 'deck']


In [48]:
df.embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [49]:
del categorical_features[0]
print(categorical_features)

['class', 'who', 'deck']


In [50]:
df['class'].unique()

array(['Third', 'First', 'Second'], dtype=object)

In [51]:
df['class'].tail()

886    Second
887     First
888     Third
889     First
890     Third
Name: class, dtype: object

In [38]:
# Encode the ordinal class labels as integers and assign the result back to
# the column. Do NOT combine assignment with inplace=True: an in-place
# replace returns None, so `df['class'] = df['class'].replace(..., inplace=True)`
# would overwrite the whole column with None.
# astype(int) guarantees an integer column and raises if an unexpected
# label is left unmapped.
class_rank = {'Third': 3, 'First': 1, 'Second': 2}
df['class'] = df['class'].replace(class_rank).astype(int)

In [39]:
df['class'].tail()

886    2
887    1
888    3
889    1
890    3
Name: class, dtype: int64

In [54]:
del categorical_features[0]
print(categorical_features)

['who', 'deck']


In [55]:
df.who.unique()

array(['man', 'woman', 'child'], dtype=object)

In [56]:
del categorical_features[0]
print(categorical_features)

['deck']


In [57]:
df.deck.unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F'], dtype=object)

In [58]:
del categorical_features[0]
print(categorical_features)

[]


### Mislabeled Classes

Not applicable in this case: there are only a few categorical features, and each has only a few distinct classes. 

_________________

## Unwanted Outliers

Since this dataset comes from Kaggle, is a well used (beginner) dataset, and no huge anomalies were discovered during the Data Exploration phase of the project, I will be removing no outliers from the dataset.

_________________

## Missing Data

### Missing Categorical

In [41]:
# Rebuild the list of object-dtype columns; these are the categorical
# features checked for missing values below.
categorical_features = df.select_dtypes(include=[object]).columns.tolist()
print(categorical_features)

['embarked', 'who', 'deck']


In [42]:
df[categorical_features].isnull().sum()

embarked      2
who           0
deck        688
dtype: int64

In [44]:
# Keep only the categorical columns that contain at least one null value.
cat_missing = [name for name in categorical_features if df[name].isnull().any()]
print(cat_missing)

['embarked', 'deck']


In [45]:
df.embarked.value_counts(normalize=True)

S    0.724409
C    0.188976
Q    0.086614
Name: embarked, dtype: float64

With the understanding that this change means no machine learning algorithm can attain 100% accuracy with any certainty, I will go ahead and manually update the 2 missing 'embarked' values to 'S', since that class makes up 72.44% of the values.

In [46]:
# Fill the missing 'embarked' values with the most frequent class, 'S'.
# The result is assigned back to the column: fillna(..., inplace=True) on the
# attribute-accessed Series is chained assignment and is not guaranteed to
# update df.
df['embarked'] = df['embarked'].fillna('S')

In [47]:
df[cat_missing].isnull().sum()

embarked      0
deck        688
dtype: int64

In [48]:
del cat_missing[0]
print(cat_missing)

['deck']


In [52]:
df.deck.value_counts()

Missing    688
C           59
B           47
D           33
E           32
A           15
F           13
G            4
Name: deck, dtype: int64

In [54]:
# Report how much of 'deck' is null. missing_deck is the null fraction and
# not_missing is its complement; each message is labelled to match the value
# it prints.
# NOTE: this must run before the nulls in 'deck' are filled with 'Missing',
# otherwise the null count is 0 (which is what the saved output reflects).
deck_null_count = df.deck.isnull().sum()
missing_deck = deck_null_count / len(df.deck)
not_missing = 1 - missing_deck
print('{} null objects.\n'.format(deck_null_count))
print('That means {}% of values in deck are missing'.format(missing_deck*100))
print('{}% of the values are not null'.format(not_missing*100))

0 non-null objects.

That means 0.0% of values in deck are not null
100.0% of the values are missing


In [51]:
# Label the missing 'deck' values with an explicit 'Missing' class.
# The result is assigned back to the column: fillna(..., inplace=True) on the
# attribute-accessed Series is chained assignment and is not guaranteed to
# update df.
df['deck'] = df['deck'].fillna('Missing')

In [55]:
df[cat_missing].isnull().sum()

deck    0
dtype: int64

In [56]:
del cat_missing[0]
print(cat_missing)

[]


### Missing Numeric

In [80]:
df.select_dtypes(include=[np.number]).isnull().sum()

survived        0
pclass          0
isFemale        0
age           177
sibsp           0
parch           0
fare            0
class           0
adult_male      0
alone           0
dtype: int64

In [79]:
df.select_dtypes(exclude=[object]).isnull().sum()

survived        0
pclass          0
isFemale        0
age           177
sibsp           0
parch           0
fare            0
class           0
adult_male      0
alone           0
dtype: int64

**Only 'age' feature has missing values.**

In [61]:
# Report how much of 'age' is null. missing_age is the null fraction and
# not_missing is its complement; each message is labelled to match the value
# it prints and names the 'age' column.
age_null_count = df.age.isnull().sum()
missing_age = age_null_count / len(df.age)
not_missing = 1 - missing_age
print('{} null objects.\n'.format(age_null_count))
print('That means {}% of values in age are missing'.format(missing_age*100))
print('{}% of the values are not null'.format(not_missing*100))

177 non-null objects.

That means 19.865319865319865% of values in deck are not null
80.13468013468014% of the values are missing


In [69]:
df[(df.age.isnull())][['age','who']].drop_duplicates()

Unnamed: 0,age,who
5,,man
19,,woman


In [78]:
# Rounded mean age per 'who' group (nulls are skipped by mean()).
# The groups are looked up by label: the grouped result is indexed by the
# 'who' values, so positional access with [0] relies on a deprecated fallback.
mean_age_by_who = df.groupby('who')['age'].mean().round()
m_age_m = mean_age_by_who['man']
m_age_w = mean_age_by_who['woman']
print(m_age_m)
print(m_age_w)

33.0
32.0


To me, there seem to be 4 options here: 
1. Drop the 'age' feature entirely. 
2. Drop the 177 observations that have a null value for the 'age' feature
3. Impute the average age for 'man' and 'woman' (only 'who' features that have missing values) and then replace the nulls with those average values.  
4. **Flag and fill the missing value. <-------- CHOSEN OPTION**

In [81]:
# Add a 0/1 indicator marking the rows whose 'age' is null. This has to run
# before the nulls in 'age' are filled, or every flag would be 0.
df['age_missing'] = df['age'].isna().astype(int)
print(df.tail(3))

In [84]:
# Fill the missing ages with 0; the 'age_missing' indicator column records
# which rows were filled. The result is assigned back to the column:
# fillna(..., inplace=True) on the attribute-accessed Series is chained
# assignment and is not guaranteed to update df.
df['age'] = df['age'].fillna(0)
print(df.tail(3))

     survived  pclass  isFemale   age  sibsp  parch   fare embarked  class  \
888         0       3         1   0.0      1      2  23.45        S      3   
889         1       1         0  26.0      0      0  30.00        C      1   
890         0       3         0  32.0      0      0   7.75        Q      3   

       who  adult_male     deck  alone  age_missing  
888  woman           0  Missing      0            1  
889    man           1        C      1            0  
890    man           1  Missing      1            0  


In [85]:
# Write the cleaned table to disk without the row index.
df.to_csv('titanic_cleaned.csv', index=False)