# Load Data

In [58]:
import numpy as np
import pandas as pd

# Read the raw passenger records from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='titanic-data.csv')
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Wrangling

## Search for Missing Values

In [38]:
# Summarise the boolean missing-value mask per column: a column whose
# `unique` count is 2 contains both missing and present entries.
df.isna().describe()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891,891,891,891,891,891,891,891,891,891,891,891
unique,1,1,1,1,1,2,1,1,1,1,2,2
top,False,False,False,False,False,False,False,False,False,False,True,False
freq,891,891,891,891,891,714,891,891,891,891,687,889


## Clean Data

In [59]:
# Missing ages are replaced by the median age; missing Cabin and Embarked
# entries are replaced by the placeholder label 'Unknown'.
age_median = df['Age'].median()
fill_values = {'Age': age_median, 'Cabin': 'Unknown', 'Embarked': 'Unknown'}

# Fill each affected column with its designated replacement value.
for column, filler in fill_values.items():
    df[column] = df[column].fillna(filler)

In [40]:
# Validation: re-run the missing-value summary; after filling, every column
# should report a single unique mask value (False).
df.isna().describe()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891,891,891,891,891,891,891,891,891,891,891,891
unique,1,1,1,1,1,1,1,1,1,1,1,1
top,False,False,False,False,False,False,False,False,False,False,False,False
freq,891,891,891,891,891,891,891,891,891,891,891,891


In [60]:
# Age: cast to whole numbers. Note that astype(int) truncates any fractional
# part rather than rounding.
df['Age'] = df['Age'].astype(int)

# Sex: capitalize the labels (e.g. 'male' -> 'Male').
df['Sex'] = df['Sex'].str.capitalize()

# Embarked: expand the single-letter port codes to full port names. Values
# that are not keys of the mapping (such as the 'Unknown' placeholder) are
# left unchanged by replace().
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df["Embarked"]: the in-place form operates on
# an intermediate Series, which triggers a FutureWarning on recent pandas and
# leaves df unmodified when Copy-on-Write is enabled.
embarked_dict = {"C": "Cherbourg", "Q": "Queenstown", "S": "Southampton"}
df["Embarked"] = df["Embarked"].replace(embarked_dict)

# Pclass: replace the numeric class codes with descriptive labels (assigned
# back for the same reason as Embarked above).
pclass_dict = {1: "Upper class", 2: "Middle class", 3: "Lower"}
df["Pclass"] = df["Pclass"].replace(pclass_dict)

# Survived (Convert data type): the comparison already yields a boolean
# Series, so no further cast is needed.
df['Survived'] = df['Survived'] == 1


In [61]:
# Spot-check the converted columns on the first five rows.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,False,Lower,"Braund, Mr. Owen Harris",Male,22,1,0,A/5 21171,7.25,Unknown,Southampton
1,2,True,Upper class,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Female,38,1,0,PC 17599,71.2833,C85,Cherbourg
2,3,True,Lower,"Heikkinen, Miss. Laina",Female,26,0,0,STON/O2. 3101282,7.925,Unknown,Southampton
3,4,True,Upper class,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Female,35,1,0,113803,53.1,C123,Southampton
4,5,False,Lower,"Allen, Mr. William Henry",Male,35,0,0,373450,8.05,Unknown,Southampton


## Save Cleaned Data

In [62]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='titanic-data-cleaned.csv', index=False)

In [63]:
# Round-trip check: read the saved file back and inspect the first ten rows.
df_cleaned = pd.read_csv(filepath_or_buffer='titanic-data-cleaned.csv')
df_cleaned.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,False,Lower,"Braund, Mr. Owen Harris",Male,22,1,0,A/5 21171,7.25,Unknown,Southampton
1,2,True,Upper class,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Female,38,1,0,PC 17599,71.2833,C85,Cherbourg
2,3,True,Lower,"Heikkinen, Miss. Laina",Female,26,0,0,STON/O2. 3101282,7.925,Unknown,Southampton
3,4,True,Upper class,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Female,35,1,0,113803,53.1,C123,Southampton
4,5,False,Lower,"Allen, Mr. William Henry",Male,35,0,0,373450,8.05,Unknown,Southampton
5,6,False,Lower,"Moran, Mr. James",Male,28,0,0,330877,8.4583,Unknown,Queenstown
6,7,False,Upper class,"McCarthy, Mr. Timothy J",Male,54,0,0,17463,51.8625,E46,Southampton
7,8,False,Lower,"Palsson, Master. Gosta Leonard",Male,2,3,1,349909,21.075,Unknown,Southampton
8,9,True,Lower,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",Female,27,0,2,347742,11.1333,Unknown,Southampton
9,10,True,Middle class,"Nasser, Mrs. Nicholas (Adele Achem)",Female,14,1,0,237736,30.0708,Unknown,Cherbourg
