### Initialization

In [4]:
#Initial setup
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model

import seaborn as sns
import matplotlib.pyplot as plt

#Fix the seed of NumPy's global random generator so repeated runs draw the same numbers
np.random.seed(seed=0)

#Load the Titanic training data from train.csv (path is relative to the working directory)
titanic_df = pd.read_csv(filepath_or_buffer="train.csv")

### Data cleaning

In [7]:
#Check the unique values for variables that should have a small number of possible values
print("Unique values for 'Sex':", pd.unique(titanic_df["Sex"]))
print("Unique values for 'Pclass':", sorted(pd.unique(titanic_df["Pclass"])))
print("Unique values for 'SibSp':", sorted(pd.unique(titanic_df["SibSp"])))
print("Unique values for 'Parch':", sorted(pd.unique(titanic_df["Parch"])))
print("Unique values for 'Embarked':", pd.unique(titanic_df["Embarked"]))

#NOTE: this cell modifies titanic_df. The percentiles and null counts printed below describe
#the raw data only on the first run after loading; re-running the cell without reloading
#reports the already-imputed columns (0 nulls).
for column in ["Age", "Fare"]:
    #Check min, max, 10th, 25th, 50th, 75th, and 90th percentiles for the field
    print("%s percentiles\n" % column, "\n".join(["\t%dth pctl: %.5f" % (n, titanic_df[column].quantile(n/100)) for n in [0, 10, 25, 50, 75, 90, 100]]))
    print("\tnull values:", titanic_df[column].isna().sum())
    #Impute missing values with the column mean.
    #The result is assigned back to the column rather than calling fillna(inplace=True) on
    #titanic_df[column]: that form is a chained in-place update, which raises a FutureWarning
    #in pandas 2.x and leaves titanic_df unchanged once Copy-on-Write is enabled.
    titanic_df[column] = titanic_df[column].fillna(titanic_df[column].mean())

Unique values for 'Sex': ['male' 'female']
Unique values for 'Pclass': [1, 2, 3]
Unique values for 'SibSp': [0, 1, 2, 3, 4, 5, 8]
Unique values for 'Parch': [0, 1, 2, 3, 4, 5, 6]
Unique values for 'Embarked': ['S' 'C' 'Q' nan]
Age percentiles
 	0th pctl: 0.42000
	10th pctl: 16.00000
	25th pctl: 22.00000
	50th pctl: 29.69912
	75th pctl: 35.00000
	90th pctl: 47.00000
	100th pctl: 80.00000
	null values: 0
Fare percentiles
 	0th pctl: 0.00000
	10th pctl: 7.55000
	25th pctl: 7.91040
	50th pctl: 14.45420
	75th pctl: 31.00000
	90th pctl: 77.95830
	100th pctl: 512.32920
	null values: 0


Data cleaning takeaways:
* There are no unusual values in the Sex, Pclass, SibSp, or Parch columns.
* The Embarked column has some missing values, but no other unusual values.
* There do not appear to be any unusual values in the Age column, although there are 177 missing values. The missing ages were imputed with the mean age.
* There do not appear to be any unusual values in the Fare column. A fare of 0.0 could be interpreted as a ticket provided for free. Although the maximum value is significantly higher than the 90th percentile value, there are a number of other high-cost tickets.

In [None]:
#Sign off: write the farewell message followed by a newline
print("Goodbye!", end="\n")