## Initial checking

In [1]:
# imports
import matplotlib.pyplot as plt
import pandas as pd

# sklearn
from sklearn import ensemble, preprocessing, tree
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, StratifiedKFold
from yellowbrick.classifier import ConfusionMatrix, ROCAUC
from yellowbrick.model_selection import LearningCurve

In [2]:
# Load the raw Titanic passenger data from the local Excel workbook.
df = pd.read_excel('/Users/aline/Titanic/titanic3.xls')
# Keep an independent snapshot of the untouched data. A plain `orig_df = df`
# would only alias the same object, so any in-place edit of df would change
# orig_df as well.
orig_df = df.copy()

In [3]:
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [None]:
# The module name is never referenced below; the import is kept for its side
# effect. NOTE(review): presumably it is what makes `profile_report` available
# as a DataFrame method — confirm against the installed pandas_profiling version.
import pandas_profiling
# Build a profiling report for the whole frame, with a progress bar while it runs.
profile = df.profile_report(title='Titanic Profiling Report', progress_bar=True)
# Render the report as notebook widgets.
profile.to_widgets()
# Last expression of the cell: display the report object itself.
profile

In [None]:
# Write the profiling report to an HTML file on disk
profile.to_file("/Users/aline/Titanic/profile.html")

In [4]:
df.shape

(1309, 14)

In [None]:
# Summary statistics, limited to the first two columns of the describe() table
df.describe().iloc[:, :2]

In [5]:
# Any missing values? Count the nulls in each column
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [6]:
# Count the missing values in each row (axis=1). `.loc[:10]` slices by label
# and includes the end label, so this shows rows 0 through 10.
df.isnull().sum(axis=1).loc[:10]

0     1
1     1
2     2
3     1
4     2
5     1
6     1
7     2
8     1
9     2
10    1
dtype: int64

Let’s look at some of the rows with missing data

In [7]:
# Boolean mask with one entry per row: True when the row has at least one missing value
mask = df.isnull().any(axis=1)

In [8]:
# Preview the first five entries of the row mask
mask.head()  # one boolean per row

0    True
1    True
2    True
3    True
4    True
dtype: bool

In [9]:
# `body` values for the first few rows that contain any missing data
df[mask].body.head()

0      NaN
1      NaN
2      NaN
3    135.0
4      NaN
Name: body, dtype: float64

In [10]:
# Frequency of each value in `sex`; dropna=False so NaN would be counted too
df.sex.value_counts(dropna=False)

male      843
female    466
Name: sex, dtype: int64

In [11]:
# Frequency of each value in `embarked`; dropna=False keeps NaN as its own row
df.embarked.value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

## Create Features

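Let's drop the columns that leak information about the outcome (such as `boat` and `body`), together with the free-text columns we won't use as features.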

In [12]:
# Keep a handle on the raw `name` column and peek at its first three values
name = df.name
name.head(3)

0     Allen, Miss. Elisabeth Walton
1    Allison, Master. Hudson Trevor
2      Allison, Miss. Helen Loraine
Name: name, dtype: object

In [13]:
# Remove the columns that will not be used as model features.
df = df.drop(
    labels=["name", "ticket", "home.dest", "boat", "body", "cabin"],
    axis="columns",
)

We need to create dummy columns from the string columns. This will create new columns for `sex` and `embarked`. Pandas has a convenient `get_dummies` function for that:

In [14]:
# One-hot encode the remaining string columns (sex, embarked)
df = pd.get_dummies(df)

In [15]:
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

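We can add a `drop_first=True` parameter to the `get_dummies` call, which drops the first level of each encoded column so that the remaining dummy columns are not perfectly correlated: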

In [16]:
# NOTE(review): df was already one-hot encoded by the earlier get_dummies call,
# so no string columns are left for drop_first to act on — the column list
# shown after this cell still contains both sex_female and sex_male. To see the
# effect, this call would have to run on the frame as it was before encoding.
df = pd.get_dummies(df, drop_first=True)

In [17]:
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

Create a DataFrame (X) with the features and a series (y) with the labels. We could also use numpy arrays, but then we don’t have column names:

In [18]:
# Separate the target labels from the predictor columns.
y = df["survived"]
X = df.drop("survived", axis="columns")

## Sample Data

In [19]:
# train/test split
# hold out 30% of the rows for testing
# why the random state is 42 is a good question for interviews ;)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Impute Data

In [20]:
# I've never tested the IterativeImputer, let's check it.
# IterativeImputer is still experimental in scikit-learn: the
# enable_iterative_imputer import (unused by name) is what makes
# impute.IterativeImputer available, so it must stay.
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute

In [21]:
# Columns handed to the imputer: the numeric features plus the sex_female dummy
num_cols = [
    "pclass", 
    "age", 
    "sibsp", 
    "parch", 
    "fare", 
    "sex_female",
]

In [22]:
# Work on explicit copies: the frames returned by train_test_split can be
# flagged as slices of X, and assigning into them may then raise
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the imputer on the training rows only and reuse the fitted model for the
# test rows, so no test-set information influences the imputation.
imputer = impute.IterativeImputer()
imputed = imputer.fit_transform(X_train[num_cols])
X_train.loc[:, num_cols] = imputed
imputed = imputer.transform(X_test[num_cols])
X_test.loc[:, num_cols] = imputed

If we wanted to impute with the median instead, we could use pandas to do that:

In [23]:
# Per-column medians, computed on the training set only
meds = X_train.median()
# Fill any remaining NaNs in both splits with those training medians
X_train = X_train.fillna(meds)
X_test = X_test.fillna(meds)

## Normalize Data

Standardize the features as a preprocessing step. The scaler is fit on the training set only and then applied to the test set.

In [26]:
scaler = StandardScaler()

# Fit on the training set only, then apply the same transform to the test set.
# fit_transform/transform return bare NumPy arrays, so pass the original column
# names and row index back in; otherwise the columns are renamed 0..n-1 and the
# rows no longer line up with y_train / y_test.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)