# Feature engineering

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the semicolon-separated training split and discard the columns that
# are not used in the rest of this notebook, then preview the first rows.
df = pd.read_csv("../data/train.csv", sep=";").drop(
    columns=["PassengerId", "Name", "Ticket", "Fare", "Cabin"]
)
df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,24.0,0,0,S
1,0,3,male,29.0,0,0,S
2,0,3,male,20.0,0,0,S
3,0,1,male,46.0,1,0,S
4,0,3,male,26.0,1,2,S
5,0,3,male,59.0,0,0,S
6,0,3,male,,0,0,S
7,0,1,male,71.0,0,0,C
8,1,1,male,23.0,0,1,C
9,1,2,female,34.0,0,1,S


In [3]:
# Read the semicolon-separated validation split and discard the same
# columns that were removed from the training frame.
df_val = pd.read_csv("../data/val.csv", sep=";").drop(
    columns=["PassengerId", "Name", "Ticket", "Fare", "Cabin"]
)

## Handling missing data

In [4]:
# Number of missing entries in each column of the training frame.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         156
SibSp         0
Parch         0
Embarked      1
dtype: int64

In [5]:
# Number of missing entries in each column of the validation frame.
df_val.isna().sum()

Survived     0
Pclass       0
Sex          0
Age         21
SibSp        0
Parch        0
Embarked     1
dtype: int64

Both the training set and the validation set have missing values in the same columns. For the *Embarked* column we can fill in the most frequent value, and for the *Age* column we can impute the median age.

In [6]:
# Impute the training frame in one pass: Age gets the column median and
# Embarked gets its most frequent value. Columns not listed are left as-is.
df = df.fillna(
    {
        "Age": df["Age"].median(),
        "Embarked": df["Embarked"].value_counts().idxmax(),
    }
)

## Converting features

In [7]:
# Replace the Sex strings with integer codes (male -> 0, female -> 1).
# Any other value maps to NaN, which makes the integer cast raise.
df = df.assign(
    Sex=lambda frame: frame["Sex"].map({"male": 0, "female": 1}).astype(int)
)

In [8]:
# Discretise Age into three quantile-based bins and name them in
# ascending order of age.
bin_labels = ["Young", "Medium", "Old"]
df["Age"] = df["Age"].pipe(pd.qcut, q=3, labels=bin_labels)

## One-hot encoding

In [9]:
# One-hot encode the categorical features. The first level of each feature is
# dropped, so it is represented by all of that feature's indicators being 0.
# dtype=int pins the indicators to 0/1 integers; without it, pandas >= 2.0
# returns booleans and the table would show True/False instead of 0/1.
age = pd.get_dummies(df["Age"], drop_first=True, dtype=int)
embarked = pd.get_dummies(df["Embarked"], drop_first=True, dtype=int)
pclass = pd.get_dummies(df["Pclass"], drop_first=True, dtype=int)

# Append the indicator columns (Pclass, then Age, then Embarked) and remove
# the original categorical columns they replace.
# NOTE(review): the Pclass indicators are labelled with the integers 2 and 3,
# so the frame ends up with mixed int/str column labels. Consider
# prefix="Pclass" if nothing downstream relies on the current names.
df = pd.concat([df, pclass, age, embarked], axis=1)
df = df.drop(columns=["Pclass", "Age", "Embarked"])

## Adding new features

In [10]:
# Flag rows whose family size (SibSp + Parch + 1) is exactly 1.
# The flag is stored as a 0/1 integer column appended at the end.
df["IsAlone"] = (df["SibSp"] + df["Parch"] + 1).eq(1).astype("int64")

# The raw counts are no longer needed once the flag has been derived.
df = df.drop(columns=["SibSp", "Parch"])

In [11]:
# Preview the first ten rows of the engineered feature table.
df.head(n=10)

Unnamed: 0,Survived,Sex,2,3,Medium,Old,Q,S,IsAlone
0,0,0,0,1,0,0,0,1,1
1,0,0,0,1,1,0,0,1,1
2,0,0,0,1,0,0,0,1,1
3,0,0,0,0,0,1,0,1,0
4,0,0,0,1,1,0,0,1,0
5,0,0,0,1,0,1,0,1,1
6,0,0,0,1,1,0,0,1,1
7,0,0,0,0,0,1,0,0,1
8,1,0,0,0,0,0,0,0,0
9,1,1,1,0,0,1,0,1,0
