## Cleaning

In [1]:
import pandas as pd

In [2]:
# Load the raw train and test CSVs into separate frames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/train.csv', '../data/raw/test.csv')
)

In [3]:
# Drop Cabin from both splits; PassengerId is dropped from the training split
# only (df_test keeps it).
df_train.drop(columns=['PassengerId', 'Cabin'], inplace=True)
df_test.drop(columns=['Cabin'], inplace=True)

In [4]:
# Number of distinct (non-null) ticket values in the training split.
df_train['Ticket'].nunique()

681

In [5]:
# Remove the Ticket column from both splits.
df_train.drop(columns=['Ticket'], inplace=True)
df_test.drop(columns=['Ticket'], inplace=True)

In [6]:
# Reduce each full name to its title: the run of letters that follows a space
# and ends with a period (e.g. "Mr", "Mrs"). The raw string keeps `\.` a regex
# escape instead of an invalid Python string escape, and the leading space
# matches the pattern applied to df_test so both splits are parsed alike.
df_train['Name'] = df_train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
df_train['Name'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: Name, dtype: int64

In [7]:
# Keep the four most frequent titles as-is and relabel everything else 'Other'.
df_train['Name'] = df_train['Name'].where(
    df_train['Name'].isin(['Mr', 'Miss', 'Mrs', 'Master']),
    'Other',
)
df_train['Name'].value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
Other      27
Name: Name, dtype: int64

In [8]:
# Apply the title extraction and grouping to the test split: capture the run of
# letters between a space and a period, then keep only the four listed titles
# and relabel the rest 'Other'. The raw string keeps `\.` a regex escape
# instead of an invalid Python string escape.
df_test['Name'] = df_test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
df_test['Name'] = df_test['Name'].map(lambda x: x if x in ['Mr', 'Miss', 'Mrs', 'Master'] else 'Other')

In [9]:
# Preview the first five training rows after the title grouping.
df_train.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,Mr,male,22.0,1,0,7.25,S
1,1,1,Mrs,female,38.0,1,0,71.2833,C
2,1,3,Miss,female,26.0,0,0,7.925,S
3,1,1,Mrs,female,35.0,1,0,53.1,S
4,0,3,Mr,male,35.0,0,0,8.05,S


In [10]:
# Missing-value count per column in the training split.
df_train.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [11]:
# Mean age for every (Sex, Name, Pclass) combination in the training split,
# cast to int (fractional years are truncated).
df_ages = df_train.groupby(['Sex', 'Name', 'Pclass'], as_index=False)[['Age']].mean()
df_ages['Age'] = df_ages['Age'].astype(int)
df_ages

Unnamed: 0,Sex,Name,Pclass,Age
0,female,Miss,1,30
1,female,Miss,2,22
2,female,Miss,3,16
3,female,Mrs,1,40
4,female,Mrs,2,33
5,female,Mrs,3,33
6,female,Other,1,33
7,female,Other,2,28
8,male,Master,1,5
9,male,Master,2,2


In [12]:
def hash_lb(row):
    """Return the hash of ``row``'s values as an unsigned 64-bit integer.

    The values are packed into a tuple, hashed with the built-in ``hash`` and
    masked to the low 64 bits, so the result is never negative.

    Note: ``str`` hashes are salted per interpreter process unless
    PYTHONHASHSEED is fixed, so ids built from strings are only comparable
    within a single kernel session.
    """
    mask_64 = (1 << 64) - 1
    return hash(tuple(row)) & mask_64

In [13]:
# Attach a hashed (Sex, Name, Pclass) key to each row of the group-age table.
df_ages = df_ages.assign(
    AgeId=df_ages[['Sex', 'Name', 'Pclass']].apply(hash_lb, axis=1)
)
df_ages.head()

Unnamed: 0,Sex,Name,Pclass,Age,AgeId
0,female,Miss,1,30,16959824029630599272
1,female,Miss,2,22,12066750000657902906
2,female,Miss,3,16,127646683050349715
3,female,Mrs,1,40,8542665414658306882
4,female,Mrs,2,33,17697978076410794004


In [14]:
# Impute missing training ages: hash each row's (Sex, Name, Pclass) into the
# same key used in df_ages, look up the group age, and fill only the NaNs.
df_train['AgeId'] = df_train[['Sex', 'Name', 'Pclass']].apply(hash_lb, axis=1)
age_by_id = df_ages.set_index('AgeId')['Age']
df_train['Age'] = df_train['Age'].fillna(df_train['AgeId'].map(age_by_id))

# The helper key is only needed for the lookup.
del df_train['AgeId']

In [15]:
# Impute missing test ages from the training-derived group ages in df_ages,
# matched through the hashed (Sex, Name, Pclass) key. Rows whose key is absent
# from df_ages stay NaN.
df_test['AgeId'] = df_test[['Sex', 'Name', 'Pclass']].apply(hash_lb, axis=1)
age_by_id = df_ages.set_index('AgeId')['Age']
df_test['Age'] = df_test['Age'].fillna(df_test['AgeId'].map(age_by_id))

# Drop the helper key and the lookup table.
del df_test['AgeId']
del df_ages

In [16]:
# Re-check missing values in the training split after the age imputation.
df_train.isnull().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [17]:
# Missing-value count per column in the test split after the age imputation.
df_test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            1
SibSp          0
Parch          0
Fare           1
Embarked       0
dtype: int64

In [18]:
# Drop every training row that still contains a missing value.
df_train.dropna(axis='index', how='any', inplace=True)

In [19]:
# Fill the remaining gaps in the test split with each column's own median.
# Age uses the Age median (not the Fare median) so the imputed value is on the
# age scale.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].median())

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
# Encode the categorical columns as integers. Each encoder is fitted on the
# training split and reused for the test split so a given label gets the same
# code in both; fitting a separate encoder per split would shift the codes
# whenever the two splits contain different label sets. transform() raises
# ValueError if df_test holds a label the training split never had.
for col in ['Name', 'Sex', 'Embarked']:
    encoder = LabelEncoder().fit(df_train[col])
    df_train[col] = encoder.transform(df_train[col])
    df_test[col] = encoder.transform(df_test[col])

In [22]:
# Preview the encoded training split.
df_train.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,2,1,22.0,1,0,7.25,2
1,1,1,3,0,38.0,1,0,71.2833,0
2,1,3,1,0,26.0,0,0,7.925,2
3,1,1,3,0,35.0,1,0,53.1,2
4,0,3,2,1,35.0,0,0,8.05,2


In [23]:
# Preview the encoded test split.
df_test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,2,1,34.5,0,0,7.8292,1
1,893,3,3,0,47.0,1,0,7.0,2
2,894,2,2,1,62.0,0,0,9.6875,1
3,895,3,2,1,27.0,0,0,8.6625,2
4,896,3,3,0,22.0,1,1,12.2875,2


In [24]:
# Persist both cleaned splits as CSV, without writing the row index.
df_train.to_csv(path_or_buf='../data/interim/train.csv', index=False)
df_test.to_csv(path_or_buf='../data/interim/test.csv', index=False)