# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import sys
sys.path.append("../..")
from data_processing import fill_demean_scale

# Read in `train` and `test`

In [3]:
train_df = pd.read_csv("../../store/train.csv")
test_df = pd.read_csv("../../store/test.csv")

# `train`

## `Sex`

In [4]:
sex_mapping = {'male':1, 'female':0}
train_df.Sex = train_df.Sex.map(sex_mapping)

## `Age`, `Fare`, `Parch`, and `SipSp`

In [5]:
for column in ['Age', 'Fare', 'Parch', 'SibSp']:
    train_df[column] = fill_demean_scale(df=train_df, column=column)

## `Embarked`

### Drop `null` values

In [6]:
train_df = train_df[~train_df.Embarked.isnull()]

### Split into `Q`, `S`, and `C` binary variables

In [14]:
train_df = train_df.join(pd.get_dummies(train_df.Embarked, dummy_na=False))

## `Pclass`

In [19]:
train_df = train_df.join(
    pd.get_dummies(train_df.Pclass, prefix='Pclass', dummy_na=False)
)

In [22]:
train_df = train_df.set_index('PassengerId')

## `len_name`

In [25]:
train_df['len_name'] = train_df.Name.apply(lambda x: len(x))

In [30]:
train_df.len_name = fill_demean_scale(df=train_df, column='len_name')

In [34]:
train_df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      0
C             0
Q             0
S             0
Pclass_1      0
Pclass_2      0
Pclass_3      0
len_name      0
dtype: int64

## Drop `Cabin`

In [35]:
train_df = train_df.drop(columns='Cabin')

In [36]:
train_df.isnull().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
C           0
Q           0
S           0
Pclass_1    0
Pclass_2    0
Pclass_3    0
len_name    0
dtype: int64

# `test`

## `Sex`

In [37]:
sex_mapping = {'male':1, 'female':0}
test_df.Sex = test_df.Sex.map(sex_mapping)

## `Age`, `Fare`, `Parch`, and `SipSp`

In [38]:
for column in ['Age', 'Fare', 'Parch', 'SibSp']:
    test_df[column] = fill_demean_scale(df=test_df, column=column)

## `Embarked`

### Drop `null` values

In [40]:
test_df = test_df[~test_df.Embarked.isnull()]

### Split into `Q`, `S`, and `C`

In [45]:
test_df = test_df.join(pd.get_dummies(test_df.Embarked, dummy_na=False))

In [46]:
test_df.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pclass_1,Pclass_2,Pclass_3,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
892,3,"Kelly, Mr. James",1,0.055749,-0.055921,-0.043594,330911,-0.054258,,Q,0,0,1,0,1,0
893,3,"Wilkes, Mrs. James (Ellen Needs)",0,0.220591,0.069079,-0.043594,363272,-0.055877,,S,0,0,1,0,0,1
894,2,"Myles, Mr. Thomas Francis",1,0.418402,-0.055921,-0.043594,240276,-0.050631,,Q,0,1,0,0,1,0
895,3,"Wirz, Mr. Albert",1,-0.043157,-0.055921,-0.043594,315154,-0.052632,,S,0,0,1,0,0,1
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,-0.109094,0.069079,0.067517,3101298,-0.045556,,S,0,0,1,0,0,1


## `Pclass`

In [44]:
test_df = test_df.join(
    pd.get_dummies(test_df.Pclass, prefix='Pclass', dummy_na=False)
)

test_df = test_df.set_index("PassengerId")

ValueError: columns overlap but no suffix specified: Index(['Pclass_1', 'Pclass_2', 'Pclass_3'], dtype='object')

## `len_name`

In [47]:
test_df['len_name'] = test_df.Name.apply(lambda x: len(x))

In [48]:
test_df.len_name = fill_demean_scale(df=test_df, column='len_name')

## Drop `Cabin`

In [49]:
test_df = test_df.drop(columns='Cabin')

# Store

In [51]:
train_df.to_pickle("../../store/Run3_train_df.pkl")
test_df.to_pickle("../../store/Run3_test_df.pkl")