In [649]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [650]:
train = pd.read_csv('train.csv')
Y = train['Survived']
train.drop('Survived',axis=1,inplace = True)
test = pd.read_csv('test.csv')

In [651]:
def fill_nan(df):    
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].isna().astype(int)
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Embarked'] = df['Embarked'].replace({'S':1,'C':2,'Q':3}).fillna('S')   
    return df


In [652]:
def get_title(df): 
    df['title'] = df['Name'].str.split(',',expand = True)[1].str.split('.',expand = True)[0]
    mr_values =['Don','Rev','Dr','Major','Col','Capt','Jonkheer','Sir']
    mrs_replace = ['Mme','Dona','the Countess','Mra','Lady']
    miss_replace = ['Ms','Mlle']
    df['title'] = df['title'].replace(mr_values,'Mr',regex = True)
    df['title'] = df['title'].replace(mrs_replace,'Mrs',regex = True)
    df['title'] = df['title'].replace(miss_replace,'Miss',regex = True)
    return df


In [653]:
def ticket(df):
    df['Ticket'] = df['Ticket'].str.split(' ',expand = True)[0]
    df['Ticket'] = df['Ticket'].astype('category').cat.codes
    return df

In [654]:
def family_members(df):
    df['Family Size'] = df['Parch'] + df['SibSp']
    df['withFamily'] = df['Family Size'].where(df['Family Size'] < 1,1)
    return df

In [655]:
def getDummies(df):
    dummies_sex = pd.get_dummies(df['Sex'])
    dummies_embarked = pd.get_dummies(df['Embarked'])
    dummies_title = pd.get_dummies(df['title'])
    dummies_Pclass = pd.get_dummies(df['Pclass'])
    df = pd.concat([dummies_embarked,dummies_sex, dummies_title,dummies_Pclass],axis = 1)
    return df

In [656]:
def preProcess(df):
    data = fill_nan(df)
    data = get_title(data)
    data = ticket(data)
    data = family_members(data)
    dummies = getDummies(data)
    final_df = pd.concat([data,dummies],axis= 1)
    final_df.drop(['Name','Sex','Pclass','Embarked','title'],inplace = True,axis = 1)
    return final_df

In [657]:
preProcess(train)

Unnamed: 0,PassengerId,Age,SibSp,Parch,Ticket,Fare,Cabin,Family Size,withFamily,1.0,...,S,female,male,Master,Miss,Mr,Mrs,1,2,3
0,1,22.000000,1,0,518,7.2500,1,1,1,1,...,0,0,1,0,0,1,0,0,0,1
1,2,38.000000,1,0,532,71.2833,0,1,1,0,...,0,1,0,0,0,0,1,1,0,0
2,3,26.000000,0,0,551,7.9250,1,0,0,1,...,0,1,0,0,1,0,0,0,0,1
3,4,35.000000,1,0,49,53.1000,0,1,1,1,...,0,1,0,0,0,0,1,1,0,0
4,5,35.000000,0,0,472,8.0500,1,0,0,1,...,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,27.000000,0,0,101,13.0000,1,0,0,1,...,0,0,1,0,0,1,0,0,1,0
887,888,19.000000,0,0,14,30.0000,0,0,0,1,...,0,1,0,0,1,0,0,1,0,0
888,889,29.699118,1,2,553,23.4500,1,3,1,1,...,0,1,0,0,1,0,0,0,0,1
889,890,26.000000,0,0,8,30.0000,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0


In [658]:
preProcess(test)

Unnamed: 0,PassengerId,Age,SibSp,Parch,Ticket,Fare,Cabin,Family Size,withFamily,1,...,3,female,male,Master,Miss,Mr,Mrs,1.1,2,3.1
0,892,34.50000,0,0,152,7.8292,1,0,0,0,...,1,0,1,0,0,1,0,0,0,1
1,893,47.00000,1,0,221,7.0000,1,1,1,1,...,0,1,0,0,0,0,1,0,0,1
2,894,62.00000,0,0,73,9.6875,1,0,0,0,...,1,0,1,0,0,1,0,0,1,0
3,895,27.00000,0,0,147,8.6625,1,0,0,1,...,0,0,1,0,0,1,0,0,0,1
4,896,22.00000,1,1,138,12.2875,1,2,1,1,...,0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,30.27259,0,0,267,8.0500,1,0,0,1,...,0,0,1,0,0,1,0,0,0,1
414,1306,39.00000,0,0,280,108.9000,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0
415,1307,38.50000,0,0,291,7.2500,1,0,0,1,...,0,0,1,0,0,1,0,0,0,1
416,1308,30.27259,0,0,220,8.0500,1,0,0,1,...,0,0,1,0,0,1,0,0,0,1
