In [35]:
# packages
# import libraries
import os

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [36]:
# Read the raw train and test CSV files; in both, the PassengerId column
# becomes the row index.
read_kwargs = {'index_col': 'PassengerId'}
df_train = pd.read_csv('../data/train.csv', **read_kwargs)
df_test = pd.read_csv('../data/test.csv', **read_kwargs)

# Preview the first rows of each frame (train first, then test).
display(df_train.head(), df_test.head())


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [37]:
# Stack the training rows (without the Survived target) on top of the test
# rows so both splits go through the same cleaning steps. sort=True orders
# the columns alphabetically.
df_combined = pd.concat(
    [df_train.drop(columns=['Survived']), df_test],
    axis='index',
    sort=True,
)

print(df_combined.shape)
display(df_combined.head())

(1309, 10)


Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,A/5 21171
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,PC 17599
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,STON/O2. 3101282
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,113803
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,373450


In [38]:
# Print each column's dtype and non-null count to see where values are missing.
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 10 columns):
Age         1046 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
Fare        1308 non-null float64
Name        1309 non-null object
Parch       1309 non-null int64
Pclass      1309 non-null int64
Sex         1309 non-null object
SibSp       1309 non-null int64
Ticket      1309 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB


In [39]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the combined frame.
df_combined.describe()

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp
count,1046.0,1308.0,1309.0,1309.0,1309.0
mean,29.881138,33.295479,0.385027,2.294882,0.498854
std,14.413493,51.758668,0.86556,0.837836,1.041658
min,0.17,0.0,0.0,1.0,0.0
25%,21.0,7.8958,0.0,2.0,0.0
50%,28.0,14.4542,0.0,3.0,0.0
75%,39.0,31.275,0.0,3.0,1.0
max,80.0,512.3292,9.0,3.0,8.0


# Impute data and drop columns


In [40]:
# Fill gaps in the numeric columns with that column's mean.
# numeric_only=True is needed on pandas >= 2.0, where DataFrame.mean() raises
# a TypeError on the text columns instead of silently skipping them.
df_combined = df_combined.fillna(df_combined.mean(numeric_only=True))

# Embarked is a text column, so the mean fill above leaves its gaps untouched.
# Fill them with the most frequent port; otherwise one-hot encoding with
# drop_first=True gives those rows all-zero dummies, which makes them
# indistinguishable from the dropped baseline category.
df_combined['Embarked'] = df_combined['Embarked'].fillna(
    df_combined['Embarked'].mode()[0]
)

# Drop the free-text / mostly-empty columns that are not used as features.
df_combined = df_combined.drop(columns=['Ticket', 'Name', 'Cabin'])

# Encode Sex as a 0/1 indicator (male -> 0, female -> 1).
df_combined['Sex'] = df_combined['Sex'].map({'male': 0, 'female': 1})

# Get dummies

In [41]:
# One-hot encode the remaining text columns, dropping the first level of each
# to avoid a redundant (perfectly collinear) indicator.
# The dtype is pinned because pandas >= 2.0 defaults the dummy columns to bool,
# which would write True/False to the cleaned CSVs instead of 0/1.
df_combined = pd.get_dummies(df_combined, drop_first=True, dtype=np.uint8)
display(df_combined.head())

Unnamed: 0_level_0,Age,Fare,Parch,Pclass,Sex,SibSp,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,22.0,7.25,0,3,0,1,0,1
2,38.0,71.2833,0,1,1,1,0,0
3,26.0,7.925,0,3,1,0,0,1
4,35.0,53.1,0,1,1,1,0,1
5,35.0,8.05,0,3,0,0,0,1


In [42]:
# Split the cleaned feature frame back into its train and test parts.
# The training rows were concatenated first, so they occupy the first
# len(df_train) positions of df_combined.
n_train = len(df_train)

# Re-attach the Survived target to the training features, matching rows on
# the shared index.
train_data = df_combined.iloc[:n_train].merge(
    df_train[['Survived']], left_index=True, right_index=True
)
test_data = df_combined.iloc[n_train:]

# Guard against rows being lost or duplicated by the split / merge.
assert len(train_data) == n_train, 'train split does not match df_train row count'
assert len(test_data) == len(df_test), 'test split does not match df_test row count'

print(train_data.shape)
print(test_data.shape)

# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing.
os.makedirs('../cleaned_data', exist_ok=True)
train_data.to_csv('../cleaned_data/train_data_simple_clean.csv')
test_data.to_csv('../cleaned_data/test_data_simple_clean.csv')

(891, 9)
(418, 8)
