taken from : https://www.kaggle.com/schmitzi/cleaning-titanic-data-and-running-scikitlearn


## Load the data

In [1]:
import pandas as pd
import numpy as np

# Read the Titanic passenger table; the first line of the file supplies the column names (header=0).
data = pd.read_csv("titanic.csv", sep=",", header=0)

# Show the available column names.
print(data.columns)

Index(['pclass', 'survived', 'name', 'sex', 'Age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')


## Data structure

In [2]:

# List the column names together with the dtype pandas inferred for each one.
print(data.columns)
print(data.dtypes)

Index(['pclass', 'survived', 'name', 'sex', 'Age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')
pclass         int64
survived       int64
name          object
sex           object
Age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object


## Check the first few entries

In [3]:
# Preview the first rows of the raw table.
data.head()


Unnamed: 0,pclass,survived,name,sex,Age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


## Statistics

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()


Unnamed: 0,pclass,survived,Age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881138,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.413493,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.17,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [5]:
# Pairwise correlations between the numeric columns only.
# The text columns (name, sex, ticket, ...) are excluded explicitly: recent pandas
# versions no longer drop them silently and raise an error instead.
data.select_dtypes(include="number").corr()


Unnamed: 0,pclass,survived,Age,sibsp,parch,fare,body
pclass,1.0,-0.312469,-0.408106,0.060832,0.018322,-0.558629,-0.034642
survived,-0.312469,1.0,-0.055512,-0.027825,0.08266,0.244265,
Age,-0.408106,-0.055512,1.0,-0.243699,-0.150917,0.17874,0.058809
sibsp,0.060832,-0.027825,-0.243699,1.0,0.373587,0.160238,-0.099961
parch,0.018322,0.08266,-0.150917,0.373587,1.0,0.221539,0.051099
fare,-0.558629,0.244265,0.17874,0.160238,0.221539,1.0,-0.04311
body,-0.034642,,0.058809,-0.099961,0.051099,-0.04311,1.0


## Estimate incomplete data

The data is incomplete, so the missing values need to be filled in. This is done by using the mean of the values for the numeric columns, and the mode in the case of `embarked`.

In [6]:
# Drop rows that have no passenger name.
# Only `subset` is passed: `how` and `thresh` are mutually exclusive in current pandas
# and supplying both raises an error. The default how="any" keeps the original behavior.
data.dropna(subset=['name'], inplace=True)


In [7]:

# Impute missing values: column mean for Age and fare, most frequent value for embarked.
# The filled column is assigned back instead of calling fillna(inplace=True) on `data[col]`:
# the in-place form operates on an intermediate Series and does not reliably update `data`
# (deprecated chained assignment; it has no effect under pandas copy-on-write).
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['fare'] = data['fare'].fillna(data['fare'].mean())
data['embarked'] = data['embarked'].fillna(data['embarked'].value_counts().idxmax())

In [8]:
## Extract titles and add a column.
# If no title can be found, it defaults to "Mr".
def extract_title(row):
    """Return the honorific from a "Surname, Title. Given names" style name.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the full passenger name under the key 'name'.

    Returns
    -------
    str
        The text between the first comma and the following period, with
        surrounding whitespace removed (e.g. "Miss"). Falls back to "Mr" when
        the name is not a string or does not contain that pattern.
    """
    default_title = "Mr"
    name = row['name']
    if not isinstance(name, str):
        # e.g. NaN for a missing name
        return default_title

    surname_and_rest = name.split(",")
    if len(surname_and_rest) < 2:
        # no comma, so there is no "Surname, Title." structure to parse
        return default_title

    title, period, _ = surname_and_rest[1].partition(".")
    title = title.strip()  # drop the space that follows the comma
    if not period or not title:
        return default_title
    return title

# Apply row-wise (axis=1) so that each call receives one passenger row.
data['Title'] = data.apply(extract_title, axis = 1)
# NOTE: the commented-out lambda draft below is not equivalent: on a row Series,
# `x.name` is the row's index label, not the value of the 'name' column.
#titles = pd.DataFrame(data.apply(lambda x: x.name.split(",")[1].split(".")[0], axis=1), columns=["Title"])

## Drop data that is not helpful

In [9]:
# Remove, in a single in-place call, the columns that are not helpful.
unhelpful_columns = [
    'name',
    'cabin',
    'ticket',
    'body',
    'boat',       # boats must go.
    'home.dest',  # with 386 destinations I would presume it does not matter..
]
data.drop(columns=unhelpful_columns, inplace=True)



## Change the data into numbers

In [10]:
# One-hot encode the categorical columns. For every feature one level is left out
# as the implied baseline. dtype='uint8' keeps the indicator columns numeric (0/1):
# newer pandas versions would otherwise produce True/False columns, which would be
# written as text into the exported "numerical" CSV files.

## add sex dummies (only the first level is kept, the remaining one is the baseline).
dummies = pd.get_dummies(data['sex'], dtype='uint8')
data = data.join(dummies[dummies.columns[0]]).drop(columns=['sex'])

## add title dummies.
dummies = pd.get_dummies(data['Title'], drop_first=True, dtype='uint8')
data = data.join(dummies).drop(columns=['Title'])

## add embarked dummies.
dummies = pd.get_dummies(data['embarked'], drop_first=True, dtype='uint8')
data = data.join(dummies).drop(columns=['embarked'])

## add pc class dummies (integer levels are renamed to readable column names).
dummies = pd.get_dummies(data['pclass'], drop_first=True, dtype='uint8').rename(
    columns={2: 'class-2', 3: 'class-3'}
)
data = data.join(dummies).drop(columns=['pclass'])


In [11]:
# Preview the first rows of the encoded table.
data.head()

Unnamed: 0,survived,Age,sibsp,parch,fare,female,Col,Don,Dona,Dr,...,Mr,Mrs,Ms,Rev,Sir,the Countess,Q,S,class-2,class-3
0,1,29.0,0,0,211.3375,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0.92,1,2,151.55,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,2.0,1,2,151.55,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,30.0,1,2,151.55,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0,25.0,1,2,151.55,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


## Move survived

* Place survived at the end of the table. -> https://cmdlinetips.com/2020/03/move-a-column-to-first-position-in-pandas-dataframe/

In [12]:
# Take the target column out of the frame, then assign it back under the same
# name: a newly assigned column is appended after all existing ones.
survived_col = data.pop('survived')
data['survived'] = survived_col


In [13]:
# Correlation matrix of the encoded features; `survived` is now the last row/column.
data.corr()

Unnamed: 0,Age,sibsp,parch,fare,female,Col,Don,Dona,Dr,Jonkheer,...,Mrs,Ms,Rev,Sir,the Countess,Q,S,class-2,class-3,survived
Age,1.0,-0.190747,-0.130872,0.171521,-0.057397,0.103687,0.021726,0.019578,0.072941,0.017431,...,0.200612,-0.002857,0.069225,0.041049,0.006696,-0.012718,-0.059153,-0.014193,-0.302093,-0.050198
sibsp,-0.190747,1.0,0.373587,0.160224,0.109609,-0.013232,-0.013247,-0.013247,0.0095,-0.013247,...,0.065098,-0.018741,-0.018741,0.013308,-0.013247,-0.048678,0.073709,-0.052419,0.07261,-0.027825
parch,-0.130872,0.373587,1.0,0.221522,0.213125,-0.024637,-0.012304,-0.012304,-0.023566,-0.012304,...,0.217673,-0.017408,-0.012238,-0.012304,-0.012304,-0.100943,0.071881,-0.010057,0.019521,0.08266
fare,0.171521,0.160224,0.221522,1.0,0.185484,0.049481,-0.00298,0.04042,0.030262,-0.0178,...,0.140508,-0.017336,-0.022267,0.012635,0.028444,-0.130054,-0.169894,-0.121372,-0.419616,0.244208
female,-0.057397,0.109609,0.213125,0.185484,1.0,-0.041163,-0.020558,0.037189,-0.037831,-0.020558,...,0.566111,0.052614,-0.058302,-0.020558,0.037189,0.088651,-0.115193,0.028862,-0.116562,0.528693
Col,0.103687,-0.013232,-0.024637,0.049481,-0.041163,1.0,-0.001531,-0.001531,-0.004341,-0.001531,...,-0.023303,-0.002166,-0.004341,-0.001531,-0.001531,-0.017829,-0.054326,-0.028683,-0.060183,0.013449
Don,0.021726,-0.013247,-0.012304,-0.00298,-0.020558,-0.001531,1.0,-0.000765,-0.002168,-0.000765,...,-0.011638,-0.001082,-0.002168,-0.000765,-0.000765,-0.008904,-0.042213,-0.014325,-0.030057,-0.021737
Dona,0.019578,-0.013247,-0.012304,0.04042,0.037189,-0.001531,-0.000765,1.0,-0.002168,-0.000765,...,-0.011638,-0.001082,-0.002168,-0.000765,-0.000765,-0.008904,-0.042213,-0.014325,-0.030057,0.035171
Dr,0.072941,0.0095,-0.023566,0.030262,-0.037831,-0.004341,-0.002168,-0.002168,1.0,-0.002168,...,-0.033006,-0.003067,-0.006149,-0.002168,-0.002168,0.008341,-0.012792,0.00737,-0.085242,0.019049
Jonkheer,0.017431,-0.013247,-0.012304,-0.0178,-0.020558,-0.001531,-0.000765,-0.000765,-0.002168,1.0,...,-0.011638,-0.001082,-0.002168,-0.000765,-0.000765,-0.008904,0.018111,-0.014325,-0.030057,-0.021737


## Save the data

In [14]:
# Export a bare matrix: no header row and no index column.
data.to_csv("titanic_numerical_clean.csv", index = False, header = False)
# Export a labelled copy: header row kept and the row index written as the first column.
# NOTE(review): the "._With_Headers" part of the file name looks like a typo — confirm
# before renaming, since other code may read this exact path.
data.to_csv("titanic_numerical_clean._With_Headers.csv", index = True)

## Dummy variables

Dummy variables are needed here instead of the ordered categories I used before.
This is because the order of the categories has no meaning.
