In [102]:
#Import Relevant Libraries for Preprocessing

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

# Load the raw test split.
# A forward slash is used as the separator: it resolves on Windows as well as
# on POSIX systems, whereas a backslash is a path separator on Windows only.
raw_data = pd.read_csv("Data/test.csv")
raw_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [103]:
# Build the working frame from the raw data: remove the Cabin and Ticket
# columns and encode Sex numerically (male -> 0, female -> 1).
# drop() and assign() both return new frames, so raw_data is left untouched.
df = raw_data.drop(columns=["Cabin", "Ticket"]).assign(
    Sex=lambda frame: frame["Sex"].map({"male":0, "female":1}).astype(int)
)

df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,9.6875,Q
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,12.2875,S


In [104]:
sns.set_theme(style="darkgrid")

# Fill the missing ages using the median age of the passengers that share the
# same Sex and Pclass, to reduce the noise in the dataset.
# guess_ages[i, j] holds the value used for Sex == i and Pclass == j + 1.

guess_ages = np.zeros((2,3))

for i in range(0,2):
    for j in range(0,3):
        in_group = (df['Sex'] == i) & (df['Pclass'] == j+1)

        # median() skips missing ages; it is NaN only when the group has no known age
        age_guess = df.loc[in_group, 'Age'].median()

        if pd.isna(age_guess):
            # Nothing to impute from. int(NaN) would raise ValueError, so record
            # NaN for this group and leave its rows unchanged.
            guess_ages[i,j] = np.nan
            continue

        # Round the median to the nearest 0.5
        guess_ages[i,j] = int( age_guess/0.5 + 0.5)*0.5

        # Only rows of this group whose age is missing are overwritten
        df.loc[in_group & df['Age'].isnull(), 'Age'] = guess_ages[i,j]

df.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,9.6875,Q
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,12.2875,S


In [105]:
# Bucket the ages into 16-year wide bands and replace each age with the
# ordinal code of its band:
#   <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, (64, 80] -> 4
# Ages above 80 and missing ages are left as they are.

# Ageband: the age range cut into 5 intervals by pd.cut
df['Ageband'] = pd.cut(df['Age'], bins=5)

# Take the masks from a snapshot of the ages so that codes already written
# back into the column can never be matched by a later band.
age_years = df['Age'].copy()

df.loc[age_years <= 16, 'Age'] = 0
for band_code, upper_bound in enumerate((32, 48, 64, 80), start=1):
    lower_bound = upper_bound - 16
    in_band = (age_years > lower_bound) & (age_years <= upper_bound)
    df.loc[in_band, 'Age'] = band_code


Age    418
dtype: int64


In [106]:
# The Ageband helper column is no longer needed once Age holds the band codes
df = df.drop(columns="Ageband")

# Fill missing Embarked values with the most frequent port.
# mode() ignores missing values and returns its results sorted, so the first
# entry is the most common port (the smallest label if several are tied).
best_port = df["Embarked"].mode().iloc[0]

df = df.fillna({"Embarked": best_port})

df.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,"Kelly, Mr. James",0,2.0,0,0,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,2.0,1,0,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",0,3.0,0,0,9.6875,Q
3,895,3,"Wirz, Mr. Albert",0,1.0,0,0,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,1.0,1,1,12.2875,S


In [107]:
# Encode the port of embarkation numerically: S -> 0, C -> 1, Q -> 2

port_codes = { "S": 0, "C": 1, "Q": 2}
df['Embarked'] = df['Embarked'].map(port_codes)


In [108]:
# Derive two features from the SibSp and Parch columns:
#   Family = SibSp + Parch + 1 (the extra 1 presumably counts the passenger)
#   Alone  = 1 when Family == 1, otherwise 0

df["Family"] = df["SibSp"].add(df["Parch"]).add(1)
print(df["Family"].value_counts())

# Explicit int64 keeps the same dtype as a column initialised with the literal 0
df["Alone"] = df["Family"].eq(1).astype("int64")

print(df["Alone"].value_counts())


1     253
2      74
3      57
4      14
5       7
7       4
11      4
6       3
8       2
Name: Family, dtype: int64
1    253
0    165
Name: Alone, dtype: int64


In [109]:
# Family, SibSp and Parch are removed now that the Alone feature has been built from them

df = df.drop(["Family", "SibSp", "Parch"], axis="columns")

df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Fare,Embarked,Alone
0,892,3,"Kelly, Mr. James",0,2.0,7.8292,2,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,2.0,7.0,0,0
2,894,2,"Myles, Mr. Thomas Francis",0,3.0,9.6875,2,1
3,895,3,"Wirz, Mr. Albert",0,1.0,8.6625,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,1.0,12.2875,0,0


In [110]:
# Extract the title from each name: the run of letters that follows a space
# and is terminated by a period (e.g. "Mr" in "Kelly, Mr. James").
# The pattern is a raw string so that "\." reaches the regex engine unchanged;
# in a normal string literal "\." is an invalid escape sequence.

df["Title"] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse the rarer titles into "Rare" and map variants to their regular form

df['Title'] = df['Title'].replace(
    ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir',
     'Jonkheer', 'Dona'],
    'Rare',
)

df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Encode the titles numerically; a title that is not in the mapping (or a name
# with no title match) ends up as 0

maps = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

df["Title"] = df["Title"].map(maps)
df["Title"] = df["Title"].fillna(0)

df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Fare,Embarked,Alone,Title
0,892,3,"Kelly, Mr. James",0,2.0,7.8292,2,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,2.0,7.0,0,0,3
2,894,2,"Myles, Mr. Thomas Francis",0,3.0,9.6875,2,1,1
3,895,3,"Wirz, Mr. Albert",0,1.0,8.6625,0,1,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,1.0,12.2875,0,0,3


In [112]:
# Name is removed now that the Title feature has been extracted from it
df = df.drop("Name", axis=1)
df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Alone,Title
0,892,3,0,2.0,7.8292,2,1,1
1,893,3,1,2.0,7.0,0,0,3
2,894,2,0,3.0,9.6875,2,1,1
3,895,3,0,1.0,8.6625,0,1,1
4,896,3,1,1.0,12.2875,0,0,3


In [113]:
# Export the preprocessed frame to a CSV file.
# A forward slash replaces the backslash: "\D" is an invalid escape sequence in
# a normal string literal, and "/" is accepted as a separator on every platform.
# NOTE(review): the index is written as an unnamed first column; pass
# index=False if downstream readers do not expect it.
df.to_csv("Data_Processed/Data_Preprocessed_Test.csv")