In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Read the tab-separated passenger file into a DataFrame and preview ten rows.
data = pd.read_csv(
    "titanic.tsv",
    sep="\t",
)
data.head(n=10)

Unnamed: 0,Survived,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,530,2,Hocking\t Mr. Richard George,male,23.0,2,1,29104,11.5,,S
1,0,466,3,Goncalves\t Mr. Manuel Estanslas,male,38.0,0,0,SOTON/O.Q. 3101306,7.05,,S
2,0,753,3,Vande Velde\t Mr. Johannes Joseph,male,33.0,0,0,345780,9.5,,S
3,0,855,2,Carter\t Mrs. Ernest Courtenay (Lilian Hughes),female,44.0,1,0,244252,26.0,,S
4,0,333,1,Graham\t Mr. George Edward,male,38.0,0,1,PC 17582,153.4625,C91,S
5,0,39,3,Vander Planke\t Miss. Augusta Maria,female,18.0,2,0,345764,18.0,,S
6,0,236,3,Harknett\t Miss. Alice Phoebe,female,,0,0,W./C. 6609,7.55,,S
7,0,303,3,Johnson\t Mr. William Cahoone Jr,male,19.0,0,0,LINE,0.0,,S
8,1,18,2,Williams\t Mr. Charles Eugene,male,,0,0,244373,13.0,,S
9,1,505,1,Maioni\t Miss. Roberta,female,16.0,0,0,110152,86.5,B79,S


In [101]:
# Report the frame's dimensions as (rows, columns).
print(data.shape)

(623, 12)


In [102]:
# Show the dtype pandas inferred for every column of the raw frame.
data.dtypes

Survived         int64
PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [103]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

Unnamed: 0,Survived,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,623.0,623.0,623.0,500.0,623.0,623.0,623.0
mean,0.382022,439.173355,2.343499,29.35552,0.558587,0.375602,30.357637
std,0.486272,256.219803,0.818291,14.537372,1.157675,0.78788,43.910782
min,0.0,1.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,216.5,2.0,20.0,0.0,0.0,7.925
50%,0.0,445.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,658.0,3.0,38.0,1.0,0.0,30.5
max,1.0,891.0,3.0,74.0,8.0,6.0,512.3292


In [104]:
# Number of missing entries in each column of the raw frame.
data.isna().sum()

Survived         0
PassengerId      0
Pclass           0
Name             0
Sex              0
Age            123
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          486
Embarked         2
dtype: int64

In [105]:
# Count the distinct (non-null) values in every column.
cols = list(data.columns)
data.loc[:, cols].nunique()

Survived         2
PassengerId    623
Pclass           3
Name           623
Sex              2
Age             82
SibSp            7
Parch            7
Ticket         505
Fare           213
Cabin          107
Embarked         3
dtype: int64

#### Embarked

In [106]:
# Frequency of each embarkation label; value_counts leaves missing entries out.
data["Embarked"].value_counts()

S    453
C    118
Q     50
Name: Embarked, dtype: int64

In [107]:
# Impute missing ports with the most frequent label *before* encoding.
# Encoding first would send NaN down the catch-all branch (code 2, i.e. "Q"),
# leaving nothing for fillna to fill and mislabelling the missing rows.
# Encoding afterwards: "S" -> 0, "C" -> 1, anything else -> 2.
data["Embarked"] = (
    data["Embarked"]
    .fillna(data["Embarked"].mode()[0])
    .map({"S": 0, "C": 1})
    .fillna(2)  # every label other than "S"/"C" shares code 2
    .astype("int64")
)

#### Sex

In [108]:
# Frequency of each label in the Sex column before it is encoded.
data["Sex"].value_counts()

male      405
female    218
Name: Sex, dtype: int64

In [109]:
# Binary-encode Sex: "male" becomes 0, every other value becomes 1.
data["Sex"] = data["Sex"].ne("male").astype("int64")

#### Survived

In [110]:
# Class balance of the target column.
data["Survived"].value_counts()

0    385
1    238
Name: Survived, dtype: int64

In [111]:
# Normalise the target to 0/1: values equal to 0/False map to 0, all others to 1.
data["Survived"] = data["Survived"].ne(0).astype("int64")

#### Age

In [112]:
# Fill missing ages with the column mean, rounded to two decimals.
data["Age"] = data["Age"].fillna(
    data["Age"].mean().round(2)
)

In [114]:
# Frequency of each age value after imputation; the imputed mean dominates.
data["Age"].value_counts()

29.36    123
22.00     20
21.00     19
24.00     19
19.00     17
        ... 
20.50      1
12.00      1
0.75       1
10.00      1
46.00      1
Name: Age, Length: 83, dtype: int64

#### Cabin

In [115]:
# Deck = first character of the cabin code; passengers without a cabin
# get the placeholder label "NA".
data["CabinDeck"] = data["Cabin"].str.get(0).fillna("NA")

In [116]:
# Frequency of each deck label, including the "NA" placeholder.
data["CabinDeck"].value_counts()

NA    486
C      35
B      34
D      25
E      18
F      11
A       9
G       4
T       1
Name: CabinDeck, dtype: int64

In [117]:
# Integer code assigned to each deck label; "NA" (no cabin recorded) is 0.
mapping = {
    "NA": 0,
    "C": 1,
    "B": 2,
    "E": 3,
    "D": 4,
    "A": 5,
    "F": 6,
    "G": 7,
    "T": 8,
}

In [118]:
# Encode deck labels with `mapping`.
# Series.map is used rather than Series.replace: replacing strings with ints
# on an object column only yields an integer dtype through silent
# downcasting, which pandas 2.2 deprecates. map builds the integer result
# directly.
# NOTE: a label missing from `mapping` becomes NaN here instead of being kept as a string.
data["CabinDeck"] = data["CabinDeck"].map(mapping)

In [119]:
# Frequency of each deck code after encoding; should mirror the label counts.
data["CabinDeck"].value_counts()

0    486
1     35
2     34
4     25
3     18
6     11
5      9
7      4
8      1
Name: CabinDeck, dtype: int64

#### Name

In [121]:
# Pull the honorific (the word followed by a period) out of every name,
# tally how often each one occurs, and show the four most frequent.
name_counts = (
    data["Name"]
    .str.extract(r" ([A-Za-z]+)\.")
    .iloc[:, 0]
    .value_counts()
)
print(name_counts.head(4).index)

Index(['Mr', 'Miss', 'Mrs', 'Master'], dtype='object')


In [122]:
# Extract each passenger's honorific, then collapse every title outside the
# four most frequent ones (per `name_counts`) into a single "Other" bucket.
data["Title"] = data["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
unique_titles = name_counts.index[4:]
data["Title"] = data["Title"].mask(data["Title"].isin(unique_titles), "Other")

In [123]:
# Frequency of each title bucket after rare titles were grouped as "Other".
data["Title"].value_counts()

Mr        361
Miss      128
Mrs        85
Master     31
Other      18
Name: Title, dtype: int64

In [124]:
# Integer-encode the title: Mr=0, Miss=1, Mrs=2, Master=3, anything else=4.
data["Title"] = (
    data["Title"]
    .map({"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3})
    .fillna(4)
    .astype("int64")
)

#### Ticket

In [125]:
# How many passengers share each ticket identifier.
data["Ticket"].value_counts()

CA 2144       6
347082        5
347088        5
CA. 2343      5
1601          5
             ..
370372        1
345774        1
349251        1
349208        1
SO/C 14885    1
Name: Ticket, Length: 505, dtype: int64

In [126]:
# Lookup table: ticket identifier -> number of passengers holding it.
ticket_dict = data["Ticket"].value_counts().to_dict()

In [127]:
# Number of *other* passengers on the same ticket (holders of the ticket minus one).
# A vectorised lookup replaces the per-row lambda, whose fallback branch
# returned the raw ticket string and would have mixed text into this
# numeric feature had it ever been taken. Tickets absent from `ticket_dict`
# now yield NaN.
data["TravelCompanion"] = data["Ticket"].map(ticket_dict) - 1

In [128]:
# Drop the raw text columns now that their derived features exist.
data = data.drop(columns=["Name", "Cabin", "Ticket"])

In [129]:
# Preview the first ten rows of the frame in its current state.
data.head(10)

Unnamed: 0,Survived,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinDeck,Title,TravelCompanion
0,0,530,2,0,23.0,2,1,11.5,0,0,0,0
1,0,466,3,0,38.0,0,0,7.05,0,0,0,0
2,0,753,3,0,33.0,0,0,9.5,0,0,0,0
3,0,855,2,1,44.0,1,0,26.0,0,0,2,0
4,0,333,1,0,38.0,0,1,153.4625,0,1,0,1
5,0,39,3,1,18.0,2,0,18.0,0,0,1,1
6,0,236,3,1,29.36,0,0,7.55,0,0,1,0
7,0,303,3,0,19.0,0,0,0.0,0,0,0,1
8,1,18,2,0,29.36,0,0,13.0,0,0,0,0
9,1,505,1,1,16.0,0,0,86.5,0,2,1,0


In [130]:
# Re-count missing entries per column to confirm whether any remain.
data.isna().sum()

Survived           0
PassengerId        0
Pclass             0
Sex                0
Age                0
SibSp              0
Parch              0
Fare               0
Embarked           0
CabinDeck          0
Title              0
TravelCompanion    0
dtype: int64

In [131]:
# List the dtype of every remaining column.
data.dtypes

Survived             int64
PassengerId          int64
Pclass               int64
Sex                  int64
Age                float64
SibSp                int64
Parch                int64
Fare               float64
Embarked             int64
CabinDeck            int64
Title                int64
TravelCompanion      int64
dtype: object