In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Data collection and preprocessing

In [3]:
# Read the Titanic CSV file into a pandas DataFrame.
df = pd.read_csv('./titanic.csv')
# Preview the first 5 rows of the dataset (5 is the default for head()).
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [4]:
# Show the size of the dataset as a (rows, columns) tuple.
df.shape

(1310, 14)

In [5]:
# List the names of all columns in the dataset.
df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

This dataset contains 1310 rows and 14 columns.

In [6]:
# Summarize the dataset: non-null count and dtype of every column, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   float64
 1   survived   1309 non-null   float64
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   float64
 6   parch      1309 non-null   float64
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(7), object(7)
memory usage: 143.4+ KB


This dataset contains 1310 rows and 14 columns: 7 numeric (`float64`) columns and 7 `object` columns.

In [7]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

There are no duplicate rows in the dataset.

In [8]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

pclass          1
survived        1
name            1
sex             1
age           264
sibsp           1
parch           1
ticket          1
fare            2
cabin        1015
embarked        3
boat          824
body         1189
home.dest     565
dtype: int64

This dataset contains missing values in every column; `cabin`, `boat`, `body`, `home.dest` and `age` have the most.

## Handling the missing values.

In [9]:
# Drop the cabin and name columns.
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
# `columns=` already selects the axis, so the redundant `axis=1` is omitted.
df = df.drop(columns=['cabin', 'name'], errors='ignore')

In [23]:
# Replace the missing values in the numeric columns pclass, age, sibsp, parch,
# fare and body with each column's mean.
# The filled frame is assigned back to df: calling fillna(..., inplace=True) on
# df['col'] is chained assignment, which emits a FutureWarning in recent pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled (the pandas 3.0 default).
df = df.fillna({
    column: df[column].mean()
    for column in ['pclass', 'age', 'sibsp', 'parch', 'fare', 'body']
})

In [24]:
# Mode of the sex column: first the full mode Series, then its first entry.
print(df['sex'].mode(), df['sex'].mode()[0], sep='\n')

0    male
Name: sex, dtype: object
male


In [25]:
# Mode of the ticket column: first the full mode Series, then its first entry.
print(df['ticket'].mode(), df['ticket'].mode()[0], sep='\n')

0    CA. 2343
Name: ticket, dtype: object
CA. 2343


In [26]:
# Mode of the embarked column: first the full mode Series, then its first entry.
print(df['embarked'].mode(), df['embarked'].mode()[0], sep='\n')

0    S
Name: embarked, dtype: object
S


In [27]:
# Mode of the boat column: first the full mode Series, then its first entry.
print(df['boat'].mode(), df['boat'].mode()[0], sep='\n')

0    13
Name: boat, dtype: object
13


In [28]:
# Mode of the home.dest column: first the full mode Series, then its first entry.
print(df['home.dest'].mode(), df['home.dest'].mode()[0], sep='\n')

0    New York, NY
Name: home.dest, dtype: object
New York, NY


In [29]:
# Replace the missing values in the sex, ticket, embarked, boat and home.dest
# columns with each column's mode (the first one when there are ties).
# The filled frame is assigned back to df: calling fillna(..., inplace=True) on
# df['col'] is chained assignment, which emits a FutureWarning in recent pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled (the pandas 3.0 default).
df = df.fillna({
    column: df[column].mode()[0]
    for column in ['sex', 'ticket', 'embarked', 'boat', 'home.dest']
})

In [30]:
# Re-count the missing (NaN) entries in each column after imputation.
df.isna().sum()

pclass       0
survived     1
sex          0
age          0
sibsp        0
parch        0
ticket       0
fare         0
embarked     0
boat         0
body         0
home.dest    0
dtype: int64

In [32]:
# Drop the rows whose survived value is missing; survived is the only column that
# still has a missing value at this point.
# subset=['survived'] limits the drop to that column, so rows can never be removed
# because of a NaN elsewhere, and rebinding df avoids mutating the frame in place.
df = df.dropna(subset=['survived'])

In [33]:
# Confirm that no column has missing (NaN) entries any more.
df.isna().sum()

pclass       0
survived     0
sex          0
age          0
sibsp        0
parch        0
ticket       0
fare         0
embarked     0
boat         0
body         0
home.dest    0
dtype: int64

The dataset no longer contains any missing values.

In [34]:
# Show the size of the cleaned dataset as a (rows, columns) tuple.
df.shape

(1309, 12)

The dataset now contains 1309 rows and 12 columns.

In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Note: the body quartiles all equal the column mean because most body values
# were missing and have been filled with that mean.
df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,12.883199,1.041658,0.86556,51.738879,29.591544
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,22.0,0.0,0.0,7.8958,160.809917
50%,3.0,0.0,29.881135,0.0,0.0,14.4542,160.809917
75%,3.0,1.0,35.0,1.0,0.0,31.275,160.809917
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


## Representing categorical features using bar graphs

Pclass

Sex

SibSp ( # of siblings and spouse)

Parch ( # of parents and children)

Embarked

Cabin