In [1]:
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

In [2]:
# Load the training set. header=0 takes the column names from the first row
# of the file, and the PassengerId column is used as the index column.
data_path = "data/train.csv"
data = pd.read_csv(
    data_path,
    index_col="PassengerId",
    header=0,
)

In [3]:
# Preview the first 5 rows (5 is the default row count of head)
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Count the missing values in each column
data.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [5]:
# Age has 177 null values.

# Cabin and Embarked also have some null values, but they are not used
# in this notebook, so they are left as they are.

# Fill the missing ages with the mean age.
# The filled column is assigned back to the frame: calling
# fillna(..., inplace=True) on `data.Age` acts on an intermediate object
# (chained assignment), which newer pandas versions warn about and which
# does not update `data` when copy-on-write is enabled.
age_mean = data["Age"].mean()
data["Age"] = data["Age"].fillna(age_mean)

# Check for null values again to make sure none are left
data["Age"].isna().sum()

0

In [6]:
# Peek at the first five Survived values before they are recoded
data["Survived"].head(5)

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

In [7]:
# Recode the Survived column values from 0/1 to "Not Survived"/"Survived"
survived_mapping = {0: "Not Survived", 1: "Survived"}
data = data.replace(to_replace={'Survived': survived_mapping})

# Preview the recoded column
data["Survived"].head(5)

PassengerId
1    Not Survived
2        Survived
3        Survived
4        Survived
5    Not Survived
Name: Survived, dtype: object

In [8]:
# Count the passengers for each survival outcome
survived_all = data.Survived.value_counts()
survived_all

Not Survived    549
Survived        342
Name: Survived, dtype: int64

In [9]:
# Bar chart: Survived vs Not Survived

# The bar chart takes a list of labels for the x-axis
# and a list of values for the y-axis.

# Labels => ['Not Survived', 'Survived']
bar_labels = survived_all.index.tolist()

# Values => [549, 342]
bar_data = survived_all.tolist()

plot_title = "Survived Passengers Vs Not Survived"
x_label = "Survived or Not Survived"
y_label = "Number"

fig = px.bar(
    x=bar_labels,
    y=bar_data,
    color=bar_labels,
    title=plot_title,
    labels=dict(x=x_label, y=y_label),
)

fig.show()

In [10]:
# The same labels and counts as above can also be drawn as a pie chart
title = "Titanic Survival Percentage"
fig = px.pie(
    names=bar_labels,
    values=bar_data,
    title=title,
)
fig.show()

In [11]:
# Now let's do something more interesting:
# the same breakdown, but with additional info: male or female.

# Group the data by the Sex column, then count the values
# of the 'Survived' column within each group.
survived_data_by_sex = data.groupby("Sex")["Survived"].value_counts()
survived_data_by_sex

Sex     Survived    
female  Survived        233
        Not Survived     81
male    Not Survived    468
        Survived        109
Name: Survived, dtype: int64

In [12]:
# Preview how a single group is selected from the grouped counts
survived_data_by_sex.loc["female"]

Survived
Survived        233
Not Survived     81
Name: Survived, dtype: int64

In [13]:
# Survival for females

# Plotted the same way as the overall bar chart above

female_survival = survived_data_by_sex["female"]
x_data = female_survival.index.tolist()
y_data = female_survival.tolist()
plot_title = "Survived Passengers Vs Not Survived for Females"

fig = px.bar(
    x=x_data,
    y=y_data,
    color=x_data,
    title=plot_title,
    labels={"x": "Survived or Not", "y": "Number"},
)
fig.show()

In [14]:
# If we want to do the same with males
# We will just change survived_data_by_sex["female"] to survived_data_by_sex["male"]

In [15]:
# Now we want to plot male and female together.
# To do this we first need to unstack the grouped counts.
t = data.groupby("Sex")["Survived"].value_counts()

In [16]:
# Preview the grouped counts before unstacking
t

Sex     Survived    
female  Survived        233
        Not Survived     81
male    Not Survived    468
        Survived        109
Name: Survived, dtype: int64

In [17]:
# Preview after unstack.
# The table is built from survived_data_by_sex (the same grouped counts that
# `t` was created from) instead of from `t` itself, so re-running this cell
# always gives the Sex x Survived table rather than unstacking a frame that
# has already been unstacked.
t = survived_data_by_sex.unstack()
t

Survived,Not Survived,Survived
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,81,233
male,468,109


In [18]:
# Row for the male passengers: one count per survival outcome
t.loc["male", :]

Survived
Not Survived    468
Survived        109
Name: male, dtype: int64

In [19]:
# For each sex, print the counts followed by the outcome labels they belong to
for sex in ("male", "female"):
    row = t.loc[sex]
    print(row.to_list())
    print(row.index.to_list())

[468, 109]
['Not Survived', 'Survived']
[81, 233]
['Not Survived', 'Survived']


In [20]:
# Basically, we just put the groups back into a new data frame so we can
# get the data easily

In [21]:
# Now, to plot them side by side
# we need 'traces'.
# We will define a new Figure and add one trace for Male and one for Female

def add_trace(fig,trace_name,x_data,y_data):
    """Add one bar trace to ``fig``.

    Args:
        fig: Figure object whose ``add_trace`` method receives the new bar.
        trace_name: Name given to the bar trace.
        x_data: Values passed to the bar as ``x``.
        y_data: Values passed to the bar as ``y``.
    """
    bar = go.Bar(name=trace_name, x=x_data, y=y_data)
    fig.add_trace(bar)

fig = go.Figure()

# One bar trace per sex, read from the matching row of the unstacked table
for trace_name, sex in (("Male", "male"), ("Female", "female")):
    sex_counts = t.loc[sex]
    add_trace(
        fig,
        trace_name=trace_name,
        x_data=sex_counts.index.to_list(),
        y_data=sex_counts.to_list(),
    )

fig.update_layout(title_text="Male-Female Survival")

fig.show()


In [22]:
# Now we want to put all passengers into
# age categories: Children, Youth, Adults, Old.
# pandas has a function called cut for this:
# you create the bin edges and the labels and pass them to the function.

# Children : 0 - 14
# Youth : 15 - 24
# Adults : 25 - 64
# Old : 65 - 100

age_ranges = [0, 14, 24, 64, 100]
age_labels = ["Children","Youth","Adults","Old"]

# The bins are closed on the right: (0, 14], (14, 24], (24, 64], (64, 100].
# include_lowest=True also closes the first bin on the left, so an age of
# exactly 0 is labelled "Children" instead of becoming NaN.
age_category = pd.cut(
    data.Age,
    bins=age_ranges,
    labels=age_labels,
    include_lowest=True,
)

In [23]:
# Preview the category assigned to the first five passengers
age_category.head()

PassengerId
1     Youth
2    Adults
3    Adults
4    Adults
5    Adults
Name: Age, dtype: category
Categories (4, object): ['Children' < 'Youth' < 'Adults' < 'Old']

In [24]:
# Number of passengers in each age category
age_category.value_counts()

Adults      603
Youth       200
Children     77
Old          11
Name: Age, dtype: int64

In [25]:
# Attach the age categories to the data frame as a new column
data["age_category"] = age_category

# Preview the change
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age_category
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Not Survived,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Youth
2,Survived,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Adults
3,Survived,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Adults
4,Survived,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Adults
5,Not Survived,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Adults


In [26]:
# Count the passengers in each age group.
# The counts column is named explicitly and the index name is cleared:
# from pandas 2.0 on, value_counts() names its result "count" and names the
# index after the column, whereas this notebook looks the counts up as
# all_passengers_by_age["age_category"] on an unnamed index.
all_passengers_by_age = (
    data["age_category"]
    .value_counts()
    .rename_axis(None)
    .to_frame(name="age_category")
)

# Preview
all_passengers_by_age

Unnamed: 0,age_category
Adults,603
Youth,200
Children,77
Old,11


In [27]:
# Calculate the percentage of passengers in each age group.
# The calculation uses only the counts column, so the cell can be re-run
# safely: dividing the whole frame would include the "percentage" column
# once it exists and the single-column assignment would then fail.
age_counts = all_passengers_by_age["age_category"]
total = age_counts.sum()
all_passengers_by_age["percentage"] = (age_counts / total) * 100

# Preview
all_passengers_by_age

Unnamed: 0,age_category,percentage
Adults,603,67.676768
Youth,200,22.446689
Children,77,8.641975
Old,11,1.234568


In [28]:
# Plot the age groups, with each group's percentage as the bar text.
# The colour labels come from the frame's own index so that every bar is
# labelled with its own category: the rows are ordered by count
# (Adults, Youth, Children, Old), which is not the order of age_labels.

fig = px.bar(
    all_passengers_by_age["age_category"],
    color=all_passengers_by_age.index.to_list(),
    text=all_passengers_by_age["percentage"].apply(lambda x: '{0:1.2f}%'.format(x)),
    labels={
        'index':'Age Category',
        'value': 'Number'
    },
    title="Titanic Passengers by Age"
)
fig.show()

In [29]:
# Pie chart of the age categories
fig = px.pie(
    names=all_passengers_by_age.index,
    values=all_passengers_by_age["age_category"],
    title="Titanic Passengers by Age",
)
fig.show()

In [30]:
# Same idea as the male/female breakdown, this time grouped by age category
grouped_by_age_category = (
    data.groupby("age_category")["Survived"].value_counts()
)
grouped_by_age_category

age_category  Survived    
Children      Survived         45
              Not Survived     32
Youth         Not Survived    127
              Survived         73
Adults        Not Survived    380
              Survived        223
Old           Not Survived     10
              Survived          1
Name: Survived, dtype: int64

In [31]:

fig = go.Figure()

# One bar trace per age category: ['Children', 'Youth', 'Adults', 'Old']
for label in age_labels:
    category_counts = grouped_by_age_category[label]
    add_trace(
        fig,
        trace_name=label,
        x_data=category_counts.index.to_list(),
        y_data=category_counts.to_list(),
    )

fig.update_layout(title_text="Titanic Passengers by Age")

fig.show()


In [32]:
# Survival rate vs passenger class

# Count the outcomes per class, then unstack the outcomes into columns
grouped_by_class = (
    data.groupby("Pclass")["Survived"]
    .value_counts()
    .unstack()
)
grouped_by_class

Survived,Not Survived,Survived
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80,136
2,97,87
3,372,119


In [33]:
# Total passengers per class and the percentage for each outcome
total = grouped_by_class["Not Survived"].add(grouped_by_class["Survived"])
grouped_by_class["total"] = total
grouped_by_class["percentage_survived"] = grouped_by_class["Survived"].div(total).mul(100)
grouped_by_class["percentage_not_survived"] = grouped_by_class["Not Survived"].div(total).mul(100)

# Preview the data
grouped_by_class

Survived,Not Survived,Survived,total,percentage_survived,percentage_not_survived
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,80,136,216,62.962963,37.037037
2,97,87,184,47.282609,52.717391
3,372,119,491,24.236253,75.763747


In [34]:
# Keep only 'percentage_survived' and 'percentage_not_survived'.
# The columns are selected by name rather than dropped by position, so the
# result does not depend on the column order of grouped_by_class.
plot_data = grouped_by_class[["percentage_survived", "percentage_not_survived"]].copy()

# Map each class number to its display name explicitly, so a label can
# never be attached to the wrong row.
class_names = {1: 'First Class', 2: 'Second Class', 3: 'Third Class'}
plot_data.index = [class_names[pclass] for pclass in plot_data.index]
plot_data

Survived,percentage_survived,percentage_not_survived
First Class,62.962963,37.037037
Second Class,47.282609,52.717391
Third Class,24.236253,75.763747


In [35]:
# Plot the survived / not-survived percentages for each passenger class.
# The plotted values are percentages rather than counts, so the axes are
# labelled accordingly.

fig = px.bar(
    plot_data, 
    title='Survival Rate VS Passenger Class', 
    labels={
        'index': 'Passenger Class',
        'value': 'Percentage (%)'
    },
)
fig.show()

In [36]:
# Finally, the age distribution:
# a histogram of the passenger ages

group_labels = ['Age']  # name of the dataset
hist_data = [data["Age"]]

fig = ff.create_distplot(hist_data, group_labels)
fig.update_layout(title_text='Passengers Age Distribution')
fig.show()