In [1]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the Titanic dataset from the CSV next to the notebook
data = pd.read_csv('titanic.csv')

# Initial Data Exploration
# Preview the first few rows of the dataset
print("First 5 Rows of the Data:")
print(data.head())

# Column dtypes / non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
print("\nData Information:")
data.info()
print("\nDescriptive Statistics:")
print(data.describe())

# Count missing values per column
print("\nMissing Values by Column:")
print(data.isnull().sum())


First 5 Rows of the Data:
   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  

Data Information:
<class 'pandas.core.frame.DataFrame'>
Ran

In [3]:
# Fill missing values for visualizations.
# Columns are reassigned (no inplace=True) to avoid chained assignment warnings.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Fare'] = data['Fare'].fillna(data['Fare'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Create FamilySize feature for visualizations
# (SibSp + Parch + 1; the +1 presumably counts the passenger themself)
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

# Convert categorical variables to integer codes.
# Series.map turns every value that is absent from the mapping into NaN, so
# re-running this cell on already-encoded columns would wipe them out.
# Guarding on the dtype keeps the cell safe to execute more than once.
if not pd.api.types.is_numeric_dtype(data['Sex']):
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})
if not pd.api.types.is_numeric_dtype(data['Embarked']):
    data['Embarked'] = data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Print the updated DataFrame head to confirm changes
print(data.head())

   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name  Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    1  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)    0  47.0      1      0   
2                     Myles, Mr. Thomas Francis    1  62.0      0      0   
3                              Wirz, Mr. Albert    1  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)    0  22.0      1      1   

    Ticket     Fare Cabin  Embarked  FamilySize  
0   330911   7.8292   NaN         1           1  
1   363272   7.0000   NaN         2           2  
2   240276   9.6875   NaN         1           1  
3   315154   8.6625   NaN         2           1  
4  3101298  12.2875   NaN         2           3  


In [4]:
# Visualization: Survival Rate by Class
# Stacked bars: number of passengers in each class, coloured by Survived.
bar_chart = (
    alt.Chart(data, title='Survival Rate by Class')
    .mark_bar()
    .encode(
        x=alt.X('Pclass:O'),
        y=alt.Y('count()'),
        color=alt.Color('Survived:N'),
    )
)
bar_chart.display()

# Visualization: Age Distribution by Survival
# Binned Age on the x axis, record count on the y axis, coloured by Survived.
histogram = (
    alt.Chart(data, title='Age Distribution by Survival')
    .mark_bar()
    .encode(
        x=alt.X(field='Age', type='quantitative', bin=True),
        y=alt.Y('count()'),
        color=alt.Color('Survived:N'),
    )
)
histogram.display()

# Visualization: Scatter Plot for Fare vs. Age
# One circle per row; hovering shows the passenger's name, age, fare and outcome.
scatter_plot = (
    alt.Chart(data, title='Fare vs. Age by Survival')
    .mark_circle()
    .encode(
        x=alt.X('Age:Q'),
        y=alt.Y('Fare:Q'),
        color=alt.Color('Survived:N'),
        tooltip=['Name', 'Age', 'Fare', 'Survived'],
    )
)
scatter_plot.display()

# Visualization: Box Plot for Fare by Class and Survival
# Fare distribution per passenger class, coloured by Survived.
box_plot = (
    alt.Chart(data, title='Fare Distribution by Class and Survival')
    .mark_boxplot()
    .encode(
        x=alt.X('Pclass:O'),
        y=alt.Y('Fare:Q'),
        color=alt.Color('Survived:N'),
    )
)
box_plot.display()

# Visualization: Survival by Class and Gender (Facet Grid)
# One row of stacked count bars per Sex value.
# Sex is integer-encoded earlier in the notebook (female -> 0, male -> 1), so
# the row header spells the codes out; otherwise the facets are labelled only
# "0" and "1". Assumes that encoding cell has been run.
facet_grid = alt.Chart(data).mark_bar().encode(
    x='Pclass:O',
    y='count()',
    color='Survived:N'
).facet(
    row=alt.Row('Sex:N', title='Sex (0 = female, 1 = male)')
).properties(
    title='Survival by Class and Gender'
)
facet_grid.display()

# Visualization: Correlation Heatmap
# Pairwise correlations between the selected numeric columns.
numeric_data = data[
    ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'FamilySize']
]
correlation = numeric_data.corr()

# reset_index() exposes the row labels as a column named 'index'; melting on it
# yields one (index, variable, value) record per cell of the matrix.
heatmap = (
    alt.Chart(
        correlation.reset_index().melt(id_vars='index'),
        title='Correlation Heatmap',
    )
    .mark_rect()
    .encode(
        x=alt.X('index:O'),
        y=alt.Y('variable:O'),
        color=alt.Color('value:Q'),
    )
)
heatmap.display()

# Visualization: Survival Rate by Family Size
# Bar height is the mean of Survived within each FamilySize value.
family_survival = (
    alt.Chart(data, title='Survival Rate by Family Size')
    .mark_bar()
    .encode(
        x=alt.X('FamilySize:O'),
        y=alt.Y(aggregate='mean', field='Survived', type='quantitative'),
        color=alt.Color('FamilySize:O'),
    )
)
family_survival.display()

# Visualization: Survival by Port of Embarkation (Stacked Bar Chart)
# Embarked is integer-encoded earlier in the notebook (C -> 0, Q -> 1, S -> 2),
# so the axis title spells the codes out; otherwise the bars are labelled only
# "0", "1" and "2". Assumes that encoding cell has been run.
stacked_bar = alt.Chart(data).mark_bar().encode(
    x=alt.X('Embarked:N', title='Embarked (0 = C, 1 = Q, 2 = S)'),
    y='count()',
    color='Survived:N'
).properties(
    title='Survival by Port of Embarkation'
)
stacked_bar.display()

# Visualization: Survival Rates by Gender and Class (Heatmap)
# Mean of Survived for every (Sex, Pclass) combination, one row per pair.
survival_rates = data.groupby(['Sex', 'Pclass'])['Survived'].mean().reset_index()

# Sex is integer-encoded earlier in the notebook (female -> 0, male -> 1), so
# the y-axis title spells the codes out instead of showing a bare "0"/"1".
# Assumes that encoding cell has been run.
survival_heatmap = alt.Chart(survival_rates).mark_rect().encode(
    x='Pclass:O',
    y=alt.Y('Sex:N', title='Sex (0 = female, 1 = male)'),
    color='Survived:Q',
    tooltip=['Sex', 'Pclass', 'Survived']
).properties(
    title='Survival Rates by Gender and Class'
)
survival_heatmap.display()


In [5]:
# Step 2: Feature Engineering for Analysis
# Extract Title from Name: the run of letters that follows a space and is
# terminated by a period (e.g. "Kelly, Mr. James" -> "Mr").
# The pattern is a raw string: in a plain literal "\." is an invalid escape
# sequence and triggers a warning on current Python versions.
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Fold spelling variants into the common titles, then bucket the listed rare
# titles under 'Other'.
data['Title'] = data['Title'].replace(['Mlle', 'Ms'], 'Miss').replace(['Mme'], 'Mrs')
data['Title'] = data['Title'].replace(
    ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
# Replace the title strings with integer labels
data['Title'] = LabelEncoder().fit_transform(data['Title'])

# Create a Deck feature from the Cabin column: first character of the cabin
# value, or 'Unknown' when the cabin is missing.
data['Deck'] = data['Cabin'].apply(lambda x: x[0] if pd.notna(x) else 'Unknown')
data['Deck'] = LabelEncoder().fit_transform(data['Deck'])

# Final Data Overview After Feature Engineering
print("\nFinal Data Overview After Feature Engineering:")
print(data.head())


Final Data Overview After Feature Engineering:
   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name  Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    1  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)    0  47.0      1      0   
2                     Myles, Mr. Thomas Francis    1  62.0      0      0   
3                              Wirz, Mr. Albert    1  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)    0  22.0      1      1   

    Ticket     Fare Cabin  Embarked  FamilySize  Title  Deck  
0   330911   7.8292   NaN         1           1      2     7  
1   363272   7.0000   NaN         2           2      3     7  
2   240276   9.6875   NaN         1           1      2     7  
3   315154   8.6625   NaN 