In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Read the passenger records from the CSV file into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
titanic_df = pd.read_csv(filepath_or_buffer='Titanic-Dataset.csv')

In [3]:
# Preview the raw data: a caption line followed by the first rows of the frame.
print("Initial dataset:", titanic_df.head(), sep="\n")

Initial dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  

In [4]:
# 1. Handling Missing Values
# Drop the 'Cabin' column due to its high percentage of missing values.
# The result is reassigned instead of using inplace=True, and errors='ignore'
# keeps the cell re-runnable: a second execution no longer raises KeyError
# once 'Cabin' has already been removed.
titanic_df = titanic_df.drop(columns=['Cabin'], errors='ignore')

In [5]:
# Impute missing 'Age' values with the median age.
# The filled Series is assigned back to the column. Calling
# fillna(inplace=True) on titanic_df['Age'] is chained assignment on an
# intermediate object: it emits a FutureWarning in pandas 2.x and leaves
# titanic_df unchanged under Copy-on-Write (the default from pandas 3.0).
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].median())


In [6]:
# Impute missing 'Embarked' values with the mode (most frequent value).
# mode() returns a Series, so [0] picks its first entry.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on titanic_df['Embarked']: that chained inplace call
# emits a FutureWarning in pandas 2.x and does not modify titanic_df under
# Copy-on-Write (the default from pandas 3.0).
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(titanic_df['Embarked'].mode()[0])

In [7]:
# Sanity check after imputation: print the per-column count of null entries.
print("\nMissing values after imputation:", titanic_df.isna().sum(), sep="\n")


Missing values after imputation:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [8]:
# 2. Normalization
# Define numerical features
num_features = ['Age', 'Fare']


In [9]:
# Rescale the numerical columns with StandardScaler: fit the scaler on them,
# then overwrite the columns with the transformed values.
scaler = StandardScaler().fit(titanic_df[num_features])
titanic_df[num_features] = scaler.transform(titanic_df[num_features])

In [10]:
# 3. Encoding Categorical Variables
# Define categorical features
cat_features = ['Sex', 'Embarked']

In [12]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded feature, so k categories become k-1 indicator columns.
titanic_df = pd.get_dummies(
    data=titanic_df,
    columns=cat_features,
    drop_first=True,
)

In [14]:
# 4. Feature Engineering
# New column 'FamilySize': element-wise sum of the 'SibSp' and 'Parch' columns.
titanic_df['FamilySize'] = titanic_df['SibSp'].add(titanic_df['Parch'])

In [15]:
# Drop columns that won't be used for modeling.
# The result is reassigned instead of using inplace=True, and errors='ignore'
# keeps the cell re-runnable: executing it again after the columns are gone
# no longer raises KeyError.
titanic_df = titanic_df.drop(
    columns=['PassengerId', 'Name', 'Ticket'],
    errors='ignore',
)

In [16]:
# Preview the result of preprocessing: a caption line followed by the first rows.
print("\nProcessed dataset:", titanic_df.head(), sep="\n")


Processed dataset:
   Survived  Pclass       Age  SibSp  Parch      Fare  Sex_male  Embarked_Q  \
0         0       3 -0.565736      1      0 -0.502445      True       False   
1         1       1  0.663861      1      0  0.786845     False       False   
2         1       3 -0.258337      0      0 -0.488854     False       False   
3         1       1  0.433312      1      0  0.420730     False       False   
4         0       3  0.433312      0      0 -0.486337      True       False   

   Embarked_S  FamilySize  
0        True           1  
1       False           1  
2        True           0  
3        True           1  
4        True           0  
