#  1. Import libraries and load data

In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

# Load Titanic dataset (update file path as per Kaggle environment)
df = pd.read_csv('/kaggle/input/titanic/Titanic-Dataset.csv')

# Display columns to verify dataset structure
print("Columns:", df.columns.tolist())

Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


# 2. Check for missing values and basic info

In [9]:
# Check for missing data counts
print("Missing values summary:")
print(df.isnull().sum())

Missing values summary:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


# 3: Handle missing values

In [10]:
# Fill missing numerical values with mean (for 'Age')
df['Age'].fillna(df['Age'].mean(), inplace=True)

# Fill missing categorical values with mode (for 'Embarked')
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


# 4: Encode categorical features

In [11]:
# Label encode binary 'Sex' column
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

# One-hot encode 'Embarked' column
df = pd.get_dummies(df, columns=['Embarked'])

# 5: Feature scaling

In [12]:
# Standardize 'Age' column
scaler = StandardScaler()
df['Age_scaled'] = scaler.fit_transform(df[['Age']])

# Normalize 'Fare' column
min_max = MinMaxScaler()
df['Fare_normalized'] = min_max.fit_transform(df[['Fare']])

# 6: Preview the final processed dataset

In [13]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S,Age_scaled,Fare_normalized
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,False,False,True,-0.592481,0.014151
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,True,False,False,0.638789,0.139136
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,False,False,True,-0.284663,0.015469
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,False,False,True,0.407926,0.103644
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,False,False,True,0.407926,0.015713
