# Titanic Dataset - Data Preprocessing Activity-1

**Objective:** Use the Titanic dataset to perform data cleaning and preprocessing for machine learning.

In [None]:
# Read the raw Titanic data into a DataFrame and preview the first rows
import pandas as pd

csv_path = 'titanic.csv'  # must be present in the current working directory
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Inspect data: print the column overview, then show summary statistics
df.info()
summary_stats = df.describe()
summary_stats

In [None]:
# Handle missing values
# Age: fill gaps with the column median.
df['Age'] = df['Age'].fillna(df['Age'].median())
# Embarked: fill gaps with the most frequent value (first mode).
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
# Cabin: drop the column instead of the rows that lack a value.
# Dropping rows on Cabin would discard every passenger without a recorded
# cabin (the bulk of the standard Titanic data -- verify with df.info()),
# and would still leave a free-text column in the feature set.
df.drop(columns=['Cabin'], inplace=True)

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Cast columns to their target dtypes.
# Casting Age to int discards any fractional part and requires that Age has
# no missing values left (they are imputed in the missing-values step).
target_dtypes = {'Age': int, 'Fare': float}
for column_name, column_dtype in target_dtypes.items():
    df[column_name] = df[column_name].astype(column_dtype)

In [None]:
# Remove outliers in 'Fare' column using the 1.5 * IQR rule
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Keep only rows whose fare lies within the bounds (both ends inclusive).
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df[(df['Fare'] >= lower_bound) & (df['Fare'] <= upper_bound)].copy()

In [None]:
# Encode categorical variables
# Sex -> binary code; any value other than 'male'/'female' becomes NaN.
sex_codes = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(sex_codes)
# Embarked -> one-hot indicator columns, dropping the first level.
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

In [None]:
# Standardize the continuous columns with scikit-learn's StandardScaler
from sklearn.preprocessing import StandardScaler

scaled_columns = ['Age', 'Fare']
scaler = StandardScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

In [None]:
# Feature engineering
# FamilySize = the passenger (1) + siblings/spouses + parents/children.
df['FamilySize'] = 1 + df['SibSp'] + df['Parch']
# Name and Ticket are removed from the feature set.
df.drop(columns=['Name', 'Ticket'], inplace=True)

In [None]:
# Split the frame into the target vector (y) and the feature matrix (X)
target_column = 'Survived'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Write the preprocessed data to disk (without the index) and preview it
output_path = 'titanic_cleaned.csv'
df.to_csv(output_path, index=False)
df.head()