In [None]:
import pandas as pd

# Load the Titanic dataset (path is relative to the current working directory)
df = pd.read_csv('titanic.csv')

# Display the first few rows of the dataset.
# A bare `df.head()` that is not the last expression of the cell is evaluated
# and then discarded, so nothing would be shown; print it explicitly.
print(df.head())

# Check for missing values (number of nulls per column)
print(df.isnull().sum())

# Handle missing values. Every step is guarded by a column-existence check so
# a dataset lacking one of the columns is skipped instead of raising KeyError.
if 'Cabin' in df.columns:
    # Drop the 'Cabin' column
    df.drop(columns='Cabin', inplace=True)

# Impute missing values in 'Age' with the median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection: that chained in-place form operates
# on an intermediate object, triggers a FutureWarning in pandas 2.x, and does
# not update `df` when Copy-on-Write is active.
if 'Age' in df.columns:
    df['Age'] = df['Age'].fillna(df['Age'].median())

# Impute missing values in 'Embarked' with the mode
if 'Embarked' in df.columns:
    embarked_modes = df['Embarked'].mode()
    # mode() is empty when the column holds no non-null values; in that case
    # there is nothing to impute with, so leave the column untouched.
    if not embarked_modes.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_modes.iloc[0])

# Check again for missing values
print(df.isnull().sum())

# Convert 'Sex' and 'Embarked' to categorical data types if they exist
if 'Sex' in df.columns:
    df['Sex'] = df['Sex'].astype('category')
if 'Embarked' in df.columns:
    df['Embarked'] = df['Embarked'].astype('category')

# Perform EDA
import matplotlib.pyplot as plt
import seaborn as sns

# Summary statistics as produced by DataFrame.describe()
summary_stats = df.describe()
print(summary_stats)

# Grid of histograms, 20 bins each, on a 14x10 figure
df.hist(bins=20, figsize=(14, 10))
plt.show()

# Plots split by survival status, drawn in order. Each entry is
# (x-axis column, seaborn plotting function, extra keyword arguments, title).
distribution_options = {'bins': 20, 'kde': True}
survival_plots = [
    ('Sex', sns.countplot, {}, 'Survival Rate by Gender'),
    ('Pclass', sns.countplot, {}, 'Survival Rate by Class'),
    ('Age', sns.histplot, distribution_options,
     'Age Distribution by Survival Status'),
    ('Fare', sns.histplot, distribution_options,
     'Fare Distribution by Survival Status'),
]

for column, plotter, options, title in survival_plots:
    # Skip a plot when either its column or the 'Survived' column is absent
    if column not in df.columns or 'Survived' not in df.columns:
        continue
    plotter(data=df, x=column, hue='Survived', **options)
    plt.title(title)
    plt.show()
