# Real Dataset Analysis Notebook

This notebook demonstrates how to analyze a real-world dataset (Titanic dataset) by:
- Loading and inspecting the data structure
- Cleaning missing values appropriately
- Answering specific queries like survival rate by gender

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Display configuration: show every column and let pandas auto-detect the
# output width instead of wrapping wide frames.
pd.options.display.max_columns = None
pd.options.display.width = None

## 1. Load the Dataset

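We'll load the Titanic dataset from `../data/titanic.csv`. If that file is not found, a synthetic Titanic-like dataset is generated instead so that the remaining cells can still run.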

In [None]:
# Load the Titanic dataset, falling back to a synthetic Titanic-like frame
# when the CSV is not available so the rest of the notebook can still run.
try:
    df_titanic = pd.read_csv('../data/titanic.csv')
    print("Dataset loaded successfully!")
    print(f"Shape of the dataset: {df_titanic.shape}")
except FileNotFoundError:
    print("Titanic dataset not found in ../data/titanic.csv")
    print("Creating a sample dataset for demonstration...")

    n_rows = 1000
    n_missing_age = 200

    # Local, seeded generator: the sample is identical on every
    # Restart & Run All and the global NumPy random state is left untouched.
    rng = np.random.default_rng(42)

    def _sample_labels(labels, probs):
        """Draw n_rows labels, keeping np.nan entries as real missing values."""
        # dtype=object stops NumPy from coercing np.nan into the string 'nan',
        # which isnull()/fillna() would not recognise as missing.
        pool = np.array(labels, dtype=object)
        return pool[rng.choice(len(pool), size=n_rows, p=probs)]

    # Create a sample titanic-like dataset for demonstration
    sample_data = {
        'PassengerId': range(1, n_rows + 1),
        'Survived': rng.choice([0, 1], size=n_rows, p=[0.6, 0.4]),
        'Pclass': rng.choice([1, 2, 3], size=n_rows, p=[0.2, 0.3, 0.5]),
        'Name': [f'Name_{i}' for i in range(1, n_rows + 1)],
        'Sex': rng.choice(['male', 'female'], size=n_rows, p=[0.65, 0.35]),
        'Age': np.concatenate([
            rng.normal(30, 12, n_rows - n_missing_age),  # Normal ages
            np.full(n_missing_age, np.nan)               # Some missing values
        ]),
        'SibSp': rng.poisson(0.5, n_rows),
        'Parch': rng.poisson(0.4, n_rows),
        'Ticket': [f'Ticket_{i}' for i in range(1, n_rows + 1)],
        'Fare': rng.exponential(30, n_rows),
        'Cabin': _sample_labels(
            ['A1', 'B2', 'C3', 'D4', 'E5', np.nan],
            [0.1, 0.1, 0.1, 0.1, 0.1, 0.5],
        ),
        'Embarked': _sample_labels(
            ['S', 'C', 'Q', np.nan],
            [0.7, 0.2, 0.08, 0.02],
        ),
    }

    df_titanic = pd.DataFrame(sample_data)

    # Ensure Age values are within reasonable range (NaN entries stay NaN)
    df_titanic['Age'] = df_titanic['Age'].clip(0, 100)

    print("Sample dataset created!")
    print(f"Shape of the dataset: {df_titanic.shape}")

print(f"Columns in the dataset: {list(df_titanic.columns)}")
print()

## 2. Data Inspection

Let's examine the structure of our dataset using info() and describe() methods.

In [None]:
# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
print("Basic Information about the Dataset:")
df_titanic.info()
print()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
print("Statistical Summary:", df_titanic.describe(), "", sep="\n")

In [None]:
# Preview both ends of the frame: the first five and the last five rows.
for heading, preview in (
    ("First 5 rows of the dataset:", df_titanic.head()),
    ("Last 5 rows of the dataset:", df_titanic.tail()),
):
    print(heading)
    print(preview)
    print()

## 3. Missing Value Identification

Let's identify and analyze missing values in the dataset.

In [None]:
# Per-column missing-value count and the share of rows it represents.
null_mask = df_titanic.isnull()
missing_values = null_mask.sum()
missing_percentage = (null_mask.sum() / len(df_titanic)) * 100

# Side-by-side table with one row per column of the dataset.
missing_df = pd.concat(
    [missing_values, missing_percentage],
    axis=1,
    keys=['Missing Count', 'Missing Percentage'],
)

# Only report the columns that actually contain missing values.
print("Missing Values Analysis:")
print(missing_df.loc[missing_df['Missing Count'].gt(0)])
print()

In [None]:
# Heatmap of the null mask: one row per passenger, one column per feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    df_titanic.isnull(),
    yticklabels=False,
    cbar=True,
    cmap='viridis',
    ax=ax,
)
ax.set_title('Heatmap of Missing Values')
plt.show()
print()

## 4. Column-Appropriate Cleaning Strategies

Apply appropriate cleaning strategies for different column types based on the data characteristics.

In [None]:
# Impute each column with a strategy suited to its type. The original columns
# are left untouched; every result is stored in a new '<name>_Cleaned' column.
print("Applying appropriate cleaning strategies for different columns:\n")

# Age -> median, which is robust to outliers.
age_median = df_titanic['Age'].median()
df_titanic['Age_Cleaned'] = df_titanic['Age'].fillna(age_median)
print(f"Filled missing Age values with median: {age_median:.2f}")

# Embarked -> most frequent value, defaulting to 'S' when no mode exists.
embarked_modes = df_titanic['Embarked'].mode()
embarked_mode = 'S' if embarked_modes.empty else embarked_modes[0]
df_titanic['Embarked_Cleaned'] = df_titanic['Embarked'].fillna(embarked_mode)
print(f"Filled missing Embarked values with mode: {embarked_mode}")

# Cabin -> explicit 'Missing' category, since most cabin values are absent.
df_titanic['Cabin_Cleaned'] = df_titanic['Cabin'].fillna('Missing')
print("Filled missing Cabin values with 'Missing' category")

# Fare -> median, as for Age (monetary data).
fare_median = df_titanic['Fare'].median()
df_titanic['Fare_Cleaned'] = df_titanic['Fare'].fillna(fare_median)
print(f"Filled missing Fare values with median: {fare_median:.2f}")
print()

In [None]:
# Verify the cleaning step by comparing missing-value counts in the original
# columns with the counts in their '_Cleaned' counterparts.
cleaned_suffix = '_Cleaned'
cleaned_columns = [col for col in df_titanic.columns if col.endswith(cleaned_suffix)]
# Derive the source column names from the cleaned ones (stripping only the
# trailing suffix) instead of repeating a hard-coded list.
original_with_na = [col[:-len(cleaned_suffix)] for col in cleaned_columns]

# The original columns are intentionally left unmodified, so these counts are
# the "before" numbers, not the result of the cleaning.
print("Missing values in original columns (before cleaning):")
for col in original_with_na:
    if col in df_titanic.columns:
        original_na = df_titanic[col].isnull().sum()
        print(f"{col}: {original_na} missing values")

print()
print("Checking cleaned columns:")
for col in cleaned_columns:
    cleaned_na = df_titanic[col].isnull().sum()
    print(f"{col}: {cleaned_na} missing values")
print()

## 5. Survival Rate Analysis by Gender

Calculate and analyze the survival rate by gender.

In [None]:
# Passenger count, survivor count and survival rate for each gender.
survival_by_gender = (
    df_titanic.groupby('Sex')['Survived']
    .agg(Total_Passengers='count', Survivors='sum', Survival_Rate='mean')
    .round(4)
)

print("Survival Rate by Gender:")
print(survival_by_gender)
print()

# Overall survival rate, shown for comparison with the per-gender rates.
overall_survival_rate = df_titanic['Survived'].mean()
overall_survival_pct = overall_survival_rate * 100
print(f"Overall survival rate: {overall_survival_rate:.4f} ({overall_survival_pct:.2f}%)")
print()

In [None]:
# Two panels: survival rate per gender (bar) and gender share among survivors (pie).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
rate_ax, share_ax = axes

# Left panel: survival rate by gender, with the value written above each bar.
survival_rates = df_titanic.groupby('Sex')['Survived'].mean()
rate_ax.bar(survival_rates.index, survival_rates.values, color=['lightblue', 'lightcoral'])
rate_ax.set_title('Survival Rate by Gender')
rate_ax.set_ylabel('Survival Rate')
for position, rate in zip(range(len(survival_rates)), survival_rates.values):
    rate_ax.text(position, rate + 0.01, f'{rate:.3f}', ha='center', va='bottom')

# Right panel: how the survivors split between genders.
survived_mask = df_titanic['Survived'] == 1
survivors_by_gender = df_titanic[survived_mask].groupby('Sex').size()
share_ax.pie(
    survivors_by_gender.values,
    labels=survivors_by_gender.index,
    autopct='%1.1f%%',
    startangle=90,
)
share_ax.set_title('Distribution of Survivors by Gender')

plt.tight_layout()
plt.show()
print()

## 6. Additional Analysis

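Look at three further views of the data: survival rate by passenger class, the passenger distribution by gender and class, and the age distribution by survival status.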

In [None]:
# Passenger count, survivor count and survival rate for each passenger class.
survival_by_class = (
    df_titanic.groupby('Pclass')['Survived']
    .agg(Total_Passengers='count', Survivors='sum', Survival_Rate='mean')
    .round(4)
)

print("Survival Rate by Passenger Class:")
print(survival_by_class)
print()

In [None]:
# Contingency table of gender (rows) by class (columns), including 'All' totals.
gender_class_crosstab = pd.crosstab(
    index=df_titanic['Sex'],
    columns=df_titanic['Pclass'],
    margins=True,
)
print("Passenger Distribution by Gender and Class:", gender_class_crosstab, "", sep="\n")

In [None]:
# Age distribution by survival status.
# The 0/1 survival flag is mapped to readable labels *before* plotting so that
# seaborn builds the legend itself. Overriding the legend afterwards with
# plt.legend(labels=[...]) assigns labels by artist order, which does not
# necessarily match the hue order and can mislabel the two groups.
age_by_survival = df_titanic.assign(
    Survival_Status=df_titanic['Survived'].map({0: 'No', 1: 'Yes'})
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=age_by_survival,
    x='Age_Cleaned',
    hue='Survival_Status',
    hue_order=['No', 'Yes'],  # fixed order so colours are stable across runs
    bins=30,
    kde=True,
    ax=ax,
)
ax.set_title('Age Distribution by Survival Status')
ax.set_xlabel('Age')
ax.set_ylabel('Count')

# Keep seaborn's legend entries and only adjust the title.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title('Survived')

plt.show()
print()

## 7. Numeric Value Distribution Analysis

Analyze the distribution of numeric values in the dataset.

In [None]:
# Select numeric columns for distribution analysis
numeric_cols = df_titanic.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [col for col in numeric_cols if not col.endswith('_Cleaned')]  # Exclude cleaned columns

print(f"Numeric columns for distribution analysis: {numeric_cols}")
print()

# Display distribution statistics for numeric columns
print("Distribution Statistics for Numeric Columns:")
print(df_titanic[numeric_cols].describe())
print()

In [None]:
# Visualize distributions of key numeric variables
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Age distribution
axes[0, 0].hist(df_titanic['Age_Cleaned'].dropna(), bins=30, edgecolor='black')
axes[0, 0].set_title('Distribution of Age')
axes[0, 0].set_xlabel('Age')
axes[0, 0].set_ylabel('Frequency')

# Fare distribution
axes[0, 1].hist(df_titanic['Fare_Cleaned'], bins=30, edgecolor='black')
axes[0, 1].set_title('Distribution of Fare')
axes[0, 1].set_xlabel('Fare')
axes[0, 1].set_ylabel('Frequency')

# SibSp distribution
sibsp_counts = df_titanic['SibSp'].value_counts().sort_index()
axes[1, 0].bar(sibsp_counts.index, sibsp_counts.values)
axes[1, 0].set_title('Distribution of Siblings/Spouses')
axes[1, 0].set_xlabel('Number of Siblings/Spouses')
axes[1, 0].set_ylabel('Count')

# Parch distribution
parch_counts = df_titanic['Parch'].value_counts().sort_index()
axes[1, 1].bar(parch_counts.index, parch_counts.values)
axes[1, 1].set_title('Distribution of Parents/Children')
axes[1, 1].set_xlabel('Number of Parents/Children')
axes[1, 1].set_ylabel('Count')

plt.tight_layout()
plt.show()
print()

## 8. Correlation Analysis

Analyze correlations between numeric variables.

In [None]:
# Calculate correlation matrix for numeric variables
corr_cols = ['Survived', 'Pclass', 'Age_Cleaned', 'SibSp', 'Parch', 'Fare_Cleaned']
corr_matrix = df_titanic[corr_cols].corr()

print("Correlation Matrix:")
print(corr_matrix.round(3))
print()

# Visualize correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, 
            square=True, fmt='.3f', cbar_kws={'shrink': 0.8})
plt.title('Correlation Matrix of Key Variables')
plt.tight_layout()
plt.show()
print()

## 9. Summary and Insights

Summarize key findings and insights from the analysis.

In [None]:
# Final summary. Every figure printed here is computed from df_titanic, so the
# text stays correct for both the real CSV and the synthetic fallback dataset.
print("=== SUMMARY OF ANALYSIS ===\n")

# Pair each derived '<name>_Cleaned' column with the original it came from.
cleaned_suffix = '_Cleaned'
cleaned_pairs = [
    (col[:-len(cleaned_suffix)], col)
    for col in df_titanic.columns
    if col.endswith(cleaned_suffix) and col[:-len(cleaned_suffix)] in df_titanic.columns
]
derived_cols = [cleaned for _, cleaned in cleaned_pairs]
feature_cols = [col for col in df_titanic.columns if col not in derived_cols]

# "Handled" = values missing in the original column but filled in its cleaned
# counterpart. Counting isnull() over the whole frame would instead report the
# missing values that are still present in the untouched original columns.
missing_handled = sum(
    int(df_titanic[source].isnull().sum() - df_titanic[cleaned].isnull().sum())
    for source, cleaned in cleaned_pairs
)

print("Dataset Overview:")
print(f"- Total passengers: {len(df_titanic)}")
print(f"- Features analyzed: {len(feature_cols)}")
print(f"- Missing values handled: {missing_handled}")
print()

print("Key Findings:")
print(f"- Overall survival rate: {df_titanic['Survived'].mean():.3f} ({df_titanic['Survived'].mean()*100:.1f}%)\n")

# Data-driven insight sentences are collected here and printed further below.
insights = []

if 'Sex' in df_titanic.columns:
    female_survival = df_titanic[df_titanic['Sex'] == 'female']['Survived'].mean()
    male_survival = df_titanic[df_titanic['Sex'] == 'male']['Survived'].mean()
    print(f"Survival Rate by Gender:")
    print(f"  - Female: {female_survival:.3f} ({female_survival*100:.1f}%)\n")
    print(f"  - Male: {male_survival:.3f} ({male_survival*100:.1f}%)\n")
    if female_survival > male_survival:
        insights.append("Females had a higher survival rate than males")
    elif female_survival < male_survival:
        insights.append("Males had a higher survival rate than females")
    else:
        insights.append("Survival rates of females and males could not be told apart")

if 'Pclass' in df_titanic.columns:
    class1_survival = df_titanic[df_titanic['Pclass'] == 1]['Survived'].mean()
    class2_survival = df_titanic[df_titanic['Pclass'] == 2]['Survived'].mean()
    class3_survival = df_titanic[df_titanic['Pclass'] == 3]['Survived'].mean()
    print(f"Survival Rate by Class:")
    print(f"  - First Class: {class1_survival:.3f} ({class1_survival*100:.1f}%)\n")
    print(f"  - Second Class: {class2_survival:.3f} ({class2_survival*100:.1f}%)\n")
    print(f"  - Third Class: {class3_survival:.3f} ({class3_survival*100:.1f}%)\n")
    if class1_survival > class2_survival > class3_survival:
        insights.append("Passengers in higher classes (lower Pclass number) had better survival rates")
    else:
        insights.append("Survival rate did not decrease steadily from first to third class")

# NOTE(review): the three statements below are fixed text and are not derived
# from the data in this cell; confirm them against the plots above.
insights.extend([
    "Age distribution shows mostly adults in the 15-40 age range",
    "Most passengers traveled alone (SibSp and Parch mostly 0)",
    "Fare varied widely, with most passengers paying moderate fares",
])

print("Insights:")
for number, insight in enumerate(insights, start=1):
    print(f"{number}. {insight}")
print()

# Share of missing values per original column, in percent.
missing_pct = df_titanic.isnull().mean() * 100

print("Data Quality Notes:")
for column, treatment in (
    ('Age', "imputed with the median"),
    ('Cabin', "categorized as 'Missing'"),
    ('Embarked', "filled with the mode"),
):
    if column in df_titanic.columns:
        print(f"- {column}: {missing_pct[column]:.1f}% missing values, {treatment}")
print()

print("This analysis demonstrates the importance of exploratory data analysis")
print("and appropriate handling of missing values when working with real-world datasets.")