In [None]:
## Data Wrangling & Analysis: pandas for structuring and transforming raw data

# --- Import libraries ---
import pandas as pd
import numpy as np

# --- Load a sample dataset ---
# We'll use a public dataset: Titanic from seaborn.
# NOTE: sns.load_dataset() retrieves the data from seaborn's online example
# repository (cached locally afterwards), so the first run needs network access.
import seaborn as sns
df = sns.load_dataset("titanic")

print("Dataset loaded! Shape:", df.shape)
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell. More statements follow here, so the preview has to be
# printed explicitly or it is silently discarded.
print(df.head())

# --- Inspect the data ---
print("\nBasic Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()

print("\nSummary Stats:")
# include="all" summarises non-numeric columns as well as numeric ones;
# .T transposes the result so each column of df becomes one row.
print(df.describe(include="all").T)

# --- Handling Missing Values ---
# Tally the NaN entries of every column before any cleaning is applied.
missing_before = df.isna().sum()
print("\nMissing values per column:")
print(missing_before)

# Impute 'age': every missing value is replaced by the column's median.
median_age = df['age'].median()
df['age'] = df['age'].fillna(median_age)

# 'deck' is removed outright instead of being imputed.
df = df.drop(columns='deck')

# Re-count so the effect of the two steps above is visible.
missing_after = df.isna().sum()
print("\nAfter cleaning:")
print(missing_after)

# --- Filtering Data ---
# Build a boolean mask (age strictly greater than 50) and use it to select
# the matching rows.
is_over_50 = df['age'].gt(50)
older_passengers = df.loc[is_over_50]
print("\nPassengers older than 50:", len(older_passengers))

# --- Grouping and Aggregation ---
# Example: Average age by class.
# `observed` is passed explicitly: when the grouping key is a categorical
# column (seaborn's Titanic 'class' appears to be one -- confirm with
# df.dtypes), leaving it out raises a FutureWarning on pandas >= 2.1 because
# the default changes from False to True in pandas 3.0. observed=True only
# differs from the old default when a category has no rows at all, and it is
# a no-op for non-categorical keys.
avg_age_by_class = df.groupby('class', observed=True)['age'].mean()
print("\nAverage age by class:")
print(avg_age_by_class)

# Survival rate by sex: 'survived' is averaged per group, so the mean is the
# fraction of survivors (assuming it is coded 0/1 -- confirm in the data).
survival_rate = df.groupby('sex')['survived'].mean()
print("\nSurvival rate by sex:")
print(survival_rate)

# --- Reshaping Data ---
# Pivot table: mean of 'survived' for every (class, sex) combination, with
# classes as rows and sexes as columns.
# `observed=True` is spelled out because pivoting on a categorical index
# (seaborn's Titanic 'class' appears to be one -- confirm with df.dtypes)
# without it triggers a FutureWarning on recent pandas; the default is
# changing from False to True. It only matters when a category has no rows.
survival_pivot = df.pivot_table(
    values='survived',
    index='class',
    columns='sex',
    aggfunc='mean',
    observed=True,
)
print("\nPivot Table (Survival Rate by Class & Sex):")
print(survival_pivot)

# --- Merging Example ---
# Small lookup table: one row per ticket class with its luxury label.
df_classes = pd.DataFrame(
    [
        ('First', 'High'),
        ('Second', 'Medium'),
        ('Third', 'Low'),
    ],
    columns=['class', 'luxury_level'],
)

# Left join on 'class': every passenger row is kept and gains the
# 'luxury_level' looked up from df_classes.
merged_df = df.merge(df_classes, how='left', on='class')
print("\nMerged with luxury level:")
print(merged_df.loc[:, ['class', 'luxury_level']].head())

# --- Mission Task ---
# Follow-up exercises for the reader, emitted one line per print() call.
mission_lines = (
    "\nMISSION: Now it's your turn!",
    "1. Create a new column called 'family_size' as sibsp + parch + 1.",
    "2. Find the average survival rate by family size.",
    "3. Try filtering for passengers under 18 and compare their survival rate to adults.",
)
for mission_line in mission_lines:
    print(mission_line)
