<a href="https://colab.research.google.com/github/mehrishka177/-------------heh--heh/blob/main/Untitled18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -----------------------------------
# 🐧 Penguin Differentiation Capstone
# -----------------------------------

# 📦 Import required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 🧪 Shared figure styling: seaborn's "whitegrid" theme plus a default
# figure size of (10, 6) for every plot created afterwards.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

# 📁 Load the dataset from uploaded CSV
# NOTE(review): the path is hard-coded; confirm it matches where the CSV
# actually lives in the runtime this notebook is executed in.
file_path = "/mnt/data/penguins_size.csv"

# A bare "." cell is read as missing, so it is counted in the missing-value
# report below and removed by the cleaning step instead of surviving as a
# bogus category in the later group-by.
# NOTE(review): some copies of this dataset reportedly carry a "." placeholder
# in the `sex` column — verify against the uploaded file.
df = pd.read_csv(file_path, na_values=["."])

# The rest of this script addresses the bill measurements as `bill_*`.
# NOTE(review): some copies of this CSV reportedly name them `culmen_*`;
# confirm against the uploaded file. The rename is a no-op when the
# `culmen_*` columns are absent, and is skipped for any target name that
# already exists so no duplicate column can be created.
legacy_column_names = {
    "culmen_length_mm": "bill_length_mm",
    "culmen_depth_mm": "bill_depth_mm",
}
df = df.rename(
    columns={
        old: new
        for old, new in legacy_column_names.items()
        if new not in df.columns
    }
)

# 📝 Display first few rows
print("📌 First 5 rows of the dataset:")
print(df.head())

# 🔍 General info about the dataset
print("\n📌 Dataset Information:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()

# 📊 Summary statistics
print("\n📌 Summary Statistics:")
print(df.describe(include='all'))

# 🔧 Check and clean missing values
print("\n📌 Missing Values Before Cleaning:")
print(df.isnull().sum())

# Drop every row that has at least one missing value. Reassigning (rather
# than inplace=True) keeps the cleaning step an ordinary expression.
df = df.dropna()

print("\n📌 Missing Values After Cleaning:")
print(df.isnull().sum())

# ✅ Check columns to confirm expected names
print("\n📌 Column Names:")
print(df.columns)

# 1️⃣ Univariate Analysis
# ------------------------

# 🔢 Countplot: Species Distribution
# `hue` mirrors `x` so the palette is applied per species: seaborn >= 0.13
# deprecates passing `palette` without `hue` (removal announced for 0.14).
# The legend would only repeat the x tick labels, so it is switched off.
# NOTE: the `legend` keyword on countplot requires seaborn >= 0.13.
species_ax = sns.countplot(
    data=df,
    x="species",
    hue="species",
    palette="Set2",
    legend=False,
)
species_ax.set_title("Distribution of Penguin Species")
species_ax.set_xlabel("Species")
species_ax.set_ylabel("Count")
plt.show()

# 📊 Histograms: Bill Length and Flipper Length Distributions
# One figure per measurement, each split by species with a KDE overlay.
# The (column, label) pairs drive both the plotted column and the text
# used for the title and x-axis label.
histogram_specs = [
    ("bill_length_mm", "Bill Length"),
    ("flipper_length_mm", "Flipper Length"),
]
for column_name, readable_label in histogram_specs:
    sns.histplot(
        data=df,
        x=column_name,
        hue="species",
        kde=True,
        palette="muted",
    )
    plt.title(f"{readable_label} Distribution by Species")
    plt.xlabel(f"{readable_label} (mm)")
    plt.show()

# 2️⃣ Bivariate Analysis
# ------------------------

# 🔁 Scatter Plot: Bill Length vs Bill Depth
# Colour encodes species and marker shape encodes sex; labels are set on the
# Axes that seaborn returns rather than through pyplot's current-axes state.
scatter_ax = sns.scatterplot(
    data=df,
    x="bill_length_mm",
    y="bill_depth_mm",
    hue="species",
    style="sex",
    palette="Dark2",
)
scatter_ax.set_title("Bill Length vs Bill Depth by Species")
scatter_ax.set_xlabel("Bill Length (mm)")
scatter_ax.set_ylabel("Bill Depth (mm)")
plt.show()

# 📦 Boxplot: Flipper Length by Species
# `hue` mirrors `x` so the palette is applied per species: seaborn >= 0.13
# deprecates passing `palette` without `hue` (removal announced for 0.14).
# The legend would only repeat the x tick labels, so it is switched off.
# NOTE: the `legend` keyword on boxplot requires seaborn >= 0.13.
flipper_ax = sns.boxplot(
    data=df,
    x="species",
    y="flipper_length_mm",
    hue="species",
    palette="pastel",
    legend=False,
)
flipper_ax.set_title("Flipper Length Distribution Across Species")
flipper_ax.set_xlabel("Species")
flipper_ax.set_ylabel("Flipper Length (mm)")
plt.show()

# 📦 Boxplot: Body Mass by Species
# Same hue/legend treatment as the flipper-length boxplot above.
mass_ax = sns.boxplot(
    data=df,
    x="species",
    y="body_mass_g",
    hue="species",
    palette="coolwarm",
    legend=False,
)
mass_ax.set_title("Body Mass Distribution Across Species")
mass_ax.set_xlabel("Species")
mass_ax.set_ylabel("Body Mass (g)")
plt.show()

# 3️⃣ Pairwise Feature Analysis
# ------------------------------

# 🔄 Pairplot: every pair of numeric columns, coloured by species
# The title is attached to the grid's own figure and lifted slightly
# (y=1.02) so it sits above the top row of panels.
pair_grid = sns.pairplot(data=df, hue="species", palette="colorblind")
pair_grid.figure.suptitle(
    "Pairwise Comparison of Features by Species",
    y=1.02,
)
plt.show()

# 4️⃣ Group Statistics
# ---------------------

# 📈 Per-group means of the four body measurements
# The column list is shared by both summaries below.
measurement_columns = [
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]

# Averages for each species
by_species = df.groupby("species")
grouped_stats = by_species[measurement_columns].mean()
print("\n📌 Mean Measurements by Species:")
print(grouped_stats)

# Averages for each (species, sex) combination
by_species_and_sex = df.groupby(["species", "sex"])
grouped_stats_sex = by_species_and_sex[measurement_columns].mean()
print("\n📌 Mean Measurements by Species and Sex:")
print(grouped_stats_sex)
