In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

# Load the penguin measurements and keep only the species label and the
# culmen length column.
penguins_data = pd.read_csv("penguins_size.csv")[
    ["species", "culmen_length_mm"]
]

In [2]:
import pandas as pd
import numpy as np

# --- Input path; change it if the CSV lives somewhere else ---
FILE = "penguins_size.csv"

# --- Load the data ---
df = pd.read_csv(FILE)

# --- Collect the numeric columns and locate the species column ---
# The species match is case-insensitive; the first matching column wins.
num_cols = list(df.select_dtypes(include=[np.number]).columns)
for c in df.columns:
    if c.lower() == "species":
        species_col = c
        break
else:
    # No column is named "species" in any casing.
    species_col = None

# --- Diagnostics for a single numeric column ---
def compute_stats(series: pd.Series, iqr_multiplier: float = 1.5) -> dict:
    """Summarise one numeric column and count its IQR-rule outliers.

    Missing values are dropped and the remaining values are cast to float
    before any statistic is computed.

    Parameters
    ----------
    series : pd.Series
        Column to summarise; its non-missing values must be castable to float.
    iqr_multiplier : float, default 1.5
        Fence width in units of the interquartile range. A value counts as an
        outlier when it lies below ``Q1 - iqr_multiplier * IQR`` or above
        ``Q3 + iqr_multiplier * IQR``.

    Returns
    -------
    dict
        Keys ``n``, ``mean``, ``median``, ``std`` (sample std, ddof=1),
        ``min``, ``max``, ``skew`` (as returned by ``Series.skew``),
        ``n_outliers`` and ``outlier_pct`` (percentage of non-missing values,
        rounded to 2 decimals). Counts are plain ``int`` and every other
        value is a plain ``float``. When the column has no non-missing
        values, ``n`` and ``n_outliers`` are 0 and the rest are NaN.
    """
    s = series.dropna().astype(float)
    n = s.count()
    if n == 0:
        # Nothing to measure: keep the same keys so callers can still build
        # a rectangular summary table.
        return {
            "n": 0, "mean": np.nan, "median": np.nan, "std": np.nan,
            "min": np.nan, "max": np.nan, "skew": np.nan,
            "n_outliers": 0, "outlier_pct": np.nan
        }

    # Tukey fences around the interquartile range.
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - iqr_multiplier * iqr
    upper = q3 + iqr_multiplier * iqr
    outliers = s[(s < lower) | (s > upper)]

    return {
        "n": int(n),
        "mean": float(s.mean()),
        "median": float(s.median()),
        "std": float(s.std(ddof=1)),
        "min": float(s.min()),
        "max": float(s.max()),
        "skew": float(s.skew()),
        "n_outliers": int(outliers.count()),
        # float(...) keeps this field a builtin float like the others
        # (round() on a numpy scalar would otherwise return numpy.float64).
        "outlier_pct": float(round(outliers.count() / n * 100, 2)),
    }

# --- Run the diagnostics on every numeric column ---
# One record per column: the feature name followed by its statistics.
rows = [{"feature": col, **compute_stats(df[col])} for col in num_cols]

summary_df = pd.DataFrame(rows).set_index("feature")

# --- Per-species medians, only when a species column was found ---
# Resulting layout: index = feature, columns = species names.
species_medians = (
    df.groupby(species_col)[num_cols].median().T
    if species_col is not None
    else None
)

# --- Show the results ---
# Render floats with two decimals in every frame printed from here on.
pd.set_option("display.float_format", "{:.2f}".format)
print("Numeric summary (global):", summary_df, "\n", sep="\n")
if species_medians is None:
    print("Không tìm thấy cột 'species' để tính median theo species.")
else:
    print("Median theo species:", species_medians, sep="\n")

# --- Export to CSV ---
# The per-species file is written only when the medians were computed.
summary_df.to_csv("univariate_summary_global.csv")
if species_medians is not None:
    species_medians.to_csv("univariate_median_by_species.csv")

print("\nKết quả đã được lưu: univariate_summary_global.csv, univariate_median_by_species.csv (nếu có).")


Numeric summary (global):
                     n    mean  median    std     min     max  skew  \
feature                                                               
culmen_length_mm   342   43.92   44.45   5.46   32.10   59.60  0.05   
culmen_depth_mm    342   17.15   17.30   1.97   13.10   21.50 -0.14   
flipper_length_mm  342  200.92  197.00  14.06  172.00  231.00  0.35   
body_mass_g        342 4201.75 4050.00 801.95 2700.00 6300.00  0.47   

                   n_outliers  outlier_pct  
feature                                     
culmen_length_mm            0         0.00  
culmen_depth_mm             0         0.00  
flipper_length_mm           0         0.00  
body_mass_g                 0         0.00  


Median theo species:
species            Adelie  Chinstrap  Gentoo
culmen_length_mm    38.80      49.55   47.30
culmen_depth_mm     18.40      18.45   15.00
flipper_length_mm  190.00     196.00  216.00
body_mass_g       3700.00    3700.00 5000.00

Kết quả đã được lưu: univari