In [None]:
# Experiment 2: Data Wrangling II
# Aim: Handle missing values, detect outliers, and perform data transformation

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Read the student records from the CSV file in the working directory
df = pd.read_csv("student.csv")

# --- Step 1: Handle Missing Values ---

# Report the per-column count of missing entries before any imputation
print("Missing Values Before Handling:")
print(df.isnull().sum())

# Impute every score column with that column's own mean
for score_col in ('Math_Score', 'Reading_Score', 'Writing_Score'):
    col_mean = df[score_col].mean()
    df[score_col] = df[score_col].fillna(col_mean)

# Impute Gender with its most frequent value, only when something is missing
if df['Gender'].isnull().any():
    most_common_gender = df['Gender'].mode()[0]
    df['Gender'] = df['Gender'].fillna(most_common_gender)

# Report the per-column count again to confirm the fills took effect
print("\nMissing Values After Handling:")
print(df.isnull().sum())

# --- Step 2: Outlier Detection and Removal (Using Z-Score with sklearn) ---

# StandardScaler standardizes each column to (x - mean) / std, which is the
# per-column Z-score of every value.
scaler = StandardScaler()
scores = df[['Math_Score', 'Reading_Score', 'Writing_Score']]
z_scores = scaler.fit_transform(scores)

# Keep only rows whose absolute Z-score is below 3 in every score column
# (i.e., rows that are not outliers in any of the three scores).
# The explicit .copy() makes the filtered frame independent of the original,
# so the column assignments in the following steps do not trigger pandas'
# SettingWithCopyWarning.
df = df[(np.abs(z_scores) < 3).all(axis=1)].copy()

# --- Step 3: Data Transformation ---

# Add a log-scaled version of Math_Score to dampen skew; log1p computes
# log(1 + x), so a score of 0 maps to 0 instead of -inf
math_scores = df['Math_Score']
df['log_math'] = np.log1p(math_scores)

# Derive Duration as (reference year - Club_Join_Date), but only when the
# optional Club_Join_Date column is present in the data
REFERENCE_YEAR = 2025
if 'Club_Join_Date' in df:
    join_values = df['Club_Join_Date']
    df['Duration'] = REFERENCE_YEAR - join_values

# Replace the Gender labels with the integer codes assigned by LabelEncoder
le = LabelEncoder()
gender_codes = le.fit_transform(df['Gender'])
df['Gender'] = gender_codes

# --- Final Cleaned Data Output ---
print("\nFinal Cleaned Data Sample:")
cleaned_sample = df.head()
print(cleaned_sample)

# --- Viva Questions (with simple answers) ---

# Q1: Why handle missing values?
# A1: To avoid errors and make the dataset complete.

# Q2: Why use Z-score?
# A2: A Z-score measures how many standard deviations a value lies from the mean. Values with |Z| >= 3 are treated as outliers here.

# Q3: What is log1p used for?
# A3: log1p(x) computes log(1 + x). It reduces skewness and makes values more normal-like, and it handles 0 safely because log1p(0) = 0 (plain log(0) is undefined).

# Q4: Why encode Gender?
# A4: Machine learning models work with numbers, so text is converted into numeric form.

# Q5: Why remove outliers?
# A5: Outliers can affect accuracy and skew results in analysis.

# Q6: What does Duration mean?
# A6: It shows how long a student has been in the club: the number of years from their join year up to the reference year (2025 in this script).

# Done!
