<a href="https://colab.research.google.com/github/madhupandeyy/6th-Sem-ML-LAB/blob/main/LAB_2_ML_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 2: Data Preprocessing for Diabetes and Adult Income Datasets

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

# Function to check missing values
def check_missing(df, name):
    """Print a missing-value report for a DataFrame.

    Args:
        df: DataFrame to inspect.
        name: Label used for the dataset in the printed messages.

    Prints the per-column null counts when at least one value is missing,
    otherwise prints a single "no missing values" line followed by a blank
    line. Returns nothing.
    """
    null_counts = df.isnull().sum()

    # Guard clause: nothing missing anywhere -> short message and done.
    if not null_counts.any():
        print(f"No missing values in {name}\n")
        return

    print(f"Missing values detected in {name}:")
    print(null_counts)

# ====================== Diabetes Dataset ======================
# Pipeline: load -> drop rows with missing values -> label-encode the
# categorical columns -> clip outliers (IQR) -> split features/target ->
# scale the features with MinMaxScaler and StandardScaler.

# 1. Load diabetes dataset
# NOTE(review): the path contains a space before ".csv"; keep it only if it
# matches the uploaded file name exactly.
print("Loading Diabetes dataset...")
diabetes = pd.read_csv('/content/Dataset of Diabetes .csv')
print(diabetes.head())  # Show first few rows
print(f"Shape after load: {diabetes.shape}\n")

# 2. Check for missing values before cleaning
check_missing(diabetes, 'Diabetes')

# 3. Drop any rows with missing values
diabetes = diabetes.dropna()
print("After dropping missing values:")
print(f"Shape: {diabetes.shape}\n")
check_missing(diabetes, 'Diabetes')

# 4. Encode categorical columns ('Gender' and 'CLASS') with label encoding.
#    Normalise the raw labels first: LabelEncoder treats 'f' / 'F' or
#    'N' / 'N ' as different classes, so stray whitespace or case variants
#    would silently create extra categories.
for col in ('Gender', 'CLASS'):
    diabetes[col] = diabetes[col].astype(str).str.strip().str.upper()

#    One encoder per column: refitting a single encoder on 'CLASS' would
#    discard the 'Gender' mapping and make it impossible to invert later.
#    `le_diab` still ends up fitted on 'CLASS'.
le_gender = LabelEncoder()
le_diab = LabelEncoder()
diabetes['Gender'] = le_gender.fit_transform(diabetes['Gender'])
diabetes['CLASS'] = le_diab.fit_transform(diabetes['CLASS'])
print("After label‐encoding 'Gender' and 'CLASS':")
print(diabetes[['Gender','CLASS']].head(), "\n")

# 5. Handle outliers using the IQR method for numerical columns
numerical_cols = ['AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']
for col in numerical_cols:
    Q1 = diabetes[col].quantile(0.25)
    Q3 = diabetes[col].quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # Both fences collapse onto Q1 == Q3, so clipping would overwrite the
        # whole column with one constant. Leave such a column untouched.
        continue
    # Clip values to [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    diabetes[col] = diabetes[col].clip(lower=Q1 - 1.5 * IQR,
                                       upper=Q3 + 1.5 * IQR)
print("After outlier clipping (IQR) on numerical columns:")
print(diabetes[numerical_cols].describe(), "\n")

# 6. Separate features and target ('ID' and 'No_Pation' are dropped from the
#    feature matrix together with the target column)
X_diab = diabetes.drop(['ID', 'No_Pation', 'CLASS'], axis=1)
y_diab = diabetes['CLASS']
print("Features and target separated for Diabetes:")
print(f"  X_diab shape: {X_diab.shape}")
print(f"  y_diab shape: {y_diab.shape}\n")

# 7. Scale feature data using MinMaxScaler and StandardScaler.
#    Both scalers are fitted on the full feature matrix; fit_transform
#    returns NumPy arrays, so column names are not carried over.
minmax = MinMaxScaler()
std_scaler = StandardScaler()

X_diab_minmax = minmax.fit_transform(X_diab)
print("After MinMax scaling Diabetes features:")
print(f"  X_diab_minmax shape: {X_diab_minmax.shape}\n")

X_diab_standard = std_scaler.fit_transform(X_diab)
print("After Standard scaling Diabetes features:")
print(f"  X_diab_standard shape: {X_diab_standard.shape}\n")

# ====================== Adult Income Dataset ======================
# Pipeline: load -> mark '?' as missing -> impute feature columns ->
# label-encode the target -> split features/target -> one-hot encode the
# categorical features -> clip outliers (IQR) -> scale the features.

# 1. Load adult income dataset
print("Loading Adult Income dataset...")
adult = pd.read_csv('/content/adult.csv')
print(adult.head())
print(f"Shape after load: {adult.shape}\n")

# 2. Replace '?' entries with NaN for proper missing-value handling.
#    The anchored pattern also matches whitespace-padded markers such as
#    ' ?', which an exact-match replace would leave behind as a category.
adult = adult.replace(r'^\s*\?\s*$', np.nan, regex=True)
print("After marking '?' as NaN:")
check_missing(adult, 'Adult Income')

# 3. Identify categorical vs numerical columns (np.number covers every
#    numeric width, not only int64/float64)
raw_cat_cols = adult.select_dtypes(include=['object']).columns.tolist()
raw_num_cols = adult.select_dtypes(include=[np.number]).columns.tolist()
print("Column types detected:")
print(f"  Categorical cols: {raw_cat_cols}")
print(f"  Numerical cols:   {raw_num_cols}\n")

# 4. Fill missing values for features (exclude 'income' target):
#    most frequent value for categorical columns, mean for numeric columns
for col in raw_cat_cols:
    if col != 'income':
        adult[col] = adult[col].fillna(adult[col].mode()[0])
for col in raw_num_cols:
    adult[col] = adult[col].fillna(adult[col].mean())
print("After filling missing values in features:")
check_missing(adult, 'Adult Income')

# 5. Encode the target column 'income' using LabelEncoder
le_adult = LabelEncoder()
adult['income'] = le_adult.fit_transform(adult['income'])
print("After label‐encoding 'income' target:")
print(adult['income'].value_counts(), "\n")

# 6. Separate features and target
X_adult = adult.drop('income', axis=1)
y_adult = adult['income']
print("Features and target separated for Adult Income:")
print(f"  X_adult shape: {X_adult.shape}")
print(f"  y_adult shape: {y_adult.shape}\n")

# 7. One-hot encode all categorical feature columns
cat_features = [c for c in raw_cat_cols if c != 'income']
X_adult = pd.get_dummies(X_adult, columns=cat_features)
print("After one‐hot encoding categorical features:")
print(f"  X_adult shape: {X_adult.shape}\n")

# 8. Handle outliers on numeric features using IQR clipping.
#    Only the original numeric columns are clipped. Selecting by dtype after
#    get_dummies can pick up the 0/1 indicator columns (they are integer
#    typed on some pandas versions), and clipping an indicator whose
#    quartiles are both 0 erases that category entirely.
num_features = [c for c in raw_num_cols if c in X_adult.columns]
for col in num_features:
    Q1 = X_adult[col].quantile(0.25)
    Q3 = X_adult[col].quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # Both fences collapse onto Q1 == Q3, so clipping would overwrite the
        # whole column with one constant. Leave such a column untouched.
        continue
    X_adult[col] = X_adult[col].clip(lower=Q1 - 1.5 * IQR,
                                     upper=Q3 + 1.5 * IQR)
print("After outlier clipping on Adult numeric features:")
print(X_adult[num_features].describe(), "\n")

# 9. Scale adult features with dedicated scaler instances. Refitting scalers
#    that were already fitted on another dataset would overwrite their
#    learned parameters, so this section keeps its own pair.
adult_minmax = MinMaxScaler()
adult_std_scaler = StandardScaler()

X_adult_minmax = adult_minmax.fit_transform(X_adult)
print("After MinMax scaling Adult features:")
print(f"  X_adult_minmax shape: {X_adult_minmax.shape}\n")

X_adult_standard = adult_std_scaler.fit_transform(X_adult)
print("After Standard scaling Adult features:")
print(f"  X_adult_standard shape: {X_adult_standard.shape}")


Loading Diabetes dataset...
    ID  No_Pation Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221      M   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656      F   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223      M   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI CLASS  
0  24.0     N  
1  23.0     N  
2  24.0     N  
3  24.0     N  
4  21.0     N  
Shape after load: (1000, 14)

No missing values in Diabetes

After dropping missing values:
Shape: (1000, 14)

No missing values in Diabetes

After label‐encoding 'Gender' and 'CLASS':
   Gender  CLASS
0       0      0
1       1      0
2       0      0
3       0      0
4       1      0 

After outlier clipping (IQR) on numerical columns:
               AGE         Urea           Cr        HbA1c         Chol  \