In [2]:
# Task 4: Feature Encoding & Scaling (Adult Income Dataset)

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Column names for the Adult dataset (the file is read without a header row,
# so the names are supplied explicitly).
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income",
]
TARGET_COL = "income"

# Load dataset.
# `skipinitialspace=True` strips the blank that follows each delimiter, so the
# missing-value marker is recognised whether it appears as "?" or " ?", and the
# category labels (and therefore the one-hot column names) carry no leading
# whitespace. `na_values="?"` turns the marker into NaN at parse time.
data = pd.read_csv(
    "adult.csv",
    header=None,
    names=columns,
    skipinitialspace=True,
    na_values="?",
)

# Drop every row that has at least one missing value.
data = data.dropna()

# Separate categorical and numerical columns *before* the target is encoded,
# so the (still textual) target is never picked up as a numeric feature.
# Selecting on "number" instead of exact dtypes keeps this working for any
# integer/float width and for non-object string dtypes.
numerical_cols = (
    data.select_dtypes(include="number").columns.drop(TARGET_COL, errors="ignore")
)
categorical_cols = data.select_dtypes(exclude="number").columns
# Drop the target by name rather than assuming it is the last categorical column.
categorical_feature_cols = categorical_cols.drop(TARGET_COL, errors="ignore")

# Label Encoding for target column (income)
le = LabelEncoder()
data[TARGET_COL] = le.fit_transform(data[TARGET_COL])

# One-Hot Encoding for categorical features (except income).
# `dtype=int` pins the indicator columns to 0/1 so the saved CSV does not
# depend on the pandas version's default dummy dtype.
data_encoded = pd.get_dummies(data, columns=categorical_feature_cols, dtype=int)

# Feature Scaling using StandardScaler (numeric features only; the one-hot
# indicators and the encoded target are left untouched).
# NOTE(review): the scaler is fit on the full dataset. If a train/test split
# happens downstream, fit it on the training rows only to avoid leakage.
scaler = StandardScaler()
data_encoded[numerical_cols] = scaler.fit_transform(data_encoded[numerical_cols])

# Compare before and after scaling. `data` still holds the unscaled values
# because `get_dummies` returned a new frame.
print("Before Scaling:")
print(data[numerical_cols].describe())

print("\nAfter Scaling:")
print(data_encoded[numerical_cols].describe())

# Save processed dataset
data_encoded.to_csv("adult_processed.csv", index=False)
print("\nProcessed dataset saved as adult_processed.csv")


Before Scaling:
                age        fnlwgt  education-num  capital-gain  capital-loss  \
count  30162.000000  3.016200e+04   30162.000000  30162.000000  30162.000000   
mean      38.437902  1.897938e+05      10.121312   1092.007858     88.372489   
std       13.134665  1.056530e+05       2.549995   7406.346497    404.298370   
min       17.000000  1.376900e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.176272e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.784250e+05      10.000000      0.000000      0.000000   
75%       47.000000  2.376285e+05      13.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours-per-week  
count    30162.000000  
mean        40.931238  
std         11.979984  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  

After Scaling:
                age        fnlw