STEP 1 — Import Libraries

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler


STEP 2 — Load Dataset

In [6]:
columns = [
"age","workclass","fnlwgt","education","education-num",
"marital-status","occupation","relationship","race","sex",
"capital-gain","capital-loss","hours-per-week","native-country","income"
]

df = pd.read_csv("adult.csv", header=None, names=columns, skipinitialspace=True)

print(df.head())


   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

STEP 3 — Check Missing Values

In [7]:
print(df.isnull().sum())


age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


In [8]:
df.replace("?", np.nan, inplace=True)
df.dropna(inplace=True)


STEP 4 — Identify Categorical & Numerical

In [9]:
categorical_cols = df.select_dtypes(include="object").columns
numerical_cols = df.select_dtypes(exclude="object").columns

print("Categorical:", categorical_cols)
print("Numerical:", numerical_cols)


Categorical: Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income'],
      dtype='object')
Numerical: Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')


STEP 5 — Label Encoding (for ordered target)

In [10]:
le = LabelEncoder()
df["income"] = le.fit_transform(df["income"])


STEP 6 — One-Hot Encoding (for unordered categorical)

In [11]:
df = pd.get_dummies(df, columns=[
"workclass","education","marital-status",
"occupation","relationship","race","sex","native-country"
])


STEP 7 — Feature Scaling

In [12]:
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


STEP 8 — Compare Before & After Scaling

In [13]:
print(df[numerical_cols].describe())


                age        fnlwgt  education-num  capital-gain  capital-loss  \
count  3.016200e+04  3.016200e+04   3.016200e+04  3.016200e+04  3.016200e+04   
mean   1.535952e-16  1.684365e-17  -3.053058e-16 -2.720897e-17  7.703318e-17   
std    1.000017e+00  1.000017e+00   1.000017e+00  1.000017e+00  1.000017e+00   
min   -1.632189e+00 -1.666094e+00  -3.577051e+00 -1.474446e-01 -2.185860e-01   
25%   -7.946967e-01 -6.830644e-01  -4.397382e-01 -1.474446e-01 -2.185860e-01   
50%   -1.094756e-01 -1.076072e-01  -4.757405e-02 -1.474446e-01 -2.185860e-01   
75%    6.518811e-01  4.527602e-01   1.128918e+00 -1.474446e-01 -2.185860e-01   
max    3.925715e+00  1.225647e+01   2.305411e+00  1.335458e+01  1.055581e+01   

       hours-per-week  
count    3.016200e+04  
mean    -2.833973e-16  
std      1.000017e+00  
min     -3.333218e+00  
25%     -7.773411e-02  
50%     -7.773411e-02  
75%      3.396356e-01  
max      4.847229e+00  


STEP 9 — Save Processed Dataset

In [14]:
df.to_csv("adult_preprocessed.csv", index=False)
