In [1]:
import boto3
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import io

In [2]:
# Read the income dataset from the notebook's working directory, then preview
# the first five rows as a quick sanity check of the columns.
df = pd.read_csv(filepath_or_buffer='income.csv')
df.head(n=5)

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income >50K
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [3]:
# Show column dtypes and non-null counts for the freshly loaded frame.
df.info()

# Treat a bare '?' placeholder as missing, whether or not it is padded with
# spaces (e.g. '?' or ' ?'). The regex only matches cells that consist of the
# placeholder alone, so legitimate values containing '?' are left untouched.
# Reassigning (instead of inplace=True) keeps the step explicit on re-runs.
df = df.replace(r'^\s*\?\s*$', np.nan, regex=True)
print("Missing values per column:\n", df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   education       32561 non-null  object
 3   education-num   32561 non-null  int64 
 4   marital-status  32561 non-null  object
 5   occupation      30718 non-null  object
 6   relationship    32561 non-null  object
 7   race            32561 non-null  object
 8   sex             32561 non-null  object
 9   capital-gain    32561 non-null  int64 
 10  capital-loss    32561 non-null  int64 
 11  hours-per-week  32561 non-null  int64 
 12  native-country  31978 non-null  object
 13  income >50K     32561 non-null  int64 
dtypes: int64(6), object(8)
memory usage: 3.5+ MB
Missing values per column:
 age                  0
workclass         1836
education            0
education-num        0
marital-status   

In [4]:
# Columns to convert to integer codes with one LabelEncoder each.
categorical_cols = ['workclass', 'education', 'marital-status',
                    'occupation', 'relationship', 'race', 'sex',
                    'native-country', 'income >50K']

# Fitted encoders are kept per column so the integer codes can be mapped back
# to the original labels later via `label_encoders[col].inverse_transform`.
label_encoders = {}
for col in categorical_cols:
    values = df[col]
    # Columns with missing entries would otherwise hand the encoder a mix of
    # strings and float NaN. Filling them with an explicit label keeps the
    # input homogeneous and makes the "missing" class visible in `classes_`.
    if values.isna().any():
        values = values.fillna('Missing')
    le = LabelEncoder()
    df[col] = le.fit_transform(values)
    label_encoders[col] = le
print("Categorical columns encoded")


Categorical columns encoded


In [5]:
# Continuous columns that get rescaled by the StandardScaler below.
numeric_cols = [
    'age',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Fit the scaler on the numeric columns, then overwrite those columns with
# their standardized values. `scaler` stays available for later cells.
scaler = StandardScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])
print("Numeric columns standardized")

Numeric columns standardized


In [6]:
# Example 1: Work-life balance ratio
df['gain_per_hour'] = (df['capital-gain'] + 1) / (df['hours-per-week'] + 1)

# Example 2: Age category binning
df['age_group'] = pd.cut(df['age'], bins=[-3, -1, 0, 1, 3],
                         labels=['Young', 'Adult', 'Middle', 'Senior'])

print("Feature engineering complete")

Feature engineering complete


In [7]:
# Persist the processed frame next to the notebook; the row index is omitted
# from the file.
df.to_csv(path_or_buf='processed_income.csv', index=False)
print("Processed dataset saved locally as processed_income.csv")

Processed dataset saved locally as processed_income.csv
