# **Importing Required Libraries**

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer

# **Step 1 : Data Loading**

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
df = pd.read_csv(url, header=None, names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain',
           'capital-loss', 'hours-per-week', 'native-country', 'income']
, na_values=' ?', skipinitialspace=True, delimiter=',')

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# **Step 2 : Initial Data Inspection and Cleaning**

### Step 2.1 : Display dataset information and preview data

**Preview Data:** Using head() lets students see the actual records, which is critical for understanding the context of the data.

**Dataset Info:** The info() function shows non-null counts and datatypes, which helps in quickly spotting missing values or incorrect data formats.



In [8]:
# Display dataset information and preview data
# -> Display the first 5 rows and print the basic states for data
# -> Hint: head and info
print(df.head())

print(df.info())

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

**Dataset Dimensions:** Knowing the shape of the data informs students about its scale, which can affect computation time and choice of algorithms.

In [10]:
# Display the dimension of the data
# -> Hint : shape
df.shape

(32561, 15)

### **Step 2.2 : Basic Statistical Summary:**

The describe() function provides vital statistics (min, max, mean, standard deviation) that help identify any outliers or anomalies in numerical columns.

In [17]:
# Check the min , max , count , means , standard diviation etc
# Hint -> describe

Information = df.describe()
print("info:\n", Information)

info:
                 age        fnlwgt  education-num  capital-gain  capital-loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours-per-week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  


# **Step 2.3 : Counting Missing Values:**

This step is essential for diagnosing data quality. Missing values can lead to biased or inaccurate models if not handled properly.

In [18]:
# Count missing values in each column
print(df.isnull().sum())


age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


### **Step 2.4 : Checking for Duplicate Records:**

Duplicates can skew the analysis by over-representing some data, so it's important to remove them.

In [22]:
# Check for duplicate records

dups = df.duplicated().sum()
print(f"Number of duplicate records: {dups}")

Number of duplicate records: 24


### **Step 2.5 : Inspecting Unique Values in Categorical Columns:**

Unique value inspection reveals if there are any unexpected values (e.g., a '?' or extra spaces) that need cleaning. This is important for ensuring reliable encoding later.

In [24]:
# Check the unique data of each Categorical column to find if there is any irrelevant record or data e.g one record contains ? mark

# Select categorical columns (object dtype)
categorical_cols = df.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols)

# Check unique values in each categorical column
for col in categorical_cols:
    print(f"\nUnique values in '{col}':")
    print(df[col].unique())


Categorical columns: Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income'],
      dtype='object')

Unique values in 'workclass':
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']

Unique values in 'education':
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']

Unique values in 'marital-status':
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']

Unique values in 'occupation':
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']

Unique values in 'relationship':
['Not-in-famil

### **Step 2.6 : Validating and Converting Data Types:**

Converting columns to the correct datatype (like converting an ID column to a string) prevents errors in operations such as merging, filtering, or encoding.

In [25]:
# Check the data type of each column and if wrong datatype convert it to the suitable datatype

print("Current data types:\n")
print(df.dtypes)


Current data types:

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object


### **Step 2.7 : Checking Value Counts for Categorical Columns:**

Value counts help in understanding the distribution within each category. They are useful for detecting class imbalances and anomalies.

In [28]:
print(df.dtypes)


age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object


In [53]:
# Check value count for each Cateorical column

# Loop through all categorical columns and print value counts
categorical_cols = df.select_dtypes(include='category').columns

for col in categorical_cols:
    print(f"\nValue counts for '{col}':")
    print(df[col].value_counts(dropna=False))  # include NaNs if any



Value counts for 'workclass':
workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64

Value counts for 'education':
education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64

Value counts for 'marital-status':
marital-status
Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: count,

### **Step 2.8 : Handling Missing Values using SimpleImputer:**

Imputation preserves the dataset size while ensuring that no null values interfere with analysis. Different strategies are used for numerical (mean) and categorical (mode) columns based on their characteristics.

In [56]:
# TODO: fill null values either by mean , median  or mode based on type of data

# Separate numeric and categorical columns

# Numeric: fill with mean

# Categorical: fill with most frequent (mode)
from sklearn.impute import SimpleImputer

# Step 1: Separate numeric and categorical columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include='category').columns

# Step 2: Impute numeric columns with mean
num_imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])

# Step 3: Convert categorical cols to 'object' before imputation
df[categorical_cols] = df[categorical_cols].astype('object')

# Step 4: Impute categorical columns with most frequent (mode)
cat_imputer = SimpleImputer(strategy='most_frequent')  #mode hota
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

# Step 5: Reconvert to 'category' dtype
for col in categorical_cols:
    df[col] = df[col].astype('category')



# **Step 3 : Converting Data Types and Cleaning Categorical Data**

[link text](https:// [link text](https://))### **Step 3.1 :Removing Leading/Trailing Spaces:**

Standardize entries in categorical columns so that no extra spaces lead to misclassification of similar values.



In [54]:
# Check for leading/trailing spaces in categorical data and remove them

# Step 1: Identify categorical columns (those with object or category dtype)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Step 2: Remove leading/trailing spaces in each categorical column
for col in categorical_cols:
    df[col] = df[col].str.strip()

# Check if any spaces remain
print(df[categorical_cols].head())


          workclass  education      marital-status         occupation  \
0         State-gov  Bachelors       Never-married       Adm-clerical   
1  Self-emp-not-inc  Bachelors  Married-civ-spouse    Exec-managerial   
2           Private    HS-grad            Divorced  Handlers-cleaners   
3           Private       11th  Married-civ-spouse  Handlers-cleaners   
4           Private  Bachelors  Married-civ-spouse     Prof-specialty   

    relationship   race     sex native-country    age_group  
0  Not-in-family  White    Male  United-States        Adult  
1        Husband  White    Male  United-States  Middle-Aged  
2  Not-in-family  White    Male  United-States        Adult  
3        Husband  Black    Male  United-States  Middle-Aged  
4           Wife  Black  Female           Cuba        Adult  


### **Step 3.2 : Checking and Converting Data Types:**

Ensure that every column is of the correct data type (e.g., IDs as strings, dates as datetime).

In [55]:
# Check the data type of each column and if wrong datatype convert it to the suitable datatype

# Step 1: Check the current data types
print("Current data types:\n")
print(df.dtypes)


# Convert 'age' and similar columns to numeric (int64, float64)
df['age'] = df['age'].astype('int64')  # Example for numerical columns

# Convert categorical columns (e.g., 'workclass', 'education') to 'category' dtype
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].astype('category')

# Convert 'income' to categorical as it's a target variable with classes
df['income'] = df['income'].astype('category')

# You can also check other columns for necessary conversion:
# For example, converting 'native-country' to 'category' type
df['native-country'] = df['native-country'].astype('category')

# Step 3: Verify the changes
print("\nUpdated data types:\n")
print(df.dtypes)


Current data types:

age                            float64
workclass                       object
fnlwgt                         float64
education                       object
education-num                  float64
marital-status                  object
occupation                      object
relationship                    object
race                            object
sex                             object
capital-gain                   float64
capital-loss                   float64
hours-per-week                 float64
native-country                  object
income                           int64
age_group                       object
education_hours_interaction    float64
dtype: object

Updated data types:

age                               int64
workclass                      category
fnlwgt                          float64
education                      category
education-num                   float64
marital-status                 category
occupation                     category


### **3.3 : Converting to 'category' Datatype:**

Transform columns that represent categorical data (like Gender or Embarked port) into the 'category' type to optimize memory and computational performance.

In [44]:
# Convert suitable columns to 'category' datatype

# Step 1: Identify categorical columns (those with 'object' dtype)
categorical_cols = df.select_dtypes(include=['object']).columns

# Step 2: Convert each categorical column to 'category' dtype
for col in categorical_cols:
    df[col] = df[col].astype('category')

# Optional: Check the updated data types
print("\nUpdated data types after conversion:\n")
print(df.dtypes)



Updated data types after conversion:

age                float64
workclass         category
fnlwgt             float64
education         category
education-num      float64
marital-status    category
occupation        category
relationship      category
race              category
sex               category
capital-gain       float64
capital-loss       float64
hours-per-week     float64
native-country    category
income            category
dtype: object


# **Step 4 : Feature Engineering**

### **Step 4.1 : Creating "age_group" Feature:**

Binning converts continuous age values into meaningful categories (e.g., 'Young', 'Adult') that are easier to analyze and interpret.

In [45]:
# create feature "age_group" age categories using binning


bins = [0, 25, 45, 65, np.inf]
labels = ['Young', 'Adult', 'Middle-Aged', 'Senior']

# Step 2: Create a new column 'age_group' based on the age ranges
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# Step 3: Check the first few rows to see the new column
print(df[['age', 'age_group']].head())


    age    age_group
0  39.0        Adult
1  50.0  Middle-Aged
2  38.0        Adult
3  53.0  Middle-Aged
4  28.0        Adult


### **Step 4.2 : Creating "education_hours_interaction" Feature:**

Interaction features help capture complex relationships between variables. In this case, the interaction between education (via 'education-num') and work intensity ('hours-per-week') may reveal underlying patterns related to social or economic outcomes.

In [57]:
# Create an interaction feature "education_hours_interaction": education-num multiplied by hours-per-week (as a proxy for workload vs. education level)
# Step 1: Create the interaction feature
df['education_hours_interaction'] = df['education-num'] * df['hours-per-week']

# Step 2: Check the first few rows to ensure the new feature is added correctly
print(df[['education-num', 'hours-per-week', 'education_hours_interaction']].head(10))


   education-num  hours-per-week  education_hours_interaction
0           13.0       -0.035429                    -0.460583
1           13.0       -2.222153                   -28.887991
2            9.0       -0.035429                    -0.318865
3            7.0       -0.035429                    -0.248006
4           13.0       -0.035429                    -0.460583
5           14.0       -0.035429                    -0.496012
6            5.0       -1.979184                    -9.895919
7            9.0        0.369519                     3.325674
8           14.0        0.774468                    10.842555
9           13.0       -0.035429                    -0.460583


# **Step 5 : Encoding Categorical Data**

**One-Hot Encoding:** Converts multiple categorical values into binary columns to prevent ordinality.


In [48]:
# One-hot encode the categorical columns (sex, workclass, education, etc.).

# Step 1: Identify categorical columns (already done previously)
categorical_cols = df.select_dtypes(include=['category']).columns

# Step 2: Apply One-Hot Encoding using pd.get_dummies()
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Step 3: Check the new dataframe
print(df_encoded.head())


    age    fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0  39.0   77516.0           13.0        2174.0           0.0            40.0   
1  50.0   83311.0           13.0           0.0           0.0            13.0   
2  38.0  215646.0            9.0           0.0           0.0            40.0   
3  53.0  234721.0            7.0           0.0           0.0            40.0   
4  28.0  338409.0           13.0           0.0           0.0            40.0   

   education_hours_interaction  workclass_Federal-gov  workclass_Local-gov  \
0                        520.0                  False                False   
1                        169.0                  False                False   
2                        360.0                  False                False   
3                        280.0                  False                False   
4                        520.0                  False                False   

   workclass_Never-worked  ...  native-country_Tai


**Label Encoding for Income:** Maps income to binary labels for binary classification tasks.

In [50]:
# Use label encoding for the income column, converting ≤50K to 0 and 50K to 1.

from sklearn.preprocessing import LabelEncoder

# Step 1: Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Step 2: Fit and transform the 'income' column
df['income'] = label_encoder.fit_transform(df['income'])

# Step 3: Check the updated 'income' column
print(df[['income']].head(20))


    income
0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        1
8        1
9        1
10       1
11       1
12       0
13       0
14       1
15       0
16       0
17       0
18       0
19       1


# **Step 6 : Normalization and Standardization**

Standardization transforms the specified columns to a mean of 0 and a standard deviation of 1, which is important to ensure comparability among numerical features during model training.

In [52]:
# Standardize the "age", "hours-per-week", "capital-gain" and "capital-loss" column to have a mean of 0 and a standard deviation of 1.

from sklearn.preprocessing import StandardScaler

# Step 1: Select the columns to standardize
columns_to_standardize = ['age', 'hours-per-week', 'capital-gain', 'capital-loss']

# Step 2: Initialize the StandardScaler
scaler = StandardScaler()

# Step 3: Standardize the selected columns
df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])

# Step 4: Check the first few rows to verify the transformation
print(df[columns_to_standardize].head(10))


        age  hours-per-week  capital-gain  capital-loss
0  0.030671       -0.035429      0.148453      -0.21666
1  0.837109       -2.222153     -0.145920      -0.21666
2 -0.042642       -0.035429     -0.145920      -0.21666
3  1.057047       -0.035429     -0.145920      -0.21666
4 -0.775768       -0.035429     -0.145920      -0.21666
5 -0.115955       -0.035429     -0.145920      -0.21666
6  0.763796       -1.979184     -0.145920      -0.21666
7  0.983734        0.369519     -0.145920      -0.21666
8 -0.555830        0.774468      1.761142      -0.21666
9  0.250608       -0.035429      0.555214      -0.21666
