# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error


# Load Dataset

In [2]:
file_path = "adult.csv"

data = pd.read_csv(file_path)

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)
pd.set_option("display.colheader_justify", "left")

data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,,40.0,Cuba,<=50K


In [3]:
data.dtypes

age               float64
workclass          object
fnlwgt            float64
education          object
education-num     float64
marital-status     object
occupation         object
relationship       object
race               object
sex                object
capital-gain      float64
capital-loss      float64
hours-per-week    float64
native-country     object
income             object
dtype: object

In [4]:
data.isnull().sum()

age               1628
workclass         3256
fnlwgt             651
education          326
education-num     2605
marital-status     977
occupation        4884
relationship      1302
race               163
sex                 33
capital-gain      1954
capital-loss      2279
hours-per-week    2930
native-country    3907
income             651
dtype: int64

In [5]:
numerical_features = [feature for feature in data.columns if data[feature].dtypes != "O"]
categorical_features = [feature for feature in data.columns if data[feature].dtypes == "O"]
target_variable = "income"
print("Numerical Features: ", numerical_features)
print("Categorical Features: ", categorical_features)
print("Target Variable: ", target_variable)

Numerical Features:  ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Categorical Features:  ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']
Target Variable:  income


# Mean Imputation (rows with multiple Nan)

In [8]:
nan_counts = data[numerical_features].isna().sum(axis=1)
rows_with_multiple_nans = data[numerical_features][nan_counts > 1]
rows_with_multiple_nans = rows_with_multiple_nans.apply(lambda x: x.fillna(x.mean()), axis=0)
data.loc[rows_with_multiple_nans.index, numerical_features] = rows_with_multiple_nans

print(f"\nFinal number of 1 Nan: \n\n{data[numerical_features].isna().sum()}")


# 0 nan = 13423
# 1 nan = 12532
# >1 nan = 6615


Final number of 1 Nan: 

age               1158
fnlwgt             465
education-num     1916
capital-gain      1440
capital-loss      1631
hours-per-week    2212
dtype: int64


# Linear Regression Imputation (numerical features)

In [9]:
clean_numerical_data = data[numerical_features].dropna()
clean_numerical_data.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
2,38.0,215646.0,9.0,0.0,0.0,40.0
3,53.0,234721.0,7.0,0.0,0.0,40.0
6,49.0,160187.0,5.0,0.0,0.0,16.0
7,52.0,209642.0,9.0,0.0,0.0,45.0
9,42.0,159449.0,13.0,5178.0,0.0,40.0


In [10]:
clean_numerical_data.isnull().sum()

age               0
fnlwgt            0
education-num     0
capital-gain      0
capital-loss      0
hours-per-week    0
dtype: int64

In [None]:

results = {}
models = {} 

for target in clean_numerical_data.columns:
    print(f"\nTARGET == {target}")

    X = clean_numerical_data.drop(columns=[target])
    y = clean_numerical_data[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[target] = {"MAE": mae, "R2": r2}
    models[target] = model  

    predictions_df = X_test.copy()
    predictions_df[f"Actual {target}"] = y_test
    predictions_df[f"Predicted {target}"] = y_pred

    print(predictions_df.head())

print("\nFinal results:")
for feature, metrics in results.items():
    print(f"{feature} -> MAE: {metrics['MAE']:.4f}, R2: {metrics['R2']:.4f}")




TARGET == age
       fnlwgt    education-num  capital-gain  capital-loss  hours-per-week  Actual age  Predicted age
2082   297396.0  10.000000          0.0        0.000000     60.000000       40.0        37.969692    
14275  130856.0   9.000000          0.0       94.727982     39.937122       29.0        38.823770    
3651    30497.0  13.000000      15024.0        0.000000     60.000000       34.0        43.119500    
22473  111836.0  10.000000          0.0        0.000000     50.000000       32.0        39.405998    
6624   313986.0   9.992991          0.0       94.727982     40.000000       90.0        37.084926    

TARGET == fnlwgt
       age   education-num  capital-gain  capital-loss  hours-per-week  Actual fnlwgt  Predicted fnlwgt
2082   40.0  10.000000          0.0        0.000000     60.000000       297396.0       188831.366715   
14275  29.0   9.000000          0.0       94.727982     39.937122       130856.0       198282.349155   
3651   34.0  13.000000      15024.0        

In [12]:
missing_numerical_data = data[numerical_features][data[numerical_features].isna().any(axis=1)]
missing_numerical_data.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,39.0,77516.0,13.0,2174.0,,40.0
1,50.0,83311.0,,0.0,0.0,13.0
4,28.0,338409.0,13.0,0.0,,40.0
5,37.0,284582.0,,0.0,0.0,40.0
8,,45781.0,14.0,14084.0,0.0,50.0


In [13]:


filled_numerical_data = missing_numerical_data.copy()

for target in missing_numerical_data.columns:
    X = missing_numerical_data[missing_numerical_data[target].isna()].drop(columns=[target])
    
    y_pred = models[target].predict(X)

    filled_numerical_data.loc[X.index, target] = y_pred

filled_numerical_data

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,39.000000,77516.000000,13.000000,2174.000000,124.985881,40.000000
1,50.000000,83311.000000,9.395268,0.000000,0.000000,13.000000
4,28.000000,338409.000000,13.000000,0.000000,109.311367,40.000000
5,37.000000,284582.000000,9.906145,0.000000,0.000000,40.000000
8,42.506097,45781.000000,14.000000,14084.000000,0.000000,50.000000
...,...,...,...,...,...,...
32538,38.000000,139180.000000,13.000000,15020.000000,0.000000,43.617643
32543,45.000000,119199.000000,12.000000,2196.649679,0.000000,48.000000
32552,43.000000,84661.000000,11.000000,0.000000,0.000000,40.976379
32553,32.000000,189628.513542,14.000000,0.000000,0.000000,11.000000


In [14]:
#copy results back to data
data.loc[filled_numerical_data.index, numerical_features] = filled_numerical_data


# Mode Imputation (categorical features)

In [18]:
data[categorical_features] = data[categorical_features].apply(lambda x: x.fillna(x.mode()[0]))

In [21]:
data.isnull().sum() , data.shape[0]

(age               0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
 dtype: int64,
 32561)

In [22]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,124.985881,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,9.395268,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Prof-specialty,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,109.311367,40.0,Cuba,<=50K


32561