### Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import rankdata, norm
import statsmodels.api as sm
from statsmodels.formula.api import ols

import warnings

In [2]:
warnings.filterwarnings("ignore")

### Reading the dataset

In [3]:
# Load the dataset (path is relative to the working directory) and preview
# its first five rows.
DATA_PATH = "data.csv"
df = pd.read_csv(DATA_PATH)
df.head(n=5)

Unnamed: 0,Age,Education_Level,Occupation,Number_of_Dependents,Location,Work_Experience,Marital_Status,Employment_Status,Household_Size,Homeownership_Status,Type_of_Housing,Gender,Primary_Mode_of_Transportation,Income
0,56,Master's,Technology,5,Urban,21,Married,Full-time,7,Own,Apartment,Male,Public transit,72510
1,69,High School,Finance,0,Urban,4,Single,Full-time,7,Own,Apartment,Male,Biking,75462
2,46,Bachelor's,Technology,1,Urban,1,Single,Full-time,7,Own,Single-family home,Female,Car,71748
3,32,High School,Others,2,Urban,32,Married,Full-time,1,Own,Apartment,Female,Car,74520
4,60,Bachelor's,Finance,3,Urban,15,Married,Self-employed,4,Own,Townhouse,Male,Walking,640210


### Data Preprocessing 

In [4]:
# Collect the names of the non-numeric (categorical) columns.
# Every numeric dtype is excluded - not only "int64" - so float columns can
# never end up in this list, e.g. when the cell is re-run after float-valued
# columns have been added to `df`.
categorical_col = df.select_dtypes(exclude="number").columns.tolist()

In [5]:
# Show the collected categorical column names.
categorical_col

['Education_Level',
 'Occupation',
 'Location',
 'Marital_Status',
 'Employment_Status',
 'Homeownership_Status',
 'Type_of_Housing',
 'Gender',
 'Primary_Mode_of_Transportation']

In [6]:
def _rank_normal_scores(values):
    """Return ``(ranks, scores)`` for a rank-based normal-score transform.

    ``ranks`` comes from ``scipy.stats.rankdata`` with its default settings
    (tied values share their average rank). ``scores`` is the standard-normal
    quantile of ``(rank - 0.5) / n``; the 0.5 offset keeps every probability
    strictly inside (0, 1), so the quantiles are finite.
    """
    ranks = rankdata(values)
    scores = norm.ppf((ranks - 0.5) / len(ranks))
    return ranks, scores


# Add <col>_rank and <col>_rank_norm to df for each numeric column to transform.
for source_col in ("Age", "Work_Experience"):
    ranks, scores = _rank_normal_scores(df[source_col])
    df[f"{source_col}_rank"] = ranks
    df[f"{source_col}_rank_norm"] = scores

In [7]:
# Preview the first five rows of df.
df.head(n=5)

Unnamed: 0,Age,Education_Level,Occupation,Number_of_Dependents,Location,Work_Experience,Marital_Status,Employment_Status,Household_Size,Homeownership_Status,Type_of_Housing,Gender,Primary_Mode_of_Transportation,Income,Age_rank,Age_rank_norm,Work_Experience_rank,Work_Experience_rank_norm
0,56,Master's,Technology,5,Urban,21,Married,Full-time,7,Own,Apartment,Male,Public transit,72510,7289.5,0.60949,4228.0,-0.194863
1,69,High School,Finance,0,Urban,4,Single,Full-time,7,Own,Apartment,Male,Biking,75462,9729.5,1.925235,915.5,-1.331575
2,46,Bachelor's,Technology,1,Urban,1,Single,Full-time,7,Own,Single-family home,Female,Car,71748,5420.0,0.105348,296.5,-1.886705
3,32,High School,Others,2,Urban,32,Married,Full-time,1,Own,Apartment,Female,Car,74520,2674.0,-0.620847,6395.0,0.356989
4,60,Bachelor's,Finance,3,Urban,15,Married,Self-employed,4,Own,Townhouse,Male,Walking,640210,8021.5,0.849146,3033.0,-0.515076


In [8]:
# Keep the Income column aside as `y`.
y = df.loc[:, "Income"]

In [9]:
# Feature frame: df without the raw Age / Work_Experience columns, their
# intermediate ranks, and Income.
# NOTE(review): the next cell assigns X again from `df`, so this value is
# overwritten before anything reads it.
helper_and_target_cols = ["Age", "Work_Experience", "Age_rank", "Work_Experience_rank", "Income"]
X = df.drop(columns=helper_and_target_cols)

In [10]:
# One-hot encode the listed columns as int8 indicator columns.
# The frame is built from `df`; all columns not listed here are carried over
# unchanged.
nominal_cols = [
    "Occupation",
    "Location",
    "Marital_Status",
    "Employment_Status",
    "Homeownership_Status",
    "Type_of_Housing",
    "Gender",
    "Primary_Mode_of_Transportation",
]
X = pd.get_dummies(data=df, columns=nominal_cols, dtype="int8")

In [11]:
# Ordinal-encode Education_Level with increasing integer codes.
# The explicit cast guarantees an int64 column: relying on pandas to silently
# downcast an object column after `replace` is deprecated, and without the
# cast the column can stay object-typed. The cast also fails loudly if a
# label that is missing from the mapping is left in the column.
education_order = {
    "High School": 0,
    "Bachelor's": 1,
    "Master's": 2,
    "Doctorate": 3,
}
X["Education_Level"] = X["Education_Level"].replace(education_order).astype("int64")

In [12]:
# Remove the raw Age / Work_Experience columns, their intermediate ranks and
# Income from X. X is rebound to the result rather than mutated in place.
X = X.drop(columns=["Age", "Work_Experience", "Age_rank", "Work_Experience_rank", "Income"])

In [13]:
# Summarise X: column names, non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 30 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Education_Level                                10000 non-null  int64  
 1   Number_of_Dependents                           10000 non-null  int64  
 2   Household_Size                                 10000 non-null  int64  
 3   Age_rank_norm                                  10000 non-null  float64
 4   Work_Experience_rank_norm                      10000 non-null  float64
 5   Occupation_Education                           10000 non-null  int8   
 6   Occupation_Finance                             10000 non-null  int8   
 7   Occupation_Healthcare                          10000 non-null  int8   
 8   Occupation_Others                              10000 non-null  int8   
 9   Occupation_Technology                          1000

In [14]:
# One-way ANOVA of Income against each categorical column. A column is kept
# in `necessary_features` when its p-value is below the significance level.
ALPHA = 0.05

necessary_features = []
for col in categorical_col:
    mdl = ols(f"Income ~C({col})", data=df).fit()
    anova_table = sm.stats.anova_lm(mdl, typ=2)

    # The first row of the table is the C(col) term (the second is Residual).
    # Use .iloc for the positional lookup: the table is label-indexed, and
    # treating an integer key as a position on such a Series is deprecated.
    p_value = anova_table["PR(>F)"].iloc[0]
    if p_value < ALPHA:
        print(f"ANOVA Test for {col}:")
        print(anova_table)
        necessary_features.append(col)

ANOVA Test for Education_Level:
                          sum_sq      df         F    PR(>F)
C(Education_Level)  2.643432e+13     3.0  2.658278  0.046593
Residual            3.313391e+16  9996.0       NaN       NaN
ANOVA Test for Occupation:
                     sum_sq      df         F   PR(>F)
C(Occupation)  3.709409e+13     4.0  2.798302  0.02453
Residual       3.312325e+16  9995.0       NaN      NaN
ANOVA Test for Location:
                   sum_sq      df          F        PR(>F)
C(Location)  1.041365e+14     2.0  15.746705  1.486080e-07
Residual     3.305621e+16  9997.0        NaN           NaN
ANOVA Test for Employment_Status:
                            sum_sq      df         F    PR(>F)
C(Employment_Status)  3.388654e+13     2.0  5.113189  0.006033
Residual              3.312646e+16  9997.0       NaN       NaN
ANOVA Test for Homeownership_Status:
                               sum_sq      df          F        PR(>F)
C(Homeownership_Status)  1.297858e+14     1.0  39.284772  3.

In [15]:
# Show the columns whose ANOVA p-value was below 0.05.
necessary_features

['Education_Level',
 'Occupation',
 'Location',
 'Employment_Status',
 'Homeownership_Status',
 'Type_of_Housing']