In [6]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# importing required libraries for Machine learing models
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Onboarding the data & create the data frame
df = pd.read_csv(r'E:\Python\AI&ML\SourceData\adult_income.csv')

In [9]:
# Exploratory Data Analysis (EDA)
# To see 1st five records of df
df.head()

Unnamed: 0,age,workclass,education,marital_status,race,gender,hours_per_week,income
0,39,State-gov,Bachelors,Never-married,White,Male,40,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,White,Male,13,<=50K
2,38,Private,HS-grad,Divorced,White,Male,40,<=50K
3,53,Private,11th,Married-civ-spouse,Black,Male,40,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Black,Female,40,<=50K


In [10]:
# To see lsst five records of df
df.tail()

Unnamed: 0,age,workclass,education,marital_status,race,gender,hours_per_week,income
32556,27,Private,Assoc-acdm,Married-civ-spouse,White,Female,38,<=50K
32557,40,Private,HS-grad,Married-civ-spouse,White,Male,40,>50K
32558,58,Private,HS-grad,Widowed,White,Female,40,<=50K
32559,22,Private,HS-grad,Never-married,White,Male,20,<=50K
32560,52,Self-emp-inc,HS-grad,Married-civ-spouse,White,Female,40,>50K


In [11]:
# To see the non-null count and data type of each column in df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   education       32561 non-null  object
 3   marital_status  32561 non-null  object
 4   race            32561 non-null  object
 5   gender          32561 non-null  object
 6   hours_per_week  32561 non-null  int64 
 7   income          32561 non-null  object
dtypes: int64(2), object(6)
memory usage: 2.0+ MB


In [12]:
# To see the dimension (r x c) of df
df.shape

(32561, 8)

In [13]:
# To see the minimal statistical report of df
df.describe()

Unnamed: 0,age,hours_per_week
count,32561.0,32561.0
mean,38.581647,40.437456
std,13.640433,12.347429
min,17.0,1.0
25%,28.0,40.0
50%,37.0,40.0
75%,48.0,45.0
max,90.0,99.0


In [14]:
# To see column names of df
df.columns

Index(['age', 'workclass', 'education', 'marital_status', 'race', 'gender',
       'hours_per_week', 'income'],
      dtype='object')

In [15]:
# Checking for null values in each column
df.isnull().sum()

age                  0
workclass         1836
education            0
marital_status       0
race                 0
gender               0
hours_per_week       0
income               0
dtype: int64

In [16]:
# Replacing workclass column null values with it's mode
wc_mode= df['workclass'].mode()[0]
df['workclass'].fillna(wc_mode, inplace=True)
df.isnull().sum()

age               0
workclass         0
education         0
marital_status    0
race              0
gender            0
hours_per_week    0
income            0
dtype: int64

In [17]:
# create dummy variables
df.dtypes
df1 = pd.get_dummies(df, drop_first=True, dtype=int)
df1.head()

Unnamed: 0,age,hours_per_week,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_11th,...,marital_status_Married-spouse-absent,marital_status_Never-married,marital_status_Separated,marital_status_Widowed,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Male,income_>50K
0,39,40,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,1,0
1,50,13,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,38,40,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3,53,40,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
4,28,40,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [18]:
# Data normalization for age & hours_per_week
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df1[['age', 'hours_per_week']] = scaler.fit_transform(df1[['age', 'hours_per_week']])
df1.head()

Unnamed: 0,age,hours_per_week,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_11th,...,marital_status_Married-spouse-absent,marital_status_Never-married,marital_status_Separated,marital_status_Widowed,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Male,income_>50K
0,0.030671,-0.035429,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,1,0
1,0.837109,-2.222153,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,-0.042642,-0.035429,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3,1.057047,-0.035429,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
4,-0.775768,-0.035429,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [19]:
# identifying dependent & independent variables
x = df1.drop('income_>50K', axis=1)
y = df1['income_>50K']

In [20]:
# spit the x, y datasets to train & test datasets
#from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [21]:
# Build Decesion Tree model to x_train
#  Import & train classifier
# from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)

0,1,2
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`.",'gini'
,"splitter  splitter: {""best"", ""random""}, default=""best"" The strategy used to choose the split at each node. Supported strategies are ""best"" to choose the best split and ""random"" to choose the best random split.",'best'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: int, float or {""sqrt"", ""log2""}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at  each split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. note::  The search for a split does not stop until at least one  valid partition of the node samples is found, even if it requires to  effectively inspect more than ``max_features`` features.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``""best""``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.",
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0


In [22]:
# y_prediction on x_test data
# Test the model
y_pred = dtc.predict(x_test)

In [23]:
# creating confusion matrix
#from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cm

array([[6492,  963],
       [1183, 1131]])

In [24]:
# Model Evaluation (Accuracy)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Key Targets:
# ≈ 50%	    No better than random guessing (binary classification)
# 60–70%	Weak / baseline model
# 70–85%	Reasonable / decent
# 85–95%	Very good
# > 95%	    Excellent (check for overfitting or data leakage)
# Note: High accuracy can be misleading if classes are imbalanced.

Accuracy: 0.7803255195004607
