In [1]:
import pandas as pd

# Path to the census CSV inside the Kaggle input directory.
DATA = '/kaggle/input/adult-census-income-dataset/adult.csv'

# Read the table, then add the boolean target: True for every row whose
# income label is anything other than '<=50K'.
df = pd.read_csv(DATA)
df['high income'] = df['income'].ne('<=50K')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,high income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K,False
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,False
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K,False
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K,False
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K,False


In [2]:
from plotly import express
# One bar per target value, showing how many rows fall in each class.
express.histogram(df, x='high income')

Our classes are unbalanced; we have far more people whose income is at or below the $50k line than above it.

Let's see what we can learn by applying dimensionality reduction to just the numerical columns.

In [3]:
import arrow
from umap import UMAP

# Names of the columns whose dtype is float64 or int64; only these are
# passed to the embedding below.
COLUMNS = [
    name
    for name, dtype in df.dtypes.items()
    if str(dtype) in {'float64', 'int64'}
]

time_start = arrow.now()
# Seeded (random_state) and single-job (n_jobs=1) UMAP configuration.
umap = UMAP(
    n_epochs=201,
    n_jobs=1,
    low_memory=False,
    random_state=2024,
    verbose=True,
)
# Store the two embedding coordinates back on the frame as 'x' and 'y'.
df[['x', 'y']] = umap.fit_transform(df[COLUMNS])
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-07-10 18:57:15.363968: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-10 18:57:15.364102: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-10 18:57:15.519986: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=True)
Wed Jul 10 18:57:25 2024 Construct fuzzy simplicial set
Wed Jul 10 18:57:25 2024 Finding Nearest Neighbors
Wed Jul 10 18:57:25 2024 Building RP forest with 14 trees
Wed Jul 10 18:57:29 2024 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	 3  /  15
	Stopping threshold met -- exiting after 3 iterations
Wed Jul 10 18:57:43 2024 Finished Nearest Neighbor Search
Wed Jul 10 18:57:46 2024 Construct embedding


Epochs completed:   0%|            0/201 [00:00]

	completed  0  /  201 epochs
	completed  20  /  201 epochs
	completed  40  /  201 epochs
	completed  60  /  201 epochs
	completed  80  /  201 epochs
	completed  100  /  201 epochs
	completed  120  /  201 epochs
	completed  140  /  201 epochs
	completed  160  /  201 epochs
	completed  180  /  201 epochs
	completed  200  /  201 epochs
Wed Jul 10 18:58:10 2024 Finished embedding
done with UMAP in 0:00:45.168653


In [4]:
import warnings
from plotly import express

# Silence FutureWarning messages before drawing the embedding.
warnings.filterwarnings('ignore', category=FutureWarning)
express.scatter(
    df,
    x='x',
    y='y',
    color='high income',
    height=800,
).show()

This is not especially encouraging: the high and low earners are mixed together everywhere. What happens if we facet on the target variable?

In [5]:
# Same embedding, split into one panel per target value.
express.scatter(
    df,
    x='x',
    y='y',
    color='high income',
    facet_col='high income',
    height=800,
).show()

Let's build a simple model and see what happens.

In [6]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

TARGET = 'high income'

# Hold out 20% of the rows, stratified on the target column.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS],
    df[TARGET],
    test_size=0.2,
    stratify=df[TARGET],
    random_state=2024,
)

time_start = arrow.now()
# fit() returns the estimator itself, so constructing and fitting in two
# statements leaves `model` bound to the same fitted object.
model = LogisticRegression(tol=1e-4, max_iter=100000)
model.fit(X_train, y_train)
print('model fit in {} iterations took {}'.format(model.n_iter_[0], arrow.now() - time_start))

# Accuracy on the held-out rows; the final timing also covers prediction.
print('accuracy: {:5.4f}'.format(accuracy_score(y_test, model.predict(X_test))))
print('model done in {}'.format(arrow.now() - time_start))

model fit in 59 iterations took 0:00:00.100816
accuracy: 0.7966
model done in 0:00:00.104753


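Nearly 80% accuracy sounds better than we might have expected, but because the classes are unbalanced, always predicting the low-income class would already score about 76%; let's look at the classification report.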

In [7]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the logistic model on the test split.
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

       False       0.81      0.96      0.88      4945
        True       0.70      0.27      0.39      1568

    accuracy                           0.80      6513
   macro avg       0.76      0.62      0.63      6513
weighted avg       0.78      0.80      0.76      6513



This is not good news; our little model is doing a poor job of finding the true high earners: its recall on the True class is only 0.27, so it misses nearly three quarters of them.

Let's try using a more complicated model.

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited, seeded decision tree fitted on the same training split.
tree = DecisionTreeClassifier(random_state=2024, max_depth=5)
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

print(classification_report(y_test, tree_pred))

              precision    recall  f1-score   support

       False       0.84      0.96      0.90      4945
        True       0.76      0.44      0.56      1568

    accuracy                           0.83      6513
   macro avg       0.80      0.70      0.73      6513
weighted avg       0.82      0.83      0.82      6513



We can do incrementally better using a decision tree classifier.

Let's look at the confusion matrix for the tree model, shown here as counts of each (true, predicted) pair.

In [9]:
# Count each (true, predicted) pair on the test split, one row per pair.
pd.DataFrame({'true': y_test, 'predicted': tree_pred}).value_counts().reset_index()

Unnamed: 0,true,predicted,count
0,False,False,4730
1,True,False,874
2,True,True,694
3,False,True,215
