In [1]:
import pandas as pd

DATA = '/kaggle/input/adult-census-income-dataset/adult.csv'

df = pd.read_csv(filepath_or_buffer=DATA)
# let's change our target variable to binary
df['target'] = df['income'] == '>50K'
df = df.drop(columns=['income'])
# make the columns that aren't numeric or the target to be categorical
for column in ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']:
    df[column] = df[column].astype('category')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,False
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,False
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,False
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,False
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,False


Are our classes balanced?

In [2]:
df['target'].value_counts().to_dict(), df['target'].value_counts(normalize=True).to_dict()

({False: 24720, True: 7841},
 {False: 0.7591904425539756, True: 0.2408095574460244})

Our classes are unbalanced; roughly 25% of the target variable is true.

In [3]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

LEARNING_RATE = 5e-2 
RANDOM_STATE = 2024
TARGET = 'target'

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=[TARGET,]), df[TARGET], test_size=0.20, random_state=RANDOM_STATE)

model = LGBMClassifier(boosting_type='gbdt', learning_rate=LEARNING_RATE, objective='binary', num_leaves=31, verbose=1, random_state=RANDOM_STATE, force_col_wise=True,)
model.fit(X=X_train, y=y_train, categorical_feature=['workclass'])
result = model.predict(X=X_test, num_iteration=model.best_iteration_, )

[LightGBM] [Info] Number of positive: 6275, number of negative: 19773
[LightGBM] [Info] Total Bins 690
[LightGBM] [Info] Number of data points in the train set: 26048, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.240901 -> initscore=-1.147744
[LightGBM] [Info] Start training from score -1.147744


We don't care so much about the accuracy or the F1 as we do about the True precision and recall, so let's use a classification report.

In [4]:
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=result))

              precision    recall  f1-score   support

       False       0.89      0.94      0.92      4947
        True       0.78      0.64      0.70      1566

    accuracy                           0.87      6513
   macro avg       0.84      0.79      0.81      6513
weighted avg       0.87      0.87      0.87      6513



We can mock up a confusion matrix pretty easily from the true and predicted values.

In [5]:
pd.DataFrame(data={'true': y_test, 'predicted': result}).value_counts().to_frame().reset_index()

Unnamed: 0,true,predicted,count
0,False,False,4661
1,True,True,1007
2,True,False,559
3,False,True,286
