Download the Adult Income dataset from https://www.kaggle.com/datasets/wenruliu/adult-income-dataset and place `adult.csv` in the same directory as this notebook.

In [None]:
import pandas as pd

# Read the CSV from the current working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='adult.csv')

# Show the first five rows as a quick sanity check of the load.
preview = data.head(n=5)
print(preview)


In [None]:

# Treat the literal '?' as the missing-value placeholder and convert it to
# pandas' NA so the null-handling methods below can detect it.
data = data.replace(to_replace='?', value=pd.NA)

# Report how many missing entries each column has before removing them.
missing_per_column = data.isna().sum()
print(missing_per_column)

# Keep only the rows that have no missing value in any column.
data = data.dropna(axis=0, how='any')



In [None]:
# Label encoding.
# If this approach doesn't work, we can try using one-hot encoding.
#
# The encoded values are written to a copy (`data_label_encoded`) instead of
# back into `data`. Overwriting `data` would turn every categorical column into
# integers, so the one-hot cell further down (which starts again from `data`)
# would have nothing left to expand, and re-running this cell would find no
# object columns and rebuild `label_encoder_dict` empty.

from sklearn.preprocessing import LabelEncoder

categorical_columns = data.select_dtypes(include='object').columns
print(categorical_columns)

# Keep a dictionary of label encoders for each column for later use.
label_encoder_dict = {}
data_label_encoded = data.copy()
for col in categorical_columns:
    label_encoder = LabelEncoder()
    # Fit on the untouched source column; store the integer codes in the copy.
    data_label_encoded[col] = label_encoder.fit_transform(data[col])
    label_encoder_dict[col] = label_encoder

print(data_label_encoded.head())

# Features are every column except the target; the target is the encoded 'income'.
X = data_label_encoded.drop(columns='income')
y = data_label_encoded['income']

print(X.head())
print(y.head())

In [None]:
# One-hot encoding.

# Features: every column of `data` except the target, passed through
# pd.get_dummies with the first level of each expanded column dropped.
X = pd.get_dummies(data.drop(columns='income'), drop_first=True)

# Target: the 'income' column mapped to integer class labels. The fitted
# encoder stays available as `income_encoder`.
income_encoder = LabelEncoder()
y = income_encoder.fit_transform(data['income'])

print(X.head())
print(y[:5])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)

# Standardize the features. The columns live on very different numeric scales,
# which slows down / destabilizes the logistic-regression solvers, makes the
# L2 penalty weigh features unevenly, and makes coefficient magnitudes
# impossible to compare across features. The scaler is fitted on the training
# split only so no information from the test split leaks into preprocessing.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Re-wrap the scaled arrays as DataFrames so column names and row labels survive.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Optionally apply SMOTE to balance the classes.

# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=42)
# X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split, allowing the
# solver up to 1000 iterations. `fit` returns the estimator itself.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Inspect the learned parameters.
print("Coefficients: ", model.coef_)
print("Intercept: ", model.intercept_)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the held-out test split and compute the evaluation metrics.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

# The multi-line outputs start on their own line so their rows stay aligned.
print("Accuracy: ", test_accuracy)
print("Confusion matrix: \n", test_confusion)
print("Classification report: \n", test_report)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Horizontal bar chart of the absolute value of each fitted coefficient,
# one bar per feature column of X.
feature_names = X.columns
coefficients = np.abs(model.coef_[0])  # first (only) row of the coefficient matrix

fig, ax = plt.subplots()
ax.barh(feature_names, coefficients)
ax.set_xlabel('Coefficient Magnitude')
ax.set_ylabel('Features')
ax.set_title('Feature Coefficients')
plt.show()

In [None]:
# Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV

# Candidate settings: five values of C crossed with two solvers.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'lbfgs'],
)

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split.
base_estimator = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(estimator=base_estimator, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate the configuration actually selected by the grid search instead of
# hard-coding C/solver values copied from a previous run, which would silently
# go stale whenever the data or preprocessing changes. With GridSearchCV's
# default refit=True, `best_estimator_` has already been refitted on the whole
# training split using the best parameters, so no extra fit is needed.
model = grid_search.best_estimator_

y_pred = model.predict(X_test)

# Multi-line outputs start on their own line so their rows stay aligned,
# matching the earlier evaluation cell.
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification report: \n", classification_report(y_test, y_pred))