In [43]:
# EDA for Adult dataset
# Becker,Barry and Kohavi,Ronny. (1996). Adult. UCI Machine Learning Repository. https://doi.org/10.24432/C5XW20.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data from /adult/adult.data
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

df = pd.read_csv("adult/adult.data", names=columns, skipinitialspace=True, na_values="?")

# Drop rows with missing values
df = df.dropna()

from sklearn.preprocessing import LabelEncoder

# Encode the categorical variables including the target variable
le = LabelEncoder()
categorical_columns = df.select_dtypes(include="object").columns

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [45]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

X = df.drop("income", axis=1)
y = df["income"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# logistic_params = {
#     "max_iter": np.logspace(10, 16, 8, base=2).astype(int),
#     "C": np.logspace(-4, 4, 8, base=2),
#     "solver": ["lbfgs", "liblinear", "sag", "saga"],
#     "penalty": ["l1", "l2", "elasticnet", None],
# Best score:  0.8191803465254054
# Test score:  0.8228078899386706
# {'C': 1.4859942891369482, 'max_iter': 1854, 'penalty': 'l1', 'solver': 'liblinear'}
# }

logistic_params = {
    "max_iter": np.logspace(10, 15, 20, base=2).astype(int),
    "C": np.logspace(-4, 3, 20, base=2),
    "solver": ["liblinear"],
    "penalty": ["l1"],
}

print(f'max_iter: {logistic_params["max_iter"]}')
print(f'C: {logistic_params["C"]}')
print(f'solver: {logistic_params["solver"]}')
print(f'penalty: {logistic_params["penalty"]}')

grid_search = GridSearchCV(
    LogisticRegression(), logistic_params, cv=5, verbose=1, n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)
print("Test score: ", grid_search.score(X_test, y_test))

max_iter: [ 1024  1228  1474  1769  2124  2549  3059  3671  4406  5287  6345  7615
  9139 10968 13163 15797 18958 22751 27304 32768]
C: [0.0625     0.0806837  0.10415775 0.13446132 0.17358139 0.22408302
 0.28927756 0.37343974 0.482088   0.62234629 0.80341123 1.03715504
 1.3389041  1.72844379 2.23131584 2.88049308 3.71854142 4.80041088
 6.19703857 8.        ]
solver: ['liblinear']
penalty: ['l1']
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
Best parameters:  {'C': 1.0371550444461919, 'max_iter': 15797, 'penalty': 'l1', 'solver': 'liblinear'}
Best score:  0.8196363222527372
Test score:  0.8229736449527598
