In [8]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [9]:
# Fetch version 2 of the "adult" dataset from OpenML as a pandas-backed bunch.
data = fetch_openml("adult", version=2, as_frame=True)

# Build the cleaned frame without mutating `data.frame`: the non-inplace calls
# return new objects, so the raw download stays intact and this cell can be
# re-run safely. Rows with any missing value are dropped first, then exact
# duplicate rows. The original index is kept (not reset).
df = data.frame.dropna().drop_duplicates()

In [10]:
# Show the cleaned frame; gaps in the index correspond to rows removed above.
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [11]:
# Mapping from the textual income bracket to the binary target.
CLASS_LABELS = {"<=50K": 0, ">50K": 1}

# Only encode when the column is not already an integer column, so re-running
# this cell after a successful run is a no-op instead of an error.
if df["class"].dtype.kind not in "iu":
    class_text = df["class"].astype(str).str.strip()

    # Fail loudly on labels the mapping does not cover; otherwise `.map` would
    # yield NaN and the integer cast would raise an unrelated-looking error.
    unexpected = sorted(set(class_text) - set(CLASS_LABELS))
    if unexpected:
        raise ValueError(f"Unexpected class labels: {unexpected}")

    df["class"] = class_text.map(CLASS_LABELS).astype(int)

In [12]:
# Split features (every column except "class") and the "class" target into two
# equal halves, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="class"),
    df["class"],
    stratify=df["class"],
    random_state=42,
    test_size=0.5,
)

In [13]:
# Pick columns by dtype: int64/float64 columns get standardized, category
# columns get one-hot encoded. Columns of any other dtype are listed in
# neither group.
numeric_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X_train.select_dtypes(include=["category"]).columns

preprocessor = ColumnTransformer(
    [
        # Categories unseen during fit are ignored at transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", StandardScaler(), numeric_cols),
    ]
)

# Fit on the training half only, then apply the fitted transform to the test half.
# NOTE(review): X_train/X_test are rebound to the transformed output here, so
# re-running this cell without re-running the split presumably fails at
# `select_dtypes` — confirm and consider separate names for the encoded data.
X_train, X_test = preprocessor.fit_transform(X_train), preprocessor.transform(X_test)

In [14]:
# The three base learners compared below and later combined by voting.
knn = KNeighborsClassifier(n_neighbors=5)
rf = RandomForestClassifier(random_state=42, n_estimators=100)  # seeded
log_reg = LogisticRegression(max_iter=5000)  # iteration cap of 5000

In [15]:
# Train every base learner on the training half, in the same order as before.
for _clf in (log_reg, rf, knn):
    _clf.fit(X_train, y_train)

# Predict the held-out half with each fitted model.
y_pred_lr, y_pred_rf, y_pred_knn = (
    _clf.predict(X_test) for _clf in (log_reg, rf, knn)
)

# Report test accuracy, one line per model.
for _caption, _y_pred in (
    ("Logistic Regression Accuracy:", y_pred_lr),
    ("Random Forest Accuracy:", y_pred_rf),
    ("KNN Accuracy:", y_pred_knn),
):
    print(_caption, accuracy_score(y_test, _y_pred))

Logistic Regression Accuracy: 0.8447405702142731
Random Forest Accuracy: 0.8489463431910749
KNN Accuracy: 0.8262794404108376


In [16]:
# Ensemble of the three base learners configured with voting="hard".
voting_hard = VotingClassifier(
    voting="hard",
    estimators=[("lr", log_reg), ("rf", rf), ("knn", knn)],
)

# `fit` returns the estimator itself, so training and prediction can be chained.
y_pred_hard = voting_hard.fit(X_train, y_train).predict(X_test)

hard_accuracy = accuracy_score(y_test, y_pred_hard)
print("Hard Voting Accuracy:", hard_accuracy)

Hard Voting Accuracy: 0.8484593589516557


In [17]:
# Ensemble of the three base learners configured with voting="soft".
voting_soft = VotingClassifier(
    voting="soft",
    estimators=[("lr", log_reg), ("rf", rf), ("knn", knn)],
)

# `fit` returns the estimator itself, so training and prediction can be chained.
y_pred_soft = voting_soft.fit(X_train, y_train).predict(X_test)

soft_accuracy = accuracy_score(y_test, y_pred_soft)
print("Soft Voting Accuracy:", soft_accuracy)

Soft Voting Accuracy: 0.8489463431910749
