In [2]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import cross_val_score

adult = fetch_ucirepo(id=2)
data = pd.concat([adult.data.features, adult.data.targets], axis=1)
data = data.dropna()

X = data.drop("income", axis=1)
y = data["income"]

cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

X_enc = pd.get_dummies(X, columns=cat_cols)

X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.3, random_state=42, stratify=y)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [47]:
print("\n=== Logistic Regression ===")
# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix")
print(cm)
accuracy_score(y_test, y_pred)

# accuracies = cross_val_score(estimator = lr, X = X_train, y = y_train, cv = 10)
# print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
# print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))


=== Logistic Regression ===
Confusion Matrix
[[6883    8  519    6]
 [3151    4  251    2]
 [ 904    1 1432   16]
 [ 414    0  690    6]]


0.5826975572198502

In [46]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix")
print(cm)
accuracy_score(y_test, y_pred)

# accuracies = cross_val_score(estimator = rf, X = X_train, y = y_train, cv = 10)
# print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
# print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Confusion Matrix
[[5943  892  454  127]
 [2687  436  220   65]
 [ 712  141 1275  225]
 [ 329   74  584  123]]


0.5443410093091622

In [45]:
# Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix")
print(cm)
accuracy_score(y_test, y_pred)

# accuracies = cross_val_score(estimator = gb, X = X_train, y = y_train, cv = 10)
# print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
# print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Confusion Matrix
[[7055   11  349    1]
 [3238    5  164    1]
 [ 934    2 1396   21]
 [ 438    1  665    6]]


0.5922866941975222

In [None]:
y_pred_gb = gb.predict(X_test)
cm = confusion_matrix(y_test, y_pred_gb, labels=[">50K", "<=50K"])
print("\nConfusion Matrix (rows=pred, cols=true):\n", cm)