In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

# Loading and preparation of data
#
# Downloads the Adult CSV, one-hot encodes the categorical columns, converts
# the income label to 0/1, splits the rows into train/test sets and imputes
# numeric features using medians learned from the training split only.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Define column names (passed via `names=`, so the file is read as header-less)
column_names = [
    "age", "workclass", "final_weight", "education", "education_num", "marital_status",
    "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
    "hours_per_week", "native_country", "income"
]

# Raw string: "\s" must reach pandas as a regex whitespace class rather than
# being parsed as an (invalid) Python string escape. The regex separator also
# strips the blanks that follow each comma; regex separators need the python engine.
# NOTE(review): this dataset reportedly marks unknown values with "?". They are
# kept as an ordinary category here (no `na_values`) so the feature set is
# unchanged — confirm that this is the intended treatment.
data = pd.read_csv(url, names=column_names, sep=r",\s*", engine="python")

# Encode categorical variables with one-hot encoding
categorical_columns = [
    "workclass", "education", "marital_status", "occupation",
    "relationship", "race", "sex", "native_country",
]
data_encoded = pd.get_dummies(data, columns=categorical_columns)

# Map the target variable to binary labels (">50K" as 1, "<=50K" as 0)
data_encoded["income"] = data_encoded["income"].map({">50K": 1, "<=50K": 0})

# `.map` turns any label outside the dictionary into NaN; fail here with a
# clear message instead of letting NaN targets reach the classifier.
if data_encoded["income"].isna().any():
    raise ValueError("Unexpected income labels found; expected only '>50K' or '<=50K'.")

# Separate the features used for prediction (X) from the target variable (y)
X = data_encoded.drop("income", axis=1)
y = data_encoded["income"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies so the column assignments below write to independent
# frames and do not trigger SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Handle missing values in numeric columns with SimpleImputer.
# The medians are learned from the training split only and then applied to
# both splits, so the test set is preprocessed exactly like the training set
# without leaking test statistics into the model.
numeric_columns = X_train.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='median')
X_train[numeric_columns] = imputer.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = imputer.transform(X_test[numeric_columns])

In [None]:
# Selection, Training, and Fine-Tuning of the Model

# Base estimator handed to the grid search; the fixed seed keeps runs repeatable
clf = RandomForestClassifier(random_state=42)

# Define a parameter grid to search for the best hyperparameters.
# 3 * 4 * 3 * 3 = 108 combinations, each scored with 5-fold cross-validation
# below, so this cell is expensive to run.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Use GridSearchCV to find the best hyperparameters (all cores, accuracy metric)
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Print the best hyperparameters found by the grid search
print("Best Hyperparameters:")
print(best_params)

# With the default refit=True, GridSearchCV has already retrained a model with
# the best hyperparameters on the whole training set. Reuse that fitted
# estimator instead of building and fitting an identical forest a second time.
clf = grid_search.best_estimator_

In [None]:
# Evaluation of the Model

# Predict income classes for the held-out test rows with the tuned classifier
y_pred = clf.predict(X_test)

# Score the predictions against the true test labels:
# a per-class text report plus the overall accuracy
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy first, then the report under its heading
print("Classification Accuracy:", accuracy)
print("Classification Report:", classification_rep, sep="\n")