In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

In [2]:
# Load the data
train_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
test_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'

# Define column labels
col_labels = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
    'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
    'hours_per_week', 'native_country', 'wage_class'
]

# Load datasets
train_set = pd.read_csv(train_url, header=None, names=col_labels, na_values=' ?')
test_set = pd.read_csv(test_url, header=None, names=col_labels, na_values=' ?', skiprows=1)

# Combine datasets for consistent preprocessing
data = pd.concat([train_set, test_set], ignore_index=True)

# Handle missing values by removing rows with any NA values
data = data.dropna()

# Encode categorical variables
le = LabelEncoder()
for col in ['workclass', 'education', 'marital_status', 'occupation', 'relationship',
            'race', 'sex', 'native_country', 'wage_class']:
    data[col] = le.fit_transform(data[col])

# Split back into train and test sets
train_data = data[:len(train_set)]
test_data = data[len(train_set):]

# Separate features and target variable
X_train = train_data.drop('wage_class', axis=1)
y_train = train_data['wage_class']
X_test = test_data.drop('wage_class', axis=1)
y_test = test_data['wage_class']

# Initialize and train XGBoost model
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.00047389621672853645

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.83      0.00      0.00      9561
           2       0.00      0.00      0.00         0
           3       1.00      0.00      0.00      3100

    accuracy                           0.00     12661
   macro avg       0.46      0.00      0.00     12661
weighted avg       0.87      0.00      0.00     12661



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
