In [63]:
#Predict whether a person makes over 50K per year or not from classic adult dataset using XGBoost

#Import the required libraries
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

#Import the dataset
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None, low_memory=False)

test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', skiprows = 1,header=None,low_memory=False)

col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation','relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels

In [64]:
#Convert factors to labels
le = LabelEncoder()
train_set['workclass'] = le.fit_transform(train_set['workclass'])
train_set['education'] = le.fit_transform(train_set['education'])
train_set['marital_status'] = le.fit_transform(train_set['marital_status'])
train_set['occupation'] = le.fit_transform(train_set['occupation'])
train_set['relationship'] = le.fit_transform(train_set['relationship'])
train_set['race'] = le.fit_transform(train_set['race'])
train_set['sex'] = le.fit_transform(train_set['sex'])
train_set['native_country'] = le.fit_transform(train_set['native_country'])
train_set['wage_class'] = le.fit_transform(train_set['wage_class'])
test_set['workclass'] = le.fit_transform(test_set['workclass'])
test_set['education'] = le.fit_transform(test_set['education'])
test_set['marital_status'] = le.fit_transform(test_set['marital_status'])
test_set['occupation'] = le.fit_transform(test_set['occupation'])
test_set['relationship'] = le.fit_transform(test_set['relationship'])
test_set['race'] = le.fit_transform(test_set['race'])
test_set['sex'] = le.fit_transform(test_set['sex'])
test_set['native_country'] = le.fit_transform(test_set['native_country'])
test_set['wage_class'] = le.fit_transform(test_set['wage_class'])

for col in train_set.columns:
    train_set[col] = train_set[col].astype(np.int64)

for col in test_set.columns:
    test_set[col] = test_set[col].astype(np.int64)

y_train = train_set['wage_class']
train_set.drop(['wage_class'], axis = 1, inplace = True)

y_test = test_set['wage_class']
test_set.drop(['wage_class'], axis = 1, inplace = True)

In [65]:
#Fit the model 

model = XGBClassifier()
model.fit(train_set, y_train)
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


In [66]:
# make predictions for test data
y_pred = model.predict(test_set)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Model Accuracy: %.2f%%" % (accuracy * 100.0))
print("The predicted values are:\n", predictions)

Model Accuracy: 86.62%
The predicted values are:
 [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0