In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Names applied to the 15 columns of both data files; the files are read with
# header=None, so the labels are attached after loading.
col_labels = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'wage_class',
]

# Training split.
train_set = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
)

# Test split: the first line of the file is skipped before parsing.
test_set = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
    skiprows=1,
    header=None,
)

train_set.columns = col_labels
test_set.columns = col_labels

#drop redundant columns
# The same columns are removed from both splits so their layouts stay aligned.
unused_columns = ['age', 'education', 'race', 'sex', 'native_country']
train_set.drop(columns=unused_columns, inplace=True)
test_set.drop(columns=unused_columns, inplace=True)

# info() writes its summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
train_set.info()

# Show the distinct raw values of the target and of three categorical columns.
for inspected_column in ('wage_class', 'marital_status', 'occupation', 'relationship'):
    print(train_set[inspected_column].unique())

# Keep only the rows whose workclass AND occupation are in the lists below;
# rows carrying any other value in either column are discarded.
# The category strings are matched with their leading space, exactly as written.
known_workclasses = [
    ' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
    ' Local-gov', ' Self-emp-inc', ' Without-pay', ' Never-worked',
]
known_occupations = [
    ' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
    ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
    ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
    ' Tech-support', ' Protective-serv', ' Armed-Forces', ' Priv-house-serv',
]

# .copy() makes each filtered frame independent of the frame it was sliced
# from, so later column assignments do not trigger SettingWithCopyWarning.
train_set = train_set[
    train_set['workclass'].isin(known_workclasses)
    & train_set['occupation'].isin(known_occupations)
].copy()

test_set = test_set[
    test_set['workclass'].isin(known_workclasses)
    & test_set['occupation'].isin(known_occupations)
].copy()


#map non-numeric values to numeric values
# One lookup table per categorical column, shared by the train and test frames
# so both are guaranteed to use the same encoding. Keys include the leading
# space present in the raw values.
marital_status_codes = {
    ' Never-married': 0, ' Married-civ-spouse': 1, ' Divorced': 2,
    ' Married-spouse-absent': 3, ' Separated': 4, ' Married-AF-spouse': 5,
    # Widowed gets its own code; it previously shared 5 with Married-AF-spouse,
    # which made the two categories indistinguishable.
    ' Widowed': 6,
}
occupation_codes = {
    ' Adm-clerical': 0, ' Exec-managerial': 1, ' Handlers-cleaners': 2,
    ' Prof-specialty': 3, ' Other-service': 4, ' Sales': 5,
    ' Transport-moving': 6, ' Farming-fishing': 7, ' Machine-op-inspct': 8,
    ' Tech-support': 9, ' Craft-repair': 10, ' Protective-serv': 11,
    ' Armed-Forces': 12, ' Priv-house-serv': 13,
}
relationship_codes = {
    ' Not-in-family': 0, ' Husband': 1, ' Wife': 2, ' Own-child': 3,
    ' Unmarried': 4, ' Other-relative': 5,
}
workclass_codes = {
    ' State-gov': 0, ' Self-emp-not-inc': 1, ' Private': 2, ' Federal-gov': 3,
    ' Local-gov': 4, ' Self-emp-inc': 5, ' Without-pay': 6,
    # ' Never-worked' is accepted by the workclass row filter earlier in this
    # script, so it needs a code here too; otherwise map() would yield NaN.
    ' Never-worked': 7,
}

categorical_codes = {
    'marital_status': marital_status_codes,
    'occupation': occupation_codes,
    'relationship': relationship_codes,
    'workclass': workclass_codes,
}
for frame in (train_set, test_set):
    for column_name, codes in categorical_codes.items():
        frame[column_name] = frame[column_name].map(codes)

# Target labels are encoded as 0 / 1: recent XGBoost releases require class
# labels to be numbered from 0 and reject [1, 2].
train_set['wage_class'] = train_set['wage_class'].map({' <=50K': 0, ' >50K': 1})

# The test-set labels are matched with a trailing period.
test_set['wage_class'] = test_set['wage_class'].map({' <=50K.': 0, ' >50K.': 1})
print(test_set['wage_class'].unique())

# info() prints its own summary and returns None, so it is not wrapped in print().
train_set.info()
test_set.info()


#prepare X, Y
# Pull the target column out of each frame; the columns left behind are the
# feature matrices handed to the classifier.
train_y = train_set['wage_class']
test_y = test_set['wage_class']
for split in (train_set, test_set):
    split.drop(columns=['wage_class'], inplace=True)

#Create, fit, predict
# fit() returns the fitted estimator, so construction and training are chained.
model = XGBClassifier().fit(train_set, train_y)
pred_y = model.predict(test_set)

# Fraction of test rows whose predicted class equals the true class.
accuracy = accuracy_score(test_y, pred_y)

banner = "========================================"
print(banner)
print("Accuracy using XGBClassifier: %.2f%%" % (accuracy * 100.0))
print(banner)