# Extreme Gradient Boosting

In [1]:
import pandas as pd
from sklearn import  metrics, model_selection
from xgboost.sklearn import XGBClassifier
from numpy import asarray

In [2]:
PREDICTORS = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
                                        "occupation", "relationship", "race", "sex", "capital-gain",
                                        "capital-loss", "hours-per-week", "native-country"]
PREDICTORS_STRING = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
PREDICTORS_INT = ["age", "fnlwgt", "capital-gain", "capital-loss", "hours-per-week"]
TARGET_VARIABLE = "class"

data = pd.read_csv('adult.data', names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
                                        "occupation", "relationship", "race", "sex", "capital-gain",
                                        "capital-loss", "hours-per-week", "native-country", "class"])
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  class           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Modularizar conversión de datos string a enteros
Es necesario convertir esos valores a enteros para poder ejecutar el algoritmo.

In [4]:
def from_string_to_int(key, data_set):
    data_set[key],class_names = pd.factorize(data_set[key])
    return class_names

## Target a string

In [5]:
class_names = from_string_to_int(TARGET_VARIABLE, data)
print(class_names)
print(data[TARGET_VARIABLE].unique())

Index([' <=50K', ' >50K'], dtype='object')
[0 1]


## Demás atributos a string

In [6]:
for predictor in PREDICTORS_STRING:
    from_string_to_int(predictor, data)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             32561 non-null  int64
 1   workclass       32561 non-null  int64
 2   fnlwgt          32561 non-null  int64
 3   education       32561 non-null  int64
 4   education-num   32561 non-null  int64
 5   marital-status  32561 non-null  int64
 6   occupation      32561 non-null  int64
 7   relationship    32561 non-null  int64
 8   race            32561 non-null  int64
 9   sex             32561 non-null  int64
 10  capital-gain    32561 non-null  int64
 11  capital-loss    32561 non-null  int64
 12  hours-per-week  32561 non-null  int64
 13  native-country  32561 non-null  int64
 14  class           32561 non-null  int64
dtypes: int64(15)
memory usage: 3.7 MB


## Separar los datos en features y target

In [7]:
features = data.iloc[:,:-1]
target = data.iloc[:,-1]
print(features)
print(target)

       age  workclass  fnlwgt  education  education-num  marital-status  \
0       39          0   77516          0             13               0   
1       50          1   83311          0             13               1   
2       38          2  215646          1              9               2   
3       53          2  234721          2              7               1   
4       28          2  338409          0             13               1   
...    ...        ...     ...        ...            ...             ...   
32556   27          2  257302          6             12               1   
32557   40          2  154374          1              9               1   
32558   58          2  151910          1              9               6   
32559   22          2  201490          1              9               0   
32560   52          6  287927          1              9               1   

       occupation  relationship  race  sex  capital-gain  capital-loss  \
0               0        

## Separar datos aleatoriamente en 70/30

In [8]:
features_train, features_test, target_train, target_test = model_selection.train_test_split(features, target, test_size=0.3, random_state=777)

## Entrenar

In [9]:
model1 = XGBClassifier(n_estimators=50, max_depth=3, eta=0.01, subsample=0.3, colsample_bytree=0.2).fit(features_train, target_train)
model2 = XGBClassifier(n_estimators=500, max_depth=8, eta=1, subsample=0.8, colsample_bytree=0.75).fit(features_train, target_train)
default_model = XGBClassifier().fit(features_train, target_train)

## Precisión

In [11]:
prediction1 = model1.predict(features_test)
prediction2 = model2.predict(features_test)
prediction_default = default_model.predict(features_test)

def print_numbers(name, prediction):
    count_misclassified = (target_test != prediction).sum()
    print(name)
    print('Misclassified samples: {}'.format(count_misclassified))
    accuracy = metrics.accuracy_score(target_test, prediction)
    print('Accuracy: {:.2f}'.format(accuracy))
    
print_numbers("Prediction 1", prediction1)
print_numbers("Prediction 2", prediction2)
print_numbers("Prediction default", prediction_default)

Prediction 1
Misclassified samples: 2208
Accuracy: 0.77
Prediction 2
Misclassified samples: 1557
Accuracy: 0.84
Prediction default
Misclassified samples: 1217
Accuracy: 0.88


## Predicciones

In [12]:
test_prediction=pd.DataFrame(data=features_test, columns=PREDICTORS)
test_prediction['Target']=target_test
test_prediction['Prediction']=prediction_default
test_prediction.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Target,Prediction
29305,32,2,130040,1,9,2,2,0,0,0,0,0,40,0,0,0
4110,25,2,186294,1,9,0,0,0,0,1,0,0,50,0,0,0
12044,39,2,237943,0,13,1,5,1,0,0,0,0,30,0,1,0
17027,78,6,237294,1,9,6,5,0,0,0,0,0,45,0,1,0
5821,53,2,157069,6,12,1,9,1,0,0,0,0,40,0,1,0


In [15]:
def get_predicted_class_name(class_number):
    if(class_number == 0):
        return "<=50K"
    return ">50K"

#Greater data: 37, Private, 280464, Some-college, 10, Married-civ-spouse,Exec-managerial, Husband, Black, Male, 0, 0, 80, United-States, >50K
custom_row_greater = [37,2,280464,5, 10, 1, 1,1,1, 0, 0,0, 80, 0]
custom_row_greater = asarray([custom_row_greater])
yhat_greater = default_model.predict(custom_row_greater)
print('Prediction greater: %s' % get_predicted_class_name(yhat_greater[0]))

#Less data: 49, Private, 160187, 9th, 5, Married-spouse-absent, Other-service, Not-in-family, Black, Female, 0, 0, 16, Jamaica, <=50K
custom_row_less = [49,2,160187,4,5, 3, 4, 0, 1, 1, 0, 0, 16, 2]
custom_row_less = asarray([custom_row_less])
yhat_less = default_model.predict(custom_row_less)
print('Prediction less: %s' % get_predicted_class_name(yhat_less[0]))

Prediction greater: >50K
Prediction less: <=50K
