In [1]:
import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn import  metrics, model_selection
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('adult.data', names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
                                        "occupation", "relationship", "race", "sex", "capital-gain",
                                        "capital-loss", "hours-per-week", "native-country", "class"])
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  class           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Modularizar conversión de datos string a enteros

In [4]:
def from_string_to_int(key, data_set):
    data_set[key],class_names = pd.factorize(data_set[key])
    return class_names

## Target a string

In [5]:
class_names = from_string_to_int("class", data)
print(class_names)
print(data['class'].unique())

Index([' <=50K', ' >50K'], dtype='object')
[0 1]


## Demás atributos a string

In [6]:
from_string_to_int("workclass", data)
from_string_to_int("education", data)
from_string_to_int("marital-status", data)
from_string_to_int("occupation", data)
from_string_to_int("relationship", data)
from_string_to_int("race", data)
from_string_to_int("sex", data)
from_string_to_int("native-country", data)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             32561 non-null  int64
 1   workclass       32561 non-null  int64
 2   fnlwgt          32561 non-null  int64
 3   education       32561 non-null  int64
 4   education-num   32561 non-null  int64
 5   marital-status  32561 non-null  int64
 6   occupation      32561 non-null  int64
 7   relationship    32561 non-null  int64
 8   race            32561 non-null  int64
 9   sex             32561 non-null  int64
 10  capital-gain    32561 non-null  int64
 11  capital-loss    32561 non-null  int64
 12  hours-per-week  32561 non-null  int64
 13  native-country  32561 non-null  int64
 14  class           32561 non-null  int64
dtypes: int64(15)
memory usage: 3.7 MB


## Separar los datos en features y target

In [7]:
features = data.iloc[:,:-1]
target = data.iloc[:,-1]
print(features)
print(target)

       age  workclass  fnlwgt  education  education-num  marital-status  \
0       39          0   77516          0             13               0   
1       50          1   83311          0             13               1   
2       38          2  215646          1              9               2   
3       53          2  234721          2              7               1   
4       28          2  338409          0             13               1   
...    ...        ...     ...        ...            ...             ...   
32556   27          2  257302          6             12               1   
32557   40          2  154374          1              9               1   
32558   58          2  151910          1              9               6   
32559   22          2  201490          1              9               0   
32560   52          6  287927          1              9               1   

       occupation  relationship  race  sex  capital-gain  capital-loss  \
0               0        

## Separar datos aleatoriamente en 70/30

In [11]:
features_train, features_test, target_train, target_test = model_selection.train_test_split(features, target, test_size=0.3, random_state=777)

## Entrenar

In [35]:
params = {
'objective': 'binary:logistic',
'max_depth': 2,
'learning_rate': 1,
'n_estimators': 5
}
 
model = XGBClassifier().fit(features_train, target_train)

## Precisión

In [36]:
target_pred = model.predict(features_test)

count_misclassified = (target_test != target_pred).sum()
print('Misclassified samples: {}'.format(count_misclassified))

accuracy = metrics.accuracy_score(target_test, target_pred)
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 1217
Accuracy: 0.88
