<a href="https://colab.research.google.com/github/narutsoo/tutorial/blob/master/DataPrep_for_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Topic : Data Preparation for Gradient Boosting with XGBoost in Python

Reference
* https://machinelearningmastery.com/data-preparation-gradient-boosting-xgboost-python/



# Use-Case 1 : Iris data (Label Encoder on the target variable)

Import related libraries

In [2]:
import pandas as pd
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

Import dataset (iris dataset : 4 input features, 3 different classes)

In [3]:
# Download the iris CSV; header=None tells pandas the file has no header row.
data = pd.read_csv(
    'https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv',
    header=None,
)

In [4]:
# Save a local copy of the data without the index column and without a header row.
# header=False is the documented boolean form; the previous header=None is not a documented value.
data.to_csv('iris.csv', index=False, header=False)

Convert the DataFrame to a NumPy array (and separate the input and target variables)

In [5]:
# Pull the underlying NumPy array out of the DataFrame.
dataset = data.to_numpy()

In [6]:
# Columns 0-3 are the input features; column 4 is the class label.
X, y = dataset[:, :4], dataset[:, 4]

In [8]:
# Learn the class-label -> integer mapping from y, then apply it to y.
label_encoder = LabelEncoder().fit(y)
y_enc = label_encoder.transform(y)

In [9]:
# Display the integer-encoded target to check the encoding.
y_enc

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

Separate train & test dataset

In [16]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.33,
    random_state=1,
)

Create the model (XGBoost Classifier)

In [17]:
# Instantiate the classifier without passing any hyper-parameters (library defaults are used).
model = xgboost.XGBClassifier()

In [18]:
# Train on the training split; as the cell's last expression, the fitted estimator is echoed below.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

Predict the results for the test set

In [19]:
# Predict the encoded class of every row in the held-out test set.
y_pred = model.predict(X_test)

In [20]:
# Show the predictions already stored in y_pred instead of running inference a second time.
print(y_pred)

[0 1 1 0 2 1 2 0 0 2 1 0 2 1 1 0 1 1 0 0 1 1 2 0 2 1 0 0 1 2 1 2 1 2 2 0 1
 0 1 2 2 0 1 2 1 2 0 0 0 1]


To obtain the accuracy 

In [21]:
# Fraction of correct test predictions, expressed as a percentage with two decimals.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Accuracy =', accuracy_pct, '%')

Accuracy = 96.0 %


# Use-Case 2 : Breast Cancer data (Label Encoder / One Hot Encoder)

Import related libraries

In [33]:
import pandas as pd
import numpy
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

Import dataset (Breast Cancer : 9 input features, 2 classes) >> 286 instances

In [48]:
# Download the breast-cancer CSV; header=None tells pandas the file has no header row.
data = pd.read_csv(
    'https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv',
    header=None,
)

Convert the DataFrame to a NumPy array (and define the input and target variables)

In [49]:
# Pull the underlying NumPy array out of the DataFrame.
dataset = data.to_numpy()

In [50]:
# Inputs are every column except the last, cast to strings before encoding;
# the last column is the target.
X = dataset[:, :-1].astype(str)
y = dataset[:, -1]

Encoder section (input variables)

In [51]:
# Start with no encoded columns; the encoding loop in the next cell fills this in.
encoded_x = None

In [52]:
# One-hot encode every input column and stack the results side by side.
#
# The encoded blocks are collected in a list and concatenated once, so this
# cell rebuilds encoded_x from scratch on every run. Previously it relied on
# encoded_x having been reset to None by another cell, and re-running it alone
# appended duplicate columns.
a = 0 # just to check if-else condition
encoded_columns = []
for i in range(X.shape[1]):
  # Map the string categories of column i to integer codes.
  label_encoder = LabelEncoder()
  feature = label_encoder.fit_transform(X[:,i])
  feature = feature.reshape(X.shape[0], 1)
  # Expand the integer codes into binary indicator columns.
  # The encoder's default sparse output is converted with .toarray(), which
  # avoids the `sparse=` keyword (renamed to `sparse_output=` in scikit-learn 1.2)
  # and therefore works on both older and newer releases.
  onehot_encoder = OneHotEncoder(categories='auto')
  feature = onehot_encoder.fit_transform(feature).toarray()
  if encoded_columns:
    a = 1 # just to check if-else condition
  encoded_columns.append(feature)

# A single concatenate over all blocks; stays None when X has no columns.
encoded_x = numpy.concatenate(encoded_columns, axis=1) if encoded_columns else None

In [53]:
# print (a)
# print (encoded_x.shape)

In [54]:
## all unique value of each column
# print (data[0].nunique())
# print (data[1].nunique())
# print (data[2].nunique())
# print (data[3].nunique())
# print (data[4].nunique())
# print (data[5].nunique())
# print (data[6].nunique())
# print (data[7].nunique())
# print (data[8].nunique())
# print (data[9].nunique())

Encoder section (target variable)

In [55]:
# Learn the class-label -> integer mapping from y, then apply it to y.
target_encoder = LabelEncoder().fit(y)
encoded_y = target_encoder.transform(y)

Separate train & test dataset

In [56]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_x,
    encoded_y,
    test_size=0.33,
    random_state=7,
)

Create the model

In [57]:
# Instantiate the classifier without passing any hyper-parameters (library defaults are used).
model = XGBClassifier()

In [58]:
# Train on the training split; as the cell's last expression, the fitted estimator is echoed below.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [59]:
# Predict the encoded class of every row in the held-out test set.
y_pred = model.predict(X_test)

In [60]:
# Fraction of correct test predictions, expressed as a percentage with two decimals.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Accuracy =', accuracy_pct, '%')

Accuracy = 71.58 %


In [None]:
# Use-Case 3 : Horse Colic

# Use-Case 3 : Horse Colic data (Impute missing data)

Import related libraries

In [64]:
import pandas as pd
import numpy
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

Import dataset

In [65]:
# Download the whitespace-separated, headerless horse-colic file.
# sep=r'\s+' is the documented replacement for delim_whitespace=True,
# which pandas deprecated in version 2.2.
data = pd.read_csv(
    'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.data',
    sep=r'\s+',
    header=None,
)

In [66]:
# Preview the first rows; missing values appear as '?' in the raw file.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
0,2,1,530101,38.5,66,28,3,3,?,2,5,4,4,?,?,?,3,5,45.0,8.4,?,?,2,2,11300,0,0,2
1,1,1,534817,39.2,88,20,?,?,4,1,3,4,2,?,?,?,4,2,50.0,85.0,2,2,3,2,2208,0,0,2
2,2,1,530334,38.3,40,24,1,1,3,1,3,3,1,?,?,?,1,1,33.0,6.7,?,?,1,2,0,0,0,1
3,1,9,5290409,39.1,164,84,4,1,6,2,2,4,4,1,2,5.00,3,?,48.0,7.2,3,5.30,2,1,2208,0,0,1
4,2,1,530255,37.3,104,35,?,?,6,2,?,?,?,?,?,?,?,?,74.0,7.4,?,?,2,2,4300,0,0,2


Change dataframe to numpy array

In [67]:
# Pull the underlying NumPy array out of the DataFrame.
dataset = data.to_numpy()

In [68]:
# Inputs are every column except the last; the last column is the target.
X, y = dataset[:, :-1], dataset[:, -1]

Impute the missing values
1. Replace '?' with NaN  
2. Replace NaN using SimpleImputer (default strategy: column mean)

In [69]:
# Flag every '?' placeholder, then overwrite those cells with NaN in place.
missing_mask = X == '?'
X[missing_mask] = numpy.nan

In [70]:
# Fill each NaN using SimpleImputer with its default settings
# (scikit-learn documents the default strategy as the column mean).
imputer = SimpleImputer()
imputer.fit(X)
imputed_X = imputer.transform(X)

In [71]:
# Wrap the imputed array in a DataFrame so the notebook renders it as a table for inspection.
pd.DataFrame(imputed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
0,2.0,1.0,530101.0,38.500000,66.0,28.0,3.000000,3.000000,2.853755,2.00000,5.00000,4.000000,4.000000,1.755102,1.582474,4.707547,3.000000,5.000000,45.0,8.400000,2.037037,3.019608,2.0,2.0,11300.0,0.0,0.0
1,1.0,1.0,534817.0,39.200000,88.0,20.0,2.348361,2.017316,4.000000,1.00000,3.00000,4.000000,2.000000,1.755102,1.582474,4.707547,4.000000,2.000000,50.0,85.000000,2.000000,2.000000,3.0,2.0,2208.0,0.0,0.0
2,2.0,1.0,530334.0,38.300000,40.0,24.0,1.000000,1.000000,3.000000,1.00000,3.00000,3.000000,1.000000,1.755102,1.582474,4.707547,1.000000,1.000000,33.0,6.700000,2.037037,3.019608,1.0,2.0,0.0,0.0,0.0
3,1.0,9.0,5290409.0,39.100000,164.0,84.0,4.000000,1.000000,6.000000,2.00000,2.00000,4.000000,4.000000,1.000000,2.000000,5.000000,3.000000,3.692308,48.0,7.200000,3.000000,5.300000,2.0,1.0,2208.0,0.0,0.0
4,2.0,1.0,530255.0,37.300000,104.0,35.0,2.348361,2.017316,6.000000,2.00000,2.95102,2.917969,2.266393,1.755102,1.582474,4.707547,2.757576,3.692308,74.0,7.400000,2.037037,3.019608,2.0,2.0,4300.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1.0,1.0,533886.0,38.167917,120.0,70.0,4.000000,2.017316,4.000000,2.00000,2.00000,4.000000,2.266393,1.755102,1.582474,4.707547,2.757576,5.000000,55.0,65.000000,2.037037,3.019608,3.0,2.0,3205.0,0.0,0.0
296,2.0,1.0,527702.0,37.200000,72.0,24.0,3.000000,2.000000,4.000000,2.00000,4.00000,3.000000,3.000000,3.000000,1.000000,4.707547,4.000000,4.000000,44.0,24.456929,3.000000,3.300000,3.0,1.0,2208.0,0.0,0.0
297,1.0,1.0,529386.0,37.500000,72.0,30.0,4.000000,3.000000,4.000000,1.00000,4.00000,4.000000,3.000000,2.000000,1.000000,4.707547,3.000000,5.000000,60.0,6.800000,2.037037,3.019608,2.0,1.0,3205.0,0.0,0.0
298,1.0,1.0,530612.0,36.500000,100.0,24.0,3.000000,3.000000,3.000000,1.00000,3.00000,3.000000,3.000000,3.000000,1.000000,4.707547,4.000000,4.000000,50.0,6.000000,3.000000,3.400000,1.0,1.0,2208.0,0.0,0.0


In [79]:
# # To re-check Null value for each column
# pd.DataFrame(X).isnull().sum()

To encode target variable

In [73]:
# Learn the class-label -> integer mapping from y, then apply it to y.
label_encoder = LabelEncoder().fit(y)
label_encoded_y = label_encoder.transform(y)

Separate train & test dataset

In [75]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    imputed_X,
    label_encoded_y,
    test_size=0.33,
    random_state=3,
)

To define the model

In [76]:
# Instantiate the classifier without passing any hyper-parameters (library defaults are used).
model = XGBClassifier()
# Train on the training split; as the cell's last expression, the fitted estimator is echoed below.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

To predict results from test dataset

In [77]:
# Predict the encoded class of every row in the held-out test set.
y_pred = model.predict(X_test)

To print the accuracy

In [78]:
# Fraction of correct test predictions, expressed as a percentage with two decimals.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Accuracy =', accuracy_pct, '%')

Accuracy = 86.87 %
