[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/unboxai/examples-gallery/blob/main/tabular-classification/xgboost/xgboost.ipynb)


# Tabular classification using XGBoost

This notebook illustrates how XGBoostmodels can be upladed to the Unbox platform.

## Importing the modules and loading the dataset

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split

We have stored the dataset on the following S3 bucket. If, for some reason, you get an error reading the csv directly from it, feel free to copy and paste the URL in your browser and download the csv file. Alternatively, you can also find the dataset on [this Kaggle competition](https://www.kaggle.com/datasets/uciml/mushroom-classification).

In [2]:
DATASET_URL = "https://unbox-static-assets.s3.us-west-2.amazonaws.com/examples-datasets/tabular-classification/mushrooms.csv"

In [5]:
df = pd.read_csv(DATASET_URL)
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


## Pre-processing the categorical features

In [7]:
def data_encode_one_hot(df, encoders):
    """ Encodes categorical features using one-hot encoding. """
    df = df.copy(True)
    df.reset_index(drop=True, inplace=True) # Causes NaNs otherwise
    for feature, enc in encoders.items():
        print(f"encoding {feature}")
        enc_df = pd.DataFrame(enc.transform(df[[feature]]).toarray(), columns=enc.get_feature_names([feature]))
        df = df.join(enc_df)
        df = df.drop(columns=feature)
    return df

In [8]:
def create_encoder_dict(df, categorical_feature_names):
    """ Creates encoders for each of the categorical features. 
        The predict function will need these encoders. 
    """
    from sklearn.preprocessing import OneHotEncoder
    encoders = {}
    for feature in categorical_feature_names:
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(df[[feature]])
        encoders[feature] = enc
    return encoders

In [9]:
# replacing class names with 0 and 1
class_map = {"e": 0, "p": 1}

X, y = df.loc[:, df.columns != "class"], df[["class"]].replace(class_map)

In [11]:
encoders = create_encoder_dict(X, list(X.columns))

X_enc_one_hot = data_encode_one_hot(X, encoders)
X_enc_one_hot

encoding cap-shape
encoding cap-surface
encoding cap-color
encoding bruises
encoding odor
encoding gill-attachment
encoding gill-spacing
encoding gill-size
encoding gill-color
encoding stalk-shape
encoding stalk-root
encoding stalk-surface-above-ring
encoding stalk-surface-below-ring
encoding stalk-color-above-ring
encoding stalk-color-below-ring
encoding veil-type
encoding veil-color
encoding ring-number
encoding ring-type
encoding spore-print-color
encoding population
encoding habitat


Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8120,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8121,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8122,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Splitting the data into training and validation sets

In [12]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)
x_train_one_hot = data_encode_one_hot(x_train, encoders)
x_val_one_hot = data_encode_one_hot(x_val, encoders)

encoding cap-shape
encoding cap-surface
encoding cap-color
encoding bruises
encoding odor
encoding gill-attachment
encoding gill-spacing
encoding gill-size
encoding gill-color
encoding stalk-shape
encoding stalk-root
encoding stalk-surface-above-ring
encoding stalk-surface-below-ring
encoding stalk-color-above-ring
encoding stalk-color-below-ring
encoding veil-type
encoding veil-color
encoding ring-number
encoding ring-type
encoding spore-print-color
encoding population
encoding habitat
encoding cap-shape
encoding cap-surface
encoding cap-color
encoding bruises
encoding odor
encoding gill-attachment
encoding gill-spacing
encoding gill-size
encoding gill-color
encoding stalk-shape
encoding stalk-root
encoding stalk-surface-above-ring
encoding stalk-surface-below-ring
encoding stalk-color-above-ring
encoding stalk-color-below-ring
encoding veil-type
encoding veil-color
encoding ring-number
encoding ring-type
encoding spore-print-color
encoding population
encoding habitat


## Training and evaluating the model's performance

In [13]:
# Using XGBoost data format
dtrain = xgb.DMatrix(x_train_one_hot, label=y_train)
dval = xgb.DMatrix(x_val_one_hot, label=y_val)

In [14]:
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
num_round = 2

xgboost_model = xgb.train(param, dtrain, num_round)

In [15]:
preds = xgboost_model.predict(dval)
labels = dval.get_label()

In [16]:
print(
    "error rate=%f"
    % (
        sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i])
        / float(len(preds))
    )
)

error rate=0.021538


## Unbox part!

### Instantiating the client

In [17]:
import unboxapi

client = unboxapi.UnboxClient("YOUR_API_KEY_HERE")

### Creating a project on the platform

In [18]:
project = client.create_project(name="Agaricus with XGBoost", 
                                description="Evaluation of ML approaches")

Creating project on Unbox! Check out https://unbox.ai/projects to have a look!


### Uploading the validation set

In [19]:
# Add the ground truths to the ordinal dataset for Unbox
x_val['class'] = y_val.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_val['class'] = y_val.values


In [20]:
# some important parameters
class_names = ["e", "p"]  # the classes on the dataset
feature_names = list(X.columns)  # feature names in the un-processed dataset
categorical_feature_names = feature_names # all features are categorical in this dataset

In [21]:
from unboxapi.tasks import TaskType

dataset = project.add_dataframe(
    df=x_val,
    class_names=class_names,
    label_column_name='class',
    name="Mushrooms Validation",
    description='this is my mushroom dataset',
    task_type=TaskType.TabularClassification,
    feature_names=feature_names,
    categorical_feature_names=categorical_feature_names,
)

Uploading dataset to Unbox! Check out https://unbox.ai/datasets to have a look!


### Uploading the model

First, it is important to create a `predict_proba` function, which is how Unbox interacts with your model

In [22]:
def predict_proba(model, input_features: np.ndarray, col_names, one_hot_encoder, encoders):
    """Convert the raw input_features into one-hot encoded features
    using our one hot encoder and each feature's encoder. """
    # Encoding the features using the encoders
    df = pd.DataFrame(input_features, columns=col_names)
    encoded_df = one_hot_encoder(df, encoders)
    
    # Converting the data to the XGBoost data format
    data_xgb = xgb.DMatrix(encoded_df)
    
    # Making the predictions with the model
    preds = model.predict(data_xgb)
    
    # Post-processing the predictions to the format Unbox expects
    preds_proba = [[1 - p, p] for p in preds]
    return preds_proba

Let's test the `predict_proba` function to make sure the input-output format is consistent with what Unbox expects:

In [24]:
predict_proba(xgboost_model, x_val[:3][feature_names].to_numpy(), feature_names, data_encode_one_hot, encoders)

encoding cap-shape
encoding cap-surface
encoding cap-color
encoding bruises
encoding odor
encoding gill-attachment
encoding gill-spacing
encoding gill-size
encoding gill-color
encoding stalk-shape
encoding stalk-root
encoding stalk-surface-above-ring
encoding stalk-surface-below-ring
encoding stalk-color-above-ring
encoding stalk-color-below-ring
encoding veil-type
encoding veil-color
encoding ring-number
encoding ring-type
encoding spore-print-color
encoding population
encoding habitat


[[0.07492601871490479, 0.925074],
 [0.9475165642797947, 0.052483436],
 [0.7099717259407043, 0.29002827]]

Now, we can upload the model:

In [None]:
from unboxapi.models import ModelType

model = project.add_model(
    function=predict_proba, 
    model=xgboost_model,
    model_type=ModelType.xgboost,
    task_type=TaskType.TabularClassification,
    class_names=class_names,
    name='XGBoost Classifier',
    description='this is my mushrooms classification model',
    feature_names=feature_names,
    train_sample_df=x_train[:3000],
    train_sample_label_column_name='classes',
    categorical_feature_names=categorical_feature_names,
    col_names=feature_names,
    one_hot_encoder=data_encode_one_hot,
    encoders=encoders,
)