[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/unboxai/examples-gallery/blob/main/tabular-classification/xgboost/xgboost.ipynb)


# Tabular classification using XGBoost

This notebook illustrates how XGBoost models can be uploaded to the Unbox platform.

In [None]:
!pip install -r requirements.txt

## Importing the modules and loading the dataset

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split

We have stored the dataset in the following S3 bucket. If, for some reason, you get an error reading the csv directly from it, feel free to copy and paste the URL in your browser and download the csv file. Alternatively, you can also find the dataset in [this Kaggle dataset](https://www.kaggle.com/datasets/uciml/mushroom-classification).

In [None]:
# Public URL of the mushrooms CSV that the next cell loads with pandas
DATASET_URL = "https://unbox-static-assets.s3.us-west-2.amazonaws.com/examples-datasets/tabular-classification/mushrooms.csv"

In [None]:
# Read the CSV straight from the URL into a DataFrame and preview the first rows
df = pd.read_csv(DATASET_URL)
df.head()

## Pre-processing the categorical features

In [None]:
def data_encode_one_hot(df, encoders):
    """Encode categorical features using one-hot encoding.

    Args:
        df: DataFrame containing (at least) every column named in ``encoders``.
            It is not modified; a copy is encoded and returned.
        encoders: Dict mapping a feature name to an already fitted encoder.
            Each encoder must expose ``transform`` and either
            ``get_feature_names_out`` (scikit-learn >= 1.0) or the legacy
            ``get_feature_names``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index in which every encoded
        feature column is replaced by its one-hot columns (appended at the
        end, in the order the encoders are iterated).
    """
    df = df.copy(True)
    # The one-hot frames built below carry a 0..n-1 index; without resetting
    # the input index, ``join`` would misalign rows and produce NaNs.
    df.reset_index(drop=True, inplace=True)
    for feature, enc in encoders.items():
        print(f"encoding {feature}")
        # ``get_feature_names`` was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only for old releases.
        if hasattr(enc, "get_feature_names_out"):
            columns = enc.get_feature_names_out([feature])
        else:
            columns = enc.get_feature_names([feature])
        encoded = enc.transform(df[[feature]])
        # Encoders return a sparse matrix by default, but may be configured
        # to return a dense array, which has no ``toarray``.
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        enc_df = pd.DataFrame(encoded, columns=columns)
        df = df.join(enc_df)
        df = df.drop(columns=feature)
    return df

In [None]:
def create_encoder_dict(df, categorical_feature_names):
    """Build one fitted one-hot encoder per categorical feature.

    Returns a dict mapping each name in ``categorical_feature_names`` to a
    ``OneHotEncoder`` fitted on that single column of ``df``. Categories not
    seen during fitting are ignored at transform time.
    The predict function will need these encoders.
    """
    from sklearn.preprocessing import OneHotEncoder

    # ``fit`` returns the encoder itself, so it can be chained in place.
    return {
        name: OneHotEncoder(handle_unknown='ignore').fit(df[[name]])
        for name in categorical_feature_names
    }

In [None]:
# Integer ids that replace the class names: "e" -> 0, "p" -> 1
class_map = {"e": 0, "p": 1}

# Features are every column except "class"; the target stays a one-column
# DataFrame with the class names swapped for their ids
X = df.drop(columns="class")
y = df[["class"]].replace(class_map)

In [None]:
# Fit one encoder per column of X (every feature column is passed as categorical)
encoders = create_encoder_dict(X, list(X.columns))

# One-hot encode the full feature set and display the result
X_enc_one_hot = data_encode_one_hot(X, encoders)
X_enc_one_hot

## Splitting the data into training and validation sets

In [None]:
# Hold out 20% of the rows for validation; random_state makes the split reproducible
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)
# One-hot encode each split with the encoders that were fitted on the full X
x_train_one_hot = data_encode_one_hot(x_train, encoders)
x_val_one_hot = data_encode_one_hot(x_val, encoders)

## Training and evaluating the model's performance

In [None]:
# Using XGBoost data format: wrap the one-hot encoded features and their
# labels in DMatrix objects for training and validation
dtrain = xgb.DMatrix(x_train_one_hot, label=y_train)
dval = xgb.DMatrix(x_val_one_hot, label=y_val)

In [None]:
# Booster settings: trees of depth 2, step size (eta) of 1, binary logistic objective
param = {
    'max_depth': 2,
    'eta': 1,
    'objective': 'binary:logistic',
}
# Number of boosting rounds to run
num_round = 2

xgboost_model = xgb.train(param, dtrain, num_boost_round=num_round)

In [None]:
# Score the validation DMatrix and read back the labels stored in it
preds = xgboost_model.predict(dval)
labels = dval.get_label()

In [None]:
# Threshold every prediction at 0.5, count the disagreements with the labels,
# and report them as a fraction of all predictions
print(
    "error rate=%f"
    % (
        sum(
            1
            for pred, label in zip(preds, labels)
            if int(pred > 0.5) != label
        )
        / float(len(preds))
    )
)

## Unbox part!

### Instantiating the client

In [None]:
import unboxapi

# Replace the placeholder string with your own API key
client = unboxapi.UnboxClient("YOUR_API_KEY_HERE")

### Creating a project on the platform

In [None]:
from unboxapi.tasks import TaskType

# Create a tabular-classification project; the dataset and model below are added to it
project = client.create_project(name="XGBoost project", 
                                task_type=TaskType.TabularClassification,
                                description="Evaluation of ML approaches")

### Uploading the validation set

In [None]:
# Add the ground-truth labels back to the original (non-encoded) validation
# and training sets as a 'class' column for Unbox
x_val['class'] = y_val.values
x_train['class'] = y_train.values

In [None]:
# Some important parameters shared by the dataset and model uploads
class_names = ["e", "p"]  # the classes in the dataset, ordered to match class_map (e -> 0, p -> 1)
feature_names = list(X.columns)  # feature names in the un-processed dataset
categorical_feature_names = feature_names # all features are categorical in this dataset

In [None]:
# Upload the validation set (raw features plus the 'class' label column) to the project
dataset = project.add_dataframe(
    df=x_val,
    class_names=class_names,
    label_column_name='class',
    commit_message='this is my mushroom dataset',
    feature_names=feature_names,
    categorical_feature_names=categorical_feature_names,
)

### Uploading the model

First, it is important to create a `predict_proba` function, which is how Unbox interacts with your model.

In [None]:
def predict_proba(model, input_features: np.ndarray, col_names, one_hot_encoder, encoders):
    """Return class probabilities for raw (un-encoded) input features.

    The raw features are one-hot encoded with ``one_hot_encoder`` and each
    feature's encoder, converted to the XGBoost data format, and scored.

    Args:
        model: Trained model whose ``predict`` accepts an ``xgb.DMatrix`` and
            returns one score ``p`` per row.
        input_features: Raw feature values, one row per sample, with columns
            ordered as ``col_names``.
        col_names: Column names used to rebuild a DataFrame from
            ``input_features``.
        one_hot_encoder: Callable ``(df, encoders) -> DataFrame`` that
            performs the one-hot encoding.
        encoders: Per-feature encoders forwarded to ``one_hot_encoder``.

    Returns:
        Array with one row per sample and two columns, ``[1 - p, p]``, which
        is the format Unbox expects.
    """
    # Encoding the features using the encoders
    df = pd.DataFrame(input_features, columns=col_names)
    encoded_df = one_hot_encoder(df, encoders)

    # Converting the data to the XGBoost data format
    data_xgb = xgb.DMatrix(encoded_df)

    # Making the predictions with the model
    preds = np.asarray(model.predict(data_xgb))

    # Post-processing the predictions to the format Unbox expects; stacking
    # the two columns is vectorized and keeps two columns even for zero rows
    return np.column_stack((1 - preds, preds))

Let's test the `predict_proba` function to make sure the input-output format is consistent with what Unbox expects:

In [None]:
# Try the function on the first three validation rows, passing only the raw
# feature columns (the 'class' label column is left out)
predict_proba(
    xgboost_model,
    x_val[feature_names].head(3).to_numpy(),
    feature_names,
    data_encode_one_hot,
    encoders,
)

Now, we can upload the model:

In [None]:
from unboxapi.models import ModelType

model = project.add_model(
    # The prediction function defined above and the trained booster it wraps
    function=predict_proba, 
    model=xgboost_model,
    model_type=ModelType.xgboost,
    class_names=class_names,
    name='XGBoost Classifier',
    commit_message='this is my mushrooms classification model',
    feature_names=feature_names,
    # First 3000 training rows, including the 'class' label column
    train_sample_df=x_train[:3000],
    train_sample_label_column_name='class',
    requirements_txt_file='requirements.txt',
    categorical_feature_names=categorical_feature_names,
    # The three names below match predict_proba's extra parameters;
    # presumably they are forwarded to it as keyword arguments — TODO confirm
    col_names=feature_names,
    one_hot_encoder=data_encode_one_hot,
    encoders=encoders,
)