In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

### Data preparation

In [2]:
# Location of the credit-scoring CSV used throughout this notebook.
data = (
    'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/'
    'master/chapter-06-trees/CreditScoring.csv'
)
df = pd.read_csv(filepath_or_buffer=data)

In [3]:
# Lowercase the column names so they can be addressed uniformly below.
df.columns = df.columns.str.lower()

# Lookup tables translating the integer codes of each categorical column
# into readable labels.
status_values = {1: 'ok', 2: 'default', 0: 'unk'}
home_values = {
    1: 'rent', 2: 'owner', 3: 'private', 4: 'ignore',
    5: 'parents', 6: 'other', 0: 'unk',
}
marital_values = {
    1: 'single', 2: 'married', 3: 'widow',
    4: 'separated', 5: 'divorced', 0: 'unk',
}
records_values = {1: 'no', 2: 'yes', 0: 'unk'}
job_values = {1: 'fixed', 2: 'partime', 3: 'freelance', 4: 'others', 0: 'unk'}

# Apply every lookup table to its column (codes absent from a table become NaN).
column_lookups = {
    'status': status_values,
    'home': home_values,
    'marital': marital_values,
    'records': records_values,
    'job': job_values,
}
for column, lookup in column_lookups.items():
    df[column] = df[column].map(lookup)

# The value 99999999 in these numeric columns is replaced with NaN.
for column in ('income', 'assets', 'debt'):
    df[column] = df[column].replace(to_replace=99999999, value=np.nan)

# Drop the rows whose status is 'unk' and renumber the remaining rows.
df = df.loc[df['status'].ne('unk')].reset_index(drop=True)

In [4]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
# Each part gets a fresh 0..n-1 index.
df_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=11)
)

# Binary target: 1 where status is 'default', 0 otherwise.
# pop() returns the column and removes it from the feature frame in one step.
y_train = (df_train.pop('status') == 'default').astype('int').values
y_test = (df_test.pop('status') == 'default').astype('int').values

In [5]:
# Vectorizer producing a dense (sparse=False) feature matrix from row dicts.
dv = DictVectorizer(sparse=False)

# Missing values are filled with 0 before each row is turned into a dict.
train_dicts = df_train.fillna(value=0).to_dict(orient='records')
test_dicts = df_test.fillna(value=0).to_dict(orient='records')

# Fit on the training rows only, then reuse the fitted vectorizer for the test rows.
X_train = dv.fit(train_dicts).transform(train_dicts)
X_test = dv.transform(test_dicts)

### Random forest

In [6]:
# Random forest hyperparameters; random_state pins the randomness of training.
rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_leaf': 3,
    'random_state': 1,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

### XGBoost

Note:

We removed the feature names when creating the `DMatrix`.

It was 

```python
features = dv.get_feature_names_out()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
```

Now it's

```python
dtrain = xgb.DMatrix(X_train, label=y_train)
```

In [7]:
# Feature names are deliberately not passed to the DMatrix (see the note above).
dtrain = xgb.DMatrix(X_train, label=y_train)

In [8]:
# Booster settings, grouped as: tree growth, learning task, runtime.
xgb_params = dict(
    eta=0.1,
    max_depth=3,
    min_child_weight=1,
    objective='binary:logistic',
    eval_metric='auc',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Train the booster for a fixed 175 rounds on the training matrix.
model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=175)

### BentoML

In [9]:
import bentoml

### Q1: What is the installed version of BentoML?

In [10]:
# Q1: display the installed BentoML version as the cell output.
bentoml.__version__

'1.0.7'

In [11]:
# Save the trained booster under the name 'credit_risk_model'; the fitted
# DictVectorizer is attached as a custom object under the 'dictVectorizer' key.
bentoml.xgboost.save_model(
    'credit_risk_model', model, custom_objects={'dictVectorizer': dv}
)

Model(tag="credit_risk_model:sbwk65cuq67guqo7", path="/home/meriem/bentoml/models/credit_risk_model/sbwk65cuq67guqo7/")

### Q2: Approximately how big is the saved BentoML model? The size can vary slightly depending on your local development environment. Choose the size closest to your model.

In [12]:
!bentoml models list 

[1m [0m[1mTag                         [0m[1m [0m[1m [0m[1mModule         [0m[1m [0m[1m [0m[1mSize      [0m[1m [0m[1m [0m[1mCreation Time      [0m[1m [0m
 credit_risk_model:sbwk65cuq…  bentoml.xgboost  197.77 KiB  2022-10-25 18:07:53 
 credit_risk_model:vyzteisul…  bentoml.xgboost  197.77 KiB  2022-10-25 12:39:26 
 credit_risk_model:34m2lzst4…  bentoml.xgboost  197.77 KiB  2022-10-24 22:28:58 
 credit_risk_model:2t53mestz…  bentoml.xgboost  197.77 KiB  2022-10-24 19:29:44 
 mlzoomcamp_homework:jsi67fs…  bentoml.sklearn  5.82 KiB    2022-10-14 15:48:43 
 mlzoomcamp_homework:qtzdz3s…  bentoml.sklearn  5.79 KiB    2022-10-13 21:42:14 


In [13]:
!bentoml models get credit_risk_model:2t53mestzgsce6lh

[38;2;249;38;114;48;2;39;40;34mname[0m[38;2;248;248;242;48;2;39;40;34m:[0m[38;2;248;248;242;48;2;39;40;34m [0m[48;2;39;40;34mcredit_risk_model[0m[48;2;39;40;34m                                                         [0m
[38;2;249;38;114;48;2;39;40;34mversion[0m[38;2;248;248;242;48;2;39;40;34m:[0m[38;2;248;248;242;48;2;39;40;34m [0m[48;2;39;40;34m2t53mestzgsce6lh[0m[48;2;39;40;34m                                                       [0m
[38;2;249;38;114;48;2;39;40;34mmodule[0m[38;2;248;248;242;48;2;39;40;34m:[0m[38;2;248;248;242;48;2;39;40;34m [0m[48;2;39;40;34mbentoml.xgboost[0m[48;2;39;40;34m                                                         [0m
[38;2;249;38;114;48;2;39;40;34mlabels[0m[38;2;248;248;242;48;2;39;40;34m:[0m[38;2;248;248;242;48;2;39;40;34m [0m[48;2;39;40;34m{[0m[48;2;39;40;34m}[0m[48;2;39;40;34m                                                                      [0m
[38;2;249;38;114;48;2;39;40;34moptions[0m[38;2;24

credit_risk_model:6lkhfmcs…  bentoml.xgboost  197.77 KiB  2022-10-22 19:18:37 

Answer : 197.77 KiB 

###  Q3 : Say you have the following data that you're sending to your service:

{
  "name": "Tim",
  "age": 37,
  "country": "US",
  "rating": 3.14
}

What would the pydantic class look like?

Answer :

```python
class UserProfile(BaseModel):
    name: str
    age: int
    country: str
    rating: float
```


### Q4 : We've prepared a model for you that you can import using:

curl -O https://s3.us-west-2.amazonaws.com/bentoml.com/mlzoomcamp/coolmodel.bentomodel

bentoml models import coolmodel.bentomodel

What version of scikit-learn was this model trained with?

In [14]:
!bentoml models get mlzoomcamp_homework:qtzdz3slg6mwwdu5

[38;2;249;38;114;48;2;39;40;34mname[0m[38;2;248;248;242;48;2;39;40;34m:[0m[38;2;248;248;242;48;2;39;40;34m [0m[48;2;39;40;34mmlzoomcamp_homework[0m[48;2;39;40;34m                                                       [0m
[38;2;249;38;114;48;2;39;40;34mversion[0m[38;2;248;248;242;48;2;39;40;34m:[0m[38;2;248;248;242;48;2;39;40;34m [0m[48;2;39;40;34mqtzdz3slg6mwwdu5[0m[48;2;39;40;34m                                                       [0m
[38;2;249;38;114;48;2;39;40;34mmodule[0m[38;2;248;248;242;48;2;39;40;34m:[0m[38;2;248;248;242;48;2;39;40;34m [0m[48;2;39;40;34mbentoml.sklearn[0m[48;2;39;40;34m                                                         [0m
[38;2;249;38;114;48;2;39;40;34mlabels[0m[38;2;248;248;242;48;2;39;40;34m:[0m[38;2;248;248;242;48;2;39;40;34m [0m[48;2;39;40;34m{[0m[48;2;39;40;34m}[0m[48;2;39;40;34m                                                                      [0m
[38;2;249;38;114;48;2;39;40;34moptions[0m[38;2;24

Answer : 1.1.1 

### Q5 :
Create a bento out of this scikit-learn model. The output type for this endpoint should be NumpyNdarray()

Send this array to the Bento:

[[6.4,3.5,4.5,1.2]]
You can use curl or the Swagger UI. What value does it return?

In [15]:
# Answer: 1

### Question 6 :
Make sure to serve your bento with `--production` for this question.

Install locust using:

pip install locust
Use the following locust file: locustfile.py

Ensure that it is pointed at your bento's endpoint (In case you didn't name your endpoint "classify")

Configure 100 users with ramp time of 10 users per second. Click "Start Swarming" and ensure that it is working.

Now download a second model with this command:

curl -O https://s3.us-west-2.amazonaws.com/bentoml.com/mlzoomcamp/coolmodel2.bentomodel
Or you can download with this link as well: https://s3.us-west-2.amazonaws.com/bentoml.com/mlzoomcamp/coolmodel2.bentomodel

Now import the model:

bentoml models import coolmodel2.bentomodel
Update your bento's runner tag and test with both models. Which model allows more traffic (more throughput) as you ramp up the traffic?

Hint 1: Remember to turn off and turn on your bento service between changing the model tag. Use Ctrl-C to close the service in between trials.

Hint 2: Increase the number of concurrent users to see which one has higher throughput

Which model has better performance at higher volumes?

The first model

The second model

Answer : second model 