### In this demo we train a simple XGBoost model and deploy it to MemSQL as a UDF (user-defined function).

In [1]:
import xgboost as xgb
import pandas as pd
import memsql
import numpy as np

### Reading data
#### We use the same dataset as the Amazon SageMaker [tutorial](https://aws.amazon.com/getting-started/hands-on/build-train-deploy-machine-learning-model-sagemaker/).

In [2]:
# Read the cleaned bank-marketing CSV; its first column becomes the DataFrame index.
all_data = pd.read_csv('../bank_clean.csv', index_col=0)
# shape[0] is the row count of the loaded frame.
print("number of rows:", all_data.shape[0])
# Preview the first rows as the cell's rich output.
all_data.head()

number of rows: 41188


Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin,job_blue_collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


### Splitting the data: 80% as the train set, 20% as the test set.

In [3]:
# Reproducible 80% random sample for training; random_state is the seed value.
train_data = all_data.sample(frac=0.8, random_state=200)
# Every row that was not sampled into the train set goes to the test set.
test_data = all_data.drop(index=train_data.index)

In [4]:
# Report the size of each split (row count is the first entry of .shape).
print("number of rows in train set:", train_data.shape[0])
print("number of rows in  test set:", test_data.shape[0])

number of rows in train set: 32950
number of rows in  test set: 8238


### Converting the pandas DataFrames to NumPy arrays for xgboost

In [5]:
# Features: every column except the two one-hot target columns.
X_train = train_data.drop(columns=['y_no', 'y_yes']).to_numpy()
X_test = test_data.drop(columns=['y_no', 'y_yes']).to_numpy()

# Labels: the 'y_yes' indicator column.
y_train = train_data.loc[:, 'y_yes'].to_numpy()
y_test = test_data.loc[:, 'y_yes'].to_numpy()

### Train the model

In [6]:
# Classifier created with the library's default hyperparameters.
clf = xgb.XGBClassifier()
# Fit on the train split. The test split is passed as the evaluation set and scored with
# the 'error' metric; early_stopping_rounds=10 and verbose=0 are passed straight to fit().
# NOTE(review): the same test split drives early stopping and is later treated as
#   held-out data — consider a separate validation split.
# NOTE(review): newer xgboost releases appear to expect eval_metric and
#   early_stopping_rounds on the XGBClassifier constructor instead of fit() —
#   confirm against the installed xgboost version.
# The trailing ';' suppresses the cell's output.
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='error', early_stopping_rounds=10, verbose=0);

### Let's inspect the model
Here we inspect only the last tree that was built.

In [7]:
# Underlying Booster object of the fitted classifier.
booster = clf.get_booster()
# get_dump() yields one text dump per tree; show only the last one.
tree_dumps = booster.get_dump()
print(tree_dumps[-1])

0:[f0<82.5] yes=1,no=2,missing=1
	1:[f0<81.5] yes=3,no=4,missing=3
		3:[f21<0.5] yes=7,no=8,missing=7
			7:[f0<49.5] yes=15,no=16,missing=15
				15:[f22<0.5] yes=21,no=22,missing=21
					21:[f0<48.5] yes=31,no=32,missing=31
						31:leaf=-0.000693206384
						32:leaf=-0.098523736
					22:[f42<0.5] yes=33,no=34,missing=33
						33:leaf=-0.0812441334
						34:leaf=0.0961404517
				16:[f19<0.5] yes=23,no=24,missing=23
					23:[f55<0.5] yes=35,no=36,missing=35
						35:leaf=-0.051924251
						36:leaf=0.0439539962
					24:[f0<52.5] yes=37,no=38,missing=37
						37:leaf=0.0684938058
						38:leaf=0.0101644313
			8:[f1<1.5] yes=17,no=18,missing=17
				17:[f0<35.5] yes=25,no=26,missing=25
					25:leaf=0.0301328655
					26:leaf=0.411336005
				18:[f3<0.5] yes=27,no=28,missing=27
					27:leaf=-0.119725056
					28:leaf=0.081386067
		4:[f22<0.5] yes=9,no=10,missing=9
			9:leaf=-0.0818154588
			10:leaf=0.422050208
	2:[f33<0.5] yes=5,no=6,missing=5
		5:[f54<0.5] yes=11,no=12,missing=11
			11:[f30<0.

It's hard to read the model without feature names, so let's add them.

In [8]:
# Feature names are the train-set columns minus the two target columns,
# in the same order as the matrix the model was trained on.
feature_columns = train_data.drop(columns=['y_yes', 'y_no']).columns
booster.feature_names = feature_columns.tolist()
# Dump the last tree again, now with readable split names.
print(booster.get_dump()[-1])

0:[age<82.5] yes=1,no=2,missing=1
	1:[age<81.5] yes=3,no=4,missing=3
		3:[marital_unknown<0.5] yes=7,no=8,missing=7
			7:[age<49.5] yes=15,no=16,missing=15
				15:[education_basic_4y<0.5] yes=21,no=22,missing=21
					21:[age<48.5] yes=31,no=32,missing=31
						31:leaf=-0.000693206384
						32:leaf=-0.098523736
					22:[month_aug<0.5] yes=33,no=34,missing=33
						33:leaf=-0.0812441334
						34:leaf=0.0961404517
				16:[marital_married<0.5] yes=23,no=24,missing=23
					23:[day_of_week_wed<0.5] yes=35,no=36,missing=35
						35:leaf=-0.051924251
						36:leaf=0.0439539962
					24:[age<52.5] yes=37,no=38,missing=37
						37:leaf=0.0684938058
						38:leaf=0.0101644313
			8:[campaign<1.5] yes=17,no=18,missing=17
				17:[age<35.5] yes=25,no=26,missing=25
					25:leaf=0.0301328655
					26:leaf=0.411336005
				18:[previous<0.5] yes=27,no=28,missing=27
					27:leaf=-0.119725056
					28:leaf=0.081386067
		4:[education_basic_4y<0.5] yes=9,no=10,missing=9
			9:leaf=-0.0818154588
			10:leaf=0.42205

### Connect to MemSQL (see the README for instructions on creating a MemSQL host)

In [9]:
import os

from memsql.common import database

# Connection settings. Each value is read from an environment variable when it is set,
# so real credentials never have to be typed into (and saved with) the notebook.
# When a variable is unset, the original placeholder/default below is used instead.
memsql_host = os.environ.get("MEMSQL_HOST", "YOUR MEMSQL HOST HERE")
memsql_port = int(os.environ.get("MEMSQL_PORT", "3306"))  # YOUR MEMSQL PORT HERE
memsql_user = os.environ.get("MEMSQL_USER", "YOUR USERNAME HERE")
memsql_password = os.environ.get("MEMSQL_PASSWORD", "YOUR PASSWORD HERE")

memsql_conn = database.connect(
    host=memsql_host, port=memsql_port,
    user=memsql_user, password=memsql_password)

# Create the demo database if needed and select it for all following queries.
memsql_conn.query('CREATE DATABASE IF NOT EXISTS testsm')
# The trailing ';' suppresses the cell's output.
memsql_conn.query('USE testsm');

### Deploy model

In [10]:
# Project-local helper module for publishing models to MemSQL.
import lib.memsql_udf as udf_tool
# Upload the booster (with the feature names assigned above) over the open connection
# under the name 'model1'.
# NOTE(review): presumably this creates one UDF per tree plus a 'model1' entry point,
#   and allow_overwrite=True lets a re-run replace existing functions — confirm in
#   lib/memsql_udf.
udf_tool.upload_xgb_to_memsql(booster, memsql_conn, 'model1', allow_overwrite=True)

In [11]:
# List the database functions whose names start with 'model1_'.
rows = memsql_conn.query("SHOW FUNCTIONS LIKE 'model1_%'")
# Turn each result row into a plain dict so pandas can tabulate it.
functions_df = pd.DataFrame(list(map(dict, rows)))
functions_df.head()

Unnamed: 0,Functions_in_testsm (model1_%),Function Type,Definer
0,model1_tree0,User Defined Function,admin@%
1,model1_tree1,User Defined Function,admin@%
2,model1_tree10,User Defined Function,admin@%
3,model1_tree11,User Defined Function,admin@%
4,model1_tree12,User Defined Function,admin@%


### LOAD data into the table

In [12]:
import lib.memsql_csv as csv_tool

# Start from a clean slate so the cell can be re-run.
memsql_conn.query("DROP TABLE IF EXISTS bank")

# Column list for the table: an "id" column followed by the DataFrame's columns.
# NOTE(review): "id" presumably names the CSV's leading column, which pandas used as
#   the index (index_col=0) — confirm against lib/memsql_csv.
bank_columns = ["id", *all_data.columns]
csv_tool.load_csv_to_table("../bank_clean.csv", "bank", bank_columns, memsql_conn)

### Let's select a few rows of data to predict

In [13]:
# Fetch the feature columns of the first 10 rows (ordered by id).
# The column list uses the same ', ' separator as the UDF query cell.
feature_list = ', '.join(booster.feature_names)
rows = memsql_conn.query(f"SELECT {feature_list} FROM bank ORDER BY id LIMIT 10;")

# Build the feature matrix by looking each value up by feature name, so the column
# order matches booster.feature_names by construction rather than depending on the
# order in which the driver happens to return a row's values.
arr = np.array([[row[name] for name in booster.feature_names] for row in rows])

### Let's first predict using our UDF 

In [14]:
# Call the deployed 'model1' UDF on the same 10 rows, passing the features as arguments.
feature_args = ', '.join(booster.feature_names)
query = f"SELECT model1({feature_args}) AS res FROM bank ORDER BY id LIMIT 10;"
res = memsql_conn.query(query)

# One prediction per line.
for udf_row in res:
    print(udf_row['res'])

0.0268150092913469
0.024671236979178977
0.0273293839174817
0.02556405448884422
0.035877220967041476
0.02429350499307467
0.053079061507028985
0.026184611219336133
0.028962693285142605
0.03391228325193492


### And compare the results with xgboost model predictions

In [15]:
# Wrap the rows fetched from MemSQL in a DMatrix carrying the booster's feature names,
# then score them locally with the original xgboost model.
sample_matrix = xgb.DMatrix(arr, feature_names=booster.feature_names)
actual_res = booster.predict(sample_matrix)

# One prediction per line, for side-by-side comparison with the UDF output.
for local_prediction in actual_res:
    print(local_prediction)

0.026815016
0.024671245
0.027329398
0.025564069
0.03587723
0.024293514
0.053079065
0.02618462
0.028962703
0.033912294
