# Description

<h3>In this demo we will train an XGBoost model on part of the TPC-H dataset and deploy it as a UDF (user-defined function). Then we will evaluate several queries that use the deployed UDF on this huge dataset.</h3>

<blockquote>
<h3>ATTENTION: for this demo it is recommended to have a cluster with more than 100 GB of disk space and 10 GB of memory</h3>
</blockquote>

In [1]:
import os
import time

import numpy as np
import pandas as pd
import xgboost
from IPython.display import clear_output
from memsql.common import database

from lib import memsql_udf

# Connecting to MemSQL

<h3>Please, enter your credentials into the cell below</h3>

In [2]:
# Connection settings for the MemSQL cluster.
# Each value can be supplied through an environment variable so that real
# credentials never have to be saved inside the notebook. The fallbacks are the
# original placeholder values and can still be edited in place.
memsql_host = os.environ.get("MEMSQL_HOST", "<your_memsql_host>")
memsql_port = int(os.environ.get("MEMSQL_PORT", "3306"))
memsql_user = os.environ.get("MEMSQL_USER", "root")
memsql_password = os.environ.get("MEMSQL_PASSWORD", "")

In [3]:
# Open the connection that the remaining cells use to run queries.
memsql_conn = database.connect(
    host=memsql_host,
    port=memsql_port,
    user=memsql_user,
    password=memsql_password,
)

# Loading Data from S3 into MemSQL using pipelines

In [4]:
# Start from a clean slate: drop any existing `tpch` database, recreate it and
# make it the current database for the statements that follow.
memsql_conn.query('DROP DATABASE IF EXISTS tpch')
memsql_conn.query('CREATE DATABASE tpch')
memsql_conn.query('USE tpch')
# Create the `lineitem` table. The SHARD KEY clause declares it as a clustered
# columnstore sharded on `l_orderkey`.
memsql_conn.query(
    '''
    CREATE TABLE `lineitem` (
    `l_orderkey` bigint(11) NOT NULL,
    `l_partkey` int(11) NOT NULL,
    `l_suppkey` int(11) NOT NULL,
    `l_linenumber` int(11) NOT NULL,
    `l_quantity` decimal(15,2) NOT NULL,
    `l_extendedprice` decimal(15,2) NOT NULL,
    `l_discount` decimal(15,2) NOT NULL,
    `l_tax` decimal(15,2) NOT NULL,
    `l_returnflag` char(1) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
    `l_linestatus` char(1) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
    `l_shipdate` date NOT NULL,
    `l_commitdate` date NOT NULL,
    `l_receiptdate` date NOT NULL,
    `l_shipinstruct` char(25) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
    `l_shipmode` char(10) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
    `l_comment` varchar(44) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
    SHARD KEY (`l_orderkey`) USING CLUSTERED COLUMNSTORE
    )
    '''
)
# Define (or redefine) a pipeline that loads the '|'-delimited files found under
# the S3 path below (region us-east-1) into the `lineitem` table.
memsql_conn.query(
    '''
    CREATE OR REPLACE PIPELINE tpch_100_lineitem
        AS LOAD DATA S3 'memsql-tpch-dataset/sf_100/lineitem/'
        config '{"region":"us-east-1"}'
        SKIP DUPLICATE KEY ERRORS
        INTO TABLE lineitem
        FIELDS TERMINATED BY '|'
        LINES TERMINATED BY '|\n';
    '''
)
# Start the pipeline. NOTE(review): this presumably returns before the load has
# finished, which is why the next cell polls the row count - confirm.
memsql_conn.query("START ALL PIPELINES")

0

<h3> Wait until all the data is loaded. With a Free Trial cluster started in <a href="https://www.memsql.com/helios/" >Helios</a>, this process takes around 7 minutes.</h3>

In [5]:
%%time
# Poll the table until the pipeline has ingested the whole dataset.
all_rows = 538837902  # total number of rows the pipeline is expected to load
while True:
    # Query first so the progress line always shows the current count.
    loaded_rows = memsql_conn.query("SELECT COUNT(*) AS size FROM lineitem")[0]["size"]
    clear_output(wait=True)
    print(f'loaded {loaded_rows} rows out of {all_rows}')
    # Stop on `>=` rather than exact equality: if the table ever holds more rows
    # than expected, an equality check would never become true and the cell
    # would spin forever.
    if loaded_rows >= all_rows:
        break
    time.sleep(1)

loaded 538837902 rows out of 538837902
CPU times: user 1.97 s, sys: 424 ms, total: 2.4 s
Wall time: 9min 9s


# Load a small part of the data from MemSQL and prepare it for training

In [6]:
# Columns fed to the model and the column the model is trained to predict.
feature_columns = ['l_partkey', 'l_suppkey', 'l_quantity', 'l_discount', 'l_tax']
prediction_column = 'l_extendedprice'
# Plain list concatenation keeps this a list of `str`. `np.append` would turn
# the names into a NumPy unicode array, which is unnecessary for building a
# column list that is only joined into a SQL string.
all_columns = feature_columns + [prediction_column]

In [7]:
# Fetch 50000 rows containing only the feature and target columns.
selected_columns = ", ".join(all_columns)
all_data = memsql_conn.query(
    f'SELECT {selected_columns} FROM lineitem LIMIT 50000'
)

In [8]:
def query_result_to_df(res):
    """Convert a query result into a pandas DataFrame.

    `res` is iterated as a sequence of mapping-like rows; each row must provide
    `keys()` and item access. Values are collected column by column, and the
    columns appear in the order in which their names are first encountered.
    """
    columns = {}
    for record in res:
        for name in record.keys():
            columns.setdefault(name, []).append(record[name])

    return pd.DataFrame(columns)

# Turn the fetched rows into a DataFrame and preview the first few records.
all_data_df = query_result_to_df(all_data)
all_data_df.head()

Unnamed: 0,l_partkey,l_suppkey,l_quantity,l_discount,l_tax,l_extendedprice
0,13509257,9284,33.0,0.03,0.08,41764.14
1,10068229,568250,37.0,0.08,0.05,44278.64
2,16715007,715008,37.0,0.01,0.02,37783.29
3,9111442,611461,23.0,0.04,0.01,33418.77
4,5204300,954316,39.0,0.03,0.05,46957.56


In [9]:
# 80/20 split: draw the training rows with a fixed seed (random_state), then
# keep every row that was not drawn as the test set.
train_data = all_data_df.sample(frac=0.8, random_state=200)
held_out_rows = ~all_data_df.index.isin(train_data.index)
test_data = all_data_df.loc[held_out_rows]

In [10]:
# Report how many rows ended up in each split.
print("number of rows in train set:", train_data.shape[0])
print("number of rows in  test set:", test_data.shape[0])

number of rows in train set: 40000
number of rows in  test set: 10000


In [11]:
# Separate the feature matrix from the target vector for each split.
train_features = train_data.drop(columns=[prediction_column])
test_features = test_data.drop(columns=[prediction_column])

X_train, y_train = train_features.to_numpy(), train_data[prediction_column].to_numpy()
X_test, y_test = test_features.to_numpy(), test_data[prediction_column].to_numpy()

# Actually Training Model

In [12]:
# Train a gradient-boosted regressor on the sampled rows.
# The evaluation metric is 'rmse': the previously used 'error' metric is the
# binary-classification error rate and is meaningless for a regression target.
# It is set on the estimator rather than in fit() because recent XGBoost
# releases no longer accept `eval_metric` as a fit() argument.
rgr = xgboost.XGBRegressor(eval_metric='rmse')
rgr.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0)
# Attach the column names to the underlying booster, which is the object
# uploaded to MemSQL in the next step.
booster = rgr.get_booster()
booster.feature_names = feature_columns

# Deploying Model to MemSQL

In [13]:
# Upload the trained booster to MemSQL through the project helper.
# NOTE(review): the queries further down call `apply_trees(...)`, presumably the
# function created by this upload; the meaning of `F.SUM` is defined in
# lib.memsql_udf - confirm there.
memsql_udf.upload_xgb_to_memsql(booster, feature_columns, memsql_conn, memsql_udf.F.SUM)

# Adventure time

<h3> Let's try some interesting queries. <a href="https://en.wikipedia.org/wiki/Mean_squared_error"> MSE </a> is a perfect choice for the beginning. This query will compute the UDF on more than $5*10^8$ rows and then compute the MSE.</h3>

In [14]:
%%time
# Mean squared error between the stored value and the UDF prediction, computed
# over the whole `lineitem` table inside the database.
udf_call = f'apply_trees({", ".join(feature_columns)})'
mse_sql = f'SELECT AVG(POW({prediction_column}-{udf_call}, 2)) AS MSE FROM lineitem'
res = memsql_conn.query(mse_sql)

query_result_to_df(res).head()

CPU times: user 30.9 ms, sys: 9.19 ms, total: 40.1 ms
Wall time: 13min 52s


Unnamed: 0,MSE
0,71584700.0


<h3>Get predictions for 5 rows</h3>

In [15]:
%%time
# Compare the stored value with the UDF prediction for 5 rows.
# The UDF call is built once and reused for both the `predicted` and the
# `difference` columns.
udf_call = f'apply_trees({", ".join(feature_columns)})'
res = memsql_conn.query(
    ' '.join([
        'SELECT',
        # Every fragment is its own list item. A missing comma here used to make
        # Python silently concatenate this fragment with the next one.
        f'{prediction_column} AS expected,',
        f'{udf_call} AS predicted,',
        f'ABS({prediction_column} - {udf_call}) AS difference',
        'FROM lineitem',
        'LIMIT 5',
    ])
)

print("5 random predictions:")
query_result_to_df(res).head()

5 random predictions:
CPU times: user 0 ns, sys: 6.88 ms, total: 6.88 ms
Wall time: 1.61 s


Unnamed: 0,expected,predicted,difference
0,65086.23,51014.148041,14072.081959
1,9909.78,8169.488541,1740.291459
2,29044.68,31319.485516,2274.805516
3,82654.5,71788.248732,10866.251268
4,11520.04,11354.590266,165.449734


<h3> Get 5 best predictions</h3>

In [16]:
%%time
# The 5 rows with the smallest absolute difference between the stored value and
# the UDF prediction. The UDF call is built once and reused for both the
# `predicted` and the `difference` columns.
udf_call = f'apply_trees({", ".join(feature_columns)})'
res = memsql_conn.query(
    ' '.join([
        'SELECT',
        # Every fragment is its own list item. A missing comma here used to make
        # Python silently concatenate this fragment with the next one.
        f'{prediction_column} AS expected,',
        f'{udf_call} AS predicted,',
        f'ABS({prediction_column} - {udf_call}) AS difference',
        'FROM lineitem',
        'ORDER BY difference ASC',
        'LIMIT 5',
    ])
)

print("5 best predictions:")
query_result_to_df(res).head()

5 best predictions:
CPU times: user 35.4 ms, sys: 4.96 ms, total: 40.3 ms
Wall time: 13min 14s


Unnamed: 0,expected,predicted,difference
0,1831.95,1831.949999,1e-06
1,43967.77,43967.770008,8e-06
2,47588.41,47588.410012,1.2e-05
3,63443.4,63443.400013,1.3e-05
4,29410.0,29410.000038,3.8e-05


<h3> Get 5 worst predictions</h3>

In [17]:
%%time
# The 5 rows with the largest absolute difference between the stored value and
# the UDF prediction. The UDF call is built once and reused for both the
# `predicted` and the `difference` columns.
udf_call = f'apply_trees({", ".join(feature_columns)})'
res = memsql_conn.query(
    ' '.join([
        'SELECT',
        # Every fragment is its own list item. A missing comma here used to make
        # Python silently concatenate this fragment with the next one.
        f'{prediction_column} AS expected,',
        f'{udf_call} AS predicted,',
        f'ABS({prediction_column} - {udf_call}) AS difference',
        'FROM lineitem',
        'ORDER BY difference DESC',
        'LIMIT 5',
    ])
)

print("5 worst predictions:")
query_result_to_df(res).head()

5 worst predictions:
CPU times: user 36.8 ms, sys: 671 µs, total: 37.5 ms
Wall time: 12min 3s


Unnamed: 0,expected,predicted,difference
0,102250.5,43836.088301,58414.411699
1,101970.0,44940.312008,57029.687992
2,101219.5,44425.953623,56793.546377
3,101299.5,45150.427655,56149.072345
4,101049.5,45150.427655,55899.072345
