# RTSMADlib  - Plpython Model Demo

## Linear Regression Model Sample and Model Deployment

In this sample we demonstrate how to build python models in Greenplum database and deploy that model to Kubernetes using RTSMADlib tooling. 


## Pre-Requisites
1. A running instance of Greenplum 6 latest
2. Install plcontainer add-on feature. 
3. A running instance of a Kubernetes environment

The PL/Container feature makes it possible to run non-database-native code in secured, isolated Docker containers in distributed mode.
In this demo we use the plpython3u (Python 3.7.5) version of the images. To learn more about PL/Container, please visit https://gpdb.docs.pivotal.io/6-6/analytics/pl_container.html

### The notebook performs the tasks below

1. Connect to greenplum and setup the session
2. Create the schema and all the tables needed, and load the test data into Greenplum
3. Build and train the model using Python in the Greenplum database
4. Batch-score the model as a test
5. Operationalize model with RTSMADlib
6. Test Model REST Service
7. Undeploy the Model container

#### To learn more about ML/AI in Greenplum please visit https://gpdb.docs.pivotal.io/6-6/analytics/overview.html

#### If you need to install a quick Greenplum environment, please visit the AWS/GCP/Azure marketplace and provision a single-node install.

### Create SQL Connection to Greenplum

In [None]:
import psycopg2               # Python-PostgreSQL Database Adapter - https://pypi.python.org/pypi/psycopg2
import pandas as pd           # Python Data Analysis Library - https://pandas.pydata.org/
import math  
import json

%load_ext sql

# PLEASE MODIFY THE BELOW AS PER YOUR GREENPLUM CLUSTER SETTINGS
database_host = 'ec2-54-196-119-66.compute-1.amazonaws.com'
database_databasename = 'dev'
database_username = 'gpbot'
database_password = 'GpRocks@20'
database_port = '5432'

try:
    connString = "host='{}' dbname='{}' user='{}' password='{}' port={}".format(database_host,database_databasename,database_username,database_password,database_port)
    # print connString
    conn = psycopg2.connect(connString)
    cur = conn.cursor()
    conn.autocommit = True
        
    connection = 'postgresql://{}:{}@{}:{}/{}'.format(database_username,database_password,database_host,database_port,database_databasename)
    %sql $connection

    message = "<span style='color:green'>**Connection successful!**</span>"
    print(message)
except Exception as e:
    message = "<span style='color:red'>**ERROR: Unable to connect to the database ({})**</span>".format(e)
    print(message) 

# Create a model repository schema 
 ### In this step we create the schema pyml and a model repository table. This table is used to store models in the database so that they can be read and executed. It also supports model versioning.

In [None]:
%%sql

-- Start from a clean slate: remove any earlier pyml schema and everything in it.
DROP SCHEMA IF EXISTS pyml CASCADE;

CREATE SCHEMA pyml;

-- Model repository table used for Python models: one row per stored model.
CREATE TABLE pyml.model_repo (
    id                SERIAL PRIMARY KEY,  -- surrogate key
    model_name        TEXT,                -- name the model is looked up by
    model             BYTEA NOT NULL,      -- serialized model bytes
    model_description TEXT NOT NULL,       -- free-text description
    model_version     INT NOT NULL         -- version number stored with the model
);



# verify the plpython plcontainer environment

In [None]:
%%sql


-- Composite row type returned by pyml.pymodsversions(): one (module, version) pair.
-- It is dropped first (CASCADE also removes the dependent function, which is
-- recreated right below) so that this cell can be re-run on its own.
drop type if exists pyml.pymodule_info cascade;

create type pyml.pymodule_info as(mod_name text, mod_version text);

-- Lists the Python interpreter and data science module versions available
-- inside the plc_python3_shared container.
create or replace function pyml.pymodsversions() returns 
	setof pyml.pymodule_info
as
$$
	# container: plc_python3_shared
	"""
		Returns one row per component (Python, numpy, scipy, pandas, sklearn)
		with the version found in the container, each as a plain text string.
	"""
	import sys
	import numpy as np
	import scipy as sc
	import pandas as pd
	import sklearn as sk
	# sys.version_info is a struct, not a string; format it as "major.minor.micro"
	# so that mod_version holds a readable version like the other rows.
	python_version = "%d.%d.%d" % sys.version_info[:3]
	version_list = [
		{"mod_name" : "Python", "mod_version" : python_version},
		{"mod_name" : "numpy", "mod_version" : np.__version__},
		{"mod_name" : "scipy", "mod_version" : sc.__version__},
		{"mod_name" : "pandas", "mod_version" : pd.__version__},
		{"mod_name" : "sklearn", "mod_version" : sk.__version__},
	]
	return version_list
$$ language plcontainer	;


select pyml.pymodsversions();

# Create  data 
### In this cell we create a schema and a table for the data, and create views to split the data into training and test sets.

In [None]:
%%sql

-- Rebuild the demo schema from scratch.
DROP SCHEMA IF EXISTS plcpymldemo CASCADE;

CREATE SCHEMA plcpymldemo;

-- Data table: years of experience and the matching salary.
CREATE TABLE plcpymldemo.employee_salary (
    id                  SERIAL,
    years_of_experience FLOAT,
    salary              FLOAT
) DISTRIBUTED RANDOMLY;

-- Sample rows used for training and testing the model.
INSERT INTO plcpymldemo.employee_salary (years_of_experience, salary)
VALUES
    (1.1, 39343.00),
    (1.3, 46205.00),
    (1.5, 37731.00),
    (2.0, 43525.00),
    (2.2, 39891.00),
    (2.9, 56642.00),
    (3.0, 60150.00),
    (3.2, 54445.00),
    (3.2, 64445.00),
    (3.7, 57189.00),
    (3.9, 63218.00),
    (4.0, 55794.00),
    (4.0, 56957.00),
    (4.1, 57081.00),
    (4.5, 61111.00),
    (4.9, 67938.00),
    (5.1, 66029.00),
    (5.3, 83088.00),
    (5.9, 81363.00),
    (6.0, 93940.00),
    (6.8, 91738.00),
    (7.1, 98273.00),
    (7.9, 101302.00),
    (8.2, 113812.00),
    (8.7, 109431.00),
    (9.0, 105582.00),
    (9.5, 116969.00),
    (9.6, 112635.00),
    (10.3, 122391.00),
    (10.5, 121872.00);


-- Create views that separate the test data from the training data.
-- comp is a bucket number in 0..99 derived from a hash of the id column; the
-- two views below split the rows on that bucket, roughly 70/30.

-- Drop the base view under its real name (CASCADE also removes the two views
-- built on top of it) so that this section can be re-run on its own.
drop view if exists plcpymldemo.employee_salary_lr cascade;

create view plcpymldemo.employee_salary_lr as 
	select
		id,
		years_of_experience,
		salary,
		abs(hashtext(id::text) % 100) as comp
	from plcpymldemo.employee_salary;

-- Training data: buckets 0..70 (about 70% of the rows)

create view plcpymldemo.employee_salary_lr_training as select * from plcpymldemo.employee_salary_lr where comp <=70;

-- Test data: the remaining buckets 71..99 (about 30% of the rows)

create view plcpymldemo.employee_salary_lr_test as select * from plcpymldemo.employee_salary_lr where comp >70;


# create plpython model

### We create a plpython3u-based function that fits a single-variable linear regression. This function returns a byte array representing the pickled model.

In [None]:
%%sql

create or replace function plcpymldemo.employee_salary_LR_model() returns bytea as
$$
	# container: plc_python3_shared
	"""
	Usage: simple linear regression demo.

	Fits a scikit-learn LinearRegression that predicts salary from
	years_of_experience, using the rows of the view
	plcpymldemo.employee_salary_lr_training.

	Returns: the fitted model serialized with pickle.dumps (bytea).
	Raises:  ValueError when the training view returns no rows.
	"""
	import pandas as pd
	from pickle import dumps
	from sklearn.linear_model import LinearRegression
	# Load the training data from the view.
	tableData = plpy.execute('select years_of_experience, salary from plcpymldemo.employee_salary_lr_training')
	df = pd.DataFrame([rec for rec in tableData])
	if df.empty:
		raise ValueError('no training rows found in plcpymldemo.employee_salary_lr_training')
	# Columns are selected by name rather than by position, so the fit does not
	# depend on the column order pandas derives from the result rows.
	# Independent variable (feature): years_of_experience, kept 2-D for sklearn.
	x = df[['years_of_experience']].values
	# Dependent variable (target): salary.
	y = df['salary'].values
	# Fit the model.
	regressor = LinearRegression()
	regressor.fit(x, y)
	return dumps(regressor)
$$ language plcontainer	;


# Model Store Function
## Here we create a function that executes the model function and stores the resulting model in the model repo table. The name of the model function is passed as an argument to this function.

In [None]:
%%sql

create or replace function plcpymldemo.model_store(modelFunction text, modelName text, modelDesc text, modelVersion int) 
returns void as
$$
/**
 * Usage: runs a model-building function and stores the model it returns in
 * the pyml.model_repo table.
 *
 *   modelFunction  name (optionally schema-qualified) of a zero-argument
 *                  function that returns the serialized model as bytea
 *   modelName      value stored in model_repo.model_name
 *   modelDesc      value stored in model_repo.model_description
 *   modelVersion   value stored in model_repo.model_version
 *
 * Raises an error when modelFunction is null or does not name an existing
 * zero-argument function.
 */
DECLARE
 v_model bytea;
BEGIN
 if modelFunction is null then
	raise exception 'modelFunction must not be null';
 end if;
 -- Resolve the name through regprocedure instead of splicing the raw text into
 -- the statement: the cast fails unless a zero-argument function of that name
 -- exists, and its text form is the properly quoted call "name()".
 execute format ('select %s', (modelFunction || '()')::regprocedure) into v_model ;
 insert into pyml.model_repo(model_name, model, model_description, model_version)
	values(modelName, v_model, modelDesc, modelVersion);
END
$$
language plpgsql;



# Store model
### invoke the model_store function to persist model

In [None]:
%%sql

-- Run the model function and persist its result as version 1 in the model repo.
SELECT plcpymldemo.model_store(
    'plcpymldemo.employee_salary_LR_model',
    'employee_salary_simple_linear_regression',
    'A simple model demo LR',
    1
)



# Model driver function
### This function is used to execute the model. Here we create a standard framework that helps run the model in the database as well as in Kubernetes. It lets us delegate the model execution driver to the model author so that we can seamlessly shift and load it into Kubernetes. We will see this in action in the model deployment cell.


In [None]:
%%sql
CREATE OR REPLACE FUNCTION plcpymldemo.employee_salary_lr_model_driver(model_name text,model_version int, input_table text, output_table text) 
RETURNS void
AS $$
	# container: plc_python3_shared
	"""
		Runs a stored model: loads it from the model repository, predicts for
		every row of the input table and writes the results to the output table.

		model_name, model_version : identify the model row in pyml.model_repo
		input_table  : payload table the model input is read from; the caller
		               inserts the payload there before invoking this function
		output_table : table that receives (years_of_exp, predicted_salary)

		Raises ValueError when no model with that name and version is stored.

		Example usage is;
		select plcpymldemo.employee_salary_lr_model_driver('employee_salary_simple_linear_regression', 1, 'plcpymldemo.employee_salary_predict_model_input', 'plcpymldemo.employee_salary_predict_model_output')
	"""
	from pickle import loads
	import pandas as pd
	# Read the model from the repository and deserialize it. If the same
	# name/version was stored more than once, the row with the highest id is
	# used so that the choice is deterministic.
	# NOTE(review): pickle.loads runs code embedded in the payload; only load
	# models from a repository you trust.
	splan = plpy.prepare("SELECT model FROM pyml.model_repo WHERE model_name = $1 and model_version = $2 ORDER BY id DESC LIMIT 1", ['text', 'integer'])
	rv = plpy.execute(splan, [model_name, model_version])
	if len(rv) == 0:
		raise ValueError('model %s version %s not found in pyml.model_repo' % (model_name, model_version))
	model = loads(rv[0]['model'])
	# NOTE(review): table names are identifiers and cannot be bound as query
	# parameters; they are spliced into the SQL text, so pass trusted names only.
	iqry  = 'insert into ' + output_table + '(years_of_exp, predicted_salary) values($1, $2)'
	iplan = plpy.prepare(iqry, ["float", "float"])
	# Read the model input (years of experience) from the input table.
	tableData = plpy.execute('SELECT * FROM %s ;' % (input_table))
	rows = [rec for rec in tableData]
	if not rows:
		# Nothing to score.
		return
	# Build the feature matrix once and predict for all rows in a single call
	# instead of creating a DataFrame and calling predict per row.
	features = pd.DataFrame(rows).values
	predicted = model.predict(features)
	for feature_row, pred in zip(features, predicted):
		# The first input column is stored as years_of_exp next to its prediction.
		plpy.execute(iplan, [float(feature_row[0]), float(pred)])
 $$
language plcontainer;

## Model test tables
### Here we create the tables needed to run the model: the model input data table and the model output table.

In [None]:
%%sql

-- Drop earlier copies first so this cell can be re-run without
-- "relation already exists" errors, like the other setup cells.
drop table if exists plcpymldemo.employee_salary_predict_model_input;
drop table if exists plcpymldemo.employee_salary_predict_model_output;

--create test payload table from the test split
create table plcpymldemo.employee_salary_predict_model_input as 
	select  years_of_experience 
		from plcpymldemo.employee_salary_lr_test;
--verify model input
select * from plcpymldemo.employee_salary_predict_model_input;

--create output table that the model driver writes its predictions to
create table plcpymldemo.employee_salary_predict_model_output(years_of_exp float, predicted_salary float);


# Run prediction 
### Here we run the model driver function with the model input and output tables created in the cell above.

In [None]:
%%sql
-- Score every row of the input table with version 1 of the stored model;
-- the driver writes the predictions to the output table.
SELECT plcpymldemo.employee_salary_lr_model_driver(
    'employee_salary_simple_linear_regression',
    1,
    'plcpymldemo.employee_salary_predict_model_input',
    'plcpymldemo.employee_salary_predict_model_output'
)


# verify model run results

In [None]:
%%sql
-- Show the predictions in a stable order (the table has no inherent row order).
select
    years_of_exp,
    predicted_salary
from plcpymldemo.employee_salary_predict_model_output
order by years_of_exp, predicted_salary;

# RTSMADlib

## Operationalize the  model 

The plpython model from Greenplum is containerized and deployed to a container management system. In this case we are using a local Docker environment. The rtsmadlib tool takes care of bundling, deploying and serving the model as a REST endpoint.

In [None]:
# Each "!" line runs in its own subshell, so the profile has to be sourced on
# the same command line as the tool that relies on it.
! source ~/.bash_profile && rts4madlib --help

# Deployment manifest of Model

In [None]:
import json

# Reuse the connection settings from the "Create SQL Connection" cell when it
# has been run in this kernel, so the manifest cannot drift from the settings
# the reader was asked to edit; otherwise fall back to the demo defaults.
_db_host = globals().get("database_host", "ec2-54-196-119-66.compute-1.amazonaws.com")
_db_port = globals().get("database_port", "5432")
_db_name = globals().get("database_databasename", "dev")
_db_user = globals().get("database_username", "gpbot")
_db_password = globals().get("database_password", "GpRocks@20")

# Deployment manifest: Python dependencies of the model container, the model
# repository to read the model from, and the driver function/query to run.
pymodel_config = json.dumps({
 "plpyrest.pydeps": "numpy==1.14.6,scipy==1.4.0,pandas==0.25.3,scikit-learn==0.22",
 "modeldb-datasource.jdbc-url" : "jdbc:postgresql://{}:{}/{}".format(_db_host, _db_port, _db_name),
 "modeldb-datasource.username" : _db_user,
 "modeldb-datasource.password" : _db_password,
 "plpyrest.modelreposchema" : "pyml",
 "plpyrest.modelrepotable" : "model_repo",
 "plpyrest.modelname" : "employee_salary_simple_linear_regression",
 "plpyrest.modelversion" : 1,
 "plpyrest.modeldescription" : "linear regression model with 1 dependent variable demo",
 "plpyrest.payloadtable" : "employee_salary_predict_model_input",
 "plpyrest.resultstable" : "employee_salary_predict_model_output",
 "plpyrest.modelschema" : "plcpymldemo",
 "plpyrest.modeldriverfunction" : "employee_salary_lr_model_driver",
 "plpyrest.modelquery" : "select plcpymldemo.employee_salary_lr_model_driver('employee_salary_simple_linear_regression', 1, 'plcpymldemo.employee_salary_predict_model_input', 'plcpymldemo.employee_salary_predict_model_output')"

})

# Write the manifest to the file that is passed to rts4madlib via --input.
with open("pymodel-config.json", "w") as f:
    f.write(pymodel_config)

# Deploy model to kubernetes

In [None]:
# Deploy the model described in pymodel-config.json to Kubernetes under the
# name plpylrmodel. The profile is sourced in the same command line because
# each "!" line runs in its own subshell.
! source ~/.bash_profile && rts4madlib --name plpylrmodel --action deploy --type plpy-model --target kubernetes --input pymodel-config.json

In [None]:
# List the Kubernetes resources to check what the deployment created.
! kubectl get all

# Verify pymodel container
The deployment logs should show the service endpoints of the container. We use an endpoint to test. The command below calls the information endpoint of the model container.

In [None]:
# Call the information endpoint of the model container.
# NOTE(review): 192.168.99.100:30123 is hard-coded; replace it with the service
# endpoint reported for your deployment.
! curl -v -H "Content-Type:application/json" http://192.168.99.100:30123/actuator/info

# Test model

In [None]:
# Send a JSON payload with years_of_experience to the /predict endpoint.
# NOTE(review): 192.168.99.100:30123 is hard-coded; replace it with the service
# endpoint reported for your deployment.
! curl -v -H "Content-Type:application/json"  http://192.168.99.100:30123/predict -d '{"years_of_experience": 10.0 }'

# undeploying model

In [None]:
# Remove the plpylrmodel deployment from Kubernetes. The profile is sourced in
# the same command line, as in the deploy step, because each "!" line runs in
# its own subshell.
! source ~/.bash_profile && rts4madlib --name plpylrmodel --action undeploy --type plpy-model --target kubernetes

In [None]:
# List the Kubernetes resources again to check the result of the undeploy.
! kubectl get all