
# RTSMADlib Demo

## Random forest classifier Model Sample and Model deployment

In this sample we demonstrate how to deploy an Apache MADlib model using RTSMADlib. We will run the random forest example from the Apache MADlib documentation: https://madlib.apache.org/docs/latest/group__grp__random__forest.html#examples

We will show how to deploy a model that uses multiple MADlib model tables, and how to customize the data returned by the model REST service.

### Pre-Requisites
1. A running instance of Greenplum with MADlib
2. In tasklets 1 and 14:
   - Modify the database connection parameters
3. A running instance of a local Docker environment

### The notebook performs the following tasks

1. Connect to Greenplum and set up the session
2. Create the schema and all the tables needed, and load the test data into Greenplum
3. Build and train the model
4. Test batch scoring of the model
5. Operationalize model with RTSMADlib
6. Test Model REST Service
7. Undeploy the Model container

# Create SQL Connection to Greenplum

In [None]:
import json
import math
from urllib.parse import quote

import pandas as pd           # Python Data Analysis Library - https://pandas.pydata.org/
import psycopg2               # Python-PostgreSQL Database Adapter - https://pypi.python.org/pypi/psycopg2

%load_ext sql

# PLEASE MODIFY THE BELOW AS PER YOUR GREENPLUM CLUSTER SETTINGS
database_host = '{HOST}'
database_databasename = '{DATABASE}'
database_username = '{USER}'
database_password = '{PASSWD}'
database_port = '{PORT}'

try:
    connString = "host='{}' dbname='{}' user='{}' password='{}' port={}".format(database_host,database_databasename,database_username,database_password,database_port)
    # print connString
    conn = psycopg2.connect(connString)
    cur = conn.cursor()
    conn.autocommit = True
        
    connection = 'postgresql://{}:{}@{}:{}/{}'.format(database_username,database_password,database_host,database_port,database_databasename)
    %sql $connection

    message = "<span style='color:green'>**Connection successful!**</span>"
    print(message)
except Exception as e:
    message = "<span style='color:red'>**ERROR: Unable to connect to the database ({})**</span>".format(e)
    print(message)

# Create Schema

In [None]:
%%sql
DROP SCHEMA IF EXISTS madlib_demo CASCADE;   -- discard everything left over from an earlier run
CREATE SCHEMA madlib_demo;

DROP TABLE IF EXISTS madlib_demo.rf_golf CASCADE;

-- Input table for the golf example. "OUTLOOK" and "Temp_Humidity" are quoted,
-- so their names are case-sensitive and must be quoted wherever they are used.
CREATE TABLE madlib_demo.rf_golf (
    id                  INTEGER NOT NULL,
    "OUTLOOK"           TEXT,
    temperature         DOUBLE PRECISION,
    humidity            DOUBLE PRECISION,
    "Temp_Humidity"     DOUBLE PRECISION[],
    clouds_airquality   TEXT[],
    windy               BOOLEAN,
    class               TEXT
);


# Generate Sample data

In [None]:
%%sql
TRUNCATE TABLE madlib_demo.rf_golf;   -- empty the table first so re-running this cell cannot duplicate rows

-- Load the 14 sample rows. The column list is spelled out so the statement
-- does not silently depend on the column order of the table definition.
INSERT INTO madlib_demo.rf_golf
    (id, "OUTLOOK", temperature, humidity, "Temp_Humidity", clouds_airquality, windy, class)
VALUES
(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play'),
(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play'),
(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play'),
(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play'),
(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play'),
(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play'),
(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play'),
(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play'),
(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play'),
(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play'),
(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play'),
(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play'),
(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play'),
(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play');


# Train model

In [None]:
%%sql
DROP TABLE IF EXISTS madlib_demo.rf_train_output;           -- clear outputs of any earlier training run
DROP TABLE IF EXISTS madlib_demo.rf_train_output_group;
DROP TABLE IF EXISTS madlib_demo.rf_train_output_summary;

-- Train the random forest on the golf data.
SELECT madlib.forest_train(
    'madlib_demo.rf_golf',                        -- source table
    'madlib_demo.rf_train_output',                -- output model table
    'id',                                         -- id column
    'class',                                      -- response column
    '"OUTLOOK", temperature, humidity, windy',    -- feature columns
    NULL,                                         -- columns to exclude
    NULL,                                         -- grouping columns
    20::integer,                                  -- number of trees
    2::integer,                                   -- number of random features
    TRUE::boolean,                                -- variable importance
    1::integer,                                   -- num_permutations
    8::integer,                                   -- max depth
    3::integer,                                   -- min split
    1::integer,                                   -- min bucket
    10::integer                                   -- number of splits per continuous variable
);

-- Show the summary table of the training run.
SELECT *
FROM madlib_demo.rf_train_output_summary;

-- Write the variable importance of the model to its own table and list it,
-- highest out-of-bag importance first.
DROP TABLE IF EXISTS madlib_demo.imp_output;
SELECT madlib.get_var_importance(
    'madlib_demo.rf_train_output',
    'madlib_demo.imp_output'
);
SELECT *
FROM madlib_demo.imp_output
ORDER BY oob_var_importance DESC;


# Run Prediction on model

In [None]:
%%sql
DROP TABLE IF EXISTS madlib_demo.prediction_results;

-- Score the training rows with the trained forest.
SELECT madlib.forest_predict(
    'madlib_demo.rf_train_output',       -- tree model
    'madlib_demo.rf_golf',               -- table with the rows to score
    'madlib_demo.prediction_results',    -- output table
    'response'                           -- output the predicted response
);

-- Put the actual class next to the estimated class, row by row.
SELECT g.id, class, estimated_class
FROM madlib_demo.prediction_results AS p
JOIN madlib_demo.rf_golf AS g
    ON p.id = g.id
ORDER BY g.id;

# RTSMADlib
Operationalize the model
The MADlib model from Greenplum is containerized and deployed to a container management system. In this case we are using a local Docker environment. The RTSMADlib tool (`rts4madlib`) takes care of bundling, deploying and serving the model as a REST endpoint.

In [None]:
# Print the command-line usage of the rts4madlib tool.
!rts4madlib --help

# Deployment manifest of Model

In [None]:
import json
# Deployment manifest for the random forest model, written to model-config.json.
# Replace the {HOST}, {PORT}, {DATABASE}, {USER} and {PASSWD} placeholders with
# your own database connection parameters before deploying.

# Tables that make up the trained model.
model_tables = [
    "rf_train_output",
    "rf_train_output_group",
    "rf_train_output_summary",
]

# Scoring statement: writes class probabilities ('prob') to rf_prediction_results.
predict_query = (
    "SELECT madlib.forest_predict('madlib_demo.rf_train_output', "
    "'madlib_demo.rf_golf', 'madlib_demo.rf_prediction_results', 'prob')"
)

# Statement that shapes the returned data: input id and class plus both probabilities.
results_query = (
    "SELECT g.id, class, \"estimated_prob_Don't Play\", \"estimated_prob_Play\" "
    "FROM madlib_demo.rf_prediction_results p, madlib_demo.rf_golf g "
    "WHERE p.id = g.id ORDER BY g.id"
)

manifest = {
    "modeldb-datasource.jdbc-url": "jdbc:postgresql://{HOST}:{PORT}/{DATABASE}",
    "modeldb-datasource.userName": "{USER}",
    "modeldb-datasource.password": "{PASSWD}",
    "madlibrest.modelname": "Random_Forest_Classification_Example",
    "madlibrest.modeldescription": "Random Forest Classification Example",
    "madlibrest.modelschema": "madlib_demo",
    "madlibrest.modeltables": model_tables,
    "madlibrest.modelinputtable": "rf_golf",
    "madlibrest.modelquery": predict_query,
    "madlibrest.resultstable": "rf_prediction_results",
    "madlibrest.resultsquery": results_query,
}

# Serialize once; the same text is kept in `myconfig` and written to disk.
myconfig = json.dumps(manifest)

with open("model-config.json", "w") as f:
    f.write(myconfig)
    

# Deploy

In [None]:
# Deploy the model described by model-config.json to the docker target under the name "golfrf".
! rts4madlib --name golfrf --action deploy --type madlib-model --target docker --inputJson model-config.json

# Testing - RTSMADlib container
The deployment log should show the service endpoints of the container. We use these endpoints to test the deployment. The cell below tests the information endpoint of the model container.

In [None]:
# Request the /actuator/info endpoint on port 8087 (-v prints the full HTTP exchange).
! curl -v -H "Content-Type:application/json" http://127.0.0.1:8087/actuator/info

In [None]:
# POST a single input row as JSON to the /predict endpoint; the keys mirror columns of rf_golf.
! curl -v -H "Content-Type:application/json" -X POST http://localhost:8087/predict -d '{"id":2,"OUTLOOK":"overcast","temperature":64.00,"humidity":65.00,"windy":true,"class":"Play"}'

# Undeploy Model

In [None]:
# Undeploy the "golfrf" model from the docker target.
! rts4madlib --name golfrf --action undeploy --type madlib-model --target docker

In [None]:
# List the running containers - presumably to confirm the model container is gone after the undeploy.
!docker ps

Thank you!