## Credit Application Approval Modeling

### Environment Setup

In [54]:
# Python dependencies
import os 
import json


In [29]:
# load and configure SQL magic plugin
%load_ext sql


The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [30]:
# connect to local single node GPDB instance
%sql postgresql://gpadmin:pivotal@127.0.0.1:5432/gpadmin
            

'Connected: gpadmin@gpadmin'

### Model Development

In [31]:
%%sql

-- create mfdemo schema
CREATE SCHEMA mfdemo;


 * postgresql://gpadmin:***@127.0.0.1:5432/gpadmin
(psycopg2.ProgrammingError) schema "mfdemo" already exists
 [SQL: '-- create mfdemo schema\nCREATE SCHEMA mfdemo;'] (Background on this error at: http://sqlalche.me/e/f405)


In [48]:
%%sql

-- pull data from source
DROP EXTERNAL TABLE IF EXISTS mfdemo.credit_application_external;
CREATE EXTERNAL WEB TABLE mfdemo.credit_application_external (
    a1 varchar(1)
   ,a2 float
   ,a3 float
   ,a4 varchar(1)
   ,a5 varchar(2)
   ,a6 varchar(2)
   ,a7 varchar(2)
   ,a8 float
   ,a9 boolean
   ,a10 boolean
   ,a11 float
   ,a12 boolean
   ,a13 varchar(1)
   ,a14 float
   ,a15 float
   ,a16 varchar(1)
) LOCATION ('http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data')
FORMAT 'CSV'
(NULL AS '?');


 * postgresql://gpadmin:***@127.0.0.1:5432/gpadmin
Done.
Done.


[]

In [90]:
%%sql

-- impute average for null values
DROP TABLE IF EXISTS mfdemo.credit_application_data;
CREATE TABLE mfdemo.credit_application_data AS
SELECT row_number() OVER() AS _id
      ,coalesce(a1,'b') AS a1
      ,coalesce(a2, avg(a2) OVER()) AS a2
      ,coalesce(a3, avg(a3) OVER()) AS a3
      ,coalesce(a4, 'u') AS a4
      ,coalesce(a5, 'g') AS a5
      ,coalesce(a6, 'c') AS a6
      ,coalesce(a7, 'v') AS a7
      ,coalesce(a8, avg(a8) OVER()) AS a8
      ,coalesce(a9, True) AS a9
      ,coalesce(a10, False) AS a10
      ,coalesce(a11, 0) AS a11
      ,coalesce(a12, False) AS a12
      ,coalesce(a13, 'g') AS a13
      ,coalesce(a14, avg(a14) OVER()) AS a14
      ,coalesce(a15, avg(a15) OVER()) AS a15
      ,CASE WHEN a16 = '+' THEN 1 ELSE 0 END AS a16
FROM public.credit_application_external
DISTRIBUTED RANDOMLY;


 * postgresql://gpadmin:***@127.0.0.1:5432/gpadmin
Done.
690 rows affected.


[]

In [91]:
%%sql

-- one hot encoding
DROP TABLE IF EXISTS mfdemo.model_inputs;
SELECT madlib.encode_categorical_variables (
    'mfdemo.credit_application_data',
    'mfdemo.model_inputs',
    'a1,a4,a5,a6,a7,a9,a10,a12,a13',
    NULL,
    '_id,a16,a2,a3,a8,a11,a14,a15',
    NULL,
    'a1=b, a4=y, a5=p, a6=x, a7=z, a9=false, a10=false, a12=false, a13=s'
);


 * postgresql://gpadmin:***@127.0.0.1:5432/gpadmin
Done.
1 rows affected.


encode_categorical_variables


In [94]:
%%sql

DROP TABLE IF EXISTS mfdemo.model
                    ,mfdemo.model_train
                    ,mfdemo.model_test;
SELECT madlib.train_test_split(
    'mfdemo.model_inputs',
    'mfdemo.model',
    0.7,
    NULL,
    NULL,
    '*',
    FALSE,
    TRUE
)


 * postgresql://gpadmin:***@127.0.0.1:5432/gpadmin
Done.
1 rows affected.


train_test_split


In [95]:
%%sql

DROP TABLE IF EXISTS mfdemo.rf_model_output, mfdemo.rf_model_output_summary, mfdemo.rf_model_output_group;
SELECT madlib.forest_train(
        'mfdemo.model_train',
        'mfdemo.rf_model_output',
        '_id',
        'a16',
        'a2,a3,a8,a11,a14,a15,a1_a,a4_l,a4_u,a5_g,a5_gg,a6_aa,a6_c,a6_cc,a6_d,a6_e,a6_ff,a6_i,a6_j,a6_k,a6_m,a6_q,a6_r,a6_w,a7_bb,a7_dd,a7_ff,a7_h,a7_j,a7_n,a7_o,a7_v,a9_true,a10_true,a12_true,a13_g,a13_p',
        null,
        null,
        10::integer,
        5::integer,
        true::boolean,
        5::integer,
        10::integer,
        3::integer,
        1::integer,
        10::integer
    )


 * postgresql://gpadmin:***@127.0.0.1:5432/gpadmin
Done.
1 rows affected.


forest_train


In [96]:
%%sql

DROP TABLE IF EXISTS mfdemo.model_test_scored;
SELECT madlib.forest_predict('mfdemo.rf_model_output','mfdemo.model_test','mfdemo.model_test_scored','prob');


 * postgresql://gpadmin:***@127.0.0.1:5432/gpadmin
Done.
1 rows affected.


forest_predict


In [99]:
%%sql

DROP TABLE IF EXISTS mfdemo.model_test_scored_tmp;
CREATE TABLE mfdemo.model_test_scored_tmp AS
SELECT *
FROM mfdemo.model_test_scored
JOIN mfdemo.model_test
USING (_id);

DROP TABLE mfdemo.model_test_scored;
ALTER TABLE mfdemo.model_test_scored_tmp RENAME TO model_test_scored;


 * postgresql://gpadmin:***@127.0.0.1:5432/gpadmin
Done.
(psycopg2.ProgrammingError) column name "a16" specified more than once
 [SQL: 'CREATE TABLE mfdemo.model_test_scored_tmp AS\nSELECT *\nFROM mfdemo.model_test_scored\nJOIN mfdemo.model_test\nUSING (_id);'] (Background on this error at: http://sqlalche.me/e/f405)


In [100]:
%%sql

DROP TABLE IF EXISTS mfdemo.model_test_scored_auc;
SELECT madlib.area_under_roc(
    'mfdemo.model_test_scored'
   ,'mfdemo.model_test_scored_auc'
   ,'estimated_prob_1'
   ,'a16'
)


 * postgresql://gpadmin:***@127.0.0.1:5432/gpadmin
Done.
1 rows affected.


area_under_roc


In [101]:
%%sql

SELECT *
FROM mfdemo.model_test_scored_auc;


 * postgresql://gpadmin:***@127.0.0.1:5432/gpadmin
1 rows affected.


area_under_roc
0.8160440613026819


### Flow Deployment

#### Deploy model as REST API

In [102]:
# create model deployment configurations
configs = {
 "greenplum-datasource.jdbc-url": "jdbc:postgresql://127.0.0.1:5432/gpadmin",
 "greenplum-datasource.userName": "gpadmin",
 "greenplum-datasource.password": "pivotal",
 "app.modeldescription": "Credit Application Approval Model",
 "app.modelschema": "mfdemo",
 "app.modeltables": ["rf_model_output","rf_model_output_group","rf_model_output_summary"],
 "app.actortablename": "model_train",
 "app.actionquery": "SELECT madlib.forest_predict('mfdemo.rf_model_output','mfdemo.model_test','mfdemo.model_test_scored','prob');",
 "app.resultstable": "model_test"
}

# write configs to file
with open('madlib_rest.json', 'a') as myFile:
    myFile.write(json.dumps(configs))
                 

dir_path = os.path.dirname(os.path.realpath("madlib_rest.json"))
print(dir_path)


/Users/jvawdrey/code/CreditCardTransactionGenerator/notebooks


In [103]:
%%bash -s "$dir_path" 

# deploy model as REST API using MADlib Flow
madlibflow --type model --action deploy --target kubernetes --inputJson $1/madlib_rest.json


bash: line 2: madlibflow: command not found


CalledProcessError: Command 'b'\nmadlibflow --type model --action deploy --target kubernetes --inputJson $1/madlib_rest.json\n'' returned non-zero exit status 127.

In [None]:
%%bash

# test deploy by sending request to actuator
curl -v -H "Accept:application/json" http://localhost:8085/actuator/info


In [104]:
## sample record to test deploy deploy
testRecord = %sql SELECT * FROM mfdemo.model_inputs LIMIT 1;

jsonRecord = json.loads(testRecord.DataFrame().to_json(orient="records"))[0]
print(jsonRecord)


 * postgresql://gpadmin:***@127.0.0.1:5432/gpadmin
1 rows affected.
{'_id': 2, 'a16': 1, 'a2': 58.67, 'a3': 4.46, 'a8': 3.04, 'a11': 6.0, 'a14': 43.0, 'a15': 560.0, 'a1_a': 1, 'a4_l': 0, 'a4_u': 1, 'a5_g': 1, 'a5_gg': 0, 'a6_aa': 0, 'a6_c': 0, 'a6_cc': 0, 'a6_d': 0, 'a6_e': 0, 'a6_ff': 0, 'a6_i': 0, 'a6_j': 0, 'a6_k': 0, 'a6_m': 0, 'a6_q': 1, 'a6_r': 0, 'a6_w': 0, 'a7_bb': 0, 'a7_dd': 0, 'a7_ff': 0, 'a7_h': 1, 'a7_j': 0, 'a7_n': 0, 'a7_o': 0, 'a7_v': 0, 'a9_true': 0, 'a10_true': 0, 'a12_true': 0, 'a13_g': 1, 'a13_p': 0}


In [None]:
%%bash -s "$jsonRecord" 

# test scoring by sending record sample
curl -v -H "Content-Type:application/json" 
-X POST http://localhost:8085/predict 
-d '$1'
