In [2]:
-- Create the Hospital_Py database only when it does not already exist.
IF DB_ID('Hospital_Py') IS NULL
    CREATE DATABASE Hospital_Py;

In [1]:
-- Switch the session to the Hospital_Py database.
USE Hospital_Py;

#### Note: Steps to configure the instance (enable SPEES, i.e. `sp_execute_external_script`, and CLR) and to grant SQLRGroup access were completed beforehand. (Ref: [ActionScripts](https://github.com/microsoft/r-server-hospital-length-of-stay/tree/master/Resources/ActionScripts))

## [Step 0](https://microsoft.github.io/r-server-hospital-length-of-stay/dba.html)

##### Create objects, load data, compute stats.

In [34]:
-- Run dbo.compute_stats, then list the contents of the stats and statsbak tables.
SET NOCOUNT ON;

EXECUTE dbo.compute_stats;

SELECT * FROM stats;
SELECT * FROM statsbak;

variable_name,type,mode,mean,std
ClaimClaimStatusID,int,2,,
ClaimStatusDescription,varchar,Closed,,
StatesStateCode,varchar,FL,,
ClaimRoomsWithDamage,smallint,0,1.0,1.3240661742950537
LossTypeDescription,varchar,Wind,,
PolicyVersionAttributesFormType,varchar,HO3,,
PolicyVersionAttributesOccupancyType,varchar,Owner,,
PolicyVersionAttributesCoverageA,float,,181559.156076,245711.62649245863
ClaimMoneyReserve,decimal,,0.0,0.0
ClaimMoneyLosses,decimal,,13939.741116,32793.82588103388


variable_name,type,mode,mean,std
rcount,varchar,0,,
gender,varchar,F,,
dialysisrenalendstage,varchar,0,,
asthma,varchar,0,,
irondef,varchar,0,,
pneum,varchar,0,,
substancedependence,varchar,0,,
psychologicaldisordermajor,varchar,0,,
depress,varchar,0,,
psychother,varchar,0,,


In [None]:
-- Runs all steps below: preprocessing, cleaning, feature engineering, training, scoring & evaluating.
-- exec Initial_Run_Once_Py

## [Step 1](https://github.com/microsoft/r-server-hospital-length-of-stay/blob/master/SQLPy/step1_data_processing.sql)
##### Preprocessing and Cleaning

In [41]:
-- Step 1: run dbo.fill_NA_mode_mean with LengthOfStay as input and LoS0 as output.
EXECUTE [dbo].[fill_NA_mode_mean]
    @input = 'LengthOfStay',
    @output = 'LoS0';

## [Step 2](https://github.com/microsoft/r-server-hospital-length-of-stay/blob/master/SQLPy/step2_feature_engineering.sql)
##### Feature Engineering ([Diagrams](https://microsoft.github.io/r-server-hospital-length-of-stay/data-scientist.html))

In [42]:
-- Step 2: run dbo.feature_engineering from LoS0 into LoS (non-production flag '0'),
-- then run dbo.get_column_info on the resulting LoS table.
EXECUTE [dbo].[feature_engineering]
    @input = 'LoS0',
    @output = 'LoS',
    @is_production = '0';

EXECUTE [dbo].[get_column_info]
    @input = 'LoS';

## Step 3

##### a. [Splitting](https://github.com/microsoft/r-server-hospital-length-of-stay/blob/master/SQLPy/step3a_splitting.sql), b. [Training](https://github.com/microsoft/r-server-hospital-length-of-stay/blob/master/SQLPy/step3b_training.sql), c. [Scoring](https://github.com/microsoft/r-server-hospital-length-of-stay/blob/master/SQLPy/step3c_scoring.sql), d. [Evaluating](https://github.com/microsoft/r-server-hospital-length-of-stay/blob/master/SQLPy/step3d_evaluating.sql)

In [20]:
-- Step 3a: run dbo.splitting on LoS with a splitting percentage of 70.
EXECUTE [dbo].[splitting]
    @splitting_percent = 70,
    @input = 'LoS';

In [None]:
-- Step 3b: train the 'RF' model on the LoS dataset, then print the current time.
-- Dev svr Total execution time: 00:00:35.689
EXECUTE [dbo].[train_model]
    @model_name = 'RF',
    @dataset_name = 'LoS';

SELECT GETDATE();

In [None]:
-- Size of the stored 'RF' model in MB.
-- The divisions are integer divisions, so the result is truncated to whole MB
-- (anything under 1 MB is reported as 0).
SELECT
    model_name,
    (DATALENGTH(model) / 1024) / 1024 AS MB
FROM models
WHERE model_name = 'RF';
GO
-- last_user_update recorded by the index-usage DMV for dbo.models in the current database.
SELECT us.last_user_update
FROM sys.dm_db_index_usage_stats AS us
INNER JOIN sys.tables AS t
    ON t.object_id = us.object_id
WHERE us.database_id = DB_ID()
    AND t.object_id = OBJECT_ID('dbo.models');

model_name,MB
RF,2


last_user_update
2021-03-11 19:25:20.960


In [1]:
USE Hospital_Py
GO
-- Keep the current Forest_Prediction table as Forest_PredictionBak.
-- Any earlier backup is dropped first: sp_rename fails when the target name is
-- already in use, so without this the cell cannot be re-run. This mirrors the
-- drop-then-rename pattern used for the other *_Prediction tables in this notebook.
DROP TABLE IF EXISTS Forest_PredictionBak;
EXEC sp_rename 'Forest_Prediction', 'Forest_PredictionBak';

In [None]:
-- Number of LoS rows whose ClaimClaimID is not listed in Train_Id.
SELECT COUNT(*)
FROM LoS
WHERE ClaimClaimID NOT IN (
    SELECT ClaimClaimID
    FROM Train_Id
);

(No column name)
29974


In [2]:
-- Step 3c: score the LoS rows that are not in Train_Id with the 'RF' model;
-- the output table name passed to dbo.score is Forest_Prediction.
EXECUTE [dbo].[score]
    @model_name = 'RF',
    @inquery = 'SELECT * FROM LoS WHERE ClaimClaimID NOT IN (SELECT ClaimClaimID FROM Train_Id)',
    @output = 'Forest_Prediction';

In [1]:
-- Preview up to 20 rows of Forest_Prediction (no ORDER BY, so row choice is up to the engine).
SELECT TOP (20) *
FROM Forest_Prediction;

lengthofstay_Pred,lengthofstay,ClaimClaimID
17.30743474477387,9,5
19.11814350187633,15,7
30.97463748640614,40,9
16.935966505913527,4,19
17.30743474477387,9,21
16.935966505913527,4,22
16.935966505913527,7,28
16.935966505913527,0,30
17.60621439720507,12,32
59.00441803777379,71,35


### Train GBT, FT, NN

In [None]:
USE Hospital_Py
GO
SET NOCOUNT ON;

-- Train the remaining models on LoS, one batch per model so each runs independently.
EXECUTE [dbo].[train_model]
    @model_name = 'GBT',
    @dataset_name = 'LoS';
GO
-- failed, set max_num_bins
EXECUTE [dbo].[train_model]
    @model_name = 'FT',
    @dataset_name = 'LoS';
GO
-- failed
EXECUTE [dbo].[train_model]
    @model_name = 'NN',
    @dataset_name = 'LoS';
GO


In [3]:
-- Size of every stored model in MB (integer division: truncated to whole MB).
SELECT
    model_name,
    (DATALENGTH(model) / 1024) / 1024 AS MB
FROM models;

model_name,MB
RF,2
GBT,0


### Score GBT, FT, NN

In [9]:
-- Score the LoS rows that are not in Train_Id with the 'GBT' model.
-- The previous Boosted_Prediction table is kept as Boosted_PredictionBak
-- (any older backup is dropped first so the rename can succeed).
DROP TABLE IF EXISTS Boosted_PredictionBak;
EXECUTE sp_rename 'Boosted_Prediction', 'Boosted_PredictionBak';
EXECUTE [dbo].[score]
    @model_name = 'GBT',
    @inquery = 'SELECT * FROM LoS WHERE ClaimClaimID NOT IN (SELECT ClaimClaimID FROM Train_Id)',
    @output = 'Boosted_Prediction';
GO
-- The equivalent FT and NN scoring steps are left disabled:
-- drop table if exists Fast_PredictionBak;
-- exec sp_rename 'Fast_Prediction', 'Fast_PredictionBak';
-- exec [dbo].[score] @model_name = 'FT', @inquery = 'SELECT * FROM LoS WHERE ClaimClaimID NOT IN (SELECT ClaimClaimID FROM Train_Id)', @output = 'Fast_Prediction';
-- go
-- drop table if exists NN_PredictionBak;
-- exec sp_rename 'NN_Prediction', 'NN_PredictionBak';
-- exec [dbo].[score] @model_name = 'NN', @inquery = 'SELECT * FROM LoS WHERE ClaimClaimID NOT IN (SELECT ClaimClaimID FROM Train_Id)', @output = 'NN_Prediction';
-- go

-- Preview the fresh GBT predictions.
SELECT TOP (20) *
FROM [Hospital_Py].[dbo].[Boosted_Prediction]
-- SELECT top 20 *  FROM [Hospital_Py].[dbo].[Fast_Prediction]
-- SELECT top 20 *  FROM [Hospital_Py].[dbo].[NN_Prediction]


In [4]:
-- Preview up to 20 rows of the GBT predictions.
SELECT TOP (20) *
FROM [Hospital_Py].[dbo].[Boosted_Prediction];

lengthofstay_Pred,lengthofstay,ClaimClaimID
25.628452103238345,9,5
25.628452103238345,15,7
25.628452103238345,40,9
25.628452103238345,4,19
25.628452103238345,9,21
25.628452103238345,4,22
25.628452103238345,7,28
25.628452103238345,0,30
25.628452103238345,12,32
41.24410339279123,71,35


# 🥊 Realtime Scoring

#### [Proc](https://github.com/microsoft/r-server-hospital-length-of-stay/blob/master/SQLPy/partA_train_real_time_scoring_sp.sql) ([errors](https://stackoverflow.com/questions/55643467/dataframe-object-has-no-attribute-str-problem)), [Train](https://github.com/microsoft/r-server-hospital-length-of-stay/blob/master/SQLPy/partB_prepare_for_real_time_scoring.sql), [Score](https://github.com/microsoft/r-server-hospital-length-of-stay/blob/master/SQLPy/partC_real_time_scoring.sql)

In [3]:
-- My simple proc to serialize the model binary, because train_model_real_time_scoring errors.
USE Hospital_Py
GO
-- GetRTSModelRF
--   Takes no parameters. Reads the serialized column information from dbo.ColInfo,
--   passes it to a Python script that trains a decision-forest model
--   (rx_dforest) on the LoS rows listed in Train_Id, serializes the model for
--   real-time scoring only, and writes it to the RTS table under key "RF".
--   NOTE(review): rx_write_object is called with overwrite=False; presumably
--   re-running without first clearing or renaming RTS can leave more than one
--   'RF' row -- confirm against the revoscalepy documentation.
CREATE OR ALTER PROC [GetRTSModelRF]
AS
-- @info was previously declared (and loaded) twice, which T-SQL rejects with
-- "The variable name '@info' has already been declared"; declare it once.
DECLARE @info VARBINARY(MAX);
SELECT @info = info FROM dbo.ColInfo;

EXEC sp_execute_external_script @language = N'Python', @script = N' 
import dill
from numpy import sqrt
from pandas import DataFrame
from revoscalepy import rx_set_compute_context, RxSqlServerData, rx_dforest, RxOdbcData, rx_serialize_model, rx_write_object, RxLocalSeq
from microsoftml import adadelta_optimizer

connection_string = "Driver=SQL Server;Server=localhost;Database=Hospital_Py;Trusted_Connection=true;"

column_info = dill.loads(info)

##	Set training dataset, set features and types.

variables_all = [var for var in column_info]
#variables_to_remove = ["eid", "vdate", "discharged", "facid"]
variables_to_remove = ["ClaimClaimID", "ClaimDateClosed", "ClaimReportedDate"]
training_variables = [x for x in variables_all if x not in variables_to_remove]
LoS_Train = RxSqlServerData(sql_query = "SELECT ClaimClaimID, {} FROM LoS WHERE ClaimClaimID IN (SELECT ClaimClaimID from Train_Id)".format(", ".join(training_variables)),
                            connection_string = connection_string,
                            column_info = column_info)

##	Specify the variables to keep for the training 

#variables_to_remove = ["eid", "vdate", "discharged", "facid", "lengthofstay"]
variables_to_remove = ["ClaimClaimID", "ClaimDateClosed", "ClaimReportedDate", "lengthofstay"]
training_variables = [x for x in variables_all if x not in variables_to_remove]
formula = "lengthofstay ~ " + " + ".join(training_variables)

## Train RF Model
dest = RxOdbcData(connection_string, table = "RTS")
model = rx_dforest(formula=formula,
                    data=LoS_Train,
                    n_tree=40,
                    cp=0.00005,
                    min_split=int(sqrt(70000)),
                    max_num_bins=int(sqrt(70000)),
                    seed=5)
serialized_model = rx_serialize_model(model, realtime_scoring_only = True)
rx_write_object(dest, key_name="id", key="RF", value_name="value", value=serialized_model, serialize=False, compress=None, overwrite=False)'

, @params = N'@info varbinary(max)'  -- exposes @info to the script as the Python variable "info"
, @info = @info;

GO


In [None]:
-- Optional (disabled): keep the existing RTS table as RTSBak before retraining.
-- drop table if exists RTSBak;
-- go
-- exec sp_rename 'RTS', 'RTSBak'
GO
-- Train the RF model and store its real-time-scoring serialization.
EXECUTE GetRTSModelRF;

-- Dev server: Total execution time: 00:20:17.325

In [6]:
-- Size of each serialized real-time-scoring model in MB (integer division: truncated to whole MB).
SELECT
    id,
    (DATALENGTH(value) / 1024) / 1024 AS MB
FROM RTS;

id,MB
RF,0


In [22]:
-- Real-time scoring with a previously trained model.
-- Model keys: 'RF' = Random Forest, 'GBT' = boosted trees, 'FT' = Fast Trees, 'NN' = Neural Network.
-- The predictions end up in RTS_Prediction.
--
-- NOTE: Run prepare_real_time_scoring.sql before running this script.

USE Hospital_Py
GO

-- Fetch the serialized model for the chosen key from dbo.RTS.
DECLARE @model_name VARCHAR(3) = 'RF';
DECLARE @model VARBINARY(MAX);
SET @model = (SELECT value FROM [dbo].[RTS] WHERE id = @model_name);

-- Real-time scoring is meant for small scoring requests, so this example only scores the top 10 rows.
DECLARE @inputData VARCHAR(MAX) = 'SELECT TOP (10) ClaimClaimID, ClaimClaimStatusID, ClaimStatusDescription, StatesStateCode, LossTypeDescription, PolicyVersionAttributesFormType, 
	PolicyVersionAttributesOccupancyType, PolicyVersionAttributesCoverageA, ClaimMoneyLosses, ClaimMoneyLAE, ClaimRoomsWithDamage, 
	number_of_issues, lengthofstay
FROM LoS WHERE 	ClaimClaimID NOT IN (SELECT ClaimClaimID from Train_Id) ORDER BY ClaimClaimID';

-- Capture the sp_rxPredict result set in a table variable.
DECLARE @predictions TABLE (lengthofstay_Pred FLOAT);
INSERT INTO @predictions
EXECUTE [dbo].[sp_rxPredict]
    @model = @model,
    @inputData = @inputData;

-- Keep the previous RTS_Prediction as RTS_PredictionBak, then persist the new predictions.
DROP TABLE IF EXISTS RTS_PredictionBak;
EXECUTE sp_rename 'RTS_Prediction', 'RTS_PredictionBak';
SELECT *
INTO RTS_Prediction
FROM @predictions

In [8]:
-- First ten LoS rows by ClaimClaimID, followed by the stored real-time predictions.
-- NOTE(review): this listing is not restricted to rows outside Train_Id, so these
-- ten claims are not necessarily the ten that were scored -- confirm before
-- comparing the two result sets row by row.
SELECT TOP (10)
    ClaimClaimID,
    LengthOfStay
FROM LoS
ORDER BY ClaimClaimID;

SELECT *
FROM RTS_Prediction;

ClaimClaimID,LengthOfStay
1,5
2,19
3,4
4,16
5,9
6,4
7,15
8,25
9,40
10,1


lengthofstay_Pred
24.000814132212437
281.10619473986014
204.47810635851164
53.38754342384557
57.4645801226574
475.1654887397811
239.28146462247355
45.95507795975456
57.4645801226574
305.4491891191269


In [10]:
-- First ten LoS rows (by ClaimClaimID) whose ClaimClaimID is not listed in Train_Id.
SELECT TOP (10) *
FROM LoS
WHERE ClaimClaimID NOT IN (
    SELECT ClaimClaimID
    FROM Train_Id
)
ORDER BY ClaimClaimID;

ClaimClaimID,ClaimReportedDate,ClaimClaimStatusID,ClaimStatusDescription,StatesStateCode,LossTypeDescription,PolicyVersionAttributesFormType,PolicyVersionAttributesOccupancyType,ClaimMoneyReserve,PolicyVersionAttributesCoverageA,ClaimMoneyLosses,ClaimMoneyLAE,ClaimRoomsWithDamage,number_of_issues,ClaimDateClosed,lengthofstay
5,2018-05-07,2,Closed,FL,Loss Assessment,HO6,Owner,0.0,-0.5354209646243084,-0.3640850311065803,-0.255113842707539,-0.7552492612632524,0,2018-05-16,9
7,2019-02-04,2,Closed,FL,Loss Assessment,HO6,Owner,0.0,-0.5354209646243084,-0.3640850311065803,-0.255113842707539,-0.7552492612632524,0,2019-02-19,15
9,2018-06-06,2,Closed,FL,Loss Assessment,HO6,Owner,0.0,-0.5354209646243084,-0.3640850311065803,-0.255113842707539,-0.7552492612632524,0,2018-07-16,40
19,2018-05-11,2,Closed,FL,Loss Assessment,HO6,Owner,0.0,-0.5354209646243084,-0.3640850311065803,-0.255113842707539,-0.7552492612632524,0,2018-05-15,4
21,2018-11-27,2,Closed,FL,Loss Assessment,HO6,Owner,0.0,-0.5354209646243084,-0.3640850311065803,-0.255113842707539,-0.7552492612632524,0,2018-12-06,9
22,2019-02-25,2,Closed,FL,Loss Assessment,HO6,Owner,0.0,-0.5354209646243084,-0.3640850311065803,-0.255113842707539,-0.7552492612632524,0,2019-03-01,4
28,2018-06-19,2,Closed,FL,Loss Assessment,HO6,Owner,0.0,-0.5354209646243084,-0.3640850311065803,-0.255113842707539,-0.7552492612632524,0,2018-06-26,7
30,2018-06-06,2,Closed,FL,Loss Assessment,HO6,Owner,0.0,-0.5354209646243084,-0.3640850311065803,-0.255113842707539,-0.7552492612632524,0,2018-06-06,0
32,2019-01-02,2,Closed,FL,Loss Assessment,HO6,Owner,0.0,-0.5354209646243084,-0.3640850311065803,-0.255113842707539,-0.7552492612632524,0,2019-01-14,12
35,2018-12-10,2,Closed,FL,Loss Assessment,HO6,Owner,0.0,-0.5354209646243084,-0.3640850311065803,-0.255113842707539,-0.7552492612632524,0,2019-02-19,71


In [11]:
-- Earliest and latest claimreporteddate in LengthOfStay.
SELECT
    MIN(claimreporteddate) AS StartDate,
    MAX(claimreporteddate) AS EndDate
FROM LengthOfStay;

StartDate,EndDate
2018-02-15,2021-02-09


In [19]:
-- Call do_native_predict with 43714 as its single positional argument.
EXECUTE do_native_predict 43714;

LengthOfStay_Pred
305.4491891191269
