# Predict Total Loss Based on Structured Claim Data

## First Generate a Dataset
We will generate a synthetic binary classification dataset to represent structured insurance claim data.  The dataset has 13 features, of which only two are informative.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import make_classification

plt.figure(figsize=(8, 8))
plt.subplots_adjust(bottom=.05, top=.9, left=.05, right=.95)

plt.subplot(321)
plt.title("Two informative features, one cluster per class", fontsize='small')
X1, Y1 = make_classification(n_samples=1000, n_features=13, n_redundant=0, n_informative=2,
                             n_classes=2, n_clusters_per_class=1, shuffle=False,
                             class_sep=2.0)

# scatter plot of the first 2 features, highlighting separation of Loss/Not-Loss classes
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
            s=25, edgecolor='k')

plt.show()

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into train / validation / test sets.
# Fractions are expressed relative to the full dataset.
seed = 7
val_size  = 0.20
test_size = 0.10

# Give 70% to train; the remaining 30% is held out for validation + test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X1, Y1,
    test_size=(test_size + val_size), random_state=seed)

# Of the held-out 30%, give 2/3 to validation and 1/3 to test.
# train_test_split returns (first_part, second_part, ...) where the SECOND part has
# `test_size` of the rows. With test_size = 1/3 the first part (2/3) must therefore be
# unpacked as validation and the second part (1/3) as test. The previous unpacking
# order swapped the two, leaving test with 20% and validation with 10%.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout,
    test_size=(test_size / (test_size + val_size)), random_state=seed)

In [None]:
# Standardize every feature column using statistics from the training set only,
# so that no information from the validation/test sets leaks into the scaling.
mean = X_train.mean(axis=0)
X_train -= mean

# std is taken from the already-centered training data, then used to scale it
std = X_train.std(axis=0)
X_train /= std

# apply the same (training-derived) shift and scale, in place, to the other splits
for held_out in (X_test, X_val):
    held_out -= mean
    held_out /= std

In [None]:
# Sanity-check the split sizes: feature matrices first, then the matching targets.
feature_shapes = (X_train.shape, X_test.shape, X_val.shape)
target_shapes = (y_train.shape, y_test.shape, y_val.shape)

print('Train shape: {}, Test shape: {}, Val shape: {}'.format(*feature_shapes))
print('Train target: {}, Test target: {}, Val target: {}'.format(*target_shapes))

# Show one normalized test observation together with its label
sample_observation, sample_target = X_test[0], y_test[0]
print('\nSample observation: {}\nSample target: {}'.format(sample_observation, sample_target))

In [None]:
import os
import pandas as pd

# Local staging directory for the CSV files, relative to the notebook's working directory.
data_dir = os.path.join(os.getcwd(), 'local_loss_data')
print('Local data dir: {}'.format(data_dir))

# Create one sub-directory per split up front. to_csv does not create missing parent
# directories, so on a fresh checkout the writes below would otherwise fail.
for split_name in ('train', 'test', 'val'):
    os.makedirs(os.path.join(data_dir, split_name), exist_ok=True)

# Each split is written as two header-less, index-less CSV files:
# x<split>.csv holds the features and y<split>.csv holds the targets.
xtrain = pd.DataFrame(X_train)
xtrain.to_csv(f'{data_dir}/train/xtrain.csv', header=None, index=False)
ytrain = pd.DataFrame(y_train)
ytrain.to_csv(f'{data_dir}/train/ytrain.csv', header=None, index=False)

xtest = pd.DataFrame(X_test)
xtest.to_csv(f'{data_dir}/test/xtest.csv', header=None, index=False)
ytest = pd.DataFrame(y_test)
ytest.to_csv(f'{data_dir}/test/ytest.csv', header=None, index=False)

xval = pd.DataFrame(X_val)
xval.to_csv(f'{data_dir}/val/xval.csv', header=None, index=False)
yval = pd.DataFrame(y_val)
yval.to_csv(f'{data_dir}/val/yval.csv', header=None, index=False)

In [None]:
# Print the training script (the estimator's entry point) with syntax highlighting.
!pygmentize 'scripts/loss_train.py'

# Create a training job using the `TensorFlow` estimator

The `sagemaker.tensorflow.TensorFlow` estimator handles locating the script mode container, uploading your script to an S3 location, and creating a SageMaker training job.

In [None]:
# Run the accompanying setup script before configuring the estimator.
# NOTE(review): the script's contents are not shown here; presumably it prepares the
# environment for local-mode training — confirm.
!/bin/bash ./setup.sh

In [None]:
import os
import sagemaker
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow

sagemaker_session = sagemaker.Session()

# Resolve the notebook's execution role once and reuse it below.
role = get_execution_role()
region = sagemaker_session.boto_session.region_name

# Toggle between local-mode containers and managed SageMaker instances.
local = False
if local:
    train_instance_type = 'local'
    serve_instance_type = 'local'
else:
    train_instance_type = 'ml.c5.xlarge'
    serve_instance_type = 'ml.m4.xlarge'

# Passed through to the training script as hyperparameters.
hyperparameters = {'epochs': 35, 'data_dir': '/opt/ml/input/data'}

# Each metric definition pairs a metric name with a regex whose capture group
# extracts the value (val_acc / val_loss) from the training log output.
loss_estimator = TensorFlow(entry_point='loss_train.py',
                       source_dir='scripts',
                       train_instance_type=train_instance_type,
                       train_instance_count=1,
                       metric_definitions=[
                           {'Name' : 'validation:acc',
                            'Regex': '.*step.* - val_acc: (\\S+)\n'},
                           {'Name' : 'validation:loss',
                            'Regex': '- val_loss: (.*?) '}],
                       hyperparameters=hyperparameters,
                       role=role,  # Pass notebook role to container
                       framework_version='1.12',
                       py_version='py3',
                       script_mode=True)

# In the training script, you have to save the model in 'saved model' format to use TF serving:
# https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory

In [None]:
# Work out where the three data channels live, then start training.
if local:
    # local mode reads the CSVs straight from the local staging directory
    channel_root = f'file://{data_dir}'
else:
    # upload the files to the s3 bucket
    # NOTE(review): the bucket name is hard-coded; anyone else running this notebook
    # will need to point it at a bucket they own.
    s3_base = sagemaker_session.upload_data(path=data_dir,
                                           bucket='roymark-aws-ml',
                                           key_prefix='loss')
    print(s3_base)
    channel_root = s3_base

# One input channel per split, each pointing at the matching sub-directory.
loss_estimator.fit({channel: f'{channel_root}/{channel}'
                    for channel in ('train', 'test', 'val')})

In [None]:
# deploy following script mode training
loss_predictor = loss_estimator.deploy(initial_instance_count=1, 
                                       instance_type=serve_instance_type,
                                       endpoint_type='tensorflow-serving')

In [None]:
# Send the normalized test features to the deployed endpoint and show the raw response.
results = loss_predictor.predict(X_test)
print('Results: {}\n'.format(results))

In [None]:
# Compare each endpoint prediction with its true label and tally the misses.
tmp_results = results['predictions']
test_count = len(X_test)
fail_count = 0

for i, target in enumerate(y_test[:test_count]):
    # first element of each prediction row is the model's output score
    score = tmp_results[i][0]

    # scores above 0.5 are treated as class 1, everything else as class 0
    class_predict = 1 if score > 0.5 else 0

    if class_predict != target:
        result = '*FAIL'
        fail_count += 1
    else:
        result = 'PASS'

    print('Result: {:.3f}, Target: {}, Result: {}'.format(score, target, result))

print('Tests: {}, Fails: {}'.format(test_count, fail_count))

In [None]:
# Clean up: delete the hosted endpoint when it was deployed to a non-local instance.
if not local:
    sagemaker.Session().delete_endpoint(loss_predictor.endpoint)