# Accelerate Modeling at Scale with SAS Viya

Instead of building models sequentially and tuning each one to perfection, the following code lets a data scientist build hundreds of models simultaneously, testing different algorithms for each micro segment by leveraging the power of SAS Viya.

The code automatically generates and assesses all models per micro segment, selects a champion model and the top challenger model for each micro segment, and then registers all models in a model repository for governance, monitoring, and publishing as needed.

### Import SWAT and Additional Packages

In [14]:
import swat
from swat import *
import requests, json
import pandas as pd
import numpy as np
from IPython.display import display
import params_box
from params_box import impute_params, partition_params, get_model_segments, set_model_params
from matplotlib import pyplot as plt
import base64
import os
import pprint
%matplotlib inline

### SAS Viya CAS Server connection details

In [15]:
# Connection settings handed to the CAS constructor in the next cell.
cashost = 'localhost'    # CAS server host
casport = 5570           # CAS server port
casauth = '~/.authinfo'  # authinfo file used for authentication

### Start CAS Session

In [16]:
# Open the main CAS connection `s` from the host, port and authinfo file set above.
s = CAS(cashost, casport, authinfo=casauth)

### Load CAS Actionsets

In [17]:
# Action sets to load into the main CAS session.
actionsets = [
    'cardinality', 'sampling', 'fedSQL', 'decisionTree',
    'neuralNet', 'regression', 'svm', 'astore',
]
for actionset_name in actionsets:
    s.loadactionset(actionset_name)

NOTE: Added action set 'cardinality'.
NOTE: Added action set 'sampling'.
NOTE: Added action set 'fedSQL'.
NOTE: Added action set 'decisionTree'.
NOTE: Added action set 'neuralNet'.
NOTE: Added action set 'regression'.
NOTE: Added action set 'svm'.
NOTE: Added action set 'astore'.


### Load Data into CAS

In [18]:
indata_dir = "/opt/sas/viya/config/data/cas/default/public"
indata = "asset_failure_prediction"
s.loadactionset(actionset="table")

# Upload the SAS data set only when no table of that name exists yet.
table_check = s.table.tableExists(table=indata)
if not table_check.exists:
    source_file = indata_dir + "/" + indata + ".sas7bdat"
    tbl = s.upload_file(source_file, casout={"name": indata})

NOTE: Added action set 'table'.
NOTE: Cloud Analytic Services made the uploaded file available as table ASSET_FAILUR_PREDICTION in caslib CASUSER(sasdemo).
NOTE: The table ASSET_FAILUR_PREDICTION has been created in caslib CASUSER(sasdemo) from binary data uploaded to Cloud Analytic Services.


### Define Data, Models, and Parameters

In [11]:
# Build the params_box input widget and render it; later cells read the
# user's selections (caslib, table, target, segments, ...) back from `tab`.
tab = params_box.tab()
display(tab)

Tab(children=(VBox(children=(Text(value='', description='Caslib ', placeholder='Specify caslib'), Text(value='…

#### Use the head method to display first few rows of table

In [None]:
# get caslib and table from input
caslib = params_box.caslib(tab)
table = params_box.table(tab)

# create a reference to the selected CAS table (no data is copied locally)
tbl = s.CASTable(name=table, caslib=caslib)
# show the first five rows
tbl.head(5)

#### Print summary statistics using cardinality action set

In [None]:
# run the cardinality action set; the summary is written to CAS table 'data_card'
tbl.cardinality.summarize(cardinality={'name': 'data_card', 'replace': True})
df_data_card = s.CASTable('data_card').to_frame()  # bring the data locally

# split variable names by their type code: "C" (nominal) and "N" (numerical)
is_nominal = df_data_card['_TYPE_'] == "C"
is_numeric = df_data_card['_TYPE_'] == "N"
type_c = list(df_data_card.loc[is_nominal, '_VARNAME_'])
type_n = list(df_data_card.loc[is_numeric, '_VARNAME_'])

# percentage of missing values per variable
df_data_card['_PCTMISS_'] = (df_data_card['_NMISS_'] / df_data_card['_NOBS_']) * 100

# print summary statistics
summary_columns = [
    '_VARNAME_', '_TYPE_', '_PCTMISS_', '_MIN_', '_MAX_',
    '_MEAN_', '_STDDEV_', '_SKEWNESS_', '_KURTOSIS_',
]
print('\n', 'Summary Statistics'.center(90, ' '))
df_data_card[summary_columns].round(2)

#### Plot distributions of variables

In [None]:
# Use the built in hist() method to plot the distribution of every variable.
# The layout reserves a 28 x 5 grid of subplots; the trailing semicolon keeps
# the notebook from echoing the call's return value.
tbl.hist(figsize = (15,75), layout = (28, 5));

### Define up to 2 segment variables, and specify variables to reject

In [None]:
# Render the params_box segment inputs for the shared `tab` widget
# (presumably the segment variables and the variables to reject -- see the heading above).
display(params_box.segments(tab))

In [None]:
# obtain inputs from box
params_map = params_box.get(tab)
setup_values = params_map['setup']

# target variable, the two segment variables, and the variables to reject
target = setup_values['target']
segment1, segment2 = setup_values['seg1'], setup_values['seg2']
rejected = setup_values['rejected']

# display rejected variables
pd.DataFrame(rejected, columns=['Rejected Variables'])

#### Display unique values of target, segment 1 and segment 2 variables

In [None]:
# distinct values of the target variable
target_distinct = tbl[target].unique()

# distinct values of the segment 1 variable
segment1_distinct = tbl[segment1].unique()

df1 = pd.DataFrame(target_distinct, columns=['Unique target Values'])
df2 = pd.DataFrame(segment1_distinct, columns=['Unique ' + segment1])
frames = [df1, df2]

# The second segment variable is optional (the segment-generation cell checks
# `if not segment2`), so only look it up in the table when one was chosen.
if segment2:
    segment2_distinct = tbl[segment2].unique()
    df3 = pd.DataFrame(segment2_distinct, columns=['Unique ' + segment2])
    frames.append(df3)

display(*frames)

#### Remove rejected variables from input variables, and show which variables remain

In [None]:
# every variable reported by the cardinality summary, minus the target
varnames = df_data_card['_VARNAME_'].tolist()
varnames.remove(target)

# keep only the variables that were not rejected
inputs = [name for name in varnames if name not in rejected]

# split the remaining inputs into nominal and numerical variables
inputs_c = [name for name in inputs if name in type_c]
inputs_n = [name for name in inputs if name in type_n]

# display input variables and their type ('C' when nominal, otherwise 'N')
inputs_df = pd.DataFrame(inputs, columns=['Variables'])
inputs_df['Type'] = inputs_df['Variables'].map(lambda name: 'C' if name in inputs_c else 'N')
display(inputs_df)

### Generate Modeling Segments Based on Chosen Variables

In [None]:
# create segments
class1 = tbl[segment1].unique().tolist()

# The second segment variable is optional. Its values are only queried when
# one was chosen; previously the lookup ran before the check and failed for
# an empty selection, so the single-variable branch could never be reached.
if segment2:
    class2 = tbl[segment2].unique().tolist()
    # one segment per combination of the two variables' values
    segments_main = [{segment1: i, segment2: j} for i in class1 for j in class2]
    segment_columns = [segment1, segment2]
else:
    class2 = []
    # one segment per value of the first variable
    segments_main = [{segment1: i} for i in class1]
    segment_columns = [segment1]

# create an id for each segment
for idx, val in enumerate(segments_main):
    val['segment_id'] = idx

# display the segments
segments_pd = pd.DataFrame(segments_main, columns=['segment_id'] + segment_columns)
segments_pd.set_index('segment_id')

#### Define function to: 
#### 1) create a new session for each segment
#### 2) subset the main table to get the segment table 

In [None]:
def create_session(conn, segment, table, segment_var1, segment_var2=None, caslib='Public'):
    """Open a dedicated CAS session for one segment and stage its data.

    A copy of ``conn`` is stored in ``segment['session']`` and the modeling
    action sets are loaded into it. ``table`` is then filtered to the
    segment's value(s), and the 'impute' and 'srs' actions are invoked with
    the parameters built by ``impute_params`` and ``partition_params``.

    Parameters
    ----------
    conn
        Existing CAS connection; ``conn.copy()`` provides the new session.
    segment : dict
        Holds the segment's value under the key ``segment_var1`` (and
        ``segment_var2`` when given). Updated in place with the keys
        'session', 'segment_tbl' and 'non_partind'.
    table : str
        Name of the source table; the staged output is named ``table + '_im'``.
    segment_var1 : str
        Name of the first segment variable.
    segment_var2 : str, optional
        Name of the second segment variable; falsy values mean "not used".
    caslib : str
        Caslib that contains ``table``.

    Returns
    -------
    dict
        The same ``segment`` dict that was passed in.
    """
    def _condition(var):
        # Build "<var> = <value>", quoting string values for the where clause.
        value = segment[var]
        if isinstance(value, str):
            value = '"{}"'.format(value)
        return "{} = {}".format(var, value)

    # create a new session
    new_sess = conn.copy()
    segment['session'] = new_sess

    # load actionsets for new session
    actionsets = ['cardinality', 'sampling', 'fedSQL', 'decisionTree', 'neuralNet', 'svm', 'regression', 'astore']
    for a in actionsets:
        new_sess.invoke('loadactionset', actionset=a)

    # subset main table according to segments, using a where clause.
    # The second condition is only built when a second segment variable is
    # in use; building it unconditionally raised KeyError for segment_var2=None.
    tbl = new_sess.CASTable(name=table, caslib=caslib)
    where_clause = _condition(segment_var1)
    if segment_var2:
        where_clause = where_clause + ' and ' + _condition(segment_var2)
    segment_tbl = tbl.query(where_clause)

    # impute segment
    new_sess.invoke('impute', **impute_params(segment_tbl, out=table+'_im'))

    # partition segment (overwrites the imputed table in place)
    new_sess.invoke('srs', **partition_params(table+'_im', out=table+'_im', replace=True))

    # map the table to the segment
    segment['segment_tbl'] = new_sess.CASTable(table+'_im')
    segment['non_partind'] = segment_tbl

    return segment

### Create Segments, Calculate Number of Observations and Target Event Rate for Each Segment

In [None]:
try:
    # create one session per segment, reading from the caslib chosen in the
    # input box (create_session would otherwise fall back to its 'Public' default)
    for segment in segments_main:
        create_session(s, segment, table, segment1, segment2, caslib=caslib)

    # calculate target event rate for each segment
    for segment in segments_main:
        seg_tbl = segment['segment_tbl']
        # count of non-missing target values for the segment
        segment['count'] = seg_tbl.count()[target]
        # count of target events (target equal to 1) for the segment
        segment['event_count'] = seg_tbl.query("%s = %s" % (target, 1)).count()[target]
        # target event rate, stored as a fraction; empty segments get 0
        if segment['count'] == 0:
            segment['tgt_event_rate'] = 0
        else:
            segment['tgt_event_rate'] = segment['event_count'] / segment['count']
finally:
    # Terminate the per-segment sessions even when a step above failed, so
    # they are not left running on the server. terminate() matches the
    # clean-up used for the model sessions at the end of the notebook.
    for segment in segments_main:
        segment_session = segment.get('session')
        if segment_session is not None:
            segment_session.terminate()
        segment['session'] = None

# display the segments
segments_pd = pd.DataFrame(segments_main, columns=['segment_id', segment1, segment2, 'count', 'tgt_event_rate'])
segments_pd.set_index('segment_id')

#### Reject segments that fall outside the target event rate range or below the minimum number of observations

In [None]:
# get parameters from input box
min_obs = params_box.get(tab)['setup']['min_obs']
event_rate = params_box.get(tab)['setup']['tgt_event_rate']

# a segment is rejected when it has too few observations or when its event
# rate lies outside the [event_rate[0], event_rate[1]] range
for segment in segments_main:
    segment['use'] = not (
        segment['count'] < min_obs
        or segment['tgt_event_rate'] < event_rate[0]
        or segment['tgt_event_rate'] > event_rate[1]
    )

# information about individual segments including count and target event rates
report_columns = ['segment_id', segment1, segment2, 'count', 'tgt_event_rate', 'use']
segments_r_pd = pd.DataFrame(segments_main, columns=report_columns)
segments_r_pd.set_index('segment_id')  # result is discarded; the frame keeps its default index

# display number of segments to be included and excluded
use_flags = segments_r_pd['use']
use_count = use_flags[use_flags == True].count()
exclude_count = use_flags[use_flags == False].count()
print ("Count of segments to be included : " , use_count)
print("Count of segments to be excluded : ", exclude_count)
display(segments_r_pd)

### Train Models in Parallel - Generate One Session Per Model

#### Functions to Train, Score, and Assess

In [None]:
def train_segment(segment):
    """Invoke the training action for one model segment.

    Calls ``invoke`` on ``segment['session']`` with ``segment['train_params']``
    as keyword arguments; nothing is invoked when the parameters are falsy.
    Returns the segment's session in either case.
    """
    session = segment['session']
    params = segment['train_params']
    if not params:
        return session
    session.invoke(**params)
    return session

def score_segment(segment):
    """Invoke the scoring action for one model segment.

    Calls ``invoke`` on ``segment['session']`` with ``segment['score_params']``
    as keyword arguments; nothing is invoked when the parameters are falsy.
    Returns the segment's session in either case.
    """
    session = segment['session']
    params = segment['score_params']
    if not params:
        return session
    session.invoke(**params)
    return session
    
def assess_segment(segment):
    """Invoke the assessment action for one model segment.

    Calls ``invoke`` on ``segment['session']`` with ``segment['assess_params']``
    as keyword arguments; nothing is invoked when the parameters are falsy.
    Returns the segment's session in either case.
    """
    session = segment['session']
    params = segment['assess_params']
    if not params:
        return session
    session.invoke(**params)
    return session

#### Create one CAS session per model per segment
#### Set the training, scoring and assess parameters

In [None]:
# generate all the model-segments
model_segments = get_model_segments(tab, segments_main)

# create one session per model-segment, reading from the caslib chosen in the
# input box (create_session would otherwise fall back to its 'Public' default)
for segment in model_segments:
    create_session(s, segment, table, segment1, segment2, caslib=caslib)

# set the training, scoring, and assessment parameters
for segment in model_segments:
    set_model_params(tab, segment, inputs_c, inputs_n, tgt_type='C')

# display the result
segments_pd = pd.DataFrame(model_segments, columns=['segment_id', segment1, segment2, 'count', 'tgt_event_rate', 'model'])
segments_pd.set_index('segment_id')

### Run Training on All Segments

In [None]:
# start training on every model-segment; the actions are only invoked here,
# their responses are collected below
for segment in model_segments:
    train_segment(segment)

# sessions whose responses have to be read
train_sess = [segment['session'] for segment in model_segments if segment['train_params'] is not None]

# flatten the responses into (session, key, value) triples
train_resp = []
for resp, sess in getnext(*train_sess):
    for k, v in resp:
        train_resp.append((sess, k, v))

# store the training output on the segment that owns the session;
# when a session returned several results the last one is kept
for segment in model_segments:
    for sess, k, v in train_resp:
        if sess == segment['session']:
            segment.update(train_key=k, train_value=v)

### Run Scoring on All Segments

In [None]:
# start scoring on every model-segment; responses are collected below
for segment in model_segments:
    score_segment(segment)

# sessions whose responses have to be read
score_sess = [segment['session'] for segment in model_segments if segment['score_params'] is not None]

# flatten the responses into (session, key, value) triples
score_resp = []
for resp, sess in getnext(*score_sess):
    for k, v in resp:
        score_resp.append((sess, k, v))

# store the scoring output on the segment that owns the session;
# when a session returned several results the last one is kept
for segment in model_segments:
    for sess, k, v in score_resp:
        if sess == segment['session']:
            segment.update(score_key=k, score_value=v)

### Run Assessment on All Segments

In [None]:
# start assessment on every model-segment; responses are collected below
for segment in model_segments:
    assess_segment(segment)

# sessions whose responses have to be read
assess_sess = [segment['session'] for segment in model_segments if segment['assess_params'] is not None]

# flatten the responses into (session, key, value) triples
assess_resp = []
for resp, sess in getnext(*assess_sess):
    for k, v in resp:
        assess_resp.append((sess, k, v))

for segment in model_segments:
    for sess, k, v in assess_resp:
        if sess == segment['session']:
            # only the lift and ROC tables are kept
            if k in ('LIFTInfo', 'ROCInfo'):
                segment[k] = v
            if k == 'ROCInfo':
                # misclassification rate is the complement of accuracy
                v['misclass'] = 1 - v['ACC']
                # rows whose cutoff rounds to 0.5
                at_half = round(v['CutOff'], 2) == 0.5
                segment['misclassification'] = v.loc[at_half, 'misclass'].iloc[0]
                # note: stored as a one-column table, not as a scalar
                segment['ks'] = v[at_half][['KS']]

### Draw Assessment Plots

#### Create ROC Curves per segment

In [None]:
# Disable matplotlib's open-figure warning once; one figure is opened per used segment.
plt.rcParams.update({'figure.max_open_warning': 0})

for m in segments_main:
    # rejected segments get no plot
    if not m['use']:
        continue
    plt.figure(figsize=(16,5))
    # The inner loop variable must not be named `s`: that name holds the main
    # CAS connection and would be overwritten by a model-segment dict.
    for model_seg in model_segments:
        if model_seg['segment_id'] != m['segment_id']:
            continue
        rocinfo = model_seg.get('ROCInfo')
        if rocinfo is None:
            # this model produced no ROC table
            continue
        plt.title('Segment 1: ' + str(m[segment1]) + ' Segment 2: ' + str(m[segment2]))
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.grid(True)
        # one curve per model, all drawn on the segment's figure
        plt.plot(rocinfo["FPR"], rocinfo["Sensitivity"], label=model_seg['model'])
        plt.legend(loc="best")

#### Create Lift Charts per segment

In [None]:
# Disable matplotlib's open-figure warning once; one figure is opened per used segment.
plt.rcParams.update({'figure.max_open_warning': 0})

for m in segments_main:
    # rejected segments get no plot
    if not m['use']:
        continue
    plt.figure(figsize=(16,5))
    # The inner loop variable must not be named `s`: that name holds the main
    # CAS connection and would be overwritten by a model-segment dict.
    for model_seg in model_segments:
        if model_seg['segment_id'] != m['segment_id']:
            continue
        liftinfo = model_seg.get('LIFTInfo')
        if liftinfo is None:
            # this model produced no lift table
            continue
        plt.title('Segment 1: ' + str(m[segment1]) + ' Segment 2: ' + str(m[segment2]))
        plt.xlabel("Depth")
        plt.ylabel("Lift")
        plt.grid(True)
        # one curve per model, all drawn on the segment's figure
        plt.plot(liftinfo["Depth"], liftinfo["Lift"], label=model_seg['model'])
        plt.legend(loc="best")

#### Summary of Results for All Segments

In [None]:
# one summary row per model-segment
assess_res = []
for segment in model_segments:
    assess_res.append({
        'Segment ID': segment['segment_id'],
        segment1: segment[segment1],
        segment2: segment[segment2],
        'Model': segment['model'],
        'Misclassification': segment['misclassification'],
        'Target Event Rate': segment['tgt_event_rate'],
    })

summary_columns = ['Segment ID', segment1, segment2, 'Model', 'Misclassification', 'Target Event Rate']
assess_pd = pd.DataFrame(assess_res, columns=summary_columns)
assess_pd.set_index('Segment ID')

### Find and Display the Champion Model by Misclassification for Each Segment

In [None]:
def find_champion(models):
    """Return the model with the lowest 'misclassification' value.

    ``models`` is a sequence of dicts that each carry a 'misclassification'
    entry. When several models tie, the first one in the sequence is
    returned. Raises ``ValueError`` for an empty sequence.
    """
    return min(models, key=lambda model: model['misclassification'])

champion_list = []

for segment in segments_main:
    if segment['use']:
        # all models trained for this segment's pair of values
        all_models = [
            seg for seg in model_segments
            if segment[segment1] == seg[segment1] and segment[segment2] == seg[segment2]
        ]
        champ = find_champion(all_models)
        champion_list.append({
            segment1: champ[segment1],
            segment2: champ[segment2],
            'Champion Model': champ['model'],
            'Misclassification': champ['misclassification'],
            'Target Event Rate': champ['tgt_event_rate'],
        })

champion_columns = [segment1, segment2, 'Champion Model', 'Misclassification', 'Target Event Rate']
champion_pd = pd.DataFrame(champion_list, columns=champion_columns)
champion_pd

### Find and Display Top Challenger Model by Misclassification for Each Segment

In [None]:
def find_challenger(models):
    """Return the model with the second-lowest 'misclassification' value.

    ``models`` is a sequence of dicts that each carry a 'misclassification'
    entry. The models are ranked by that value and the runner-up is returned.
    Ties keep their original order, so when two models share the lowest
    value the later one is the challenger.

    The previous implementation removed the minimum from a copy of the values
    and then reused the index from that shortened list on ``models``. That
    pointed one position too early whenever the best model came before the
    runner-up, and often returned the champion itself.

    Raises
    ------
    ValueError
        If fewer than two models are supplied.
    """
    if len(models) < 2:
        raise ValueError("find_challenger needs at least two models")
    # Rank positions rather than values so the result indexes `models` directly.
    ranked = sorted(range(len(models)), key=lambda i: models[i]['misclassification'])
    return models[ranked[1]]

challenger_list = []

for segment in segments_main:
    if segment['use']:
        # all models trained for this segment's pair of values
        all_models = [
            seg for seg in model_segments
            if segment[segment1] == seg[segment1] and segment[segment2] == seg[segment2]
        ]
        challenge = find_challenger(all_models)
        challenger_list.append({
            segment1: challenge[segment1],
            segment2: challenge[segment2],
            'Challenger Model': challenge['model'],
            'Misclassification': challenge['misclassification'],
            'Target Event Rate': challenge['tgt_event_rate'],
        })

challenger_columns = [segment1, segment2, 'Challenger Model', 'Misclassification', 'Target Event Rate']
challenger_pd = pd.DataFrame(challenger_list, columns=challenger_columns)
challenger_pd

### Register all models in SAS Model Manager

#### Functions to create SAS Model Manager repository, project folder and project

In [None]:
def createRepository(repoName):
    """Create a model repository named ``repoName``.

    POSTs ``{"name": repoName}`` to ``/modelRepository/repositories`` on
    ``viyahost`` and returns the decoded JSON response body.
    """
    body = json.dumps({"name": repoName})
    request_headers = getHeaders("application/vnd.sas.models.repository+json")
    response = requests.post(viyahost + '/modelRepository/repositories', data=body, headers=request_headers)
    return response.json()

def createProjectFolder(repoName, repositoryFolderId):
    """Create the folder ``<repoName>_project`` beneath a repository folder.

    POSTs to ``/folders/folders`` on ``viyahost`` with ``parentFolderUri``
    pointing at ``repositoryFolderId`` and returns the 'id' field of the
    JSON response.
    """
    endpoint = viyahost + '/folders/folders?parentFolderUri=' + '/folders/folders/' + repositoryFolderId
    body = json.dumps({"name": repoName + "_project"})
    response = requests.post(endpoint, data=body, headers=getHeaders("application/json"))
    created = json.loads(response.content.decode('utf-8'))
    return created['id']

def createProject(segmentName, repositoryID, folderId):
    """Create a project named ``segmentName`` in the given repository and folder.

    POSTs to ``/modelRepository/projects`` on ``viyahost`` and returns the
    'id' field of the JSON response.
    """
    body = json.dumps({
        'name': segmentName,
        'repositoryId': repositoryID,
        'folderId': folderId,
    })
    request_headers = getHeaders('application/vnd.sas.models.project+json')
    response = requests.post(viyahost + '/modelRepository/projects', data=body, headers=request_headers)
    return response.json()['id']

#### Functions to register and set champion model flag in SAS Model Manager

In [None]:
def registerModel(modelName, projectID, folderId):
    """Register a model named ``modelName`` under a project and folder.

    POSTs to ``/modelRepository/models`` on ``viyahost`` and returns the
    decoded JSON response body.
    """
    body = json.dumps({
        'name': modelName,
        'projectId': projectID,
        'folderId': folderId,
    })
    request_headers = getHeaders('application/vnd.sas.models.model+json')
    response = requests.post(viyahost + '/modelRepository/models', data=body, headers=request_headers)
    return response.json()

def setChampionModel(projectID, modelID):
    """Set ``modelID`` as the champion of project ``projectID``.

    POSTs to the project's ``/champion`` endpoint on ``viyahost`` and
    returns the decoded JSON response body.
    """
    endpoint = viyahost + '/modelRepository/projects/' + projectID + '/champion?modelId=' + modelID
    response = requests.post(endpoint, headers=getHeaders('application/json'))
    return response.json()

def clearChampionModel(projectID):
    """Clear the champion flag of project ``projectID``.

    Sends DELETE to the project's ``/champion`` endpoint on ``viyahost``.
    A 204 status yields a dict holding the response object and a success
    message; any other status yields the decoded JSON response body.
    """
    endpoint = viyahost + '/modelRepository/projects/' + projectID + '/champion'
    response = requests.delete(endpoint, headers=getHeaders('application/json'))
    if response.status_code != 204:
        return response.json()
    return {'response': response, 'result': 'Successfully cleared Champion'}

#### Define Model Repository Variables

In [None]:
# project name entered in the input box
project_name = params_box.get(tab)['setup']['proj_name']

# one "<segment1 value>_<segment2 value>" label per segment that is in use
mm_segments = []
for segment in segments_main:
    if segment['use']:
        mm_segments.append(str(segment[segment1]) + '_' + str(segment[segment2]))

# models selected in the input box
mm_models = params_box.get_models(tab)

#### Get OAuth Token

In [None]:
# Helper functions to authenticate through OAuth
def getHeaders(contentType="application/json"):
    """Return request headers carrying ``contentType`` and the bearer token.

    Reads the module-level ``auth_token`` at call time.
    """
    return {
        "Content-Type": contentType,
        "Authorization": "Bearer " + auth_token,
    }

def getAuthToken(url, user, password, auth):
    """Request an OAuth access token with the password grant.

    Parameters
    ----------
    url : str
        Base URL of the server; ``/SASLogon/oauth/token`` is appended.
    user, password : str
        Credentials sent as form fields.
    auth : str
        Base64-encoded ``client_id:client_secret`` used for HTTP Basic
        authentication of the client.

    Returns
    -------
    str
        The 'access_token' field of the JSON response.
    """
    headers = {'Accept': 'application/json',
               'Content-Type': 'application/x-www-form-urlencoded',
               'Authorization': 'Basic ' + auth}
    # Pass the form fields as a mapping so requests URL-encodes them. Joining
    # them by hand corrupted user names or passwords that contain characters
    # such as '&', '+', '=' or '%'.
    payload = {'grant_type': 'password', 'username': user, 'password': password}
    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept so self-signed demo hosts keep working, but should not be used
    # against a production server.
    authReturn = requests.post(url + '/SASLogon/oauth/token', data=payload, headers=headers, verify=False)
    return authReturn.json()['access_token']

# get OAuth Token
# The connection details and credentials can be supplied through environment
# variables so they need not live in the notebook. The fallbacks are the
# original demo values, so behavior is unchanged when no variable is set.
# NOTE(review): remove the fallback secrets before sharing this notebook.
viya_url = os.environ.get('VIYA_URL', 'http://localhost')
viya_user = os.environ.get('VIYA_USER', 'sasdemo')
viya_password = os.environ.get('VIYA_PASSWORD', 'Orion123')
viya_client = os.environ.get('VIYA_CLIENT_ID', 'pipeline_script')
viya_client_secret = os.environ.get('VIYA_CLIENT_SECRET', 'sastronauts')

# HTTP Basic credential for the OAuth client: base64("client_id:client_secret")
client_auth = base64.b64encode((viya_client + ':' + viya_client_secret).encode()).decode()
auth_token = getAuthToken(viya_url, viya_user, viya_password, client_auth)

### Terminate All CAS Sessions

In [None]:
# End the CAS session that was opened for each model-segment.
for segment in model_segments:
    segment['session'].terminate()