# Generation of performance files via an XGBOOST model (pickle file)

In [1]:
import os
import pandas as pd
import numpy as np
import xgboost
from sklearn.base import TransformerMixin
from sklearn.tree import DecisionTreeClassifier

from sklearn import model_selection
from sklearn.metrics import accuracy_score
from swat import *
from swat.render import render_html
from pprint import pprint
from matplotlib import pyplot as plt
import pandas as pd
import sys
%matplotlib inline

# Directory that holds the monitoring CSV files and the pickled model.
datadir = '/home/sasdemo01/GitHub/Notebooks/Python_LifeCycle_Management/'

# CAS connection settings: host, port, and the authinfo file handed to CAS.
cashost, casport = 'localhost', 5570
casauth = '/home/sasdemo01/.authinfo'

# Open the CAS session used by the later cells, with "public" as its caslib.
sess = CAS(
    cashost,
    casport,
    authinfo=casauth,
    caslib="public",
)
print(sess)

CAS('localhost', 5570, 'sasdemo01', protocol='cas', name='py-session-1', session='68e274d2-8105-0349-8850-32c4b610f20e')


In [2]:
class DataFrameImputer(TransformerMixin):
    """Impute missing values in a pandas DataFrame.

    Columns of dtype object are imputed with the most frequent value
    in the column.

    Columns of other types are imputed with the mean of the column.

    The per-column fill values are learned by ``fit`` and stored in
    ``self.fill``; ``transform`` applies them with ``DataFrame.fillna``.
    """

    def __init__(self):
        """Create an imputer; there are no parameters to configure."""

    def fit(self, X, y=None):
        """Compute one fill value per column of ``X``.

        Parameters:
            X: DataFrame whose columns are inspected.
            y: Ignored; accepted so the signature matches ``transform``.

        Returns:
            This instance, with ``self.fill`` set to a Series indexed by
            ``X.columns``.
        """
        fill_values = []
        for c in X:
            column = X[c]
            if column.dtype == np.dtype('O'):
                # value_counts() drops missing values by default, so a
                # column with no observed values yields an empty result.
                # Use NaN as the fill value in that case (the column is
                # left unchanged by transform) instead of raising
                # IndexError on index[0].
                counts = column.value_counts()
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(column.mean())

        self.fill = pd.Series(fill_values, index=X.columns)

        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values replaced by the fitted fill values.

        Parameters:
            X: DataFrame to impute.
            y: Ignored.

        Returns:
            The result of ``X.fillna(self.fill)``.
        """
        return X.fillna(self.fill)

In [3]:
# Base names (without extension) of the monitoring CSV files under datadir.
data_sets = [
    'HMEQ_MONITOR_1',
    'HMEQ_MONITOR_2',
    'HMEQ_MONITOR_3',
    'HMEQ_MONITOR_4',
    'HMEQ_MONITOR_ALL',
]

targetvar = ['BAD']
inputvars = [
    'LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG',
    'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC',
]

# d maps each data set's position in data_sets to its raw DataFrame;
# X maps the same position to the imputed input columns to be scored.
d = {}
X = {}
for idx, ds in enumerate(data_sets):
    csv_path = datadir + ds + ".csv"
    # The literal string 'NULL' in the file is read as a missing value.
    d[idx] = pd.read_csv(csv_path, na_values=['NULL'])
    Y = d[idx][targetvar]
    X0 = d[idx][inputvars]
    # Fill the missing values in the input columns.
    X[idx] = DataFrameImputer().fit_transform(X0)

### Import the Python XGBOOST Model

In [4]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
# The context manager closes the file handle once the model is loaded
# (the bare open() call previously left it open).
with open(datadir + "PythonModelXGBOOST/train/xgboost_model_obj.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

### Score New Data in Python

In [5]:
# Score every monitoring data set and keep the original rows together with
# the predicted probability columns, keyed by data set name.
ds_names = {}
for idx, ds in enumerate(data_sets):
    # Column names assume predict_proba returns two columns ordered
    # (class 0, class 1) -- TODO confirm against the trained model.
    probabilities = pd.DataFrame(
        loaded_model.predict_proba(X[idx]),
        columns=["P_BAD0", "P_BAD1"],
    )
    ds_names[ds] = pd.concat(
        [d[idx], probabilities], axis=1, ignore_index=False
    )

In [6]:
# Upload each scored data set to the Public caslib unless tableExists
# reports that a table with that name is already there.
for idx, ds in enumerate(data_sets):
    table_name = "{}".format(ds)
    results = sess.table.tableExists(caslib="Public", name=table_name)
    if results.exists != 0:
        # Table already present; nothing to upload.
        continue
    sess.upload_frame(
        ds_names[ds],
        casout={"caslib": "Public", "promote": True, "name": table_name},
    )

NOTE: Cloud Analytic Services made the uploaded file available as table HMEQ_MONITOR_1 in caslib Public.
NOTE: The table HMEQ_MONITOR_1 has been created in caslib Public from binary data uploaded to Cloud Analytic Services.
NOTE: Cloud Analytic Services made the uploaded file available as table HMEQ_MONITOR_2 in caslib Public.
NOTE: The table HMEQ_MONITOR_2 has been created in caslib Public from binary data uploaded to Cloud Analytic Services.
NOTE: Cloud Analytic Services made the uploaded file available as table HMEQ_MONITOR_3 in caslib Public.
NOTE: The table HMEQ_MONITOR_3 has been created in caslib Public from binary data uploaded to Cloud Analytic Services.
NOTE: Cloud Analytic Services made the uploaded file available as table HMEQ_MONITOR_4 in caslib Public.
NOTE: The table HMEQ_MONITOR_4 has been created in caslib Public from binary data uploaded to Cloud Analytic Services.
NOTE: Cloud Analytic Services made the uploaded file available as table HMEQ_MONITOR_ALL in caslib Public.

In [7]:
# Terminate the CAS session held in `sess` now that the uploads are done.
sess.terminate()