***
# Modelling in Python using SAS Viya
***

This notebook is a modelling pipeline that builds a Python model.

In [1]:
import sys
import os
import warnings
import swat
import pandas as pd

In [2]:
# Open the CAS session. Connection details are read from the environment so
# that credentials are not stored in the notebook. Host, port and user fall
# back to the workshop defaults when the variables are unset.
# NOTE(review): when CAS_PASSWORD is unset, None is passed and SWAT is expected
# to fall back to its own credential lookup (e.g. an authinfo file) - confirm
# for the SWAT version in use.
conn = swat.CAS(
    os.environ.get('CAS_HOST', 'server'),
    int(os.environ.get('CAS_PORT', '8777')),
    os.environ.get('CAS_USER', 'student'),
    os.environ.get('CAS_PASSWORD'),
)

### Loading the data

In [3]:
# Upload the local HMEQ CSV to the CAS server as table HMEQ in the casuser
# caslib, replacing any table of the same name.
hmeq_cas_out = dict(caslib='casuser', name='HMEQ', replace=True)
indata = conn.upload_file(
    '/home/student/SGF20VIY/hmeq.csv',
    casOut=hmeq_cas_out,
)

NOTE: Cloud Analytic Services made the uploaded file available as table HMEQ in caslib CASUSER(student).
NOTE: The table HMEQ has been created in caslib CASUSER(student) from binary data uploaded to Cloud Analytic Services.


In [4]:
# Bring the CAS table into a local pandas DataFrame.
input_df = indata.to_frame()

# Column roles used throughout the notebook.
inputs_nominal = ['JOB', 'REASON']
inputs_interval = ['LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC']
target = 'BAD'

# Coerce the interval inputs and the target to numeric values. Anything that
# cannot be parsed becomes NaN, and floats are downcast to the smallest
# float dtype that fits.
numeric_columns = inputs_interval + [target]
input_df[numeric_columns] = input_df[numeric_columns].apply(
    pd.to_numeric, errors='coerce', downcast='float'
)

In [5]:
# Fill missing interval values with the mean of their column.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/validation split, and the modelling pipelines below impute again with
# SimpleImputer - consider dropping this step so that validation rows do not
# influence the imputation values.
interval_means = input_df[inputs_interval].mean()
input_df[inputs_interval] = input_df[inputs_interval].fillna(interval_means)

# Preview the first rows after imputation.
input_df.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1.0,1100.0,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366669,1.0,9.0,33.77993
1,1.0,1300.0,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833336,0.0,14.0,33.77993
2,1.0,1500.0,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.46666,1.0,10.0,33.77993
3,1.0,1500.0,73760.9375,101776.15625,,,8.92227,0.25457,0.449442,179.766449,1.186055,21.296097,33.77993
4,0.0,1700.0,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333336,0.0,14.0,33.77993


***
# Model Development
***

### Training Python models
---
Training and comparing a logistic regression model and a gradient boosting model (both from scikit-learn):
* Partitioned into 70% training and 30% validation
* Data imputed using the mean for numerical values and the mode for categorical values
* Categorical variables dummy encoded
* Comparison of misclassification rates to determine the champion

In [6]:
from matplotlib import pyplot as plt

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

In [7]:
# Split into training and validation partitions: 30% of the rows are held out
# for validation, and the fixed seed makes the split reproducible.
features = input_df.drop(columns=target)
labels = input_df[target]
X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.3, random_state=54321
)

In [8]:
# Preprocessing shared by both models.

# Interval inputs: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Nominal inputs: fill missing values with the most frequent level, then
# one-hot encode; levels not seen during fitting are ignored at predict time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, inputs_interval),
    ('cat', categorical_transformer, inputs_nominal),
])

In [9]:
# Logistic regression on the preprocessed inputs.
lr_steps = [
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(solver='lbfgs', random_state=54321)),
]
lr = Pipeline(steps=lr_steps)

# Fit on the training partition; the fitted pipeline is the cell output.
lr.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               

In [10]:
# Gradient boosting on the preprocessed inputs.
# The target is a binary flag, so a classifier is used rather than a
# regressor: predict() then returns class labels directly, and class
# probabilities are available through predict_proba().
# NOTE(review): `preprocessor` is the same object used by the logistic
# regression pipeline, so fitting here refits it in place. Both pipelines are
# fitted on the same training data.
bst = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingClassifier(
        max_depth=20, n_estimators=100, learning_rate=0.5, random_state=54321)),
])

# Fit on the training partition; the fitted pipeline is the cell output.
bst.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               

In [11]:
# Preview the first ten rounded validation predictions of the logistic
# regression. A Pipeline has no round() method, so the rounding is applied to
# the array returned by predict().
lr.predict(X_valid).round()[:10]

In [None]:
# Misclassification rate = 1 - accuracy on the validation partition.
# Predictions are rounded so that they can be compared with the 0/1 target.
lr_predicted = lr.predict(X_valid).round()
bst_predicted = bst.predict(X_valid).round()

lr_misclassification = 1 - accuracy_score(y_valid.array, lr_predicted)
bst_misclassification = 1 - accuracy_score(y_valid.array, bst_predicted)

print(
    'Misclassification Rates\n',
    'GBoost:', bst_misclassification, '\n',
    'Logistic Regression:', lr_misclassification,
)

### Assess Model Performance
---
Model diagnostics of Python gradient boosting and logistic regression:
* Compute false positive rate and true positive rate
* Calculate the area under the curve
* Plot ROC curve

In [None]:
def _positive_class_scores(model, X):
    """Return a continuous score for the positive class.

    ROC curves need scores rather than hard labels. Class probabilities are
    preferred, then the decision function; predict() is the fallback for
    models (such as regressors) that expose neither.
    """
    if hasattr(model, 'predict_proba'):
        # Column 1 holds the probability of the second class in classes_.
        return model.predict_proba(X)[:, 1]
    if hasattr(model, 'decision_function'):
        return model.decision_function(X)
    return model.predict(X)


# Models to compare; the label is used in the plot legend.
models = [{'label': 'Gradient Boosting', 'model': bst},
          {'label': 'Logistic Regression', 'model': lr}]

plt.figure(figsize=(7, 5))
for m in models:
    model = m['model']
    # Score the validation partition once and reuse it for the curve and AUC.
    scores = _positive_class_scores(model, X_valid)
    fpr, tpr, thresholds = metrics.roc_curve(y_valid, scores)
    auc = metrics.roc_auc_score(y_valid, scores)
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], auc))

# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (using validation data)')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Close the CAS connection opened at the top of the notebook.
conn.close()