# SAS Innovate Workbench Workshop

## 1. Importing Packages

In [None]:
# %pip install imbalanced-learn
# %pip install python-dotenv
# %pip install sasctl

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import requests

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score

from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sasviya.ml.tree import ForestClassifier
from sasviya.ml.tree import GradientBoostingClassifier as sas_GradientBoostingClassifier

## 2. Data Preparation

In [None]:
# List all entries in the current working directory
files = os.listdir()

# Keep only the CSV files. os.listdir() returns names in arbitrary order, so
# sort them: the load step picks the first entry and must choose the same
# file on every run and on every machine.
csv_files = sorted(f for f in files if f.endswith('.csv'))

print(csv_files)

In [None]:
# Read the first discovered CSV file into the working DataFrame
first_csv_path = csv_files[0]
bank = pd.read_csv(first_csv_path)

In [None]:
# Preview the first rows to sanity-check the load
bank.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Report the dataset dimensions as (rows, columns)
print(f"Bank_Train data shape: {bank.shape}")

Note: the Score set is bigger than the Train set.

*DISCUSSION:* Can consider adding synthetic data to Train set.

In [None]:
# Column names, dtypes and non-null counts
bank.info()

In [None]:
# Convert these columns to pandas' category dtype
for column_name in ("AccountID", "Status", "Customer_Value", "Home_Flag", "Activity_Status"):
    bank[column_name] = bank[column_name].astype('category')

In [None]:
# Number of missing values per column
bank.isna().sum()

Only Demog_Age and AvgSale3Yr_DP have missing data, but they are missing 20-25% of their values.

In [None]:
# Count rows that exactly repeat an earlier row (all columns equal)
bank.duplicated().sum()

In [None]:
# Summary statistics, transposed so each described column becomes a row
bank.describe().T

In [None]:
def plot_variable_types(df):
    """Show the distribution of every column of ``df`` in two figures.

    The first figure holds one 20-bin histogram per ``float64``/``int64``
    column; the second holds one bar chart of value counts per
    ``object``/``category`` column. Both figures use a two-column grid and
    are displayed immediately with ``plt.show()``.

    Args:
        df: pandas DataFrame whose columns are plotted.
    """
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    label_columns = df.select_dtypes(include=['object', 'category']).columns

    # Figure 1: histograms of the numeric columns
    numeric_fig = plt.figure(figsize=(20, 50))
    numeric_rows = len(numeric_columns) // 2 + 1
    for position, name in enumerate(numeric_columns, start=1):
        ax = numeric_fig.add_subplot(numeric_rows, 2, position)
        ax.hist(df[name], bins=20, edgecolor='black')
        ax.set_title(f'{name} Distribution (Histogram)')
        ax.set_xlabel(name)
        ax.set_ylabel('Frequency')
    numeric_fig.tight_layout()
    plt.show()

    # Figure 2: value-count bar charts of the categorical columns
    label_fig = plt.figure(figsize=(20, 10))
    label_rows = len(label_columns) // 2 + 1
    for position, name in enumerate(label_columns, start=1):
        ax = label_fig.add_subplot(label_rows, 2, position)
        counts = df[name].value_counts()
        ax.bar(counts.index, counts.values, color='skyblue')
        ax.set_title(f'{name} Distribution (Bar Chart)')
        ax.set_xlabel(name)
        ax.set_ylabel('Count')
        # Tilt the category labels so long names do not overlap
        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
    label_fig.tight_layout()
    plt.show()

In [None]:
# Draw the histogram and bar-chart figures for the bank data
plot_variable_types(bank)

* Age is skewed to the left, and some customers are under 18 - consider dropping them unless they are high-worth customers.
* AvgSale3Yr, AvgSale3Yr_DP, AvgSaleLife and LastProdAmt are all highly concentrated in one bucket with a small number of positive outliers.
* Cnt1Yr_DP, CntPur3Yr, CntPur3Yr_DP, CntPurLife, CntPurLife_DP, CntTotPromo, CustTenure are all skewed to the right.
* Imbalanced dataset, with about 20% of data buying the insurance product and 80% not.

## 4. Data Wrangling

### Imputation

In [None]:
# Mean-impute the numeric columns; non-numeric columns are left untouched.
# NOTE(review): the means are computed on the full dataset, before the
# train/validation split, so validation rows influence the fill values.

# Selecting only numeric columns
numeric_cols = bank.select_dtypes(include=np.number).columns

# Calculate mean of numeric columns
mean_values = bank[numeric_cols].mean()

# Fill missing values in numeric columns with their respective means
bank[numeric_cols] = bank[numeric_cols].fillna(mean_values)

### Label encoding

* One-hot encoding was not used because we want to preserve the ordinality of the variables.

In [None]:
# Select categorical columns excluding 'Activity_Status' and 'Customer_Value',
# which receive an explicit ordinal mapping further down
categorical_cols = [
    col for col in bank.select_dtypes(include=['category']).columns
    if col not in ('Activity_Status', 'Customer_Value')
]

# Fit a separate LabelEncoder for each column and keep all of them, so the
# original labels can be recovered later with
# label_encoders[col].inverse_transform(...). A single shared encoder is
# re-fitted on every column and only remembers the classes of the last one.
label_encoder = LabelEncoder()
label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    bank[col] = label_encoder.fit_transform(bank[col])
    label_encoders[col] = label_encoder

# Explicit ordinal codes for 'Activity_Status' and 'Customer_Value'
label_encoding = {
    'Activity_Status': {'High': 0, 'Average': 1, 'Low': 2},
    'Customer_Value': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
}

# Apply mapping
for col, mapping in label_encoding.items():
    encoded = bank[col].map(mapping)
    # A value without a code becomes NaN and cannot be cast to int64; report
    # the offending values instead of failing with an opaque casting error.
    if encoded.isna().any():
        unmapped = sorted(set(bank.loc[encoded.isna(), col].dropna().astype(str)))
        raise ValueError(f"Unmapped or missing values in {col!r}: {unmapped}")
    bank[col] = encoded.astype("int64")

In [None]:
# Re-inspect dtypes and non-null counts after encoding and imputation
bank.info()

### Splitting into train and test sets

In [None]:
# Target is the 'Status' column; the features are every remaining column.
# NOTE(review): the encoded 'AccountID' column stays in the feature matrix -
# confirm that this is intended.
y = bank['Status']
x = bank.drop(columns=['Status'])

In [None]:
# Hold out 20% of the rows for validation. Stratify on the target so both
# splits keep the same class proportions (the target is imbalanced).
X_train, X_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Print the shape of each split: train features, validation features,
# train target, validation target
for split_part in (X_train, X_valid, y_train, y_valid):
    print(split_part.shape)

## 5. Modelling

### python sklearn models

* Decision Tree
* Random Forest
* Gradient Boosting

In [None]:
# Decision Tree, max 3 layers.
# random_state is fixed (matching the other models in this notebook) because
# scikit-learn randomly permutes the features at each split, so an unseeded
# tree can differ from run to run.
dt = DecisionTreeClassifier(max_depth=3, random_state=42)

dt.fit(X_train, y_train)

# Accuracy on the training rows and on the held-out validation rows
train_accuracy = accuracy_score(y_train, dt.predict(X_train))
test_accuracy = accuracy_score(y_valid, dt.predict(X_valid))

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

In [None]:
# Recall per class (plus the aggregate rows) for the decision tree on the
# validation set
test_report = classification_report(
    y_valid,
    dt.predict(X_valid),
    output_dict=True,
)
recall_column = pd.DataFrame(test_report).T["recall"]
recall_column

In [None]:
# Horizontal bar chart of the eight most important decision-tree features
dt_feat = pd.DataFrame(
    data=dt.feature_importances_,
    index=X_train.columns,
    columns=['feat_importance'],
)
top_features = dt_feat.sort_values(by='feat_importance').tail(8)
top_features.plot.barh()
plt.show()

In [None]:
# Random forest trained on SMOTE-oversampled data

model = RandomForestClassifier(random_state=42, class_weight='balanced')

# The resampling step keeps its label 'SMOTEENN' although it wraps plain SMOTE.
pipeline = Pipeline(steps=[
    ('SMOTEENN', SMOTE(random_state=42)),
    ('classifier', model),
])
pipeline.fit(X_train, y_train)

# Predict the validation rows once and reuse the result for both metrics
valid_predictions = pipeline.predict(X_valid)

train_accuracy = accuracy_score(y_train, pipeline.predict(X_train))
test_accuracy = accuracy_score(y_valid, valid_predictions)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

test_report = classification_report(y_valid, valid_predictions, output_dict=True)
print(pd.DataFrame(test_report).T["recall"])

In [None]:
# Gradient boosting trained on SMOTE-oversampled data

gb_settings = {
    'n_estimators': 100,
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 0.7,
    'min_samples_leaf': 1,
    'random_state': 42,
}
model = GradientBoostingClassifier(**gb_settings)

# The resampling step keeps its label 'SMOTEENN' although it wraps plain SMOTE.
pipeline = Pipeline(steps=[
    ('SMOTEENN', SMOTE(random_state=42)),
    ('classifier', model),
])
pipeline.fit(X_train, y_train)

# Predict the validation rows once and reuse the result for both metrics
valid_predictions = pipeline.predict(X_valid)

train_accuracy = accuracy_score(y_train, pipeline.predict(X_train))
test_accuracy = accuracy_score(y_valid, valid_predictions)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

test_report = classification_report(y_valid, valid_predictions, output_dict=True)
print(pd.DataFrame(test_report).T["recall"])

### SAS Models

* Forest
* Gradient Boosting

In [None]:
# SAS gradient boosting trained on SMOTE-oversampled data

sas_gb_settings = {
    'n_estimators': 100,
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 0.7,
    'min_samples_leaf': 1,
    'random_state': 42,
}
# sasviya implementation - not to be confused with sklearn GB
model = sas_GradientBoostingClassifier(**sas_gb_settings)

# The resampling step keeps its label 'SMOTEENN' although it wraps plain SMOTE.
pipeline2 = Pipeline(steps=[
    ('SMOTEENN', SMOTE(random_state=42)),
    ('classifier', model),
])
pipeline2.fit(X_train, y_train)

# Predict the validation rows once and reuse the result for both metrics
valid_predictions = pipeline2.predict(X_valid)

train_accuracy = accuracy_score(y_train, pipeline2.predict(X_train))
test_accuracy = accuracy_score(y_valid, valid_predictions)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

test_report = classification_report(y_valid, valid_predictions, output_dict=True)
pd.DataFrame(test_report).T["recall"]

In [None]:
# SAS forest (ForestClassifier) trained on SMOTE-oversampled data

model = ForestClassifier(random_state=70)

# The resampling step keeps its label 'SMOTEENN' although it wraps plain SMOTE.
pipeline2 = Pipeline(steps=[
    ('SMOTEENN', SMOTE(random_state=42)),
    ('classifier', model),
])
pipeline2.fit(X_train, y_train)

# Predict the validation rows once and reuse the result for both metrics
valid_predictions = pipeline2.predict(X_valid)

train_accuracy = accuracy_score(y_train, pipeline2.predict(X_train))
test_accuracy = accuracy_score(y_valid, valid_predictions)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

test_report = classification_report(y_valid, valid_predictions, output_dict=True)
pd.DataFrame(test_report).T["recall"]

## 6. Model Deployment

### Exporting pipeline objects to .pkl files

In [None]:
# sklearn model: write the fitted pipeline to disk, drop the in-memory copy,
# then read it back to confirm the pickle round trip works
with open('sklearn_pipeline.pkl', 'wb') as out_file:
    pickle.dump(pipeline, out_file)

del pipeline

# Restore the pipeline object from the file that was just written
with open('./sklearn_pipeline.pkl', mode='rb') as in_file:
    pipeline = pickle.load(in_file)
pipeline

In [None]:
# SAS model
# dump the pipeline object
with open('sas_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline2, file)

# Drop the in-memory SAS pipeline (pipeline2, not the sklearn `pipeline`) so
# the reload below demonstrably restores it from disk and the sklearn
# pipeline stays available.
del pipeline2

# load the pipeline object
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('./sas_pipeline.pkl', mode='rb') as file:
    pipeline2 = pickle.load(file)
pipeline2

### Deploying to SAS Model Manager

In [None]:
# Base URL of the SAS Viya environment and its OAuth token endpoint.
# Change to your own Viya server and preferred authentication method.
url = "https://apgtps2demo.gtp.unx.sas.com"
auth_url = url + "/SASLogon/oauth/token"

In [None]:
import sasctl
from sasctl import Session
import requests

In [None]:
# Read the long-lived refresh token from a text file. The path defaults to
# 'refresh_token.txt' in the working directory and can be overridden with the
# VIYA_REFRESH_TOKEN_FILE environment variable. Keep the token out of the
# notebook and out of version control.
refresh_token_path = os.environ.get("VIYA_REFRESH_TOKEN_FILE", "refresh_token.txt")
with open(refresh_token_path) as token_file:
    refresh_token = token_file.read().strip()

# Passing a dict lets requests form-encode the values, so special characters
# in the token cannot corrupt the request body.
payload = {
    'grant_type': 'refresh_token',
    'refresh_token': refresh_token,
}
headers = {
  'Accept': 'application/json',
  'Content-Type': 'application/x-www-form-urlencoded',
  'Authorization': 'Basic c2FzLmNsaTo=',
}

# NOTE(review): verify=False disables TLS certificate validation; prefer
# pointing `verify` at the server's CA bundle outside of a demo environment.
response = requests.post(auth_url, headers=headers, data=payload, verify=False, timeout=30)
# Fail here with the HTTP status instead of a KeyError on 'access_token' below
response.raise_for_status()
access_token = response.json()['access_token']

# Reuse the server URL configured above rather than repeating the literal
st = Session(url, token=access_token, verify_ssl=False)
st