# SAS Innovate Workbench Workshop

## 1. Importing Packages

In [24]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score

from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE

from sasviya.ml.tree import ForestClassifier

# 2. Data preparation

In [2]:
# List all files in the directory
files = os.listdir()

# Filter files that end with .csv
csv_files = [f for f in files if f.endswith('.csv')]

print(csv_files)

['bank.csv', 'Bank_Score.csv', 'Bank_Train.csv']


In [3]:
# Load CSV file as a DataFrame
bank = pd.read_csv(csv_files[0])

In [4]:
# displaying dataframe
bank.head()

Unnamed: 0,AccountID,Status,Customer_Value,Age,Home_Flag,Homeval,Inc,Pr,Activity_Status,AvgSale3Yr,...,AvgSale3Yr_DP,LastProdAmt,CntPur3Yr,CntPurLife,CntPur3Yr_DP,CntPurLife_DP,CntTotPromo,MnthsLastPur,Cnt1Yr_DP,CustTenure
0,5200000001,1,A,,0,57600,52106,24,High,5.71,...,5.25,10.0,7,22,4,6,20,5,9,92
1,5200000002,1,A,,0,57587,52106,24,High,5.71,...,5.25,10.0,7,22,4,6,20,5,9,92
2,5200000003,1,A,,0,44167,42422,0,High,12.8,...,13.0,12.0,5,16,3,8,27,16,11,91
3,5200000004,0,A,68.0,0,90587,59785,32,High,21.6,...,20.0,25.0,5,21,2,7,19,15,9,123
4,5200000005,0,A,,0,100313,0,0,High,7.33,...,7.6,10.0,6,38,5,19,13,24,6,128


## 3. Exploratory Data Analysis (EDA)

In [None]:
# understanding datasets dimensions
print("Bank_Train data shape:", bank.shape)

Note Score set is bigger than Train.

*DISCUSSION:* Can consider adding synthetic data to Train set.

In [None]:
# understanding column format
bank.info()

In [None]:
# parsing categorical variables
bank["AccountID"] = bank["AccountID"].astype('category')
bank["Status"] = bank["Status"].astype('category')
bank["Customer_Value"] = bank["Customer_Value"].astype('category')
bank["Home_Flag"] = bank["Home_Flag"].astype('category')
bank['Activity_Status'] = bank['Activity_Status'].astype('category')

In [None]:
# missing variable exploration
bank.isna().sum()

Only Demog_Age and AvgSale3Yr_DP have missing data, but missing 20-25% of data.

In [None]:
# finding duplicated data
bank.duplicated().sum()

In [None]:
# summary statistics
bank.describe().T

In [None]:
def plot_variable_types(df):
    # Separate continuous and categorical variables
    continuous_vars = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_vars = df.select_dtypes(include=['object', 'category']).columns

    # Plot histograms for continuous variables in one figure
    plt.figure(figsize=(20, 50))
    for i, col in enumerate(continuous_vars):
        plt.subplot(len(continuous_vars)//2 + 1, 2, i + 1)
        plt.hist(df[col], bins=20, edgecolor='black')
        plt.title(f'{col} Distribution (Histogram)')
        plt.xlabel(col)
        plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

    # Plot bar charts for categorical variables in another figure
    plt.figure(figsize=(20, 10))
    for i, col in enumerate(categorical_vars):
        plt.subplot(len(categorical_vars)//2 + 1, 2, i + 1)
        value_counts = df[col].value_counts()
        plt.bar(value_counts.index, value_counts.values, color='skyblue')
        plt.title(f'{col} Distribution (Bar Chart)')
        plt.xlabel(col)
        plt.ylabel('Count')
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
plot_variable_types(bank)

* Age is skewed to the left, have some customers under 18- consider dropping unless they are high worth customers.
* AvgSale3Yr, AvgSale3Yr_DP, AvgSaleLife and LastProdAmt are all highly concentrated in one bucket with samll number of positive outliers.
* Cnt1Yr_DP, CntPur3Yr, CntPur3Yr_DP, CntPurLife, CntPurLife_DP, CntTotPromo, CustTenure are all skewed to the right.
* Imbalanced dataset, with about 20% of data buying the insurance product and 80% not.

## 4. Data Wrangling

### Imputation

In [5]:
# Selecting only numeric columns
numeric_cols = bank.select_dtypes(include=np.number).columns

# Calculate mean of numeric columns
mean_values = bank[numeric_cols].mean()

# Fill missing values in numeric columns with their respective means
bank[numeric_cols] = bank[numeric_cols].fillna(mean_values)

### Lable encoding

* did not use one-hot encoding because want to preserve ordinality of variables

In [6]:
# Select categorical columns excluding 'Activity_Status' and 'Customer_Value'
categorical_cols = [col for col in bank.select_dtypes(include=['object']).columns if col != 'Activity_Status' and col != 'Customer_Value']

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode each categorical column using LabelEncoder
for col in categorical_cols:
    bank[col] = label_encoder.fit_transform(bank[col])

# Mapping for 'Activity_Status' and 'Customer_Value'
label_encoding = {
    'Activity_Status': {'High': 0, 'Average': 1, 'Low': 2},
    'Customer_Value': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
}

# Apply mapping
for col, mapping in label_encoding.items():
    bank[col] = bank[col].map(mapping)

In [7]:
# checking result of parsing and imputation
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108600 entries, 0 to 108599
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   AccountID        108600 non-null  int64  
 1   Status           108600 non-null  int64  
 2   Customer_Value   108600 non-null  int64  
 3   Age              108600 non-null  float64
 4   Home_Flag        108600 non-null  int64  
 5   Homeval          108600 non-null  int64  
 6   Inc              108600 non-null  int64  
 7   Pr               108600 non-null  int64  
 8   Activity_Status  108600 non-null  int64  
 9   AvgSale3Yr       108600 non-null  float64
 10  AvgSaleLife      108600 non-null  float64
 11  AvgSale3Yr_DP    108600 non-null  float64
 12  LastProdAmt      108600 non-null  float64
 13  CntPur3Yr        108600 non-null  int64  
 14  CntPurLife       108600 non-null  int64  
 15  CntPur3Yr_DP     108600 non-null  int64  
 16  CntPurLife_DP    108600 non-null  int6

### Splitting into train and test sets

In [8]:
x = bank.drop(['Status'], axis=1)
y = bank['Status']

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

In [10]:
# checking split results
print(X_train.shape)
print(X_valid.shape)
print(y_train.shape)
print(y_valid.shape)

(86880, 20)
(21720, 20)
(86880,)
(21720,)


## 5. Modelling

### python sklearn models

* Decision Tree
* Random Forest
* Gradient Boosting

In [11]:
# Decision Tree, max 3 layers
dt = DecisionTreeClassifier(max_depth=3)

dt.fit(X_train, y_train)

train_accuracy = accuracy_score(y_train, dt.predict(X_train))
test_accuracy = accuracy_score(y_valid, dt.predict(X_valid))

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Train Accuracy: 0.8302255985267035
Test Accuracy: 0.8246777163904235


In [12]:
test_report = classification_report(y_valid, dt.predict(X_valid), output_dict=True)
pd.DataFrame(test_report).T["recall"]

0               0.941378
1               0.381215
accuracy        0.824678
macro avg       0.661297
weighted avg    0.824678
Name: recall, dtype: float64

In [None]:
# Plot feature importance
dt_feat = pd.DataFrame(dt.feature_importances_, index=X_train.columns, columns=['feat_importance'])
dt_feat.sort_values('feat_importance').tail(8).plot.barh()
plt.show()

In [21]:
# Random forest, with SMOTE

model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42
)

pipeline = Pipeline([
#    ('scaler', StandardScaler()),
#    ('imputer', SimpleImputer(strategy='mean')),
    ('SMOTEENN', SMOTE(random_state=42)),
    ('classifier', model)
])

pipeline.fit(X_train, y_train)

train_accuracy = accuracy_score(y_train, pipeline.predict(X_train))
test_accuracy = accuracy_score(y_valid, pipeline.predict(X_valid))

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

test_report = classification_report(y_valid, pipeline.predict(X_valid), output_dict=True)
print(pd.DataFrame(test_report).T["recall"])

Train Accuracy: 0.9999769797421731
Test Accuracy: 0.8408839779005525
0               0.882873
1               0.681326
accuracy        0.840884
macro avg       0.782099
weighted avg    0.840884
Name: recall, dtype: float64


In [22]:
# Gradient Boosting, with SMOTE

model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.7,
    min_samples_leaf=1,
    random_state=42
)

pipeline = Pipeline([
#    ('scaler', StandardScaler()),
#    ('imputer', SimpleImputer(strategy='mean')),
    ('SMOTEENN', SMOTE(random_state=42)),
    ('classifier', model)
])

pipeline.fit(X_train, y_train)

train_accuracy = accuracy_score(y_train, pipeline.predict(X_train))
test_accuracy = accuracy_score(y_valid, pipeline.predict(X_valid))

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

test_report = classification_report(y_valid, pipeline.predict(X_valid), output_dict=True)
print(pd.DataFrame(test_report).T["recall"])

Train Accuracy: 0.8229166666666666
Test Accuracy: 0.8151012891344384
0               0.849666
1               0.683757
accuracy        0.815101
macro avg       0.766711
weighted avg    0.815101
Name: recall, dtype: float64


### SAS Models

* Decision Tree

In [23]:
# SAS Decision Tree, with SMOTE

model = ForestClassifier(
    random_state=70
)

pipeline2 = Pipeline([
#    ('scaler', StandardScaler(with_mean=True)),
#    ('imputer', SimpleImputer(strategy='mean')),
    ('SMOTEENN', SMOTE(random_state=42)),
    ('classifier', model)
])

pipeline2.fit(X_train, y_train)

train_accuracy = accuracy_score(y_train, pipeline2.predict(X_train))
test_accuracy = accuracy_score(y_valid, pipeline2.predict(X_valid))

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

test_report = classification_report(y_valid, pipeline2.predict(X_valid), output_dict=True)
pd.DataFrame(test_report).T["recall"]

Train Accuracy: 0.7982504604051566
Test Accuracy: 0.7810313075506445


0               0.783193
1               0.772818
accuracy        0.781031
macro avg       0.778005
weighted avg    0.781031
Name: recall, dtype: float64