In [1]:
import os

import pandas as pd
import numpy as np

# store elements as dictionary keys and their counts as dictionary values
from collections import Counter

# scikit-learn
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline

# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report

# Function for creating model pipelines - sklearn
from sklearn.pipeline import make_pipeline

# Function for creating model pipelines - imblearn
from imblearn.pipeline import make_pipeline as imbl_pipe

# Over-sampling using SMOTE
from imblearn.over_sampling import SMOTE


## Load Analytical Base Table

In [2]:
df = pd.read_csv(os.path.join("../Resources", "analytical_base_table.csv"))
print(f"Dataframe dimensions: {df.shape}")
df.head()

Dataframe dimensions: (10000, 9)


Unnamed: 0,Geography,Gender,Age,Tenure,Balance,NumOfProducts,IsActiveMember,EstimatedSalary,Exited
0,France,Female,42,2,0.0,1,1,101348.88,1
1,Spain,Female,41,1,83807.86,1,1,112542.58,0
2,France,Female,42,8,159660.8,3,0,113931.57,1
3,France,Female,39,1,0.0,2,0,93826.63,0
4,Spain,Female,43,2,125510.82,1,1,79084.1,0


### Separate dataframe into separate object

In [3]:
# Object for target variable
y = df.Exited

# object for input features
X = df.drop(['Exited'], axis=1)

# display shapes of X and y
print(X.shape, y.shape)

(10000, 8) (10000,)


In [4]:
# List numerical features
num_columns = X.select_dtypes(include='number').columns.tolist()
num_columns

['Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'IsActiveMember',
 'EstimatedSalary']

In [5]:
# List categorical features
cat_columns = X.select_dtypes(include='object').columns.tolist()
cat_columns

['Geography', 'Gender']

In [6]:
def class_count(a):
    counter=Counter(a)
    kv=[list(counter.keys()),list(counter.values())]
    dff = pd.DataFrame(np.array(kv).T, columns=['Exited','Count'])
    dff['Count'] = dff['Count'].astype('int64')
    dff['%'] = round(dff['Count'] / a.shape[0] * 100, 2)
    return dff.sort_values('Count',ascending=False)

In [7]:
class_count(y)

Unnamed: 0,Exited,Count,%
1,0,7963,79.63
0,1,2037,20.37


## Create a Train Test Split

In [8]:
random_state = 8

# Split X and y into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=random_state,
                                                   stratify=df.Exited)

# Print number of observations in X_train, X_test, y_train, and y_test
print(len(X_train), len(X_test), len(y_train), len(y_test))

7000 3000 7000 3000


In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 6347 to 2994
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Geography        7000 non-null   object 
 1   Gender           7000 non-null   object 
 2   Age              7000 non-null   int64  
 3   Tenure           7000 non-null   int64  
 4   Balance          7000 non-null   float64
 5   NumOfProducts    7000 non-null   int64  
 6   IsActiveMember   7000 non-null   int64  
 7   EstimatedSalary  7000 non-null   float64
dtypes: float64(2), int64(4), object(2)
memory usage: 492.2+ KB


## Pre-processing Pipeline

### Scale numerical data and encode categorical data
Construct a pre-processing pipeline from the given transformers: MinMaxScaler and OneHotEncoder

Create lists of indexes from the list of column names

Need to be numeric not string to specify columns name in column transformer

In [10]:
num_features = [] 

for i in num_columns:
    location = X.columns.get_loc(i)
    num_features.append(location)
print(num_features)  

[2, 3, 4, 5, 6, 7]


In [11]:
cat_features = []

for i in cat_columns:
    location = X.columns.get_loc(i)
    cat_features.append(location)
print(cat_features)  

[0, 1]


In [12]:
# Define column transformer
# Need to be numeric not string to specify columns name 
preprocess = make_column_transformer(
    (MinMaxScaler(), num_features),
    (OneHotEncoder(), cat_features)
)
preprocess

## Build Model Pipeline with SMOTE

* We are going to use the Pipeline from the imblearn package in place of scikit-learn Pipeline.

* It takes care automatically to re-sample when called fit() on the pipeline, and does not re-sample test data (when called transform() or predict()).

In [13]:
# Import classifier
from sklearn.svm import SVC 

# Define model with pipeline
model = imbl_pipe(preprocess,
                  SMOTE(sampling_strategy='auto', random_state=random_state),
                  SVC(random_state=random_state))

model

In [14]:
# Create the GridSearchCV model
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV

param_grid = {'svc__kernel' : ['linear', 'rbf', 'poly', 'sigmoid'],
              'svc__C': [0.0005,0.001, 0.01, 0.1, 0.5],
              'svc__gamma': [5, 1, 0.1, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3, cv= 5, n_jobs=4, scoring='accuracy')

In [15]:
X_train = X_train.values
X_test = X_test.values

In [16]:
# Train the model with GridSearch
grid.fit(X_train, y_train)


Fitting 5 folds for each of 80 candidates, totalling 400 fits


In [17]:
print(grid.best_params_)

{'svc__C': 0.5, 'svc__gamma': 5, 'svc__kernel': 'poly'}


In [18]:
 # List the best score
print(grid.best_score_)

0.7859999999999999


In [19]:
print(f"Training Data Score: {grid.score(X_train, y_train)}")
print(f"Testing Data Score: {grid.score(X_test, y_test)}")

Training Data Score: 0.801
Testing Data Score: 0.7913333333333333


In [20]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)
predictions

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, predictions)
print(cm)

[[1920  469]
 [ 157  454]]


In [22]:
cm = np.around(cm / cm.sum(axis=1)[:, np.newaxis], 2)
print(cm)

[[0.8  0.2 ]
 [0.26 0.74]]


In [23]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.80      0.86      2389
           1       0.49      0.74      0.59       611

    accuracy                           0.79      3000
   macro avg       0.71      0.77      0.73      3000
weighted avg       0.84      0.79      0.81      3000



In [24]:
predictions

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
pred = grid.predict(X_test[:1])

In [26]:
print(f"Predicted classes: {pred}")
print(f"Actual Labels: {list(y_test[:1])}")

Predicted classes: [1]
Actual Labels: [1]


## Save the Model

In [27]:
import joblib

# We are saving our grid model
filename = '../models/SVM_model.sav'
joblib.dump(grid, filename)

['../models/SVM_model.sav']

## Loading the Model

In [28]:
# load the model
svm_model = joblib.load(filename)
print(svm_model.score(X_test, y_test))

0.7913333333333333


### Predict class for new data

In [29]:
# Let's use the first X_test record as new data
X_test[:1]

array([['France', 'Male', 50, 4, 165438.26, 1, 0, 120770.75]],
      dtype=object)

In [30]:
pred_new = grid.predict(X_test[:1])

In [31]:
print(f"Predicted classes: {pred_new}")
print(f"Actual Labels: {list(y_test[:1])}")

Predicted classes: [1]
Actual Labels: [1]
