In [1]:
# Importing Necessary Libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline

Read the dataset

In [6]:
# Read the Universal Bank customer records into a DataFrame and preview it
df = pd.read_csv(filepath_or_buffer='../dataset/lab5/UniversalBank.csv')
display(df)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0


In [7]:
# Count the missing entries in every column
df.isna().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

The `ID` and `ZIP Code` columns are not useful for the prediction, so I will drop them.

In [9]:
# Remove the identifier and ZIP Code columns; the result is a new DataFrame
df1 = df.drop(columns=["ID", "ZIP Code"])
display(df1)

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,1.9,3,0,0,0,0,1,0
4996,30,4,15,4,0.4,1,85,0,0,0,1,0
4997,63,39,24,2,0.3,3,0,0,0,0,0,0
4998,65,40,49,3,0.5,2,0,0,0,0,1,0


Split the rows by class. The response variable is `Personal Loan`; the remaining columns are the features.

In [10]:
# Partition the rows by the value of the target column
zero_class = df1.loc[df1["Personal Loan"].eq(0)]
one_class = df1.loc[df1["Personal Loan"].eq(1)]

# (rows, columns) of each class subset, one per line
print(zero_class.shape, one_class.shape, sep="\n")

(4520, 12)
(480, 12)


In [11]:
# Hold out 20% of the rows as a test set (fixed seed for a repeatable split)
from sklearn.model_selection import train_test_split

# Target is the "Personal Loan" column; every other column is a feature
y = df1["Personal Loan"]
X = df1.drop(columns="Personal Loan")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four resulting pieces, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(4000, 11)
(1000, 11)
(4000,)
(1000,)


## **Support Vector Machine (SVM)**

Support Vector Machine (SVM) is a supervised machine learning algorithm that can be used for both classification and regression problems, although it is mostly used for classification. In the SVM algorithm, we plot each data item as a point in n-dimensional space (where n is the number of features), with the value of each feature being the value of a particular coordinate. We then perform classification by finding the hyperplane that best separates the two classes.

In this notebook, I will use the SVM algorithm to predict the `Personal Loan` column. Three SVM kernels will be used: linear, polynomial, and radial basis function (RBF).

In [12]:
# Importing the SVM
from sklearn.svm import SVC

### **1. Linear SVM**

In [14]:
# Evaluation helpers (also used by the later SVM cells)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Build a linear-kernel support vector classifier and train it on the training split
svc = SVC(kernel="linear").fit(X_train, y_train)

# Predict labels for the held-out test features
y_pred = svc.predict(X_test)

# Fraction of test rows classified correctly
print("Accuracy Score: ", accuracy_score(y_true=y_test, y_pred=y_pred))

# Counts of true vs. predicted labels
print("Confusion Matrix: \n", confusion_matrix(y_true=y_test, y_pred=y_pred))

# Per-class precision, recall and F1
print("Classification Report: \n", classification_report(y_true=y_test, y_pred=y_pred))

Accuracy Score:  0.952
Confusion Matrix: 
 [[889   6]
 [ 42  63]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       895
           1       0.91      0.60      0.72       105

    accuracy                           0.95      1000
   macro avg       0.93      0.80      0.85      1000
weighted avg       0.95      0.95      0.95      1000



### **2. Polynomial SVM**

In [15]:
# Build a polynomial-kernel support vector classifier and train it on the training split
svc = SVC(kernel="poly").fit(X_train, y_train)

# Predict labels for the held-out test features
y_pred = svc.predict(X_test)

# Fraction of test rows classified correctly
print("Accuracy Score: ", accuracy_score(y_true=y_test, y_pred=y_pred))

# Counts of true vs. predicted labels
print("Confusion Matrix: \n", confusion_matrix(y_true=y_test, y_pred=y_pred))

# Per-class precision, recall and F1
print("Classification Report: \n", classification_report(y_true=y_test, y_pred=y_pred))

Accuracy Score:  0.904
Confusion Matrix: 
 [[895   0]
 [ 96   9]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.90      1.00      0.95       895
           1       1.00      0.09      0.16       105

    accuracy                           0.90      1000
   macro avg       0.95      0.54      0.55      1000
weighted avg       0.91      0.90      0.87      1000



### **3. Radial Basis Function (RBF) SVM**

In [17]:
# Build an RBF-kernel support vector classifier and train it on the training split
svc = SVC(kernel="rbf").fit(X_train, y_train)

# Predict labels for the held-out test features
y_pred = svc.predict(X_test)

# Fraction of test rows classified correctly
print("Accuracy Score: ", accuracy_score(y_true=y_test, y_pred=y_pred))

# Counts of true vs. predicted labels
print("Confusion Matrix: \n", confusion_matrix(y_true=y_test, y_pred=y_pred))

# Per-class precision, recall and F1
print("Classification Report: \n", classification_report(y_true=y_test, y_pred=y_pred))

Accuracy Score:  0.9
Confusion Matrix: 
 [[892   3]
 [ 97   8]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.90      1.00      0.95       895
           1       0.73      0.08      0.14       105

    accuracy                           0.90      1000
   macro avg       0.81      0.54      0.54      1000
weighted avg       0.88      0.90      0.86      1000



HW: Analyze the dataset again, consider scaling the data (`StandardScaler` or `MinMaxScaler`), and try to improve the model performance.