# SVM  analysis of the riding mover.csv data.


## 1. Import the Libraries

Import modules

In [1]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
# set random seed to ensure that results are repeatable
np.random.seed(1)

## 2. Load data

Load data (it's already cleaned and preprocessed)

In [2]:
# load data

riding=pd.read_csv("RidingMowers.csv")

## 3. Conduct initial exploration of the data

In [3]:
# look at the data

riding.head(3) 

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner


In [4]:
# Summary of the data

riding.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Income     24 non-null     float64
 1   Lot_Size   24 non-null     float64
 2   Ownership  24 non-null     object 
dtypes: float64(2), object(1)
memory usage: 704.0+ bytes


In [5]:
# look at the data
riding.tail(5)

Unnamed: 0,Income,Lot_Size,Ownership
19,66.0,18.4,Nonowner
20,47.4,16.4,Nonowner
21,33.0,18.8,Nonowner
22,51.0,14.0,Nonowner
23,63.0,14.8,Nonowner


In [6]:
# statistical summary 
riding.describe()

Unnamed: 0,Income,Lot_Size
count,24.0,24.0
mean,68.4375,18.95
std,19.793144,2.428275
min,33.0,14.0
25%,52.35,17.5
50%,64.8,19.0
75%,83.1,20.8
max,110.1,23.6


In [7]:
# Check the missing values by summing the total na's for each variable
riding.isna().sum()

Income       0
Lot_Size     0
Ownership    0
dtype: int64

In [8]:
riding['Ownership'].unique

<bound method Series.unique of 0        Owner
1        Owner
2        Owner
3        Owner
4        Owner
5        Owner
6        Owner
7        Owner
8        Owner
9        Owner
10       Owner
11       Owner
12    Nonowner
13    Nonowner
14    Nonowner
15    Nonowner
16    Nonowner
17    Nonowner
18    Nonowner
19    Nonowner
20    Nonowner
21    Nonowner
22    Nonowner
23    Nonowner
Name: Ownership, dtype: object>

In [9]:
# Encoding the categorical variable using one hot encoding

dummies_df = pd.get_dummies(riding['Ownership'], prefix='Ownership', drop_first=True)
riding = riding.join(dummies_df)
riding.drop('Ownership', axis=1, inplace = True)

In [10]:
riding.head(4)

Unnamed: 0,Income,Lot_Size,Ownership_Owner
0,60.0,18.4,1
1,85.5,16.8,1
2,64.8,21.6,1
3,61.5,20.8,1


In [11]:
riding['Ownership_Owner'].unique

<bound method Series.unique of 0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
Name: Ownership_Owner, dtype: uint8>

In [12]:
riding.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Income           24 non-null     float64
 1   Lot_Size         24 non-null     float64
 2   Ownership_Owner  24 non-null     uint8  
dtypes: float64(2), uint8(1)
memory usage: 536.0 bytes


## 4.  Splitting the data using training and testing 

First, let's create a dataframe to load the model performance metrics into.

In [13]:
Indpnt= riding.loc[:,['Income','Lot_Size']]
Dpnt= riding.loc[:,['Ownership_Owner']]

In [14]:
Indpnt_train,Indpnt_test,Dpnt_train,Dpnt_test = train_test_split(Indpnt,Dpnt,test_size=1/3)

In [15]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

### 5 Fit a SVM classification model using linear kernal

In [16]:
svm_lin_model = SVC(kernel="linear", probability=True)
svm_lin= svm_lin_model.fit(Indpnt, np.ravel(Dpnt))

In [17]:
model_preds = svm_lin_model.predict(Indpnt)
c_matrix = confusion_matrix(Dpnt, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"linear svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

### 6 Fit a SVM classification model using rbf kernal

In [18]:
svm_rbf_model = SVC(kernel="rbf", C=10, gamma='scale',probability=True)
svm_rbf = svm_rbf_model.fit(Indpnt_train, np.ravel(Dpnt_train))

In [19]:
model_preds = svm_rbf_model.predict(Indpnt)
c_matrix = confusion_matrix(Dpnt, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"rbf svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

### 7 Fit a SVM classification model using polynomial kernal

In [20]:
svm_poly_model = SVC(kernel="poly", degree=3, coef0=1.0, C=10,probability=True)
svm_poly= svm_poly_model.fit(Indpnt_train, np.ravel(Dpnt_train))

In [21]:
model_preds = svm_poly_model.predict(Indpnt)
c_matrix = confusion_matrix(Dpnt, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"poly svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## Prediction Probability for all 3 models

In [22]:
## Linear Kernel

riding["predicted"]=svm_lin.predict(Indpnt)
riding

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted
0,60.0,18.4,1,0
1,85.5,16.8,1,0
2,64.8,21.6,1,1
3,61.5,20.8,1,1
4,87.0,23.6,1,1
5,110.1,19.2,1,1
6,108.0,17.6,1,1
7,82.8,22.4,1,1
8,69.0,20.0,1,1
9,93.0,20.8,1,1


In [23]:
riding['pred_prob'] = svm_lin.predict_proba(Indpnt)[:,1]
riding

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.361419
1,85.5,16.8,1,0,0.482763
2,64.8,21.6,1,1,0.723801
3,61.5,20.8,1,1,0.619957
4,87.0,23.6,1,1,0.943088
5,110.1,19.2,1,1,0.886001
6,108.0,17.6,1,1,0.784569
7,82.8,22.4,1,1,0.892847
8,69.0,20.0,1,1,0.623345
9,93.0,20.8,1,1,0.872913


In [24]:
## RBF Kernel

riding["predicted"]=svm_rbf.predict(Indpnt)
riding

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.361419
1,85.5,16.8,1,1,0.482763
2,64.8,21.6,1,1,0.723801
3,61.5,20.8,1,1,0.619957
4,87.0,23.6,1,1,0.943088
5,110.1,19.2,1,1,0.886001
6,108.0,17.6,1,1,0.784569
7,82.8,22.4,1,1,0.892847
8,69.0,20.0,1,1,0.623345
9,93.0,20.8,1,1,0.872913


In [25]:
riding['pred_prob'] = svm_rbf.predict_proba(Indpnt)[:,1]
riding

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.382182
1,85.5,16.8,1,1,0.738629
2,64.8,21.6,1,1,0.567431
3,61.5,20.8,1,1,0.476066
4,87.0,23.6,1,1,0.83206
5,110.1,19.2,1,1,0.759051
6,108.0,17.6,1,1,0.75904
7,82.8,22.4,1,1,0.804546
8,69.0,20.0,1,1,0.60839
9,93.0,20.8,1,1,0.813595


In [26]:
## Polynomial Kernel

riding["predicted"]=svm_poly.predict(Indpnt)
riding

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.382182
1,85.5,16.8,1,1,0.738629
2,64.8,21.6,1,1,0.567431
3,61.5,20.8,1,1,0.476066
4,87.0,23.6,1,1,0.83206
5,110.1,19.2,1,1,0.759051
6,108.0,17.6,1,1,0.75904
7,82.8,22.4,1,1,0.804546
8,69.0,20.0,1,1,0.60839
9,93.0,20.8,1,1,0.813595


In [27]:
riding['pred_prob'] = svm_poly.predict_proba(Indpnt)[:,1]
riding

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.558405
1,85.5,16.8,1,1,0.56233
2,64.8,21.6,1,1,0.565553
3,61.5,20.8,1,1,0.562769
4,87.0,23.6,1,1,0.578904
5,110.1,19.2,1,1,0.570517
6,108.0,17.6,1,1,0.565548
7,82.8,22.4,1,1,0.574482
8,69.0,20.0,1,1,0.56444
9,93.0,20.8,1,1,0.573509


## SUMMARY

In [28]:
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,0.791667,0.769231,0.833333,0.8
0,rbf svm,0.75,0.714286,0.833333,0.769231
0,poly svm,0.791667,0.769231,0.833333,0.8


In [29]:
performance.sort_values(by=['Accuracy'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.714286,0.833333,0.769231
0,linear svm,0.791667,0.769231,0.833333,0.8
0,poly svm,0.791667,0.769231,0.833333,0.8


In [30]:
performance.sort_values(by=['Precision'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.714286,0.833333,0.769231
0,linear svm,0.791667,0.769231,0.833333,0.8
0,poly svm,0.791667,0.769231,0.833333,0.8


In [31]:
performance.sort_values(by=['Recall'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,0.791667,0.769231,0.833333,0.8
0,rbf svm,0.75,0.714286,0.833333,0.769231
0,poly svm,0.791667,0.769231,0.833333,0.8


In [32]:
performance.sort_values(by=['F1'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.714286,0.833333,0.769231
0,linear svm,0.791667,0.769231,0.833333,0.8
0,poly svm,0.791667,0.769231,0.833333,0.8


## Inference

Based on the above 3 models, I assume that the "Poly SVM" is performing best as it is having higher values.
So, I'm savig this "Poly SVM" model to the pickel file to generate the text based interface.

In [33]:
import pickle

# save model
pickle.dump(svm_poly, open("svm_model.pkl", "wb"))

# If you wish to load this model later, simply use pickle.load method
#loaded_model = pickle.load(open('logistic_model_example01.pkl', "rb"))