# SVM Classification

In [1]:
#import packages
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV files read later in this notebook use relative paths, so they are
# resolved against the current working directory, not this mount point — confirm the data location.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### SVM Boundary Plotting Function

In [3]:
# CODE SOURCE IS DIRECTLY FROM DOCUMENTATION
# https://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane.html


def plot_svm_boundary(model, X, Y, figsize=(8,6),title="SVM Boundary Plot"):
    """Plot a fitted two-feature SVM: data points, decision regions, contours and support vectors.

    Parameters
    ----------
    model : fitted estimator exposing ``decision_function`` and ``support_vectors_``
        (e.g. ``sklearn.svm.SVC``). ``decision_function`` must return one value per
        sample (binary problem), otherwise the reshape onto the grid fails.
    X : pandas.DataFrame
        Feature table; only the first two columns are drawn (accessed via ``.iloc``).
    Y : array-like
        Per-sample values used to colour the scatter points.
    figsize : tuple of (width, height), default (8, 6)
        Figure size in inches.
    title : str
        Title shown above the plot.

    Returns
    -------
    None. The figure is rendered with ``plt.show()``.
    """
    # Honour the caller's figsize (it used to be hard-coded to (8, 6) and silently ignored).
    plt.figure(figsize=figsize)
    plt.clf()

    # Data scatter plot; drawn first so the axis limits below reflect the data range.
    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=Y, zorder=10, cmap=plt.cm.Paired, edgecolors='k')

    ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Build a 100x100 grid spanning the current axis limits to evaluate the model on.
    xx = np.linspace(xlim[0], xlim[1], 100)
    yy = np.linspace(ylim[0], ylim[1], 100)
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T

    # If the model was fitted on a DataFrame, hand it a DataFrame with the same column
    # names so scikit-learn does not warn about missing feature names.
    if hasattr(model, "feature_names_in_"):
        xy = pd.DataFrame(xy, columns=model.feature_names_in_)
    Z = model.decision_function(xy).reshape(XX.shape)

    # Shade the two sides of the boundary and draw contour lines of the decision function.
    # NOTE: the dashed lines sit at decision values of -0.5 / +0.5, not at the SVM margins (-1 / +1).
    ax.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired,shading='auto')
    ax.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-.5, 0, .5])

    # Circle the support vectors.
    ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s = 100,
                facecolors='none', zorder=10, edgecolors='k', linewidth=1)

    # Restore the limits captured above; the extra artists may have autoscaled the axes.
    plt.xlim(xlim[0], xlim[1])
    plt.ylim(ylim[0], ylim[1])
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.title(title)
    plt.show()

## Linear Data Classification (1.5 point)

In [None]:
## Read the documentation of the SVM class at
## https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
## Describe the role of
## C: It is the strength of the penalty on margin violations; it acts as an inverse regularization parameter. If C is high, the margin width decreases, and vice versa.
##
## kernel: It specifies the type of kernel used. If none is given, it uses "rbf" by default. If a callable is given, it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples, n_samples).
##
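## gamma: It is the kernel coefficient for the "rbf", "poly" and "sigmoid" kernels. A larger gamma makes each training point's influence more local (a more flexible boundary); the default is "scale".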
##
## degree: It specifies the degree of the polynomial kernel function ("poly"). It must be non-negative and is ignored by all other kernels.

## load linear dataset
linear_train_data = pd.read_csv("data_linear_train.csv")
linear_test_data = pd.read_csv("data_linear_test.csv")

## preview the training data (first rows only, instead of dumping the whole frame)
print(linear_train_data.head())

## plot training data, coloured by class label
xpoints = np.array(linear_train_data["x1"])
ypoints = np.array(linear_train_data["x2"])

plt.scatter(xpoints, ypoints, c=linear_train_data["y"])
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel("x1")
plt.ylabel("x2")
plt.title("Linear training data")
plt.show()


In [None]:
## Linear-kernel SVC with C=0.1, trained on the two features of the linear training set
linear_features = linear_train_data[["x1", "x2"]]
linear_labels = linear_train_data["y"]

model = SVC(C=0.1, kernel="linear")
## the fit call stays last so the fitted estimator is shown as the cell output
model.fit(linear_features, linear_labels)

In [None]:
## Draw the decision boundary of the linear model over its training points,
## using the plot_svm_boundary helper defined earlier in this notebook
plot_svm_boundary(
    model,
    X=linear_train_data[["x1", "x2"]],
    Y=linear_train_data["y"],
    figsize=(8, 6),
    title="SVM Boundary Plot",
)

In [None]:
## print the confusion matrix, accuracy scores for training data.
from sklearn import datasets, svm
from sklearn.metrics import ConfusionMatrixDisplay , accuracy_score

linear_train_X = linear_train_data[["x1", "x2"]]
# Pass the labels as a 1-D Series rather than a one-column DataFrame.
linear_train_y = linear_train_data["y"]

disp = ConfusionMatrixDisplay.from_estimator(
    model,
    linear_train_X,
    linear_train_y,
    cmap=plt.cm.Blues,
)

# The cell is meant to report accuracy as well; accuracy_score was imported but never called.
linear_train_accuracy = accuracy_score(linear_train_y, model.predict(linear_train_X))
print(f"Training accuracy: {linear_train_accuracy:.4f}")


In [None]:
## Run the linear model on the (x1, x2) columns of the linear test set
linear_test_features = linear_test_data[["x1", "x2"]]
pred = model.predict(linear_test_features)
pred

In [None]:
## saving linear test result dataframe to csv file
# Attach the predicted labels as column "y" ...
linear_test_data["y"] = pred
# ... and actually write the file: the cell previously only displayed the frame.
# index=False keeps the pandas row index out of the CSV.
linear_test_data.to_csv("data_linear_test_predictions.csv", index=False)
linear_test_data

# Non-Linear SVM Classifier

In [None]:
## loading non-linear dataset
non_linear_train_data = pd.read_csv("data_nonlinear_train.csv")

## plotting training data
xpoints = np.array(non_linear_train_data["x1"])
ypoints = np.array(non_linear_train_data["x2"])

# Colour the points with THIS dataset's labels; the cell previously used
# linear_train_data["y"], i.e. the labels of a different dataset.
plt.scatter(xpoints, ypoints, c=non_linear_train_data["y"])
plt.xlabel("x1")
plt.ylabel("x2")
plt.title("Non-linear training data")
plt.show()

# Preview the first rows instead of dumping the whole frame.
non_linear_train_data.head()

### Linear Kernel

In [None]:
## Linear-kernel SVC (C=0.2) trained on the non-linear training set
nonlinear_features = non_linear_train_data[["x1", "x2"]]
nonlinear_labels = non_linear_train_data["y"]

model_nonlinear = SVC(C=0.2, kernel="linear")
## the fit call stays last so the fitted estimator is shown as the cell output
model_nonlinear.fit(nonlinear_features, nonlinear_labels)

In [None]:
## Decision boundary of the linear-kernel model over the non-linear training points
plot_svm_boundary(
    model_nonlinear,
    X=non_linear_train_data[["x1", "x2"]],
    Y=non_linear_train_data["y"],
    figsize=(8, 6),
    title="SVM Boundary Plot for nonlinear train dataset",
)

In [None]:
## Training-set evaluation of the linear-kernel model: confusion matrix plot + text report
from sklearn.metrics import classification_report

target_names = ['class 1', 'class 2']

## confusion matrix, computed by the display helper straight from the estimator
disp = ConfusionMatrixDisplay.from_estimator(
    model_nonlinear,
    non_linear_train_data[["x1", "x2"]],
    non_linear_train_data[["y"]],
    cmap=plt.cm.Blues,
)

## per-class precision / recall / f1 and overall accuracy
non_linear_pred_nonlinear_model = model_nonlinear.predict(non_linear_train_data[["x1", "x2"]])
report_text = classification_report(
    non_linear_train_data["y"],
    non_linear_pred_nonlinear_model,
    target_names=target_names,
)
print(report_text)


In [114]:
## Read the non-linear test set from its CSV file
nonlinear_test_csv = "data_nonlinear_test.csv"
non_linear_test_data = pd.read_csv(nonlinear_test_csv)


In [115]:
## Label the non-linear test points with the linear-kernel model
nonlinear_test_features = non_linear_test_data[["x1", "x2"]]
non_linear_test_pred = model_nonlinear.predict(nonlinear_test_features)


### Polynomial Kernel

In [None]:
## SVC Class with polynomial kernel, degree = 2 and C = 0.2
# kernel must be "poly": `degree` is ignored by every other kernel, so the previous
# kernel="linear" silently trained a second linear model.
model_dg2 = SVC(kernel="poly", C=0.2, degree=2)
# Train on the training split (the cell previously fitted on non_linear_test_data).
model_dg2.fit(non_linear_train_data[["x1", "x2"]], non_linear_train_data["y"])

In [None]:
## Decision boundary of model_dg2 over the non-linear training points
plot_svm_boundary(
    model_dg2,
    X=non_linear_train_data[["x1", "x2"]],
    Y=non_linear_train_data["y"],
    figsize=(8, 6),
    title="SVM Boundary Plot for nonlinear train dataset",
)

In [None]:
## Training-set evaluation of model_dg2: confusion matrix plot + text report
target_names = ['class 1', 'class 2']

## confusion matrix, computed by the display helper straight from the estimator
disp = ConfusionMatrixDisplay.from_estimator(
    model_dg2,
    non_linear_train_data[["x1", "x2"]],
    non_linear_train_data[["y"]],
    cmap=plt.cm.Blues,
)

## per-class precision / recall / f1 and overall accuracy
non_linear_train_pred_model_dg2 = model_dg2.predict(non_linear_train_data[["x1", "x2"]])
report_text = classification_report(
    non_linear_train_data["y"],
    non_linear_train_pred_model_dg2,
    target_names=target_names,
)
print(report_text)


In [121]:
## Label the non-linear test points with model_dg2
nonlinear_test_features = non_linear_test_data[["x1", "x2"]]
non_linear_test_pred_model_dg2 = model_dg2.predict(nonlinear_test_features)

### RBF Kernel

In [None]:
## SVC Class with RBF kernel and C=0.2
# Pass C explicitly: omitting it uses scikit-learn's default C=1.0, not the C=0.2
# this cell is specified to use (and that the other kernels in this section use).
svc_rbf = SVC(kernel="rbf", C=0.2)
svc_rbf.fit(non_linear_train_data[["x1", "x2"]], non_linear_train_data["y"])

In [None]:
## Decision boundary of the RBF model over the non-linear training points
plot_svm_boundary(
    svc_rbf,
    X=non_linear_train_data[["x1", "x2"]],
    Y=non_linear_train_data["y"],
    figsize=(8, 6),
    title="SVM Boundary Plot for nonlinear train dataset",
)


In [None]:
## Training-set evaluation of the RBF model: confusion matrix plot + text report
target_names = ['class 1', 'class 2']

## confusion matrix, computed by the display helper straight from the estimator
disp = ConfusionMatrixDisplay.from_estimator(
    svc_rbf,
    non_linear_train_data[["x1", "x2"]],
    non_linear_train_data[["y"]],
    cmap=plt.cm.Blues,
)

## per-class precision / recall / f1 and overall accuracy
non_linear_train_pred_svc_rbf = svc_rbf.predict(non_linear_train_data[["x1", "x2"]])
report_text = classification_report(
    non_linear_train_data["y"],
    non_linear_train_pred_svc_rbf,
    target_names=target_names,
)
print(report_text)

In [123]:
## Label the non-linear test points with the RBF model
nonlinear_test_features = non_linear_test_data[["x1", "x2"]]
non_linear_test_svc_rbf_pred = svc_rbf.predict(nonlinear_test_features)

In [None]:
## saving non-linear test result dataframe to csv file
# Attach the RBF model's predicted labels as column "y" ...
non_linear_test_data["y"] = non_linear_test_svc_rbf_pred
# ... and actually write the file: the cell previously only displayed the frame.
# index=False keeps the pandas row index out of the CSV.
non_linear_test_data.to_csv("data_nonlinear_test_predictions.csv", index=False)
non_linear_test_data

## Write your observations with comparisons. (0.5 point)

## Hyper-Parameter Tuning (1.5 point)

In [None]:
## Read the wine dataset from CSV and display it
wine_csv = "wine_fraud.csv"
wine = pd.read_csv(wine_csv)
wine

In [154]:
# Separate the "quality" label column from the remaining feature columns
indep = wine.drop(columns=["quality"])
target = wine["quality"]

In [144]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable
split_parts = train_test_split(
    indep,
    target,
    test_size=0.33,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

In [None]:
## data pre-processing
from sklearn.preprocessing import LabelEncoder , StandardScaler

## apply label encoding to the categorical columns
## https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
# Target: one fit_transform is enough (the encoder used to be fitted four times in a row).
# le.classes_ keeps the mapping from the integer codes back to the original labels.
le = LabelEncoder()
result = le.fit_transform(target)
# Kept for backward compatibility: the 'quality' column of `wine` is overwritten with its codes.
wine["quality"] = result

# Features: encode every non-numeric column on a copy, so `indep` itself is not mutated
# and StandardScaler below only ever sees numbers. No-op if all features are numeric.
indep_encoded = indep.copy()
for column in indep_encoded.select_dtypes(exclude="number").columns:
    indep_encoded[column] = LabelEncoder().fit_transform(indep_encoded[column])

## apply feature standardization
## https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# Fit AND apply the scaler; previously it was only fitted and printed, so nothing was standardized.
scaler = StandardScaler()
scaler.fit(indep_encoded)
indep_scaled = pd.DataFrame(
    scaler.transform(indep_encoded),
    columns=indep_encoded.columns,
    index=indep_encoded.index,
)
# NOTE(review): x_train / x_test created in the earlier split cell come from the raw `indep`
# and are NOT affected by this cell; split `indep_scaled` (with `result` as labels) instead.
# To avoid test-set leakage, prefer fitting the scaler on the training rows only.
indep_scaled.head()

In [None]:
## train-test split

In [None]:
## define param_grid

In [None]:
## define Grid Search class

In [None]:
## print best score

In [None]:
## print best params


In [None]:
## train the final model with best parameters

In [None]:
## classification report for test data