In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split as tts
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [4]:
def initialize_dataset(input_file: str) -> tuple:

    dataframe = pd.read_csv(input_file)
    cols = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
    pre = ['gender', 'group', 'parent_ed', 'lunch', 'prep_course']
    dataframe  = pd.get_dummies(dataframe , columns=cols, prefix=pre, dtype=int)

    # Convert score columns to integers
    dataframe[['math score', 'reading score', 'writing score']] = dataframe[['math score', 'reading score', 'writing score']].astype(int)

    # Create a new column 'passed' based on your condition
    dataframe['passed'] = dataframe.apply(lambda row: 1 if (row['math score'] > 60) & (row['reading score'] > 60) & (row['writing score'] > 60) else -1, axis=1)
    dataframe.drop(['math score', 'reading score', 'writing score'], axis=1, inplace=True)

    x_train, x_test, y_train, y_test = tts(dataframe.drop(['passed'], axis=1), dataframe['passed'], test_size=0.2, random_state=42)

    return (x_train.to_numpy(), y_train.to_numpy(), x_test.to_numpy(), y_test.to_numpy())

In [5]:
def linear_kernel(x1 : np.ndarray, x2 : np.ndarray) -> np.ndarray:
    return np.dot(x1, x2.T)

In [6]:
def rbf_kernel(x1 : np.ndarray, x2 : np.ndarray) -> np.ndarray:
    sigma = 1.0
    return np.exp((np.linalg.norm(x1 - x2) ** 2) / (-2 * sigma ** 2))

In [15]:
class SVM:
    def __init__(self, kernel, C = 0.5):
        self.w = 0
        self.b = 0
        self.kernel = kernel
        self.C = C

    # Hinge Loss Function / Calculation
    def hingeloss(self, w, b, x, y):

        # Regularizer term
        reg = 0.5 * (w * w)

        for i in range(x.shape[0]):
            # Optimization term
            opt_term = y[i] * (self.kernel(w, x[i]) - b)

            # calculating loss
            loss = reg + self.C * max(1 - opt_term, 0)

        return loss[0][0]

    def fit(self, X, Y, portion_size = 100, learn_rate = 0.001, max_iter = 1000):
        # The number of features in X
        number_of_features = X.shape[1]

        # The number of Samples in X
        number_of_samples = X.shape[0]

        # Creating ids from 0 to number_of_samples - 1
        ids = np.arange(number_of_samples)

        # Shuffling the samples randomly
        np.random.shuffle(ids)

        # Create an array of random numbers in the range [-1, 1] for w
        w = np.random.uniform(low=-1, high=1, size=(1, number_of_features))

        # Create a random number in the range [-10, 10] for b
        b = np.random.uniform(low=-10, high=10)
        losses = []

        # Gradient Descent logic
        for _ in range(max_iter):
            # Calculating the Hinge Loss
            loss = self.hingeloss(w, b, X, Y)

            # Appending all losses
            losses.append(loss)

            # Starting from 0 to the number of samples with batch_size as interval
            for batch_initial in range(0, number_of_samples, portion_size):
                gradw = 0
                gradb = 0

                for j in range(batch_initial, batch_initial + portion_size):
                    if j < number_of_samples:
                        x = ids[j]
                        ti = Y[x] * (self.kernel(w, X[x]) + b)

                        if ti > 1:
                            gradw += 0
                            gradb += 0
                        else:
                            # Calculating the gradients

                            #w.r.t w
                            gradw += self.C *Y[x] * X[x]
                            # w.r.t b
                            gradb += self.C * Y[x]

                # Updating weights and bias
                w = w - learn_rate * w + learn_rate * gradw
                b = b + learn_rate * gradb

        self.w = w
        self.b = b

        return self.w, self.b, losses

    def predict(self, X):
        return np.sign(np.dot(X, self.w[0]) - self.b)


In [16]:
input_file = "dataset.csv"
x_train, y_train, x_test, y_test = initialize_dataset(input_file)
svm = SVM(linear_kernel)
w, b, losses = svm.fit(x_train, y_train)
prediction = svm.predict(x_test)
lss = losses.pop()

print("Loss:", lss)
print("Prediction:", prediction)
print("Accuracy:", accuracy_score(prediction, y_test))
print("Precision:", precision_score(y_test, prediction))
print("Recall:", recall_score(y_test, prediction))
print("Confusion Matrix:\n", confusion_matrix(y_test, prediction))
print("w, b:", [w, b])

Loss: 0.0016388487898917495
Prediction: [ 1.  1.  1.  1. -1.  1.  1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1.  1.
  1.  1.  1.  1.  1.  1. -1.  1.  1. -1.  1.  1.  1. -1. -1.  1.  1.  1.
  1.  1. -1.  1. -1.  1. -1.  1.  1.  1.  1.  1. -1.  1. -1.  1.  1.  1.
 -1.  1. -1.  1.  1.  1. -1. -1. -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
 -1.  1. -1.  1.  1.  1. -1.  1.  1.  1.  1. -1. -1. -1.  1.  1. -1.  1.
  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1. -1.  1.
  1. -1. -1.  1.  1.  1. -1. -1.  1. -1. -1. -1. -1.  1.  1.  1.  1. -1.
  1.  1. -1.  1.  1. -1.  1.  1.  1.  1.  1. -1. -1.  1.  1.  1.  1.  1.
 -1.  1.  1.  1.  1. -1.  1. -1. -1.  1.  1. -1.  1.  1. -1.  1.  1.  1.
 -1.  1. -1.  1.  1. -1.  1. -1. -1. -1.  1.  1. -1.  1. -1. -1.  1.  1.
 -1.  1. -1.  1.  1.  1.  1.  1.  1. -1. -1. -1.  1.  1.  1.  1.  1.  1.
 -1.  1.]
Accuracy: 0.69
Precision: 0.6956521739130435
Recall: 0.8275862068965517
Confusion Matrix:
 [[42 42]
 [20 96]]
w, b: [array([[ 0.05630129,  0.061397