## Regularized Support Vector Machine
### Joseph Melby
### 11/02/18

#### Given training data: `MNIST_X_train.csv` (feature values), `MNIST_y_train.csv` (labels)

#### Test data: `MNIST_X_test.csv` (feature values), `MNIST_y_test.csv` (labels).

#### File House feature MNIST description.csv gives a brief introduction to these data sets.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler

def read_dataset(feature_file:str, label_file:str) -> tuple:
    """Load a feature table and a label table from CSV into NumPy arrays.

    Both files are parsed with ``pandas.read_csv`` using its default options.

    Args:
    feature_file (str): Path of the CSV file holding the feature values
    label_file (str): Path of the CSV file holding the labels

    Returns:
    tuple: ``(X, y)`` - feature values and label values as numpy arrays
    """
    # The feature file is read first, then the label file.
    features, labels = (
        pd.read_csv(csv_path).values for csv_path in (feature_file, label_file)
    )
    return features, labels

# Load the training and test splits; the relative paths are resolved against
# the current working directory.
X_train, y_train = read_dataset('MNIST_X_train.csv', 'MNIST_y_train.csv')
X_test, y_test = read_dataset('MNIST_X_test.csv', 'MNIST_y_test.csv')

def plot_digit(feature_vector):
    """Display a single digit as a grayscale image.

    Args:
    feature_vector (numpy array): Pixel values of one digit; it must contain
        exactly 64 entries because it is reshaped to 8x8 before plotting

    Returns:
    None
    """
    plt.gray()  # switch the colormap to gray before drawing
    pixel_grid = feature_vector.reshape(8, 8)
    plt.matshow(pixel_grid)
    plt.show()

# Show the label stored for the first training sample.
print('Label ', y_train[0])

def normalize_features(X_train:np.ndarray, X_test:np.ndarray) -> tuple:
    """Standardize both feature sets with statistics taken from the training set.

    A single ``StandardScaler`` is fitted on ``X_train`` only and then applied
    to both arrays, so the test data is scaled with the training statistics.

    Args:
    X_train (numpy array): Training data features
    X_test (numpy array): Testing data features

    Returns:
    tuple: ``(X_train_norm, X_test_norm)`` - the scaled training and testing features
    """
    fitted_scaler = StandardScaler().fit(X_train)  # fit() returns the scaler itself
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

# Scale both splits using statistics fitted on the training features.
X_train_norm, X_test_norm = normalize_features(X_train, X_test)

def _signed_one_hot(labels, num_classes=10):
    """Return a (n_samples, num_classes) float array: +1 at each label, -1 elsewhere."""
    class_ids = np.asarray(labels).ravel().astype(int)
    if class_ids.size and (class_ids.min() < 0 or class_ids.max() >= num_classes):
        raise ValueError(
            "labels must lie in the range [0, %d)" % num_classes
        )
    encoded = np.full((class_ids.size, num_classes), -1.0)
    encoded[np.arange(class_ids.size), class_ids] = 1.0
    return encoded


def one_hot_encoder(y_train:np.ndarray, y_test:np.ndarray) -> tuple:
    """One-hot encodes the given labels using a -1/+1 coding over 10 classes.

    Each label becomes a row of ten values that is +1 in the column of its
    class and -1 everywhere else (the target coding used by the hinge loss).
    The training and testing labels are encoded independently, so the two
    sets may have different lengths. Empty label arrays yield arrays with
    zero rows.

    Args:
    y_train (numpy array): Training data labels (integers 0-9, any shape that
        flattens to one label per sample)
    y_test (numpy array): Testing data labels (same convention)

    Returns:
    tuple: One-hot encoded labels for training and testing data, with shapes
        (len(y_train), 10) and (len(y_test), 10)

    Raises:
    ValueError: If a label lies outside the range 0-9
    """
    return _signed_one_hot(y_train), _signed_one_hot(y_test)

# Encode both label sets and print a few encoded rows as a quick visual check.
y_train_ohe, y_test_ohe = one_hot_encoder(y_train, y_test)
print(y_train_ohe[0])
print(y_train_ohe[1])
print(y_test_ohe[0])


def predictor(X, c, b=0.0):
    """
    Computes the prediction for the input data based on the hyperplane equation.

    The result is ``X.dot(c) + b``. The intercept is optional so that callers
    holding only a coefficient vector can evaluate a hyperplane through the
    origin.

    Args:
        X (numpy.ndarray): Input data of shape (n_samples, n_features).
        c (numpy.ndarray): Coefficients of the hyperplane, of shape
            (n_features,) or (n_features, 1).
        b (float): Intercept of the hyperplane. Defaults to 0.0.

    Returns:
        numpy.ndarray: Decision values, of shape (n_samples,) when ``c`` is
        one-dimensional and (n_samples, 1) when ``c`` is a column.
    """
    return X.dot(c) + b


def loss(X, y_train, c, b, reg):
    """
    Computes the loss function for SVM.

    The value is ``norm(c) + reg * sum_i max(0, 1 - y_i * f(x_i))`` where
    ``f`` is the hyperplane evaluated by ``predictor``.

    Parameters:
        X (numpy array): Input data
        y_train (numpy array): Target labels, one per row of X
        c (numpy array): Coefficients, flat or as a single column
        b (float): Intercept
        reg (float): Regularization parameter (weight of the hinge term)

    Returns:
        float: Value of the loss function
    """
    # Flatten both sides so flat and column-shaped coefficients multiply
    # element-wise, one decision value per label.
    targets = np.asarray(y_train, dtype=float).ravel()
    # The decision values are computed once for all samples.
    scores = np.asarray(predictor(X, c, b), dtype=float).ravel()
    hinge = np.maximum(0.0, 1.0 - targets * scores)
    return np.linalg.norm(c) + reg * hinge.sum()


def hinge_gradient(X, y_train, c, b):
    """
    Computes the gradient of the hinge loss function.

    Only samples that violate the margin (``1 - y_i * f(x_i) > 0``)
    contribute. Each such sample adds ``-y_i`` to the intercept entry and
    ``-y_i * x_i`` to the coefficient entries; the contributions are summed
    over all violating samples.

    Parameters:
        X (numpy array): Input data
        y_train (numpy array): Target labels, one per row of X
        c (numpy array): Coefficients, flat or as a single column
        b (float): Intercept

    Returns:
        numpy array: Gradient of the hinge loss function, of length
        ``len(c) + 1``; entry 0 belongs to the intercept and entries 1..
        belong to the coefficients
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y_train, dtype=float).ravel()
    # The decision values are computed once for all samples.
    scores = np.asarray(predictor(X, c, b), dtype=float).ravel()
    violating = (1.0 - targets * scores) > 0

    hingegrad = np.zeros(shape=(len(c) + 1))
    hingegrad[0] = -targets[violating].sum()
    # Vector-matrix product = sum over violating samples of y_i * x_i.
    hingegrad[1:] = -(targets[violating] @ features[violating])
    return hingegrad


def gradient_descent(X, y, c, b, reg, epochs=1000, learning_rate=0.01):
    """
    Performs gradient descent to optimize the SVM objective function.

    The objective is the one evaluated by ``loss``:
    ``norm(c) + reg * sum(hinge)``. Its (sub)gradient is recomputed from the
    current parameters in every epoch: ``c / norm(c)`` for the norm term
    (taken as zero when ``c`` is the zero vector) plus ``reg`` times the
    hinge gradient.

    Parameters:
        X (numpy array): Input data
        y (numpy array): Target labels, one per row of X
        c (numpy array): Initial coefficients, flat or as a single column
        b (float): Initial intercept
        reg (float): Regularization parameter (weight of the hinge term)
        epochs (int): Number of epochs for training
        learning_rate (float): Learning rate for gradient descent

    Returns:
        numpy array: Optimized coefficients, in the same shape as the input ``c``
        float: Optimized intercept
        list: Loss history during training (one float per epoch, recorded
            before that epoch's update)
    """
    coeff_shape = np.shape(c)
    # Work on flat vectors internally so the length-(n_features) gradient
    # cannot broadcast against a column vector; the shape is restored on return.
    weights = np.asarray(c, dtype=float).ravel()
    targets = np.asarray(y).ravel()

    loss_history = []
    for _ in range(epochs):
        loss_history.append(float(loss(X, targets, weights, b, reg)))

        hingegrad = hinge_gradient(X, targets, weights, b)
        weight_norm = np.linalg.norm(weights)
        # norm(c) is not differentiable at 0; use the zero subgradient there.
        if weight_norm > 0:
            normgrad = weights / weight_norm
        else:
            normgrad = np.zeros_like(weights)

        b = b - learning_rate * reg * hingegrad[0]
        weights = weights - learning_rate * (normgrad + reg * hingegrad[1:])
    return weights.reshape(coeff_shape), b, loss_history


def log_reg_binary_train(X_train, y_train):
    """
    Trains one binary classifier with ``gradient_descent``.

    Despite the historical name, the model is fitted through
    ``gradient_descent`` on the SVM objective defined in this file. Training
    starts from zero coefficients and a zero intercept and uses a fixed
    regularization parameter of 100, 100 epochs and a learning rate of 0.1.

    Parameters:
        X_train (numpy array): Input data of shape (n_samples, n_features)
        y_train (numpy array): Binary target labels, one per row of X_train

    Returns:
        numpy array: Optimized coefficients, as a column of shape
        (n_features, 1)
    """
    # Size the weights from the data passed in rather than from a
    # module-level array.
    coeffs_0 = np.zeros((X_train.shape[1], 1))
    b_0 = 0
    # NOTE(review): the fitted intercept and the loss history are discarded
    # because callers expect only the coefficients.
    coeffs_grad, _, _ = gradient_descent(
        X_train, y_train, coeffs_0, b_0, 100, epochs=100, learning_rate=0.1
    )
    return coeffs_grad

def log_reg_OVR_train(X_train, y_train):
    """
    Fits one binary model per label column ("One-vs-Rest").

    Every column of ``y_train`` is handed, together with ``X_train``, to
    ``log_reg_binary_train``; the results are collected in column order.

    Args:
    - X_train (ndarray): Input data with shape (n_samples, n_features).
    - y_train (ndarray): Target labels with shape (n_samples, n_classes).

    Returns:
    - weights_list (list): The weights returned for each column, in column order.
    """
    num_columns = y_train.shape[1]
    return [
        log_reg_binary_train(X_train, y_train[:, column])
        for column in range(num_columns)
    ]

def prediction(weights_list, X_test):
    """
    Computes the predicted labels for a given set of test samples.

    Each entry of ``weights_list`` is evaluated on ``X_test`` with
    ``predictor``; the predicted label of a sample is the position of the
    model with the largest decision value.

    Args:
    - weights_list (list): Coefficients of the trained binary models, one
      entry per class, each flat or as a single column.
    - X_test (ndarray): Test data with shape (n_samples, n_features).

    Returns:
    - ypred (ndarray): Predicted labels for each test sample with shape (n_samples,).

    Raises:
    - ValueError: If ``weights_list`` is empty.
    """
    if len(weights_list) == 0:
        raise ValueError("weights_list must contain at least one trained model")

    # weights_list carries coefficients only, so the intercept is passed
    # explicitly as 0. Each model's decision values become one column.
    decision_matrix = np.column_stack(
        [
            np.asarray(predictor(X_test, weights, 0.0)).reshape(-1)
            for weights in weights_list
        ]
    )
    # The column index of the largest decision value is the class label.
    return np.argmax(decision_matrix, axis=1).astype(int)

# Train one model per class column of the encoded labels, then display a
# single test digit and print the label predicted for it.
weights_list = log_reg_OVR_train(X_train_norm, y_train_ohe)
index = 20    
plot_digit(X_test[index])  # plot the unscaled pixels
ypred = prediction(weights_list, X_test_norm[index:index+1])  # slice keeps the input 2-D
print(ypred)

def accuracy(ypred, yexact):
    """
    Returns the fraction of predictions that equal the true labels.

    Args:
    - ypred (ndarray): Predicted labels with shape (n_samples,).
    - yexact (ndarray): True labels with shape (n_samples,).

    Returns:
    - accuracy (float): Number of matching entries divided by ``len(yexact)``.
    """
    matches = ypred == yexact
    total = float(len(yexact))
    return np.array(matches, dtype=int).sum() / total

# Predict every test sample and report the fraction classified correctly;
# y_test is flattened so it compares element-wise with the predictions.
ypred = prediction(weights_list, X_test_norm)
print('Accuracy of our model ', accuracy(ypred, y_test.ravel()))


# Stochastic GD (SGD)
def SGD(X, y, c, epochs=1000, learning_rate=0.00, batch_size=10, reg=1.0):
    """
    Performs mini-batch stochastic (sub)gradient descent on the SVM objective.

    Each batch is scored with this file's ``loss`` and the weights are
    updated with the matching subgradient: ``c / norm(c)`` (zero at
    ``c = 0``) plus ``reg`` times the coefficient part of ``hinge_gradient``.
    The model has no intercept here; the hyperplane passes through the origin.

    NOTE(review): the default ``learning_rate`` of 0.00 is kept for backward
    compatibility, but with it the weights never change - callers should pass
    a positive value.

    Args:
    - X (ndarray): Input data with shape (n_samples, n_features).
    - y (ndarray): Target labels with shape (n_samples,), one per row of X.
    - c (ndarray): Initial weight vector, flat or as a single column.
    - epochs (int): The number of epochs to train the model.
    - learning_rate (float): The learning rate of the model.
    - batch_size (int): The batch size to use during training.
    - reg (float): Weight of the hinge term in the objective. Defaults to 1.0.

    Returns:
    - c (ndarray): The learned weight vector, in the same shape as the input ``c``.
    - loss_history (list): Average batch loss of each epoch.

    Raises:
    - ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float).ravel()
    weight_shape = np.shape(c)
    # Flat weights keep the update from broadcasting against a column vector.
    weights = np.asarray(c, dtype=float).ravel()

    loss_history = [0] * epochs
    for epoch in range(epochs):
        batch_loss = []
        # loop through batches
        for start in range(0, features.shape[0], batch_size):
            X_current_batch = features[start:start + batch_size]
            y_current_batch = targets[start:start + batch_size]

            batch_loss.append(
                float(loss(X_current_batch, y_current_batch, weights, 0.0, reg))
            )

            # Gradients use only the current batch, never the full data set.
            hingegrad = hinge_gradient(X_current_batch, y_current_batch, weights, 0.0)
            weight_norm = np.linalg.norm(weights)
            if weight_norm > 0:
                normgrad = weights / weight_norm
            else:
                normgrad = np.zeros_like(weights)
            # Entry 0 of hingegrad belongs to the intercept, which is unused.
            weights = weights - learning_rate * (normgrad + reg * hingegrad[1:])

        # With no samples there are no batches; record NaN for the epoch.
        loss_history[epoch] = float(np.mean(batch_loss)) if batch_loss else float("nan")
    return weights.reshape(weight_shape), loss_history