## Regularized Support Vector Machine
### Joseph Melby
### 11/02/18
### Problem 1
#### Given training data: MNIST X train.csv (feature values), MNIST y train.csv (labels)

#### Test data: MNIST X test.csv (feature values), MNIST y test.csv (labels) .

#### File House feature MNIST description.csv gives a brief introduction to these data sets.

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline 

def read_dataset(feature_file, label_file):
    ''' Read data set in *.csv to data frame in Pandas'''
    df_X = pd.read_csv(feature_file)
    df_y = pd.read_csv(label_file)
    X = df_X.values # convert values in dataframe to numpy array (features)
    y = df_y.values # convert values in dataframe to numpy array (label)
    return X, y


X_train, y_train = read_dataset('MNIST_X_train.csv', 'MNIST_y_train.csv')
X_test, y_test = read_dataset('MNIST_X_test.csv', 'MNIST_y_test.csv')

#print(X_train.shape)
#print(X_test.shape)

def plot_digit(feature_vector): 
    plt.gray() 
    plt.matshow(feature_vector.reshape(8,8))
    plt.show() 

#plot_digit(X_train[0])
print('Label ', y_train[0])

def normalize_features(X_train, X_test):
    from sklearn.preprocessing import StandardScaler #import libaray
    scaler = StandardScaler() # call an object function
    scaler.fit(X_train) # calculate mean, std in X_train
    X_train_norm = scaler.transform(X_train) # apply normalization on X_train
    X_test_norm = scaler.transform(X_test) # we use the same normalization on X_test
    return X_train_norm, X_test_norm

X_train_norm, X_test_norm = normalize_features(X_train, X_test)


def one_hot_encoder(y_train, y_test):
    trainneg = np.ones(shape=(len(y_train),10))
    train = -1*trainneg
    testneg = np.ones(shape=(len(y_train),10))
    test = -1*testneg
    for i in range(len(y_train)):
        trainval = y_train[i]
        train[i,trainval] = 1
        testval = y_train[i]
        test[i,testval] = 1
        y_train_ohe = train
        y_test_ohe = test
    return y_train_ohe, y_test_ohe
# label is 0 -> [1 0 0 0 0 0 0 0 0]
# label is 3 -> [0 0 0 1 0 0 0 0 0]

y_train_ohe, y_test_ohe = one_hot_encoder(y_train, y_test)
print(y_train_ohe[0])
print(y_train_ohe[1])
print(y_test_ohe[0])


# Predictor function should be a general hyperplane equation.
def predictor(X, c, b):
    ##hyperplane function
     return X.dot(c)+b


######################### Our classifier at the end should output either 1 or -1 ##################################
    
def loss(X, y_train, c, b, reg):
#     print(y_pred.shape)
    hingeloss = np.zeros(shape=(len(y_train)))
    for i in range(len(y_train)):
        hingeloss[i]= np.max([0, 1-y_train[i]*predictor(X,c,b)[i]])
    return np.linalg.norm(c) + reg*sum(hingeloss)

def hinge_gradient(X, y_train, c, b):
    hingegrad = np.zeros(shape=(len(c)+1))
    for i in range(len(y_train)):
        if 1-y_train[i]*predictor(X,c,b)[i] > 0:
            hingegrad[0] += -y_train[i]
            for j in range(len(c)):
                hingegrad[j+1] = -y_train[i]*X[i,j]
    return hingegrad

def gradient_descent(X, y, c, b, reg, epochs=1000, learning_rate=0.01):
    loss_history = [0]*epochs
    normgrad = np.zeros(shape=(len(c)+1))
    for i in range(len(c)):
        normgrad[i+1] = c[i]*(np.linalg.norm(c))**(-1/2)
    hingegrad = hinge_gradient(X, y_train, c, b)
    for epoch in range(epochs):
        loss_history[epoch] = loss(X, y, c, b, reg).ravel()
        gradient = normgrad + hingegrad
        # updating coeffs upon the gradient change]
        b = b - learning_rate*gradient[0]
        c = c - learning_rate*gradient[1:]
    return c, b, loss_history


def log_reg_binary_train(X_train, y_train):  
    ''' Training our model based on the training data
        Input: X_train: input features
               y_train: binary labels
        Return: coeffs of the logistic model
    '''
    coeffs_0 = np.zeros((X_train_norm.shape[1], 1))
    b_0 = 0
    coeffs_grad, b_grad, history_loss = gradient_descent(X_train, y_train, coeffs_0, b_0, 100, epochs=100, learning_rate=0.1)
    return coeffs_grad


def log_reg_OVR_train(X_train, y_train):# y_train: one_hot_encoder labels
    # y_train will have 10 columns
    weights = []
    for i in range(y_train.shape[1]): # 10 columns 
        y_train_one_column = y_train[:,i] # pick ith columns
        weights_one_column = log_reg_binary_train(X_train, y_train_one_column)
        weights.append(weights_one_column)
    return weights


def prediction(weights_list, X_test):
    i = 0
    for weights in weights_list:
        decision_one_column = predictor(X_test, weights)
        # probabily of one column
        if i == 0:
            decision_matrix = decision_one_column
        else:
            # combine all decision columns to form a matrix
            decision_matrix = np.concatenate(
                              (decision_matrix, decision_one_column),
                               axis=1)
        i += 1
    labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    num_test_samples = X_test.shape[0]
    # find which index gives us the highest probability
    ypred = np.zeros(num_test_samples, dtype=int) 
    for i in range(num_test_samples):
        ypred[i] = labels[np.argmax(decision_matrix[i,:])]
    return ypred


weights_list = log_reg_OVR_train(X_train_norm, y_train_ohe)
index = 20    
plot_digit(X_test[index])
ypred = prediction(weights_list, X_test_norm[index:index+1])
print(ypred)

def accuracy(ypred, yexact):
    p = np.array(ypred == yexact, dtype = int)
    return np.sum(p)/float(len(yexact))

ypred = prediction(weights_list, X_test_norm)
print('Accuracy of our model ', accuracy(ypred, y_test.ravel()))


# Stochastic GD (SGD)
def SGD(X, y, c, epochs=1000, learning_rate=0.00, batch_size=10):
    y = y.reshape(-1, 1)
    loss_history = [0]*epochs
    for epoch in range(epochs):
        # loop through batches
        batch_loss = []
        for i in np.arange(0, X.shape[0], batch_size):
            X_current_batch = X[i:i+batch_size]
            y_current_batch = y[i:i+batch_size]
            yhat = predictor(X_current_batch, c)
            loss_current_batch = loss_function(X_current_batch, c, y_current_batch).ravel()
            batch_loss.append(loss_current_batch)
            gradient = XT.dot(yhat - y)/float(len(y))
            c = c - learning_rate*gradient
        loss_history[epoch] = np.average(batch_loss)
    return c, loss_history

Label  [2]
[-1. -1.  1. -1. -1. -1. -1. -1. -1. -1.]
[-1. -1. -1. -1. -1. -1. -1. -1. -1.  1.]
[-1. -1.  1. -1. -1. -1. -1. -1. -1. -1.]




ValueError: setting an array element with a sequence.