In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The cells below read the dataset CSV from a path under this mount point,
# so this cell has to run first on a fresh kernel.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


def sigmoid(z):
    """Apply the logistic function element-wise.

    z: scalar or numpy array of logits.
    Returns 1 / (1 + exp(-z)), with the same shape as ``z``.
    """
    denominator = 1 + np.exp(-z)
    return 1.0 / denominator


def predict(X, w, y=None):
    """Run the logistic model on X and, if labels are given, score it.

    X: Nsample x (d+1) design matrix
    w: (d+1) x 1 weight matrix
    y: Nsample x 1 array of 0/1 labels, or None when only probabilities
       are wanted

    Returns (y_hat, loss, acc):
      y_hat -- sigmoid(X @ w), the predicted probabilities
      loss  -- value of calc_loss_function(y_hat, y), or None when y is None
      acc   -- accuracy of the rounded probabilities against y, or None
               when y is None
    """
    z = np.dot(X, w)
    y_hat = sigmoid(z)

    # Without labels there is nothing to score; the loss helper would fail
    # on None, so return the probabilities alone.
    if y is None:
        return y_hat, None, None

    loss = calc_loss_function(y_hat, y)
    pred = np.round(y_hat)
    acc = calc_accuracy(pred, y)
    return y_hat, loss, acc


def calc_accuracy(predicted_labels, actual_labels):
    """Return the fraction of positions where the two label arrays agree.

    predicted_labels, actual_labels: array-likes holding the same number of
        labels. Both are flattened first, so an (N,) vector can be compared
        with an (N, 1) column without numpy broadcasting them into an
        (N, N) matrix.

    Returns a float in [0, 1].
    Raises ValueError if the inputs are empty or differ in element count.
    """
    predicted = np.asarray(predicted_labels).ravel()
    actual = np.asarray(actual_labels).ravel()

    if predicted.size != actual.size:
        raise ValueError(
            "label count mismatch: %d predicted vs %d actual" % (predicted.size, actual.size))
    if predicted.size == 0:
        raise ValueError("cannot compute accuracy of an empty label set")

    n_wrong = np.count_nonzero(predicted != actual)
    return 1.0 - (float(n_wrong) / predicted.size)


def calc_loss_function(y_hat, t, eps=1e-15):
    """Mean binary cross-entropy between probabilities and 0/1 targets.

    y_hat: array of predicted probabilities
    t:     array of targets, same shape as y_hat; t.shape[0] is used as the
           number of samples
    eps:   probabilities are clipped to [eps, 1 - eps] before the logarithm
           so a saturated prediction (exactly 0 or 1) gives a large finite
           loss instead of inf or nan

    Returns the summed per-element loss divided by t.shape[0].
    """
    N = t.shape[0]
    y_hat = np.clip(y_hat, eps, 1 - eps)
    cost1 = -t * np.log(y_hat)
    cost2 = (1 - t) * np.log(1 - y_hat)
    loss = cost1 - cost2
    return loss.sum() / N


def train(X_train, y_train, X_val, t_val):
    """Fit logistic-regression weights with mini-batch gradient descent.

    Reads the module-level settings ``N_class``, ``MaxEpoch``, ``batch_size``
    and ``alpha``; they must be defined before this function is called.

    X_train: N_train x (d+1) training design matrix
    y_train: N_train x 1 training labels
    X_val:   validation design matrix, same column layout as X_train
    t_val:   validation labels

    After every epoch the model is scored on the validation set and one
    progress line is printed.

    Returns (epoch_best, acc_best, W_best): the 0-based epoch with the
    highest validation accuracy, that accuracy, and the weights at the end
    of that epoch. If no epoch scores above 0.0 the initial (all-zero)
    weights are returned rather than None.
    """
    N_train = X_train.shape[0]
    n_batches = int(np.ceil(N_train / batch_size))

    # w: (d+1) x N_class
    w = np.zeros((X_train.shape[1], N_class))

    # Per-epoch histories, kept so the curves can be plotted if desired.
    acc_val_list = []
    losses_train = []

    # Start from a real array so callers can always pass the result on to
    # predict(), even when validation accuracy never rises above 0.
    W_best = w.copy()
    acc_best = 0.0
    epoch_best = 0

    for epoch in range(MaxEpoch):
        loss_this_epoch = 0.0
        for b in range(n_batches):
            X_batch = X_train[b * batch_size: (b + 1) * batch_size]
            y_batch = y_train[b * batch_size: (b + 1) * batch_size]

            y_hat_batch, loss_batch, _ = predict(X_batch, w, y_batch)
            # loss_batch is already a per-sample mean over this batch, so
            # weight it by the batch length; dividing the sum by N_train
            # below then gives the true mean loss per training sample
            # (the last batch may be shorter than batch_size).
            loss_this_epoch += loss_batch * X_batch.shape[0]

            # Mini-batch gradient descent step (scaled by the nominal
            # batch_size, including for a shorter final batch).
            w = w - alpha * (1 / batch_size) * np.dot(X_batch.T, y_hat_batch - y_batch)

        # monitor model behavior after each epoch
        # 1. mean training loss per sample for this epoch
        loss_this_epoch = loss_this_epoch / N_train
        losses_train.append(loss_this_epoch)

        # 2. perform validation on the validation dataset
        _, _, acc_val = predict(X_val, w, t_val)
        acc_val_list.append(acc_val)

        # 3. keep track of the best validation epoch, acc, and weight
        if acc_val > acc_best:
            acc_best = acc_val
            W_best = w.copy()
            epoch_best = epoch

        print(
            "epoch:[{0}/{1}]\t"
            "alpha:{2}\t"
            "train loss: {loss:.5f}\t"
            "acc:{acc:.5f}\t"
            "acc best:{acc_best:.5f}\t"
            "epoch best:{epoch_best}\t".format(epoch, MaxEpoch, alpha, loss=loss_this_epoch, acc=acc_val, acc_best=acc_best,
                                               epoch_best=epoch_best))

    return epoch_best, acc_best, W_best


if __name__ == '__main__':
    # Settings read as module-level names by train().
    batch_size = 100  # batch size
    alpha_list = [0.1, 0.01]  # learning rates to sweep
    MaxEpoch_list = [20, 50]  # epoch budgets to sweep
    N_class = 1  # one sigmoid output column

    data = pd.read_csv("/content/drive/My Drive/cmput466_final/spambase/spambase.csv")
    y_ = data["class"].to_numpy()

    # Features are every column but the last
    # (assumes "class" is the final column of the CSV -- TODO confirm).
    x_train, x_test, y_train, y_test = train_test_split(data.iloc[:, :-1], y_, test_size=0.2, random_state=0)

    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=0)

    x_train = x_train.to_numpy(dtype=float)
    x_val = x_val.to_numpy(dtype=float)
    x_test = x_test.to_numpy(dtype=float)

    # Standardize every split with the TRAINING mean/std only. Scaling the
    # validation and test sets with their own statistics would put them in
    # a different feature space from the one the weights were fitted in.
    train_mean = x_train.mean(axis=0)
    train_std = x_train.std(axis=0)
    train_std[train_std == 0] = 1.0  # constant column: avoid dividing by zero

    x_train = (x_train - train_mean) / train_std
    x_val = (x_val - train_mean) / train_std
    x_test = (x_test - train_mean) / train_std

    # Prepend a column of ones so the first weight acts as the bias.
    X_train = np.concatenate((np.ones([x_train.shape[0], 1]), x_train), axis=1)
    X_val = np.concatenate((np.ones([x_val.shape[0], 1]), x_val), axis=1)
    X_test = np.concatenate((np.ones([x_test.shape[0], 1]), x_test), axis=1)

    y_train = y_train.reshape([-1, 1])
    y_val = y_val.reshape([-1, 1])
    y_test = y_test.reshape([-1, 1])

    # Grid over learning rate x epoch budget; alpha and MaxEpoch are picked
    # up by train() as module-level names.
    for alpha in alpha_list:
        for MaxEpoch in MaxEpoch_list:
            _, _, w = train(X_train, y_train, X_val, y_val)
            y_hat_test, _, acc_test = predict(X_test, w, y_test)
            print("test accuracy : %.5f" % acc_test)
            print("---------------------------------------------")

epoch:[0/20]	alpha:0.1	train loss: 0.00463	acc:0.90109	acc best:0.90109	epoch best:0	
epoch:[1/20]	alpha:0.1	train loss: 0.00332	acc:0.90326	acc best:0.90326	epoch best:1	
epoch:[2/20]	alpha:0.1	train loss: 0.00299	acc:0.90652	acc best:0.90652	epoch best:2	
epoch:[3/20]	alpha:0.1	train loss: 0.00283	acc:0.90978	acc best:0.90978	epoch best:3	
epoch:[4/20]	alpha:0.1	train loss: 0.00272	acc:0.90870	acc best:0.90978	epoch best:3	
epoch:[5/20]	alpha:0.1	train loss: 0.00265	acc:0.91087	acc best:0.91087	epoch best:5	
epoch:[6/20]	alpha:0.1	train loss: 0.00260	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[7/20]	alpha:0.1	train loss: 0.00255	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[8/20]	alpha:0.1	train loss: 0.00252	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[9/20]	alpha:0.1	train loss: 0.00248	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[10/20]	alpha:0.1	train loss: 0.00246	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[11/20]	alpha:0.1	train loss: 0.00243	acc:0.913



epoch:[8/50]	alpha:0.1	train loss: 0.00252	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[9/50]	alpha:0.1	train loss: 0.00248	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[10/50]	alpha:0.1	train loss: 0.00246	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[11/50]	alpha:0.1	train loss: 0.00243	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[12/50]	alpha:0.1	train loss: 0.00241	acc:0.91087	acc best:0.91304	epoch best:6	
epoch:[13/50]	alpha:0.1	train loss: 0.00239	acc:0.91087	acc best:0.91304	epoch best:6	
epoch:[14/50]	alpha:0.1	train loss: 0.00238	acc:0.91087	acc best:0.91304	epoch best:6	
epoch:[15/50]	alpha:0.1	train loss: 0.00236	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[16/50]	alpha:0.1	train loss: 0.00235	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[17/50]	alpha:0.1	train loss: 0.00233	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[18/50]	alpha:0.1	train loss: 0.00232	acc:0.91304	acc best:0.91304	epoch best:6	
epoch:[19/50]	alpha:0.1	train loss: 0.00231	a

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

def data_preprocess(csv_path="/content/drive/My Drive/cmput466_final/spambase/spambase.csv"):
    """Load the spam dataset and return standardized train/test matrices.

    csv_path: location of the CSV; it must contain a "class" column holding
        the labels. All columns except the last are used as features
        (assumes "class" is the final column -- TODO confirm).

    The data is split 80/20 into train/test, then 25% of the training part
    is split off again; that held-out part is not returned. Features are
    standardized with the training mean and standard deviation, and a
    leading column of ones is prepended to each matrix.

    Returns (X_train, y_train, X_test, y_test); the label arrays are
    reshaped to a single column.
    """
    data = pd.read_csv(csv_path)
    y_ = data["class"].to_numpy()

    x_train, x_test, y_train, y_test = train_test_split(data.iloc[:, :-1], y_, test_size=0.2, random_state=0)
    # The second split only shrinks the training set here; its held-out
    # part (x_val, y_val) is unused in this function.
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=0)

    x_train = x_train.to_numpy(dtype=float)
    x_test = x_test.to_numpy(dtype=float)

    # Scale the test set with the TRAINING statistics so both splits live
    # in the same feature space; using the test set's own mean/std would
    # leak test information and mis-scale the features.
    train_mean = x_train.mean(axis=0)
    train_std = x_train.std(axis=0)
    train_std[train_std == 0] = 1.0  # constant column: avoid dividing by zero

    x_train = (x_train - train_mean) / train_std
    x_test = (x_test - train_mean) / train_std

    X_train = np.concatenate((np.ones([x_train.shape[0], 1]), x_train), axis=1)
    X_test = np.concatenate((np.ones([x_test.shape[0], 1]), x_test), axis=1)

    y_train = y_train.reshape([-1, 1])
    y_test = y_test.reshape([-1, 1])

    return X_train, y_train, X_test, y_test

def calc_accuracy(predicted_labels, actual_labels):
    """Return the fraction of positions where the two label arrays agree.

    predicted_labels, actual_labels: array-likes holding the same number of
        labels. Both are flattened first, so an (N,) vector can be compared
        with an (N, 1) column without numpy broadcasting them into an
        (N, N) matrix.

    Returns a float in [0, 1].
    Raises ValueError if the inputs are empty or differ in element count.
    """
    predicted = np.asarray(predicted_labels).ravel()
    actual = np.asarray(actual_labels).ravel()

    if predicted.size != actual.size:
        raise ValueError(
            "label count mismatch: %d predicted vs %d actual" % (predicted.size, actual.size))
    if predicted.size == 0:
        raise ValueError("cannot compute accuracy of an empty label set")

    n_wrong = np.count_nonzero(predicted != actual)
    return 1.0 - (float(n_wrong) / predicted.size)

def majority_guess(y_train, y_test):
    """Baseline that predicts the most frequent training label everywhere.

    y_train: array of non-negative integer labels (flattened before counting)
    y_test:  only its length is used

    Returns a len(y_test) x 1 array filled with the most common label of
    y_train.
    """
    label_counts = np.bincount(y_train.flatten())
    most_common = label_counts.argmax()
    repeated = np.array([most_common] * len(y_test))
    return repeated.reshape([-1, 1])

def decision_tree(X_train, y_train, X_test, random_state=0):
    """Fit a default DecisionTreeClassifier and predict labels for X_test.

    X_train, y_train: training features and labels passed to ``fit``
    X_test:           features to predict
    random_state:     seed forwarded to the classifier so repeated runs of
                      the notebook give the same tree; pass None for
                      unseeded behaviour

    Returns the predictions reshaped to a single column.
    """
    classifier = DecisionTreeClassifier(random_state=random_state)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    return y_pred.reshape([-1, 1])

def knn(X_train, y_train, X_test, k):
    """Classify X_test with a k-nearest-neighbours model fitted on the training data.

    k: number of neighbours, passed as ``n_neighbors``.

    Returns the predictions reshaped to a single column.
    """
    # A distinct local name avoids shadowing this function inside its own body.
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions.reshape([-1, 1])

def neural_network(X_train, y_train, X_test, opt, max_iter, random_state=0):
    """Fit a small MLPClassifier and predict labels for X_test.

    opt:          solver name passed to MLPClassifier (the notebook tries
                  "adam" and "sgd")
    max_iter:     iteration cap passed to MLPClassifier
    random_state: seed forwarded to the classifier so that runs are
                  repeatable and the solver / max_iter comparison is not
                  confounded by different random initialisations; pass
                  None for unseeded behaviour

    The network uses ReLU activations, hidden layers of sizes (5, 2) and
    alpha=1e-3.

    Returns the predictions reshaped to a single column.
    """
    nn = MLPClassifier(solver=opt, activation='relu', alpha=1e-3, hidden_layer_sizes=(5, 2),
                       max_iter=max_iter, random_state=random_state)
    nn.fit(X_train, y_train)
    y_pred = nn.predict(X_test)
    return y_pred.reshape([-1, 1])



if __name__ == '__main__':
    X_train, y_train, X_test, y_test = data_preprocess()

    # Print the shape of each split, in the order they were returned.
    for split in (X_train, y_train, X_test, y_test):
        print(split.shape)

    # The sklearn estimators below are fitted on 1-D labels.
    y_train_flat = y_train.flatten()

    # Baseline: always predict the most common training label.
    acc_maj = calc_accuracy(majority_guess(y_train, y_test), y_test)
    print("Acc of majority guess: %.5f" % acc_maj)

    # Decision tree.
    dt_pred = decision_tree(X_train, y_train_flat, X_test)
    acc_dt = calc_accuracy(dt_pred, y_test)
    print("Acc of decision tree: %.5f" % acc_dt)

    # k-nearest neighbours for k = 3, 4, 5.
    for k in (3, 4, 5):
        knn_pred = knn(X_train, y_train_flat, X_test, k)
        acc_knn = calc_accuracy(knn_pred, y_test)
        print("Acc of knn: %.5f with k = %d" % (acc_knn, k))

    # Neural network: every solver / iteration-cap combination.
    for opt in ("adam", "sgd"):
        for max_iter in (100, 200):
            nn_pred = neural_network(X_train, y_train_flat, X_test, opt=opt, max_iter=max_iter)
            acc_nn = calc_accuracy(nn_pred, y_test)
            print("iteration:[%d], opt:[%s], Acc of neural network: %.5f" % (max_iter, opt, acc_nn))




(2760, 58)
(2760, 1)
(921, 58)
(921, 1)
Acc of majority guess: 0.58415
Acc of decision tree: 0.91857
Acc of knn: 0.91531 with k = 3
Acc of knn: 0.90988 with k = 4
Acc of knn: 0.91314 with k = 5




iteration:[100], opt:[adam], Acc of neural network: 0.91965




iteration:[200], opt:[adam], Acc of neural network: 0.91314




iteration:[100], opt:[sgd], Acc of neural network: 0.90445
iteration:[200], opt:[sgd], Acc of neural network: 0.89142


