In [3]:
# Basic Perceptron
# Reference: https://stackoverflow.com/questions/47213847/how-to-implement-perceptron-in-python

from __future__ import division
import numpy as np
import os
# os.chdir("/home/minion/Desktop/ML/HW2")
# Seed NumPy's global random number generator so runs are repeatable.
# NOTE(review): nothing visible in this cell draws random numbers, so this
# call may be unnecessary -- confirm before removing.
np.random.seed(123)

# TODO: also print the percentage of positive predictions.


def perceptron_avg(X, Y, epochs):
    """Train an averaged perceptron and report training statistics.

    A column of ones is prepended to ``X`` so that ``w[0]`` plays the role of
    the bias. Examples are visited in row order; whenever ``y * (w . x) <= 0``
    the weights are updated with ``w += y * x``.

    Args:
        X: 2-D array of feature rows (without the bias column).
        Y: Sequence of labels indexed like the rows of ``X``; the update rule
            assumes values of +1 / -1.
        epochs: Maximum number of passes over the data.

    Returns:
        A 4-tuple ``(weights, final_iter, updates, update_rate)``:

        * ``weights`` -- ``c * w - w_a`` where ``c`` is the number of examples
          processed. This equals the sum of the weight vector after every
          processed example, i.e. the average weight vector scaled by ``c``.
        * ``final_iter`` -- ``epochs`` if training never converged, otherwise
          the zero-based index of the first epoch without any mistake.
        * ``updates`` -- number of mistakes made in the last epoch that ran
          (0 when no epoch ran).
        * ``update_rate`` -- total number of updates divided by the total
          number of examples processed (0.0 when nothing was processed).
    """
    ones = np.ones((X.shape[0], 1))  # Bias column.
    X1 = np.append(ones, X, axis=1)  # Bias first, then the original features.

    w = np.zeros(X1.shape[1])  # Current weights.
    w_a = np.zeros(X1.shape[1])  # Step-weighted sum of updates, used for averaging.
    final_iter = epochs

    c = 0  # Number of examples processed so far.
    update_counter = 0  # Total number of updates over all epochs.
    misclassified = 0  # Defined up front so that epochs <= 0 does not raise NameError.
    for epoch in range(epochs):
        misclassified = 0

        for i, x in enumerate(X1):
            y = Y[i]
            # A non-positive margin means the example is misclassified
            # (or lies exactly on the decision boundary).
            if np.dot(x, w) * y <= 0:
                w = w + x * y
                w_a = w_a + c * x * y
                misclassified += 1
            c += 1

        update_counter += misclassified
        if misclassified == 0:  # Converged: a full pass without mistakes.
            final_iter = epoch
            break

    # Divide by the number of examples actually processed. The previous
    # denominator, final_iter * n, was zero for empty data and one epoch short
    # after convergence because final_iter is then a zero-based epoch index.
    update_rate = float(update_counter) / c if c else 0.0
    return c * w - w_a, final_iter, misclassified, update_rate


def calculate_error_rate(X, Y, w):
    """Return the fraction of rows of ``X`` that ``w`` misclassifies.

    A bias column of ones is prepended to ``X`` before scoring, so ``w`` must
    have one more entry than ``X`` has columns. A row counts as an error when
    ``label * score <= 0``, which includes a score of exactly zero.
    """
    bias_column = np.ones((X.shape[0], 1))
    augmented = np.concatenate((bias_column, X), axis=1)

    # One unit per row whose signed margin is not strictly positive.
    errors = sum(
        1
        for row_idx, row in enumerate(augmented)
        if np.dot(row, w) * Y[row_idx] <= 0
    )

    return errors / (X.shape[0])


def load_binarized_features(filename, num_rows, num_features):
    """Load a ", "-separated file and one-hot encode every non-target column.

    Column 9 of each line is treated as the target and dropped. Each distinct
    ``(position, value)`` pair among the remaining fields is assigned the next
    free column index in order of first appearance, and that column is set to
    1 for the row.

    Args:
        filename: Path of the text file to read.
        num_rows: Number of rows to allocate in the result; must be at least
            the number of lines in the file.
        num_features: Number of columns to allocate; must be at least the
            number of distinct ``(position, value)`` pairs in the file.

    Returns:
        A ``(num_rows, num_features)`` float array of zeros and ones.

    Raises:
        IndexError: If the file needs more rows or columns than were allocated.
    """
    target_column = 9

    # Read inside a context manager so the file handle is always closed.
    with open(filename) as infile:
        rows = [line.strip().split(", ") for line in infile]

    mapping = {}  # (position, value) -> column index
    binary_data = np.zeros((num_rows, num_features))

    for row_idx, fields in enumerate(rows):
        kept = [value for idx, value in enumerate(fields) if idx != target_column]
        # Positions are counted after the target column has been removed.
        for position, value in enumerate(kept):
            column = mapping.setdefault((position, value), len(mapping))
            binary_data[row_idx, column] = 1

    return binary_data


def load_labels(filename):
    """Read the target column (index 9) of a ", "-separated file as +1 / -1.

    ``'<=50K'`` maps to -1 and ``'>50K'`` maps to 1.

    NOTE: lines that have no tenth field, or whose tenth field is neither of
    those two strings, are skipped without warning, so the result can be
    shorter than the number of lines in the file.

    Args:
        filename: Path of the text file to read.

    Returns:
        A 1-D NumPy array of labels in file order.
    """
    target_column = 9
    label_values = {'<=50K': -1, '>50K': 1}

    y_array = []
    # Read inside a context manager so the file handle is always closed.
    with open(filename) as infile:
        for line in infile:
            fields = line.strip().split(", ")
            if len(fields) > target_column and fields[target_column] in label_values:
                y_array.append(label_values[fields[target_column]])

    return np.array(y_array)

def main():
    """Train the averaged perceptron for 1..100 epochs and print dev error.

    The training and dev files are concatenated into ``combined.txt`` so that
    both splits are one-hot encoded with the same feature mapping. The first
    5000 rows are used for training and the remaining rows for evaluation.
    """
    filenames = ['income.train.txt.5k', 'income.dev.txt']
    filename = 'combined.txt'
    with open(filename, 'w') as outfile:
        for f in filenames:
            with open(f) as infile:
                outfile.writelines(infile)

    num_train = 5000
    num_rows = 6000
    # Must be at least the number of distinct (column, value) pairs in
    # combined.txt, otherwise load_binarized_features raises IndexError.
    # Presumably 233 is exactly that count for this dataset -- TODO confirm.
    num_features = 233

    binarized_features = load_binarized_features(filename, num_rows, num_features)

    xTrain = binarized_features[:num_train]
    xDev = binarized_features[num_train:]

    labels = load_labels(filename)

    yTrain = np.array(labels[:num_train])
    yDev = np.array(labels[num_train:])

    # Fit perceptron. Each iteration retrains from scratch with a larger epoch
    # budget, so the total cost grows quadratically with max_epochs.
    max_epochs = 100
    for epoch in range(1, max_epochs + 1):
        w, final_iter, updates, update_rate = perceptron_avg(xTrain, yTrain, epoch)
        # Calculate error rate
        print("epoch", epoch,
              "updates", updates, round(calculate_error_rate(xDev, yDev, w)*100, 2), "%",
              'update_rate:', update_rate)


# Run the experiment only when this module is executed as the main program.
if __name__ == "__main__":
    main()

epoch 1 updates 1257 15.0 % update_rate: 0.2514
epoch 2 updates 1221 15.1 % update_rate: 0.2478
epoch 3 updates 1177 14.8 % update_rate: 0.24366666666666667
epoch 4 updates 1170 14.7 % update_rate: 0.24125
epoch 5 updates 1172 14.8 % update_rate: 0.23988
epoch 6 updates 1185 15.2 % update_rate: 0.2394
epoch 7 updates 1165 15.5 % update_rate: 0.23848571428571427
epoch 8 updates 1185 15.9 % update_rate: 0.2383
epoch 9 updates 1184 15.8 % update_rate: 0.23813333333333334
epoch 10 updates 1181 15.7 % update_rate: 0.23794
epoch 11 updates 1156 15.6 % update_rate: 0.23732727272727272
epoch 12 updates 1138 15.7 % update_rate: 0.23651666666666665
epoch 13 updates 1165 15.7 % update_rate: 0.23624615384615386
epoch 14 updates 1174 15.7 % update_rate: 0.23614285714285715
epoch 15 updates 1166 15.6 % update_rate: 0.23594666666666667
epoch 16 updates 1162 15.6 % update_rate: 0.235725
epoch 17 updates 1183 15.4 % update_rate: 0.23577647058823528
epoch 18 updates 1159 15.4 % update_rate: 0.2355555555

KeyboardInterrupt: 

In [None]:
#!/usr/bin/env python3

# Averaged Perceptron
# References:
# https://stackoverflow.com/questions/47213469/how-to-implement-averaged-perceptron-in-python-without-scikit-learn
# https://stackoverflow.com/questions/47213847/how-to-implement-perceptron-in-python

from collections import defaultdict
import numpy as np
import os
# NOTE(review): this switches the working directory to a hard-coded,
# machine-specific absolute path; the relative data file names used further
# down are resolved against it. Adjust or remove when running elsewhere.
os.chdir("/home/minion/Desktop/ML/HW2")


def process_data(filename):
    """Load a ", "-separated file into one-hot feature rows and +1 / -1 labels.

    Relies on the module-level ``feature_map`` (``(field index, value)`` ->
    column index) and ``dimension`` (row length), which must be defined before
    this function is called.

    Args:
        filename: Path of the text file to read. The last field of every line
            is treated as the target.

    Returns:
        A tuple ``(X, Y)`` of NumPy arrays. ``X`` has one row of length
        ``dimension`` per line; ``Y`` holds 1 where the last field is
        ``">50K"`` and -1 otherwise (so files without real labels get -1).
    """
    X, Y = [], []
    # Read inside a context manager so the file handle is always closed.
    with open(filename) as infile:
        for line in infile:
            features = line.strip().split(", ")
            feat_vec = np.zeros(dimension)
            for i, fv in enumerate(features[:-1]):  # last one is target
                index = feature_map.get((i, fv))
                if index is not None:  # ignore features missing from feature_map
                    feat_vec[index] = 1

            X.append(feat_vec)
            Y.append(1 if features[-1] == ">50K" else -1)
    return np.array(X), np.array(Y)


def perceptron_avg(X, Y, epochs):
    """Train an averaged perceptron with a bias term.

    A column of ones is prepended to ``X`` so that ``w[0]`` plays the role of
    the bias. Examples are visited in row order; whenever ``y * (w . x) <= 0``
    the weights are updated with ``w += y * x``.

    Args:
        X: 2-D array of feature rows (without the bias column).
        Y: Sequence of labels indexed like the rows of ``X``; the update rule
            assumes values of +1 / -1.
        epochs: Maximum number of passes over the data.

    Returns:
        A 3-tuple ``(weights, final_iter, updates)``:

        * ``weights`` -- ``c * w - w_a`` where ``c`` is the number of examples
          processed. This equals the sum of the weight vector after every
          processed example, i.e. the average weight vector scaled by ``c``.
          Index 0 is the bias weight.
        * ``final_iter`` -- ``epochs`` if training never converged, otherwise
          the zero-based index of the first epoch without any mistake.
        * ``updates`` -- number of mistakes made in the last epoch that ran
          (0 when no epoch ran).
    """
    ones = np.ones((X.shape[0], 1))  # Bias column.
    X1 = np.append(ones, X, axis=1)  # Bias first, then the original features.

    w = np.zeros(X1.shape[1])  # Current weights.
    w_a = np.zeros(X1.shape[1])  # Step-weighted sum of updates, used for averaging.
    final_iter = epochs

    c = 0  # Number of examples processed so far.
    misclassified = 0  # Defined up front so that epochs <= 0 does not raise NameError.
    for epoch in range(epochs):
        misclassified = 0

        for i, x in enumerate(X1):
            y = Y[i]
            # A non-positive margin means the example is misclassified
            # (or lies exactly on the decision boundary).
            if np.dot(x, w) * y <= 0:
                w = w + x * y
                w_a = w_a + c * x * y
                misclassified += 1
            c += 1

        if misclassified == 0:  # Converged: a full pass without mistakes.
            final_iter = epoch
            break

    return c * w - w_a, final_iter, misclassified


def calculate_error_rate(X, Y, w):
    """Score ``X`` with ``w`` and return ``(error_rate, positive_fraction)``.

    A bias column of ones is prepended to ``X`` before scoring. Each row is
    predicted as 1 when its score is strictly positive and -1 otherwise.
    ``error_rate`` is the share of rows whose prediction times label is not
    positive; ``positive_fraction`` is the share of rows predicted as 1.
    """
    bias_column = np.ones((X.shape[0], 1))
    augmented = np.concatenate((bias_column, X), axis=1)

    # Hard +1 / -1 decision for every row.
    signs = [1 if np.dot(row, w) > 0 else -1 for row in augmented]

    # A prediction is wrong when it disagrees in sign with the label.
    wrong = sum(1 for row_idx, sign in enumerate(signs) if sign * Y[row_idx] <= 0)

    share_positive = signs.count(1) / len(signs)
    return wrong / (X.shape[0]), share_positive


def get_high_weights_features(w, feature_map):
    """Return ``{feature: w[index]}`` ordered by ascending weight.

    ``feature_map`` maps each feature key to an index into ``w``. Features
    with equal weight keep the order they have in ``feature_map``.
    """
    weighted = [(feature, w[index]) for feature, index in feature_map.items()]
    weighted.sort(key=lambda pair: pair[1])  # Stable sort, smallest weight first.
    return dict(weighted)


def predict(X, w):
    """Predict +1 / -1 for every row of ``X`` using weights ``w``.

    A bias column of ones is prepended to ``X`` before scoring. A row is
    labelled 1 when its score is strictly positive and -1 otherwise.

    Returns:
        A tuple ``(predictions, positive_fraction)``: the list of predicted
        labels and the share of them that equal 1.
    """
    bias_column = np.ones((X.shape[0], 1))
    augmented = np.concatenate((bias_column, X), axis=1)

    labels = [1 if np.dot(row, w) > 0 else -1 for row in augmented]
    share_positive = labels.count(1) / len(labels)

    return labels, share_positive


if __name__ == "__main__":
    # Count how often every (field, value) pair occurs in the training file.
    field_value_freqs = defaultdict(lambda: defaultdict(int))  # field_id -> value -> freq
    # Read inside a context manager so the file handle is always closed.
    with open("income.train.txt.5k") as train_file:
        for line in train_file:
            features = line.strip().split(", ")[:-1]  # exclude target label
            for i, fv in enumerate(features):
                field_value_freqs[i][fv] += 1

    # Give every observed (field, value) pair its own column, starting at 0.
    # No slot is reserved for the bias here: perceptron_avg, calculate_error_rate
    # and predict prepend the bias column themselves.
    # feature_map and dimension are read as globals by process_data.
    feature_map = {}
    feature_remap = {}  # inverse mapping: column index -> (field, value)
    for i, value_freqs in field_value_freqs.items():
        for v in value_freqs:
            k = len(feature_map)
            feature_map[i, v] = k
            feature_remap[k] = i, v

    dimension = len(feature_map)  # number of feature columns, bias excluded
    print("dimensionality: %d" % dimension)  # , feature_map

    train_data = process_data("income.train.txt.5k")
    dev_data = process_data("income.dev.txt")
    # NOTE(review): test_data is loaded but not used anywhere below.
    test_data = process_data("income.test.blind")

    xTrain = train_data[0][:5000]
    yTrain = train_data[1][:5000]
    xDev = dev_data[0][:1000]
    yDev = dev_data[1][:1000]

    # Fit perceptron. Each iteration retrains from scratch with a larger epoch
    # budget.
    max_epochs = 5

    for epoch in range(1, max_epochs + 1):
        w, final_iter, updates = perceptron_avg(xTrain, yTrain, epoch)
        error_rate, positive_percentage = calculate_error_rate(xDev, yDev, w)
        print("epoch", epoch, "--> updates:", updates, "(", round(updates / xTrain.shape[0] * 100, 2), "% ) dev_err:",
              round(error_rate * 100, 2), "% (+:", round(positive_percentage * 100, 2), "%)")

    # w[0] is the bias weight because perceptron_avg prepends the bias column,
    # while feature_map indices start at 0 for the first real feature. Drop the
    # bias entry so every feature is paired with its own weight rather than
    # its neighbour's.
    sorted_weights = get_high_weights_features(w[1:], feature_map)
    print(sorted_weights)