In [26]:
# Basic Perceptron
# Reference: https://stackoverflow.com/questions/47213847/how-to-implement-perceptron-in-python

from __future__ import division
import numpy as np
import os
# os.chdir("/home/minion/Desktop/ML/HW2")
np.random.seed(123)

# Need to print the percent positive


def perceptron_basic(X, Y, epochs):
    """Train a perceptron with the mistake-driven update rule.

    A constant 1 is prepended to every observation, so the first weight acts
    as the bias term.

    Args:
        X: 2-D NumPy array with one observation per row.
        Y: Labels indexable by row position. The update ``w += y * x``
            assumes the values are -1 or +1 -- TODO confirm with callers.
        epochs: Maximum number of passes over the data.

    Returns:
        A tuple ``(w, final_iter, updates)``:
            w: weight vector of length ``X.shape[1] + 1`` (bias weight first).
            final_iter: zero-based index of the first pass that made no
                mistakes, or ``epochs`` if no such pass occurred.
            updates: number of weight updates made during the last pass that
                was run (0 when no pass was run).
    """
    bias = np.ones((X.shape[0], 1))  # Column of ones for the bias term.
    X1 = np.append(bias, X, axis=1)  # Bias column first, then the features.

    w = np.zeros(X1.shape[1])
    final_iter = epochs
    # Initialised before the loop so that ``epochs == 0`` returns cleanly
    # instead of reading an unbound local after the loop.
    misclassified = 0

    for epoch in range(epochs):
        misclassified = 0  # Mistakes are counted per pass.

        for i, x in enumerate(X1):
            y = Y[i]
            # The product is positive only when prediction and label agree in
            # sign; a value of exactly zero is treated as a mistake.
            if np.dot(x, w) * y <= 0:
                w = w + x * y  # Move the separating plane toward this observation.
                misclassified += 1

        if misclassified == 0:  # A clean pass means the weights have converged.
            final_iter = epoch
            break

    updates = misclassified
    return w, final_iter, updates


def calculate_error_rate(X, Y, w):
    """Return the fraction of observations in X that w gets wrong.

    Args:
        X: 2-D NumPy array with one observation per row (no bias column).
        Y: Labels indexable by row position.
        w: Weight vector whose first entry is the bias weight.

    Returns:
        Number of rows whose score times label is not strictly positive,
        divided by the number of rows in X.
    """
    n_obs = X.shape[0]
    # Prepend the column of ones that pairs with the bias weight.
    with_bias = np.concatenate((np.ones((n_obs, 1)), X), axis=1)

    def is_wrong(idx, row):
        # Score and label must share a sign; a zero product counts as wrong.
        return np.dot(row, w) * Y[idx] <= 0

    wrong = sum(1 for idx, row in enumerate(with_bias) if is_wrong(idx, row))
    return wrong/n_obs


def load_binarized_features(filename, num_rows=None, num_features=None):
    """Load a ", "-separated data file and encode it as a +1/-1 matrix.

    The field at position 9 (the target) is dropped from every row. The
    fields at positions 2 and 4 (called education and sector in this code)
    are additionally joined as "education/sector" and appended to the row as
    one extra combined feature. Every distinct (position, value) pair is then
    given its own column, in order of first appearance; a cell is +1 when the
    row has that pair and -1 otherwise.

    Blank lines are ignored. A non-blank row with fewer than five fields
    raises IndexError.

    Args:
        filename: Path of the text file to read.
        num_rows: Number of rows of the returned matrix. Defaults to the
            number of non-blank lines in the file.
        num_features: Number of columns to reserve for the non-combined
            features; the number of distinct combined values is added on top.
            Defaults to the number of distinct non-combined (position, value)
            pairs found, so the matrix is exactly as wide as needed.

    Returns:
        Float NumPy array of shape (num_rows, total columns) holding +1/-1.

    Raises:
        IndexError: if the data needs more rows or columns than the supplied
            ``num_rows`` / ``num_features`` provide.
    """
    target_idx = 9
    education_idx = 2
    sector_idx = 4

    # The context manager closes the file even if parsing fails. Blank lines
    # carry no observation and would otherwise break the row alignment.
    with open(filename) as infile:
        lines = [text.split(", ") for text in map(str.strip, infile) if text]

    # Every field except the target.
    loaded_data = [line[:target_idx] + line[target_idx + 1:] for line in lines]

    # Combining features
    combined = [line[education_idx] + '/' + line[sector_idx] for line in lines]
    loaded_combined_data = [
        person + [pair] for person, pair in zip(loaded_data, combined)
    ]
    num_of_combined_features = len(set(combined))
    if loaded_combined_data:
        print(len(loaded_combined_data[0]))

    # Assign each (position, value) pair the next free column index.
    mapping = {}
    new_data = []
    for row in loaded_combined_data:
        new_row = []
        for feature in enumerate(row):
            if feature not in mapping:
                mapping[feature] = len(mapping)
            new_row.append(mapping[feature])
        new_data.append(new_row)

    if num_rows is None:
        num_rows = len(new_data)
    if num_features is None:
        num_columns = len(mapping)
    else:
        num_columns = num_features + num_of_combined_features

    # Start from "feature absent" everywhere, then switch on what each row has.
    binary_data = -np.ones((num_rows, num_columns))
    for idx, row in enumerate(new_data):
        binary_data[idx, row] = 1

    return binary_data


def load_labels(filename):
    """Read the target field of every line and encode it as -1 / +1.

    Lines are split on ", " and the field at position 9 is taken as the
    label: '<=50K' becomes -1 and '>50K' becomes 1. Lines that have no field
    at position 9 (blank lines, for example) contribute no label.

    Args:
        filename: Path of the text file to read.

    Returns:
        1-D NumPy array of the encoded labels, in file order.

    Raises:
        ValueError: if a label is neither '<=50K' nor '>50K'. Dropping such a
            row silently would shift every later label out of step with the
            rows of the file.
    """
    target_idx = 9
    encoding = {'<=50K': -1, '>50K': 1}

    y_array = []
    # The context manager closes the file even if an error is raised.
    with open(filename) as infile:
        for line_no, line in enumerate(infile, start=1):
            fields = line.strip().split(", ")
            if len(fields) <= target_idx:
                continue  # No label field on this line.
            label = fields[target_idx]
            if label not in encoding:
                raise ValueError(
                    "unrecognised label %r on line %d of %s"
                    % (label, line_no, filename)
                )
            y_array.append(encoding[label])

    return np.array(y_array)

def main():
    """Train the perceptron for 1..10 epochs and print the dev error of each.

    The training and dev files are concatenated into 'combined.txt' (written
    to the current directory) so that both parts go through the same feature
    encoding. The first 5000 rows are then used for training and the
    remaining rows for evaluation.
    """
    filenames = ['income.train.txt.5k', 'income.dev.txt']
    filename = 'combined.txt'
    with open(filename, 'w') as outfile:
        for f in filenames:
            with open(f) as infile:
                outfile.writelines(infile)

    train_size = 5000
    num_rows = 6000
    # Column count handed to the loader, which adds the combined-feature
    # columns on top. NOTE(review): presumably the number of distinct
    # (column, value) pairs in the raw columns of this data set -- confirm.
    num_features = 233

    binarized_features = load_binarized_features(filename, num_rows, num_features)
    print(binarized_features)

    xTrain = binarized_features[:train_size]
    xDev = binarized_features[train_size:num_rows]

    labels = load_labels(filename)

    yTrain = np.array(labels[:train_size])
    yDev = np.array(labels[train_size:num_rows])

    # Fit perceptron
    max_epochs = 10
    for epoch in range(1, max_epochs + 1):
        # Each iteration retrains from scratch with a larger epoch budget, so
        # ``updates`` is the mistake count of the last pass of that run.
        w, _, updates = perceptron_basic(xTrain, yTrain, epoch)
        # Calculate error rate
        print("epoch", epoch, "updates", updates, round(calculate_error_rate(xDev, yDev, w)*100, 2), "%")


# Run the experiment only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

10
[[ 1.  1.  1. ... -1. -1. -1.]
 [-1. -1. -1. ... -1. -1. -1.]
 [-1. -1. -1. ... -1. -1. -1.]
 ...
 [-1. -1. -1. ... -1. -1. -1.]
 [-1. -1. -1. ... -1. -1. -1.]
 [-1. -1. -1. ... -1. -1. -1.]]
epoch 1 updates 1349 16.4 %
epoch 2 updates 1194 15.7 %
epoch 3 updates 1160 16.3 %
epoch 4 updates 1140 16.0 %
epoch 5 updates 1123 22.2 %
epoch 6 updates 1105 15.4 %
epoch 7 updates 1100 15.5 %
epoch 8 updates 1090 15.9 %
epoch 9 updates 1106 16.1 %
epoch 10 updates 1101 21.6 %
