In [138]:
#!/usr/bin/env python3

# Basic Perceptron
# Reference: https://stackoverflow.com/questions/47213847/how-to-implement-perceptron-in-python


## NEED TO:
# Calculate the positive rate
# Experiment with putting all positives in training at the beginning of the file
# Experiment with other binarization methods


from collections import defaultdict
import numpy as np
import os
# os.chdir("/home/minion/Desktop/ML/HW2")


def process_data(filename):
    X, Y = [], []
    for j, line in enumerate(open(filename)):
        line = line.strip()
        features = line.split(", ")
        feat_vec = np.zeros(dimension)
        for i, fv in enumerate(features[:-1]): # last one is target
            if (i, fv) in feature_map: # ignore unobserved features
                feat_vec[feature_map[i, fv]] = 1

        X.append(feat_vec)
        Y.append(1 if features[-1] == ">50K" else -1) # fake for testdata
    return np.array(X), np.array(Y)

def get_high_weights_features(w, feature_map):
    x={key:w[value] for key,value in feature_map.items()}
    return {k: v for k, v in sorted(x.items(), key=lambda item: item[1])}
        

def perceptron_basic(X, Y, epochs):
    ones = np.ones(X.shape[0]).reshape(X.shape[0], 1)  # Creates an array of ones for the bias row.
    X1 = np.append(ones, X, axis=1)  # Append the data matrix to the new row of ones.

    w = np.zeros(X1.shape[1])  # Create an array of zeros to store the weights.
    final_iter = epochs  # Assign final iteration variable to the specified epoch.

    for epoch in range(epochs):  # For each epoch until reaching the maximum specified epoch.
        misclassified = 0

        for i, x in enumerate(X1):  # For each observation and its index in the training set
            y = Y[i]  # Store label for the current observation.
            predicted_y = np.dot(x, w)  # Calculate the prediction for y.
            h = predicted_y*y  # Create a flag to check if the prediction is right.
            if h <= 0:  # If prediction is below or equal to zero, then it is wrong.
                w = w + x*y  # Update the weight to shift and rotate the plane to be more accurate.
                misclassified += 1  # Add 1 to the misclassification counter.
            # else: if the prediction is above 0, then it is correct, and we can proceed.
        # The process is repeated until all observations have been iterated over, for the requested number of epochs.
        if misclassified == 0:  # If we converge, we don't need to continue.
            final_iter = epoch
            break

    updates = misclassified

    return w, final_iter, updates  # Return an array of weight and the number of epochs went.


def calculate_error_rate(X, Y, w):
    ones = np.ones(X.shape[0]).reshape(X.shape[0], 1)  # Creates an array of ones for the bias row.
    X1 = np.append(ones, X, axis=1)  # Append the data matrix to the new row of ones.

    misclassified = 0
    predictions = []
    for i, x in enumerate(X1):  # For each observation and its index in the dev set
        y = Y[i]  # Store label for the current observation.
        predicted_y = np.dot(x, w)  # Calculate the prediction for y.
        predicted_y = 1 if predicted_y>0 else -1
        predictions.append(predicted_y)
        h = predicted_y * y  # Create a flag to check if the prediction is right.
        if h <= 0:  # If prediction is below or equal to zero, then it is wrong.
            misclassified += 1  # Add 1 to the misclassification counter.

    positive_percentage = predictions.count(1)/len(predictions)
    return misclassified/(X.shape[0]), positive_percentage

def predict(X,w):
    ones = np.ones(X.shape[0]).reshape(X.shape[0], 1)  # Creates an array of ones for the bias row.
    X1 = np.append(ones, X, axis=1)  # Append the data matrix to the new row of ones.
    
    predictions = []
    for i, x in enumerate(X1):  # For each observation and its index in the dev set
        predicted_y = np.dot(x, w)  # Calculate the prediction for y.
        predictions.append(1 if predicted_y>0 else -1)
    positive_percentage = predictions.count(1)/len(predictions)

    return predictions, positive_percentage

def feature_eng_b(filename):
    X, Y = [], []
    for j, line in enumerate(open(filename)):
        line = line.strip()
        features = line.split(", ")
        feat_vec = np.zeros(dimension+1)
        for i, fv in enumerate(features[:-1]): # last one is target
            if i== 7:
                 feat_vec[dimension-1] = fv
            elif i== 0:
                 feat_vec[dimension] = fv
                          
            if (i, fv) in feature_map: # ignore unobserved features
                feat_vec[feature_map[i, fv]] = 1
        
        X.append(feat_vec)
        Y.append(1 if features[-1] == ">50K" else -1) # fake for testdata
    X, Y = np.array(X), np.array(Y)

    # Mean Centering the columns
    columns_mean = [np.mean(column) for column in X.T]
    for i,row in enumerate(X): 
        for j, cell in enumerate(row):
            X[i,j] = cell-columns_mean[j]

    return X, Y
    
def feature_eng_a(filename):
    X, Y = [], []
    for j, line in enumerate(open(filename)):
        line = line.strip()
        features = line.split(", ")
        feat_vec = np.zeros(dimension+2)
        for i, fv in enumerate(features[:-1]): # last one is target
#             if i== 7:
#                  feat_vec[dimension] = fv
#             elif i== 0:
#                  feat_vec[dimension+1] = fv
                          
            if (i, fv) in feature_map: # ignore unobserved features
                feat_vec[feature_map[i, fv]] = 1

        X.append(feat_vec)
        Y.append(1 if features[-1] == ">50K" else -1) # fake for testdata
            
    return np.array(X), np.array(Y)
        

def feature_eng_c(filename):
    X, Y = [], []
    for j, line in enumerate(open(filename)):
        line = line.strip()
        features = line.split(", ")
        feat_vec = np.zeros(dimension+2)
        for i, fv in enumerate(features[:-1]): # last one is target
            if i== 7:
                 feat_vec[dimension] = fv
            elif i== 0:
                 feat_vec[dimension+1] = fv
                          
            if (i, fv) in feature_map: # ignore unobserved features
                feat_vec[feature_map[i, fv]] = 1
        
        X.append(feat_vec)
        Y.append(1 if features[-1] == ">50K" else -1) # fake for testdata
    X, Y = np.array(X), np.array(Y)

    # Mean Centering the columns
    columns_mean = [np.mean(column) for column in X.T]
    columns_std = [np.std(column) for column in X.T]
    for i,row in enumerate(X): 
        for j, cell in enumerate(row):
            X[i,j] = (cell-columns_mean[j])
            if columns_std[j]!=0:
                X[i,j] /= columns_std[j]
    return X, Y
        
def feature_eng_c_2(filename):
    X, Y = [], []
    for j, line in enumerate(open(filename)):
        line = line.strip()
        features = line.split(", ")
        feat_vec = np.zeros(dimension)
        for i, fv in enumerate(features[:-1]): # last one is target
            if (i, fv) in feature_map: # ignore unobserved features
                feat_vec[feature_map[i, fv]] = 1

        X.append(feat_vec)
        Y.append(1 if features[-1] == ">50K" else -1) # fake for testdata
    
    X, Y = np.array(X), np.array(Y)
    # Mean Centering the columns
    columns_mean = [np.mean(column) for column in X.T]
    columns_std = [np.std(column) for column in X.T]
    for i,row in enumerate(X): 
        for j, cell in enumerate(row):
            X[i,j] = (cell-columns_mean[j])
            if columns_std[j]!=0:
                X[i,j] /= columns_std[j]
    
    return np.array(X), np.array(Y)


def feature_eng_d(filename):
    X, Y = [], []
    for j, line in enumerate(open(filename)):
        line = line.strip()
        features = line.split(", ")
        feat_vec = np.zeros(dimension+2)
        for i, fv in enumerate(features[:-1]): # last one is target
            if i== 7:
                 feat_vec[dimension] = fv
            elif i== 0:
                 feat_vec[dimension+1] = fv
                          
            if (i, fv) in feature_map: # ignore unobserved features
                feat_vec[feature_map[i, fv]] = 1

        X.append(feat_vec)
        Y.append(1 if features[-1] == ">50K" else -1) # fake for testdata
            
    return np.array(X), np.array(Y)
        
def combine_features(data,feature1,feature2,feature_map):
    col1 = data[:,feature_map[feature1]]
    col2 = data[:,feature_map[feature2]]
    combined_col = np.array([ 1 if col1[i]==1 and col2[i]==1 else 0 
                             for i in range(len(col1))]).reshape(data.shape[0],1)
    
    data = np.append(data, combined_col, axis=1)
    return data
    
if __name__ == "__main__":
    field_value_freqs = defaultdict(lambda : defaultdict(int)) # field_id -> value -> freq
    for line in open("income.train.txt.5k"):
        line = line.strip()
        features = line.split(", ")[:-1] # exclude target label
        for i, fv in enumerate(features):
            field_value_freqs[i][fv] += 1
#     print(field_value_freqs)
    feature_map = {}
    feature_remap = {}
    for i, value_freqs in field_value_freqs.items():
        for v in value_freqs:
            k = len(feature_map) # bias
            feature_map[i, v] = k
            feature_remap[k] = i, v
#     print(feature_map)
    dimension = len(feature_map) # bias


    train_data = process_data("income.train.txt.5k")
    dev_data = process_data("income.dev.txt")
    test_data = process_data("income.test.blind")
    
    feature_weights = list(get_high_weights_features(w, feature_map).keys())
    important_features = feature_weights[:10] + feature_weights[-10:]
    
    for i,feature1 in enumerate(important_features):
        for j, feature2 in enumerate(important_features):
            if i<j and feature1[0]!=feature2[0]:
                
                print(f"combining {feature1} and {feature2} ...")
                print("dimensionality:", train_data[0].shape[1]) #, feature_map

                xTrain = train_data[0][:5000]
                yTrain = train_data[1][:5000]
                xDev = dev_data[0][:1000]
                yDev = dev_data[1][:1000]
                XTest = test_data[0]

                xTrain_combined = combine_features(xTrain,feature1,feature2, feature_map)
                xDev_combined = combine_features(xDev,feature1,feature2, feature_map)
                XTest = combine_features(XTest,feature1,feature2, feature_map)

#                 print(xTrain_combined.shape,xTrain.shape)
                # Fit perceptron
                max_epochs = 5

                for epoch in range(1, max_epochs+1):
                    w, final_iter, updates = perceptron_basic(xTrain_combined, yTrain, epoch)
                    # Calculate error rate
                    error_rate, positive_percentage = calculate_error_rate(xDev_combined, yDev, w)
                    print("epoch", epoch, "--> updates", updates, "(", round(updates/xTrain.shape[0]*100, 2), "% ) dev_err",
                          round(error_rate*100, 2), "% (+:", round(positive_percentage*100, 2),"%")
                    epoch += 1

#     test_predictions, test_positive_percentage = predict(XTest, w)
#     test_predictions=['50k<' if label>0 else '50k>' for label in test_predictions]
#     test_file = open("income.test.blind")
#     with open("income.test.predicted",'w') as f:
#         for i,line in enumerate(test_file.readlines()):
#             f.write(line.strip()+ ', ' +test_predictions[i]+ '\n')
            
#     print(f"test_positive_percentage:{test_positive_percentage*100}%")
    

combining (0, '37') and (2, 'Assoc-voc') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.0 % (+: 27.6 %
epoch 2 --> updates 1193 ( 23.86 % ) dev_err 18.5 % (+: 21.3 %
epoch 3 --> updates 1190 ( 23.8 % ) dev_err 19.3 % (+: 25.5 %
epoch 4 --> updates 1173 ( 23.46 % ) dev_err 19.6 % (+: 20.0 %
epoch 5 --> updates 1171 ( 23.42 % ) dev_err 19.8 % (+: 26.2 %
combining (0, '37') and (4, 'Machine-op-inspct') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (0, '37') and (4, 'Handlers-cleaners') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % 

epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (0, '27') and (2, 'Assoc-voc') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (0, '27') and (4, 'Machine-op-inspct') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (0, '27') and (4, 'Handlers-cleaners') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % 

epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (0, '65') and (2, 'Assoc-voc') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1158 ( 23.16 % ) dev_err 18.0 % (+: 10.8 %
epoch 5 --> updates 1156 ( 23.12 % ) dev_err 18.4 % (+: 16.4 %
combining (0, '65') and (4, 'Machine-op-inspct') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (0, '65') and (4, 'Handlers-cleaners') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 %

epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (4, 'Machine-op-inspct') and (0, '45') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1180 ( 23.6 % ) dev_err 18.0 % (+: 13.2 %
combining (4, 'Machine-op-inspct') and (0, '58') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1173 ( 23.46 % ) dev_err 19.2 % (+: 8.6 %
epoch 5 --> updates 1169 ( 23.38 % ) dev_err 19.7 % (+: 19.3 %
combining (4, 'Machine-op-inspct') and 

epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1172 ( 23.44 % ) dev_err 17.8 % (+: 11.8 %
epoch 5 --> updates 1191 ( 23.82 % ) dev_err 20.9 % (+: 28.9 %
combining (0, '58') and (7, '9') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (0, '58') and (8, 'France') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
c

epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (4, 'Handlers-cleaners') and (7, '32') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (4, 'Handlers-cleaners') and (2, '5th-6th') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1164 ( 23.28 % ) dev_err 17.7 % (+: 16.3 %
combining (4, 'Handlers-cleaners') and (2, '10th') ...
dimensionality: 230
epoch 1 --> updates 1

epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (0, '28') and (7, '32') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (0, '28') and (2, '5th-6th') ...
dimensionality: 230
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
combining (0, '28') and (2, '10th') ...
dimensionality: 230
epo

In [None]:
epoch 1 --> updates 1257 ( 25.14 % ) dev_err 21.1 % (+: 27.5 %
epoch 2 --> updates 1221 ( 24.42 % ) dev_err 18.8 % (+: 25.4 %
epoch 3 --> updates 1177 ( 23.54 % ) dev_err 17.5 % (+: 21.5 %
epoch 4 --> updates 1170 ( 23.4 % ) dev_err 19.1 % (+: 12.3 %
epoch 5 --> updates 1172 ( 23.44 % ) dev_err 18.7 % (+: 17.7 %
                                                     
