In [1]:
# Student: Corey McFarland
# Descritption: Feature Map and Preceptron implementations

import sys
from collections import defaultdict
import numpy as np
import time

def process_data(filename, feature_map, dimension, numerical_fields=[]):
    """
    Code pulled from hw1 solution
    """
    X, Y = [], []
    for j, line in enumerate(open(filename)):
        line = line.strip()
        features = line.split(", ")
        feat_vec = np.zeros(dimension, dtype="float")
        for i, fv in enumerate(features[:-1]): # last one is target
            if i in numerical_fields: # two numerical fields
                feat_vec[feature_map[i, 0]] = float(fv) / 50  # NB: diff 2 not 1!
            elif (i, fv) in feature_map: # ignore unobserved features
                feat_vec[feature_map[i, fv]] = 1
        
        # Add bias feature
        feat_vec = np.append(feat_vec, 1)
        
        X.append(feat_vec)
        Y.append(1 if features[-1] == ">50K" else -1) # fake for testdata

    return np.array(X), np.array(Y)

def process_data_sort(filename, feature_map, dimension, numerical_fields=[]):
    """
    Function to order training data by target
    """
    X, Y = [], []
    for j, line in enumerate(open(filename)):
        line = line.strip()
        features = line.split(", ")
        feat_vec = np.zeros(dimension, dtype="float")
        for i, fv in enumerate(features[:-1]): # last one is target
            if i in numerical_fields: # two numerical fields
                feat_vec[feature_map[i, 0]] = float(fv) / 50  # NB: diff 2 not 1!
            elif (i, fv) in feature_map: # ignore unobserved features
                feat_vec[feature_map[i, fv]] = 1
        
        # Add bias feature
        feat_vec = np.append(feat_vec, 1)
        
        # Add positive to start, negative to the end
        if features[-1] == ">50K":
            X.insert(0, feat_vec)
            Y.insert(0, 1)
            
        else:
            X.append(feat_vec)
            Y.append(-1)

    return np.array(X), np.array(Y)

def process_data_rv(filename, feature_map, dimension, numerical_fields=[]):
    """
    Function to gather real number values for numerical fields
    """
    X, Y = [], []
    for j, line in enumerate(open(filename)):
        line = line.strip()
        features = line.split(", ")
        feat_vec = np.zeros(dimension, dtype="float")
        for i, fv in enumerate(features[:-1]): # last one is target
            if i in numerical_fields: # two numerical fields
                feat_vec[feature_map[i, 0]] = float(fv) / 50  # NB: diff 2 not 1!
            elif (i, fv) in feature_map: # ignore unobserved features
                feat_vec[feature_map[i, fv]] = 1
        
        # Add real-valued numerical features
        feat_vec = np.append(feat_vec, float(features[0]))
        feat_vec = np.append(feat_vec, float(features[7]))
        
        # Add bias feature
        feat_vec = np.append(feat_vec, 1)
        
        X.append(feat_vec)
        Y.append(1 if features[-1] == ">50K" else -1) # fake for testdata

    return np.array(X), np.array(Y)

def process_data_cmb(filename, c_arr1, c_arr2, feature_map, dimension, numerical_fields=[]):
    """
    Function to build training data with additional combinations of fields
    """
    X, Y = [], []
    for j, line in enumerate(open(filename)):
        line = line.strip()
        features = line.split(", ")
        
        # Add combined features
        for x in c_arr1:
            for y in c_arr2:
                if x == y:
                    continue
                else:
                    features.insert(-1, features[x] + features[y])

        feat_vec = np.zeros(dimension, dtype="float")
        for i, fv in enumerate(features[:-1]): # last one is target
            if i in numerical_fields: # two numerical fields
                feat_vec[feature_map[i, 0]] = float(fv) / 50  # NB: diff 2 not 1!
            elif (i, fv) in feature_map: # ignore unobserved features
                feat_vec[feature_map[i, fv]] = 1
        
        # Add bias feature
        feat_vec = np.append(feat_vec, 1)
        
        X.append(feat_vec)
        Y.append(1 if features[-1] == ">50K" else -1) # fake for testdata

    return np.array(X), np.array(Y)

def perceptron(train_data, pred_data, n_epochs, zero_mean = False, unit_var = False):
    """
    Simple perceptron algorthim.
    """
    
    w = np.zeros(len(train_data[0][0]), dtype="float")
    epoch = 0
    
    if zero_mean:
        mean = np.mean(train_data[0], axis=0)
        
    if unit_var:
        uv = np.std(train_data[0], axis=0)
    
    while epoch < n_epochs:
        epoch += 1
        update = 0
        err = 0
        pos = 0
        
        # Train weights based on provided training data
        for i, (x, y) in enumerate(train_data):
            if zero_mean:
                x -= mean
            if zero_mean and unit_var:
                x /= uv
            if y * x.dot(w) <= 0:
                w += x*y
                update += 1

        update_rate = update / len(train_data) * 100
        
        # Predict for provided prediction data
        
        prediction = []
        
        for j, x in enumerate(pred_data[0]):
            if x.dot(w) <= 0:
                prediction.append(-1)

            else:
                prediction.append(1)
                pos += 1
                
            if prediction[j] != pred_data[1][j]:
                err += 1
        
        print("epoch: {:d} updates: {:d} ({:.1f}%) dev_err {:.1f}% (+:{:.1f}%)".format(epoch, update, update_rate, err / len(pred_data[0]) * 100, pos / len(pred_data[0])  * 100))
    
    return [w]

def avg_perceptron(train_data, pred_data, n_epochs, zero_mean = False, unit_var = False):
    """
    Average perceptron algorthim.
    """
    
    w = np.zeros(len(train_data[0][0]), dtype="float")
    wa = np.zeros(len(train_data[0][0]), dtype="float")
    c = np.zeros(1, dtype="float")
    epoch = 0
    
    if zero_mean:
        mean = np.mean(train_data[0], axis=0)
        
    if unit_var:
        uv = np.std(train_data[0], axis=0)
    
    while epoch < n_epochs:
        epoch += 1
        update = 0
        err = 0
        pos = 0
        
        # Train weights based on provided training data
        for i, (x, y) in enumerate(train_data):
            if zero_mean:
                x -= mean
            if zero_mean and unit_var:
                x /= uv
            if y * x.dot(w) <= 0:
                w += x*y
                wa += c*x*y
                update += 1
            c += 1
        
        out = c*w - wa

        update_rate = update / len(train_data) * 100
        
        # Predict for provided prediction data
        
        prediction = []
        
        for j, x in enumerate(pred_data[0]):
            if x.dot(out) <= 0:
                prediction.append(-1)

            else:
                prediction.append(1)
                pos += 1
                
            if prediction[j] != pred_data[1][j]:
                err += 1
        
        print("epoch: {:d} updates: {:d} ({:.1f}%) dev_err {:.1f}% (+:{:.1f}%)".format(epoch, update, update_rate, err / len(pred_data[0]) * 100, pos / len(pred_data[0])  * 100))

    return [out, prediction]

def small_large(weights, feature_map):
    """
    Function to print 5 largest and smallest weights
    """

    print("5 Smallest, plus Bias: ")
    min = np.argpartition(weights, 6)[:6]
    print(min)
    print(weights[min])
    for val in min:
        if val > len(list(feature_map.values())):
            continue
        print(list(feature_map.keys())[val])
        
    print("5 Largest: ")
    max = np.argpartition(weights, -6)[-6:]
    print(max)
    print(weights[max])
    for val in max:
        if val > len(list(feature_map.values())):
            continue
        print(list(feature_map.keys())[val])

def timed_perceptron(train_data, pred_data, time):
    """
    Average perceptron algorthim that runs until a certain time limit rather than a number of epochs
    """
    
    w = np.zeros(len(train_data[0][0]), dtype="float")
    wa = np.zeros(len(train_data[0][0]), dtype="float")
    c = np.zeros(1, dtype="float")
    epoch = 0
    s = time.time()
    e = time.time()
    
    while e - s < time:
        epoch += 1
        update = 0
        err = 0
        pos = 0
        
        # Train weights based on provided training data
        for i, (x, y) in enumerate(train_data):
            if y * x.dot(w) <= 0:
                w += x*y
                wa += c*x*y
                update += 1
            c += 1
        
        out = c*w - wa

        update_rate = update / len(train_data) * 100
        
        # Predict for provided prediction data
        
        prediction = []
        
        for j, x in enumerate(pred_data[0]):
            if x.dot(out) <= 0:
                prediction.append(-1)

            else:
                prediction.append(1)
                pos += 1
                
            if prediction[j] != pred_data[1][j]:
                err += 1
        
        
        print("epoch: {:d} updates: {:d} ({:.1f}%) dev_err {:.1f}% (+:{:.1f}%)".format(epoch, update, update_rate, err / len(pred_data[0]) * 100, pos / len(pred_data[0])  * 100))
        e = time.time()
        
    return [out]        

def map_features(filename, c_arr1=[], c_arr2=[]):
    """
    Function to map features, including optional combinations of fields
    """
    field_value_freqs = defaultdict(lambda : defaultdict(int)) # field_id -> value -> freq
    numerical_fields = [] # [] for binarizing all the fields
    for line in open(filename):
        line = line.strip()
        features = line.split(", ")[:-1] # exclude target label

        # Add combination features
        for x in c_arr1:
            for y in c_arr2:
                if x == y:
                    continue
                else:
                    features.append(features[x] + features[y])
                
        for i, fv in enumerate(features):
            field_value_freqs[i][0 if i in numerical_fields else fv] += 1

    feature_map = {}
    feature_remap = {}
    for i, value_freqs in field_value_freqs.items():
        for v in value_freqs:
            k = len(feature_map) # bias
            feature_map[i, v] = k
            feature_remap[k] = i, v

    dimension = len(feature_map) + 1 # bias
    print("dimensionality: %d" % dimension) #, feature_map

    return feature_map, dimension
        

standard_map = map_features("income.train.txt.5k")
    
train_data = process_data("income.train.txt.5k", standard_map[0], standard_map[1])
dev_data = process_data("income.dev.txt", standard_map[0], standard_map[1])

train_data_s = process_data_sort("income.train.txt.5k", standard_map[0], standard_map[1])
dev_data_s = process_data_sort("income.dev.txt", standard_map[0], standard_map[1])

train_data_rv = process_data_rv("income.train.txt.5k", standard_map[0], standard_map[1])
dev_data_rv = process_data_rv("income.dev.txt", standard_map[0], standard_map[1])

train_data_zm = process_data_rv("income.train.txt.5k", standard_map[0], standard_map[1])
dev_data_zm = process_data_rv("income.dev.txt", standard_map[0], standard_map[1])

dimensionality: 231


In [2]:
# 2.1 5 epochs of simple perceptron
p = perceptron(list(zip(*train_data)), dev_data, 5)

epoch: 1 updates: 1257 (25.1%) dev_err 21.1% (+:27.5%)
epoch: 2 updates: 1221 (24.4%) dev_err 18.8% (+:25.4%)
epoch: 3 updates: 1177 (23.5%) dev_err 17.5% (+:21.5%)
epoch: 4 updates: 1170 (23.4%) dev_err 19.1% (+:12.3%)
epoch: 5 updates: 1172 (23.4%) dev_err 18.7% (+:17.7%)


In [3]:
# 2.2 5 epochs of average perceptron
ap = avg_perceptron(list(zip(*train_data)), dev_data, 5)

epoch: 1 updates: 1257 (25.1%) dev_err 15.0% (+:18.6%)
epoch: 2 updates: 1221 (24.4%) dev_err 15.1% (+:19.3%)
epoch: 3 updates: 1177 (23.5%) dev_err 14.8% (+:20.0%)
epoch: 4 updates: 1170 (23.4%) dev_err 14.7% (+:19.3%)
epoch: 5 updates: 1172 (23.4%) dev_err 14.8% (+:20.0%)


In [4]:
# 2.4 Simple perceptron smallest and largest weights
small_large(p[0], standard_map[0])

5 Smallest, plus Bias: 
[ 52  46 231   3  81 104]
[-7. -7. -7. -8. -7. -7.]
(0, '75')
(0, '26')
(0, '28')
(2, '7th-8th')
(4, 'Farming-fishing')
5 Largest: 
[ 77   2 134  83 201  85]
[6. 6. 6. 7. 8. 8.]
(2, 'Masters')
(0, '53')
(7, '48')
(2, 'Prof-school')
(8, 'Iran')
(2, 'Doctorate')


In [5]:
# 2.4 Average perceptron smallest and largest weights
small_large(ap[0], standard_map[0])

5 Smallest, plus Bias: 
[231   3  81 104  46 143]
[-146333. -150069. -141763. -140683. -124170. -118670.]
(0, '28')
(2, '7th-8th')
(4, 'Farming-fishing')
(0, '26')
(7, '24')
5 Largest: 
[161 201 148  83  85  90]
[ 96688. 121924. 102122. 137016. 178980. 151494.]
(7, '58')
(8, 'Iran')
(7, '65')
(2, 'Prof-school')
(2, 'Doctorate')
(3, 'Married-civ-spouse')


In [6]:
# Weights for average perceptron
for i, x in enumerate(list(standard_map[0].keys())):
    print("Key: {}, Weight: {}".format(x, ap[0][i]))
print("Key: (/231/, 'Bias'), Weight: {}".format(ap[0][-1]))

Key: (0, '50'), Weight: 93516.0
Key: (0, '38'), Weight: 24289.0
Key: (0, '53'), Weight: 70149.0
Key: (0, '28'), Weight: -150069.0
Key: (0, '37'), Weight: 5987.0
Key: (0, '49'), Weight: 28558.0
Key: (0, '52'), Weight: 41353.0
Key: (0, '31'), Weight: 8195.0
Key: (0, '42'), Weight: 3302.0
Key: (0, '30'), Weight: 4319.0
Key: (0, '23'), Weight: -98809.0
Key: (0, '32'), Weight: -39849.0
Key: (0, '34'), Weight: 6295.0
Key: (0, '25'), Weight: -79370.0
Key: (0, '43'), Weight: 683.0
Key: (0, '35'), Weight: 9829.0
Key: (0, '59'), Weight: 24691.0
Key: (0, '56'), Weight: 60692.0
Key: (0, '19'), Weight: -85321.0
Key: (0, '39'), Weight: 7443.0
Key: (0, '20'), Weight: -91099.0
Key: (0, '45'), Weight: 30057.0
Key: (0, '22'), Weight: -23441.0
Key: (0, '48'), Weight: 80010.0
Key: (0, '21'), Weight: -29754.0
Key: (0, '57'), Weight: 74359.0
Key: (0, '44'), Weight: 48414.0
Key: (0, '41'), Weight: 42958.0
Key: (0, '29'), Weight: -72273.0
Key: (0, '47'), Weight: 70656.0
Key: (0, '46'), Weight: 66687.0
Key: (0

In [7]:
# 3.1/3.2
tap = avg_perceptron(list(zip(*train_data)), dev_data, 69.72)

epoch: 1 updates: 1257 (25.1%) dev_err 15.0% (+:18.6%)
epoch: 2 updates: 1221 (24.4%) dev_err 15.1% (+:19.3%)
epoch: 3 updates: 1177 (23.5%) dev_err 14.8% (+:20.0%)
epoch: 4 updates: 1170 (23.4%) dev_err 14.7% (+:19.3%)
epoch: 5 updates: 1172 (23.4%) dev_err 14.8% (+:20.0%)
epoch: 6 updates: 1185 (23.7%) dev_err 15.2% (+:20.4%)
epoch: 7 updates: 1165 (23.3%) dev_err 15.5% (+:20.3%)
epoch: 8 updates: 1185 (23.7%) dev_err 15.9% (+:20.7%)
epoch: 9 updates: 1184 (23.7%) dev_err 15.8% (+:20.8%)
epoch: 10 updates: 1181 (23.6%) dev_err 15.7% (+:20.9%)
epoch: 11 updates: 1156 (23.1%) dev_err 15.6% (+:20.8%)
epoch: 12 updates: 1138 (22.8%) dev_err 15.7% (+:20.9%)
epoch: 13 updates: 1165 (23.3%) dev_err 15.7% (+:20.9%)
epoch: 14 updates: 1174 (23.5%) dev_err 15.7% (+:20.9%)
epoch: 15 updates: 1166 (23.3%) dev_err 15.6% (+:20.8%)
epoch: 16 updates: 1162 (23.2%) dev_err 15.6% (+:20.8%)
epoch: 17 updates: 1183 (23.7%) dev_err 15.4% (+:20.6%)
epoch: 18 updates: 1159 (23.2%) dev_err 15.4% (+:20.6%)
e

In [8]:
# 4.1
sp = perceptron(list(zip(*train_data_s)), dev_data_s, 5)
print()
sap = avg_perceptron(list(zip(*train_data_s)), dev_data_s, 5)

epoch: 1 updates: 3 (0.1%) dev_err 23.6% (+:0.0%)
epoch: 2 updates: 8 (0.2%) dev_err 23.6% (+:0.0%)
epoch: 3 updates: 10 (0.2%) dev_err 23.6% (+:0.0%)
epoch: 4 updates: 10 (0.2%) dev_err 23.6% (+:0.0%)
epoch: 5 updates: 12 (0.2%) dev_err 23.6% (+:0.0%)

epoch: 1 updates: 3 (0.1%) dev_err 23.9% (+:0.3%)
epoch: 2 updates: 8 (0.2%) dev_err 23.7% (+:0.1%)
epoch: 3 updates: 10 (0.2%) dev_err 23.8% (+:0.2%)
epoch: 4 updates: 10 (0.2%) dev_err 23.9% (+:0.3%)
epoch: 5 updates: 12 (0.2%) dev_err 23.8% (+:0.2%)


In [9]:
# 4.2(a)
rp = perceptron(list(zip(*train_data_rv)), dev_data_rv, 5)
print()
rap = avg_perceptron(list(zip(*train_data_rv)), dev_data_rv, 5)

epoch: 1 updates: 1858 (37.2%) dev_err 23.8% (+:0.2%)
epoch: 2 updates: 1676 (33.5%) dev_err 23.7% (+:0.1%)
epoch: 3 updates: 1601 (32.0%) dev_err 18.6% (+:23.8%)
epoch: 4 updates: 1516 (30.3%) dev_err 19.6% (+:26.8%)
epoch: 5 updates: 1510 (30.2%) dev_err 23.4% (+:0.2%)

epoch: 1 updates: 1858 (37.2%) dev_err 23.6% (+:0.0%)
epoch: 2 updates: 1676 (33.5%) dev_err 23.6% (+:0.0%)
epoch: 3 updates: 1601 (32.0%) dev_err 23.6% (+:0.0%)
epoch: 4 updates: 1516 (30.3%) dev_err 23.5% (+:0.1%)
epoch: 5 updates: 1510 (30.2%) dev_err 22.7% (+:1.5%)


In [10]:
# 4.2(b)
zmp = perceptron(list(zip(*train_data)), dev_data, 5, True)
print()
zmap = avg_perceptron(list(zip(*train_data)), dev_data, 5, True)

epoch: 1 updates: 1282 (25.6%) dev_err 37.8% (+:58.8%)
epoch: 2 updates: 1452 (29.0%) dev_err 39.5% (+:60.9%)
epoch: 3 updates: 1340 (26.8%) dev_err 38.1% (+:59.3%)
epoch: 4 updates: 1322 (26.4%) dev_err 37.7% (+:58.9%)


  return array(a, dtype, copy=False, order=order, subok=True)


epoch: 5 updates: 1310 (26.2%) dev_err 37.6% (+:58.8%)

epoch: 1 updates: 1596 (31.9%) dev_err 34.1% (+:54.7%)
epoch: 2 updates: 1212 (24.2%) dev_err 35.7% (+:56.9%)
epoch: 3 updates: 911 (18.2%) dev_err 36.3% (+:57.5%)
epoch: 4 updates: 1076 (21.5%) dev_err 36.5% (+:57.9%)
epoch: 5 updates: 1349 (27.0%) dev_err 36.5% (+:57.9%)


In [11]:
# 4.2(c)
uvp = perceptron(list(zip(*train_data)), dev_data, 5, True, True)
print()
uvap = avg_perceptron(list(zip(*train_data)), dev_data, 5, True, True)

epoch: 1 updates: 1220 (24.4%) dev_err 22.2% (+:32.0%)
epoch: 2 updates: 1189 (23.8%) dev_err 21.6% (+:30.6%)
epoch: 3 updates: 1195 (23.9%) dev_err 19.1% (+:20.3%)
epoch: 4 updates: 1204 (24.1%) dev_err 19.3% (+:21.7%)
epoch: 5 updates: 1195 (23.9%) dev_err 21.3% (+:16.1%)

epoch: 1 updates: 1207 (24.1%) dev_err 18.5% (+:31.9%)
epoch: 2 updates: 1262 (25.2%) dev_err 17.1% (+:28.1%)
epoch: 3 updates: 1456 (29.1%) dev_err 18.4% (+:29.4%)
epoch: 4 updates: 1464 (29.3%) dev_err 19.8% (+:33.0%)
epoch: 5 updates: 1386 (27.7%) dev_err 21.4% (+:35.2%)


In [12]:
cmb_set = [1, 2, 3, 4, 5, 6, 8]

In [13]:
# 4.2(d)
for i, x in enumerate(cmb_set):
    for j, y in enumerate(cmb_set):
        if i == j:
            continue
        print(i, j)
        combined_map = map_features("income.train.txt.5k", [cmb_set[i]], [cmb_set[j]])
        train_data_cmb = process_data_cmb("income.train.txt.5k", [cmb_set[i]], [cmb_set[j]], combined_map[0], combined_map[1])
        dev_data_cmb = process_data_cmb("income.dev.txt", [cmb_set[i]], [cmb_set[j]], combined_map[0], combined_map[1])
        
        cmbap = avg_perceptron(list(zip(*train_data_cmb)), dev_data_cmb, 5)

0 1
dimensionality: 312
epoch: 1 updates: 1234 (24.7%) dev_err 14.9% (+:18.9%)
epoch: 2 updates: 1201 (24.0%) dev_err 15.0% (+:19.6%)
epoch: 3 updates: 1206 (24.1%) dev_err 15.7% (+:19.7%)
epoch: 4 updates: 1147 (22.9%) dev_err 16.1% (+:19.7%)
epoch: 5 updates: 1156 (23.1%) dev_err 15.8% (+:20.0%)
0 2
dimensionality: 269
epoch: 1 updates: 1256 (25.1%) dev_err 15.0% (+:19.0%)
epoch: 2 updates: 1168 (23.4%) dev_err 14.6% (+:19.6%)
epoch: 3 updates: 1171 (23.4%) dev_err 14.8% (+:19.6%)
epoch: 4 updates: 1155 (23.1%) dev_err 14.7% (+:19.9%)
epoch: 5 updates: 1121 (22.4%) dev_err 15.1% (+:20.1%)
0 3
dimensionality: 301
epoch: 1 updates: 1235 (24.7%) dev_err 15.7% (+:19.3%)
epoch: 2 updates: 1203 (24.1%) dev_err 15.4% (+:20.0%)
epoch: 3 updates: 1185 (23.7%) dev_err 15.6% (+:20.0%)
epoch: 4 updates: 1187 (23.7%) dev_err 15.5% (+:20.3%)
epoch: 5 updates: 1161 (23.2%) dev_err 15.9% (+:20.3%)
0 4
dimensionality: 259
epoch: 1 updates: 1235 (24.7%) dev_err 15.0% (+:18.4%)
epoch: 2 updates: 1175 (

epoch: 1 updates: 1277 (25.5%) dev_err 14.3% (+:18.7%)
epoch: 2 updates: 1203 (24.1%) dev_err 14.6% (+:19.4%)
epoch: 3 updates: 1189 (23.8%) dev_err 15.1% (+:20.1%)
epoch: 4 updates: 1181 (23.6%) dev_err 15.0% (+:20.4%)
epoch: 5 updates: 1170 (23.4%) dev_err 15.0% (+:20.6%)
4 6
dimensionality: 289
epoch: 1 updates: 1247 (24.9%) dev_err 15.0% (+:19.0%)
epoch: 2 updates: 1182 (23.6%) dev_err 14.4% (+:19.6%)
epoch: 3 updates: 1161 (23.2%) dev_err 14.9% (+:20.5%)
epoch: 4 updates: 1191 (23.8%) dev_err 14.8% (+:20.4%)
epoch: 5 updates: 1166 (23.3%) dev_err 15.3% (+:20.5%)
5 0
dimensionality: 244
epoch: 1 updates: 1261 (25.2%) dev_err 14.9% (+:19.1%)
epoch: 2 updates: 1173 (23.5%) dev_err 15.1% (+:19.3%)
epoch: 3 updates: 1156 (23.1%) dev_err 14.9% (+:19.7%)
epoch: 4 updates: 1157 (23.1%) dev_err 15.2% (+:19.8%)
epoch: 5 updates: 1179 (23.6%) dev_err 14.9% (+:19.9%)
5 1
dimensionality: 263
epoch: 1 updates: 1255 (25.1%) dev_err 14.7% (+:18.7%)
epoch: 2 updates: 1165 (23.3%) dev_err 14.4% (+:

In [14]:
combined_map = map_features("income.train.txt.5k", [3, 4], [6])
train_data_cmb = process_data_cmb("income.train.txt.5k", [3, 4], [6], combined_map[0], combined_map[1])
dev_data_cmb = process_data_cmb("income.dev.txt", [3, 4], [6], combined_map[0], combined_map[1])
        
cmbap = avg_perceptron(list(zip(*train_data_cmb)), dev_data_cmb, 5)

dimensionality: 272
epoch: 1 updates: 1238 (24.8%) dev_err 15.0% (+:18.8%)
epoch: 2 updates: 1187 (23.7%) dev_err 15.0% (+:20.4%)
epoch: 3 updates: 1161 (23.2%) dev_err 14.2% (+:20.6%)
epoch: 4 updates: 1137 (22.7%) dev_err 14.4% (+:20.8%)
epoch: 5 updates: 1157 (23.1%) dev_err 15.0% (+:21.0%)


In [15]:
combined_map = map_features("income.train.txt.5k", [6, 8], [4])
train_data_cmb = process_data_cmb("income.train.txt.5k", [6, 8], [4], combined_map[0], combined_map[1])
dev_data_cmb = process_data_cmb("income.dev.txt", [6, 8], [4], combined_map[0], combined_map[1])
        
cmbap = avg_perceptron(list(zip(*train_data_cmb)), dev_data_cmb, 5)

dimensionality: 475
epoch: 1 updates: 1288 (25.8%) dev_err 14.6% (+:19.6%)
epoch: 2 updates: 1199 (24.0%) dev_err 14.5% (+:20.7%)
epoch: 3 updates: 1192 (23.8%) dev_err 14.7% (+:20.9%)
epoch: 4 updates: 1188 (23.8%) dev_err 14.4% (+:21.0%)
epoch: 5 updates: 1175 (23.5%) dev_err 14.5% (+:21.1%)


In [16]:
combined_map = map_features("income.train.txt.5k", [3, 4], [6, 8])
train_data_cmb = process_data_cmb("income.train.txt.5k", [3, 4], [6, 8], combined_map[0], combined_map[1])
dev_data_cmb = process_data_cmb("income.dev.txt", [3, 4], [6, 8], combined_map[0], combined_map[1])
        
cmbap = avg_perceptron(list(zip(*train_data_cmb)), dev_data_cmb, 5)

dimensionality: 614
epoch: 1 updates: 1277 (25.5%) dev_err 14.7% (+:19.9%)
epoch: 2 updates: 1192 (23.8%) dev_err 15.0% (+:21.0%)
epoch: 3 updates: 1176 (23.5%) dev_err 15.2% (+:21.8%)
epoch: 4 updates: 1141 (22.8%) dev_err 15.4% (+:21.6%)
epoch: 5 updates: 1153 (23.1%) dev_err 15.2% (+:21.6%)


In [17]:
# 4.3 dev
combined_map = map_features("income.train.txt.5k", [3, 4], [6])
train_data_dev = process_data_cmb("income.dev.txt", [3, 4], [6], combined_map[0], combined_map[1])
dev_data_dev = process_data_cmb("income.dev.txt", [3, 4], [6], combined_map[0], combined_map[1])
        
devap = avg_perceptron(list(zip(*train_data_dev)), dev_data_dev, 5)

dimensionality: 272
epoch: 1 updates: 238 (23.8%) dev_err 13.8% (+:20.8%)
epoch: 2 updates: 208 (20.8%) dev_err 13.1% (+:21.1%)
epoch: 3 updates: 201 (20.1%) dev_err 12.8% (+:21.8%)
epoch: 4 updates: 195 (19.5%) dev_err 11.9% (+:21.5%)
epoch: 5 updates: 187 (18.7%) dev_err 12.1% (+:21.1%)


In [25]:
# 4.3 blind
combined_map = map_features("income.train.txt.5k", [3, 4], [6])
train_data_blind = process_data_cmb("income.train.txt.5k", [3, 4], [6], combined_map[0], combined_map[1])
dev_data_blind = process_data_cmb("income.test.blind", [3, 4], [6], combined_map[0], combined_map[1])
        
devap = avg_perceptron(list(zip(*train_data_blind)), dev_data_blind, 4)

dimensionality: 272
epoch: 1 updates: 1238 (24.8%) dev_err 13.4% (+:13.4%)
epoch: 2 updates: 1187 (23.7%) dev_err 15.1% (+:15.1%)
epoch: 3 updates: 1161 (23.2%) dev_err 15.4% (+:15.4%)
epoch: 4 updates: 1137 (22.7%) dev_err 15.6% (+:15.6%)


In [26]:
with open('income.test.predicted', 'w') as out:
    for i, line in enumerate(open('income.test.blind').readlines()):
        if devap[1][i] == 1:
            pred = ">50k"
        else:
            pred = "<=50k"
        
        out.write(line[:-2] + ', ' + pred + '\n')