In [None]:
from __future__ import division

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


### Problem 4

In [6]:
# Read the homework's spam dataset from a MATLAB .mat file.
from scipy.io import loadmat
# The loaded object is indexed by string key in the cells that follow
# ('data', 'labels', 'testdata', 'testlabels').
spam = loadmat('./homework2/data/spam.mat')

In [23]:
# Split the loaded .mat contents into training and held-out arrays.
spam_data, spam_labels = spam['data'], spam['labels']
spam_test_data, spam_test_labels = spam['testdata'], spam['testlabels']

In [30]:
# Pair every training row with its label. Materialise the pairs as a list:
# fit() indexes into its argument (examples[0][0]) and then iterates over it,
# which a one-shot zip iterator (Python 3) cannot support.
spam_tups = list(zip(spam_data, spam_labels))

In [113]:
# NOTE(review): `model` is not assigned in any cell visible in this notebook --
# presumably it was the result of fit(spam_tups) from a cell that has since been
# removed. Confirm and restore the assignment so a fresh kernel can run this.
model

array([ -3.06800000e+01,  -3.55120000e+02,  -6.99900000e+01,
         3.42400000e+01,  -7.53300000e+01,   1.10500000e+01,
         8.65700000e+01,   3.59000000e+01,   1.67000000e+01,
        -9.03000000e+01,   1.98500000e+01,  -4.24100000e+02,
        -4.88000000e+00,   6.10000000e-01,   2.51700000e+01,
         1.16430000e+02,   5.09100000e+01,   1.56000000e+00,
        -5.42140000e+02,   3.75300000e+01,   1.76900000e+01,
         1.19400000e+02,   8.28900000e+01,   4.26400000e+01,
        -9.76130000e+02,  -4.69150000e+02,  -6.70280000e+02,
        -2.12610000e+02,  -2.03030000e+02,  -1.77080000e+02,
        -1.13970000e+02,  -9.68500000e+01,  -1.77620000e+02,
        -9.70000000e+01,  -1.85520000e+02,  -1.50830000e+02,
        -2.26000000e+02,  -2.12100000e+01,  -1.48390000e+02,
        -8.22000000e+01,  -1.02580000e+02,  -2.04150000e+02,
        -8.95400000e+01,  -1.32930000e+02,  -4.40890000e+02,
        -3.40390000e+02,  -7.60000000e+00,  -6.84500000e+01,
        -4.19620000e+01,

In [85]:
def predict(features, weights):
    '''Classify one example with a linear threshold.

    Returns 1 when the dot product of ``features`` and ``weights`` is
    strictly positive and -1 otherwise, so a score of exactly zero maps
    to -1.
    '''
    activation = np.dot(features, weights)
    return 1 if activation > 0 else -1

In [123]:
def update_weights(prediction, label, features, weights):
    '''Apply one perceptron correction step.

    When ``prediction`` differs from ``label`` a new vector
    ``weights + label * features`` is returned; otherwise the ``weights``
    object passed in is returned untouched. The input is never modified
    in place.
    '''
    return (weights + (label * features)) if prediction != label else weights

In [124]:
def fit(examples):
    '''Run a single perceptron pass over ``examples`` and return the weights.

    ``examples`` must be an indexable, non-empty sequence of
    ``(features, label)`` pairs; the weight vector starts at zero with the
    shape of the first feature array and is corrected after each example.
    '''
    first_features = examples[0][0]
    current = np.zeros(first_features.shape)
    for row, target in examples:
        guess = predict(row, current)
        current = update_weights(guess, target, row, current)
    return current

In [125]:
def compute_errors(predictions, labels):
    '''Return the positions where a prediction disagrees with its label.

    ``predictions`` and ``labels`` are walked in parallel (stopping at the
    shorter of the two). The result is a list of integer indices, so
    ``len(compute_errors(...))`` is the number of misclassified examples.
    Empty inputs yield an empty list.
    '''
    # Compare each prediction with its own label; the earlier version
    # compared the first two (prediction, label) pairs with each other,
    # which made the filter constant for the whole list.
    return [ix
            for ix, (predicted, actual) in enumerate(zip(predictions, labels))
            if predicted != actual]

In [128]:
def test_model(testdata, testlabels, model):
    '''Predict a label for every row of ``testdata`` with ``model``.

    Returns the list of predictions. The misclassified entries are also
    computed against the first element of each ``testlabels`` row, but
    that result is currently discarded: returning the error rate
    (``len(errors) / testlabels.shape[0]``) is disabled.
    '''
    predicted = []
    for row in testdata:
        predicted.append(predict(row, model))

    flat_labels = [entry[0] for entry in testlabels]
    errors = compute_errors(predicted, flat_labels)

    return predicted

In [127]:
# 1. Averaged-Perceptron with 64 passes through the data.
# NOTE(review): the stored output `0` does not match the current test_model,
# which returns the list of predictions. This cell appears to have last run
# against an earlier definition (its execution count precedes that of the
# cell defining test_model) -- re-run on a fresh kernel to confirm.
test_model(spam_test_data, spam_test_labels, model)

0

In [8]:
# 2. Logistic regression model with MLE for parameter estimation.

In [9]:
# 3. Generative model classifier where class conditional distributions are multivariate Gaussian
# distributions with shared covariance matrix for all classes. Use MLE for parameter estimation.

In [10]:
# 4. Same as above, except arbitrary Gaussians (i.e., each class with its own covariance matrix).

In [11]:
# 5&6. Averaged-Perceptron and logistic regression as above, with feature map

In [18]:
from collections import defaultdict
import pickle
import random


class AveragedPerceptron(object):

    '''An averaged perceptron, as implemented by Matthew Honnibal.

    Weights are stored sparsely as ``{feature: {label: weight}}``.
    ``classes`` starts empty and nothing in this class fills it: the caller
    must assign the full label set before calling ``predict``, which takes
    the max over ``classes``.

    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    '''

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
        # Set of all labels; must be populated by the caller before predict()
        self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # (feature, class) tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by (feature, class) tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of update() calls made so far
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best label.

        ``features`` is a dict mapping feature -> value. Features with no
        stored weights, or with a value of 0, contribute nothing. Raises
        ``ValueError`` if ``classes`` is empty.
        '''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        '''Update the feature weights.

        Always advances the instance counter. When ``guess`` differs from
        ``truth``, every feature key in ``features`` gets +1 for ``truth``
        and -1 for ``guess``; feature values are not used here.
        '''
        def upd_feat(c, f, w, v):
            param = (f, c)
            # Credit the old weight for every step it stayed unchanged
            # before overwriting it.
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        '''Average weights from all iterations.

        Replaces each weight by its accumulated total divided by the number
        of update() calls, rounded to 3 decimals; weights that average to
        zero are dropped.
        '''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                # Account for the steps since this weight last changed.
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None

    def save(self, path):
        '''Save the pickled model weights.'''
        # pickle needs a binary handle on Python 3 (and accepts one on
        # Python 2); the context manager guarantees the file is closed.
        with open(path, 'wb') as handle:
            pickle.dump(dict(self.weights), handle)
        return None

    def load(self, path):
        '''Load the pickled model weights.

        Only ``weights`` is restored; the averaging state is left as is.
        '''
        # NOTE: unpickling can execute arbitrary code -- only load files
        # from a trusted source.
        with open(path, 'rb') as handle:
            self.weights = pickle.load(handle)
        return None


def train(nr_iter, examples):
    '''Return an averaged perceptron model trained on ``examples`` for
    ``nr_iter`` iterations.

    ``examples`` is an iterable of ``(features, class_)`` pairs, where
    ``features`` is a dict mapping feature -> value and ``class_`` is a
    hashable label. The caller's sequence is not reordered.
    '''
    # Work on a private list: random.shuffle needs a mutable sequence, and
    # shuffling the argument itself would silently reorder the caller's data.
    examples = list(examples)
    model = AveragedPerceptron()
    # predict() takes the max over model.classes, so every label has to be
    # registered before the first prediction.
    model.classes = set(class_ for _, class_ in examples)
    for _ in range(nr_iter):
        random.shuffle(examples)
        for features, class_ in examples:
            # predict() already returns the best-scoring label.
            guess = model.predict(features)
            # Call update() for every example: it advances the instance
            # counter that the averaging divides by, and leaves the weights
            # alone when the guess is correct.
            model.update(class_, guess, features)
    model.average_weights()
    return model