In [266]:
from __future__ import division
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


### Problem 4

In [3]:
from scipy.io import loadmat
spam = loadmat('./homework2/data/spam_fixed.mat')

In [4]:
spam_data = spam['data']
spam_labels = spam['labels']
spam_test_data = spam['testdata']
spam_test_labels = spam['testlabels']

In [116]:
def compute_errors(predictions, labels):
    """ Generates a list of indexes of misclassified 
    examples
    """
    zipped = zip(predictions, labels)
    errors = [ix for ix, tup in enumerate(zipped)
              if tup[0] != tup[1]]
    
    return errors

In [117]:
def test_scikits_model(clf, test_data, test_labels):
    preds = clf.predict(test_data)
    errors = compute_errors(preds, [t[0] for t in test_labels])

    return len(errors) / test_labels.shape[0]

In [118]:
def combinations(x):
    combos = np.array(list(itertools.combinations(x,2)))
    x_prime = np.array([x1 * x2.T for x1, x2 in combos])
    
    return x_prime

In [119]:
def expand_features(features):
    original = features
    squared = features ** 2
    combos = combinations(features)
    return np.concatenate([original, squared, combos])

### 4.1 Averaged-Perceptron with 64 passes through the data.

In [100]:
def predict(features,  weights):
    """ 
    Predicts a label (1, -1) given a vector 
    of features and weights
    
    Args:
        features:
        weights:

    Return:
        prediction: int of -1 or 1
    """
    prediction = np.dot(features, weights)
    if prediction > 0:
        return 1
    else:
        return -1

In [101]:
def update_weights(prediction, label, features, weights):
    """
    Args:
        prediction: int of predicted label (1 or -1)
        label: the true label of the data point (1 or -1)
        features: numpy array of feature values
        weights: 1d numpy array of weights for the features
    
    Returns:
        weights: 
    """
    if prediction != label:
        weights = weights + (label * features)
    
    return weights

In [102]:
def perceptron_fit(examples):
    """ 
    Generates a vector of weights
    
    Args:
        examples: vector of feature, label tuples
    
    Returns:
        weights: d-dimensional vector of weights
    """
    weights = np.zeros(examples[0][0].shape)
    for features, label in examples:
        prediction = predict(features, weights)
        weights = update_weights(prediction, label[0], 
                                 features, weights)
    return weights

In [111]:
# add trained bias to signature and prediction
def test_perceptron_model(predict, testdata, testlabels, trained_weights):
    """ Generates predictions from a trained weight vector """
    preds = [predict(features, trained_weights)
             for features in testdata]
    errors = compute_errors(preds, [t[0] for t in testlabels])
    
    return len(errors) / testlabels.shape[0]

In [104]:
def avg_perceptron_train(num_iterations, examples):
    weights = np.zeros(examples[0][0].shape)
    cweights = np.zeros(examples[0][0].shape)
    bias = 0
    cbias = 0
    counter = 1
    
    for iteration in range(0, num_iterations):
        np.random.shuffle(examples)
        for features, label in examples:
            if np.dot(features, weights) + bias <= 0:
                # update the weights for this iteration
                weights = weights + (label * features)
                bias = bias + label
                # update the cached weights
                cweights = cweights + (label * counter * features)
                cbias = bias + (label * counter)
            counter += 1
            
    return (weights - ((1/counter) * cweights), bias - ((1/counter) * cbias))

In [105]:
def avg_perceptron_train(num_iterations, examples):
    weights = np.zeros(examples[0][0].shape)
    cweights = np.zeros(examples[0][0].shape)
    bias = 0
    counter = 1
    
    for iteration in range(0, num_iterations):
        np.random.shuffle(examples)
        for features, label in examples:
            if np.dot(features, weights) <= 0:
                # update the weights for this iteration
                weights = weights + (label * features)
                # update the cached weights
                cweights = cweights + (label * counter * features)
            counter += 1
            
    return weights - ((1/counter) * cweights)

In [106]:
def avg_perceptron_test(features, weights, bias):
    activation = np.dot(features, weights) + bias
    if activation > 0:
        return 1
    else:
        return -1

In [243]:
class AveragedPerceptron(object):
    def __init__(self):
        self.num_iterations = 64
    
    def _prep_examples(self, X, y):
        return zip(X, y)
    
    def fit(self, training_data, training_labels):
        examples = self._prep_examples(training_data, training_labels)
        weights = np.zeros(examples[0][0].shape)
        cweights = np.zeros(examples[0][0].shape)
        bias = 0
        counter = 1

        for iteration in range(0, self.num_iterations):
            np.random.shuffle(examples)
            for features, label in examples:
                if np.dot(features, weights) <= 0:
                    # update the weights for this iteration
                    weights = weights + (label * features)
                    # update the cached weights
                    cweights = cweights + (label * counter * features)
                counter += 1

        self.model = weights - ((1/counter) * cweights)
    
    def predict(self, test_data):
        return np.array([self._predict_one(features) 
                         for features in test_data])
    
    def _predict_one(self, features):
        activation = np.dot(features, self.model)
        if activation > 0:
            return 1
        else:
            return -1

In [244]:
# 1. Averaged-Perceptron with 64 passes through the data.
# spam_examples = zip(spam_data, spam_labels)
# trained_weights = avg_perceptron_train(64, spam_examples)
# test_perceptron_model(predict, spam_test_data, spam_test_labels, trained_weights)
clf = AveragedPerceptron()
clf.fit(spam_data, spam_labels)

In [247]:
test_scikits_model(clf, spam_test_data, spam_test_labels)

0.14127604166666666

### 4.2 Logistic regression model with MLE for parameter estimation.

In [124]:
from sklearn.linear_model import LogisticRegression

In [125]:
clf = LogisticRegression()
clf.fit(spam_data, np.ravel(spam_labels))
test_scikits_model(clf, spam_test_data, spam_test_labels)

0.07747395833333333

### 4.3  Generative model classifier where class conditional distributions are multivariate Gaussian distributions with shared covariance matrix for all classes. Use MLE for parameter estimation.

In [229]:
# http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html#sklearn.discriminant_analysis.LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(spam_data, np.ravel(spam_labels))
test_scikits_model(lda, spam_test_data, spam_test_labels)

0.12239583333333333

### 4.4 Same as above, except arbitrary Gaussians (i.e., each class with its own covariance matrix).

In [228]:
# http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(spam_data, np.ravel(spam_labels))
test_scikits_model(qda, spam_test_data, spam_test_labels)

0.17447916666666666

### 4.5 Averaged Percepton w/ Feature Map

In [254]:
spam_expanded = np.array([expand_features(x) for x in spam_data])

In [256]:
spam_test_expanded = np.array([expand_features(x) for x in spam_test_data])

In [257]:
expanded_examples = zip(spam_expanded, spam_labels)

In [260]:
trained_weights = avg_perceptron_train(64, expanded_examples)
test_perceptron_model(predict, spam_test_expanded, spam_test_labels, trained_weights)

0.2766927083333333

### 4.6 Logistic Regression w/ Feature Map

In [127]:
clf = LogisticRegression()
clf.fit(spam_expanded, np.ravel(spam_labels))
test_scikits_model(clf, spam_test_expanded, spam_test_labels)

0.07747395833333333

### Driver Script

In [267]:
def k_fold_cross_validation(data, labels, num_folds):
    data_folds = np.array_split(data, num_folds)
    label_folds = np.array_split(labels, num_folds)
    data_combos = np.array(list(itertools.combinations(data_folds, num_folds - 1)))
    label_combos = np.array(list(itertools.combinations(label_folds, num_folds - 1)))
    
    for fold_num in range(num_folds):
        # create the training data
        train_data = np.concatenate(data_combos[fold_num])
        train_labels = np.concatenate(label_combos[fold_num])
        # create the holdout set
        test_ix = (num_folds - 1) - fold_num
        test_data = data_folds[test_ix]
        test_labels = label_folds[test_ix]
        training = {'data': train_data, 'labels': train_labels}
        test = {'data': test_data, 'labels': test_labels}
        
        yield (training, test)

In [268]:
models = [
    ('Averaged Perceptron', AveragedPerceptron),
    ('Logistic Regression', LogisticRegression), 
    ('QDA', QuadraticDiscriminantAnalysis),
    ('LDA', LinearDiscriminantAnalysis),
    ('Averaged Perceptron Expanded', AveragedPerceptron),
    ('Logistic Regression Expanded', LogisticRegression)
]

In [273]:
def score_models(models):
    for name, Model in models:
        errors = []
        classifiers = []
        num_folds = 10
        for training, testing in k_fold_cross_validation(spam_data, spam_labels, num_folds):
            if 'Expanded' in name:
                training['data'] = np.array([expand_features(x) for x in training['data']])
                testing['data'] = np.array([expand_features(x) for x in testing['data']])
            clf = Model()
            clf.fit(training['data'], training['labels'])
            error = test_scikits_model(clf, testing['data'], testing['labels'])
            errors.append(error)
            classifiers.append(clf)
        avg_error = np.sum(errors) / num_folds

        yield (name, avg_error, errors, classifiers)

In [303]:
def select_classifer(scored_models):
    errors = np.concatenate([s[2] for s in scored_models])
    classifiers = np.concatenate([s[3] for s in scored_models])
    min_error_ix = np.argmin(errors)
    name_ix = min_error_ix//10
    clf_name = [m[0] for m in scored_models][name_ix]
    return (clf_name, errors[min_error_ix], classifiers[min_error_ix])

In [280]:
def cross_validation_error_rates(scored_models):
    for scored in scored_models:
        print "{} Cross Validation Error Rate: {}".format(scored[0], scored[1])

In [352]:
def best_classfier_test_error_rate(scored_models, test_data, test_labels):
    clf_name, clf_error, clf = select_classifer(scored_models)
    if 'Expanded' in clf_name:
        test_data = np.array([expand_features(x) for x in test_data])
    test_error = test_scikits_model(clf, test_data, test_labels)
    return test_error

In [297]:
scored_models = list(score_models(models))

In [314]:
best_clf_name, best_clf_training_error, best_clf = select_classifer(scored_models)

In [354]:
best_clf_test_error = best_classfier_test_error_rate(
    scored, spam_test_data, spam_test_labels)

In [318]:
# 1. Report the cross-validation error rates for all methods
cross_validation_error_rates(scored)

# 2. the training error rate of the classifier learned by the selected method (and state which method was chosen)
print "%s Training Error Rate: %.4f" % (best_clf_name, best_clf_training_error)

# 3. the test error rate for the learned classifier
print "%s Testing Error Rate %.4f" % (best_clf_name, best_clf_test_error)

Averaged Perceptron Cross Validation Error Rate: 0.136062676971
Logistic Regression Cross Validation Error Rate: 0.0815769304464
QDA Cross Validation Error Rate: 0.163780843499
LDA Cross Validation Error Rate: 0.107669625939
Averaged Perceptron Expanded Cross Validation Error Rate: 0.423422963105
Logistic Regression Expanded Cross Validation Error Rate: 0.0756956419919
Logistic Regression Expanded Training Error Rate: 0.0456
