In [63]:
%matplotlib notebook

import matplotlib.pyplot as plt
import numpy as np
import numpy.random
import numpy.linalg
import scipy.io
import scipy.stats
import sklearn.metrics
import random

inNotebook = True # change this to True if you use a notebook
def nextplot():
    if inNotebook:
        plt.figure()  # this creates a new plot
    else:
        plt.clf()     # and this clears the current one

# Load the data

In [39]:
data = scipy.io.loadmat('data/spamData.mat')
X = data['Xtrain']

N = X.shape[0]
D = X.shape[1]

Xtest = data['Xtest']
Ntest = Xtest.shape[0]
y = data['ytrain'].squeeze().astype(int)
ytest = data['ytest'].squeeze().astype(int)

features = np.array([
    "word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d",
    "word_freq_our", "word_freq_over", "word_freq_remove", "word_freq_internet",
    "word_freq_order", "word_freq_mail", "word_freq_receive", "word_freq_will",
    "word_freq_people", "word_freq_report", "word_freq_addresses", "word_freq_free",
    "word_freq_business", "word_freq_email", "word_freq_you", "word_freq_credit",
    "word_freq_your", "word_freq_font", "word_freq_000", "word_freq_money",
    "word_freq_hp", "word_freq_hpl", "word_freq_george", "word_freq_650",
    "word_freq_lab", "word_freq_labs", "word_freq_telnet", "word_freq_857",
    "word_freq_data", "word_freq_415", "word_freq_85", "word_freq_technology",
    "word_freq_1999", "word_freq_parts", "word_freq_pm", "word_freq_direct",
    "word_freq_cs", "word_freq_meeting", "word_freq_original", "word_freq_project",
    "word_freq_re", "word_freq_edu", "word_freq_table", "word_freq_conference",
    "char_freq_;", "char_freq_(", "char_freq_[", "char_freq_!",
    "char_freq_$", "char_freq_#", "capital_run_length_average", "capital_run_length_longest",
    "capital_run_length_total" ])

# 1. Dataset Statistics

In [40]:
# look some dataset statistics
scipy.stats.describe(X)
print(np.mean(X,axis=1))

[ 20.10449123   4.24650877   4.2464386  ...,   2.33024561   1.55603509
   0.97447368]


In [41]:
# plot the distribution of all features
nextplot()
densities = [ scipy.stats.gaussian_kde(X[:,j]) for j in range(D) ]

# np.linspace returns arthmetic progression
xs = np.linspace(0,np.max(X),200)

for j in range(D):
    plt.plot(xs, densities[j](xs), label=j)
plt.legend(ncol=5)

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x10dbb5a20>

In [73]:
# this plots is not really helpful; go now explore further
# YOUR CODE HERE

nextplot()

densities = list()
for j in range(D):
    kernel =scipy.stats.gaussian_kde(Xz[:,j])
    bw = kernel.covariance_factor()*3.0
    densities.append(scipy.stats.gaussian_kde(X[:,j],bw_method=bw))

# np.linspace returns arthmetic progression
xs = np.linspace(0,0.8,200)

for j in range(D):
    plt.plot(xs, densities[j](xs), label=j)
plt.legend(ncol=5)

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x104d53908>

In [146]:
# Let's compute z-scores; create two new variables Xz and Xtestz.
def create_new(data):
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    def z_score(var, mean, std):
        return  (var - mean)/ std 
    re_array = list()
    for i in range(D):
        col = [z_score(X[j,i],mean[i],std[i]) for j in range(len(data))]
        #print(col)
        re_array.append(col)
    return np.transpose(np.asarray(re_array))

Xz = create_new(X)
Xtestz = create_new(Xtest)


In [147]:
# Let's check. Xz and Xtestz refer to the normalized datasets just created. We
# will use them throughout.
a1 = np.mean(Xz, axis=0)                       # should be all 0
a2 = np.var(Xz, axis=0)                        # should be all 1
np.mean(Xtestz, axis=0)                   # what do you get here?
np.var(Xtestz, axis=0)

a3 = np.sum(Xz**3)                             # should be: 1925261.15
print(a1,a2,a3)

[  1.85459768e-17   9.27298839e-18  -5.56379304e-17  -9.27298839e-18
   5.56379304e-17   3.70919536e-17   0.00000000e+00  -7.41839072e-17
   5.56379304e-17   0.00000000e+00  -1.85459768e-17  -2.43415945e-17
  -4.63649420e-17   1.85459768e-17   1.85459768e-17   3.70919536e-17
  -3.70919536e-17  -9.27298839e-17  -1.66913791e-16   9.27298839e-18
   1.85459768e-17   9.27298839e-18  -5.56379304e-17  -1.85459768e-17
  -6.49109188e-17  -3.70919536e-17  -1.85459768e-17   1.85459768e-17
  -2.78189652e-17   4.63649420e-17  -1.85459768e-17   5.56379304e-17
   0.00000000e+00  -1.85459768e-17   3.70919536e-17   1.85459768e-17
  -9.27298839e-18   4.63649420e-18   1.85459768e-17   9.27298839e-18
   2.31824710e-17  -2.78189652e-17  -9.27298839e-18   4.63649420e-18
  -9.27298839e-18  -9.27298839e-18   1.39094826e-17  -2.78189652e-17
  -3.70919536e-17  -6.49109188e-17   4.63649420e-18   3.70919536e-17
  -3.70919536e-17   9.27298839e-18  -9.27298839e-18   9.27298839e-18
  -7.41839072e-17] [ 1.  1.  1.  1

In [45]:
# Explore the normalized data
# YOUR CODE HERE

# 2. Maximum Likelihood Estimation

## Helper functions

In [46]:
def logsumexp(x):
    """Computes log(sum(exp(x)).

    Uses offset trick to reduce risk of numeric over- or underflow. When x is a
    1D ndarray, computes logsumexp of its entries. When x is a 2D ndarray,
    computes logsumexp of each row.

    Keyword arguments:
    x : a 1D or 2D ndarray
    """
    offset = np.max(x, axis=0)
    return offset + np.log(np.sum(np.exp(x-offset), axis=0))

In [47]:
# Define the logistic function. Make sure it operates on both scalars
# and vectors.
def sigma(x):
    # YOUR CODE HERE
    if isinstance(x, np.ndarray):
        re = list()
        for i in x:
            e = 1 / (1 + np.exp(i * -1))
            re.append(e)
        return np.asarray(re)
    else:
        re = 1 / (1 + np.exp(x * -1))
    return re

In [48]:
# this should give:
# [0.5, array([0.26894142, 0.5, 0.73105858])]
[ sigma(0), sigma(np.array([-1,0,1])) ]

[0.5, array([ 0.26894142,  0.5       ,  0.73105858])]

In [49]:
# Define the logarithm of the logistic function. Make sure it operates on both
# scalars and vectors. Perhaps helpful: isinstance(x, np.ndarray).
def logsigma (x):
    # YOUR CODE HERE
    if isinstance(x, np.ndarray):
        re = list()
        for i in x:
            e = np.log(sigma(i))
            re.append(e)
        return np.asarray(re)
    else:
        re = np.log(sigma(x))
    return re

In [50]:
# this should give:
# [-0.69314718055994529, array([-1.31326169, -0.69314718, -0.31326169])]
[ logsigma(0), logsigma(np.array([-1,0,1])) ]

[-0.69314718055994529, array([-1.31326169, -0.69314718, -0.31326169])]

## 2b Log-likelihood and gradient

In [51]:
def l(y, X, w):
    """Log-likelihood of the logistic regression model.

    Parameters
    ----------
    y : ndarray of shape (N,)
        Binary labels (either 0 or 1).
    X : ndarray of shape (N,D)
        Design matrix.
    w : ndarray of shape (D,)
        Weight vector.
    """
    # YOUR CODE HERE
    eta = np.asarray([np.dot(w,i) for i in X])
    ll = np.sum( y * eta - np.log(1 + np.exp(eta)) )
    return ll

In [52]:
# this should give:
# -47066.641667825766
l(y, Xz, np.linspace(-5,5,D))

-47066.641667825774

In [53]:
def dl(y,X,w):
    """Gradient of the log-likelihood of the logistic regression model.

    Parameters
    ----------
    y : ndarray of shape (N,)
        Binary labels (either 0 or 1).
    X : ndarray of shape (N,D)
        Design matrix.
    w : ndarray of shape (D,)
        Weight vector.

    Returns
    -------
    ndarray of shape (D,)
    """
    # YOUR CODE HERE
    eta = np.asarray([np.dot(w,i) for i in X])
    result = list()
    for i in X.T:
        
        # gradient_log_liklihood = ∑ (y - sigma(eta) * x
        gradient_ll = np.sum(np.dot((y - sigma(eta)), i))
        
        a = gradient_ll.tolist()              
        result.append(a)
    return np.asarray(result)
        

In [54]:
# this should give:
# array([  551.33985842,   143.84116318,   841.83373606,   156.87237578,
#          802.61217579,   795.96202907,   920.69045803,   621.96516752,
#          659.18724769,   470.81259805,   771.32406968,   352.40325626,
#          455.66972482,   234.36600888,   562.45454038,   864.83981264,
#          787.19723703,   649.48042176,   902.6478154 ,   544.00539886,
#         1174.78638035,   120.3598967 ,   839.61141672,   633.30453444,
#         -706.66815087,  -630.2039816 ,  -569.3451386 ,  -527.50996698,
#         -359.53701083,  -476.64334832,  -411.60620464,  -375.11950586,
#         -345.37195689,  -376.22044258,  -407.31761977,  -456.23251936,
#         -596.86960184,  -107.97072355,  -394.82170044,  -229.18125598,
#         -288.46356547,  -362.13402385,  -450.87896465,  -277.03932676,
#         -414.99293368,  -452.28771693,  -167.54649092,  -270.9043748 ,
#         -252.20140951,  -357.72497343,  -259.12468742,   418.35938483,
#          604.54173228,    43.10390907,   152.24258478,   378.16731033,
#          416.12032881])
dl(y, Xz, np.linspace(-5,5,D))

array([  551.33985842,   143.84116318,   841.83373606,   156.87237578,
         802.61217579,   795.96202907,   920.69045803,   621.96516752,
         659.18724769,   470.81259805,   771.32406968,   352.40325626,
         455.66972482,   234.36600888,   562.45454038,   864.83981264,
         787.19723703,   649.48042176,   902.6478154 ,   544.00539886,
        1174.78638035,   120.3598967 ,   839.61141672,   633.30453444,
        -706.66815087,  -630.2039816 ,  -569.3451386 ,  -527.50996698,
        -359.53701083,  -476.64334832,  -411.60620464,  -375.11950586,
        -345.37195689,  -376.22044258,  -407.31761977,  -456.23251936,
        -596.86960184,  -107.97072355,  -394.82170044,  -229.18125598,
        -288.46356547,  -362.13402385,  -450.87896465,  -277.03932676,
        -414.99293368,  -452.28771693,  -167.54649092,  -270.9043748 ,
        -252.20140951,  -357.72497343,  -259.12468742,   418.35938483,
         604.54173228,    43.10390907,   152.24258478,   378.16731033,
      

## 2c Gradient descent

In [55]:
# you don't need to modify this function
def optimize(obj_up, theta0, nepochs=50, eps0=0.01):
    """Iteratively minimize a function.

    We use it here to run either gradient descent or stochastic gradient
    descent, using arbitrarly optimization criteria.

    Parameters
    ----------
    obj_up  : a tuple of form (f, update) containing two functions f and update.
              f(theta) computes the value of the objective function.
              update(theta,eps) performs a parameter update with step size eps
              and returns the result.
    theta0  : ndarray of shape (D,)
              Initial parameter vector.
    nepochs : int
              How many epochs (calls to update) to run.
    eps0    : float
              Initial step size.

    Returns
    -------
    A triple consisting of the fitted parameter vector, the values of the
    objective function after every epoch, and the step sizes that were used.
    """

    f, update = obj_up

    # initialize results
    theta = theta0
    
    # contains liklihood value
    values = np.zeros(nepochs+1)
    
    # contains learning rate
    eps = np.zeros(nepochs+1)
    
    values[0] = f(theta0)
    eps[0] = eps0

    # now run the update function nepochs times
        for epoch in range(nepochs):
        print("Epoch {:3d}: f={:10.3f}, eps={:10.9f}".format(epoch, values[epoch], eps[epoch]))
        
        # update newly theta
        theta = update(theta, eps[epoch])

        # we use the bold driver heuristic
        # pre calculate next value of liklihood, if next is smaller than current
        # then means eps set too large, over the peak. So reduce the eps 
        values[epoch+1] = f(theta)
        if (values[epoch] < values[epoch+1]):
            eps[epoch+1] = eps[epoch]/2.
        else:
            eps[epoch+1] = eps[epoch]*1.05

    # all done
    print("Result after {} epochs: f={}".format(nepochs, values[-1]))
    return theta, values, eps

In [56]:
# define the objective and update function for one gradient-descent epoch for
# fitting an MLE estimate of logistic regression with gradient descent (should
# return a tuple of two functions; see optimize)
def gd(y,X):
    # YOUR CODE HERE
    def update(theta, eps):
        return theta + eps * dl(y,X,theta)
    def f(theta):
        # f means calculate the L(theta), the value of liklihood
        return l(y,X,theta) * -1
    return f, update

In [57]:
# this should give
# [47066.641667825766,
#  array([  4.13777838e+01,  -1.56745627e+01,   5.75882538e+01,
#           1.14225143e+01,   5.54249703e+01,   5.99229049e+01,
#           7.11220141e+01,   4.84761728e+01,   5.78067289e+01,
#           4.54794720e+01,   7.14638492e+01,   1.51369386e+01,
#           3.36375739e+01,   2.15061217e+01,   5.78014255e+01,
#           6.72743066e+01,   7.00829312e+01,   5.29328088e+01,
#           6.16042473e+01,   5.50018510e+01,   8.94624817e+01,
#           2.74784480e+01,   8.51763599e+01,   5.60363965e+01,
#          -2.55865589e+01,  -1.53788213e+01,  -4.67015412e+01,
#          -2.50356570e+00,  -3.85357592e+00,  -2.21819155e+00,
#           3.32098671e+00,   3.86933390e+00,  -2.00309898e+01,
#           3.84684492e+00,  -2.19847927e-01,  -1.29775457e+00,
#          -1.28374302e+01,  -2.78303173e+00,  -5.61671182e+00,
#           1.73657121e+01,  -6.81197570e+00,  -1.20249002e+01,
#           2.65789491e+00,  -1.39557852e+01,  -2.01135653e+01,
#          -2.72134051e+01,  -9.45952961e-01,  -1.02239111e+01,
#           1.52794293e-04,  -5.18938123e-01,  -3.19717561e+00,
#           4.62953437e+01,   7.87893022e+01,   1.88618651e+01,
#           2.85195027e+01,   5.04698358e+01,   6.41240689e+01])
f, update = gd(y, Xz)
[ f(np.linspace(-5,5,D)),
  update(np.linspace(-5,-5,D), .1) ]

[47066.641667825774,
 array([  4.13777838e+01,  -1.56745627e+01,   5.75882538e+01,
          1.14225143e+01,   5.54249703e+01,   5.99229049e+01,
          7.11220141e+01,   4.84761728e+01,   5.78067289e+01,
          4.54794720e+01,   7.14638492e+01,   1.51369386e+01,
          3.36375739e+01,   2.15061217e+01,   5.78014255e+01,
          6.72743066e+01,   7.00829312e+01,   5.29328088e+01,
          6.16042473e+01,   5.50018510e+01,   8.94624817e+01,
          2.74784480e+01,   8.51763599e+01,   5.60363965e+01,
         -2.55865589e+01,  -1.53788213e+01,  -4.67015412e+01,
         -2.50356570e+00,  -3.85357592e+00,  -2.21819155e+00,
          3.32098671e+00,   3.86933390e+00,  -2.00309898e+01,
          3.84684492e+00,  -2.19847927e-01,  -1.29775457e+00,
         -1.28374302e+01,  -2.78303173e+00,  -5.61671182e+00,
          1.73657121e+01,  -6.81197570e+00,  -1.20249002e+01,
          2.65789491e+00,  -1.39557852e+01,  -2.01135653e+01,
         -2.72134051e+01,  -9.45952961e-01,  -1.0

In [58]:
# you can run gradient descent!
numpy.random.seed(0)
w0 = np.random.normal(size=D)
wz_gd, vz_gd, ez_gd = optimize(gd(y,Xz), w0, nepochs=500)

Epoch   0: f=  6636.208, eps=0.010000000
Epoch   1: f=  4216.957, eps=0.010500000
Epoch   2: f=  2657.519, eps=0.011025000
Epoch   3: f=  1926.135, eps=0.011576250
Epoch   4: f=  1449.495, eps=0.012155063
Epoch   5: f=  1207.529, eps=0.012762816
Epoch   6: f=  1052.489, eps=0.013400956
Epoch   7: f=   957.275, eps=0.014071004
Epoch   8: f=   899.610, eps=0.014774554
Epoch   9: f=   882.904, eps=0.015513282
Epoch  10: f=  1017.083, eps=0.007756641
Epoch  11: f=   840.760, eps=0.008144473
Epoch  12: f=   805.649, eps=0.008551697
Epoch  13: f=   822.108, eps=0.004275848
Epoch  14: f=   746.377, eps=0.004489641
Epoch  15: f=   735.803, eps=0.004714123
Epoch  16: f=   729.780, eps=0.004949829
Epoch  17: f=   724.467, eps=0.005197320
Epoch  18: f=   719.408, eps=0.005457186
Epoch  19: f=   714.564, eps=0.005730046
Epoch  20: f=   709.932, eps=0.006016548
Epoch  21: f=   705.514, eps=0.006317375
Epoch  22: f=   701.321, eps=0.006633244
Epoch  23: f=   697.373, eps=0.006964906
Epoch  24: f=   

Epoch 200: f=   661.463, eps=0.005330767
Epoch 201: f=   661.555, eps=0.002665383
Epoch 202: f=   660.978, eps=0.002798653
Epoch 203: f=   660.966, eps=0.002938585
Epoch 204: f=   660.955, eps=0.003085514
Epoch 205: f=   660.942, eps=0.003239790
Epoch 206: f=   660.929, eps=0.003401780
Epoch 207: f=   660.916, eps=0.003571869
Epoch 208: f=   660.902, eps=0.003750462
Epoch 209: f=   660.887, eps=0.003937985
Epoch 210: f=   660.871, eps=0.004134885
Epoch 211: f=   660.855, eps=0.004341629
Epoch 212: f=   660.837, eps=0.004558710
Epoch 213: f=   660.819, eps=0.004786646
Epoch 214: f=   660.801, eps=0.005025978
Epoch 215: f=   660.781, eps=0.005277277
Epoch 216: f=   660.760, eps=0.005541141
Epoch 217: f=   660.738, eps=0.005818198
Epoch 218: f=   660.715, eps=0.006109108
Epoch 219: f=   660.691, eps=0.006414563
Epoch 220: f=   660.666, eps=0.006735291
Epoch 221: f=   660.640, eps=0.007072056
Epoch 222: f=   660.612, eps=0.007425659
Epoch 223: f=   660.583, eps=0.007796941
Epoch 224: f=   

Epoch 400: f=   657.156, eps=0.005967586
Epoch 401: f=   657.138, eps=0.006265965
Epoch 402: f=   657.118, eps=0.006579263
Epoch 403: f=   657.098, eps=0.006908226
Epoch 404: f=   657.077, eps=0.007253638
Epoch 405: f=   657.054, eps=0.007616320
Epoch 406: f=   657.031, eps=0.007997136
Epoch 407: f=   657.007, eps=0.008396992
Epoch 408: f=   656.981, eps=0.008816842
Epoch 409: f=   656.954, eps=0.009257684
Epoch 410: f=   656.926, eps=0.009720568
Epoch 411: f=   656.896, eps=0.010206597
Epoch 412: f=   656.866, eps=0.010716927
Epoch 413: f=   656.838, eps=0.011252773
Epoch 414: f=   656.871, eps=0.005626387
Epoch 415: f=   656.908, eps=0.002813193
Epoch 416: f=   656.776, eps=0.002953853
Epoch 417: f=   656.765, eps=0.003101546
Epoch 418: f=   656.755, eps=0.003256623
Epoch 419: f=   656.745, eps=0.003419454
Epoch 420: f=   656.735, eps=0.003590427
Epoch 421: f=   656.724, eps=0.003769948
Epoch 422: f=   656.713, eps=0.003958445
Epoch 423: f=   656.701, eps=0.004156368
Epoch 424: f=   

In [59]:
# look at how gradient descent made progess
# YOUR CODE HERE
print(wz_gd, vz_gd, ez_gd)


[ -8.86805803e-03  -1.87711722e-01   1.21527679e-01   8.08625721e+00
   2.59086182e-01   1.81011961e-01   1.15333583e+00   2.90298799e-01
   1.13527420e-01   6.60449223e-02  -4.68398587e-02  -1.65669222e-01
  -6.53149090e-02  -1.09479222e-02   4.94021994e-01   8.43700154e-01
   5.63710277e-01   5.46097770e-02   2.89015689e-01   2.84882797e-01
   2.17373273e-01   6.64971958e-01   1.46174692e+00   2.54294549e-01
  -2.07767632e+00  -2.64575174e-01  -1.41614626e+00  -1.47368318e-03
  -3.06163147e-01  -3.12258163e-02   4.18300416e-02   1.28949980e+00
  -3.28686051e-01  -1.22486766e+00  -8.94396839e-02   4.03419808e-01
   4.58896319e-02  -1.52073801e-01  -3.18849236e-01  -2.15088329e-01
  -2.17999766e-01  -7.68176094e-01  -2.41103469e-01  -6.14325300e-01
  -7.98221834e-01  -1.05010623e+00  -1.49903381e-01  -6.40578227e-01
  -5.06152873e-01  -2.01326929e-01  -1.08442215e-01   2.37141619e-01
   2.18926566e+00   1.15453951e+00  -5.17771557e-01   2.31366068e+00
   2.14950198e-01] [ 6636.20835095

In [106]:
print(w0)

[ 1.76405235  0.40015721  0.97873798  2.2408932   1.86755799 -0.97727788
  0.95008842 -0.15135721 -0.10321885  0.4105985   0.14404357  1.45427351
  0.76103773  0.12167502  0.44386323  0.33367433  1.49407907 -0.20515826
  0.3130677  -0.85409574 -2.55298982  0.6536186   0.8644362  -0.74216502
  2.26975462 -1.45436567  0.04575852 -0.18718385  1.53277921  1.46935877
  0.15494743  0.37816252 -0.88778575 -1.98079647 -0.34791215  0.15634897
  1.23029068  1.20237985 -0.38732682 -0.30230275 -1.04855297 -1.42001794
 -1.70627019  1.9507754  -0.50965218 -0.4380743  -1.25279536  0.77749036
 -1.61389785 -0.21274028 -0.89546656  0.3869025  -0.51080514 -1.18063218
 -0.02818223  0.42833187  0.06651722]


## 2d Stochastic gradient descent

In [111]:
def sgdepoch(y,X,w,eps):
    """Run one SGD epoch and return the updated weight vector. """
    # Run N stochastic gradient steps (without replacement). Do not rescale each
    # step by factor N (i.e., proceed differntly than in the lecture slides).
    # YOUR CODE HERE

    #
    i = random.randint(0,len(y)-1)
    eta = np.dot(X[i],w)
    
    # gradient_log_liklihood = ∑ (y - sigma(eta) * x
    gradient_ll = np.dot((y[i] - sigma(eta)), X[i])
    
    theta = w + eps * gradient_ll
    return theta

In [115]:
# when you run this multiple times, with 50% probability you should get the
# following result (there is one other result which is very close):
# array([ -3.43689655e+02,  -1.71161311e+02,  -5.71093536e+02,
#         -5.16478220e+01,   4.66294348e+02,  -3.71589878e+02,
#          5.21493183e+02,   1.25699230e+03,   8.33804130e+02,
#          5.63185399e+02,   1.32761302e+03,  -2.64104011e+02,
#          7.10693307e+02,  -1.75497331e+02,  -1.94174427e+02,
#          1.11641507e+02,  -3.30817509e+02,  -3.46754913e+02,
#          8.48722111e+02,  -1.89136304e+02,  -4.25693844e+02,
#         -1.23084189e+02,  -2.95894797e+02,  -2.35789333e+02,
#         -3.38695243e+02,  -3.05642830e+02,  -2.28975383e+02,
#         -2.38075137e+02,  -1.66702530e+02,  -2.27341599e+02,
#         -1.77575620e+02,  -1.49093855e+02,  -1.70028859e+02,
#         -1.50243833e+02,  -1.82986008e+02,  -2.41143708e+02,
#         -3.31047159e+02,  -5.79991185e+01,  -1.98477863e+02,
#         -1.91264948e+02,  -1.17371919e+02,  -1.66953779e+02,
#         -2.01472565e+02,  -1.23330949e+02,  -3.00857740e+02,
#         -1.95853348e+02,  -7.44868073e+01,  -1.11172370e+02,
#         -1.57618226e+02,  -1.25729512e+00,  -1.45536466e+02,
#         -1.43362438e+02,  -3.00429708e+02,  -9.84391082e+01,
#         -4.54152047e+01,  -5.26492232e+01,  -1.45175427e+02])
sgdepoch(y[1:3],Xz[1:3,:],np.linspace(-5,5,D),1000)

array([ -3.43689655e+02,  -1.71161311e+02,  -5.71093536e+02,
        -5.16478220e+01,   4.66294348e+02,  -3.71589878e+02,
         5.21493183e+02,   1.25699230e+03,   8.33804130e+02,
         5.63185399e+02,   1.32761302e+03,  -2.64104011e+02,
         7.10693307e+02,  -1.75497331e+02,  -1.94174427e+02,
         1.11641507e+02,  -3.30817509e+02,  -3.46754913e+02,
         8.48722111e+02,  -1.89136304e+02,  -4.25693844e+02,
        -1.23084189e+02,  -2.95894797e+02,  -2.35789333e+02,
        -3.38695243e+02,  -3.05642830e+02,  -2.28975383e+02,
        -2.38075137e+02,  -1.66702530e+02,  -2.27341599e+02,
        -1.77575620e+02,  -1.49093855e+02,  -1.70028859e+02,
        -1.50243833e+02,  -1.82986008e+02,  -2.41143708e+02,
        -3.31047159e+02,  -5.79991185e+01,  -1.98477863e+02,
        -1.91264948e+02,  -1.17371919e+02,  -1.66953779e+02,
        -2.01472565e+02,  -1.23330949e+02,  -3.00857740e+02,
        -1.95853348e+02,  -7.44868073e+01,  -1.11172370e+02,
        -1.57618226e+02,

In [102]:
# define the objective and update function for one gradient-descent epoch for
# fitting an MLE estimate of logistic regression with stochastic gradient descent
# (should return a tuple of two functions; see optimize)
def sgd(y,X):
    # YOUR CODE HERE
    def update(theta, eps):
        return sgdepoch(y,X,theta,eps)
    def f(theta):
        # f means calculate the L(theta), the value of liklihood
        return l(y,X,theta) * -1
    return f, update

In [103]:
# with 50% probability, you should get:
# [40.864973045695081,
#  array([ -3.43689655e+02,  -1.71161311e+02,  -5.71093536e+02,
#          -5.16478220e+01,   4.66294348e+02,  -3.71589878e+02,
#           5.21493183e+02,   1.25699230e+03,   8.33804130e+02,
#           5.63185399e+02,   1.32761302e+03,  -2.64104011e+02,
#           7.10693307e+02,  -1.75497331e+02,  -1.94174427e+02,
#           1.11641507e+02,  -3.30817509e+02,  -3.46754913e+02,
#           8.48722111e+02,  -1.89136304e+02,  -4.25693844e+02,
#          -1.23084189e+02,  -2.95894797e+02,  -2.35789333e+02,
#          -3.38695243e+02,  -3.05642830e+02,  -2.28975383e+02,
#          -2.38075137e+02,  -1.66702530e+02,  -2.27341599e+02,
#          -1.77575620e+02,  -1.49093855e+02,  -1.70028859e+02,
#          -1.50243833e+02,  -1.82986008e+02,  -2.41143708e+02,
#          -3.31047159e+02,  -5.79991185e+01,  -1.98477863e+02,
#          -1.91264948e+02,  -1.17371919e+02,  -1.66953779e+02,
#          -2.01472565e+02,  -1.23330949e+02,  -3.00857740e+02,
#          -1.95853348e+02,  -7.44868073e+01,  -1.11172370e+02,
#          -1.57618226e+02,  -1.25729512e+00,  -1.45536466e+02,
#          -1.43362438e+02,  -3.00429708e+02,  -9.84391082e+01,
#          -4.54152047e+01,  -5.26492232e+01,  -1.45175427e+02])]
f, update = sgd(y[1:3], Xz[1:3,:])
[ f(np.linspace(-5,5,D)),
  update(np.linspace(-5,5,D), 1000) ]

[40.864973045695081,
 array([ -3.43689655e+02,  -1.71161311e+02,  -5.71093536e+02,
         -5.16478220e+01,   4.66294348e+02,  -3.71589878e+02,
          5.21493183e+02,   1.25699230e+03,   8.33804130e+02,
          5.63185399e+02,   1.32761302e+03,  -2.64104011e+02,
          7.10693307e+02,  -1.75497331e+02,  -1.94174427e+02,
          1.11641507e+02,  -3.30817509e+02,  -3.46754913e+02,
          8.48722111e+02,  -1.89136304e+02,  -4.25693844e+02,
         -1.23084189e+02,  -2.95894797e+02,  -2.35789333e+02,
         -3.38695243e+02,  -3.05642830e+02,  -2.28975383e+02,
         -2.38075137e+02,  -1.66702530e+02,  -2.27341599e+02,
         -1.77575620e+02,  -1.49093855e+02,  -1.70028859e+02,
         -1.50243833e+02,  -1.82986008e+02,  -2.41143708e+02,
         -3.31047159e+02,  -5.79991185e+01,  -1.98477863e+02,
         -1.91264948e+02,  -1.17371919e+02,  -1.66953779e+02,
         -2.01472565e+02,  -1.23330949e+02,  -3.00857740e+02,
         -1.95853348e+02,  -7.44868073e+01,  -1.1

In [119]:
# you can run stochastic gradient descent!
wz_sgd, vz_sgd, ez_sgd = optimize(sgd(y,Xz), w0, nepochs=500)

Epoch   0: f=  6636.208, eps=0.010000000
Epoch   1: f=  6636.003, eps=0.010500000
Epoch   2: f=  6611.617, eps=0.011025000
Epoch   3: f=  6611.579, eps=0.011576250
Epoch   4: f=  6591.160, eps=0.012155063
Epoch   5: f=  6591.174, eps=0.006077531
Epoch   6: f=  6596.531, eps=0.003038766
Epoch   7: f=  6583.034, eps=0.003190704
Epoch   8: f=  6566.449, eps=0.003350239
Epoch   9: f=  6567.287, eps=0.001675120
Epoch  10: f=  6563.205, eps=0.001758876
Epoch  11: f=  6557.901, eps=0.001846819
Epoch  12: f=  6557.006, eps=0.001939160
Epoch  13: f=  6554.001, eps=0.002036118
Epoch  14: f=  6553.804, eps=0.002137924
Epoch  15: f=  6553.829, eps=0.001068962
Epoch  16: f=  6553.817, eps=0.001122410
Epoch  17: f=  6551.104, eps=0.001178531
Epoch  18: f=  6551.104, eps=0.001237457
Epoch  19: f=  6551.099, eps=0.001299330
Epoch  20: f=  6551.026, eps=0.001364297
Epoch  21: f=  6544.571, eps=0.001432511
Epoch  22: f=  6537.166, eps=0.001504137
Epoch  23: f=  6537.163, eps=0.001579344
Epoch  24: f=  6

Epoch 213: f=  6483.132, eps=0.000006026
Epoch 214: f=  6483.117, eps=0.000006328
Epoch 215: f=  6483.117, eps=0.000006644
Epoch 216: f=  6483.117, eps=0.000006976
Epoch 217: f=  6483.117, eps=0.000007325
Epoch 218: f=  6483.117, eps=0.000007691
Epoch 219: f=  6483.116, eps=0.000008076
Epoch 220: f=  6483.109, eps=0.000008480
Epoch 221: f=  6483.109, eps=0.000004240
Epoch 222: f=  6483.107, eps=0.000004452
Epoch 223: f=  6483.099, eps=0.000004674
Epoch 224: f=  6483.094, eps=0.000004908
Epoch 225: f=  6483.083, eps=0.000005154
Epoch 226: f=  6483.068, eps=0.000005411
Epoch 227: f=  6483.068, eps=0.000005682
Epoch 228: f=  6483.041, eps=0.000005966
Epoch 229: f=  6483.040, eps=0.000006264
Epoch 230: f=  6482.970, eps=0.000006577
Epoch 231: f=  6482.969, eps=0.000006906
Epoch 232: f=  6482.969, eps=0.000007252
Epoch 233: f=  6482.962, eps=0.000007614
Epoch 234: f=  6482.948, eps=0.000007995
Epoch 235: f=  6482.948, eps=0.000008395
Epoch 236: f=  6482.944, eps=0.000008814
Epoch 237: f=  6

Epoch 433: f=  6482.766, eps=0.000000000
Epoch 434: f=  6482.766, eps=0.000000000
Epoch 435: f=  6482.766, eps=0.000000000
Epoch 436: f=  6482.766, eps=0.000000000
Epoch 437: f=  6482.766, eps=0.000000000
Epoch 438: f=  6482.766, eps=0.000000000
Epoch 439: f=  6482.766, eps=0.000000000
Epoch 440: f=  6482.766, eps=0.000000000
Epoch 441: f=  6482.766, eps=0.000000000
Epoch 442: f=  6482.766, eps=0.000000000
Epoch 443: f=  6482.766, eps=0.000000000
Epoch 444: f=  6482.766, eps=0.000000000
Epoch 445: f=  6482.766, eps=0.000000000
Epoch 446: f=  6482.766, eps=0.000000000
Epoch 447: f=  6482.766, eps=0.000000000
Epoch 448: f=  6482.766, eps=0.000000000
Epoch 449: f=  6482.766, eps=0.000000000
Epoch 450: f=  6482.766, eps=0.000000000
Epoch 451: f=  6482.766, eps=0.000000000
Epoch 452: f=  6482.766, eps=0.000000000
Epoch 453: f=  6482.766, eps=0.000000000
Epoch 454: f=  6482.766, eps=0.000000000
Epoch 455: f=  6482.766, eps=0.000000000
Epoch 456: f=  6482.766, eps=0.000000000
Epoch 457: f=  6

## 2e Compare GD and SGD

In [None]:
# YOUR CODE HERE

# 3 Prediction

In [153]:
def predict(Xtest, w):
    """Returns vector of predicted confidence values for logistic regression with
weight vector w."""
    # YOUR CODE HERE
    return np.dot(Xtest, w)
    
def classify(Xtest, w):
    """Returns 0/1 vector of predicted class labels for logistic regression with
weight vector w."""
    # YOUR CODE HERE
    re = list()
    for i in Xtest:        
        if sigma(np.dot(i,w)) > 0.45:
            re.append(1)
        else:
            re.append(0)
    return np.asarray(re)

In [154]:
yhat = predict(Xtestz, wz_gd)
ypred = classify(Xtestz, wz_gd)
print(len(Xtestz))
a = 0
for i in range(len(ypred)):
    if ytest[i] == ypred[i]:
        a += 1
print(a/len(ypred))

1536
0.5859375


In [155]:
# Example: confusion matrix
yhat = predict(Xtestz, wz_gd)
ypred = classify(Xtestz, wz_gd)
print( sklearn.metrics.classification_report(ytest, ypred) )

             precision    recall  f1-score   support

          0       0.84      0.40      0.54       941
          1       0.48      0.88      0.62       595

avg / total       0.70      0.59      0.57      1536



In [156]:
# Example: precision-recall curve (with annotated thresholds)
nextplot()
precision, recall, thresholds = sklearn.metrics.precision_recall_curve(ytest, yhat)
plt.plot(recall, precision)
for x in np.linspace(0,1,10,endpoint=False):
    index = int(x * (precision.size-1))
    plt.text(recall[index], precision[index], "{:3.2f}".format(thresholds[index]))
plt.xlabel("Recall")
plt.ylabel("Precision")

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x10ef3d898>

# 4 Maximum Aposteriori Estimation

## 4a Gradient Descent

In [None]:
def l_l2(y,X,w,lambda_):
    """Log-density of posterior of logistic regression with weights w and L2
regularization parameter lambda_"""
    # YOUR CODE HERE

In [None]:
# this should give:
# [-47066.641667825766, -47312.623810682911]
[ l_l2(y, Xz, np.linspace(-5,5,D),0), l_l2(y, Xz, np.linspace(-5,5,D),1) ]

In [None]:
def dl_l2(y,X,w,lambda_):
    """Gradient of log-density of posterior of logistic regression with weights w
and L2 regularization parameter lambda_."""
    # YOUR CODE HERE

In [None]:
# this should give:
# [array([  551.33985842,   143.84116318,   841.83373606,   156.87237578,
#           802.61217579,   795.96202907,   920.69045803,   621.96516752,
#           659.18724769,   470.81259805,   771.32406968,   352.40325626,
#           455.66972482,   234.36600888,   562.45454038,   864.83981264,
#           787.19723703,   649.48042176,   902.6478154 ,   544.00539886,
#          1174.78638035,   120.3598967 ,   839.61141672,   633.30453444,
#          -706.66815087,  -630.2039816 ,  -569.3451386 ,  -527.50996698,
#          -359.53701083,  -476.64334832,  -411.60620464,  -375.11950586,
#          -345.37195689,  -376.22044258,  -407.31761977,  -456.23251936,
#          -596.86960184,  -107.97072355,  -394.82170044,  -229.18125598,
#          -288.46356547,  -362.13402385,  -450.87896465,  -277.03932676,
#          -414.99293368,  -452.28771693,  -167.54649092,  -270.9043748 ,
#          -252.20140951,  -357.72497343,  -259.12468742,   418.35938483,
#           604.54173228,    43.10390907,   152.24258478,   378.16731033,
#           416.12032881]),
#  array([  556.33985842,   148.66259175,   846.4765932 ,   161.33666149,
#           806.89789007,   800.06917193,   924.61902946,   625.71516752,
#           662.75867626,   474.20545519,   774.5383554 ,   355.43897054,
#           458.52686767,   237.04458031,   564.95454038,   867.16124121,
#           789.34009417,   651.44470748,   904.43352968,   545.61254171,
#          1176.21495178,   121.6098967 ,   840.68284529,   634.19739158,
#          -705.95386516,  -629.66826731,  -568.98799574,  -527.33139555,
#          -359.53701083,  -476.82191975,  -411.9633475 ,  -375.65522015,
#          -346.08624261,  -377.11329972,  -408.38904835,  -457.48251936,
#          -598.29817327,  -109.57786641,  -396.60741472,  -231.14554169,
#          -290.60642261,  -364.45545242,  -453.37896465,  -279.71789819,
#          -417.85007654,  -455.32343122,  -170.76077664,  -274.29723194,
#          -255.77283808,  -361.47497343,  -263.05325885,   414.25224198,
#           600.25601799,    38.63962335,   147.59972763,   373.34588176,
#           411.12032881])]
[ dl_l2(y, Xz, np.linspace(-5,5,D),0), dl_l2(y, Xz, np.linspace(-5,5,D),1) ]

In [None]:
# now define the (f,update) tuple for optimize for logistic regression, L2
# regularization, and gradient descent
def gd_l2(y,X,lambda_):
    # YOUR CODE HERE

In [None]:
# let's run!
lambda_ = 100
wz_gd_l2, vz_gd_l2, ez_gd_l2 = optimize(gd_l2(y,Xz,lambda_), w0, nepochs=500)

## 4b Effect of Prior

In [None]:
# YOUR CODE HERE

## 4c Composition of Weight Vector

In [None]:
# YOUR CODE HERE

## 5 Exploration (optional)

In [None]:
# all yours