In [193]:
import matplotlib.pyplot as plt
import numpy as np
import numpy.random
import numpy.linalg
import scipy.io
import scipy.stats
import sklearn.metrics
%matplotlib inline

# setup plotting 
from IPython import get_ipython
import psutil
inTerminal = not "IPKernelApp" in get_ipython().config
inJupyterNb = any(filter(lambda x: x.endswith("jupyter-notebook"), psutil.Process().parent().cmdline()))
get_ipython().run_line_magic("matplotlib", "" if inTerminal else "notebook" if inJupyterNb else "widget")
def nextplot():
    if inTerminal:
        plt.clf()     # this clears the current plot
    else:
        plt.figure()  # this creates a new plot 

# Load the data

In [2]:
data = scipy.io.loadmat("data/spamData.mat")
X = data["Xtrain"]
N = X.shape[0]
D = X.shape[1]
Xtest = data["Xtest"]
Ntest = Xtest.shape[0]
y = data["ytrain"].squeeze().astype(int)
ytest = data["ytest"].squeeze().astype(int)

features = np.array(
    [
        "word_freq_make",
        "word_freq_address",
        "word_freq_all",
        "word_freq_3d",
        "word_freq_our",
        "word_freq_over",
        "word_freq_remove",
        "word_freq_internet",
        "word_freq_order",
        "word_freq_mail",
        "word_freq_receive",
        "word_freq_will",
        "word_freq_people",
        "word_freq_report",
        "word_freq_addresses",
        "word_freq_free",
        "word_freq_business",
        "word_freq_email",
        "word_freq_you",
        "word_freq_credit",
        "word_freq_your",
        "word_freq_font",
        "word_freq_000",
        "word_freq_money",
        "word_freq_hp",
        "word_freq_hpl",
        "word_freq_george",
        "word_freq_650",
        "word_freq_lab",
        "word_freq_labs",
        "word_freq_telnet",
        "word_freq_857",
        "word_freq_data",
        "word_freq_415",
        "word_freq_85",
        "word_freq_technology",
        "word_freq_1999",
        "word_freq_parts",
        "word_freq_pm",
        "word_freq_direct",
        "word_freq_cs",
        "word_freq_meeting",
        "word_freq_original",
        "word_freq_project",
        "word_freq_re",
        "word_freq_edu",
        "word_freq_table",
        "word_freq_conference",
        "char_freq_;",
        "char_freq_(",
        "char_freq_[",
        "char_freq_!",
        "char_freq_$",
        "char_freq_#",
        "capital_run_length_average",
        "capital_run_length_longest",
        "capital_run_length_total",
    ]
)

In [9]:
import pandas as pd

In [20]:
test_df = pd.concat([pd.DataFrame(Xtest, columns=features.tolist()), pd.DataFrame(ytest, columns=['Spam'])], axis=1)
display(test_df.head())
print(test_df.shape)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,Spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61.0,278.0,1
1,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485.0,2259.0,1
2,0.15,0.0,0.46,0.0,0.61,0.0,0.3,0.0,0.92,0.76,...,0.0,0.271,0.0,0.181,0.203,0.022,9.744,445.0,1257.0,1
3,0.06,0.12,0.77,0.0,0.19,0.32,0.38,0.0,0.06,0.0,...,0.04,0.03,0.0,0.244,0.081,0.0,1.729,43.0,749.0,1
4,0.0,0.69,0.34,0.0,0.34,0.0,0.0,0.0,0.0,0.0,...,0.0,0.056,0.0,0.786,0.0,0.0,3.728,61.0,261.0,1


(1536, 58)


In [21]:
train_df = pd.concat([pd.DataFrame(X, columns=features.tolist()), pd.DataFrame(y, columns=['Spam'])], axis=1)
display(train_df.head())
print(train_df.shape)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,Spam
0,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101.0,1028.0,1
1,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40.0,191.0,1
2,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40.0,191.0,1
3,0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,...,0.0,0.223,0.0,0.0,0.0,0.0,3.0,15.0,54.0,1
4,0.0,0.0,0.0,0.0,1.92,0.0,0.0,0.0,0.0,0.64,...,0.0,0.054,0.0,0.164,0.054,0.0,1.671,4.0,112.0,1


(3065, 58)


In [24]:
train_df.describe()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,Spam
count,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,...,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0
mean,0.110819,0.228486,0.274153,0.062969,0.317788,0.095755,0.113546,0.107217,0.088923,0.241719,...,0.037954,0.138396,0.018183,0.265471,0.079128,0.053422,4.900629,52.675041,282.203915,0.39739
std,0.327252,1.373834,0.484063,1.334772,0.66357,0.260613,0.373958,0.414731,0.264054,0.68542,...,0.235502,0.278921,0.121674,0.87131,0.259719,0.51923,27.245399,220.584047,607.414933,0.489438
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.6,6.0,35.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.066,0.0,0.0,0.0,0.0,2.28,15.0,97.0,0.0
75%,0.0,0.0,0.41,0.0,0.39,0.0,0.0,0.0,0.0,0.16,...,0.0,0.191,0.0,0.315,0.056,0.0,3.737,43.0,272.0,1.0
max,4.54,14.28,5.1,42.81,9.09,3.57,7.27,11.11,3.33,18.18,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


# 1. Dataset Statistics

In [3]:
# look some dataset statistics
scipy.stats.describe(X)

DescribeResult(nobs=3065, minmax=(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 1.]), array([4.5400e+00, 1.4280e+01, 5.1000e+00, 4.2810e+01, 9.0900e+00,
       3.5700e+00, 7.2700e+00, 1.1110e+01, 3.3300e+00, 1.8180e+01,
       2.0000e+00, 9.6700e+00, 5.5500e+00, 5.5500e+00, 2.8600e+00,
       1.0160e+01, 7.1400e+00, 9.0900e+00, 1.8750e+01, 6.3200e+00,
       1.1110e+01, 1.7100e+01, 5.4500e+00, 9.0900e+00, 2.0000e+01,
       1.4280e+01, 3.3330e+01, 4.7600e+00, 1.4280e+01, 4.7600e+00,
       4.7600e+00, 4.7600e+00, 1.8180e+01, 4.7600e+00, 2.0000e+01,
       7.6900e+00, 6.8900e+00, 7.4000e+00, 9.7500e+00, 4.7600e+00,
       7.1400e+00, 1.4280e+01, 3.5700e+00, 2.0000e+01, 2.1420e+01,
       1.6700e+01, 2.1200e+00, 1.0000e+01, 4.3850e+00, 9.7520e+00,
       4.0810e+00, 3.2478e+01, 6.0030e

In [25]:
scipy.stats.describe(y)

DescribeResult(nobs=3065, minmax=(0, 1), mean=0.39738988580750406, variance=0.23954932085067235, skewness=0.41936632478193103, kurtosis=-1.824131885638896)

In [26]:
# plot the distribution of all features
nextplot()
densities = [scipy.stats.gaussian_kde(X[:, j]) for j in range(D)]
xs = np.linspace(0, np.max(X), 200)
for j in range(D):
    plt.plot(xs, densities[j](xs), label=j)
plt.legend(ncol=5)

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x16bc7d210>

In [58]:
# this plots is not really helpful; go now explore further
# YOUR CODE HERE

selected_features = [0, 1, 2]

def plot_selected_features(X, feature_names, selected_features):
    """
    Plots kernel density estimates for the specified features.
    """
    nextplot()  # Clear or create a new plot
    
    for j in selected_features:
        density = scipy.stats.gaussian_kde(X[:, j])
        xs = np.linspace(0, np.max(X[:, j]), 200)
        plt.plot(xs, density(xs), label=feature_names[j])
    
    plt.legend(ncol=1)
    plt.title("KDE Plot of Selected Features")
    plt.show()

plot_selected_features(X, features, selected_features)

<IPython.core.display.Javascript object>

In [37]:
np.mean(X, axis=0)

array([1.10818923e-01, 2.28486134e-01, 2.74153344e-01, 6.29690049e-02,
       3.17787928e-01, 9.57553018e-02, 1.13546493e-01, 1.07216966e-01,
       8.89233279e-02, 2.41719413e-01, 5.81305057e-02, 5.37432300e-01,
       9.26231648e-02, 4.96639478e-02, 5.07210440e-02, 2.35334421e-01,
       1.47197390e-01, 1.86600326e-01, 1.66121044e+00, 7.63066884e-02,
       8.19592170e-01, 1.22727569e-01, 1.02006525e-01, 8.90799347e-02,
       5.29800979e-01, 2.62071778e-01, 7.71507341e-01, 1.14323002e-01,
       1.09487765e-01, 9.92952692e-02, 6.28156607e-02, 4.90342577e-02,
       9.27471452e-02, 4.96019576e-02, 1.02156607e-01, 9.93050571e-02,
       1.43285481e-01, 1.24274062e-02, 7.55921697e-02, 6.60456770e-02,
       4.63360522e-02, 1.32176183e-01, 4.88580750e-02, 7.11876020e-02,
       3.06590538e-01, 1.79794454e-01, 5.28874388e-03, 3.13768352e-02,
       3.79543230e-02, 1.38396411e-01, 1.81830343e-02, 2.65470799e-01,
       7.91275693e-02, 5.34218597e-02, 4.90062936e+00, 5.26750408e+01,
      

In [38]:
# Let's compute z-scores; create two new variables Xz and Xtestz.
mean_train = np.mean(X, axis=0)
std_train = np.std(X, axis=0)  

Xz = (X - mean_train) / std_train
Xtestz = (Xtest - mean_train) / std_train 

In [45]:
# Let's check. Xz and Xtestz refer to the normalized datasets just created. We
# will use them throughout.
print("mean train -- # should be all 0\n", np.mean(Xz, axis=0), '\n')  # should be all 0
print("var train -- # should be all 1\n", np.var(Xz, axis=0), '\n')  # should be all 1
print("mean test -- # what do you get here?\n",np.mean(Xtestz, axis=0), '\n')  # what do you get here?
print("var test -- \n", np.var(Xtestz, axis=0), '\n')

print("should be: 1925261.15\n", np.sum(Xz ** 3), '\n')  # should be: 1925261.15

mean train -- # should be all 0
 [ 1.85459768e-17  9.27298839e-18 -5.56379304e-17 -9.27298839e-18
  5.56379304e-17  3.70919536e-17  0.00000000e+00 -7.41839072e-17
  5.56379304e-17  0.00000000e+00 -1.85459768e-17 -2.43415945e-17
 -4.63649420e-17  1.85459768e-17  1.85459768e-17  3.70919536e-17
 -3.70919536e-17 -9.27298839e-17 -1.66913791e-16  9.27298839e-18
  1.85459768e-17  9.27298839e-18 -5.56379304e-17 -1.85459768e-17
 -6.49109188e-17 -3.70919536e-17 -1.85459768e-17  1.85459768e-17
 -2.78189652e-17  4.63649420e-17 -1.85459768e-17  5.56379304e-17
  0.00000000e+00 -1.85459768e-17  3.70919536e-17  1.85459768e-17
 -9.27298839e-18  4.63649420e-18  1.85459768e-17  9.27298839e-18
  2.31824710e-17 -2.78189652e-17 -9.27298839e-18  4.63649420e-18
 -9.27298839e-18 -9.27298839e-18  1.39094826e-17 -2.78189652e-17
 -3.70919536e-17 -6.49109188e-17  4.63649420e-18  3.70919536e-17
 -3.70919536e-17  9.27298839e-18 -9.27298839e-18  9.27298839e-18
 -7.41839072e-17] 

var train -- # should be all 1
 [1. 1

In [54]:
# Explore the normalized data
# YOUR CODE HERE

train_df_z = pd.concat([pd.DataFrame(Xz, columns=features.tolist()), pd.DataFrame(y, columns=['Spam'])], axis=1)
display(train_df_z.describe().round(3))

print('*****************************')

test_df_z = pd.concat([pd.DataFrame(Xtestz, columns=features.tolist()), pd.DataFrame(ytest, columns=['Spam'])], axis=1)
display(test_df_z.describe().round(3))

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,Spam
count,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,...,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0,3065.0
mean,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,...,-0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.397
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.489
min,-0.339,-0.166,-0.566,-0.047,-0.479,-0.367,-0.304,-0.259,-0.337,-0.353,...,-0.161,-0.496,-0.149,-0.305,-0.305,-0.103,-0.143,-0.234,-0.463,0.0
25%,-0.339,-0.166,-0.566,-0.047,-0.479,-0.367,-0.304,-0.259,-0.337,-0.353,...,-0.161,-0.496,-0.149,-0.305,-0.305,-0.103,-0.121,-0.212,-0.407,0.0
50%,-0.339,-0.166,-0.566,-0.047,-0.479,-0.367,-0.304,-0.259,-0.337,-0.353,...,-0.161,-0.26,-0.149,-0.305,-0.305,-0.103,-0.096,-0.171,-0.305,0.0
75%,-0.339,-0.166,0.281,-0.047,0.109,-0.367,-0.304,-0.259,-0.337,-0.119,...,-0.161,0.189,-0.149,0.057,-0.089,-0.103,-0.043,-0.044,-0.017,1.0
max,13.537,10.23,9.971,32.031,13.222,13.333,19.14,26.534,12.276,26.176,...,18.462,34.473,33.396,36.976,22.812,38.093,40.292,45.053,25.619,1.0


*****************************


Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,Spam
count,1536.0,1536.0,1536.0,1536.0,1536.0,1536.0,1536.0,1536.0,1536.0,1536.0,...,1536.0,1536.0,1536.0,1536.0,1536.0,1536.0,1536.0,1536.0,1536.0,1536.0
mean,-0.057,-0.034,0.04,0.006,-0.025,0.002,0.005,-0.014,0.013,-0.01,...,0.008,0.007,-0.03,0.012,-0.038,-0.053,0.032,-0.007,0.005,0.387
std,0.782,0.805,1.12,1.131,1.04,1.146,1.135,0.898,1.158,0.81,...,1.099,0.905,0.653,0.794,0.83,0.226,1.438,0.586,0.995,0.487
min,-0.339,-0.166,-0.566,-0.047,-0.479,-0.367,-0.304,-0.259,-0.337,-0.353,...,-0.161,-0.496,-0.149,-0.305,-0.305,-0.103,-0.143,-0.234,-0.463,0.0
25%,-0.339,-0.166,-0.566,-0.047,-0.479,-0.367,-0.304,-0.259,-0.337,-0.353,...,-0.161,-0.496,-0.149,-0.305,-0.305,-0.103,-0.122,-0.212,-0.407,0.0
50%,-0.339,-0.166,-0.566,-0.047,-0.479,-0.367,-0.304,-0.259,-0.337,-0.353,...,-0.161,-0.27,-0.149,-0.305,-0.305,-0.103,-0.097,-0.171,-0.315,0.0
75%,-0.339,-0.166,0.322,-0.047,0.067,-0.367,-0.304,-0.259,-0.337,-0.105,...,-0.161,0.156,-0.149,0.052,-0.147,-0.103,-0.045,-0.035,-0.046,1.0
max,6.599,10.23,8.814,31.971,14.593,22.198,19.14,13.922,19.587,6.666,...,17.621,14.819,13.282,10.686,20.105,2.948,37.319,9.02,14.503,1.0


In [55]:
## Redo the Kernel Density Plot
nextplot()
densities = [scipy.stats.gaussian_kde(Xz[:, j]) for j in range(D)]
xs = np.linspace(0, np.max(Xz), 200)
for j in range(D):
    plt.plot(xs, densities[j](xs), label=j)
plt.legend(ncol=5)

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x16c4b47d0>

In [59]:
selected_features = [0, 1, 2]
plot_selected_features(Xz, features, selected_features)

<IPython.core.display.Javascript object>

# 2. Maximum Likelihood Estimation

## Helper functions

In [None]:
def logsumexp(x):
    """Computes log(sum(exp(x)).

    Uses offset trick to reduce risk of numeric over- or underflow. When x is a
    1D ndarray, computes logsumexp of its entries. When x is a 2D ndarray,
    computes logsumexp of each column.

    Keyword arguments:
    x : a 1D or 2D ndarray
    """
    offset = np.max(x, axis=0)
    return offset + np.log(np.sum(np.exp(x - offset), axis=0))

In [64]:
# Define the logistic function. Make sure it operates on both scalars
# and vectors.
def sigma(x):
    # YOUR CODE HERE
    return 1 / (1 + np.exp(-x))

In [65]:
# this should give:
# [0.5, array([0.26894142, 0.5, 0.73105858])]
[sigma(0), sigma(np.array([-1, 0, 1]))]

[0.5, array([0.26894142, 0.5       , 0.73105858])]

In [74]:
# Define the logarithm of the logistic function. Make sure it operates on both
# scalars and vectors. Perhaps helpful: isinstance(x, np.ndarray).
def logsigma(x):
    # YOUR CODE HERE
    # Check if x is a numpy array
    if isinstance(x, np.ndarray):
        return -np.log1p(np.exp(-x))
    else:
        return -np.log1p(np.exp(-x))

In [75]:
# this should give:
# [-0.69314718055994529, array([-1.31326169, -0.69314718, -0.31326169])]
[logsigma(0), logsigma(np.array([-1, 0, 1]))]

[-0.6931471805599453, array([-1.31326169, -0.69314718, -0.31326169])]

## 2b Log-likelihood and gradient

In [84]:
def l(y, X, w):
    """Log-likelihood of the logistic regression model.

    Parameters
    ----------
    y : ndarray of shape (N,)
        Binary labels (either 0 or 1).
    X : ndarray of shape (N,D)
        Design matrix.
    w : ndarray of shape (D,)
        Weight vector.
    """
    # YOUR CODE HERE
    z = X @ w
    log_likelihood = np.sum(y * logsigma(z) + (1 - y) * logsigma(-z))
    return log_likelihood

In [85]:
# this should give:
# -47066.641667825766
l(y, Xz, np.linspace(-5, 5, D))

-47066.641667825774

In [102]:
def dl(y, X, w):
    """Gradient of the log-likelihood of the logistic regression model.

    Parameters
    ----------
    y : ndarray of shape (N,)
        Binary labels (either 0 or 1).
    X : ndarray of shape (N,D)
        Design matrix.
    w : ndarray of shape (D,)
        Weight vector.

    Returns
    -------
    ndarray of shape (D,)
    """
    # YOUR CODE HERE
    gradient = X.T @ (y - sigma(X @ w))
    return gradient

In [103]:
# this should give:
# array([  551.33985842,   143.84116318,   841.83373606,   156.87237578,
#          802.61217579,   795.96202907,   920.69045803,   621.96516752,
#          659.18724769,   470.81259805,   771.32406968,   352.40325626,
#          455.66972482,   234.36600888,   562.45454038,   864.83981264,
#          787.19723703,   649.48042176,   902.6478154 ,   544.00539886,
#         1174.78638035,   120.3598967 ,   839.61141672,   633.30453444,
#         -706.66815087,  -630.2039816 ,  -569.3451386 ,  -527.50996698,
#         -359.53701083,  -476.64334832,  -411.60620464,  -375.11950586,
#         -345.37195689,  -376.22044258,  -407.31761977,  -456.23251936,
#         -596.86960184,  -107.97072355,  -394.82170044,  -229.18125598,
#         -288.46356547,  -362.13402385,  -450.87896465,  -277.03932676,
#         -414.99293368,  -452.28771693,  -167.54649092,  -270.9043748 ,
#         -252.20140951,  -357.72497343,  -259.12468742,   418.35938483,
#          604.54173228,    43.10390907,   152.24258478,   378.16731033,
#          416.12032881])
dl(y, Xz, np.linspace(-5, 5, D))

array([ 551.33985842,  143.84116318,  841.83373606,  156.87237578,
        802.61217579,  795.96202907,  920.69045803,  621.96516752,
        659.18724769,  470.81259805,  771.32406968,  352.40325626,
        455.66972482,  234.36600888,  562.45454038,  864.83981264,
        787.19723703,  649.48042176,  902.6478154 ,  544.00539886,
       1174.78638035,  120.3598967 ,  839.61141672,  633.30453444,
       -706.66815087, -630.2039816 , -569.3451386 , -527.50996698,
       -359.53701083, -476.64334832, -411.60620464, -375.11950586,
       -345.37195689, -376.22044258, -407.31761977, -456.23251936,
       -596.86960184, -107.97072355, -394.82170044, -229.18125598,
       -288.46356547, -362.13402385, -450.87896465, -277.03932676,
       -414.99293368, -452.28771693, -167.54649092, -270.9043748 ,
       -252.20140951, -357.72497343, -259.12468742,  418.35938483,
        604.54173228,   43.10390907,  152.24258478,  378.16731033,
        416.12032881])

## 2c Gradient descent

In [108]:
# you don't need to modify this function
def optimize(obj_up, theta0, nepochs=50, eps0=0.01, verbose=True):
    """Iteratively minimize a function.

    We use it here to run either gradient descent or stochastic gradient
    descent, using arbitrarly optimization criteria.

    Parameters
    ----------
    obj_up  : a tuple of form (f, update) containing two functions f and update.
              f(theta) computes the value of the objective function.
              update(theta,eps) performs an epoch of parameter update with step size
              eps and returns the result.
    theta0  : ndarray of shape (D,)
              Initial parameter vector.
    nepochs : int
              How many epochs (calls to update) to run.
    eps0    : float
              Initial step size.
    verbose : boolean
              Whether to print progress information.

    Returns
    -------
    A triple consisting of the fitted parameter vector, the values of the
    objective function after every epoch, and the step sizes that were used.
    """

    f, update = obj_up

    # initialize results
    theta = theta0
    values = np.zeros(nepochs + 1)
    eps = np.zeros(nepochs + 1)
    values[0] = f(theta0)
    eps[0] = eps0

    # now run the update function nepochs times
    for epoch in range(nepochs):
        if verbose:
            print(
                "Epoch {:3d}: f={:10.3f}, eps={:10.9f}".format(
                    epoch, values[epoch], eps[epoch]
                )
            )
        theta = update(theta, eps[epoch])

        # we use the bold driver heuristic
        values[epoch + 1] = f(theta)
        if values[epoch] < values[epoch + 1]:
            eps[epoch + 1] = eps[epoch] / 2.0
        else:
            eps[epoch + 1] = eps[epoch] * 1.05

    # all done
    if verbose:
        print("Result after {} epochs: f={}".format(nepochs, values[-1]))
    return theta, values, eps

In [109]:
# define the objective and update function for one gradient-descent epoch for
# fitting an MLE estimate of logistic regression with gradient descent (should
# return a tuple of two functions; see optimize)
def gd(y, X):
    def objective(w):
        # YOUR CODE HERE
        return -l(y, X, w)

    def update(w, eps):
        # YOUR CODE HERE
        grad = dl(y, X, w)
        return w + eps * grad

    return (objective, update)

In [110]:
# this should give
# [47066.641667825766,
#  array([  4.13777838e+01,  -1.56745627e+01,   5.75882538e+01,
#           1.14225143e+01,   5.54249703e+01,   5.99229049e+01,
#           7.11220141e+01,   4.84761728e+01,   5.78067289e+01,
#           4.54794720e+01,   7.14638492e+01,   1.51369386e+01,
#           3.36375739e+01,   2.15061217e+01,   5.78014255e+01,
#           6.72743066e+01,   7.00829312e+01,   5.29328088e+01,
#           6.16042473e+01,   5.50018510e+01,   8.94624817e+01,
#           2.74784480e+01,   8.51763599e+01,   5.60363965e+01,
#          -2.55865589e+01,  -1.53788213e+01,  -4.67015412e+01,
#          -2.50356570e+00,  -3.85357592e+00,  -2.21819155e+00,
#           3.32098671e+00,   3.86933390e+00,  -2.00309898e+01,
#           3.84684492e+00,  -2.19847927e-01,  -1.29775457e+00,
#          -1.28374302e+01,  -2.78303173e+00,  -5.61671182e+00,
#           1.73657121e+01,  -6.81197570e+00,  -1.20249002e+01,
#           2.65789491e+00,  -1.39557852e+01,  -2.01135653e+01,
#          -2.72134051e+01,  -9.45952961e-01,  -1.02239111e+01,
#           1.52794293e-04,  -5.18938123e-01,  -3.19717561e+00,
#           4.62953437e+01,   7.87893022e+01,   1.88618651e+01,
#           2.85195027e+01,   5.04698358e+01,   6.41240689e+01])
f, update = gd(y, Xz)
[f(np.linspace(-5, 5, D)), update(np.linspace(-5, -5, D), 0.1)]

[47066.641667825774,
 array([ 4.13777838e+01, -1.56745627e+01,  5.75882538e+01,  1.14225143e+01,
         5.54249703e+01,  5.99229049e+01,  7.11220141e+01,  4.84761728e+01,
         5.78067289e+01,  4.54794720e+01,  7.14638492e+01,  1.51369386e+01,
         3.36375739e+01,  2.15061217e+01,  5.78014255e+01,  6.72743066e+01,
         7.00829312e+01,  5.29328088e+01,  6.16042473e+01,  5.50018510e+01,
         8.94624817e+01,  2.74784480e+01,  8.51763599e+01,  5.60363965e+01,
        -2.55865589e+01, -1.53788213e+01, -4.67015412e+01, -2.50356570e+00,
        -3.85357592e+00, -2.21819155e+00,  3.32098671e+00,  3.86933390e+00,
        -2.00309898e+01,  3.84684492e+00, -2.19847927e-01, -1.29775457e+00,
        -1.28374302e+01, -2.78303173e+00, -5.61671182e+00,  1.73657121e+01,
        -6.81197570e+00, -1.20249002e+01,  2.65789491e+00, -1.39557852e+01,
        -2.01135653e+01, -2.72134051e+01, -9.45952961e-01, -1.02239111e+01,
         1.52794293e-04, -5.18938123e-01, -3.19717561e+00,  4.62953

In [111]:
# you can run gradient descent!
numpy.random.seed(0)
w0 = np.random.normal(size=D)
wz_gd, vz_gd, ez_gd = optimize(gd(y, Xz), w0, nepochs=500)

Epoch   0: f=  6636.208, eps=0.010000000
Epoch   1: f=  4216.957, eps=0.010500000
Epoch   2: f=  2657.519, eps=0.011025000
Epoch   3: f=  1926.135, eps=0.011576250
Epoch   4: f=  1449.495, eps=0.012155063
Epoch   5: f=  1207.529, eps=0.012762816
Epoch   6: f=  1052.489, eps=0.013400956
Epoch   7: f=   957.275, eps=0.014071004
Epoch   8: f=   899.610, eps=0.014774554
Epoch   9: f=   882.904, eps=0.015513282
Epoch  10: f=  1017.083, eps=0.007756641
Epoch  11: f=   840.760, eps=0.008144473
Epoch  12: f=   805.649, eps=0.008551697
Epoch  13: f=   822.108, eps=0.004275848
Epoch  14: f=   746.377, eps=0.004489641
Epoch  15: f=   735.803, eps=0.004714123
Epoch  16: f=   729.780, eps=0.004949829
Epoch  17: f=   724.467, eps=0.005197320
Epoch  18: f=   719.408, eps=0.005457186
Epoch  19: f=   714.564, eps=0.005730046
Epoch  20: f=   709.932, eps=0.006016548
Epoch  21: f=   705.514, eps=0.006317375
Epoch  22: f=   701.321, eps=0.006633244
Epoch  23: f=   697.373, eps=0.006964906
Epoch  24: f=   

In [123]:
# look at how gradient descent made progess
# YOUR CODE HERE
# Plot the progression of the objective function value (negative log-likelihood)
plt.figure(figsize=(9, 4))

plt.subplot(1, 2, 1)
plt.plot(vz_gd, label="Objective (Negative Log-Likelihood)")
plt.xlabel("Epoch")
plt.ylabel("Objective Function Value")
plt.title("Progression of Objective Function")
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(ez_gd, label="Step Size (eps)", color="orange")
plt.xlabel("Epoch")
plt.ylabel("Step Size")
plt.title("Progression of Step Size")
plt.legend()

plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

## 2d Stochastic gradient descent

In [136]:
def sgdepoch(y, X, w, eps):
    """Run one SGD epoch and return the updated weight vector. """
    # Run N stochastic gradient steps (without replacement). Do not rescale each
    # step by factor N (i.e., proceed differently than in the lecture slides).
    # YOUR CODE HERE
    indices = np.random.permutation(len(y))
    
    for i in indices:
        xi = X[i]
        yi = y[i]
        
        gradient = (yi - sigma(xi @ w)) * xi
        w += eps * gradient

    return w

In [137]:
# when you run this multiple times, with 50% probability you should get the
# following result (there is one other result which is very close):
# array([ -3.43689655e+02,  -1.71161311e+02,  -5.71093536e+02,
#         -5.16478220e+01,   4.66294348e+02,  -3.71589878e+02,
#          5.21493183e+02,   1.25699230e+03,   8.33804130e+02,
#          5.63185399e+02,   1.32761302e+03,  -2.64104011e+02,
#          7.10693307e+02,  -1.75497331e+02,  -1.94174427e+02,
#          1.11641507e+02,  -3.30817509e+02,  -3.46754913e+02,
#          8.48722111e+02,  -1.89136304e+02,  -4.25693844e+02,
#         -1.23084189e+02,  -2.95894797e+02,  -2.35789333e+02,
#         -3.38695243e+02,  -3.05642830e+02,  -2.28975383e+02,
#         -2.38075137e+02,  -1.66702530e+02,  -2.27341599e+02,
#         -1.77575620e+02,  -1.49093855e+02,  -1.70028859e+02,
#         -1.50243833e+02,  -1.82986008e+02,  -2.41143708e+02,
#         -3.31047159e+02,  -5.79991185e+01,  -1.98477863e+02,
#         -1.91264948e+02,  -1.17371919e+02,  -1.66953779e+02,
#         -2.01472565e+02,  -1.23330949e+02,  -3.00857740e+02,
#         -1.95853348e+02,  -7.44868073e+01,  -1.11172370e+02,
#         -1.57618226e+02,  -1.25729512e+00,  -1.45536466e+02,
#         -1.43362438e+02,  -3.00429708e+02,  -9.84391082e+01,
#         -4.54152047e+01,  -5.26492232e+01,  -1.45175427e+02])
sgdepoch(y[1:3], Xz[1:3, :], np.linspace(-5, 5, D), 1000)

array([-343.68965468, -171.16131088, -571.0935363 ,  -51.64782199,
        466.29434839, -371.58987765,  521.49318326, 1256.99229669,
        833.80413052,  563.18539912, 1327.6130216 , -264.10401085,
        710.69330696, -175.49733124, -194.17442685,  111.6415075 ,
       -330.81750896, -346.75491303,  848.72211134, -189.13630403,
       -425.69384414, -123.08418879, -295.89479699, -235.78933296,
       -338.69524324, -305.64283021, -228.97538337, -238.07513719,
       -166.70252969, -227.34159943, -177.57562021, -149.09385494,
       -170.02885884, -150.24383253, -182.98600816, -241.14370766,
       -331.0471594 ,  -57.99911848, -198.47786321, -191.26494851,
       -117.37191884, -166.95377868, -201.47256486, -123.33094893,
       -300.85774034, -195.8533476 ,  -74.48680731, -111.17237043,
       -157.61822569,   -8.42895846, -145.53646559, -145.65820563,
       -300.42970793,  -98.43910824,  -45.41520475,  -52.64922325,
       -145.17542713])

In [138]:
# define the objective and update function for one gradient-descent epoch for
# fitting an MLE estimate of logistic regression with stochastic gradient descent
# (should return a tuple of two functions; see optimize)
def sgd(y, X):
    def objective(w):
        # YOUR CODE HERE
        return -l(y, X, w)
        
    def update(w, eps):
        return sgdepoch(y, X, w, eps)

    return (objective, update)

In [139]:
# with 50% probability, you should get:
# [40.864973045695081,
#  array([ -3.43689655e+02,  -1.71161311e+02,  -5.71093536e+02,
#          -5.16478220e+01,   4.66294348e+02,  -3.71589878e+02,
#           5.21493183e+02,   1.25699230e+03,   8.33804130e+02,
#           5.63185399e+02,   1.32761302e+03,  -2.64104011e+02,
#           7.10693307e+02,  -1.75497331e+02,  -1.94174427e+02,
#           1.11641507e+02,  -3.30817509e+02,  -3.46754913e+02,
#           8.48722111e+02,  -1.89136304e+02,  -4.25693844e+02,
#          -1.23084189e+02,  -2.95894797e+02,  -2.35789333e+02,
#          -3.38695243e+02,  -3.05642830e+02,  -2.28975383e+02,
#          -2.38075137e+02,  -1.66702530e+02,  -2.27341599e+02,
#          -1.77575620e+02,  -1.49093855e+02,  -1.70028859e+02,
#          -1.50243833e+02,  -1.82986008e+02,  -2.41143708e+02,
#          -3.31047159e+02,  -5.79991185e+01,  -1.98477863e+02,
#          -1.91264948e+02,  -1.17371919e+02,  -1.66953779e+02,
#          -2.01472565e+02,  -1.23330949e+02,  -3.00857740e+02,
#          -1.95853348e+02,  -7.44868073e+01,  -1.11172370e+02,
#          -1.57618226e+02,  -1.25729512e+00,  -1.45536466e+02,
#          -1.43362438e+02,  -3.00429708e+02,  -9.84391082e+01,
#          -4.54152047e+01,  -5.26492232e+01,  -1.45175427e+02])]
f, update = sgd(y[1:3], Xz[1:3, :])
[f(np.linspace(-5, 5, D)), update(np.linspace(-5, 5, D), 1000)]

[40.86497304569509,
 array([-3.43689655e+02, -1.71161311e+02, -5.71093536e+02, -5.16478220e+01,
         4.66294348e+02, -3.71589878e+02,  5.21493183e+02,  1.25699230e+03,
         8.33804130e+02,  5.63185399e+02,  1.32761302e+03, -2.64104011e+02,
         7.10693307e+02, -1.75497331e+02, -1.94174427e+02,  1.11641507e+02,
        -3.30817509e+02, -3.46754913e+02,  8.48722111e+02, -1.89136304e+02,
        -4.25693844e+02, -1.23084189e+02, -2.95894797e+02, -2.35789333e+02,
        -3.38695243e+02, -3.05642830e+02, -2.28975383e+02, -2.38075137e+02,
        -1.66702530e+02, -2.27341599e+02, -1.77575620e+02, -1.49093855e+02,
        -1.70028859e+02, -1.50243833e+02, -1.82986008e+02, -2.41143708e+02,
        -3.31047159e+02, -5.79991185e+01, -1.98477863e+02, -1.91264948e+02,
        -1.17371919e+02, -1.66953779e+02, -2.01472565e+02, -1.23330949e+02,
        -3.00857740e+02, -1.95853348e+02, -7.44868073e+01, -1.11172370e+02,
        -1.57618226e+02, -1.25729512e+00, -1.45536466e+02, -1.433624

In [140]:
# you can run stochastic gradient descent!
wz_sgd, vz_sgd, ez_sgd = optimize(sgd(y, Xz), w0, nepochs=500)

Epoch   0: f=  6636.208, eps=0.010000000
Epoch   1: f=   971.379, eps=0.010500000
Epoch   2: f=   794.794, eps=0.011025000
Epoch   3: f=   744.176, eps=0.011576250
Epoch   4: f=   720.713, eps=0.012155063
Epoch   5: f=   710.576, eps=0.012762816
Epoch   6: f=   696.426, eps=0.013400956
Epoch   7: f=   701.076, eps=0.006700478
Epoch   8: f=   686.690, eps=0.007035502
Epoch   9: f=   683.915, eps=0.007387277
Epoch  10: f=   684.682, eps=0.003693639
Epoch  11: f=   680.112, eps=0.003878321
Epoch  12: f=   679.357, eps=0.004072237
Epoch  13: f=   678.290, eps=0.004275848
Epoch  14: f=   677.666, eps=0.004489641
Epoch  15: f=   676.842, eps=0.004714123
Epoch  16: f=   676.032, eps=0.004949829
Epoch  17: f=   676.677, eps=0.002474914
Epoch  18: f=   675.233, eps=0.002598660
Epoch  19: f=   674.573, eps=0.002728593
Epoch  20: f=   674.133, eps=0.002865023
Epoch  21: f=   673.726, eps=0.003008274
Epoch  22: f=   673.388, eps=0.003158688
Epoch  23: f=   673.099, eps=0.003316622
Epoch  24: f=   

Epoch 211: f=   664.863, eps=0.000984496
Epoch 212: f=   664.856, eps=0.001033721
Epoch 213: f=   664.847, eps=0.001085407
Epoch 214: f=   664.842, eps=0.001139678
Epoch 215: f=   664.840, eps=0.001196661
Epoch 216: f=   664.823, eps=0.001256494
Epoch 217: f=   664.812, eps=0.001319319
Epoch 218: f=   664.828, eps=0.000659660
Epoch 219: f=   664.812, eps=0.000692643
Epoch 220: f=   664.796, eps=0.000727275
Epoch 221: f=   664.785, eps=0.000763638
Epoch 222: f=   664.779, eps=0.000801820
Epoch 223: f=   664.768, eps=0.000841911
Epoch 224: f=   664.763, eps=0.000884007
Epoch 225: f=   664.756, eps=0.000928207
Epoch 226: f=   664.746, eps=0.000974618
Epoch 227: f=   664.740, eps=0.001023349
Epoch 228: f=   664.729, eps=0.001074516
Epoch 229: f=   664.721, eps=0.001128242
Epoch 230: f=   664.713, eps=0.001184654
Epoch 231: f=   664.707, eps=0.001243887
Epoch 232: f=   664.705, eps=0.001306081
Epoch 233: f=   664.684, eps=0.001371385
Epoch 234: f=   664.690, eps=0.000685692
Epoch 235: f=   

Epoch 413: f=   663.682, eps=0.000578605
Epoch 414: f=   663.679, eps=0.000607536
Epoch 415: f=   663.674, eps=0.000637912
Epoch 416: f=   663.671, eps=0.000669808
Epoch 417: f=   663.665, eps=0.000703298
Epoch 418: f=   663.662, eps=0.000738463
Epoch 419: f=   663.656, eps=0.000775386
Epoch 420: f=   663.652, eps=0.000814156
Epoch 421: f=   663.647, eps=0.000854863
Epoch 422: f=   663.641, eps=0.000897607
Epoch 423: f=   663.636, eps=0.000942487
Epoch 424: f=   663.630, eps=0.000989611
Epoch 425: f=   663.628, eps=0.001039092
Epoch 426: f=   663.628, eps=0.000519546
Epoch 427: f=   663.622, eps=0.000545523
Epoch 428: f=   663.617, eps=0.000572799
Epoch 429: f=   663.611, eps=0.000601439
Epoch 430: f=   663.607, eps=0.000631511
Epoch 431: f=   663.603, eps=0.000663087
Epoch 432: f=   663.600, eps=0.000696241
Epoch 433: f=   663.596, eps=0.000731053
Epoch 434: f=   663.594, eps=0.000767606
Epoch 435: f=   663.588, eps=0.000805986
Epoch 436: f=   663.583, eps=0.000846286
Epoch 437: f=   

## 2e Compare GD and SGD

In [207]:
# YOUR CODE HERE
np.random.seed(0)

plt.figure(figsize=(9, 4))

plt.subplot(1, 2, 1)
plt.plot(vz_sgd, label='SGD', color='blue')
plt.title('Stochastic Gradient Descent (SGD)')
plt.xlabel('Epoch')
plt.ylabel('Negative Log-Likelihood')
plt.ylim(600, 800)
plt.axhline(y=671, color='red', linestyle='--', label='Threshold at y=671')
plt.axhline(y=660, color='black', linestyle='--', label='Threshold at y=660')
plt.axvline(x=29, color='red', linestyle='--', label='Threshold at x=29')
plt.legend()

# Plot GD results
plt.subplot(1, 2, 2)
plt.plot(vz_gd, label='GD', color='orange')
plt.title('Gradient Descent (GD)')
plt.xlabel('Epoch')
plt.ylabel('Negative Log-Likelihood')
plt.ylim(600, 800)
plt.axhline(y=678, color='red', linestyle='--', label='Threshold at y=678')
plt.axhline(y=660, color='black', linestyle='--', label='Threshold at y=660')
plt.axvline(x=29, color='red', linestyle='--', label='Threshold at x=29')
plt.legend()

plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [226]:
print('1) SVD min loglikelihood: ', np.min(vz_sgd), '\n2) GD min loglikelihood : ' , np.min(vz_gd))
print('****************************************')
print('1) SVD mean loglikelihood: ', np.mean(vz_sgd), '\n2) GD mean loglikelihood : ' , np.mean(vz_gd))
print('****************************************')
print('1) SVD median loglikelihood: ', np.median(vz_sgd), '\n2) GD median loglikelihood : ' , np.median(vz_gd))
print('****************************************')
print('1) SVD 1st quartile loglikelihood: ', np.percentile(vz_sgd, 25), '\n2) GD 1st quartile loglikelihood : ' , np.percentile(vz_gd, 25))
print('****************************************')
print('1) SVD 3rd quartile loglikelihood: ', np.percentile(vz_sgd, 75), '\n2) GD 3rd quartile loglikelihood : ' , np.percentile(vz_gd, 75))

1) SVD min loglikelihood:  663.2973296789817 
2) GD min loglikelihood :  655.413496469943
****************************************
1) SVD mean loglikelihood:  678.6860501900633 
2) GD mean loglikelihood :  693.8859677697319
****************************************
1) SVD median loglikelihood:  664.5785738153414 
2) GD median loglikelihood :  659.9852259808722
****************************************
1) SVD 1st quartile loglikelihood:  663.8652951280337 
2) GD 1st quartile loglikelihood :  657.5670859999374
****************************************
1) SVD 3rd quartile loglikelihood:  665.7772999208883 
2) GD 3rd quartile loglikelihood :  662.8279655964287


Performance Comparison: GD shows an ability to achieve better minimum and median log-likelihoods, suggesting it can consistently find better solutions than SVD. However, the higher mean log-likelihood indicates that it may be less stable and potentially overshooting optimal solutions in some cases.

Consistency and Variability: The results indicate that while SGD can yield superior results in many instances (especially seen in the lower quartiles), it may also lead to worse overall average results. The higher mean log-likelihood of GD suggests it might struggle with consistency across epochs.

Convergence Behavior: The ability of GD to reach lower minimum log-likelihoods indicates effective convergence properties in some scenarios, but the overall higher mean log-likelihood signifies potential issues with oscillations or failures to converge adequately during other epochs.

Conclusion: In summary, while GD can find better solutions in specific instances, its average performance suggests that it may require tuning of hyperparameters (like learning rate and epoch count) for more consistent results. Conversely, SVD might provide more stable outcomes but may not reach the same depths of performance as GD. Therefore, it could be beneficial to consider combining the strengths of both methods or fine-tuning GD for better average performance.

for mean: 
Discussion: Interestingly, while the minimum log-likelihood favors GD, the mean log-likelihood is worse for GD. This indicates that while GD can find a better solution in some cases (as seen with the minimum), its overall performance across all epochs is less effective than SVD. This could suggest that GD might be more prone to instability or has larger fluctuations in performance across epochs.

median:
The median log-likelihood for GD is lower than for SVD, which points to better central performance for GD. This means that in the majority of the epochs, GD is finding better-fitting models compared to SVD.

# 3 Prediction

In [230]:
wz_sgd.shape
Xz.shape

(3065, 57)

In [None]:
def predict(Xtest, w):
    """Returns vector of predicted confidence values for logistic regression with
weight vector w."""
    # YOUR CODE HERE
    


def classify(Xtest, w):
    """Returns 0/1 vector of predicted class labels for logistic regression with
weight vector w."""
    # YOUR CODE HERE

In [None]:
# Example: confusion matrix
yhat = predict(Xtestz, wz_gd)
ypred = classify(Xtestz, wz_gd)
print(sklearn.metrics.confusion_matrix(ytest, ypred))  # true x predicted

In [None]:
# Example: classification report
print(sklearn.metrics.classification_report(ytest, ypred))

In [None]:
# Example: precision-recall curve (with annotated thresholds)
nextplot()
precision, recall, thresholds = sklearn.metrics.precision_recall_curve(ytest, yhat)
plt.plot(recall, precision)
for x in np.linspace(0, 1, 10, endpoint=False):
    index = int(x * (precision.size - 1))
    plt.text(recall[index], precision[index], "{:3.2f}".format(thresholds[index]))
plt.xlabel("Recall")
plt.ylabel("Precision")

In [None]:
# Explore which features are considered important
# YOUR CODE HERE

# 4 Maximum Aposteriori Estimation

## 4a Gradient Descent

In [None]:
def l_l2(y, X, w, lambda_):
    """Log-density of posterior of logistic regression with weights w and L2
regularization parameter lambda_"""
    # YOUR CODE HERE

In [None]:
# this should give:
# [-47066.641667825766, -47312.623810682911]
[l_l2(y, Xz, np.linspace(-5, 5, D), 0), l_l2(y, Xz, np.linspace(-5, 5, D), 1)]

In [None]:
def dl_l2(y, X, w, lambda_):
    """Gradient of log-density of posterior of logistic regression with weights w
and L2 regularization parameter lambda_."""
    # YOUR CODE HERE

In [None]:
# this should give:
# [array([  551.33985842,   143.84116318,   841.83373606,   156.87237578,
#           802.61217579,   795.96202907,   920.69045803,   621.96516752,
#           659.18724769,   470.81259805,   771.32406968,   352.40325626,
#           455.66972482,   234.36600888,   562.45454038,   864.83981264,
#           787.19723703,   649.48042176,   902.6478154 ,   544.00539886,
#          1174.78638035,   120.3598967 ,   839.61141672,   633.30453444,
#          -706.66815087,  -630.2039816 ,  -569.3451386 ,  -527.50996698,
#          -359.53701083,  -476.64334832,  -411.60620464,  -375.11950586,
#          -345.37195689,  -376.22044258,  -407.31761977,  -456.23251936,
#          -596.86960184,  -107.97072355,  -394.82170044,  -229.18125598,
#          -288.46356547,  -362.13402385,  -450.87896465,  -277.03932676,
#          -414.99293368,  -452.28771693,  -167.54649092,  -270.9043748 ,
#          -252.20140951,  -357.72497343,  -259.12468742,   418.35938483,
#           604.54173228,    43.10390907,   152.24258478,   378.16731033,
#           416.12032881]),
#  array([  556.33985842,   148.66259175,   846.4765932 ,   161.33666149,
#           806.89789007,   800.06917193,   924.61902946,   625.71516752,
#           662.75867626,   474.20545519,   774.5383554 ,   355.43897054,
#           458.52686767,   237.04458031,   564.95454038,   867.16124121,
#           789.34009417,   651.44470748,   904.43352968,   545.61254171,
#          1176.21495178,   121.6098967 ,   840.68284529,   634.19739158,
#          -705.95386516,  -629.66826731,  -568.98799574,  -527.33139555,
#          -359.53701083,  -476.82191975,  -411.9633475 ,  -375.65522015,
#          -346.08624261,  -377.11329972,  -408.38904835,  -457.48251936,
#          -598.29817327,  -109.57786641,  -396.60741472,  -231.14554169,
#          -290.60642261,  -364.45545242,  -453.37896465,  -279.71789819,
#          -417.85007654,  -455.32343122,  -170.76077664,  -274.29723194,
#          -255.77283808,  -361.47497343,  -263.05325885,   414.25224198,
#           600.25601799,    38.63962335,   147.59972763,   373.34588176,
#           411.12032881])]
[dl_l2(y, Xz, np.linspace(-5, 5, D), 0), dl_l2(y, Xz, np.linspace(-5, 5, D), 1)]

In [None]:
# now define the (f,update) tuple for optimize for logistic regression, L2
# regularization, and gradient descent
def gd_l2(y, X, lambda_):
    # YOUR CODE HERE

In [None]:
# let's run!
lambda_ = 100
wz_gd_l2, vz_gd_l2, ez_gd_l2 = optimize(gd_l2(y, Xz, lambda_), w0, nepochs=500)

## 4b Effect of Prior

In [None]:
# YOUR CODE HERE

## 4c Composition of Weight Vector

In [None]:
# YOUR CODE HERE

## 5 Exploration (optional)

### 5 Exploration: PyTorch

In [None]:
# if you want to experiment, here is an implementation of logistic
# regression in PyTorch
import math
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F

# prepare the data
Xztorch = torch.FloatTensor(Xz)
ytorch = torch.LongTensor(y)
train = torch.utils.data.TensorDataset(Xztorch, ytorch)


# manual implementation of logistic regression (without bias)
class LogisticRegression(nn.Module):
    def __init__(self, D, C):
        super(LogisticRegression, self).__init__()
        self.weights = torch.nn.Parameter(
            torch.randn(D, C) / math.sqrt(D)
        )  # xavier initialization
        self.register_parameter("W", self.weights)

    def forward(self, x):
        out = torch.matmul(x, self.weights)
        out = F.log_softmax(out)
        return out


# define the objective and update function. here we ignore the learning rates
# and parameters given to us by optimize (they are stored in the PyTorch model
# and optimizer, resp., instead)
def opt_pytorch():
    model = LogisticRegression(D, 2)
    criterion = nn.NLLLoss(reduction="sum")
    # change the next line to try different optimizers
    # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    def objective(_):
        outputs = model(Xztorch)
        return criterion(outputs, ytorch)

    def update(_1, _2):
        for i, (examples, labels) in enumerate(train_loader):
            outputs = model(examples)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        W = model.state_dict()["W"]
        w = W[:, 1] - W[:, 0]
        return w

    return (objective, update)

In [None]:
# run the optimizer
learning_rate = 0.01
batch_size = 100  # number of data points to sample for gradient estimate
shuffle = True  # sample with replacement (false) or without replacement (true)

train_loader = torch.utils.data.DataLoader(train, batch_size, shuffle=True)
wz_t, vz_t, _ = optimize(opt_pytorch(), None, nepochs=100, eps0=None, verbose=True)