# Assignment 1b

In [1]:
import numpy as np
import pandas as pd

class log_reg:
    """
    link_function: specifies the link function. Possible values are
                   "sigmoid" and "softmax"
                   
    note: working under the assumption that the rows and columns of
          the data corresponds to observations and variables respectively
    """
    def __init__(self, step_size = 0.01, epochs = 10000, random_init = False, 
                link_function = "sigmoid"):
        self.step_size = step_size
        self.epochs = epochs
        self.random_init = random_init
        self.link_function = link_function
        
    def sigmoid(self, z):
        return 1/(1+np.exp(-z))
    
    def softmax(self, z):
        """
        return: n by K matrix where n is the sample
                and K are the possible outcomes of y
        """
        return np.exp(z)/np.sum(np.exp(z), axis = 0)

    def loss(self, h, y):
        # h: sigmoid applies to z
        return (-y * np.log(h) - (1-y) * np.log(1-h)).mean()
    
    def add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis = 1)
    
    def fit(self, X, y):
        # add an intercept for the b term
        X = self.add_intercept(X)
        # initialize weights dependeing on link function
        if not self.random_init and self.link_function.lower() == 'sigmoid':
            self.w = np.zeros(X.shape[1]).reshape(X.shape[1],1)
        elif self.random_init and self.link_function.lower() == 'sigmoid':
            self.w = np.random.rand(X.shape[1]).reshape(X.shape[1],1)
        elif self.link_function.lower() == "softmax":
            k = np.unique(y).size
            # generated from N(0, .01)
            self.w = np.random.normal(0, .01, k * X.shape[1]).reshape(X.shape[1], k)
            
        
        # time for model fitting
        for i in range(self.epochs):
            z = np.dot(X, self.w)
            h = self.sigmoid(z)
            # as calculated in the exercises but not including
            # the gradient wrt b as it is accounted for in 
            # the data matrix by the column of 1s
            gradient = np.dot(X.T, (h-y)) / y.size
            # update
            self.w -= self.step_size * gradient     
            
            z = np.dot(X, self.w)
            h = self.sigmoid(z)
            loss = self.loss(h, y)
    
    def predict(self, X, p_cutoff = .5):
        X = self.add_intercept(X)
        predicted_prob = self.sigmoid(np.dot(X, self.w))
        return predicted_prob >= p_cutoff

The log_reg class is what I used in assignment 1a.

## Exercise 2.1
We are now looking at the multinomial distribution with $K$ possible outcomes. That is, our data is of the form $\{\mathbf{x}_i, y_i\}_{i=1}^n$ where $y_i \in \{1, \ldots, K\}$. 

For this end, I will use the softmax function which is defined as
$$
p_i^{(k)} = \frac{\exp\left\{ \mathbf{w}_{(k)}^T \mathbf{x}_i + b\right\}}{\sum_{j=1}^K\exp\left\{ \mathbf{w}_{(j)}^T \mathbf{x}_i + b \right\}},
$$
where $p_i^{(k)} = P(\mathbf{y}_{(i)} = k| \mathbf{x}_i, \mathbf{w})$, and $\mathbf{w}$ is now $p \times K$. This means that
$$
p_i = \frac{1}{\sum_{j=1}^K\exp\left\{ \mathbf{w}_{(j)}^T \mathbf{x}_i + b \right\}} 
\begin{bmatrix} \exp\{\mathbf{w}^T_{(1)}\mathbf{x}_i\} \\ \vdots \\
\exp\{\mathbf{w}^T_{(K)}\mathbf{x}_i\}
\end{bmatrix}
$$

The cross entropy loss, with the softmax activation (correct word?) function is then
$$
J = - \frac{1}{n} \sum_{i=1}^n L_i = - \frac{1}{n} \sum_{i=1}^n \sum_{k = 1}^K y_{ik} \log p_i^{(k)},
$$
which simplifies to the cost function of assignment 1a with $K = 2$. It should also be noted that I will treat $\mathbf{y}$ as a $n\times K$ matrix where each row has one $1$ and the rest are zeroes, with the position of the $1$ corresponding to class adherence of that observation.

Consider the derivative of $p_i^{(k)}$, whilst dropping the intercept as it will be accounted for by inserting a column of ones into $\mathbf{X}$,
$$
\begin{aligned}
\frac{\partial p_i^{(k)}}{\partial \mathbf{w}_k} &= \frac{\partial}{\partial \mathbf{w}_k}\frac{\exp\left\{ \mathbf{w}_k^T \mathbf{x}_i\right\}}{\sum_{j=1}^K\exp\left\{ \mathbf{w}_j^T \mathbf{x}_i\right\}} = \mathbf{x}_i \frac{\exp\left\{ \mathbf{w}_k^T \mathbf{x}_i\right\}}{\sum_{j=1}^K \exp\left\{ \mathbf{w}_j^T \mathbf{x}_i\right\}} - \mathbf{x}_i \left(\frac{\exp\left\{ \mathbf{w}_k^T \mathbf{x}_i\right\}}{\sum_{j=1}^K \exp\left\{ \mathbf{w}_j^T \mathbf{x}_i\right\}}\right)^2 = \mathbf{x}_i\ p_i^{(k)}\big(1-p_i^{(k)}\big),
\end{aligned}
$$
and
$$
\begin{aligned}
\frac{\partial p_i^{(k)}}{\partial \mathbf{w}_l} = \exp\left\{ \mathbf{w}_k^T \mathbf{x}_i\right\} \frac{\partial}{\partial \mathbf{w}_l} \frac{1}{\sum_{j=1}^K \exp\left\{ \mathbf{w}_j^T \mathbf{x}_i\right\}} = \exp\left\{ \mathbf{w}_k^T \mathbf{x}_i\right\} \left( - \mathbf{x}_i\frac{\exp\left\{ \mathbf{w}_l^T \mathbf{x}_i\right\}}{\left(\sum_{j=1}^K \exp\left\{ \mathbf{w}_j^T \mathbf{x}_i\right\}\right)^2} \right) = -\mathbf{x}_i p_i^{(k)}p_i^{(l)}
\end{aligned}
$$

Taking the derivative of $L_i$ wrt to $\mathbf{w}_l$ gives
$$
\begin{aligned}
\frac{\partial L_i}{\partial\mathbf{w}_l} &=- \frac{\partial}{\partial \mathbf{w}_l} \sum_{k=1}^K y_{ik}\log p^{(k)}_i = - \sum_{k=1}^K y_{ik} \frac{1}{ p^{(k)}_i} \frac{\partial}{\partial \mathbf{w}_l} p^{(k)}_i = - x_i \frac{y_{ik}}{p_i^{(k)}}p_i^{(k)}(1-p_i^{(l)}) + \sum_{k \neq l} x_i \frac{y_{ik}}{p_i^{(k)}}p_i^{(k)}p_i^{(l)} \\
&= - x_i y_{ik}\big(1-p_i^{(l)} \big) + x_i\sum_{k\neq l}y_{ik}p_i^{(l)} = x_i \left( \sum_{k\neq l}y_{ik}p_i^{(l)} -  y_{ik}\big(1-p_i^{(l)} \big) \right) = x_i \big(p_i^{(l)} - y_{ik} \big),
\end{aligned}
$$
where the last step follows from the one hot encoding of $\mathbf{y}_i$.

The derivative of the entropy loss wrt $w_{k}$ is then
$$
\begin{aligned}
\frac{\partial J}{\partial w_{k}} &= -\frac{1}{n} \sum_{i=1}^n \frac{\partial}{\partial w_k} \sum_{k=1}^K \mathbb{1}\{y_i = k\}\log\left(\frac{\exp\left\{ \mathbf{w}_{(k)}^T \mathbf{x}_i + b\right\}}{\sum_{j=1}^K\exp\left\{ \mathbf{w}_{(j)}^T \mathbf{x}_i + b \right\}}\right) \\
&= -\frac{1}{n} \sum_{i=1}^n \mathbb{1}\{y_i = k\}\left[ \frac{\partial}{\partial w_k}  \mathbf{w}_{(k)}^T \mathbf{x}_i - \frac{\partial}{\partial w_k}\log \sum_{j=1}^K\exp\left\{ \mathbf{w}_{(j)}^T \mathbf{x}_i + b \right\} \right] \\
&= -\frac{}{}
\end{aligned}
$$

In [4]:
# softmax test
X = np.array([1.,2.,3.,4.,5.,6.]).reshape(2,3)
w = np.array([-1., 2., -2., 1., .5, 3.]).reshape(3,2)

def softmax(z):
    return np.exp(z)/ np.sum(np.exp(z), axis = 0)

In [5]:
softmax(X.dot(w))

array([[9.99447221e-01, 1.52299795e-08],
       [5.52778637e-04, 9.99999985e-01]])