In [2]:
import data
import numpy as np
from log_reg import LogisticRegression
from min_max_scaler import MinMaxScaler


In [18]:
scaler = MinMaxScaler()
binary_data = data.BinaryPrices()
X_train = binary_data.X_train
X_train = scaler.fit_transform(X_train)
y_train = binary_data.y_train

X_train[0]

array([0.375     , 0.25      , 0.28205128, 0.00765098, 0.        ,
       0.        , 0.        , 0.5       , 0.375     , 0.1380531 ,
       0.55299539, 0.53043478])

# Score

In [4]:
x = binary_data.X_train[0]
x = np.insert(x, 0, 1)

w = np.ones(x.shape) / x.shape

print("Feature vector: ", x)
print("Weight vector: ", w)

z = np.sum(w * x, axis=-1) 
print("Score: ", z)

Feature vector:  [1.         0.375      0.25       0.28205128 0.00765098 0.
 0.         0.         0.5        0.375      0.1380531  0.55299539
 0.53043478]
Weight vector:  [0.07692308 0.07692308 0.07692308 0.07692308 0.07692308 0.07692308
 0.07692308 0.07692308 0.07692308 0.07692308 0.07692308 0.07692308
 0.07692308]
Score:  0.30855273372347336


In [5]:
log_reg = LogisticRegression()
z = log_reg.score(x)
print(z)

0.30855273372347336


# Sigmoid

In [6]:
p = log_reg.sigmoid(x)
print(p)

0.5765319605995369


# Negative Log Likelihood



When training logistic regression models, are goal is to find the **weights $w$** that make our model's predictions most consistent with the observed data

$$p(y_i = 1 | x_i ; w) = \sigma(w^T x)$$

- **Choose weights** that make all **observed labels $y_i$** as likely as the possible given input $x_i$

**Likelihood:** Product of all the probabilities that the model assigns to each observed outcome

$$L(w) = \prod_{i=1}^N p(y_i | x_i ; w)$$

For logistic regression, this becomes

$$L(w) = \prod_{i=1}^{N}[\sigma(w^T x)]^{y_i} \cdot [1-\sigma(w^T x)]^{1-y_i}$$

Since products are messy, we take the logarithm to turn the products into sums

$$\log L(w) = \sum_{i=1}^N [y_i \log(\sigma(w^T x_i)) + (1-y_i)\log(1- \sigma(w^T x_i))]$$
- We do this because its easier to take the derivates of sums rather than products

Our goal is to **minimize** the loss function, so we flip the sign and get

\begin{align}
\textrm{NLL}(\mathbf{w}) = -\frac{1}{N}\sum_{i=1}^N \left[y_i \log \sigma(\mathbf{w}^T{\mathbf{x_i}}) + (1-y_i)\log(1 - \sigma(\mathbf{w}^T\mathbf{x_i}))\right]
\end{align}




In [7]:
X_train = binary_data.X_train[0:10]
X_train = np.concatenate([np.ones((X_train.shape[0], 1)), X_train], axis=1)
y_train = binary_data.y_train[0:10]
probs = []
print("Sample data:")
for i in range(len(X_train)):
    print(X_train[i])
print("\n")
print("Observed data:")

for i in range(len(y_train)):
    print(y_train[i])
print("\n")
print("Probabilities:")
for i in range(len(X_train)):
    x = X_train[i]
    y = y_train[i]
    z = log_reg.score(x)
    p = log_reg.sigmoid(x)
    probs.append(p)
    print(p)
print("\n")
likelihood = np.prod([probs[i] ** y_train[i] * (1 - probs[i]) ** (1 - y_train[i]) for i in range(len(probs))])
print("Likelihood:")
print(likelihood)


Sample data:
[1.         0.375      0.25       0.28205128 0.00765098 0.
 0.         0.         0.5        0.375      0.1380531  0.55299539
 0.53043478]
[1.         0.5        0.41666667 0.36467236 0.01287321 0.25
 1.         1.         1.         0.5        0.24424779 0.5437788
 0.24347826]
[1.         0.75       0.70833333 0.93447293 0.01525816 1.
 0.         1.         0.75       1.         0.86902655 0.76036866
 0.86086957]
[1.         0.375      0.33333333 0.1965812  0.01091513 0.
 0.         0.         0.5        0.5        0.24424779 0.
 0.77391304]
[1.         0.25       0.16666667 0.17378917 0.01364646 0.
 0.         0.         0.75       0.375      0.12743363 0.23041475
 0.46956522]
[1.         0.125      0.16666667 0.03133903 0.00575097 0.
 0.         0.         0.5        0.125      0.03893805 0.
 0.33913043]
[1.         0.375      0.33333333 0.16951567 0.01456845 0.
 0.         0.         0.75       0.375      0.21061947 0.
 0.74782609]
[1.         0.375      0.58333333 0.4

# Gradient

\begin{align}
\frac{\partial \textrm{NLL}}{\partial \mathbf{w}} = \frac{1}{N} \sum_i \left[\sigma(\mathbf{w}^T\mathbf{x}_i)-y_i)\right]\mathbf{x}_i
\end{align}

In [11]:
gradient = np.sum([probs[i] - y_train[i] * X_train[i] for i in range(len(probs))]) / len(X_train)
gradient

np.float64(5.996135824629257)

In [13]:
grad = sum((probs[i] - y_train[i]) * X_train[i] for i in range(len(probs))) / len(X_train)
grad

array([0.09394246, 0.06945643, 0.10653699, 0.12545962, 0.00215478,
       0.14514555, 0.06328089, 0.13102811, 0.09914447, 0.16419397,
       0.11474697, 0.10709961, 0.09187016])

TypeError: unsupported operand type(s) for /: 'list' and 'int'