In [2]:
# Add lib to sys.path
import os
import sys
import time

import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import math
from sklearn.preprocessing import normalize
from functools import partial
import h5py
from scipy.spatial import distance

# Put the parent of the current working directory on the import path (only
# once) before the `libs.*` imports below run.
# NOTE(review): presumably `libs` lives in that parent directory — confirm.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from matplotlib.colors import ListedColormap
import libs.linear_models as lm
import libs.data_util as data
import libs.nn as nn
import libs.plot as myplot

%matplotlib inline

#### Exercise 9.1

$x_g = (47,35)$, $x_b = (22, 40)$, $x_u = (21, 36)$

With income measured in thousands, $x_u$ is closer to Mr. Bad than to Mr. Good (distance $\approx 4.12$ vs. $\approx 26.02$), so the BoL should NOT give him credit.

If the income is measured in dollars, Mr. Unknown is closer to Mr. Good, so the BoL should give him credit.

In [5]:
# Exercise 9.1: Euclidean distance from Mr. Unknown (xu) to Mr. Good (xg) and
# Mr. Bad (xb), computed once per income unit. Only the second coordinate
# (income) changes between the two runs: it is multiplied by 1000 for dollars.
for header, income_scale in (
    ('--- Income measured in K', 1),
    ('--- Income measured in dollars', 1000),
):
    print(header)
    xg = np.array([47, 35 * income_scale])
    xb = np.array([22, 40 * income_scale])
    xu = np.array([21, 36 * income_scale])

    # The norm of the difference vector is the L2 distance between the points.
    d_ug = np.linalg.norm(xu - xg)
    d_ub = np.linalg.norm(xu - xb)
    print(f"--- Distance from unknow to Mr. Good: {d_ug}")
    print(f"--- Distance from unknow to Mr. Bad: {d_ub}")

--- Income measured in K
--- Distance from unknow to Mr. Good: 26.019223662515376
--- Distance from unknow to Mr. Bad: 4.123105625617661
--- Income measured in dollars
--- Distance from unknow to Mr. Good: 1000.3379428972991
--- Distance from unknow to Mr. Bad: 4000.000124999998


#### Exercise 9.2

\begin{align*}
Z &= \gamma X \\
&= (I - \frac{1}{N}1 1^T)X \\
&= X - \frac{1}{N} 1 1^T X\\
&= X - 1\frac{1}{N} \begin{bmatrix}\sum^N_{i=1} x_{i1} & \sum^N_{i=1} x_{i2} & \dots & \sum^N_{i=1} x_{id}\end{bmatrix}  \\
&= X - 1\begin{bmatrix}\bar{x}_1 & \bar{x}_2 & \dots & \bar{x}_d\end{bmatrix}\\
&=  X - 1\bar{x}^T \\
\end{align*}

#### Exercise 9.3

\begin{align*}
Z &= \begin{bmatrix}z^T_1 \\ \dots \\ z^T_N \end{bmatrix}\\
&= \begin{bmatrix}(Dx_1)^T \\ \dots \\ (Dx_N)^T\end{bmatrix}\\
&= \begin{bmatrix}x_1^TD^T \\ \dots \\ x_N^TD^T \end{bmatrix}\\
&= \begin{bmatrix}x_1^TD \\ \dots \\ x_N^TD \end{bmatrix} \quad \text{(since $D$ is diagonal, $D^T = D$)}\\
&= XD \\
\end{align*}

\begin{align*}
Z^TZ &= (XD)^TXD \\
&= D^TX^TXD\\
&= DX^TXD\\
\end{align*}

#### Exercise 9.4

* (a) $\text{variance}(x_1) = \text{variance}(\hat{x}_1) = 1$, $\text{variance}(x_2) = \text{variance}(\sqrt{1-\epsilon^2}\hat{x}_1+\epsilon\hat{x}_2) = (1-\epsilon^2)\text{variance}(\hat{x}_1)+ \epsilon^2 \text{variance}(\hat{x}_2) = 1$

$\text{covariance}(x_1,x_2) = E[(x_1 - \bar{x}_1)(x_2 - \bar{x}_2)] = E[x_1x_2] = E[\sqrt{1-\epsilon^2}\hat{x}^2_1 + \epsilon\hat{x}_1\hat{x}_2] = \sqrt{1-\epsilon^2}$

* (b) 

\begin{align*}
f(x) &= w_1x_1 + w_2x_2 \\
&= w_1\hat{x}_1 + w_2 (\sqrt{1-\epsilon^2}\hat{x}_1 + \epsilon \hat{x}_2) \\
&= (w_1 + w_2 \sqrt{1-\epsilon^2})\hat{x}_1 + w_2\epsilon \hat{x}_2 \\
&= \hat{w}_1\hat{x}_1 + \hat{w}_2 \hat{x}_2 \\
\end{align*}

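Matching coefficients gives $\hat{w}_1 = w_1 + w_2 \sqrt{1-\epsilon^2}, \hat{w}_2 = w_2\epsilon$, or equivalently $w_2 = \frac{\hat{w}_2}{\epsilon}, w_1 = \hat{w}_1 - \frac{\sqrt{1-\epsilon^2}}{\epsilon}\hat{w}_2$, so $f$ is linear in $x_1,x_2$.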

* (c) From problem (b), we have $\hat{w}_1 = \hat{w}_2 = 1$, so we have $w_1 = \frac{\epsilon - \sqrt{1-\epsilon^2}}{\epsilon}, w_2 = \frac{1}{\epsilon}$, so that $C \ge w^2_1 + w^2_2 = 2\frac{1-\epsilon\sqrt{1-\epsilon^2}}{\epsilon^2}$

* (d) As $\epsilon \to 0$, the minimum $C \to \infty$. This means that a huge $C$ (i.e. almost no regularization) is needed to be able to implement the target function; in the limit no finite $C$ suffices.

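* (e) If there is significant noise in the data and the inputs are correlated, it is hard to regularize the learning: a constraint loose enough to implement the target also leaves room to fit the noise, so overfitting is likely. The variance term can therefore be high while the bias stays low.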

#### Exercise 9.5

