# Homework 2

In [2]:
import numpy as np
from scipy.optimize import minimize_scalar
from scipy.special import erf
import plotly.offline as py
from plotly.graph_objs import Scatter, Figure, Layout, Bar

py.init_notebook_mode(connected=True)

## Sampling π


### Basic properties

The probability is just the ratio between the area of the circle and that of the square, i.e:

$$\mathbb{P}[s_i = 4] = \frac{\mathcal{A}_{circle}}{\mathcal{A}_{square}}$$

  The mean and variance are easily determined:

$$
\begin{align*}
m &= \mathbb{E}[s_i] = \frac{\pi}{4} \cdot 4 + 0 = \pi \\
\Delta &= \mathbb{E}[s_i^2] - \mathbb{E}[s_i]^2 = 4 \cdot \pi - \pi^2 = \pi(4 - \pi)
\end{align*}
$$


### Estimators

The estimators $\hat{m}$ and $\hat{\Delta}$ are both unbiased. In fact:

$$
\begin{align*}

\mathbb{E}[\hat{m}] &= \frac{1}{N} \cdot N \cdot \mathbb{E}[s_i] = m \qquad \mathrm{\blacksquare} \\

\mathbb{E}[\hat{\Delta}] &= \frac{1}{N-1} \sum_{i = 1}^{N} (\mathbb{E}[s_i^2] - \mathbb{E}[\hat{m}^2])
= \frac{N}{N-1} \{ \mathbb{E}[s_i^2] - \frac{1}{N}\mathbb{E}[s_i^2] - (N - 1)\mathbb{E}[s_i]^2 \}
= \frac{N-1}{N-1}(\mathbb{E}[s_i^2] - \mathbb{E}[s_i]^2)
= \Delta \qquad \mathrm{\blacksquare}

\end{align*}
$$

.

$$
\begin{align*}
\hat{m} &= \frac{1}{N}\sum_{i=1}^{N}s_i \\
\Delta &= \frac{1}{N-1}\sum_{i=1}^{N}(s_i^2 - \hat{m}^2)
\end{align*}
$$

are unbiased


In [3]:
def sample_points(N=1):
    """Returns an array of random points [x, y] inside the square."""
    return np.random.uniform(-1, 1, (N, 2))


# Visualize the sampling
x, y = sample_points(1000).transpose()
data = [Scatter(x=x, y=y, mode='markers')]
lyt = Layout(
    title="Sampling π",
    yaxis=dict(scaleanchor="x"),
    shapes=[{'type': 'circle', 'x0': -1, 'y0': -1, 'x1': 1, 'y1': 1}]
)

py.iplot(Figure(data=data, layout=lyt))

In [8]:
def estimate_pi(sample):
    """Estimates π given a sample."""
    return 4*np.average((np.sum(sample**2, axis=1) < 1.))

# Define constants
MAX_ERROR = 0.01
VARIANCE = np.pi*(4 - np.pi)

# Sample sizes
Ns = [10, 100, 1000, 10000, 100000]

# Number of samples (of length N each) we average on
repeat = 1000

means = np.empty(len(Ns))      # empirical mean of N estimates of π
variances = np.empty(len(Ns))  # empirical variance over N estimates
p_error = np.empty(len(Ns))    # empirical probability of getting error >= MAX_ERROR

for i, N in enumerate(Ns):
    pi_estimates = np.array([estimate_pi(sample_points(N)) for _ in range(repeat)])
    variances[i] = np.var(pi_estimates, ddof=1)  # use the unbiased estimator over N-1
    means[i] = np.average(pi_estimates)
    p_error[i] = np.average(np.abs(pi_estimates - np.pi) >= MAX_ERROR)
    
        
# Plot the empirical vs theoretical variance
x = ["N = {} <br> (π = {:2.5f})".format(Ns[i], means[i]) for i in range(len(Ns))]
data = [
    Bar(x=x, y=variances, name="Empirical Variance"),
    Bar(x=x, y=[VARIANCE/n for n in Ns], name="Theoretical Variance")
]

lyt = Layout(
    title="Empirical vs theoretical variance (log scale)",
    yaxis=dict(type="log")
)

py.iplot(Figure(data=data, layout=lyt))

In [10]:
# Define formulas for bounds
dkl = lambda p, q: p*np.log(p/q) + (1-p)*np.log((1-p)/(1-q))
markov_bound = lambda err: np.pi/(np.pi + err)
cheb_bound   = lambda err, n: VARIANCE/(n * err**2)  
chern_bound  = lambda err, n: np.exp(-n*dkl((np.pi + err)/4, np.pi/4))
hoeff_bound  = lambda err, n: np.exp(-n*err**2/8) 

def p_error_normal(err, N):
    """Returns the probability of having deviations >= err for a normal distribution."""
    z = err/(np.sqrt(VARIANCE/N))  # normalize z-score
  
    return 1 - erf(z/np.sqrt(2))


# Plot a comparison between different bounds, the normal approximation and the empirical 
data = [
    Bar(x=x, y=[markov_bound(MAX_ERROR)], name="Markov bound"),
    Bar(x=x, y=[cheb_bound(MAX_ERROR, N) for N in Ns], name="Chebyshev bound"),
    Bar(x=x, y=[hoeff_bound(MAX_ERROR, N) for N in Ns], name="Hoeffding bound"),
    Bar(x=x, y=[chern_bound(MAX_ERROR, N) for N in Ns], name="Chernoff bound"),
    Bar(x=x, y=[p_error_normal(MAX_ERROR, N) for N in Ns], name="Normal approximation"),
    Bar(x=x, y=p_error, name="Empirical measure")
]

lyt = Layout(
    title="Probability of getting Error ≥ {}".format(MAX_ERROR),
    yaxis=dict(range=(0, 1))
)

py.iplot(Figure(data=data, layout=lyt))


## 2. Find the lighthouse

Since the angular distribution is uniform, considering N total flashes spreaded in the $\pi$, the number of flashes in an angle $d\theta$ is: $n(\theta)=\frac{Nd\theta}{\pi}$, yielding an angular probability density of:

$$
\begin{equation}
p(\theta)=\dfrac{1}{\pi}
\end{equation}
$$

The $x_k$ position on the shore is a function of the angle $\theta$ through: $x_k=\beta tan(\theta_k)$. Each element of lenght $dx$ then depends on $d\theta$ through:

$$
\begin{equation}
 dx=\dfrac{\beta d\theta}{cos(\theta)^{2}}
\end{equation}
$$

but 

$$
\begin{equation}
cos(\theta)=\dfrac{\beta}{\sqrt{\beta^{2}+(x_k-\alpha)^{2}}}
\end{equation}
$$

consequently

$$
\begin{equation}
p(\theta)d\theta=\dfrac{1}{\pi} \dfrac{dx}{\dfrac{\beta}{cos(\theta)^{2}}}=\dfrac{\beta dx}{\pi(\beta^{2}+(x_k-\alpha)^{2})}
\end{equation}
$$

where

$$
\begin{equation}
p(x)=\dfrac{\beta}{\pi(\beta^{2}+(x_k-\alpha)^{2})}
\end{equation}
$$




## 3. Statistical inference & maximum likelihood

$$

The log-Likelihood function is:
\begin{equation}
logL=-\dfrac{\sum_{i=1}^{N} x_{i}}{\lambda}-Nlog(Z(\lambda))
\end{equation}
Setting its first $\lambda$-derivative equal to 0 we obtain the following relation for the ML estimator $\lambda^{*}$

\begin{equation}
\dfrac{\sum_{i=1}^{N} x_{i}}{\lambda^{*2}}-N\dfrac{Z'(\lambda^{*})}{Z(\lambda^{*})}=0
\end{equation}
which can be recast in the followig form:
\begin{equation}
\lambda^{*} +\dfrac{  e^{1/\lambda^{*}}  -20 e^{20/\lambda^{*}} }     {e^{1/\lambda^{*}} - e^{20/\lambda^{*}} }     =\dfrac{\sum_{i=1}^{N} x_{i}}{N}
\end{equation}

now lets call call for simplicity 
\begin{equation}
\mu_{*}(\lambda^{*})=\lambda^{*} +\dfrac{  e^{1/\lambda^{*}}  -20 e^{20/\lambda^{*}} }     {e^{1/\lambda^{*}} - e^{20/\lambda^{*}} }
\end{equation}

We know that the sample average $\dfrac{\sum_{i=1}^{N} x_{i}}{N}$ is an unbiased estimator of the expected value of a random variable, which in our case will be a function of   $\lambda_{true}$: $\mu(\lambda_{true})$. 

This means $\mu_{*}(\lambda^{*})$ is an unbiased estimator of $\mu(\lambda_{true})$.

But it turns out that the expected value is:
\begin{equation}
\mu(\lambda_{true})=\lambda^{true} +\dfrac{  e^{1/\lambda^{true}}  -20 e^{20/\lambda^{true}} }     {e^{1/\lambda^{true}} - e^{20/\lambda^{true}} }
\end{equation}

The equality $\lambda_{*}=\lambda_{true}$ follows from the fact that $\mu\equiv\mu_{*}$ is a injective function of its argument, being its first derivative always positive.
$$

In [9]:
lambda_true = 10.
Ns = [10, 100, 1000]

samples = [sample_exp(lambda_true, N) for N in Ns]
lambdas = np.arange(5, 50, 0.1)

data = []

for s in samples:
    lh = [log_likelihood(l, s) for l in lambdas]
    i_max = np.argmax(lh)

    data.append(Scatter(
        x = lambdas,
        y = lh / np.abs(lh[i_max]),
        name = len(s)
    ))

lyt = Layout(
    title="Log-likelihood for different sample sizes",
    xaxis=dict(title="λ")
) 
py.iplot(Figure(data=data, layout=lyt) )

In [11]:
def estimate_lambda(sample, bounds=(1, 100)):
    """Estimates the Maximum Likelihood lambda parameter."""
    r = minimize_scalar(log_likelihood, args=(sample, -1), method='bounded', bounds=bounds)
    if not r.success:
        raise Exception("estimate_lambda: {}".format(r.message))
    
    return r.x


def MSE(lambda_true, N=100, repeat=10000):
    """Calculate the Mean Square Error.
    
    Args:
        lambda_true: The true value of the lambda parameter.
        N: The sample size.
        repeat: The number of samples on which the average is calculated.
    """
    errors = np.zeros(repeat)
    
    for i in range(repeat):
        s = sample_exp(lambda_true, N)
        lambda_ml = estimate_lambda(s)
        errors[i] = (lambda_ml-lambda_true)**2

    return np.average(errors)


def I(lam, N):
    """Fisher Information."""
    # @todo: this is WRONG!
    return N / lam**3 * (np.exp(-1/lam) - 400*np.exp(-20/lam) + 2*((np.exp(-1/lam) - 20*np.exp(-20/lam))/(np.exp(-1/lam) - np.exp(-20/lam)) + lam))

In [16]:
lam = 10.
s = sample_exp(lam, N=1000)

lambdas_true = np.arange(1, 20, 1.)

lyt = Layout(
    title="MSE as function of λ",
    xaxis=dict(title="λ"),
    yaxis=dict(title="MSE")
)

data = [Scatter(
        x=lambdas_true,
        y=[MSE(l, N=1000, repeat=100) for l in lambdas_true],  # this is slow…
        name="MSE"
    ),
    Scatter(
        x=lambdas_true,
        y=[1/I(l, N=1000) for l in lambdas_true],
        name="Fisher Information"
)]

py.iplot(Figure(data=data, layout=lyt))

In [18]:
I(10., N=10)

-0.3791248119914184