# Homework 2

In [2]:
import numpy as np
from scipy.optimize import minimize_scalar
from scipy.special import erf
import plotly.offline as py
from plotly.graph_objs import Scatter, Figure, Layout, Bar

# Set up Plotly's offline notebook mode before any `py.iplot` call below.
py.init_notebook_mode(connected=True)

## 1. Sampling π


### 1.1 Basic properties
The probability that $s_i$ = 4 is just the ratio between the area of the circle and that of the square (since the sampling is uniform), i.e:

$$\mathbb{P}[s_i = 4] = \frac{\mathcal{A}_{circle}}{\mathcal{A}_{square}} = \frac{\pi}{4}$$

The mean and variance of the distribution are easily determined:

$$
\begin{align*}
m &= \mathbb{E}[s_i] = \frac{\pi}{4} \cdot 4 + 0 = \pi \\
\Delta &= \mathbb{E}[s_i^2] - \mathbb{E}[s_i]^2 = 16 \cdot \frac{\pi}{4} - \pi^2 = \pi(4 - \pi)
\end{align*}
$$

### 1.2 Estimators
The estimators $\hat{m}$ and $\hat{\Delta}$, defined as follows:

$$
\begin{align*}
\hat{m} &= \frac{1}{N}\sum_{i=1}^{N}s_i \\
\hat{\Delta} &= \frac{1}{N-1}\sum_{i=1}^{N}(s_i^2 - \hat{m}^2)
\end{align*}
$$

are both unbiased. In fact:

$$
\begin{align*}
\mathbb{E}[\hat{m}] &= \frac{1}{N} \cdot N \cdot \mathbb{E}[s_i] = m \qquad \mathrm{\blacksquare} \\
\mathbb{E}[\hat{\Delta}] &= \frac{1}{N-1} \sum_{i = 1}^{N} (\mathbb{E}[s_i^2] - \mathbb{E}[\hat{m}^2])
= \frac{1}{N-1} \{ N \cdot \mathbb{E}[s_i^2] - \frac{N}{N^2} (N \cdot \mathbb{E}[s_i^2] + N (N - 1) \cdot\mathbb{E}[s_i]^2 )\}
= \frac{N-1}{N-1}(\mathbb{E}[s_i^2] - \mathbb{E}[s_i]^2)
= \Delta \qquad \mathrm{\blacksquare}
\end{align*}
$$



### 1.3 Typical error of estimation 
The variance of the estimator $\hat{m}$ is determined exploiting the fact that the $s_i$ are i.i.d

$$
\begin{align*}
\mathbb{V}[\hat{m}] &= \frac{1}{N^2} \sum_{i = 1}^{N} \mathbb{V}[s_i] = \frac {\Delta}{N}
\end{align*}
$$

The typical error done using this estimator is given by the square root of its variance:  

$\sigma = \sqrt{\frac{\Delta}{N}}$

### 1.4 Error bounds and considerations:  

In this section we will estimate the probability of having an error $\epsilon \geq 0.01$ on the $\pi$ estimation with different bounds:  

#### Markov bound

$$
\begin{align*}
\mathbb{P}[\hat{m} \geq m + \epsilon] \leq \frac{m}{m + \epsilon}   
\end{align*}
$$

As one can see by running the code, this bound carries very little information. The problem is that it doesn't depend on the variance of the estimator, thus it doesn't get better as N (the number of samples) increases. This bound is very general (it holds for all non-negative distributions with that mean), but it is not well suited to our case, in which the displacement from the mean is small.


####  Chebyshev inequality

$$
\begin{align*}
\mathbb{P}[|\hat{m} - m| \geq k{\sigma}] \leq \frac{1}{k^2}  
\end{align*}
$$  

with ${\sigma}$ being the standard deviation of the estimator $\hat{m}$. In our case we want $k{\sigma} = \epsilon$, thus:

$$
\begin{align*}
k = \epsilon \cdot \sqrt{ \frac{N}{\Delta}}    
\end{align*}
$$


#### Chernoff bound  

$$
\begin{align*}
\mathbb{P}[\hat{m} \geq m + \epsilon] \leq e^{-N \cdot D_{KL}[p + \frac{\epsilon}{4} || p]}    
\end{align*}
$$  

where the Kullback-Leibler divergence is computed between the probability distribution of $s_i$ (which has probability $p = \frac{\pi}{4}$ of assuming the value 4) and a modified one (of the same kind, i.e. Bernoulli) with parameter $q = p + \frac{\epsilon}{4}$, where the $\frac{1}{4}$ factor comes from the fact that in our case $s$ assumes the values 0 and 4 instead of the standard 0 and 1. In particular:  

$$
\begin{align*}
D_{KL}\left[p + \frac{\epsilon}{4} || p\right] &= q \log{\frac{q}{p}} + (1-q) \log{\frac{1-q}{1-p}} 
\end{align*}
$$  

Since we are considering a small displacement from the mean, the $D_{KL}$ is small. For this reason the exponential decay is slow and the bound gives us useful information only for N sufficiently large (e.g. $N = 10000$).   


#### Hoeffding bound

The distribution under investigation is sub-Gaussian (since the support of the random variable $s_i$ is finite). We can thus use this bound, which gives us a result similar to the Chernoff bound, since it is derived from it.

$$
\begin{align*}
\mathbb{P}[\hat{m} \geq m + \epsilon] \leq e^{- \frac{N \epsilon^2}{2\sigma_s^2}}  
\end{align*}
$$  

where $\sigma_s = 2$, i.e. half of the support extension of $s_i$.


#### Probability estimation using the central limit theorem

According to the central limit theorem, for N sufficiently large, we have:  $ \hat{m} \sim \mathcal{N} \left(m, \frac{\Delta}{N} \right)$   
We can estimate the probability simply by integrating the normal distribution over the $x$ such that $ |x - m| \geq \epsilon$. Note that the result obtained is not a bound but an estimate (that gets better as N increases). For small N this could be a poor estimate but, as one can see in the plot below, it shows very good agreement with the empirical estimate as N increases.


In [2]:
def sample_points(N=1):
    """Draw N points uniformly at random from the square [-1, 1) x [-1, 1).

    Args:
        N: Number of points to draw.

    Returns:
        Array of shape (N, 2); each row is one point [x, y].
    """
    low, high = -1, 1
    return np.random.uniform(low, high, size=(N, 2))


# Visualize the sampling: scatter 1000 sampled points with a circle shape
# drawn over the square [-1, 1] x [-1, 1].
x, y = sample_points(1000).T
data = [Scatter(x=x, y=y, mode='markers')]
circle_shape = {'type': 'circle', 'x0': -1, 'y0': -1, 'x1': 1, 'y1': 1}
lyt = Layout(
    title="Sampling π",
    yaxis={'scaleanchor': "x"},
    shapes=[circle_shape]
)

py.iplot(Figure(data=data, layout=lyt))

In [3]:
def estimate_pi(sample):
    """Estimates π from an array of points, one [x, y] point per row.

    Each point strictly inside the unit circle contributes 4 and every other
    point contributes 0; the estimate is the average contribution.
    """
    inside_circle = (sample**2).sum(axis=1) < 1.
    return 4*np.average(inside_circle)

# Constants (also used by the cells below)
MAX_ERROR = 0.01               # error threshold on the estimate of π
VARIANCE = np.pi*(4 - np.pi)   # theoretical variance of a single s_i

# Sample sizes
Ns = [10, 100, 1000, 10000, 100000]

# Number of independent samples (of length N each) we average on
repeat = 1000

n_sizes = len(Ns)
means = np.empty(n_sizes)      # empirical mean of the `repeat` estimates of π
variances = np.empty(n_sizes)  # empirical variance over the `repeat` estimates
p_error = np.empty(n_sizes)    # empirical probability of getting error >= MAX_ERROR

for i, N in enumerate(Ns):
    pi_estimates = np.fromiter(
        (estimate_pi(sample_points(N)) for _ in range(repeat)),
        dtype=float,
        count=repeat,
    )
    means[i] = pi_estimates.mean()
    variances[i] = pi_estimates.var(ddof=1)  # unbiased estimator (divides by repeat - 1)
    p_error[i] = np.mean(np.abs(pi_estimates - np.pi) >= MAX_ERROR)


# Plot the empirical vs theoretical variance.
# `x` holds one category label per sample size and is reused by the next plot.
x = ["N = {} <br> (π = {:2.5f})".format(n, m) for n, m in zip(Ns, means)]
theoretical_variances = [VARIANCE/n for n in Ns]
data = [
    Bar(x=x, y=variances, name="Empirical Variance"),
    Bar(x=x, y=theoretical_variances, name="Theoretical Variance")
]

lyt = Layout(
    title="Empirical vs theoretical variance (log scale)",
    yaxis={'type': "log"}
)

py.iplot(Figure(data=data, layout=lyt))

In [4]:
# Define formulas for bounds
def dkl(p, q):
    """Kullback-Leibler divergence D_KL[p || q] between Bernoulli parameters p and q."""
    return p*np.log(p/q) + (1-p)*np.log((1-p)/(1-q))


def markov_bound(err):
    """Markov bound on the probability that the estimate is >= π + err."""
    return np.pi/(np.pi + err)


def cheb_bound(err, n):
    """Chebyshev bound on a deviation >= err for n samples (uses VARIANCE)."""
    return VARIANCE/(n * err**2)


def chern_bound(err, n):
    """Chernoff bound for n samples, with Bernoulli parameters (π + err)/4 and π/4."""
    return np.exp(-n*dkl((np.pi + err)/4, np.pi/4))


def hoeff_bound(err, n):
    """Hoeffding bound for n samples: exp(-n err² / 8)."""
    return np.exp(-n*err**2/8)

def p_error_normal(err, N):
    """Two-sided tail probability of a normal distribution for deviations >= err.

    The standard deviation used is sqrt(VARIANCE/N).

    Args:
        err: Absolute deviation from the mean.
        N: Sample size.

    Returns:
        1 - erf(z / sqrt(2)), where z is `err` in units of the standard deviation.
    """
    sigma = np.sqrt(VARIANCE/N)
    z = err/sigma  # deviation expressed as a z-score
    return 1 - erf(z/np.sqrt(2))


# Plot a comparison between the different bounds, the normal approximation and the
# empirical measure. Every trace needs one y value per category in `x`
# (i.e. one per sample size in `Ns`).
data = [
    # The Markov bound does not depend on N: repeat the value for every sample size,
    # otherwise the bar is drawn only in the first group.
    Bar(x=x, y=[markov_bound(MAX_ERROR)] * len(Ns), name="Markov bound"),
    Bar(x=x, y=[cheb_bound(MAX_ERROR, N) for N in Ns], name="Chebyshev bound"),
    Bar(x=x, y=[hoeff_bound(MAX_ERROR, N) for N in Ns], name="Hoeffding bound"),
    Bar(x=x, y=[chern_bound(MAX_ERROR, N) for N in Ns], name="Chernoff bound"),
    Bar(x=x, y=[p_error_normal(MAX_ERROR, N) for N in Ns], name="Normal approximation"),
    Bar(x=x, y=p_error, name="Empirical measure")
]

lyt = Layout(
    title="Probability of getting Error ≥ {}".format(MAX_ERROR),
    yaxis=dict(range=(0, 1))  # probabilities: bounds larger than 1 are clipped
)

py.iplot(Figure(data=data, layout=lyt))


## 2. Find the lighthouse

Since the angular distribution is uniform, considering N total flashes spread over an angle $\pi$, the number of flashes in an angle $d\theta$ is: $n(\theta)=\frac{Nd\theta}{\pi}$, yielding an angular probability density of:

$$
\begin{equation}
p(\theta)=\dfrac{1}{\pi}
\end{equation}
$$

The $x_k$ position on the shore is a function of the angle $\theta$ through: $x_k=\alpha + \beta \tan(\theta_k)$. Each element of length $dx$ then depends on $d\theta$ through:

$$
\begin{equation}
 dx=\dfrac{\beta d\theta}{cos(\theta)^{2}}
\end{equation}
$$

but 

$$
\begin{equation}
cos(\theta)=\dfrac{\beta}{\sqrt{\beta^{2}+(x_k-\alpha)^{2}}}
\end{equation}
$$

consequently

$$
\begin{equation}
p(\theta)d\theta=\dfrac{1}{\pi} \dfrac{dx}{\dfrac{\beta}{cos(\theta)^{2}}}=\dfrac{\beta dx}{\pi(\beta^{2}+(x_k-\alpha)^{2})}
\end{equation}
$$

where

$$
\begin{equation}
p(x)=\dfrac{\beta}{\pi(\beta^{2}+(x_k-\alpha)^{2})}
\end{equation}
$$




In [62]:
def sample_cauchy(alpha, beta, N=1):
    """Draw N samples from a Cauchy distribution with location alpha and scale beta.

    Standard Cauchy draws are scaled by `beta` and shifted by `alpha`.
    """
    standard_draws = np.random.standard_cauchy(N)
    return alpha + beta * standard_draws

def pdf_cauchy(x, alpha, beta):
    """Cauchy probability density at x: beta / (π (beta² + (x - alpha)²))."""
    offset = x - alpha
    return beta / (np.pi * (beta**2 + offset**2))

def log_likelihood(sample, alpha, beta):
    """Log-likelihood of `sample` under a Cauchy distribution with parameters alpha, beta.

    Computed as the sum of the logarithms of `pdf_cauchy` evaluated on the sample.
    """
    densities = pdf_cauchy(sample, alpha, beta)
    return np.log(densities).sum()


# True parameters used to generate the data.
# NOTE(review): the original cell used `alpha` without ever defining it, so it fails
# on a fresh kernel. 30.0 (the centre of the scanned range below) is assumed here;
# confirm against the intended value.
alpha_true = 30.0
beta = 10.0
Ns = [10, 100, 1000]

# Candidate values of alpha on which the log-likelihood is evaluated
alphas = np.arange(10, 50, 0.1)
samples = [sample_cauchy(alpha_true, beta, N) for N in Ns]

data = []

for s in samples:
    lh = np.array([log_likelihood(s, alpha, beta) for alpha in alphas])
    i_max = np.argmax(lh)

    # Log-likelihood curve, rescaled so that its maximum has absolute value 1
    data.append(Scatter(
        x = alphas,
        y = lh / np.abs(lh[i_max]),
        name = len(s)
    ))

    # Marker on the maximum of the curve; after the rescaling its height is ±1
    data.append(Scatter(
        x = [alphas[i_max]],
        y = [np.sign(lh[i_max])],
        mode = "markers",
        marker = dict(size=15, color="red"),
        name = "Maximum ({})".format(len(s)),
        showlegend = False
    ))

py.iplot(data)


The sample mean does not coincide with the mode of the posterior even in the limit $N\rightarrow\infty$.

The law of large numbers states that if the expectation $\mu=E(x)$ exists then, as $N\rightarrow\infty$, $\forall \varepsilon > 0$:

\begin{equation}
P \left(\left|\dfrac   {\sum_{1}^{N} X_{i}}{N}-\mu\right|>\varepsilon\right)\longrightarrow 0
\end{equation}

The Cauchy distribution is symmetric around its mode, therefore its expectation, if finite,  will coincide with it, and relying on the law of large numbers we could state that as $N\rightarrow\infty$ the sample average will tend to $\mu$.

But the expectation of the Cauchy distribution is undefined (the defining integral does not converge), so the law of large numbers does not apply and we can't say anything about the behaviour of the sample average; in particular, we can't say that the sample average will coincide with the mode.

In [65]:
# Show the sample average of each Cauchy sample drawn above, to expose its deviation.
for s in samples:
    print(np.mean(s))

30.5496401779
111.407455481
18.3255142665


## 3. Statistical inference & maximum likelihood


The log likelihood function is
\begin{equation}
\mathcal{L} =-\dfrac{\sum_{i=1}^{N} x_{i}}{\lambda}-N\log(Z(\lambda))
\end{equation}
Setting its first derivative in $\lambda$ to 0 we obtain the following relation for the ML estimator $\lambda^{*}$

\begin{equation}
\dfrac{\sum_{i=1}^{N} x_{i}}{\lambda^{*2}}-N\dfrac{Z'(\lambda^{*})}{Z(\lambda^{*})}=0
\end{equation}
which can be recast in the form:
\begin{equation}
\lambda^{*} +\dfrac{  e^{-1/\lambda^{*}}  -20 e^{-20/\lambda^{*}} }     {e^{-1/\lambda^{*}} - e^{-20/\lambda^{*}} }     =\dfrac{\sum_{i=1}^{N} x_{i}}{N}
\end{equation}

Let's call this quantity $a^{*}(\lambda^{*})$ for simplicity.

$a^{*}$ is itself an estimator, being a function from the sample data to the space of parameters; more precisely, to the image of the parameter space under the injective map $a^{*}(\lambda^{*})$ (injective because $da^{*}/d\lambda^{*}$ is always positive).

Let's consider the expectation value of $a^{*}$ taken over the probability distribution $p_{\lambda}(x)$:
\begin{equation}
E_{\lambda}(a^{*})=E_{\lambda}\left(\dfrac{\sum_{i=1}^{N} x_{i}}{N}\right)=\dfrac{\sum_{i=1}^{N} E_{\lambda}(x_{i})}{N}= E_{\lambda}(x)\doteq\mu(\lambda)
\end{equation}
and this means that $a^{*}$ is an unbiased estimator of the mean of the distribution $\mu(\lambda)$. But the mean itself can be computed to be:

\begin{equation}
\mu(\lambda)=\lambda +\dfrac{  e^{-1/\lambda}  -20 e^{-20/\lambda} }     {e^{-1/\lambda} - e^{-20/\lambda} }
\end{equation}

We see that $\mu(\lambda)$ and $a^{*}(\lambda^{*})$ have the same functional dependence on their argument, let's call this common function $m$. 
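Since $m(\cdot)$ is injective, saying that $m(\lambda^{*})$ is an unbiased estimator of $m(\lambda)$ identifies $\lambda^{*}$ as an estimator of $\lambda$. Note, however, that because $m$ is nonlinear, $\mathbb{E}[m(\lambda^{*})] = m(\lambda)$ does not by itself imply $\mathbb{E}[\lambda^{*}] = \lambda$ exactly: the equivalence holds to first order in the fluctuations of $\lambda^{*}$, so $\lambda^{*}$ is an unbiased estimator of $\lambda$ only asymptotically (for large $N$).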


In [21]:
def sample_exp(lambd, N=1):
    """Draw N samples from the exponential distribution truncated to [1, 20].

    Uniform draws between exp(-20/lambd) and exp(-1/lambd) are mapped through
    -lambd * log(u).
    """
    u_low = np.exp(-20/lambd)
    u_high = np.exp(-1/lambd)
    u = np.random.uniform(u_low, u_high, N)
    return -lambd*np.log(u)


def pdf_exp(lambd, x):
    """Density of the truncated exponential at x.

    Computed as exp(-x/lambd) divided by the normalisation
    lambd * (exp(-1/lambd) - exp(-20/lambd)).
    """
    normalisation = lambd*(np.exp(-1/lambd) - np.exp(-20/lambd))
    return np.exp(-x/lambd) / normalisation


def log_likelihood(lambd, sample, sign=1.0):
    """Log-likelihood of `sample` under the truncated exponential with parameter lambd.

    Args:
        lambd: Parameter of the distribution.
        sample: Array of observations.
        sign: Multiplier applied to the result; pass -1 to get a quantity to minimise.

    Note: this redefinition replaces the Cauchy `log_likelihood` defined earlier
    in the notebook.
    """
    log_densities = np.log(pdf_exp(lambd, sample))
    return sign * np.sum(log_densities)


lambda_true = 10.
Ns = [10, 100, 1000]

samples = [sample_exp(lambda_true, N) for N in Ns]
# Candidate values of λ on which the log-likelihood is evaluated
lambdas = np.arange(5, 50, 0.1)

data = []

for s in samples:
    # Explicit array (as in the Cauchy cell) so the rescaling below is a plain
    # element-wise division rather than a list divided by a NumPy scalar.
    lh = np.array([log_likelihood(l, s) for l in lambdas])
    i_max = np.argmax(lh)

    # Rescale so that the maximum of each curve has absolute value 1
    data.append(Scatter(
        x = lambdas,
        y = lh / np.abs(lh[i_max]),
        name = len(s)
    ))

lyt = Layout(
    title="Log-likelihood for different sample sizes",
    xaxis=dict(title="λ")
)
py.iplot(Figure(data=data, layout=lyt))

In [11]:
def estimate_lambda(sample, bounds=(1, 100)):
    """Maximum-likelihood estimate of lambda for the given sample.

    Args:
        sample: Array of observations.
        bounds: Interval of lambda values searched by the bounded minimiser.

    Returns:
        The lambda value found by the minimiser.

    Raises:
        Exception: If the minimiser does not report success.
    """
    # sign=-1 turns the log-likelihood into a cost that can be minimised
    result = minimize_scalar(
        log_likelihood,
        bounds=bounds,
        args=(sample, -1),
        method='bounded',
    )
    if result.success:
        return result.x

    raise Exception("estimate_lambda: {}".format(result.message))


def MSE(lambda_true, N=100, repeat=10000):
    """Mean Square Error of the maximum-likelihood estimate of lambda.

    Args:
        lambda_true: The true value of the lambda parameter.
        N: The sample size.
        repeat: The number of independent samples the squared error is averaged over.

    Returns:
        The average of (lambda_ml - lambda_true)**2 over the `repeat` samples.
    """
    squared_errors = np.fromiter(
        ((estimate_lambda(sample_exp(lambda_true, N)) - lambda_true)**2
         for _ in range(repeat)),
        dtype=float,
        count=repeat,
    )
    return squared_errors.mean()


def I(lambd, N):
    """Fisher Information, computed as N * Var[x] / lambd**4.

    Var[x] is the closed-form variance of the exponential truncated to [1, 20].
    """
    inv = -1/lambd
    near, far = np.exp(inv), np.exp(20*inv)
    norm = near - far
    ratio_first = (near - 20*far)/norm
    ratio_second = (near - 400*far)/norm
    variance = ratio_second + lambd**2 - ratio_first**2

    return N*variance/(lambd**4)


# True λ values for which the MSE of the ML estimator is measured
lambdas_true = np.arange(1, 20, 1.)

lyt = Layout(
    title="MSE as function of λ",
    xaxis=dict(title="λ"),
    yaxis=dict(title="MSE")
)

data = [
    Scatter(
        x=lambdas_true,
        y=[MSE(l, N=10000, repeat=1000) for l in lambdas_true],  # this is slow…
        name="MSE"
    ),
    # This trace shows the inverse of the Fisher information, i.e. the Cramér–Rao
    # lower bound on the variance of an unbiased estimator, so label it as such.
    Scatter(
        x=lambdas_true,
        y=[1/I(l, N=10000) for l in lambdas_true],
        name="Cramér–Rao bound (1 / Fisher Information)"
    )
]

py.iplot(Figure(data=data, layout=lyt))