# Astronomy 8824 - Problem Set 5 
The goal of this problem set is to gain familiarity with Fisher Matrix Forecasts.

This problem set was developed by David Weinberg, with some modifications by Paul Martini.

In [1]:
import numpy as np
from numpy import matrix
from numpy import linalg
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from astropy.io import ascii

# matplotlib settings
SMALL_SIZE = 14
MEDIUM_SIZE = 16
BIGGER_SIZE = 18

# Apply all plot defaults in one go; each key is "<group>.<option>".
plt.rcParams.update({
    'font.size': SMALL_SIZE,            # default text size
    'axes.titlesize': SMALL_SIZE,       # fontsize of the axes title
    'axes.labelsize': BIGGER_SIZE,      # fontsize of the x and y labels
    'lines.linewidth': 2,               # width of plotted lines
    'axes.linewidth': 2,                # width of the axes frame
    'xtick.labelsize': MEDIUM_SIZE,     # fontsize of the x tick labels
    'ytick.labelsize': MEDIUM_SIZE,     # fontsize of the y tick labels
    'legend.fontsize': MEDIUM_SIZE,     # legend fontsize
    'figure.titlesize': BIGGER_SIZE,    # fontsize of the figure title
})

In [2]:
def gaussian(x, mu, sig):
    '''
    Evaluate the normalized Gaussian probability density at x.

    x: point(s) at which to evaluate the density (scalar or numpy array)
    mu: mean of the Gaussian
    sig: dispersion (standard deviation) of the Gaussian
    Returns the density at x, with the same shape as x.
    '''
    # Offset from the mean in units of the dispersion
    z = (x - mu)/sig
    # Normalization so that the density integrates to one
    norm = np.sqrt(2*np.pi)*sig
    return np.exp(-0.5 * np.power(z, 2.)) / norm

def PlotTwoDist(xy1, label1, xy2=None, label2=None, dims=None, addgauss=False, gxsig=False, gysig=False, xy2hist=True, axes=["X", "Y"], connect=20): 
    '''
    Scatter plot of one or two 2-D point distributions, with the marginal
    histograms of X on top and of Y on the right.

    xy1, xy2: (x,y) points for the two distributions (xy2 is optional)
    label1, label2: labels for the two distributions
    dims: [xc, yc, dx, dy] where xc,yc are the plot center and dx, dy are the half sizes of the plot.
          If None, matplotlib chooses the plot limits and the histogram bins.
    addgauss: (bool) True to overplot Gaussian on each histogram
              (only drawn if gxsig and gysig are both set)
    gxsig, gysig: (float) sigma values for the two histograms
    xy2hist: (bool) Plot histograms for second distribution
    axes: ["X", "Y"] Labels for axes
    connect: (default 20) Initial set of points of xy2 to connect with a solid line

    The Gaussians are centered on (xc, yc) when dims is given. Without dims
    they are centered on zero and span the range shown in the scatter panel.
    '''

    if dims is not None:
        xc, yc, dx, dy = dims[:4]
        # 40 bins across the full width of the plot in each direction
        xbins = np.arange(xc - dx, xc + dx, 0.05*dx)
        ybins = np.arange(yc - dy, yc + dy, 0.05*dy)
    else:
        # No plot window given: fall back on matplotlib's default binning
        # (an empty list of bin edges produces no usable histogram).
        xbins = None
        ybins = None

    fig = plt.figure(figsize=(8,8))
    gs = GridSpec(4,4)
    ax_scatter = fig.add_subplot(gs[1:4, 0:3])
    ax_xhist = fig.add_subplot(gs[0,0:3])
    ax_yhist = fig.add_subplot(gs[1:4,3])

    # First distribution: black points and black histograms
    ax_scatter.scatter(xy1[0], xy1[1], color='k', s=1, label=label1)
    ax_xhist.hist(xy1[0], bins=xbins, histtype='step', color='k', density=True)
    ax_yhist.hist(xy1[1], bins=ybins, histtype='step', color='k', density=True, orientation='horizontal')

    # Second distribution: red points, with the first `connect` points joined
    # by a line so the start of a chain can be followed by eye
    if xy2 is not None:
        ax_scatter.scatter(xy2[0], xy2[1], color='r', s=1, label=label2)
        ax_scatter.plot(xy2[0][:connect], xy2[1][:connect], color='r', ls='-')
        if xy2hist:
            ax_xhist.hist(xy2[0], bins=xbins, histtype='step', color='r', density=True)
            # The Y histogram must use the Y bins, not the X bins
            ax_yhist.hist(xy2[1], bins=ybins, histtype='step', color='r', density=True, orientation='horizontal')

    if addgauss and gxsig and gysig:
        if dims is not None:
            gxc, gyc, gdx, gdy = xc, yc, dx, dy
        else:
            # No plot window: center on zero and cover what the scatter panel shows
            gxc, gyc = 0., 0.
            gdx = np.max(np.abs(ax_scatter.get_xlim()))
            gdy = np.max(np.abs(ax_scatter.get_ylim()))
        ggxx = np.linspace(gxc - gdx, gxc + gdx, 100)
        ggyy = np.linspace(gyc - gdy, gyc + gdy, 100)
        ax_xhist.plot(ggxx, gaussian(ggxx, gxc, gxsig))
        # The Y histogram is horizontal, so the density goes on the x axis
        ax_yhist.plot(gaussian(ggyy, gyc, gysig), ggyy)

    # The histogram panels share their data axis with the scatter panel,
    # so their tick labels are redundant
    plt.setp(ax_xhist.get_xticklabels(), visible=False)
    plt.setp(ax_xhist.get_yticklabels(), visible=False)
    plt.setp(ax_yhist.get_xticklabels(), visible=False)
    plt.setp(ax_yhist.get_yticklabels(), visible=False)
    ax_scatter.set_xlabel(axes[0], fontsize=16)
    ax_scatter.set_ylabel(axes[1], fontsize=16)
    if dims is not None:
        ax_scatter.set_xlim(xc - dx, xc + dx)
        ax_scatter.set_ylim(yc - dy, yc + dy)
        ax_xhist.set_xlim(xc - dx, xc + dx)
        ax_yhist.set_ylim(yc - dy, yc + dy)

    ax_xhist.set_ylabel("N", fontsize=16)
    ax_yhist.set_xlabel("N", fontsize=16)
    ax_scatter.legend(frameon=False, fontsize=16)

In [3]:
def prob(x,cinv,prefac):
    r"""
    Return multivariate Gaussian probability
    x = vector of data values (1 x M matrix, or any array with M elements)
    cinv = inverse covariance matrix (M x M)
    prefac = prefactor for properly normalized Gaussian (float),
             taken as input so that it isn't computed every call
             should be [(2\pi)^{M/2} \sqrt{det(C)}]^{-1}
    """
    # Flatten to a plain 1-D vector so that np.matrix rows and ordinary
    # arrays are treated alike and the quadratic form is a true scalar.
    vec = np.asarray(x, dtype=float).ravel()
    arg = float(vec @ np.asarray(cinv, dtype=float) @ vec)
    return (prefac*np.exp(-arg/2.))

def rundmc(data, initvals, stepvals, nchain, nthin=1, cov=None, intype=1):
    '''
    Read in data from a file (intype=1) or array (intype=2) and use MCMC to sample the 
    distribution of parameter values of the model y = theta1 + theta2*x + theta3*x^2
    data : file name with columns (x, y, error) if intype=1, or a sequence
      (x, y, errors) of arrays if intype=2
    initvals, stepvals : arrays of initial values and step sizes. Each array is 3 elements. To only sample two 
      parameters, set last element of stepvals to be zero.
    nchain : number of steps (the starting point is the first element of the chain)
    nthin : thin the chain by this amount
    cov = input covariance matrix, will be diagonal (errors^2) if cov=None
    intype = 1 (datafile) or 2 (data)

    Returns an array with columns (theta1, theta2, theta3, p), where p is the
    normalized Gaussian probability of the data for the parameters in that row.
    Prints the fraction of trial steps that were accepted.
    Raises ValueError for an unknown intype, nchain < 1, or a covariance
    matrix whose determinant is not positive.
    '''

    if intype == 1:
        x, y, errors = np.loadtxt(data, unpack=True)
    elif intype == 2:
        x, y, errors = data[0], data[1], data[2]
    else:
        raise ValueError("Error with {} format: intype must be 1 or 2".format(data))
    if nchain < 1:
        raise ValueError("nchain must be at least 1")

    x = np.atleast_1d(np.asarray(x, dtype=float))
    y = np.atleast_1d(np.asarray(y, dtype=float))
    errors = np.atleast_1d(np.asarray(errors, dtype=float))

    if cov is None:
        # The errors are standard deviations; the covariance holds variances
        cov = np.diag(errors**2)
    else:
        cov = np.asarray(cov, dtype=float)

    cinv = np.linalg.inv(cov)
    # log of the normalization [(2 pi)^{M/2} sqrt(det C)]^{-1}, M = number of points
    sign, logdet = np.linalg.slogdet(cov)
    if sign <= 0:
        raise ValueError("covariance matrix must have a positive determinant")
    lognorm = -0.5*(y.size*np.log(2*np.pi) + logdet)

    def chisq(th1, th2, th3):
        # chi^2 = dy C^{-1} dy for the residuals from the quadratic model
        dy = y - (th1 + th2*x + th3*x*x)
        return float(dy @ cinv @ dy)

    t1, t2, t3 = initvals[0], initvals[1], initvals[2]
    step1, step2, step3 = stepvals[0], stepvals[1], stepvals[2]

    chi1 = chisq(t1, t2, t3)
    p1 = np.exp(lognorm - 0.5*chi1)        # probability at starting point

    chain = np.zeros((nchain,4))    # store (theta1,theta2,theta3,p) as elements of chain
    chain[0] = (t1, t2, t3, p1)

    naccept = 0
    for ichain in range(1, nchain):
        t1trial = t1 + step1*np.random.normal()
        t2trial = t2 + step2*np.random.normal()
        t3trial = t3 + step3*np.random.normal()
        chi2 = chisq(t1trial, t2trial, t3trial)
        # Metropolis rule, p2/p1 = exp(-(chi2 - chi1)/2). Working with the
        # chi^2 difference avoids underflow of the individual probabilities.
        # The uniform deviate is only drawn for steps to lower probability.
        if chi2 < chi1 or np.random.random() < np.exp(-0.5*(chi2 - chi1)):
            t1, t2, t3 = t1trial, t2trial, t3trial
            chi1 = chi2
            p1 = np.exp(lognorm - 0.5*chi1)
            naccept += 1
        # On rejection the chain repeats the current point and its own probability
        chain[ichain] = (t1, t2, t3, p1)

    # The starting point is not a trial, so there are nchain-1 trials
    ntrial = nchain - 1
    xaccept = float(naccept)/float(ntrial) if ntrial > 0 else 0.0
    print('%.4f of trials accepted' % (xaccept))
    return chain[::nthin]

LaTeX macros hidden here --

$\newcommand{\xhat}{\hat{x}}$
$\newcommand{\xmin}{x_{min}}$
$\newcommand{\xmax}{x_{max}}$
$\newcommand{\cinvkl}{C_{kl}^{-1}}$

### 1. Fisher Matrix Forecast, linear fit

Suppose you have 20 (x,y) data points generated from a linear relation y = $\theta_1 + \theta_2 x$ with x uniformly distributed in the range $5 < x < 20$ and independent Gaussian errors on y with standard deviation $\sigma = 1$. 

#### a. What is the Fisher matrix?

$$ 
F_{ij} = -\left< \frac{\partial^2 \ln L}{\partial\theta_i \partial\theta_j} \right> ?
$$
Express your answer in terms of $\sigma$ and $x$. 

#### Answer


#### b) What is the inverse of the Fisher matrix? 

Express your answer in terms of $\sigma$, $x$, and the minimum and maximum values $x_{min}$ and $x_{max}$. _Hint:_ For $x$ uniformly distributed between $x_{min}$ and $x_{max}$, the expectation value is $\langle x \rangle = (x_{max} + x_{min})/2.$

#### Answer 


#### c) If both the intercept $\theta_1$ and the slope $\theta_2$ are to be estimated by fitting the data, what are the expected errors on $\theta_1$ and $\theta_2$?

Use your results from b) to compute numerical values for $\Delta \theta_1$ and $\Delta \theta_2$.

#### Answer



#### d) If the slope $\theta_2$ is known and you only need to solve for $\theta_1$, what is the expected error? 

#### Answer 



#### e) How does the expected slope error change if N = 6 instead of N = 20? How does the expected slope error change if xmax = 15 instead of xmax = 20?

#### Answer



### 2. MCMC parameters of a linear fit

The repository includes the data files:
- line.n20.s12.dat
- line.n20.s17.dat
- line.n20.s0.dat
- line.n6.s0.dat
  
These files have (x, y, σ) data points, with σ = 1 in all cases and x evenly spaced in the range 5 – 20. They were generated by David's program linedata.py (also included), which you should look at to check that you understand what it is doing. For the two files labeled ‘s0.dat’ the points have been forced to lie exactly on the prescribed line. 

This notebook includes the function rundmc(), which reads a data file in this format and generates an MCMC for the intercept and slope (θ1, θ2) of a linear fit. You can either use this code or refer to it and write your own. Note that the probability is proportional to exp(-$\chi^2$/2), and you do not have to compute the constant of proportionality because you only need ratios of probabilities for your MCMC.

In [4]:
filenames = ['line.n20.s12.dat', 'line.n20.s17.dat', 'line.n20.s0.dat', 'line.n6.s0.dat']

# Read the four (x, y, sigma) tables, in the same order as `filenames`
data1, data2, data3, data4 = [ascii.read(name) for name in filenames]

#### a)	For the first two files, compute the best-fit slope and intercept using the formulas we have discussed in class (and are given in Numerical Recipes).

In [5]:
### Answer 


#### b) Generate an MCMC chain for the first 3 files (with N=20). Plot (theta1, theta2) for each and compare the marginal distributions of (theta1, theta2) to the Gaussian distributions with the errors you predicted from Part 1. 

#### Answer


#### c) Plot instead the distribution of ($\theta_1 + 12.5 \theta_2, \theta_2$).  Comment on the result. Relate your interpretation of this result to the Fisher matrix (think particularly about the moments that enter there). 

#### Answer 


#### d) Repeat b) for line.n6.s0.dat

#### Answer


### 3. A Third Parameter

Suppose that with the same (N = 20) data you allow a third parameter with a quadratic term, $y = \theta_1 + \theta_2 x + \theta_3 x^2$. For the fiducial model being assumed for the forecast, you adopt $\theta_3 = 0$, but you allow it to be free in the fit. 

#### a) What is the Fisher Matrix in this case? (You can do the matrix inversion numerically.)  What are the forecast errors on $\theta_1, \theta_2, \theta_3$? 

#### Answer


#### b) Use rundmc() or your own code to create a chain for this 3-parameter model. Apply it to the files line.n20.s0.dat and line.n6.s0.dat and plot the results, with particular attention to $\theta_3$ vs. $\theta_2$. Compare the errors from MCMC to your Fisher matrix forecast. 

#### Answer


### 4. Correlated Errors

The code linepluscov() generates a distribution of points with correlated errors. Run the code for 20 points in the range x = 5 – 20 with a slope $\theta_2 = 0.5$ and intercept $\theta_1 = 2$ for the random number seeds 12 and 17 used previously for the diagonal case. For this problem we are changing the slope from $\theta_2 = 2$ to $\theta_2 = 0.5$ while keeping $\sigma = 1$. This shrinks the vertical scale relative to the error bar, making the effect of correlations easier to see. 

In [6]:
def linepluscov(xmin, xmax, npoints, a, b, seed):

    '''
    Generate points on a line with errors drawn from a multivariate Gaussian with various 
    covariance matrices
    
    Parameters:
       xmin,xmax = range of x values
       npoints = number of points, evenly distributed in xmin,xmax
       a, b = slope and intercept of line
       seed = random number seed (sets the global numpy random state)

    Returns:
       dictionary with keys 'A' to 'E', one per covariance matrix. Each value
       is an (npoints, 3) array with columns x, y, error, where error is the
       diagonal error (sigma = 1) for every point.
       
    Written by DHW, modified for notebook by PM
    '''
    
    sigma = 1.0
    np.random.seed(seed)

    x = np.linspace(xmin, xmax, npoints)
    y = a*x + b

    errors = sigma*np.ones(npoints)
    variances = errors**2    # the covariance diagonal holds sigma^2, not sigma
    mu = np.zeros(npoints)
    cov = np.diag(variances)

    def neighbor_cov(offd):
        # Diagonal covariance plus covariance offd between adjacent points
        band = offd*np.ones(npoints-1)
        return cov + np.diag(band, 1) + np.diag(band, -1)

    # The order matters: each case draws from the same random stream in turn
    cases = (
        ('A', cov),
        ('B', neighbor_cov(0.2)),
        ('C', neighbor_cov(-0.2)),
        ('D', neighbor_cov(0.4)),
        # every pair of points has covariance 0.4, variances on the diagonal
        ('E', 0.4*np.ones((npoints,npoints)) + np.diag(variances - 0.4)),
    )

    output = {}
    for key, cmat in cases:
        delta = np.random.multivariate_normal(mu, cmat)
        output[key] = np.transpose([x, y + delta, errors])

    return output

# Generate two realizations, one for each of the random number seeds 12 and 17
output12, output17 = [linepluscov(5, 20, 20, 0.5, 2, seed) for seed in (12, 17)]

#### a)	What covariance matrices are being used for the five sets of data points (A, B, C, D, E) that the code produces? (Look at the code to figure out what it is doing.)

#### Answer


#### b) Plot the two realizations of N = 20 points for each of the five cases, attaching error bars, and including the y = 0.5 x + 2 line

In [7]:
#### Answer


### 5. (OPTIONAL) Parameter errors with correlated data errors

#### a) Compute the predicted errors on $\theta_1$ and $\theta_2$ for each case from Part 4. You’ll now need to compute the Fisher matrix using the expression with the full covariance matrix (Stats Notes 4, page 5) and invert it numerically.

In [8]:
#### Answer

sigma = 1.0
N = 20
xmin = 5
xmax = 20
a = 0.5    # slope
b = 2.     # intercept (kept intact below; off-diagonal terms use other names)
x = np.linspace(xmin, xmax, N)
y = a*x + b
errors = sigma*np.ones(N)
mu = np.zeros(N)

# The covariance diagonal holds the variances sigma^2
variances = errors**2
# Matrix with ones on the first upper and lower diagonals, zero elsewhere
neighbors = np.eye(N, k=1) + np.eye(N, k=-1)

# Case A: independent errors
covA = np.diag(variances)

# Case B: covariance +0.2 between adjacent points
covB = covA + 0.2*neighbors

# Case C: covariance -0.2 between adjacent points
covC = covA - 0.2*neighbors

# Case D: covariance +0.4 between adjacent points
covD = covA + 0.4*neighbors

# Case E: covariance 0.4 between every pair of points
offd = 0.4
covE = offd*np.ones((N, N)) + np.diag(variances - offd)

def get_fisher(cov, x, label):
    '''
    your code here

    Placeholder to be completed for Problem 5: as written it ignores
    cov, x and label and returns a fixed 2x2 matrix and its inverse.

    cov: covariance matrix of the data errors
    x: x values of the data points
    label: name of the case (presumably for labelling printed output)
    Returns (fish, fishinv): the Fisher matrix and its inverse
    '''
    placeholder = np.diag( (2, 2) ) # replace this
    return placeholder, linalg.inv(placeholder)

print("sigma1, sigma2")
# Fisher matrix and its inverse for each covariance case, evaluated in order A to E
(fishA, fishAinv), (fishB, fishBinv), (fishC, fishCinv), (fishD, fishDinv), (fishE, fishEinv) = [
    get_fisher(case_cov, x, case_label)
    for case_cov, case_label in (
        (covA, "Case A"),
        (covB, "Case B"),
        (covC, "Case C"),
        (covD, "Case D"),
        (covE, "Case E"),
    )
]

sigma1, sigma2


#### Answer



#### b) How do the correlated errors affect the expected parameter errors? Does the behavior make sense?

#### Answer


#### c) For D and E, check the Fisher matrix against the MCMC

#### Answer
