In [None]:
%pylab inline


In [None]:
import scipy.stats as stats
import numpy.random as random
import scipy.interpolate as interpol
import scipy.integrate as integrate

# Test case for today: log-normal data

Let's create $10^5$ values drawn from a log-normal distribution: $x \sim e^{N(0,1)}$

In [None]:
# Exponentiate standard-normal draws so that log(data) follows N(0, 1).
data = np.exp(random.randn(100_000))

__In the below code box, plot a histogram of the array `data` as one figure (using `plt.figure()`) and a histogram of the log of the data as a second figure.  Use 100 bins for each plot.__

Calculating the mean:

In [None]:
# The NumPy function and the array method are two spellings of the same mean;
# each value is printed on its own line.
print(np.mean(data), data.mean(), sep='\n')

Calculating the median:

In [None]:
# np.median returns the middle value of the sorted sample (the 50th percentile).
print(np.median(data))

Calculating the mode:

In [None]:
# stats.mode reports the most frequent value together with its count.
# Compare the raw floating-point values with a copy rounded to two decimals,
# where many draws collapse onto the same value.
data_r = data.round(2)

print(f'Unrounded: {stats.mode(data)}')
print(f'Rounded: {stats.mode(data_r)}')

In [None]:
# 1002 edges from -0.005 to 10.005 give 1001 bins of width 0.01, each centered
# on a multiple of 0.01.
bins = np.linspace(-0.005, 10.005, 1002)
counts, edges = np.histogram(data, bins)

# The mode estimate is the midpoint of the most populated bin.
whmax = counts.argmax()
mode = 0.5 * (edges[whmax] + edges[whmax + 1])
print(mode)

## Creating a mode function

Let's define a function which can calculate the mode for any binning we might choose at the time we run it:

In [None]:
def mode2(data,**kwargs):
    """Estimate the mode of `data` as the center of its fullest histogram bin.

    All keyword arguments (typically `bins` and, optionally, `range`) are
    forwarded unchanged to `np.histogram`.  With none given, the defaults of
    `np.histogram` apply: 10 bins spanning the full range of the data.

    Returns the midpoint of the bin with the largest count; if several bins
    tie, the first (lowest) one is used.
    """
    counts, edges = np.histogram(data, **kwargs)
    peak = counts.argmax()
    # Midpoint between the left and right edges of the peak bin.
    return 0.5 * (edges[peak] + edges[peak + 1])

__Use the `mode2` function to calculate the mode for at least 3 different binnings of the data (differing in bin sizes).__  Since we used `**kwargs` we can include the `bins=` keyword within the `mode2` function call, and it gets passed to `np.histogram`.

# Calculating measures of scale

First, the standard deviation:


In [None]:
# For the raw data and then for its logarithm, print the standard deviation
# with the default divisor N (ddof=0) followed by the divisor N-1 (ddof=1).
for values in (data, log(data)):
    print(np.std(values), np.std(values, ddof=1))

### Now, the mean absolute deviation.  

__Discuss with your group: why do these values agree or not with the standard deviation?__


In [None]:
# For a Gaussian, mean(|x - mean|) = sigma * sqrt(2/pi) ~= 0.7979 * sigma, so
# dividing the mean absolute deviation by 0.7979 puts it on the same scale as
# the standard deviation.
normmeanabsdev = np.mean(np.abs(data-data.mean()))/0.7979

# Same estimator applied to log(data).
mnlog = np.mean(np.log(data) )
normmeanabsdev_log = np.mean(np.abs( np.log(data)-mnlog) )/0.7979

# Report the two normalized values computed above.
print(f'Normalized Mean Absolute Deviation: {normmeanabsdev:.5f}')
print(f'Normalized Mean Absolute Deviation of log(data): {normmeanabsdev_log:.5f}')

### Calculating the MAD:

__Discuss with your group: why do these values agree or not with the standard deviation?__


In [None]:
# For a Gaussian, median(|x - median|) ~= 0.6745 * sigma, so dividing the
# median absolute deviation (MAD) by 0.6745 puts it on the same scale as the
# standard deviation.
meddata=np.median(data)
normmad = np.median(np.abs(data-meddata))/0.6745

# The log-space version is centered on log(median(data)).
normmad_log = np.median(np.abs(np.log(data)-np.log(meddata)))/0.6745


print(f'Normalized Median Absolute Deviation: {normmad:.5f}')
print(f'Normalized Median Absolute Deviation of log(data): {normmad_log:.5f}')

### Calculating the IQR:

__Discuss with your group: why do these values agree or not with the standard deviation?__

In [None]:
# For a Gaussian the interquartile range (75th minus 25th percentile) spans
# about 1.349 sigma; dividing by that factor puts the IQR on the same scale
# as the standard deviation.
iqr_per_sigma = 1.349

d25, d75 = np.percentile(data, [25, 75])
normiqr = (d75 - d25) / iqr_per_sigma
# Log-space IQR, computed from the logs of the same two quartiles.
normiqr_log = (np.log(d75) - np.log(d25)) / iqr_per_sigma

print(f'Normalized IQR: {normiqr:.5f}')
print(f'Normalized IQR of log(data): {normiqr_log:.5f}')

# Results when averaging data together

In [None]:
# nsims independent simulations, each averaging navg standard-normal draws.
nsims = 100_000
navg = 100

# One row per simulation, one column per draw.
# NOTE: this rebinds `data`, replacing the log-normal sample used earlier.
data = random.randn(nsims, navg)

# Collapse each row to its mean: one averaged value per simulation.
means = data.mean(axis=1)

- __Using the below code block, plot a histogram of the distribution of the means from each simulation, with binsize 0.01, over the range from -2 to +2 (what should you set the `bins` and `range` keywords to be for that?)__

- __Determine the standard deviation of the array of means__

- __Now, in the above code box, create a second set of simulations with `navg` = 9.  Plot the histogram of results from these simulations on top of the histogram for averaging 100 measurements.__

__Discuss with your group: How do you explain the differences between the results (both qualitatively and quantitatively)? How does the scatter of the means scale with `navg`?__