```
This notebook sets up and runs a set of benchmarks to compare
different numerical discretizations of the SWEs

Copyright (C) 2016  SINTEF ICT

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
```

# Random Numbers on the GPU

In order to add model error to our Shallow Water Simulators, we need to be able to generate random numbers on the GPU. Since we don't want to write such a library ourselves, we look for existing libraries.

Below is a list of existing libraries.

#### MWC64X 
http://cas.ee.ic.ac.uk/people/dt10/research/rngs-gpu-mwc64x.html#source_code

Copyright only, but within a source code file it says BSD

Looks quite old. Copyright from 2011 - no further info.

#### clRNG
https://github.com/clMathLibraries/clRNG

Last commit: Mar 3. 2016
Last commit (not readme): Sep 23 2015
Release 1.0.0 beta: May 1. 2015 ("15 commits to master since this release")

Copyright only (I think)

To cite: Technical report: http://clmathlibraries.github.io/clRNG/docs/clrng-api.pdf

(Possibly) interesting paper: http://www.iro.umontreal.ca/~lecuyer/myftp/papers/parallel-rng-imacs.pdf 
Which is published here: https://www.sciencedirect.com/science/article/pii/S0378475416300829?via%3Dihub
(received March 2014, accepted May 2016, online June 2016, published May 2017...)



#### random123
http://www.thesalmons.org/john/random123/releases/1.06/docs/index.html

Copyright only

With publication from 2011: https://dl.acm.org/citation.cfm?doid=2063405


Latest release: 1.09 - Mar 6, 2016



## Installing and testing clRNG

https://github.com/clMathLibraries/clRNG#example-instructions-for-linux
with the following changes:

     cmake -DCMAKE_INSTALL_PREFIX:PATH=/home/havahol/3rdparty/clRNG ../src/
    make install
    export CLRNG_ROOT=/home/havahol/3rdparty/clRNG/
    
Tests failed...

### And then:
When the test failed, it was time for a GPU-Ocean telco, and André recommended the approach below.


# Linear Congruential Generator (LCG)

LCG is an algorithm for generating a sequence of pseudo-random numbers. It is serial in design, since each number is computed from the previous one through the piecewise linear recurrence
$$ X_{n+1} = (a X_n + c) \bmod m.$$




REF: Section 1.1 of http://citeseer.ist.psu.edu/viewdoc/download?doi=10.1.1.53.3686&rep=rep1&type=pdf 


In [None]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Constants that also appear as literals in lcgAndre() and lcg() below.
# print() with a single argument behaves the same on Python 2 and Python 3.
a = 0x7fffffff      # value the LCG state is reduced by (2**31 - 1)
print(a)
b = 2147483648.0    # divisor that maps the reduced state to a uniform number (2**31)
print(b)
c = 1103515245      # LCG multiplier
d = 12345           # LCG increment

def lcgAndre(seed):
    """Draw one uniform number while letting the generator state grow freely.

    The state is advanced as ``seed*1103515245 + 12345`` and returned
    without being reduced; only the value used for the sample is taken
    modulo ``0x7fffffff``.

    Returns:
        ``(sample, state)`` where ``sample`` is the reduced state divided by
        ``2147483648.0`` and ``state`` is the unreduced new state.
    """
    state = 12345 + (1103515245 * seed)
    reduced = state % 0x7fffffff
    sample = reduced / 2147483648.0
    return sample, state

def lcg(seed):
    """Advance the linear congruential generator by one step.

    The new state is ``(seed*1103515245 + 12345) % 0x7fffffff``.

    Returns:
        ``(sample, state)`` where ``sample`` is the new state divided by
        ``2147483648.0`` and ``state`` is the new state itself, to be passed
        in as ``seed`` on the next call.
    """
    multiplier, increment, modulus = 1103515245, 12345, 0x7fffffff
    state = (multiplier * seed + increment) % modulus
    sample = state / 2147483648.0
    return sample, state

def boxMuller(seed):
    """Produce two normally distributed numbers with the Box-Muller transform.

    Two consecutive uniform numbers are drawn with ``lcg``; the first gives
    the radius and the second the angle of a point whose Cartesian
    coordinates are the two returned samples.

    Returns:
        ``(n1, n2, state)`` where ``state`` is the generator state after the
        two ``lcg`` draws.
    """
    uniform_radius, state = lcg(seed)
    uniform_angle, state = lcg(state)
    radius = np.sqrt(np.log(uniform_radius) * -2.0)
    angle = 2*np.pi*uniform_angle
    return radius*np.cos(angle), radius*np.sin(angle), state

def boxMullerInner(u1, u2):
    """Apply the Box-Muller transform to two given uniform numbers.

    ``u1`` determines the radius ``sqrt(-2*log(u1))`` and ``u2`` the angle
    ``2*pi*u2``.

    Returns:
        ``(n1, n2)``: the cosine and sine components scaled by the radius.
    """
    radius = np.sqrt(np.log(u1) * -2.0)
    angle = 2*np.pi*u2
    return radius*np.cos(angle), radius*np.sin(angle)
    
def boxMullerFasit(seed):
    """Box-Muller transform fed by NumPy's own uniform generator.

    The uniform numbers come from ``np.random.rand`` rather than from the
    LCG, so ``seed`` is not used and the returned state is always ``0``.

    Returns:
        ``(n1, n2, 0)``.
    """
    uniform_radius, uniform_angle = np.random.rand(), np.random.rand()
    radius = np.sqrt(np.log(uniform_radius) * -2.0)
    angle = 2.0*np.pi*uniform_angle
    return radius*np.cos(angle), radius*np.sin(angle), 0

# Quick statistical check of lcg() and boxMuller().
#
# The LCG state has to be an integer: with a float seed the product
# seed*1103515245 exceeds 2**53 and its low bits are rounded away.
seed1 = int(42124.4*np.random.rand())
N = 10000
u1 = np.zeros(N)
n1 = np.zeros(N)

# Uniform samples straight from the LCG.
for i in range(N):
    u1[i], seed1 = lcg(seed1)
# Every boxMuller() call yields two normal samples, hence N // 2 calls
# (integer division so that range() also works under Python 3).
for i in range(N // 2):
    n1[2*i], n1[2*i+1], seed1 = boxMuller(seed1)

fig = plt.figure(figsize=(4,2))
plt.hist(n1, bins=50)
print("{} {}".format(np.mean(n1), np.var(n1)))

print("---- uniform ----")
fig = plt.figure(figsize=(4,2))
plt.hist(u1, bins=50)
print("{} {}".format(np.min(u1), np.max(u1)))

# Normalized autocorrelation of the uniform samples. The series is centered
# first; without removing the mean the lag-0 value is E[u^2]/var(u) (about 4
# for uniform numbers) instead of 1, and the mean dominates every lag.
centered = u1 - u1.mean()
autocorr = np.correlate(centered, centered, mode='full')
# Keep the non-negative lags only (second half of the 'full' output).
autocorr = autocorr[autocorr.size // 2:]
# Lag k is a sum over (N - k) products, so divide by var * (N - k).
normalized_autocorr = autocorr / (u1.var() * np.arange(u1.size, 0, -1))
fig = plt.figure(figsize=(4,2))
plt.plot(normalized_autocorr)
plt.title("autocorr")

### CPU version of a *kernel* updating the global buffers


In [None]:
nx = 200
ny = 200
# Buffer size including 2 extra cells on each side of the domain
# (presumably ghost/halo cells -- confirm against the simulator).
halo_nx = nx + 2*2
halo_ny = ny + 2*2
# Every cell starts at -6.0 so that cells never written by updateRandom
# stand out in the histograms below (presumably a sentinel value).
initRandom = np.ones((halo_ny, halo_nx))*(-6.0)
initRandomCopy = initRandom.copy()
# One seed per pair of columns, since each seed produces two random numbers.
# - '//' keeps the dimension an integer under Python 3 as well.
# - The seeds are stored as 64-bit integers: the LCG product
#   seed*1103515245 is below 2**63 but above 2**53, so float64 seeds
#   would silently lose their low bits.
seeds = (np.random.rand(halo_ny, halo_nx // 2)*0x7fffffff).astype(np.int64)
#initRandomFloat = np.zeros((ny, nx), dtype=np.float32)
#initRandomFloat += initRandom

#print np.finfo(seeds[0,0])
#print np.finfo(initRandomFloat[0,0])

def updateRandom(random, seed):
    """Fill ``random`` in place with normally distributed numbers.

    CPU emulation of a GPU kernel: the two outer loops walk over thread
    blocks of ``b_dim x b_dim`` threads, the two inner loops over the threads
    of one block. Each thread owns one entry of ``seed``, advances it with
    ``boxMuller`` and writes the two resulting samples to columns ``2*x`` and
    ``2*x + 1`` of ``random``.

    Args:
        random: 2-D array that receives the samples (modified in place).
        seed: 2-D array of generator states, one per pair of columns in
            ``random`` (modified in place). Rows of ``seed`` map to rows of
            ``random``, so ``random`` needs at least as many rows as ``seed``.
    """
    (ny, nx) = seed.shape
    domain_nx = random.shape[1]
    b_dim = 16
    # Number of blocks needed to cover the seed buffer (integer ceiling).
    blocks_y = (ny + b_dim - 1) // b_dim
    blocks_x = (nx + b_dim - 1) // b_dim
    for by in range(blocks_y):
        for bx in range(blocks_x):
            for j in range(b_dim):
                for i in range(b_dim):

                    ## Content of kernel:
                    y = b_dim*by + j # thread_id
                    x = b_dim*bx + i # thread_id
                    # Threads in the padding of the last block do nothing.
                    if (x < nx and y < ny):
                        # Use the buffer passed in, not a module-level one.
                        n1, n2, seed[y, x] = boxMuller(seed[y, x])
                        if x*2 + 1 < domain_nx:
                            random[y, x*2  ] = n1
                            random[y, x*2+1] = n2
                        elif x*2 < domain_nx:
                            # Odd-width buffer: only the first sample of the
                            # pair fits in the last column.
                            random[y, x*2] = n1
                            
def soar(eta_pert, random, seed):
    """Refresh ``random`` in place by calling ``updateRandom(random, seed)``.

    ``eta_pert`` is accepted but not used by the current body.
    NOTE(review): this looks like an unfinished stub; presumably the
    perturbation of ``eta_pert`` was meant to follow -- confirm.
    """
    updateRandom(random, seed)
    
    
                    
    
# Single-argument print() calls give the same output on Python 2 and 3.
print("initRandom.shape:  {}".format(initRandom.shape))
print("seeds.shape:  {}".format(seeds.shape))
updateRandom(initRandom, seeds)

# Histogram of the buffer after it has been filled by updateRandom.
# 'density' replaces the 'normed' keyword, which matplotlib has removed.
fig = plt.figure(figsize=(4,2))
plt.hist(initRandom.flatten(), bins=50, density=True)
plt.title("initRandom")
# Histogram of the untouched copy, for comparison.
fig = plt.figure(figsize=(4,2))
plt.hist(initRandomCopy.flatten(), bins=50, density=True)
plt.title("initRandomCopy")


# Reference: samples of the same shape from NumPy's normal generator.
fasitNormal = np.random.normal(size=(halo_ny, halo_nx))
fig = plt.figure(figsize=(4,2))
plt.hist(fasitNormal.flatten(), bins=50, density=True)
plt.title("fasitNormal")

# Applying a covariance structure to a variable perturbation

We want to perturb $\eta$ with a model error $\Delta \eta$, so that the covariance of $\Delta \eta$ becomes $Q$. 

Let $\xi$ be a vector of size $n_x n_y$ (same as the vector $\eta$), with $\xi_i \sim N(0,1)$. The perturbation can be generated as $\Delta \eta = Q^{1/2} \xi$, where 
$$ Q^{1/2}(a,b) = q_0 \left[ 1 + 2 \frac{dist(a,b)}{L} \right] \exp \left\{ - \frac{dist(a,b)}{L} \right\},$$
in which $a$ and $b$ are cells, $L$ is a length scale, and $q_0$ is a scale on the amplitude.

**Recall**: if $Z$ is a random variable from $N(0,1)$ (mean 0 and variance 1), then $X = \sigma Z + \mu$ is a random variable with mean $\mu$ and variance $\sigma^2$.

In [None]:

def SOAR_Q(a_x, a_y, b_x, b_y, dx, dy, q0, L):
    """Evaluate the SOAR function between the cells ``a`` and ``b``.

    The distance between the cells is the Euclidean distance of their
    indices scaled by the cell sizes ``dx`` and ``dy``. The returned value
    is ``q0 * (1 + 2*dist/L) * exp(-dist/L)``.
    """
    x_term = dx*dx*(a_x - b_x)**2
    y_term = dy*dy*(a_y - b_y)**2
    dist = np.sqrt(x_term + y_term)
    scaled_dist = dist/L
    return q0*(1.0 + 2.0*scaled_dist)*np.exp(-scaled_dist)

def createQMatrix(nx, ny, dx=1, dy=1, q0=1, L=1):
    """Build the dense ``(ny*nx, ny*nx)`` matrix of SOAR_Q values.

    Cells are numbered row by row, so cell ``(y, x)`` has index ``y*nx + x``.
    Entry ``[j, i]`` holds ``SOAR_Q`` evaluated between cell ``j`` and
    cell ``i``.
    """
    # Cell coordinates listed in the same order as their flat index.
    cells = [(y, x) for y in range(ny) for x in range(nx)]
    Q = np.zeros((ny*nx, ny*nx))
    for j, (a_y, a_x) in enumerate(cells):
        for i, (b_y, b_x) in enumerate(cells):
            Q[j, i] = SOAR_Q(a_x, a_y, b_x, b_y, dx, dy, q0, L)
    return Q

def applyQ(xi, dx=1, dy=1, q0=0.1, L=1):
    """Apply the SOAR_Q weights to the 2-D field ``xi`` without forming the matrix.

    For every cell ``a`` the result is the sum over all cells ``b`` of
    ``SOAR_Q(a, b) * xi[b]``.

    Returns:
        A new array with the same shape and dtype as ``xi``.
    """
    (ny, nx) = xi.shape
    result = np.zeros_like(xi)
    # All cells in row-major order; reused for both the target and the source.
    cells = [(y, x) for y in range(ny) for x in range(nx)]
    for (a_y, a_x) in cells:
        # The work done for one target cell corresponds to one OpenCL thread.
        total = 0
        for (b_y, b_x) in cells:
            weight = SOAR_Q(a_x, a_y, b_x, b_y, dx, dy, q0, L)
            total += weight*xi[b_y, b_x]
        result[a_y, a_x] = total
    return result

def applyQfast(xi, dx=1, dy=1, q0=0.1, L=1, cutoff=10):
    """Apply the SOAR_Q weights to ``xi`` using only nearby cells.

    Same sum as ``applyQ``, but for each target cell only the source cells
    at most ``cutoff`` cells away in each direction (clipped to the array
    bounds) contribute.

    Returns:
        A new array with the same shape and dtype as ``xi``.
    """
    (ny, nx) = xi.shape
    result = np.zeros_like(xi)
    for a_y in range(ny):
        # Rows of the window around the target cell, clipped to the array.
        window_rows = range(max(0, a_y - cutoff), min(ny, a_y + cutoff+1))
        for a_x in range(nx):
            # The work done for one target cell corresponds to one OpenCL thread.
            window_cols = range(max(0, a_x - cutoff), min(nx, a_x + cutoff+1))
            total = 0
            for b_y in window_rows:
                for b_x in window_cols:
                    weight = SOAR_Q(a_x, a_y, b_x, b_y, dx, dy, q0, L)
                    total += weight*xi[b_y, b_x]
            result[a_y, a_x] = total
    return result

    
# Experiment: build Q explicitly on a small grid, apply it to white noise and
# compare the full sum (applyQ) with the truncated one (applyQfast).
# Single-argument print() calls give the same output on Python 2 and 3.
nx = 40
ny = 40
L = 1
q0 = 0.05
Q = createQMatrix(nx, ny, L=L, q0=q0)
fig = plt.figure(figsize=(4,4))
plt.imshow(Q, interpolation="None")
plt.colorbar()
# Q should be symmetric, so this norm is expected to be zero.
print("norm of Q - Q.T:  {}".format(np.linalg.norm(Q - Q.T)))
print("Q[5,5]:  {}".format(Q[5,5]))
print("max(Q):  {}".format(np.max(Q)))

#xi = np.random.rand(ny, nx)
xi = np.random.normal(size=(ny, nx))
fig = plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plt.imshow(xi, interpolation="None")
plt.colorbar()
Qxi = applyQ(xi, L=L, q0=q0)
plt.subplot(1,2,2)
plt.imshow(Qxi, interpolation="None")
plt.colorbar()

# Investigate the effect of cutoff
print("norm of Qxi:  {}".format(np.linalg.norm(Qxi)))
for cutoff in range(min(nx, 8)):
    Qxifast = applyQfast(xi,  L=L, q0=q0, cutoff=cutoff)
    diff = Qxi - Qxifast
    print("Diff with cutoff = {}:  {}".format(cutoff, np.linalg.norm(diff)))
    print("\tMax diff:      {}".format(np.max(np.abs(diff))))
    # Largest pointwise error as a percentage of the norm of the full result.
    print("\tMax rel diff:  {}".format(np.max(np.abs(diff))*100/np.linalg.norm(Qxi)))

# Visualize a given cutoff value:
cutoff = 3
fig = plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.imshow(xi, interpolation="None")
plt.colorbar()
Qxifast = applyQfast(xi,  L=L, q0=q0, cutoff=cutoff)
plt.subplot(1,3,2)
plt.imshow(Qxifast, interpolation="None")
plt.colorbar()
plt.subplot(1,3,3)
# The difference is drawn with the colour range of Qxifast, so the error can
# be judged against the magnitude of the field itself.
plt.imshow(Qxifast-Qxi, interpolation="None", vmin=np.min(Qxifast), vmax=np.max(Qxifast))
plt.colorbar()


#### Import modules and set up environment

In [None]:
#Lets have matplotlib "inline"
%config InlineBackend.figure_format = 'retina'

#Import packages we need
import numpy as np
from matplotlib import animation, rc
from matplotlib import pyplot as plt
from matplotlib import gridspec


import os
import pyopencl
import datetime
import sys

#Set large figure sizes
rc('figure', figsize=(16.0, 12.0))
rc('animation', html='html5')

#Import our simulator
from SWESimulators import CTCS, CDKLM16, PlotHelper, Common
#Import initial condition and bathymetry generating functions:
from SWESimulators.BathymetryAndICs import *
from SWESimulators import Drifter, CPUDrifter, GPUDrifter
from SWESimulators import DataAssimilationUtils as dautils

from SWESimulators import WindForcingEnsemble

In [None]:
#Make sure we get compiler output from OpenCL
os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1"

#Set which CL device to use, and disable kernel caching
#(device "0" on Linux, device "1" on every other platform)
if sys.platform.lower().startswith("linux"):
    os.environ["PYOPENCL_CTX"] = "0"
else:
    os.environ["PYOPENCL_CTX"] = "1"
os.environ["CUDA_CACHE_DISABLE"] = "1"
os.environ["PYOPENCL_NO_CACHE"] = "1"

#Create OpenCL context
cl_ctx = pyopencl.create_some_context()
#Single-argument print() gives the same output on Python 2 and 3
print("Using  {}".format(cl_ctx.devices[0].name))