```
This notebook sets up and runs a set of benchmarks to compare
different numerical discretizations of the SWEs

Copyright (C) 2016  SINTEF ICT

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
```

# Random Numbers on the GPU

In order to add model error to our Shallow Water Simulators, we need to be able to generate random numbers on the GPU. Since we don't want to write such a library ourselves, we look for existing libraries.

Below is a list of existing libraries.

#### MWC64X 
http://cas.ee.ic.ac.uk/people/dt10/research/rngs-gpu-mwc64x.html#source_code

Copyright only, but within a source code file it says BSD

Looks quite old. Copyright from 2011 - no further info.

#### clRNG
https://github.com/clMathLibraries/clRNG

Last commit: Mar 3. 2016
Last commit (not readme): Sep 23 2015
Release 1.0.0 beta: May 1. 2015 ("15 commits to master since this release")

Copyright only (I think)

To cite: Technical report: http://clmathlibraries.github.io/clRNG/docs/clrng-api.pdf

(Possibly) interesting paper: http://www.iro.umontreal.ca/~lecuyer/myftp/papers/parallel-rng-imacs.pdf 
Which is published here: https://www.sciencedirect.com/science/article/pii/S0378475416300829?via%3Dihub
(received March 2014, accepted May 2016, online June 2016, published May 2017...)



#### random123
http://www.thesalmons.org/john/random123/releases/1.06/docs/index.html

Copyright only

With publication from 2011: https://dl.acm.org/citation.cfm?doid=2063405


Latest release: 1.09 - Mar 6, 2016



## Installing and testing clRNG

https://github.com/clMathLibraries/clRNG#example-instructions-for-linux
with the following changes:

     cmake -DCMAKE_INSTALL_PREFIX:PATH=/home/havahol/3rdparty/clRNG ../src/
    make install
    export CLRNG_ROOT=/home/havahol/3rdparty/clRNG/
    
Tests failed...

### And then:
When the tests failed, it was time for a GPU-Ocean telco, and André recommended the approach below.


# Linear Congruential Generator (LCG)

An LCG is a linear algorithm for generating a series of pseudo-random numbers. It is serial in design, based on the piecewise linear form 
$$ X_{n+1} = (a X_n + c) \mod m.$$




REF: Section 1.1 of http://citeseer.ist.psu.edu/viewdoc/download?doi=10.1.1.53.3686&rep=rep1&type=pdf 


### Normal distributed numbers:

Since $X_{n}$ and $X_{n+1}$ from the LCG are independent random numbers from $U[0,1]$, we can use them to generate two random numbers $N_{n}$ and $N_{n+1}$ from $N(0,1)$ through the Box-Muller transform:
$$ N_{n}   = \sqrt{-2 \ln (X_{n})} \cos (2 \pi X_{n+1}) $$
$$ N_{n+1} = \sqrt{-2 \ln (X_{n})} \sin (2 \pi X_{n+1}) $$

## Set environment

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
from matplotlib import pyplot as plt
from matplotlib import animation, rc

import pyopencl
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))

#Set large figure sizes
rc('figure', figsize=(16.0, 12.0))
rc('animation', html='html5')

#Import our simulator
from SWESimulators import FBL, CTCS, KP07, CDKLM16, PlotHelper, Common

from SWESimulators import OceanStateNoise, GPUDrifterCollection


In [None]:
#Make sure we get compiler output from OpenCL
os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1"

#Set which CL device to use, and disable kernel caching
# PYOPENCL_CTX is set to "0" on Linux and to "1" on every other platform.
if (str.lower(sys.platform).startswith("linux")):
    os.environ["PYOPENCL_CTX"] = "0"
else:
    os.environ["PYOPENCL_CTX"] = "1"
os.environ["CUDA_CACHE_DISABLE"] = "1"
# NOTE(review): PYOPENCL_COMPILER_OUTPUT was already set at the top of this
# cell, so this second assignment is redundant.
os.environ["PYOPENCL_COMPILER_OUTPUT"] = "1"
os.environ["PYOPENCL_NO_CACHE"] = "1"

#Create OpenCL context
# The context and queue created here are reused by all later cells.
cl_ctx = pyopencl.create_some_context()
cl_queue = pyopencl.CommandQueue(cl_ctx)
print "Using ", cl_ctx.devices[0].name

In [None]:
nx, ny = 40, 40
floatMax = 2147483648.0

host_seed = np.random.rand(ny, nx)*floatMax
host_seed.astype(np.int64, order='C')

mf = pyopencl.mem_flags
data = pyopencl.Buffer(cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_seed)
print "hei"

np.uint64

## Start testing with random numbers and LCG

All mentioned functionality is implemented in the OceanStateNoise class.

In [None]:
# Constants of the linear congruential generator, printed for inspection.
# The same values appear as literals in lcgAndre and lcg below.
a = 0x7fffffff      # value used in the modulo operation (2**31 - 1)
print a
b = 2147483648.0    # divisor that maps the state to a uniform number (2**31)
print b
c = 1103515245      # multiplier
d = 12345           # increment

def lcgAndre(seed):
    """Take one LCG step and return ``(uniform, new_state)``.

    The new state is ``seed*1103515245 + 12345`` and is returned without
    being reduced by the modulus. The uniform number is that state modulo
    ``0x7fffffff`` divided by ``2147483648.0``.
    """
    next_state = seed * 1103515245 + 12345
    return (next_state % 0x7fffffff) / 2147483648.0, next_state

def lcg(seed):
    """Take one LCG step and return ``(uniform, new_seed)``.

    The incoming seed is cast to ``np.float32`` before the multiplication
    (presumably to mimic a seed stored in single precision -- confirm against
    the GPU kernel). The new seed is reduced modulo ``0x7fffffff`` and the
    uniform number is the new seed divided by ``2147483648.0``.
    """
    product = np.float32(seed)*1103515245
    new_seed = (np.double(product) + 12345) % 0x7fffffff
    uniform = new_seed / 2147483648.0
    return uniform, new_seed

def boxMuller(seed):
    """Return ``(n1, n2, new_seed)`` from two LCG draws and Box-Muller.

    Two consecutive uniform numbers are drawn with ``lcg`` and mapped to
    ``sqrt(-2 ln u1) * cos(2 pi u2)`` and ``sqrt(-2 ln u1) * sin(2 pi u2)``.
    The seed returned is the one left after the second draw.
    """
    uniform_a, seed = lcg(seed)
    uniform_b, seed = lcg(seed)
    radius = np.sqrt(-2.0*np.log(uniform_a))
    angle = 2*np.pi*uniform_b
    return radius*np.cos(angle), radius*np.sin(angle), seed

def boxMullerInner(u1, u2):
    """Apply the Box-Muller transform to the given pair ``(u1, u2)``.

    Returns ``(sqrt(-2 ln u1) * cos(2 pi u2), sqrt(-2 ln u1) * sin(2 pi u2))``.
    """
    radius = np.sqrt(-2.0*np.log(u1))
    angle = 2*np.pi*u2
    return radius*np.cos(angle), radius*np.sin(angle)
    
def boxMullerFasit(seed):
    """Box-Muller transform driven by ``np.random.rand`` instead of the LCG.

    ``seed`` is accepted so the signature matches ``boxMuller`` but it is not
    used; the third element of the returned tuple is always ``0``.
    """
    # Draw order matters for reproducibility: u1 first, then u2.
    u1, u2 = np.random.rand(), np.random.rand()
    radius = np.sqrt(-2.0*np.log(u1))
    angle = 2.0*np.pi*u2
    return radius*np.cos(angle), radius*np.sin(angle), 0

# Draw N uniform numbers and N normal numbers from one LCG stream and
# inspect their histograms and basic statistics.
seed1 = 42124.4*np.random.rand()
N = 100000
u1 = np.zeros(N)
n1 = np.zeros(N)
#n4 = np.random.normal(0, 1, 10000)

# Uniform samples: one LCG step per sample.
for i in range(N):
    u1[i], seed1 = lcg(seed1)
# Normal samples: each boxMuller call yields two numbers.
# NOTE: range(N/2) relies on Python 2 integer division.
for i in range(N/2):
    n1[2*i], n1[2*i+1], seed1 = boxMuller(seed1)

fig = plt.figure(figsize=(4,2))
plt.hist(n1, bins=50)
# Sample mean and variance of the normal samples.
print np.mean(n1), np.var(n1)

print "---- uniform ----"
fig = plt.figure(figsize=(4,2))
plt.hist(u1, bins=50)
print np.min(u1), np.max(u1)
print "mean u1: ", np.mean(u1)

def autocorr(x, k, mean=None, var=None):
    """Estimate the normalized autocorrelation of ``x`` for lags ``0 .. k-1``.

    For each lag ``i`` the estimate is
    ``sum_t (x[t] - mean)*(x[t+i] - mean) / ((n - i)*var)`` with ``n = len(x)``.

    Parameters:
        x: one-dimensional sequence of samples.
        k: number of lags to compute; must not exceed ``len(x)``.
        mean: mean used for centering; the sample mean of ``x`` if None.
        var: variance used for normalization; the sample variance if None.

    Returns:
        numpy array of length ``k`` with one estimate per lag.

    Raises:
        ValueError: if ``k > len(x)``, where a lag would have no overlapping
            samples and the estimate is undefined.
    """
    x = np.asarray(x, dtype=np.float64)
    n = len(x)
    if k > n:
        raise ValueError("k must not exceed len(x): got k=%d, len(x)=%d" % (k, n))
    if mean is None:
        mean = np.mean(x)
    if var is None:
        var = np.var(x)

    # Centre once and let numpy do the per-lag sum of products.
    centered = x - mean
    R = np.zeros(k)
    for lag in range(k):
        overlap = n - lag
        R[lag] = np.dot(centered[:overlap], centered[lag:]) / (overlap*var)
    return R

# Mean and variance of the uniform distribution on [0, 1], passed to autocorr
# in place of the sample estimates.
U_mean = 1.0/2.0
U_var = 1.0/12.0
k = 20
lag = autocorr(u1, k, U_mean, U_var)
# Same series, but centred/normalized with its own sample mean and variance.
# NOTE(review): biased_lag is computed but not plotted in this cell.
biased_lag = autocorr(u1, k)
normal_lag = autocorr(n1, k)
fig = plt.figure(figsize=(4,2))
# Absolute values on a log axis for the uniform and the normal samples.
plt.semilogy(np.abs(lag))
plt.semilogy(np.abs(normal_lag))
plt.title("autocorr")
plt.grid()

Small calculations on errors related to parallel LCGs. References are from L'Ecuyer's 2017 review paper.

In [None]:
# Rough estimate of how likely it is that the s per-cell seeds are all
# distinct when they are drawn from a state space of size rho.
#s = 1000*1000.
s = 200*200.
#s = 50*50.
rho = 2**31
print "State space:     ", rho
print "Number of cells: ", s
# Fraction of the state space taken up by s values.
x =  s / 2**31 
print x
print np.log(1-x)
# Logarithm of (1 - x)**(s - 1).
lnp = (s-1)*np.log(1-x)
print lnp
print "probability of none overlapping seeds: ", np.exp(lnp)
print s*s/rho

### CPU version of a *kernel* updating the global buffers


In [None]:
# Buffers for the CPU emulation of the random-number kernel.
nx = 200
ny = 200
# Two extra cells are added on each side of the domain.
halo_nx = nx + 2*2
halo_ny = ny + 2*2
# Filled with -6.0 so that cells that are never written remain recognizable.
initRandom = np.ones((halo_ny, halo_nx))*(-6.0)
initRandomCopy = initRandom.copy()
# Half as many seed columns as domain columns (updateRandom writes two numbers
# per seed). NOTE: halo_nx/2 relies on Python 2 integer division.
seeds = np.random.rand(halo_ny, halo_nx/2)*0x7fffffff
#initRandomFloat = np.zeros((ny, nx), dtype=np.float32)
#initRandomFloat += initRandom

#print np.finfo(seeds[0,0])
#print np.finfo(initRandomFloat[0,0])

def updateRandom(random, seed, normal=True):
    """CPU emulation of a kernel that refills ``random`` with new random numbers.

    Every entry of ``seed`` plays the role of one thread: it produces two
    numbers and writes them to columns ``2*x`` and ``2*x + 1`` of row ``y``
    in ``random``, then stores its advanced seed back in ``seed``.

    Parameters:
        random: 2D array that receives the numbers (modified in place). It is
            expected to have at least as many rows as ``seed``.
        seed: 2D array of per-thread seeds (modified in place).
        normal: if True the numbers come from ``boxMuller``; otherwise two
            consecutive uniform numbers are drawn with ``lcg``.

    Returns:
        None.
    """
    (ny, nx) = seed.shape
    (domain_ny, domain_nx) = random.shape
    b_dim = 16
    # Number of b_dim x b_dim blocks needed to cover the seed array (ceiling
    # division). Threads of the last block that fall outside the seed array
    # are masked by the bounds check below.
    blocks_y = (ny + b_dim - 1) // b_dim
    blocks_x = (nx + b_dim - 1) // b_dim
    for by in range(blocks_y):
        for bx in range(blocks_x):
            for j in range(b_dim):
                for i in range(b_dim):

                    ## Content of kernel:
                    y = b_dim*by + j # thread_id
                    x = b_dim*bx + i # thread_id
                    if (x < nx and y < ny):
                        # Always read and update the seed array that was
                        # passed in, never a module-level one.
                        if normal:
                            n1, n2, seed[y,x] = boxMuller(seed[y,x])
                        else:
                            n1, seed[y,x] = lcg(seed[y,x])
                            n2, seed[y,x] = lcg(seed[y,x])
                        if x*2 + 1 < domain_nx:
                            random[y, x*2  ] = n1
                            random[y, x*2+1] = n2
                        elif x*2 < domain_nx:
                            # Odd domain width: only the first number fits
                            # in the last column.
                            random[y, x*2] = n1


### Testing the OceanStateNoise class w.r.t. pure random numbers

In [None]:
# Re-import the module so that edits to OceanStateNoise are picked up
# (reload is a Python 2 builtin).
reload(OceanStateNoise)

nx = 200
ny = 200
dx, dy = 1.0, 1.0
staggered = True
boundaryConditions = Common.BoundaryConditions(2,2,2,2)

# Fixed numpy seed so the cell is reproducible.
np.random.seed(0)
noise = OceanStateNoise.OceanStateNoise(cl_ctx, cl_queue, nx, ny, dx, dy,
                                        boundaryConditions, staggered)

# The GPU generator is called 100 times, the CPU generator once.
for i in range(100):
    noise.generateUniformDistribution()
noise.generateUniformDistributionCPU()

def compareCPUGPU(noise):
    U = noise.getRandomNumbers()
    U_CPU = noise.getRandomNumbersCPU()
    var_U = np.var(U)
    mean_U = np.mean(U)
    var_U_CPU = np.var(U_CPU)
    mean_U_CPU = np.mean(U_CPU)
    print "norm(U - U_CPU): ", np.linalg.norm(U-U_CPU)
    print "GPU(mean, var): ", (mean_U, var_U)
    print "CPU(mean, var): ", (mean_U_CPU, var_U_CPU)

# Histograms of the uniform numbers from the GPU and from the CPU.
fig = plt.figure(figsize=(8,2))
plt.subplot(1,2,1)
plt.hist(noise.getRandomNumbers().flatten(), bins=50, normed=True)
plt.title("uniform GPU")
plt.subplot(1,2,2)
plt.hist(noise.getRandomNumbersCPU().flatten(), bins=50, normed=True)
plt.title("uniform CPU")

compareCPUGPU(noise)


# The raw field is only shown as an image for small domains.
if (nx*ny < 1000):
    fig = plt.figure(figsize=(4,4))
    plt.imshow(noise.getRandomNumbersCPU(), interpolation="None")
    plt.colorbar()

# Same comparison for normally distributed numbers.
for i in range(100):
    noise.generateNormalDistribution()
noise.generateNormalDistributionCPU()

fig = plt.figure(figsize=(8,2))
plt.subplot(1,2,1)
plt.hist(noise.getRandomNumbers().flatten(), bins=50, normed=True)
plt.title("normal GPU")
plt.subplot(1,2,2)
plt.hist(noise.getRandomNumbersCPU().flatten(), bins=50, normed=True)
plt.title("normal CPU")

compareCPUGPU(noise)


Since the seed is stored in single precision on the GPU, it can not be expected that the first round of random numbers from the CPU and the GPU are the same, even though their original seed is the same.

# Applying a covariance structure to a variable perturbation

We want to perturb $\eta$ with a model error $\Delta \eta$, so that the covariance of $\Delta \eta$ becomes $Q$. 

Let $\xi$ be a vector of size $n_x n_y$ (same as the vector $\eta$), with $\xi_i \sim N(0,1)$. The perturbation can be generated as $\Delta \eta = Q^{1/2} \xi$, where 
$$ Q^{1/2}(a,b) = q_0 \left[ 1 + \frac{dist(a,b)}{L} \right] \exp \left\{ - \frac{dist(a,b)}{L} \right\},$$
in which $a$ and $b$ are cells, $L$ is a length scale, and $q_0$ is a scale on the amplitude.

**Recall**: if $Z$ is a random variable from $N(0,1)$ (mean 0 and variance 1), then $X = \sigma Z + \mu$ is a random variable with mean $\mu$ and variance $\sigma^2$.

In [None]:

def SOAR_Q(a_x, a_y, b_x, b_y, dx, dy, q0, L):
    """Evaluate the covariance weight between cell ``a`` and cell ``b``.

    With ``dist`` the Euclidean distance between the two cell indices scaled
    by the cell sizes ``dx`` and ``dy``, the value returned is
    ``q0 * (1 + dist/L) * exp(-dist/L)``.
    """
    offset_x = a_x - b_x
    offset_y = a_y - b_y
    dist = np.sqrt(dx*dx*offset_x**2  +  dy*dy*offset_y**2)
    scaled_dist = dist/L
    return q0*(1.0 + scaled_dist)*np.exp(-scaled_dist)

def createQMatrix(nx, ny, dx=1, dy=1, q0=1, L=1):
    """Assemble the dense ``(ny*nx, ny*nx)`` matrix of ``SOAR_Q`` values.

    Cell ``(x, y)`` is mapped to the flat index ``y*nx + x``; entry
    ``[j, i]`` holds ``SOAR_Q`` between flat cell ``j`` and flat cell ``i``.
    """
    num_cells = ny*nx
    Q = np.zeros((num_cells, num_cells))
    for j in range(num_cells):
        # Invert the flat index: j = a_y*nx + a_x
        a_y, a_x = divmod(j, nx)
        for i in range(num_cells):
            b_y, b_x = divmod(i, nx)
            Q[j, i] = SOAR_Q(a_x, a_y, b_x, b_y, dx, dy, q0, L)
    return Q

def applyQ(xi, dx=1, dy=1, q0=0.1, L=1):
    """Apply the full ``SOAR_Q`` operator to the 2D field ``xi``.

    Each output cell is the sum over all cells ``b`` of
    ``SOAR_Q(a, b) * xi[b]``. Returns a new array shaped like ``xi``.
    """
    (ny, nx) = xi.shape
    Qxi = np.zeros_like(xi)
    for a_y in range(ny):
        for a_x in range(nx):
            # This is a OpenCL thread
            total = 0
            for b_y in range(ny):
                xi_row = xi[b_y]
                for b_x in range(nx):
                    total += SOAR_Q(a_x, a_y, b_x, b_y, dx, dy, q0, L)*xi_row[b_x]
            Qxi[a_y, a_x] = total
    return Qxi


def applyQfast(xi, dx=1, dy=1, q0=0.1, L=1, cutoff=10):
    """Apply ``SOAR_Q`` to ``xi`` using only cells within ``cutoff`` cells.

    For each output cell the sum runs over the square window of
    ``2*cutoff + 1`` cells per direction centred on it, clipped to the
    domain. Returns a new array shaped like ``xi``.
    """
    (ny, nx) = xi.shape
    Qxi = np.zeros_like(xi)
    for a_y in range(ny):
        # Window rows, clipped at the domain edges
        rows = range(max(0, a_y - cutoff), min(ny, a_y + cutoff+1))
        for a_x in range(nx):
            # This is a OpenCL thread (a_x, a_y)
            cols = range(max(0, a_x - cutoff), min(nx, a_x + cutoff+1))
            total = 0
            for b_y in rows:
                for b_x in cols:
                    total += SOAR_Q(a_x, a_y, b_x, b_y, dx, dy, q0, L)*xi[b_y, b_x]
            Qxi[a_y, a_x] = total
    return Qxi

def applyQfastPeriodic(xi, dx=1, dy=1, q0=0.1, L=1, cutoff=5):
    """Apply ``SOAR_Q`` with a cutoff window and periodic wrap-around.

    ``xi`` is first copied into a local array extended by ``cutoff`` cells on
    every side, filled by wrapping indices modulo the domain size. Every
    output cell then sums over its full window of ``2*cutoff + 1`` cells per
    direction in that extended array. Returns a new array shaped like ``xi``.
    """
    # Assume in a GPU setting - we read xi into shared memory with ghostcells
    (ny, nx) = xi.shape
    ny_halo = int(ny + cutoff*2)
    nx_halo = int(nx + cutoff*2)
    local_xi = np.zeros((ny_halo, nx_halo))
    for j in range(ny_halo):
        source_row = xi[(j - cutoff) % ny]
        for i in range(nx_halo):
            local_xi[j,i] = source_row[(i - cutoff) % nx]
    # Sync threads

    Qxi = np.zeros_like(xi)
    window = 2*cutoff + 1
    for a_y in range(ny):
        local_a_y = a_y + cutoff
        for a_x in range(nx):
            # This is a OpenCL thread (a_x, a_y)
            local_a_x = a_x + cutoff

            # In local indices the window starts at (a_x, a_y) because the
            # halo shifts every cell by cutoff.
            total = 0
            for b_y in range(a_y, a_y + window):
                for b_x in range(a_x, a_x + window):
                    total += SOAR_Q(local_a_x, local_a_y, b_x, b_y, dx, dy, q0, L)*local_xi[b_y, b_x]
            Qxi[a_y, a_x] = total
    return Qxi

    
# Experiment: apply the SOAR operator to a random field, with and without a
# cutoff, and compare the results.
nx = 40
ny = 40
L = 0.75
q0 = 0.05
# The dense matrix has (nx*ny)**2 entries, so it is only built on request.
fullQ = False
if fullQ:
    Q = createQMatrix(nx, ny, L=L, q0=q0)
    fig = plt.figure(figsize=(4,4))
    plt.imshow(Q, interpolation="None")
    plt.colorbar()
    fig = plt.figure(figsize=(4,4))
    plt.imshow(np.dot(Q,Q), interpolation="None")
    plt.colorbar()
    print "norm of Q - Q.T: ", np.linalg.norm(Q - Q.T)
    print "Q[5,5]: ", Q[5,5]
    print "max(Q): ", np.max(Q)

#xi = np.random.rand(ny, nx)
# Standard normal input field.
xi = np.random.normal(size=(ny, nx))
fig = plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.imshow(xi, interpolation="None")
plt.title('Xi')
plt.colorbar()
Qxi = applyQ(xi, L=L, q0=q0)
plt.subplot(1,3,2)
plt.imshow(Qxi, interpolation="None")
plt.title('Full Q*xi')
plt.colorbar()
Qxifast = applyQfast(xi,  L=L, q0=q0, cutoff=3)
plt.subplot(1,3,3)
plt.imshow(Qxifast, interpolation="None")
plt.title('Q*xi with cutoff = 3')
plt.colorbar()

# Investigate the effect of cutoff
print "norm of Qxi: ", np.linalg.norm(Qxi)
for cutoff in range(min(nx, 8)):
    Qxifast = applyQfast(xi,  L=L, q0=q0, cutoff=cutoff)
    print "Diff with cutoff = " + str(cutoff) + ": ", np.linalg.norm(Qxi - Qxifast)
    print "\tMax diff:     ", np.max(np.abs(Qxi - Qxifast))
    # NOTE(review): the "rel diff" is the max absolute difference in percent
    # of the norm of Qxi, not of the max of Qxi.
    print "\tMax rel diff: ", np.max(np.abs((Qxi - Qxifast)))*100/np.linalg.norm(Qxi)
    
# Periodic version compared against the full, non-periodic result.
cutoff = 5
QxiPeriodic = applyQfastPeriodic(xi,  L=L, q0=q0, cutoff=cutoff)
fig = plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plt.imshow(QxiPeriodic, interpolation="None")
plt.title('Periodic Q*xi, cutoff = ' + str(cutoff))
plt.colorbar()
plt.subplot(1,2,2)
# Colour limits are taken from Qxifast, which here holds the result of the
# last iteration of the cutoff loop above.
plt.imshow(QxiPeriodic-Qxi, interpolation="None", vmin=np.min(Qxifast), vmax=np.max(Qxifast))
plt.title('Periodic Q*xi - full Q*xi')
plt.colorbar()

fig = plt.figure(figsize=(8,2))
plt.subplot(1,2,1)
plt.hist(xi.flatten(), bins=50, normed=True)
plt.title("Distribution of xi values")
plt.subplot(1,2,2)
plt.hist(Qxi.flatten(), bins=50, normed=True)
plt.title("Distribution of full Qxi values")


In [None]:
# 2 x 3 grid: top row shows Q*xi for cutoff 2, 3 and 4; bottom row shows the
# difference to the full result. Uses xi, Qxi, L and q0 from the previous cell.
fig = plt.figure(figsize=(12,6))

for cutoff in range(2,5):
    # Visualize a given cutoff value:
    

    Qxifast = applyQfast(xi,  L=L, q0=q0, cutoff=cutoff)
    # Zero-based column index of this cutoff value.
    plotNo = cutoff - 2
    
    plt.subplot(2,3,plotNo + 1)
    plt.imshow(Qxifast, interpolation="None")
    plt.title('Cutoff ' + str(cutoff))
    plt.colorbar()
    
    plt.subplot(2,3,plotNo + 4)
    # Same colour range as the field itself, so the size of the difference
    # can be compared directly with the upper panel.
    plt.imshow(Qxifast-Qxi, interpolation="None", vmin=np.min(Qxifast), vmax=np.max(Qxifast))
    plt.title('Cutoff diff from full')
    plt.colorbar()
    

plt.suptitle('Different cutoff sizes')


In [None]:
#def applyQfastPeriodic(xi, dx=1, dy=1, q0=0.1, L=1, cutoff=5):

def obtainOceanPerturbations(nx, ny, seed, f, H=10, beta=0, g=9.81,
                             ghosts_x=2, ghosts_y=2, dx=1, dy=1, 
                             q0=0.1, L=1.0, cutoff=5):
    """CPU prototype that produces perturbations ``(d_eta, d_hu, d_hv)``.

    Steps, each annotated with the synchronization the GPU version would need:
      1. fill an ``(ny, nx)`` field with random numbers via ``updateRandom``;
      2. apply ``applyQfastPeriodic`` to obtain ``d_eta``;
      3. derive ``d_hu`` and ``d_hv`` from central differences of ``d_eta``,
         scaled by ``g/coriolis`` and the local water depth.

    ``seed`` is advanced in place by ``updateRandom``. ``ghosts_x`` and
    ``ghosts_y`` are accepted but not used in this function.

    Returns:
        tuple ``(d_eta, d_hu, d_hv)`` of ``(ny, nx)`` arrays.
    """
    xi = np.zeros((ny, nx))
    updateRandom(xi, seed)

    #### Global sync ####

    d_eta = applyQfastPeriodic(xi, dx=dx, dy=dy, q0=q0, L=L, cutoff=cutoff)

    ####
    # Global sync (currently)
    #     Can be made into a local sync, as long as d_eta is given
    #     periodic overlap (1 more global computed ghost cell)
    ####

    ### Find H_mid:
    # Read global H (def on intersections) to local, find H_mid
    # The local memory can then be reused to something else (perhaps use
    # local_d_eta before computing local_d_eta?)
    #
    # Here, the constant depth H is used directly
    H_mid = H

    #### Local sync ####

    # d_eta extended by one periodically wrapped cell on every side.
    wrapped_rows = [(j - 1) % ny for j in range(ny + 2)]
    wrapped_cols = [(i - 1) % nx for i in range(nx + 2)]
    local_d_eta = np.zeros((ny+2, nx+2))
    local_d_eta[:, :] = d_eta[np.ix_(wrapped_rows, wrapped_cols)]

    #### LOCAL sync ####

    d_hu = np.zeros((ny, nx))
    d_hv = np.zeros((ny, nx))
    for j in range(ny):
        local_j = j + 1
        coriolis = f + beta*local_j*dy
        geostrophic_factor = g/coriolis

        # Whole row at once; the arithmetic per cell is unchanged.
        centre = local_d_eta[local_j, 1:nx+1]
        h_mid = centre + H_mid

        eta_diff_y = (local_d_eta[local_j+1, 1:nx+1] - local_d_eta[local_j-1, 1:nx+1])/(2.0*dy)
        d_hu[j, :] = -(geostrophic_factor)*h_mid*eta_diff_y

        eta_diff_x = (local_d_eta[local_j, 2:nx+2] - local_d_eta[local_j, 0:nx])/(2.0*dx)
        d_hv[j, :] = (geostrophic_factor)*h_mid*eta_diff_x

    return d_eta, d_hu, d_hv
    


# Run the CPU prototype on a 50 x 50 domain and plot the three perturbations.
nx = 50
ny = 50
ghosts_x = 0
ghosts_y = 0
halo_nx = nx + ghosts_x*2
halo_ny = ny + ghosts_y*2
dx = 100.0
dy = 100.0
H = 60

f = 0.02
# One seed per pair of cells. NOTE: nx/2 relies on Python 2 integer division.
seeds = np.random.rand(ny, nx/2)*0x7fffffff

# Amplitude and length scale are expressed relative to the cell size.
q0 = dx/1000000
L = 0.75*dx
cutoff = 2

d_eta, d_hu, d_hv = obtainOceanPerturbations(nx, ny, seeds, f, H=H, ghosts_x=ghosts_x, ghosts_y=ghosts_y, dx=dx, dy=dy,
                                             q0=q0, L=L, cutoff=cutoff)

fig = plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.imshow(d_eta, interpolation="None", origin="lower")
plt.title('$\Delta \eta$')
plt.colorbar()

plt.subplot(1,3,2)
plt.imshow(d_hu, interpolation="None", origin="lower")
plt.title('$\Delta hu$')
plt.colorbar()

plt.subplot(1,3,3)
plt.imshow(d_hv, interpolation="None", origin="lower")
plt.title('$\Delta hv$')
plt.colorbar()

# Vector plot of the momentum perturbation (d_hu, d_hv).
fig = plt.figure(figsize=(5,5))
X,Y = np.meshgrid(np.arange(0, nx, 1.0), np.arange(0, ny, 1.0))
plt.quiver(X, Y, d_hu, d_hv)



# GPU implementation

Some considerations have to be done with respect to boundary conditions.

In order to generate perturbed eta with ***double periodic boundary conditions and assuming non-staggered grids***, we would 
- generate $\xi$ as $(nx, ny)$ random normal distributed numbers
- apply periodic boundary conditions to $\xi$ (in local memory), so that it gets the size 
$$(nx+2(1+cutoff), ny + 2(1+cutoff))$$
- apply SOAR to $\xi$, giving d_eta on $(nx + 2, ny + 2)$, where the outer cells fulfills the periodicity.
- find d_hu and d_hv through the geostrophic balance, sizes $(nx, ny)$, and use only the inner $(nx, ny)$ of d_eta.

Now, what happens if we have ***staggered grids***? In order to calculate d_hu and d_hv, we require the same number of cells as in the non-staggered case, so the only difference is the computation of geostrophic balance. The total required data memory layout is the same, and d_eta does not need to be modified.

How about ***other boundary conditions***? 
First, how to give a consistent covariance structure of d_eta? By using extra random numbers. Internally, the computational domain of wall and open boundary conditions are the same (here $(nx, ny)$). The tactic would be to produce $\xi$ $$(nx+2(1+cutoff), ny + 2(1+cutoff))$$ random numbers at once, where no boundary conditions are applied to $\xi$. Applying SOAR would give d_eta of the same size as above, and final d_eta, d_hu and d_hv of sizes $(nx, ny)$.

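***Mixed boundary conditions***, where they are periodic in only one direction. The only difference will be that $\xi$ is wrapped periodically in the periodic direction only, while extra random numbers are generated in the non-periodic direction, as described above.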

***Common for all boundary conditions*** would be the final step, namely 
$$\eta += \Delta \eta, \quad hu += \Delta hu, \quad hv += \Delta hv,$$
where each perturbation is only added on the computational domain. 
Boundary conditions would have to be applied to all three fields after the perturbation is added in any case.

In [None]:
# Clean up a noise object left over from an earlier cell, if there is one,
# before the module is reloaded.
if 'noise' in globals():
    noise.cleanUp()
# reload is a Python 2 builtin; it picks up edits to the OceanStateNoise module.
reload(OceanStateNoise)
def compareDeltaEta(noise):
    etaCPU = np.zeros((noise.ny, noise.nx))
    HCPU = np.ones((noise.ny+1, noise.nx+1))*10.0
    etaGPU = Common.OpenCLArray2D(cl_ctx, noise.nx, noise.ny, 0, 0, etaCPU)
    huGPU = Common.OpenCLArray2D(cl_ctx, noise.nx, noise.ny, 0, 0, etaCPU)
    hvGPU = Common.OpenCLArray2D(cl_ctx, noise.nx, noise.ny, 0, 0, etaCPU)
    HGPU = Common.OpenCLArray2D(cl_ctx, noise.nx+1, noise.ny+1, 0, 0, HCPU)
    f, beta, g = 0.02, 0.0, 9.81
    
    #noise.generateNormalDistribtion()
    noise.perturbOceanState(etaGPU, huGPU, hvGPU, HGPU, f, beta, g)
    noise.perturbEtaCPU(etaCPU, use_existing_GPU_random_numbers=True)
    print "perturbed etaCPU - min/max(etaCPU): ", (np.min(etaCPU), np.max(etaCPU))
    etaFromGPU = etaGPU.download(cl_queue)
    etaGPU.release()
    huGPU.release()
    hvGPU.release()
    
    eta_max = max(np.max(np.abs(etaCPU)), np.max(np.abs(etaFromGPU)))
    
    fig = plt.figure(figsize=(12,4))
    plt.subplot(1,3,1)
    im = plt.imshow(etaCPU, interpolation="None", origin="lower")
    plt.title('$\Delta \eta$ from CPU')
    im.set_clim(-eta_max, eta_max)
    plt.colorbar()

    plt.subplot(1,3,2)
    im = plt.imshow(etaFromGPU, interpolation="None", origin="lower")
    plt.title('$\Delta \eta$ from GPU')
    im.set_clim(-eta_max, eta_max)
    plt.colorbar()

    plt.subplot(1,3,3)
    plt.imshow(etaCPU-etaFromGPU, interpolation="None", origin="lower")
    plt.title('Diff between CPU and GPU')
    plt.colorbar()
    
    if noise.periodicNorthSouth and noise.periodicEastWest:
        plt.suptitle("Full periodic")
    elif noise.periodicNorthSouth:
        plt.suptitle("Periodic north-south")
    elif noise.periodicEastWest:
        plt.suptitle("Periodic east-west")
    else:
        plt.suptitle("Non-periodic")
    return etaFromGPU
    
# Run the eta comparison once for each of four boundary-condition set-ups.
nx, ny = 50, 50
dx, dy = 1.0, 1.0
staggered = False
bcs = [Common.BoundaryConditions(2,2,2,2), Common.BoundaryConditions(1,1,1,1),
      Common.BoundaryConditions(1,2,1,2), Common.BoundaryConditions(2,1,2,1)]
#bcs = [Common.BoundaryConditions(2,2,2,2)]
eta = None
# NOTE(review): noise is rebound in every iteration without cleanUp() being
# called on the previous object; only the last one is cleaned up by a later cell.
for bc in bcs:
    noise = OceanStateNoise.OceanStateNoise(cl_ctx, cl_queue, nx, ny, dx, dy, bc, staggered)  
    eta = compareDeltaEta(noise)
# Everything below inspects eta from the last boundary condition in bcs only.
fig = plt.figure(figsize=(4,2))
plt.hist(eta.flatten(), bins=50, normed=True)
print "ok?"
print eta[0,0:20]
# Scratch check of modulo index arithmetic.
mod_test = np.zeros(20)
for i in range(20):
    tx = i % 16
    mod_test[i] = (tx-3)%10
print mod_test
print eta[-1,-1]
print eta.shape


In [None]:
# Clean up a noise object left over from an earlier cell, if there is one,
# before the module is reloaded.
if 'noise' in globals():
    noise.cleanUp()
# reload is a Python 2 builtin; it picks up edits to the OceanStateNoise module.
reload(OceanStateNoise)
def compareOceanNoise(noise):
    """Perturb eta, hu and hv on the GPU and on the CPU and plot the comparison.

    Starts from zero fields and a bathymetry that is 10.0 everywhere, calls
    ``noise.perturbOceanState`` (GPU) and ``noise.perturbOceanStateCPU`` with
    ``use_existing_GPU_random_numbers=True``, and draws a 3 x 3 figure: CPU
    fields in the first row, GPU fields in the second and CPU minus GPU in
    the third. A second figure shows the GPU (hu, hv) field as arrows. Both
    figure titles state which directions of ``noise`` are periodic.

    Parameters:
        noise: OceanStateNoise object; ``nx``, ``ny``, ``periodicNorthSouth``
            and ``periodicEastWest`` are read from it.

    Returns:
        The perturbed eta downloaded from the GPU.
    """
    etaCPU = np.zeros((noise.ny, noise.nx))
    huCPU = np.zeros((noise.ny, noise.nx))
    hvCPU = np.zeros((noise.ny, noise.nx))
    HCPU = np.ones((noise.ny+1, noise.nx+1))*10.0

    etaGPU = Common.OpenCLArray2D(cl_ctx, noise.nx, noise.ny, 0, 0, etaCPU)
    huGPU = Common.OpenCLArray2D(cl_ctx, noise.nx, noise.ny, 0, 0, etaCPU)
    hvGPU = Common.OpenCLArray2D(cl_ctx, noise.nx, noise.ny, 0, 0, etaCPU)
    HGPU = Common.OpenCLArray2D(cl_ctx, noise.nx+1, noise.ny+1, 0, 0, HCPU)
    f, beta, g = 0.02, 0.0, 9.81

    try:
        #noise.generateNormalDistribtion()
        noise.perturbOceanState(etaGPU, huGPU, hvGPU, HGPU, f, beta, g)
        noise.perturbOceanStateCPU(etaCPU, huCPU, hvCPU, HCPU, f, beta, g, use_existing_GPU_random_numbers=True)
        eta = etaGPU.download(cl_queue)
        hu = huGPU.download(cl_queue)
        hv = hvGPU.download(cl_queue)
    finally:
        # Release every device buffer allocated above, including the
        # bathymetry buffer, also when one of the calls fails.
        etaGPU.release()
        huGPU.release()
        hvGPU.release()
        HGPU.release()

    cpu_fields = (etaCPU, huCPU, hvCPU)
    gpu_fields = (eta, hu, hv)
    # One symmetric colour limit per variable, shared by the CPU and GPU panels.
    limits = [max(np.max(np.abs(cpu_field)), np.max(np.abs(gpu_field)))
              for cpu_field, gpu_field in zip(cpu_fields, gpu_fields)]
    delta_labels = ('$\Delta \eta$', '$\Delta hu$', '$\Delta hv$')
    plain_labels = ('$\eta$', '$hu$', '$hv$')

    fig = plt.figure(figsize=(12,12))

    # Row 1: CPU fields, row 2: GPU fields.
    for row, (fields, source) in enumerate(((cpu_fields, 'CPU'), (gpu_fields, 'GPU'))):
        for col in range(3):
            plt.subplot(3, 3, 3*row + col + 1)
            im = plt.imshow(fields[col], interpolation="None", origin="lower")
            plt.title(delta_labels[col] + ' from ' + source)
            im.set_clim(-limits[col], limits[col])
            plt.colorbar()

    # Row 3: difference between CPU and GPU, with automatic colour range.
    for col in range(3):
        plt.subplot(3, 3, 7 + col)
        plt.imshow(cpu_fields[col] - gpu_fields[col], interpolation="None", origin="lower")
        plt.title('Diff ' + plain_labels[col] + ' between CPU and GPU')
        plt.colorbar()

    supertitle="Non-periodic"
    if noise.periodicNorthSouth and noise.periodicEastWest:
        supertitle = "Full periodic"
    elif noise.periodicNorthSouth:
        supertitle = "Periodic north-south"
    elif noise.periodicEastWest:
        supertitle = "Periodic east-west"
    plt.suptitle(supertitle)

    fig = plt.figure(figsize=(4,4))
    X,Y = np.meshgrid(np.arange(0, noise.nx, 1.0), np.arange(0, noise.ny, 1.0))
    plt.quiver(X, Y, hu, hv)
    plt.suptitle(supertitle)
    return eta
    
    
# Run the full eta/hu/hv comparison once for each of four boundary-condition
# set-ups.
nx, ny = 50, 50
dx, dy = 1.0, 1.0
staggered = False
bcs = [Common.BoundaryConditions(2,2,2,2), Common.BoundaryConditions(1,1,1,1),
      Common.BoundaryConditions(1,2,1,2), Common.BoundaryConditions(2,1,2,1)]
#bcs = [Common.BoundaryConditions(2,2,2,2)]
eta = None
# NOTE(review): noise is rebound in every iteration without cleanUp() being
# called on the previous object.
for bc in bcs:
    noise = OceanStateNoise.OceanStateNoise(cl_ctx, cl_queue, nx, ny, dx, dy, bc, staggered)  
    eta = compareOceanNoise(noise)
# Histogram of eta from the last boundary condition in bcs only.
fig = plt.figure(figsize=(4,2))
plt.hist(eta.flatten(), bins=50, normed=True)
print "ok?"


## Simulator with model error

First, let's try to start a model with a field that is perturbed with relatively large $q_0$ (noise with large amplitude).

Drifters are added as a way to visualize the velocity field, but in order to see anything exciting, their sensitivity should be increased (`drifters.setSensitivity(50)` or similar).
Keep in mind that if $hu = 1$ and $H = 10$m, we get $u = 0.1 m/s$ which means it will stay within its $100m \times 100m$ grid cell for (potentially) 1000 seconds :)

In [None]:
def plotOceanState(sim, ghosts, supertitle):
    eta, hu, hv = sim.download()
    eta = eta[ghosts[2]:-ghosts[0], ghosts[3]:-ghosts[1]]
    hu = hu[ghosts[2]:-ghosts[0], ghosts[3]:-ghosts[1]]
    hv = hv[ghosts[2]:-ghosts[0], ghosts[3]:-ghosts[1]]
    
    fig = plt.figure(figsize=(12,4))
    plt.subplot(1,3,1)
    im = plt.imshow(eta, interpolation="None", origin="lower")
    plt.title('$\eta$')
    #im.set_clim(-eta_max, eta_max)
    plt.colorbar()
    
    plt.subplot(1,3,2)
    im = plt.imshow(hu, interpolation="None", origin="lower")
    plt.title('$hu$')
    #im.set_clim(-hu_max, hu_max)
    plt.colorbar()
    
    plt.subplot(1,3,3)
    im = plt.imshow(hv, interpolation="None", origin="lower")
    plt.title('$hv$')
    #im.set_clim(-hv_max, hv_max)
    plt.colorbar()
    plt.suptitle(supertitle)
    
    fig = plt.figure(figsize=(3,3))
    X,Y = np.meshgrid(np.arange(0, sim.nx, 1.0), np.arange(0, sim.ny, 1.0))
    plt.quiver(X, Y, hu, hv)
    plt.suptitle(supertitle)
    
    max_u = max(np.max(hu/(eta+10) + np.sqrt(sim.g*(eta+10))),
                np.max(hu/(eta+10) - np.sqrt(sim.g*(eta+10)))  )
    max_v = max(np.max(hv/(eta+10) + np.sqrt(sim.g*(eta+10))),
                np.max(hv/(eta+10) - np.sqrt(sim.g*(eta+10)))  )
    dt_max = 0.25*min(sim.dx/max_u, sim.dy/max_v)
    print "Finding maximum possible dt: ", dt_max


# Domain, physical parameters and initial data shared by the simulator cells.
nx, ny = 40, 40 # 100, 100
dx, dy = 100.0 ,100.0
dt = 0.5

f, beta = 0.02, 0.0
g = 9.81
waterdepth = 10.0
r = 0.0

ghosts = np.array([2,2,2,2]) # north, east, south, west
boundaryConditions = Common.BoundaryConditions(2,2,2,2)

# Define which cell index which has lower left corner as position (0,0)
x_zero_ref, y_zero_ref = 2, 2

# Array shape including the ghost cells on all four sides.
dataShape = (ny + ghosts[0]+ghosts[2], 
             nx + ghosts[1]+ghosts[3])

# Input data
# All fields start at zero; Hi has one more entry than the other fields in
# each direction and is set to the constant water depth.
eta0 = np.zeros(dataShape, dtype=np.float32, order='C');
u0 = np.zeros(dataShape, dtype=np.float32, order='C');
v0 = np.zeros(dataShape, dtype=np.float32, order='C');
Hi = np.ones((dataShape[0]+1, dataShape[1]+1), dtype=np.float32, order='C')*waterdepth


# Required for anim/plotting:
x_center = dx*nx/2.0
y_center = dy*ny/2.0
y_coords, x_coords = np.mgrid[0:ny*dy:dy, 0:nx*dx:dx]
# Distance of each grid point from the origin (0, 0).
radius = np.sqrt(np.multiply(x_coords, x_coords) + np.multiply(y_coords, y_coords))

staggered = False

In [None]:
# Clean up objects left over from an earlier run of this cell, if any.
if 'sim' in globals():
    sim.cleanUp()
if 'noise' in globals():
    noise.cleanUp()
    
# reload is a Python 2 builtin; it picks up edits to the modules.
reload(CDKLM16)
reload(OceanStateNoise)
# Creating sim
sim = CDKLM16.CDKLM16(cl_ctx, eta0, u0, v0, Hi, \
                      nx, ny, dx, dy, dt, g, f, r, \
                      boundary_conditions=boundaryConditions,
                      write_netcdf=False)

# Adding drifters:
numParticles = 100
gpuParticles = GPUDrifterCollection.GPUDrifterCollection(cl_ctx, numParticles,
                                                         observation_variance=0.0,
                                                         boundaryConditions=sim.boundary_conditions,
                                                         domain_size_x=sim.nx*sim.dx,
                                                         domain_size_y=sim.ny*sim.dy)
gpuParticles.initializeUniform()
gpuParticles.setSensitivity(10)
sim.attachDrifters(gpuParticles)

# Noise amplitude for the one-off perturbation of the initial state.
q0 = 100*dx/100000
noise = OceanStateNoise.OceanStateNoise.fromsim(sim, soar_q0=q0)
noise.perturbSim(sim)
plotOceanState(sim, ghosts, "Initial conditions with noise")


### Animating simulation from the above state

# The plotter is initialised with the unperturbed initial fields, with the
# ghost cells stripped.
fig = plt.figure(figsize=(8, 10))
plotter = PlotHelper.PlotHelper(fig, x_coords, y_coords, radius, 
                                eta0[ghosts[2]:-ghosts[0], ghosts[3]:-ghosts[1]], 
                                u0[ghosts[2]:-ghosts[0], ghosts[3]:-ghosts[1]], 
                                v0[ghosts[2]:-ghosts[0], ghosts[3]:-ghosts[1]])
plotter.showDrifters(sim.drifters)
# T is the number of animation frames; sub_t is passed to sim.step for each frame.
T = 100
sub_t = 200*dt
def animate(i):
    """Advance the simulation to frame ``i`` and redraw the plot.

    Used as the frame callback of ``animation.FuncAnimation``. Frame 0 draws
    the current state without stepping; every later frame first advances
    ``sim`` by ``sub_t``. Reads the notebook globals ``sim``, ``sub_t``,
    ``ghosts``, ``plotter``, ``fig`` and ``T``.

    Args:
        i: Frame index.
    """
    t = sim.step(sub_t) if i > 0 else 0.0

    # Downscale the fields since they are quite intense :P
    scale = 0.1
    eta1, hu1, hv1 = sim.download()

    # Same interior (ghost cells stripped) window for all three fields.
    interior = (slice(ghosts[2], -ghosts[0]), slice(ghosts[3], -ghosts[1]))
    plotter.plot(scale*eta1[interior],
                 scale*hu1[interior],
                 scale*hv1[interior])
    plotter.showDrifters(sim.drifters)

    fig.suptitle("CDKLM16 from a chaotic state = " + "{:04.0f}".format(t) + " s", fontsize=18)

    # Progress report every 20th frame; the maxima are taken over the full
    # downloaded arrays, ghost cells included.
    if i % 20 == 0:
        progress = "{:03.0f}".format(100*i / T)
        print(progress + " % => t=" + str(t) +
              "\tMax eta: " + str(np.max(eta1)) +
              "\tMax hu: " + str(np.max(hu1)) +
              "\tMax hv: " + str(np.max(hv1)))
                     
# One call to animate per frame index 0..T-1, 100 ms between frames.
anim = animation.FuncAnimation(fig, animate, frames=range(T), interval=100)
# Close the figure the animation was built on; the animation object itself
# is the value of this cell.
plt.close(fig)
anim


## Test simulation with model errors
We start from a lake-at-rest state and add a small-scale model error in every time step. Fun fun fun!

In [None]:
# Clean up the objects left behind by earlier cells before they are replaced.
if 'sim' in globals():
    sim.cleanUp()
if 'noise' in globals():
    noise.cleanUp()

# Re-import the modules so that this cell uses their current source.
reload(Common)
reload(CDKLM16)
reload(OceanStateNoise)

# Amplitude of the small-scale perturbation handed to the simulator.
q0 = 0.1*dx/100000
#noise = OceanStateNoise.OceanStateNoise.fromsim(sim, soar_q0=q0)

# Here the perturbation is switched on inside the simulator itself instead of
# being applied through a separate OceanStateNoise object.
sim = CDKLM16.CDKLM16(
    cl_ctx, eta0, u0, v0, Hi,
    nx, ny, dx, dy, dt, g, f, r,
    boundary_conditions=boundaryConditions,
    write_netcdf=False,
    small_scale_perturbation=True,
    small_scale_perturbation_amplitude=q0)

# Adding drifters:
numParticles = 100
gpuParticles = GPUDrifterCollection.GPUDrifterCollection(
    cl_ctx, numParticles,
    observation_variance=0.0,
    boundaryConditions=sim.boundary_conditions,
    domain_size_x=sim.nx*sim.dx,
    domain_size_y=sim.ny*sim.dy)
gpuParticles.initializeUniform()
#gpuParticles.setSensitivity(10)
sim.attachDrifters(gpuParticles)


### Animating simulation from the above state

# Animation length (number of frames) and simulated time per frame.
T = 200
sub_t = 50*dt

# The plot is initialised from the interior (ghost cells stripped) of the
# initial fields eta0, u0 and v0.
fig = plt.figure(figsize=(8, 10))
plotter = PlotHelper.PlotHelper(
    fig, x_coords, y_coords, radius,
    eta0[ghosts[2]:-ghosts[0], ghosts[3]:-ghosts[1]],
    u0[ghosts[2]:-ghosts[0], ghosts[3]:-ghosts[1]],
    v0[ghosts[2]:-ghosts[0], ghosts[3]:-ghosts[1]])
plotter.showDrifters(sim.drifters)
def animate(i):
    """Advance the simulation to frame ``i`` and redraw the plot.

    Used as the frame callback of ``animation.FuncAnimation``. Frame 0 draws
    the current state without stepping; every later frame first advances
    ``sim`` by ``sub_t``. Reads the notebook globals ``sim``, ``sub_t``,
    ``ghosts``, ``plotter``, ``fig`` and ``T``.

    Args:
        i: Frame index.
    """
    if i <= 0:
        t = 0.0
    else:
        t = sim.step(sub_t)

        # Add this line to generate new seed from the CPU every superstep
        #sim.small_scale_model_error.resetSeed()

        # Add this block to generate new seed from the CPU every timestep
        #for j in range(50):
        #    t = sim.step(dt)
        #    sim.small_scale_model_error.resetSeed()

    scale = 1.0
    eta1, hu1, hv1 = sim.download()

    # Same interior (ghost cells stripped) window for all three fields.
    interior = (slice(ghosts[2], -ghosts[0]), slice(ghosts[3], -ghosts[1]))
    plotter.plot(scale*eta1[interior],
                 scale*hu1[interior],
                 scale*hv1[interior])
    plotter.showDrifters(sim.drifters)

    fig.suptitle("CDKLM16 from rest with model error = " + "{:04.0f}".format(t) + " s", fontsize=18)

    # Every 10th frame: report mean/min/max of the interior eta field, plus
    # the maxima of hu and hv over the full downloaded arrays.
    if i % 10 == 0:
        eta_interior = eta1[interior]
        eta_stats = [np.mean(eta_interior), np.min(eta_interior), np.max(eta_interior)]
        progress = "{:03.0f}".format(100*i / T)
        print(progress + " % => t=" + str(t) +
              "\tMean eta: " + str(eta_stats) +
              "\tMax hu: " + str(np.max(hu1)) +
              "\tMax hv: " + str(np.max(hv1)))
                     
# One call to animate per frame index 0..T-1, 100 ms between frames.
anim = animation.FuncAnimation(fig, animate, frames=range(T), interval=100)
# Close the figure the animation was built on; the animation object itself
# is the value of this cell.
plt.close(fig)
anim


## Investigating $\Delta \eta$ mean value

The perturbations of the $\eta$-field tend to have a negative mean. Accumulated over many time steps, this leads to a significant loss of mass.

In [None]:
def evalMean(sim):
    """Return the mean of ``sim``'s eta field with a two-cell border removed.

    Args:
        sim: Object whose ``download()`` returns the three fields
            ``(eta, hu, hv)``; only ``eta`` is used.

    Returns:
        The mean of ``eta[2:-2, 2:-2]``.
    """
    eta, _hu, _hv = sim.download()
    # Drop two rows/columns on every side before averaging.
    return np.mean(eta[2:-2, 2:-2])

# Clean up the objects left behind by an earlier run of this cell before
# they are replaced below.
if 'sim1' in globals():
    sim1.cleanUp()
if 'noise1' in globals():
    noise1.cleanUp()
if 'sim2' in globals():
    sim2.cleanUp()
if 'noise2' in globals():
    noise2.cleanUp()

# Re-import the modules so that this cell uses their current source.
reload(CDKLM16)
reload(OceanStateNoise)

# Creating sim
# Two simulators built from identical arguments: sim1 is perturbed without
# reseeding, sim2 with a reseed after every perturbation.
sim1 = CDKLM16.CDKLM16(
    cl_ctx, eta0, u0, v0, Hi,
    nx, ny, dx, dy, dt, g, f, r,
    boundary_conditions=boundaryConditions,
    write_netcdf=False)
sim2 = CDKLM16.CDKLM16(
    cl_ctx, eta0, u0, v0, Hi,
    nx, ny, dx, dy, dt, g, f, r,
    boundary_conditions=boundaryConditions,
    write_netcdf=False)

q0 = 100*dx/100000
noise1 = OceanStateNoise.OceanStateNoise.fromsim(sim1, soar_q0=q0)
noise2 = OceanStateNoise.OceanStateNoise.fromsim(sim2, soar_q0=q0)

# Record the mean eta of both simulators after every batch of 50
# perturbations. The simulators are never stepped in this cell.
meanT = 5000
means1 = np.zeros(meanT)
means2 = np.zeros(meanT)
for i in range(meanT):
    for j in range(50):
        noise1.perturbSim(sim1)
        noise2.perturbSim(sim2)
        noise2.resetSeed()
    means1[i] = evalMean(sim1)
    means2[i] = evalMean(sim2)

fig = plt.figure(figsize=(6, 3))
plt.plot(means1, label="no reset")
plt.plot(means2, label="reset")
plt.legend(loc=0)
plt.grid()
print((sim1.nx, sim1.ny))

In [None]:
# Looking at autocorrelation within a cell:

def autocorr(x, k, mean=None, var=None):
    """Estimate the normalised autocorrelation of ``x`` for lags 0..k-1.

    For each lag ``i`` the estimate is::

        R[i] = sum_t (x[t] - mean) * (x[t+i] - mean) / ((n - i) * var)

    where ``n = len(x)`` and the sum runs over the ``n - i`` overlapping
    pairs. The deviations from the mean are computed once and every lag is
    evaluated with a single dot product instead of a Python-level loop over
    all pairs.

    Args:
        x: One-dimensional sequence of samples.
        k: Number of lags to evaluate.
        mean: Mean to centre ``x`` with; defaults to ``np.mean(x)``.
        var: Variance to normalise with; defaults to ``np.var(x)``.

    Returns:
        Array of length ``k`` with the estimate for each lag. Lags that have
        no overlapping pair of samples (``i >= len(x)``) are NaN.
    """
    x = np.asarray(x, dtype=np.float64)
    n = len(x)
    if mean is None:
        mean = np.mean(x)
    if var is None:
        var = np.var(x)

    # Centre once; the original recomputed x[t] - mean for every pair.
    centered = x - mean

    R = np.zeros(k)
    for i in range(k):
        pairs = n - i
        if pairs <= 0:
            # No sample pair is this far apart, so the estimate is undefined.
            R[i] = np.nan
            continue
        R[i] = np.dot(centered[:pairs], centered[i:]) / (pairs * var)
    return R

# Clean up the objects left behind by earlier cells before they are replaced.
if 'sim2' in globals():
    sim2.cleanUp()
if 'noise2' in globals():
    noise2.cleanUp()

# Re-import the modules so that this cell uses their current source.
reload(CDKLM16)
reload(OceanStateNoise)
reload(Common)

# Creating sim
sim2 = CDKLM16.CDKLM16(
    cl_ctx, eta0, u0, v0, Hi,
    nx, ny, dx, dy, dt, g, f, r,
    boundary_conditions=boundaryConditions,
    write_netcdf=False)

q0 = 100*dx/100000
noise2 = OceanStateNoise.OceanStateNoise.fromsim(sim2, soar_q0=q0)
noise2.resetSeed()

autocorrN = 10000
for rounds in range(5):
    # Draw and discard 100000 fields before sampling.
    for i in range(100000):
        noise2.generateNormalDistribution()
    print("round " + str(rounds))

    # Record the values at cells (0,0), (1,1), ..., (8,8) of autocorrN
    # consecutively generated fields; column j of u is the series of cell (j,j).
    u = np.zeros((autocorrN, 9))
    for i in range(autocorrN):
        noise2.generateNormalDistribution()
        generatedU = noise2.random_numbers.download(noise2.cl_queue)
        for j in range(9):
            u[i, j] = generatedU[j, j]

    # Magnitude of the autocorrelation of each cell's series for lags 0..99.
    fig = plt.figure(figsize=(4, 2))
    for j in range(9):
        lag = autocorr(u[:, j], 100)
        plt.plot(np.abs(lag))
        print("mean row " + str(j) + " " + str(np.mean(u[:, j])))
    print("total mean:  " + str(np.mean(u)))
    print("min/max:  " + str(np.min(u)) + " " + str(np.max(u)))
    plt.title("autocorr")
    plt.grid()

    # Covariance between the nine sampled cells (columns are the variables).
    fig = plt.figure(figsize=(4, 2))
    cov = np.cov(u, rowvar=False)
    print(cov.shape)
    plt.imshow(cov)


In [None]:
# Scratch checks of how Python and NumPy represent and print a few integer
# and floating-point values.
print(1103515245)
print(2**31*0.51386433)
print(2**31)
print(np.float64(2147483648.0))

# 1103515245 is outside the int16 range (max 32767).
# NOTE(review): what this conversion yields depends on the NumPy version - confirm.
print(np.int16(1103515245))
print(0x7fffffff)

# Check how np.double relates to np.float64.
a = np.double(3.14)
print(a)
print(isinstance(a, np.float64))
type(a)