# Experiment 1: Quantifying overhead in different levels of optimization

In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import timeit

In [2]:
%%bash
# Start from a clean build tree: run the Makefile's `clean` target in the stencil source directory.
make --directory=../Stencil_code/ clean

make: Entering directory '/users/class169/Project/HPC4WC_project_ijblocking/Stencil_code'
rm -f -rf *~ *.o *.mod *.MOD *.i core.* *.out *.lst *.x *.x+orig *.x+[0-9]* *.dat *.report result_*.py
make: Leaving directory '/users/class169/Project/HPC4WC_project_ijblocking/Stencil_code'


In [3]:
def read_field_from_file(filename, num_halo=None):
    """Read a 2-D or 3-D floating-point field from a binary field file.

    The file begins with int32 header words in the order ``rank``, ``nbits``,
    ``num_halo``, ``nx``, ``ny`` and -- for rank-3 fields -- ``nz``. The offset
    formula below treats the header as ``3 + rank`` int32 words, so a rank-2
    file carries no ``nz`` word. The field values follow the header.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.
    num_halo : optional
        Ignored. Kept only so existing callers that pass it keep working; the
        halo width stored in the file header is not used either.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nz, ny, nx)`` when ``rank == 3``, otherwise
        ``(ny, nx)``. The dtype is ``float32`` when ``nbits == 32`` and
        ``float64`` otherwise.

    Raises
    ------
    ValueError
        If the file holds fewer values than the header announces (raised by
        the final reshape).
    """
    # Read up to six header words. A rank-2 file only owns the first five, so
    # the sixth word (if present) is field data and must not be used as nz.
    header = np.fromfile(filename, dtype=np.int32, count=6)

    # Convert to plain Python ints so the element count cannot wrap in int32.
    rank, nbits = int(header[0]), int(header[1])
    nx, ny = int(header[3]), int(header[4])
    if rank == 3:
        shape = (int(header[5]), ny, nx)
    else:
        shape = (ny, nx)

    n_values = 1
    for extent in shape:
        n_values *= extent

    # Size of the header expressed in elements of the field's float type.
    offset = (3 + rank) * 32 // nbits
    dtype = np.float32 if nbits == 32 else np.float64
    data = np.fromfile(filename, dtype=dtype, count=n_values + offset)
    return np.reshape(data[offset:], shape)

def validate_results():
    """Show the middle k-level of the input and output fields side by side.

    Reads 'in_field.dat' and 'out_field.dat' (paths relative to the current
    working directory) with ``read_field_from_file`` and draws each one in its
    own panel with a shared colour range of [-0.1, 1.1].
    """
    fig, axs = plt.subplots(1, 2, figsize=(12, 4))

    # (file to read, title template) for the left and right panel respectively.
    panels = (
        ('in_field.dat', 'Initial condition (k = {})'),
        ('out_field.dat', 'Final result (k = {})'),
    )
    for ax, (path, title) in zip(axs, panels):
        field = read_field_from_file(path)
        mid_level = field.shape[0] // 2  # index of the central k-level
        image = ax.imshow(field[mid_level, :, :], origin='lower', vmin=-0.1, vmax=1.1)
        fig.colorbar(image, ax=ax)
        ax.set_title(title.format(mid_level))

    plt.show()

In [4]:
%%bash
# Load the machine module and swap the GNU programming environment for the Cray one before building.
module load daint-gpu
module switch PrgEnv-gnu PrgEnv-cray

# Build the three stencil variants compared in this experiment (one make invocation per VERSION).
make --directory=../Stencil_code/ VERSION=simplecopy-block
make --directory=../Stencil_code/ VERSION=simplecopy-ij
make --directory=../Stencil_code/ VERSION=simplecopy-k

make: Entering directory '/users/class169/Project/HPC4WC_project_ijblocking/Stencil_code'
ftn -O3 -hfp3 -eZ -ffree -N255 -ec -eC -eI -eF -rm -c m_utils.F90
ftn -O3 -hfp3 -eZ -ffree -N255 -ec -eC -eI -eF -rm -c stencil2d-simplecopy-block.F90
ftn -O3 -hfp3 -eZ -ffree -N255 -ec -eC -eI -eF -rm m_utils.o stencil2d-simplecopy-block.o -o stencil2d-simplecopy-block.x
cp stencil2d-simplecopy-block.x stencil2d.x
make: Leaving directory '/users/class169/Project/HPC4WC_project_ijblocking/Stencil_code'
make: Entering directory '/users/class169/Project/HPC4WC_project_ijblocking/Stencil_code'
ftn -O3 -hfp3 -eZ -ffree -N255 -ec -eC -eI -eF -rm -c stencil2d-simplecopy-ij.F90
ftn -O3 -hfp3 -eZ -ffree -N255 -ec -eC -eI -eF -rm m_utils.o stencil2d-simplecopy-ij.o -o stencil2d-simplecopy-ij.x
cp stencil2d-simplecopy-ij.x stencil2d.x
make: Leaving directory '/users/class169/Project/HPC4WC_project_ijblocking/Stencil_code'
make: Entering directory '/users/class169/Project/HPC4WC_project_ijblocking/Stencil_co

Getting the baseline: measure the run time of each variant once; the `simplecopy-k` version (no blocking) serves as the baseline.

In [5]:
# Run the simplecopy-block executable once via srun and capture its stdout.
output_ij_block = !srun -n 1 ../Stencil_code/stencil2d-simplecopy-block.x --nx 128 --ny 128 --nz 64 --num_iter 1
# The captured text is executed as Python; it is expected to define `data` in the notebook namespace.
# NOTE(review): exec() runs whatever the program printed -- only use with trusted executables.
exec(output_ij_block.nlstr)
# Last entry of the first row of `data`; presumably the measured run time -- TODO confirm units.
time_ij_block = data[0,-1]

In [6]:
# Run the simplecopy-ij executable once via srun and capture its stdout.
output_ij = !srun -n 1 ../Stencil_code/stencil2d-simplecopy-ij.x --nx 128 --ny 128 --nz 64 --num_iter 1
# The captured text is executed as Python; it is expected to (re)define `data` in the notebook namespace.
# NOTE(review): exec() runs whatever the program printed -- only use with trusted executables.
exec(output_ij.nlstr)
# Last entry of the first row of `data`; presumably the measured run time -- TODO confirm units.
time_ij = data[0,-1]

In [7]:
# Run the simplecopy-k executable once via srun and capture its stdout.
output_k = !srun -n 1 ../Stencil_code/stencil2d-simplecopy-k.x --nx 128 --ny 128 --nz 64 --num_iter 1
# The captured text is executed as Python; it is expected to (re)define `data` in the notebook namespace.
# NOTE(review): exec() runs whatever the program printed -- only use with trusted executables.
exec(output_k.nlstr)
# Last entry of the first row of `data`; presumably the measured run time -- TODO confirm units.
time_k = data[0,-1]

Compare the times and calculate the relative improvement (in percent) of each blocking variant over the baseline.

In [8]:
# Relative change in percent with time_k as the reference:
# positive means the variant was faster than time_k, negative means it was slower.
improvement_ij_block = (time_k - time_ij_block) / time_k * 100
improvement_ij = (time_k - time_ij) / time_k * 100

In [9]:
# Report both single-run percentages computed above.
print(f'Improvement with ij-blocking: {improvement_ij}%, improvement with ij-blocking and smaller blocks in tmp-field: {improvement_ij_block}%.')

Improvement with ij-blocking: -18.4847364592685%, improvement with ij-blocking and smaller blocks in tmp-field: -4.522028998431412%.


In [10]:
# Raw single-run timing of the simplecopy-block variant, for reference.
print(time_ij_block)

0.000628233


In [11]:
# Raw single-run timing of the simplecopy-k variant (the reference in the percentages above).
print(time_k)

0.0006010532


In [12]:
# Raw single-run timing of the simplecopy-ij variant, for reference.
print(time_ij)

0.0007121563


In [13]:
# Repeat the three-way comparison N times and average the per-repetition relative
# improvements (in percent, with time_k as the reference).
N = 10
# Running sums of the percentages; divided by N after the loop.
improvement_ij_averaged = 0
improvement_ij_block_averaged = 0


for i in range(N):
    # Each run prints text that exec() evaluates; it is expected to (re)define `data`,
    # so the time must be read from `data` before the next run overwrites it.
    output_ij_block = !srun -n 1 ../Stencil_code/stencil2d-simplecopy-block.x --nx 128 --ny 128 --nz 64 --num_iter 1
    exec(output_ij_block.nlstr)
    time_ij_block = data[0,-1]
    
    output_ij = !srun -n 1 ../Stencil_code/stencil2d-simplecopy-ij.x --nx 128 --ny 128 --nz 64 --num_iter 1
    exec(output_ij.nlstr)
    time_ij = data[0,-1]
    
    output_k = !srun -n 1 ../Stencil_code/stencil2d-simplecopy-k.x --nx 128 --ny 128 --nz 64 --num_iter 1
    exec(output_k.nlstr)
    time_k = data[0,-1]
    
    # Accumulate this repetition's percentages, each relative to the time_k measured in the same repetition.
    improvement_ij_block_averaged += (time_k - time_ij_block) / time_k * 100
    improvement_ij_averaged += (time_k - time_ij) / time_k * 100
    
# Turn the sums into arithmetic means over the N repetitions.
improvement_ij_averaged /= N
improvement_ij_block_averaged /= N

In [14]:
# Mean relative improvement (percent) of the simplecopy-ij variant over the N repetitions.
improvement_ij_averaged

-20.272872308842203

In [15]:
# Mean relative improvement (percent) of the simplecopy-block variant over the N repetitions.
improvement_ij_block_averaged

-4.888905821973357