In [1]:
import numpy as np
from collections import OrderedDict as odict
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm

This notebook computes the latencies and bandwidths of the three primitive function types

In [2]:
#(hardware name, number of nodes)
filesD = {
         'knl_mpi1':('knl',1), 'knl_mpi2':('knl',2), 'knl_mpi4':('knl',4),
         'skl_mpi1':('skl',1), 'skl_mpi2':('skl',2), 'skl_mpi4':('skl',4),
         'i5':('i5',1),
         'p100_mpi1':('p100',1), 'p100_mpi2':('p100',2), 'p100_mpi4':('p100',4),
         'v100_mpi1':('v100',1), 'v100_mpi2':('v100',2), 'v100_mpi4':('v100',4),
         'gtx1060':('gtx1060',1)
        }

files = odict(sorted(filesD.items(), key= lambda t : t[1][1]))
pd.set_option('precision',1)

#### Axpby and Dot Latencies
The latencies are determined by taking the minimum of the average runtimes 

#### Axpby and Dot Bandwidths
The bandwidths are determined by taking the average bandwidth of the 30 bandwidths corresponding to the 3 largest sizes.

#### Dx-Dy Latencies
As in Axpby 

#### Dx-Dy Bandwidths
Since the efficiency of the matrix-vector multiplications depends on the polynomial coefficient we should compute these bandwidths separately


In [3]:
names={'axpby':3,'dot':2,'dx':3, 'dy':3}
#ns=[3,4]
values = []
for f, v in files.items() :#{'knl_mpi2':('knl',2)}.items():
    runtimes=pd.read_csv('benchmark_'+f+'.csv', delimiter=' ')
    #add size and bandwidth columns
    runtimes.insert(0,'size', 8*runtimes['n']*runtimes['n']*runtimes['Nx']*runtimes['Ny']/1e6/v[1]) #inplace transformation
    for name,memops in names.items() :
        runtimes.insert(0,name+'_bw',runtimes['size']/1000*memops/runtimes[name])
    runtimes = runtimes.assign( dxdy=(runtimes['dx']+runtimes['dy'])/2)
    runtimes = runtimes.assign( dxdy_bw=2.0*runtimes['dx_bw']*runtimes['dy_bw']/(runtimes['dx_bw']+runtimes['dy_bw']))
    #compute one version with aggregated grouped sizes and one without
    avgruntimes=runtimes.groupby(['n', 'Nx','Ny','size']).agg(['mean', 'std'])
    avgruntimes=avgruntimes.reset_index(level=['n','Nx','Ny','size'])
    avgruntimes.sort_values(by='size',inplace=True) #sort by size
    runtimes.sort_values(by='size',inplace=True)
    ##first compute axpby and dot latencies and bandwidths 
    nmax = 3
    s =30

    line = []
    l=len(runtimes)
    line.append(v[0]) #0
    line.append(v[1]) #1
    line.append(runtimes[l-s:l]['axpby_bw'].mean()) #2
    line.append(runtimes[l-s:l]['axpby_bw'].std())  #3
    line.append(avgruntimes[0:nmax][('axpby','mean')].mean()/1e-6) #4
    line.append(avgruntimes[0:nmax][('axpby','mean')].min()/1e-6)  #5

    line.append( line[5] - avgruntimes['size'][0]*names['axpby']/line[2]/1e-3) #6
    if line[6] <0 : line[6] = 0 

    line.append(runtimes[l-s:l]['dot_bw'].mean()) #7
    line.append(runtimes[l-s:l]['dot_bw'].std())  #8 
    line.append(avgruntimes[0:nmax][('dot','mean')].mean()/1e-6) #9
    line.append(avgruntimes[0:nmax][('dot','mean')].min()/1e-6)  #10
    line.append(line[10] - avgruntimes['size'][0]*names['dot']/line[2]/1e-3) #11
    if line[11] <0 : line[11] = 0 
    ##now compute latency and bandwidths of dx and y
 
    for n in [2,3,4,5]:
        #take n
        dxdy=runtimes[runtimes['n']==n]
        
        avgdxdy = avgruntimes[avgruntimes['n']==n]
        dxdy=dxdy.sort_values(by='size')
        avgdxdy=avgdxdy.sort_values(by='size') #sort by size

        Nx = 767 # compute among the four greatest sizes
        bw = dxdy[dxdy['Nx']>=Nx]['dxdy_bw'].mean()
        #if v[0] == 'gtx1060' and n > 3: bw = dxdy[(dxdy['Nx']>=Nx) & (dxdy['Nx']<2048)]['dxdy_bw'].mean()
        line.append(dxdy[(dxdy['size']>10)&(dxdy['size']<400)]['dxdy_bw'].mean())
        line.append(dxdy[(dxdy['size']>10)&(dxdy['size']<400)]['dxdy_bw'].std())   
        line.append(avgdxdy[0:nmax][('dxdy','mean')].mean()/1e-6)
        line.append(avgdxdy[('dxdy','mean')].min()/1e-6)
        line.append(avgdxdy[('dxdy','mean')].min()/1e-6 - avgdxdy['size'].loc[avgdxdy[('dxdy','mean')].idxmin()]*names['dx']/line[2]/1e-3)
        if line[len(line)-1] <0 : line[len(line)-1] = 0        
    #print(line)    
    values.append(line)

In [4]:
#now construct new table with values from previous cell      
tuples=[('arch','',''),('nodes','','')]        
for q in ['axpby','dot','dxdy2','dxdy3','dxdy4','dxdy5']:
    tuples.append((q,'bw','avg'))
    tuples.append((q,'bw','std'))
    tuples.append((q,'lat','avg'))
    tuples.append((q,'lat','min'))
    tuples.append((q,'lat','bw'))

cols=pd.MultiIndex.from_tuples(tuples)
arr = pd.DataFrame(values,index=files.keys(), columns=cols)
arr.sort_values(by='arch',inplace=True)
arr.set_index(['arch','nodes'],inplace=True)
#arr.loc[:,[('dot','bw','avg'),('dot','lat','avg')]]
arr

Unnamed: 0_level_0,Unnamed: 1_level_0,axpby,axpby,axpby,axpby,axpby,dot,dot,dot,dot,dot,...,dxdy4,dxdy4,dxdy4,dxdy4,dxdy4,dxdy5,dxdy5,dxdy5,dxdy5,dxdy5
Unnamed: 0_level_1,Unnamed: 1_level_1,bw,bw,lat,lat,lat,bw,bw,lat,lat,lat,...,bw,bw,lat,lat,lat,bw,bw,lat,lat,lat
Unnamed: 0_level_2,Unnamed: 1_level_2,avg,std,avg,min,bw,avg,std,avg,min,bw,...,avg,std,avg,min,bw,avg,std,avg,min,bw
arch,nodes,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
gtx1060,1,157.0,0.056,23.2,3.5,0.0,26.5,0.098,199.9,131.6,124.9,...,89.9,2.5,322.6,72.5,32.4,80.0,0.5,571.4,125.0,62.5
i5,1,30.0,0.19,30.6,12.4,0.0,9.3,0.037,316.7,117.4,82.5,...,25.8,1.5,1212.1,208.3,0.0,22.1,1.8,1956.6,340.4,12.6
knl,1,435.1,14.0,11.4,10.0,5.9,145.0,12.0,74.3,61.8,59.0,...,134.9,17.0,217.3,49.9,33.6,112.9,16.1,427.6,78.5,52.9
knl,2,420.0,37.0,11.0,10.1,8.0,128.7,5.8,99.9,90.9,89.5,...,97.2,13.1,231.6,110.6,102.2,80.7,9.3,418.1,157.2,144.0
knl,4,402.9,50.0,10.8,10.5,9.4,112.6,6.0,126.0,123.5,122.8,...,90.4,9.2,223.4,157.9,153.4,71.9,10.3,326.7,185.8,178.9
p100,1,552.8,1.0,4.1,3.1,0.2,345.8,1.9,61.1,56.4,54.5,...,199.7,1.8,148.6,30.1,18.8,170.5,2.3,273.0,62.7,45.0
p100,2,554.2,0.11,3.3,3.0,1.6,339.8,2.9,53.5,49.6,48.7,...,184.7,13.0,128.1,66.6,60.9,157.0,11.5,191.4,80.0,71.1
p100,4,554.7,0.89,3.2,3.1,2.4,324.3,8.4,54.0,48.7,48.2,...,166.4,19.0,131.5,91.5,88.7,144.5,20.2,166.8,107.2,102.8
skl,1,207.5,3.2,4.6,4.0,0.0,194.1,5.8,32.3,23.1,16.7,...,117.2,22.9,390.1,85.9,47.6,116.7,4.7,436.0,134.7,74.8
skl,2,216.4,4.8,10.5,4.0,0.0,183.6,7.8,54.5,25.3,22.3,...,118.9,20.6,163.0,71.3,52.9,113.3,6.1,257.8,96.3,67.5


In [5]:
#arr=arr.reset_index()

In [8]:
#define conversion function 
def toString(x): 
    if pd.isnull(x) : return 'n/a'
    #string = '%.1f'% x
    string = '%d' %np.ceil(x)
    #if np.ceil(x)<100 : string = '0'+string
    if np.ceil(x)<10 : string = '0'+string
    return string

In [9]:
addto = []
for n in ['axpby','dot','dxdy2','dxdy3','dxdy4','dxdy5']:
    arr.loc[:,(n,'bw','string')]= arr[n]['bw']['avg'].apply(toString) +" ± "+arr[n]['bw']['std'].apply(toString)
    addto.append((n,'lat','bw'))
    addto.append((n,'bw','string'))

#make a table for display
nicetable=arr[addto]
drop = nicetable.columns.droplevel(2)
nicetable.columns=drop
#nicetable.reset_index(inplace=True)
#nicetable.set_index('arch')
newindex=[('i5',1)]
for n in ['skl','knl']:
    for m in [1,2,4]:
        newindex.append((n,m))
newindex.append(('gtx1060',1))
for n in ['p100','v100']:
    for m in [1,2,4]:
        newindex.append((n,m))
    
nicetable=nicetable.reindex(newindex)

nicetable

Unnamed: 0_level_0,Unnamed: 1_level_0,axpby,axpby,dot,dot,dxdy2,dxdy2,dxdy3,dxdy3,dxdy4,dxdy4,dxdy5,dxdy5
Unnamed: 0_level_1,Unnamed: 1_level_1,lat,bw,lat,bw,lat,bw,lat,bw,lat,bw,lat,bw
arch,nodes,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
i5,1,0.0,30 ± 01,82.5,10 ± 01,0.0,28 ± 03,0.0,30 ± 03,0.0,26 ± 02,12.6,23 ± 02
skl,1,0.0,208 ± 04,16.7,195 ± 06,28.4,182 ± 35,26.5,162 ± 13,47.6,118 ± 23,74.8,117 ± 05
skl,2,0.0,217 ± 05,22.3,184 ± 08,36.9,171 ± 45,40.3,156 ± 15,52.9,119 ± 21,67.5,114 ± 07
skl,4,1.8,234 ± 10,35.7,172 ± 14,38.5,163 ± 44,39.5,152 ± 23,46.2,115 ± 20,57.1,110 ± 07
knl,1,5.9,436 ± 15,59.0,145 ± 12,12.3,256 ± 15,20.9,182 ± 29,33.6,135 ± 17,52.9,113 ± 17
knl,2,8.0,420 ± 38,89.5,129 ± 06,70.6,155 ± 24,87.1,119 ± 16,102.2,98 ± 14,144.0,81 ± 10
knl,4,9.4,403 ± 50,122.8,113 ± 07,120.9,128 ± 18,136.9,105 ± 25,153.4,91 ± 10,178.9,72 ± 11
gtx1060,1,0.0,158 ± 01,124.9,27 ± 01,1.7,131 ± 01,11.9,112 ± 02,32.4,90 ± 03,62.5,80 ± 01
p100,1,0.2,553 ± 02,54.5,346 ± 02,4.5,288 ± 03,9.3,238 ± 04,18.8,200 ± 02,45.0,171 ± 03
p100,2,1.6,555 ± 01,48.7,340 ± 03,47.9,247 ± 21,50.5,216 ± 15,60.9,185 ± 13,71.1,158 ± 12


#### Assumptions
- there are three basic functions: trivially parallel(axpby), nearest neighbor (dxdy), global reduction (dot)
- each can be represented by the single node bandwidth, the single node latency and the multinode latency

#### But
- does not capture cache effect e.g. in SKl

In [10]:
index = ['i5','skl','knl','gtx1060','p100','v100']  
lines = []
for arch in  index: 
    line = []
    line.append(arch)
    #first the bandwidths
    line.append( arr.loc[(arch,1),('axpby','bw','avg')] )
    for n in ['dot','dxdy2','dxdy3','dxdy4','dxdy5']:
        line.append( arr.loc[(arch,1),(n,'bw','avg')] /line[1])
    for n in ['axpby','dot','dxdy2'] :
        line.append( arr.loc[(arch,1),(n,'lat','bw')] )
        if arch == 'i5' or arch == 'gtx1060':
            line.append(None)
        else:
            line.append( arr.loc[(arch,4),(n,'lat','bw')] )
    lines.append(line)
    
tuples=['arch']     

for n in ['axpby','dot','dxdy2','dxdy3','dxdy4','dxdy5']:
    tuples.append(n+'_bw')
for n in ['axpby','dot','dxdy']:
    tuples.append(n+'_lat_shared')
    tuples.append(n+'_lat_dist')
cols=tuples
toDisk = pd.DataFrame(lines, columns=cols)
toDisk.to_csv('performance.csv',sep=' ',index=False)

In [11]:
pd.set_option('precision',2)
test = pd.read_csv('performance.csv',delimiter=' ')
test

Unnamed: 0,arch,axpby_bw,dot_bw,dxdy2_bw,dxdy3_bw,dxdy4_bw,dxdy5_bw,axpby_lat_shared,axpby_lat_dist,dot_lat_shared,dot_lat_dist,dxdy_lat_shared,dxdy_lat_dist
0,i5,29.99,0.31,0.93,0.97,0.86,0.74,0.0,,82.46,,0.0,
1,skl,207.52,0.94,0.87,0.78,0.56,0.56,0.0,1.84,16.69,35.68,28.38,38.49
2,knl,435.07,0.33,0.59,0.42,0.31,0.26,5.94,9.37,59.05,122.8,12.26,120.88
3,gtx1060,157.05,0.17,0.83,0.71,0.57,0.51,0.0,,124.95,,1.72,
4,p100,552.83,0.63,0.52,0.43,0.36,0.31,0.25,2.42,54.49,48.2,4.54,83.78
5,v100,845.61,0.7,0.95,0.85,0.78,0.67,1.21,2.8,34.48,37.09,3.12,84.97


#### Observations
- note the high latency in the knl MPI implementation of dxdy. It seems to suffer from the same problem as the GPUs. (Is this the speed of PCIe we see?)

In [22]:
index = ['i5','skl','knl','gtx1060','p100','v100']  
#theo = [38,None,None,192,732,898]
lines = []
for arch in  index: 
    line = []
    #line.append(arch)
    #first the bandwidths
    base_bw = arr.loc[(arch,1), ('axpby','bw','avg')]
    err_bw = arr.loc[(arch,1), ('axpby','bw','std')]
    
    line.append( toString(base_bw)+" $\pm$ "+toString(err_bw) )
    line.append( toString(arr.loc[(arch,1),('axpby','lat','bw')]) )
    if arch == 'i5' or arch == 'gtx1060':
        line.append(toString(None))
    else:
        line.append( toString(arr.loc[(arch,4),('axpby','lat','bw')]) )
        
    for n in ['dot','dxdy2','dxdy3','dxdy4','dxdy5']:
        new_bw = arr.loc[(arch,1),(n,'bw','avg')]
        new_err = arr.loc[(arch,1),(n,'bw','std')]
        #efficiency
        #line.append( toString( new_bw/base_bw*100)+" $\pm$ "+toString(100*(err_bw/new_bw+new_bw/base_bw/base_bw*new_err))) 
        #bandwidth
        line.append( toString( new_bw)+" $\pm$ "+toString(err_bw))
        line.append( toString(arr.loc[(arch,1),(n,'lat','bw')]) )
        if arch == 'i5' or arch == 'gtx1060':
            line.append(toString(None))
        else:
            if n == 'dot':
                line.append( toString(arr.loc[(arch,4),(n,'lat','bw')]) )
            else:
                line.append(toString(arr.loc[(arch,4),('dxdy2','lat','bw')]))
                
    lines.append(line)
    
tuples=[]  


for p in ['axpby','dot','dxdy (n=2)','dxdy (n=3)','dxdy (n=4)','dxdy (n=5)']:
    #for q in ['efficiency [\% bw]','lat s [us]','lat d [us]']:
    for q in ['bandwidth [GB/s]','lat s [us]','lat d [us]']:
        tuples.append((p,q))
tuples[0] = ('axpby','bandwidth [GB/s]')
    

cols=pd.MultiIndex.from_tuples(tuples)

toDisk = pd.DataFrame(lines, index=index, columns=cols)
#toDisk.insert(0,('theo','[GB/s]'),theo)
filename='axpby-dot.tex'
with open(filename, 'wb') as f:
    f.write(bytes(toDisk.iloc[:,0:6].to_latex(escape=False),'UTF-8'))
toDisk.iloc[:,0:6]

Unnamed: 0_level_0,axpby,axpby,axpby,dot,dot,dot
Unnamed: 0_level_1,bandwidth [GB/s],lat s [us],lat d [us],bandwidth [GB/s],lat s [us],lat d [us]
i5,30 $\pm$ 01,0,,10 $\pm$ 01,83,
skl,208 $\pm$ 04,0,2.0,195 $\pm$ 04,17,36.0
knl,436 $\pm$ 15,6,10.0,145 $\pm$ 15,60,123.0
gtx1060,158 $\pm$ 01,0,,27 $\pm$ 01,125,
p100,553 $\pm$ 02,1,3.0,346 $\pm$ 02,55,49.0
v100,846 $\pm$ 01,2,3.0,594 $\pm$ 01,35,38.0


In [21]:
dxdy = toDisk.loc[:,[('dxdy (n=2)','bandwidth [GB/s]'),
                     ('dxdy (n=3)','bandwidth [GB/s]'),
                     ('dxdy (n=4)','bandwidth [GB/s]'),
                     ('dxdy (n=5)','bandwidth [GB/s]'),
                     ('dxdy (n=2)','lat s [us]'),
                     ('dxdy (n=2)','lat d [us]'),
                    ]]
filename='dxdy.tex'
with open(filename, 'wb') as f:
    f.write(bytes(dxdy.to_latex(escape=False),'UTF-8'))
dxdy

Unnamed: 0_level_0,dxdy (n=2),dxdy (n=3),dxdy (n=4),dxdy (n=5),dxdy (n=2),dxdy (n=2)
Unnamed: 0_level_1,bandwidth [GB/s],bandwidth [GB/s],bandwidth [GB/s],bandwidth [GB/s],lat s [us],lat d [us]
i5,28 $\pm$ 01,30 $\pm$ 01,26 $\pm$ 01,23 $\pm$ 01,0,
skl,182 $\pm$ 04,162 $\pm$ 04,118 $\pm$ 04,117 $\pm$ 04,29,39.0
knl,256 $\pm$ 15,182 $\pm$ 15,135 $\pm$ 15,113 $\pm$ 15,13,121.0
gtx1060,131 $\pm$ 01,112 $\pm$ 01,90 $\pm$ 01,80 $\pm$ 01,2,
p100,288 $\pm$ 02,238 $\pm$ 02,200 $\pm$ 02,171 $\pm$ 02,5,84.0
v100,803 $\pm$ 01,719 $\pm$ 01,659 $\pm$ 01,566 $\pm$ 01,4,85.0
