In [35]:
import pandas as pd

In [36]:
# Per device: (name, cache size in kB, single-node configuration).
hardware = {
    'i5': (
        'Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz',
        6144,
        '1 MPI task x 4 OpenMP threads',
    ),
    'skl': (
        '2 x 24-cores Intel Xeon 8160 CPU (Skylake) at 2.10 GHz',
        2*33000,
        '2 MPI tasks (one on each socket) x 24 OpenMP threads (one for each core)',
    ),
    'knl': (
        '1 x 68-cores Intel Xeon Phi 7250 CPU (Knights Landing) at 1.40 GHz',
        34000,
        '1 MPI task x 136 OpenMP (2 hyperthreads)',
    ),
    'gtx1060': ('GeForce GTX 1060 6GB', 1572.864, '1 MPI task per GPU'),
    'p100': ('Tesla P100-PCIE-16GB', 4194.304, '1 MPI task per GPU'),
    'v100': ('Tesla V100-PCIE-16GB', 6291.456, '1 MPI task per GPU'),
}

# Memory specification per device (on Linux: find with 'dmidecode --type 17').
# Per device: (name, clock rate in MHz, bus width in bit, size in MB);
# None marks a value that is not known.
memory = {
    'i5': ('2x Kingston DDR4 8GB ', 2400, 64, 2*8192),
    'skl': ('DDR4', None, None, 92000),
    'knl': ('MCDRAM', None, None, 16000),
    'gtx1060': ('on-card global memory', 4004, 192, 6069),
    'p100': ('on-card global memory', 715, 4096, 16276),
    'v100': ('on-card global memory', 877, 4096, 16152),
}

The theoretical memory bandwidth is $$bw = 2 \cdot clockrate \cdot buswidth$$ where the factor 2 accounts for double data rate (DDR).

In [37]:
# The dict yields one column per device; flip it so each device becomes a row,
# then label the three tuple fields.
df = pd.DataFrame(hardware).T
df.columns = ['device-name', 'cache-size-kB', 'single-node configuration']

In [38]:
# Same layout for the memory specs: one row per device, one column per field.
mem = pd.DataFrame(memory).T
mem.columns = ['mem-description', 'clockrate-MHz', 'buswidth-bit', 'size-MB']
# Append the memory columns to the hardware table (aligned on the device key).
df = df.join(mem)
df

Unnamed: 0,device-name,cache-size-kB,single-node configuration,mem-description,clockrate-MHz,buswidth-bit,size-MB
gtx1060,GeForce GTX 1060 6GB,1572.86,1 MPI task per GPU,on-card global memory,4004.0,192.0,6069
i5,Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz,6144.0,1 MPI task x 4 OpenMP threads,2x Kingston DDR4 8GB,2400.0,64.0,16384
knl,1 x 68-cores Intel Xeon Phi 7250 CPU (Knights ...,34000.0,1 MPI task x 136 OpenMP (2 hyperthreads),MCDRAM,,,16000
p100,Tesla P100-PCIE-16GB,4194.3,1 MPI task per GPU,on-card global memory,715.0,4096.0,16276
skl,2 x 24-cores Intel Xeon 8160 CPU (Skylake) at ...,66000.0,2 MPI tasks (one on each socket) x 24 OpenMP t...,DDR4,,,92000
v100,Tesla V100-PCIE-16GB,6291.46,1 MPI task per GPU,on-card global memory,877.0,4096.0,16152


In [39]:
# Theoretical bandwidth in GB/s from the clock rate (MHz) and bus width (bit).
# The operations are applied strictly left to right.
df['bandwidth'] = (
    2 * df['clockrate-MHz'] * 1e6  # factor 2: double data rate; MHz -> Hz
    * df['buswidth-bit']           # bits per second
    / 8                            # bits -> bytes
    / 1e9                          # bytes -> GB
)
df

Unnamed: 0,device-name,cache-size-kB,single-node configuration,mem-description,clockrate-MHz,buswidth-bit,size-MB,bandwidth
gtx1060,GeForce GTX 1060 6GB,1572.86,1 MPI task per GPU,on-card global memory,4004.0,192.0,6069,192.192
i5,Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz,6144.0,1 MPI task x 4 OpenMP threads,2x Kingston DDR4 8GB,2400.0,64.0,16384,38.4
knl,1 x 68-cores Intel Xeon Phi 7250 CPU (Knights ...,34000.0,1 MPI task x 136 OpenMP (2 hyperthreads),MCDRAM,,,16000,
p100,Tesla P100-PCIE-16GB,4194.3,1 MPI task per GPU,on-card global memory,715.0,4096.0,16276,732.16
skl,2 x 24-cores Intel Xeon 8160 CPU (Skylake) at ...,66000.0,2 MPI tasks (one on each socket) x 24 OpenMP t...,DDR4,,,92000,
v100,Tesla V100-PCIE-16GB,6291.46,1 MPI task per GPU,on-card global memory,877.0,4096.0,16152,898.048


In [40]:
# Load the space-separated results and key them by architecture so that the
# rows line up with the device index of df.
exp = pd.read_csv('performance.csv', sep=' ')
exp = exp.set_index('arch')
exp.index.name = None  # drop the 'arch' label from the index
# Only the axpby bandwidth column is attached to the hardware table.
df = df.join(exp['axpby_bw'])

In [41]:
# Ratio of the axpby bandwidth (from performance.csv) to the theoretical
# bandwidth; rows without a theoretical value stay empty.
# Display precision is left at the default (disabled: pd.set_option('precision',2)).
df['mem_efficiency'] = df['axpby_bw'] / df['bandwidth']
df

Unnamed: 0,device-name,cache-size-kB,single-node configuration,mem-description,clockrate-MHz,buswidth-bit,size-MB,bandwidth,axpby_bw,mem_efficiency
gtx1060,GeForce GTX 1060 6GB,1572.86,1 MPI task per GPU,on-card global memory,4004.0,192.0,6069,192.192,157.047153,0.817137
i5,Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz,6144.0,1 MPI task x 4 OpenMP threads,2x Kingston DDR4 8GB,2400.0,64.0,16384,38.4,29.985115,0.780862
knl,1 x 68-cores Intel Xeon Phi 7250 CPU (Knights ...,34000.0,1 MPI task x 136 OpenMP (2 hyperthreads),MCDRAM,,,16000,,435.072979,
p100,Tesla P100-PCIE-16GB,4194.3,1 MPI task per GPU,on-card global memory,715.0,4096.0,16276,732.16,552.831323,0.755069
skl,2 x 24-cores Intel Xeon 8160 CPU (Skylake) at ...,66000.0,2 MPI tasks (one on each socket) x 24 OpenMP t...,DDR4,,,92000,,207.5187,
v100,Tesla V100-PCIE-16GB,6291.46,1 MPI task per GPU,on-card global memory,877.0,4096.0,16152,898.048,845.608887,0.941608


#### ToDo
- ask Marconi for the clock rate and bus width of the SKL DDR4 and the KNL MCDRAM
- maybe try a 4 * 17 configuration on Knights Landing -> done

In [44]:
# Export the hardware overview (device name, single-node configuration,
# theoretical bandwidth) as a LaTeX table in 'hardware.tex'.
#
# The selection is copied explicitly so that the edits below only affect the
# exported table. The previous version additionally wrote the string '$>$400'
# into df itself; that never reached the exported table and left a string in
# df's 'bandwidth' column, so it has been removed.
file = df.loc[:, ['device-name', 'single-node configuration', 'bandwidth']].copy()
# 'knl' has no computed bandwidth (its clock rate and bus width are not set),
# so the table shows the placeholder '>400' instead.
# NOTE(review): a bare '>' may not typeset as intended in LaTeX text mode;
# '$>$400' together with to_latex(escape=False) may be what was wanted - confirm.
file.loc['knl', 'bandwidth'] = '>400'
file.columns = ['device name', 'single-node configuration', 'bandwidth [GB/s]']
filename = 'hardware.tex'
# Raise the column-width limit so the long device names are shown in full.
pd.set_option('display.max_colwidth', 200)
with open(filename, 'wb') as f:
    f.write(file.to_latex().encode('utf-8'))
file

Unnamed: 0,device name,single-node configuration,bandwidth [GB/s]
gtx1060,GeForce GTX 1060 6GB,1 MPI task per GPU,192.192
i5,Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz,1 MPI task x 4 OpenMP threads,38.4
knl,1 x 68-cores Intel Xeon Phi 7250 CPU (Knights Landing) at 1.40 GHz,1 MPI task x 136 OpenMP (2 hyperthreads),>400
p100,Tesla P100-PCIE-16GB,1 MPI task per GPU,732.16
skl,2 x 24-cores Intel Xeon 8160 CPU (Skylake) at 2.10 GHz,2 MPI tasks (one on each socket) x 24 OpenMP threads (one for each core),
v100,Tesla V100-PCIE-16GB,1 MPI task per GPU,898.048
