# Preamble

In [2]:
from corals.threads import set_threads_for_external_libraries as set_threads
# Called with 1 before NumPy is imported in this notebook.
# NOTE(review): presumably this caps the thread pools of external numerical
# libraries (BLAS/OpenMP) so the timings below are single-threaded — confirm
# against corals.threads.
set_threads(1)

In [3]:
%load_ext watermark

In [4]:
import numpy as np

In [5]:
np.show_config()

blas_info:
    libraries = ['cblas', 'blas', 'cblas', 'blas']
    library_dirs = ['/home/mgbckr/miniconda3/envs/nalab-fastcor/lib']
    include_dirs = ['/home/mgbckr/miniconda3/envs/nalab-fastcor/include']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
blas_opt_info:
    define_macros = [('NO_ATLAS_INFO', 1), ('HAVE_CBLAS', None)]
    libraries = ['cblas', 'blas', 'cblas', 'blas']
    library_dirs = ['/home/mgbckr/miniconda3/envs/nalab-fastcor/lib']
    include_dirs = ['/home/mgbckr/miniconda3/envs/nalab-fastcor/include']
    language = c
lapack_info:
    libraries = ['lapack', 'blas', 'lapack', 'blas']
    library_dirs = ['/home/mgbckr/miniconda3/envs/nalab-fastcor/lib']
    language = f77
lapack_opt_info:
    libraries = ['lapack', 'blas', 'lapack', 'blas', 'cblas', 'blas', 'cblas', 'blas']
    library_dirs = ['/home/mgbckr/miniconda3/envs/nalab-fastcor/lib']
    language = c
    define_macros = [('NO_ATLAS_INFO', 1), ('HAVE_CBLAS', None)]
    include_dirs = ['/home/mgbc

# Define data

In [31]:
# Problem size: every base matrix has m rows (samples) and n columns.
# NOTE(review): no random seed is set, so the values differ from run to run.
n = 20000
m = 1000
# D1: random (m, n) matrix in NumPy's default layout (C-contiguous, see D1.flags).
D1 = np.random.random((m, n))
# D1t: transposed *copy* of D1, shape (n, m), living in its own buffer.
# NOTE(review): np.array defaults to order="K", so this copy presumably keeps
# the Fortran layout of the transposed view — confirm with D1t.flags.
D1t = np.array(D1.transpose())
# D2 / D2t: a second, independent random matrix and its transposed copy.
D2 = np.random.random((m, n))
D2t = np.array(D2.transpose())
# D3: random (m, n) matrix explicitly stored in Fortran (column-major) order.
D3 = np.array(np.random.random((m, n)), order="F")
# D3t: transposed copy of D3, shape (n, m), also forced to Fortran order.
D3t = np.array(D3.transpose(), order="F")

In [32]:
D1.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [33]:
D3.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

# Compare transpose vs copy

In [34]:
%%timeit
# Transposed copy times the original: D1t holds D1's transposed values in a separate buffer.
np.matmul(D1t, D1)

15.9 s ± 72.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
%%timeit
# Transposed view of the same matrix (no copy involved).
# Note: I expected this to be faster than for copied data due to syrk (https://github.com/numpy/numpy/blob/main/numpy/core/src/umath/matmul.c.src#L114)
# and
# * it actually is as `m` increases (`m=1000`),
# * but for lower `m` (`m=100`) there seems to be no / only a marginal difference;
# * interestingly, for very low `m` (`m=10`) this relationship reverses.
np.matmul(D1.transpose(), D1)

8.88 s ± 128 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
%%timeit
# Transposed copy of D1 times a different matrix D2 (operands share no data).
np.matmul(D1t, D2)

15.7 s ± 282 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [37]:
%%timeit
# Transposed view of D1 times a different matrix D2 (operands share no data).
np.matmul(D1.transpose(), D2)

15.8 s ± 104 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Row-major vs. column-major order

In [38]:
%%timeit
# Transposed copy times the original, both built from the Fortran-ordered D3.
# Note: there seems to be no difference for different element orders
np.matmul(D3t, D3)

15.5 s ± 294 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
%%timeit
# Transposed view of the Fortran-ordered D3 times D3 itself.
# Note: there seems to be no difference for different element orders
np.matmul(D3.transpose(), D3)

9.02 s ± 64.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
%%timeit
# Mixed orders: transposed copy of D3 (Fortran order) times D1 (C order).
# Note: not much of a difference compared to the regular order
np.matmul(D3t, D1)

15.8 s ± 124 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [41]:
%%timeit
# Mixed orders: transposed copy of D1 times D3 (Fortran order).
# Note: this seems to do best among the copied-matrix variants, but not by much; it could be an artifact, too.
np.matmul(D1t, D3)

15.1 s ± 107 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Summary

* Multiplying a matrix by its own transposed view seems to speed things up by a factor of about `2` if the number of samples `m` is large enough, but is slower if the number of samples is too small. The number of samples `m` needed to observe this effect seems to depend on the number of columns `n` (try `n=2000` and `n=10`).
* The transpose relationship is not detected when the transposed data is a copy; only a transposed view of the same matrix gets the speed-up.
* Row-major vs. column-major storage order does not seem to make a significant runtime difference.

# Watermark

In [42]:
%watermark

Last updated: 2022-08-16T13:35:48.375552-07:00

Python implementation: CPython
Python version       : 3.9.1
IPython version      : 7.24.1

Compiler    : GCC 9.3.0
OS          : Linux
Release     : 5.15.0-46-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 12
Architecture: 64bit



In [43]:
%watermark --iversions

numpy: 1.20.3
sys  : 3.9.1 | packaged by conda-forge | (default, Jan 26 2021, 01:34:10) 
[GCC 9.3.0]

