# Start with hardware scan

### Find the number of CPU cores, main memory size, and cache sizes

In [2]:
import subprocess
import platform
import psutil
import cpuinfo

# macOS exposes hardware details through the `sysctl` command; every other
# platform falls back to psutil / py-cpuinfo.
if platform.system() == "Darwin":
    def sysctl(name, default="N/A"):
        """Return the value of sysctl key `name`, or `default` if it cannot be read.

        `sysctl -n` exits with a non-zero status for keys the machine does not
        provide, which would otherwise abort the whole cell with
        CalledProcessError.
        NOTE(review): hw.l3cachesize is reportedly missing on Apple Silicon -
        confirm on target hardware.
        """
        try:
            raw = subprocess.check_output(
                ["sysctl", "-n", name], stderr=subprocess.DEVNULL
            )
        except (subprocess.CalledProcessError, OSError):
            return default
        return raw.decode().strip() or default

    print("Physical cores :", sysctl("hw.physicalcpu"))
    print("Logical cores  :", sysctl("hw.logicalcpu"))

    # hw.memsize is reported in bytes; convert to decimal gigabytes (1e9 bytes).
    mem_bytes = sysctl("hw.memsize", default=None)
    total_ram_gb = round(int(mem_bytes) / 1e9, 2) if mem_bytes else "N/A"
    print("Total RAM (GB) :", total_ram_gb)

    print("L1d Cache (bytes):", sysctl("hw.l1dcachesize"))
    print("L1i Cache (bytes):", sysctl("hw.l1icachesize"))
    print("L2 Cache (bytes) :", sysctl("hw.l2cachesize"))
    print("L3 Cache (bytes) :", sysctl("hw.l3cachesize"))
else:
    # Windows / Linux fallback
    info = cpuinfo.get_cpu_info()
    print("CPU Model:", info.get("brand_raw", "N/A"))
    print("Physical cores :", psutil.cpu_count(logical=False))
    print("Logical cores  :", psutil.cpu_count(logical=True))
    # Total memory in decimal gigabytes (1e9 bytes).
    print("Total RAM (GB) :", round(psutil.virtual_memory().total / 1e9, 2))
    # Cache entries fall back to "N/A" when the key is absent from the info dict.
    print("L2 Cache:", info.get("l2_cache_size", "N/A"))
    print("L3 Cache:", info.get("l3_cache_size", "N/A"))

CPU Model: Intel(R) Core(TM) i5-10300H CPU @ 2.50GHz
Physical cores : 4
Logical cores  : 8
Total RAM (GB) : 17.01
L2 Cache: 65536
L3 Cache: N/A


### Find the number of CPU cores, main memory size, and cache sizes (Generic)

In [3]:
import cpuinfo
import psutil

# ---- Collect the readings ----
info = cpuinfo.get_cpu_info()
mem = psutil.virtual_memory()

report = [
    ("CPU Model:", info.get("brand_raw", "N/A")),
    # Core counts
    ("Physical cores:", psutil.cpu_count(logical=False)),
    ("Logical cores :", psutil.cpu_count(logical=True)),
    # Memory, in decimal gigabytes (1e9 bytes)
    ("Total RAM (GB):", round(mem.total / 1e9, 2)),
    # Cache sizes fall back to "N/A" when the key is absent from the info dict
    ("L2 Cache:", info.get("l2_cache_size", "N/A")),
    ("L3 Cache:", info.get("l3_cache_size", "N/A")),
]

# ---- Print one "label value" line per reading ----
for label, value in report:
    print(label, value)

CPU Model: Intel(R) Core(TM) i5-10300H CPU @ 2.50GHz
Physical cores: 4
Logical cores : 8
Total RAM (GB): 17.01
L2 Cache: 65536
L3 Cache: N/A


# Timing primitive operators

In [4]:
import numpy as np
import time

# Two independent operand vectors, one million uniform [0, 1) samples each.
x, y = (np.random.rand(1_000_000) for _ in range(2))

# Scalar operand for the array-with-scalar timings.
k = 0.2

In [5]:
# Show the memory-layout flags (contiguity, ownership, writeability) of the 1-D array x.
x.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

In [6]:
# Time the element-wise product of the two 1M-element arrays.
%timeit x * y

4.34 ms ± 1.44 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
# Time the element-wise quotient of the two 1M-element arrays.
%timeit x / y

4.24 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
# Time the element-wise sum of the two 1M-element arrays.
%timeit x + y

5.35 ms ± 1.53 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
# Time the element-wise difference of the two 1M-element arrays.
%timeit x - y

4.1 ms ± 672 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
# Time dividing every element of x by the Python float k.
%timeit x / k

3.57 ms ± 314 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
# Same scaling as x / k, expressed as a multiplication by the reciprocal of k;
# the scalar division 1/k is part of the timed statement.
%timeit x * (1/k)

3.31 ms ± 235 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
# Time multiplying every element of x by the Python float k.
%timeit x * k

3.15 ms ± 850 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
# ---- Performance Benchmark ----
def benchmark(size=10**7):
    """Time a single NumPy addition of two random vectors and print the result.

    Parameters
    ----------
    size : int, optional
        Number of elements in each operand. Defaults to 10**7.

    Returns
    -------
    float
        Elapsed time of the addition, in seconds.
    """
    a = np.random.rand(size)
    b = np.random.rand(size)

    # perf_counter() is the monotonic, highest-resolution clock intended for
    # measuring short intervals; time.time() can jump and may be coarse.
    start = time.perf_counter()
    # Bind the result so the output array is released after timing stops,
    # not inside the measured region.
    _result = a + b
    elapsed = time.perf_counter() - start

    # Whole millions keep the compact "10M" label; other sizes are printed in full.
    if size >= 10**6 and size % 10**6 == 0:
        label = f"{size // 10**6}M"
    else:
        label = f"{size:,}"
    print(f"Time for vector addition ({label} elements):", round(elapsed, 4), "seconds")
    return elapsed

if __name__ == "__main__":
    benchmark()

Time for vector addition (10M elements): 0.0379 seconds


# Matrix Slicing

In [14]:
# 20x20 matrix of uniform [0, 1) samples: report its shape, then display its layout flags.
X = np.random.random_sample((20, 20))
print(X.shape)
X.flags


(20, 20)


  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

In [15]:
# Take rows 3-4 and columns 2-9 of X, then report the window's shape and layout flags.
row_window = slice(3, 5)
col_window = slice(2, 10)
X_sliced = X[row_window, col_window]
print(X_sliced.shape)
X_sliced.flags

(2, 8)


  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

In [16]:
# Convert once and reuse the result, so the printed shape and the displayed
# flags describe the same Fortran-ordered (column-major) array.
X_sliced_f = np.asfortranarray(X_sliced)
print(X_sliced_f.shape)
X_sliced_f.flags

(2, 8)


  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

In [17]:
# Convert once and reuse the result, so the printed shape and the displayed
# flags describe the same C-ordered (row-major) array.
X_sliced_c = np.ascontiguousarray(X_sliced)
print(X_sliced_c.shape)
X_sliced_c.flags

(2, 8)


  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

# Creating complex scalars

In [18]:
# Time building a complex scalar through the complex(real, imag) constructor call.
%timeit x = complex(8.1, 0.3)

186 ns ± 32 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [19]:
# Time building the same complex value from literals and arithmetic instead of a constructor call.
# NOTE(review): an all-literal expression is presumably folded into a constant at compile time,
# which would account for the much lower timing - confirm with `dis`.
%timeit x = 8.1 + 1j*0.3

25 ns ± 9.46 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


# Using efficient numerical expression functions

In [21]:
import numexpr as ne
import numpy as np
ne.nthreads          # current number of threads
ne.detect_number_of_cores()
ne.set_num_threads(8)   # choose a value


a = np.arange(7E6)
b = np.arange(7E6)

%timeit a**2 + b**2 + 2*a*b
%timeit ne.evaluate("a**2 + b**2 + 2*a*b")

%timeit a**10 + a**7 + a**2*b**3
%timeit ne.evaluate("a**10 + a**7 + a**2*b**3")

175 ms ± 30.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
26.2 ms ± 2.83 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
966 ms ± 318 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
33.8 ms ± 3.62 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
import numpy as np
from numba import njit, prange, set_num_threads, get_num_threads

set_num_threads(8)      # set number of threads
print(get_num_threads())

a = np.arange(7_000_000, dtype=np.float64)
b = np.arange(7_000_000, dtype=np.float64)

@njit(parallel=True)
def expr1(a, b):
    out = np.empty_like(a)
    for i in prange(a.size):
        out[i] = a[i]**2 + b[i]**2 + 2*a[i]*b[i]
    return out

@njit(parallel=True)
def expr2(a, b):
    out = np.empty_like(a)
    for i in prange(a.size):
        out[i] = a[i]**10 + a[i]**7 + a[i]**2 * b[i]**3
    return out

# # ---- warm-up (compilation) ----
# _ = expr1(a, b)
# _ = expr2(a, b)

# ---- timing (like your numexpr example) ----
%timeit expr1(a, b)
%timeit expr2(a, b)


8
18.1 ms ± 1.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
21.7 ms ± 2.52 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Vectorization

In [46]:
import time
import numpy as np


def scalar(u, v):
    """Compute the dot product of `u` and `v` with an explicit Python loop.

    Parameters
    ----------
    u, v : indexable sequences
        Operands; `v` is indexed at every position in `range(len(u))`.

    Returns
    -------
    tuple
        `(R, dt)`: the accumulated sum of products and the elapsed time
        of the loop in seconds.
    """
    # perf_counter() is the monotonic, high-resolution clock intended for
    # interval timing; time.time() can jump and may be coarse.
    t0 = time.perf_counter()
    R = 0.0
    for n in range(len(u)):
        R += u[n] * v[n]
    dt = time.perf_counter() - t0
    return R, dt


def vectorized(u, v):
    """Compute the dot product of `u` and `v` with a single `np.dot` call.

    Parameters
    ----------
    u, v : numpy.ndarray
        Operands. `u` is transposed before the product, which leaves a 1-D
        array unchanged.

    Returns
    -------
    tuple
        `(R, dt)`: the result of `np.dot(u.T, v)` and the elapsed time of
        that call in seconds.
    """
    # perf_counter() is the monotonic, high-resolution clock intended for
    # interval timing; the call being measured may finish very quickly.
    t0 = time.perf_counter()
    R = np.dot(u.T, v)
    dt = time.perf_counter() - t0
    return R, dt


# ----------------------------------
# Testbench
# ----------------------------------
if __name__ == "__main__":

    # 10 million samples per vector.
    N = 10_000_000

    u = np.random.normal(size=N)
    v = np.random.normal(size=N)

    # Scalar version
    x_sum_scalar, dt_scalar = scalar(u, v)
    print('%s\nScalar\n%s' % ('='*79, '='*79))
    print('Time: %20.6f [s]' % dt_scalar)
    print('Sum:  %20.6f' % x_sum_scalar)

    # Vectorized version
    x_sum_vectorized, dt_vectorized = vectorized(u, v)
    print('%s\nVectorized\n%s' % ('='*79, '='*79))
    print('Time: %20.6f [s]' % dt_vectorized)
    print('Sum:  %20.6f' % x_sum_vectorized)

    # ----------------------------------
    # Speedup
    # ----------------------------------
    # A timer reading of exactly zero would make the ratio undefined; report
    # an infinite speedup instead of raising ZeroDivisionError.
    speedup = dt_scalar / dt_vectorized if dt_vectorized > 0 else float('inf')
    print('%s\nPerformance\n%s' % ('='*79, '='*79))
    print('Speedup (Scalar / Vectorized): %10.2f x' % speedup)


Scalar
Time:             1.194749 [s]
Sum:           2022.094122
Vectorized
Time:             0.001429 [s]
Sum:           2022.094122
Performance
Speedup (Scalar / Vectorized):     836.03 x
