In [1]:
import numpy as np
import psutil

In [2]:
# Function to print memory usage
def print_memory_usage():
    process = psutil.Process()
    mem_info = process.memory_info()
    print(f"Memory Usage: {mem_info.rss / (1024 ** 2):.2f} MB")


In [None]:
# Multiply two large matrices
# FAILED if matrices too large and memory is not enough

# A = np.random.rand(100000, 50)
# B = np.random.rand(50, 100000)

# # Standard multiplication
# C = np.dot(A, B)  # Or A @ B

# # Print the shape of the result
# print("Shape of C:", C.shape)

# # Optionally, print a small part of the matrix
# print("First 5x5 block of C:\n", C[:5, :5])

In [None]:
# Use numpy.memmap for Disk-Based Computation
# PERFECT METHOD FOR LOW MEMORY SYSTEMS
import numpy as np

print("Memory usage before multiplication:")
print_memory_usage()

A = np.memmap('A.dat', dtype='float32', mode='w+', shape=(100000, 1))
B = np.memmap('B.dat', dtype='float32', mode='w+', shape=(1, 100000))
C = np.memmap('C.dat', dtype='float32', mode='w+', shape=(100000, 100000))

# Fill A and B with random values (example)
A[:] = np.random.rand(100000, 1)
B[:] = np.random.rand(1, 100000)

# Multiply in chunks
for i in range(A.shape[0]):
    C[i, :] = A[i, :] @ B  # Process row-by-row

# Check memory usage after multiplication
print("Memory usage after multiplication:")
print_memory_usage()

# # Print the shape of the result
print("Shape of C:", C.shape)

# # Optionally, print a small part of the matrix
print("First 5x5 block of C:\n", C[:5, :5])

Memory usage (init):
Memory Usage: 99.39 MB
Memory usage before multiplication:
Memory Usage: 99.41 MB
Memory usage after multiplication:
Memory Usage: 3471.30 MB
Shape of C: (100000, 100000)
First 5x5 block of C:
 [[0.20294341 0.31546688 0.68615556 0.152865   0.65581053]
 [0.04588334 0.07132371 0.15513247 0.03456115 0.14827177]
 [0.08352435 0.12983505 0.28239745 0.06291384 0.2699085 ]
 [0.01826654 0.02839456 0.06175954 0.01375908 0.05902824]
 [0.07345313 0.11417976 0.24834643 0.0553278  0.23736338]]


In [4]:
print("Memory usage after multiplication:")
print_memory_usage()

Memory usage after multiplication:
Memory Usage: 1887.77 MB


In [3]:
# Matrix Multiplication with MPS

import torch

# Move tensors to GPU (MPS)
device = torch.device("mps")  # Use "cpu" if no GPU is available

A = torch.rand(100000, 50, device=device)
B = torch.rand(50, 100000, device=device)

C = torch.matmul(A, B)  # GPU-accelerated matrix multiplication


RuntimeError: Invalid buffer size: 37.25 GB

In [2]:
!pip install cupy

Collecting cupy
  Downloading cupy-13.3.0.tar.gz (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[3 lines of output][0m
  [31m   [0m Generating cache key from header files...
  [31m   [0m Cache key (1610 files matching /private/var/folders/yr/qjfqfcxs79b91s4n4v1zp0b80000gn/T/pip-install-0wnzsiin/cupy_0de7c0e6b02d427da42326562bb7297e/cupy/_core/include/**): 5a9d2f850421ee67b5b715210ae3691771282212
  [31m   [0m Error: macOS is no longer supported
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[?25h[1;31merror[0m: [1mmetadata-generation-failed[0m

[

In [None]:
# Use GPU Acceleration (cupy or torch.cuda)
# If you have a GPU, cupy or torch.cuda can handle large matrices efficiently.
# CANNOT BE USED IN MACOSX

import cupy as cp

A = cp.random.rand(100000, 50)
B = cp.random.rand(50, 100000)

C = A @ B  # Performed on GPU


ModuleNotFoundError: No module named 'cupy'

In [4]:
# Use scipy.sparse for Sparse Matrices
# If your matrices contain many zeros, use scipy.sparse:

import numpy as np
from scipy.sparse import csr_matrix

A = csr_matrix(np.random.rand(100000, 50))
B = csr_matrix(np.random.rand(50, 100000))

C = A @ B  # Sparse multiplication, saving memory


: 

In [2]:
!pip install dask

Collecting dask
  Downloading dask-2024.8.0-py3-none-any.whl.metadata (3.8 kB)
Collecting cloudpickle>=1.5.0 (from dask)
  Using cached cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting partd>=1.4.0 (from dask)
  Downloading partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting toolz>=0.10.0 (from dask)
  Downloading toolz-1.0.0-py3-none-any.whl.metadata (5.1 kB)
Collecting locket (from partd>=1.4.0->dask)
  Downloading locket-1.0.0-py2.py3-none-any.whl.metadata (2.8 kB)
Downloading dask-2024.8.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached cloudpickle-3.1.1-py3-none-any.whl (20 kB)
Downloading partd-1.4.2-py3-none-any.whl (18 kB)
Downloading toolz-1.0.0-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.4/56.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading locket-1.0.0-py2.py3-no

In [None]:
# Use dask for Out-of-Core Computation
# dask.array allows parallel and chunk-based computations.

import dask.array as da

A = da.random.random((100000, 50), chunks=(1000, 50))
B = da.random.random((50, 100000), chunks=(50, 1000))

C = A @ B  # Computation is lazy, not in memory yet
C.compute()  # Compute when needed


  out = blockwise(


: 

In [1]:
!pip install jax

Collecting jax
  Downloading jax-0.4.30-py3-none-any.whl.metadata (22 kB)
Collecting jaxlib<=0.4.30,>=0.4.27 (from jax)
  Downloading jaxlib-0.4.30-cp39-cp39-macosx_11_0_arm64.whl.metadata (1.0 kB)
Downloading jax-0.4.30-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading jaxlib-0.4.30-cp39-cp39-macosx_11_0_arm64.whl (66.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: jaxlib, jax
Successfully installed jax-0.4.30 jaxlib-0.4.30


In [7]:
import jax.numpy as jnp

A = jnp.ones((100000, 10))
B = jnp.ones((10, 100000))

C = jnp.dot(A, B)  # Automatically optimized

# # Print the shape of the result
print("Shape of C:", C.shape)

# # Optionally, print a small part of the matrix
print("First 5x5 block of C:\n", C[:5, :5])

: 

In [2]:
import tensorflow as tf

A = tf.random.normal([100000, 10])
B = tf.random.normal([10, 100000])

C = tf.linalg.matmul(A, B)  # GPU-accelerated

# # Print the shape of the result
print("Shape of C:", C.shape)

# # Optionally, print a small part of the matrix
print("First 5x5 block of C:\n", C[:5, :5])

: 

In [None]:
import pyopencl as cl
import numpy as np

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

A = np.random.rand(10000, 5000).astype(np.float32)
B = np.random.rand(5000, 8000).astype(np.float32)

# Transfer data to the GPU
A_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=A)
B_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=B)
C_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, A.nbytes * B.shape[1])

# OpenCL kernel for matrix multiplication (requires setup)
# You'd need to write a kernel function to compute the matrix product.

