In [5]:
import numpy as np
import psutil

# Function to print memory usage
def print_memory_usage():
    """Print the resident set size (RSS) of the current process in MB.

    The value is read through ``psutil.Process().memory_info()`` and
    converted from bytes using 1 MB = 1024 ** 2 bytes, then printed with
    two decimal places.
    """
    bytes_per_mb = 1024 ** 2
    rss_bytes = psutil.Process().memory_info().rss
    print(f"Memory Usage: {rss_bytes / bytes_per_mb:.2f} MB")


In [6]:
# Baseline: dense NumPy product of a column vector and a row vector.
# FAILED if matrices too large and memory is not enough
# (the 100000 x 100000 float64 result alone holds 8e10 bytes, about 74.5 GiB).

A = np.random.rand(100000, 1)
B = np.random.rand(1, 100000)

# Standard multiplication; for 2-D operands `A @ B` is the same product as np.dot(A, B)
C = A @ B

# Report the dimensions of the result
print("Shape of C:", C.shape)

# Show only the top-left corner rather than the whole matrix
print("First 5x5 block of C:\n", C[:5, :5])

: 

In [9]:
# Use numpy.memmap for Disk-Based Computation
# PERFECT METHOD FOR LOW MEMORY SYSTEMS
# A, B and C are backed by files on disk; only the row block that is being
# multiplied right now is produced as an ordinary in-memory array.
# NOTE: C.dat holds 100000 x 100000 float32 values, i.e. 4e10 bytes (~37 GiB) of disk.
import numpy as np

print("Memory usage before multiplication:")
print_memory_usage()

A = np.memmap('A.dat', dtype='float32', mode='w+', shape=(100000, 1))
B = np.memmap('B.dat', dtype='float32', mode='w+', shape=(1, 100000))
C = np.memmap('C.dat', dtype='float32', mode='w+', shape=(100000, 100000))

# Fill A and B with random values (example)
A[:] = np.random.rand(100000, 1)
B[:] = np.random.rand(1, 100000)

# Multiply in chunks of rows. One block of 256 rows is a temporary of
# 256 x 100000 float32 values (about 100 MB), which keeps memory bounded while
# replacing 100000 single-row Python iterations with a few hundred matrix products.
ROWS_PER_CHUNK = 256
for start in range(0, A.shape[0], ROWS_PER_CHUNK):
    stop = min(start + ROWS_PER_CHUNK, A.shape[0])
    C[start:stop, :] = A[start:stop, :] @ B  # (rows, 1) @ (1, 100000)

# Write any changes still held in memory out to C.dat
C.flush()

# Check memory usage after multiplication
print("Memory usage after multiplication:")
print_memory_usage()

# Print the shape of the result
print("Shape of C:", C.shape)

# Print a small part of the matrix
print("First 5x5 block of C:\n", C[:5, :5])

Memory usage before multiplication:
Memory Usage: 2582.28 MB
Memory usage after multiplication:
Memory Usage: 3534.44 MB
Shape of C: (100000, 100000)
First 5x5 block of C:
 [[0.02896208 0.00425345 0.01149264 0.0286885  0.00942824]
 [0.44973823 0.06604978 0.17846373 0.44548994 0.14640653]
 [0.10924079 0.0160434  0.0433486  0.10820889 0.03556195]
 [0.590646   0.08674388 0.23437834 0.58506674 0.19227727]
 [0.19842781 0.02914165 0.07873951 0.19655344 0.06459564]]


In [3]:
# Matrix Multiplication with MPS

import torch

print("Memory usage before multiplication:")
print_memory_usage()

# Move tensors to GPU (MPS) when that backend is usable; otherwise fall back
# to the CPU automatically instead of requiring a manual edit of this line.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

A = torch.rand(100000, 1, device=device)
B = torch.rand(1, 100000, device=device)

# NOTE: the result has 100000 x 100000 float32 entries (4e10 bytes, about
# 37.25 GiB), so this line needs a device that can hold a buffer of that size.
C = torch.matmul(A, B)  # Matrix multiplication on the selected device

# Check memory usage after multiplication
print("Memory usage after multiplication:")
print_memory_usage()

# Print the shape of the result
print("Shape of C:", C.shape)

# Print a small part of the matrix
print("First 5x5 block of C:\n", C[:5, :5])

Memory usage before multiplication:
Memory Usage: 224.27 MB


RuntimeError: Invalid buffer size: 37.25 GB

In [3]:
# Install CuPy by shelling out to pip from the notebook.
# The recorded output of this cell shows the build stopping with
# "Error: macOS is no longer supported".
# NOTE(review): `%pip install ...` is usually preferred over `!pip install ...`
# because the magic targets the running kernel's environment — confirm and switch.
!pip install cupy

Collecting cupy
  Using cached cupy-13.3.0.tar.gz (3.4 MB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[3 lines of output][0m
  [31m   [0m Generating cache key from header files...
  [31m   [0m Cache key (1610 files matching /private/var/folders/yr/qjfqfcxs79b91s4n4v1zp0b80000gn/T/pip-install-e_73c8eh/cupy_cc40364dd651444d9dd3518dbe0e7d23/cupy/_core/include/**): 784b76636589cd680d6706d325ea744f1b389695
  [31m   [0m Error: macOS is no longer supported
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[?25h[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the 

In [None]:
# Use GPU Acceleration (cupy or torch.cuda)
# If you have a GPU, cupy or torch.cuda can handle large matrices efficiently.
# CANNOT BE USED IN MACOSX
# (the recorded run of this cell ended in ModuleNotFoundError: No module named 'cupy')

import cupy as cp

# Random operands created through CuPy's random module.
# NOTE(review): the inner dimension here is 50, unlike the 100000 x 1 by
# 1 x 100000 products used in the other cells of this notebook.
A = cp.random.rand(100000, 50)
B = cp.random.rand(50, 100000)

# The result is 100000 x 100000; presumably float64 (8e10 bytes) if CuPy follows
# NumPy's default dtype — TODO confirm it fits in the target GPU's memory.
C = A @ B  # Performed on GPU


ModuleNotFoundError: No module named 'cupy'

In [4]:
# Use scipy.sparse for Sparse Matrices
# If your matrices contain many zeros, use scipy.sparse:

import numpy as np
from scipy.sparse import csr_matrix

# Fraction of entries that stay non-zero in each operand.
# np.random.rand() alone never yields zeros, so wrapping its output in
# csr_matrix only adds index overhead and the product would store all
# 100000 x 100000 entries; the operands must actually be sparse to save memory.
DENSITY = 0.001

# Dense staging arrays in which roughly 99.9% of the entries are exactly zero:
# random values multiplied by a random boolean keep-mask.
A_dense = np.random.rand(100000, 50) * (np.random.rand(100000, 50) < DENSITY)
B_dense = np.random.rand(50, 100000) * (np.random.rand(50, 100000) < DENSITY)

# CSR keeps only the non-zero entries plus their indices
A = csr_matrix(A_dense)
B = csr_matrix(B_dense)

C = A @ B  # Sparse multiplication, saving memory


: 

In [2]:
# !pip install dask

In [10]:
# Use dask for Out-of-Core Computation
# dask.array allows parallel and chunk-based computations.

import dask.array as da

# Check memory usage before multiplication
print("Memory usage before multiplication:")
print_memory_usage()

A = da.random.random((100000, 1), chunks=(1000, 1))
B = da.random.random((1, 100000), chunks=(1, 1000))

C = A @ B  # Computation is lazy, not in memory yet

# Evaluate only the block that is displayed below. Calling C.compute() on the
# whole array returns all 100000 x 100000 float64 values (8e10 bytes) as a single
# in-memory NumPy array, which defeats the chunked approach, and that result was
# being discarded anyway. To keep the full product, stream it to disk (for
# example with dask.array.store or dask.array.to_zarr) instead.
first_block = C[:5, :5].compute()

# Check memory usage after multiplication
print("Memory usage after multiplication:")
print_memory_usage()

# Print the shape of the result (known without computing anything)
print("Shape of C:", C.shape)

# Print the computed values of a small part of the matrix
print("First 5x5 block of C:\n", first_block)

Memory usage before multiplication:
Memory Usage: 76.23 MB


  out = blockwise(


: 

In [1]:
# !pip install jax

In [4]:
import jax.numpy as jnp

# A column of ones and a row of ones
A = jnp.ones((100000, 1))
B = jnp.ones((1, 100000))

# 2-D matrix product, written with the operator form
C = A @ B

# Report the dimensions of the result
print("Shape of C:", C.shape)

# Show only the top-left corner of the matrix
print("First 5x5 block of C:\n", C[:5, :5])

: 

In [1]:
import tensorflow as tf

# Same problem size as the other cells in this notebook:
# (100000 x 1) times (1 x 100000). The row vector previously had 1000000
# columns (an extra zero), which made the result ten times larger.
A = tf.random.normal([100000, 1])
B = tf.random.normal([1, 100000])

# GPU-accelerated when TensorFlow can see a GPU — TODO confirm on this machine
C = tf.linalg.matmul(A, B)

# Print the shape of the result
print("Shape of C:", C.shape)

# Print a small part of the matrix
print("First 5x5 block of C:\n", C[:5, :5])

: 

In [None]:
import pyopencl as cl
import numpy as np

# Pick an OpenCL device/context and a command queue to submit work on
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# float32 host operands: (10000 x 5000) times (5000 x 8000)
A = np.random.rand(10000, 5000).astype(np.float32)
B = np.random.rand(5000, 8000).astype(np.float32)

# Transfer data to the GPU
A_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=A)
B_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=B)

# Output buffer for the product, sized in bytes: rows(A) x cols(B) elements of
# A's item size, i.e. 10000 * 8000 * 4 = 3.2e8 bytes. The earlier expression
# A.nbytes * B.shape[1] multiplied A's whole byte size by 8000 (1.6e12 bytes).
C_nbytes = A.shape[0] * B.shape[1] * A.dtype.itemsize
C_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, C_nbytes)

# OpenCL kernel for matrix multiplication (requires setup)
# You'd need to write a kernel function to compute the matrix product.

