In [1]:
import cProfile
import pstats
import subprocess
import sys

from memory_profiler import memory_usage
from torch.profiler import profile, ProfilerActivity

from cubes import get_cube

In [3]:
# Profile one complete run (cube construction plus solve) on the CPU, with
# memory profiling and shape recording switched on.
with profile(
    profile_memory=True,
    record_shapes=True,
    activities=[ProfilerActivity.CPU],
    # with_stack=True,
) as prof:
    cube = get_cube(20)
    solution = cube.solve()

# Optional: show memory allocation over time
# prof.export_memory_timeline("memory.html")

# Write the trace in Chrome Trace format (viewable in chrome://tracing or
# https://ui.perfetto.dev/)
prof.export_chrome_trace("trace.json")

# Print the profiler summary: ten rows, sorted by self CPU time
averages = prof.key_averages()
print(averages.table(sort_by="self_cpu_time_total", row_limit=10))

--------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                              Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
--------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             Solve        32.86%     443.637ms        32.86%     443.746ms     443.746ms     375.00 Kb    -187.50 Kb             1  
                                         aten::bmm        19.79%     267.149ms        40.58%     547.914ms       7.610ms     408.65 Mb     408.65 Mb            72  
                                      aten::select        16.53%     223.150ms        20.58%     277.859ms       0.842us           0 b           0 b        329931  
          

In [5]:
def test():
    """Workload handed to cProfile: build the cube via get_cube(50) and solve it."""
    cube = get_cube(50)
    cube.solve()


# Profile the workload and dump the raw statistics to the file "stats" in the
# current working directory, then load them back for reporting.
cProfile.run("test()", "stats")
p = pstats.Stats("stats")

# print_stats() writes the report itself and returns the Stats object, so it
# must not be wrapped in print() (that appends the object's repr to the
# report). The trailing ';' keeps the notebook from echoing the returned
# object as the cell's Out[] value.
p.sort_stats("time").print_stats(10);

Wed Jan 14 09:46:59 2026    stats

         28869 function calls (28651 primitive calls) in 17.378 seconds

   Ordered by: internal time
   List reduced from 435 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       62    4.217    0.068    4.217    0.068 {built-in method torch.einsum}
        5    2.906    0.581    2.906    0.581 {method 'coalesce' of 'torch._C.TensorBase' objects}
       96    2.126    0.022    2.126    0.022 {built-in method pyamg.amg_core.relaxation.gauss_seidel}
       77    1.529    0.020    1.529    0.020 {built-in method scipy.sparse._sparsetools.csr_matvec}
        1    1.383    1.383    4.629    4.629 /Users/meyernil/Code/torch-fem/src/torchfem/base.py:187(assemble_matrix)
        8    1.366    0.171    1.366    0.171 {built-in method scipy.sparse._sparsetools.bsr_matmat}
        2    0.719    0.359    5.548    2.774 /Users/meyernil/Code/torch-fem/src/torchfem/base.py:452(integrate_material)
      141    

In [2]:
# Benchmark sweep: run cubes.py once per problem size N in its own subprocess
# and append the peak memory measured by memory_profiler to whatever the
# script printed on stdout, forming one table row per N.
print("|  N  |     DOFs |  FWD Time |  BWD Time |   Peak RAM |")
print("| --- | -------- | --------- | --------- | ---------- |")

device = "cpu"
order = "1"
for N in [10, 20, 30, 40, 50, 60, 70, 80]:
    # Launch with the interpreter that runs this kernel rather than whatever
    # "python" resolves to on PATH, so the script sees the same environment.
    command = [
        sys.executable,
        "cubes.py",
        "-N",
        str(N),
        "-device",
        device,
        "-order",
        order,
    ]

    # max_usage=True yields a single peak value; include_children=True makes
    # the measurement cover the spawned subprocess as well.
    mem_usage, result = memory_usage(
        lambda: subprocess.run(command, capture_output=True, text=True),
        retval=True,
        include_children=True,
        max_usage=True,
        interval=0.1,
    )

    # stderr is captured, so a failing run would otherwise go unnoticed and
    # produce a malformed table row. Surface the failure instead.
    if result.returncode != 0:
        raise RuntimeError(
            f"cubes.py failed for N={N} (exit code {result.returncode}):\n"
            f"{result.stderr}"
        )

    print(result.stdout.strip() + f" {mem_usage:8.1f}MB |")

|  N  |     DOFs |  FWD Time |  BWD Time |   Peak RAM |
| --- | -------- | --------- | --------- | ---------- |
|  10 |     3000 |     0.16s |     0.10s |    870.0MB |
|  20 |    24000 |     0.99s |     0.18s |   1268.9MB |
|  30 |    81000 |     3.44s |     0.60s |   1953.6MB |
|  40 |   192000 |     8.78s |     1.19s |   2749.4MB |
|  50 |   375000 |    17.90s |     2.29s |   3920.5MB |
|  60 |   648000 |    32.46s |     3.69s |   5103.4MB |
|  70 |  1029000 |    53.75s |     6.23s |   7243.5MB |
|  80 |  1536000 |    85.68s |     9.61s |  10532.1MB |
