### NumPy code

In [5]:
import numpy as np

# Example: Large matrices (adjust size as needed)
n = 7000  # For very large matrices, ensure you have enough RAM
A = np.random.rand(n, n).astype(np.float32)
B = np.random.rand(n, n).astype(np.float32)

C = np.dot(A, B)  # warm-up and Matrix multiplication

%timeit -r 2 -o np.dot(A, B)

print(f"Result shape: {C.shape}")
print(f"Result type: {C.dtype}")


1.44 s ± 24.3 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)
Result shape: (7000, 7000)
Result type: float32


In [8]:
# 3.4 PyTorch build is not compatible with this GPU, so the benchmark falls back to CPU

import torch
import time

n = 7000

print("CUDA available:", torch.cuda.is_available())

# Safe device selection: start from CPU and switch to CUDA only after a tiny
# probe has actually executed on the GPU. torch.cuda.is_available() can be
# True even when the installed build has no kernels for this device.
device = torch.device("cpu")
use_cuda = False
if torch.cuda.is_available():
    try:
        # Small tensors keep the probe cheap; the benchmark tensors are
        # allocated once, below, on whichever device is finally selected.
        probe = torch.rand((8, 8), device="cuda", dtype=torch.float32)
        probe = probe @ probe
        # CUDA kernel errors may be reported asynchronously, so force them to
        # surface here, inside the try block.
        torch.cuda.synchronize()
        del probe
        device = torch.device("cuda")
        use_cuda = True
    except Exception as e:
        # Deliberately broad: any failure while probing the GPU means the
        # benchmark should run on the CPU instead of aborting.
        print("CUDA error detected, falling back to CPU")
        print(e)

print("Device used:", device)

# Create the tensors on the final device
A_t = torch.rand((n, n), device=device, dtype=torch.float32)
B_t = torch.rand((n, n), device=device, dtype=torch.float32)

# Warm-up
C_t = A_t @ B_t
if use_cuda:
    torch.cuda.synchronize()

# Measure time. perf_counter is the clock intended for timing short
# intervals; the synchronize makes the timed span include the GPU work.
start = time.perf_counter()
C_t = A_t @ B_t
if use_cuda:
    torch.cuda.synchronize()
end = time.perf_counter()

print(f"Tiempo multiplicación matrices con PyTorch: {end - start:.4f} s")
print(f"Result shape: {C_t.shape}")
print(f"Result type: {C_t.dtype}")


CUDA available: True
CUDA error detected, falling back to CPU
CUDA error: no kernel image is available for execution on the device
Search for `cudaErrorNoKernelImageForDevice' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Device used: cpu
Tiempo multiplicación matrices con PyTorch: 1.4828 s
Result shape: torch.Size([7000, 7000])
Result type: torch.float32


In [10]:
# 3.4 For bohr
# This cell has no CPU fallback: it assumes that when CUDA is reported as
# available, the installed PyTorch build can actually run kernels on the GPU.

import torch
import time

# Use the same size as in the NumPy cell
n = 7000

# Device selection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Create tensors
A_t = torch.rand((n, n), device=device, dtype=torch.float32)
B_t = torch.rand((n, n), device=device, dtype=torch.float32)

# Warm-up
C_t = A_t @ B_t
if device.type == "cuda":
    torch.cuda.synchronize()

# Measure time (professor's style: a single timed run). perf_counter is the
# clock intended for timing short intervals; the synchronize makes the timed
# span include the GPU work rather than only the kernel launch.
start = time.perf_counter()
C_t = A_t @ B_t
if device.type == "cuda":
    torch.cuda.synchronize()
end = time.perf_counter()

print(f"Tiempo multiplicación matrices con PyTorch: {end - start:.4f} s")
print(f"Result shape: {C_t.shape}")
print(f"Result type: {C_t.dtype}")


Device: cuda


AcceleratorError: CUDA error: no kernel image is available for execution on the device
Search for `cudaErrorNoKernelImageForDevice' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


639 ms ± 3.12 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)
Result shape: (7000, 7000)
Result type: float32
CUDA available: True
Device used: cuda
Tiempo multiplicación matrices con PyTorch: 0.0627 s
Result shape: torch.Size([7000, 7000])
Result type: torch.float32
Device: cuda
Tiempo multiplicación matrices con PyTorch: 0.0499 s
Result shape: torch.Size([7000, 7000])
Result type: torch.float32
