From eee317bfc09f232f35547e2aa55c62b79c6c3ea9 Mon Sep 17 00:00:00 2001
From: houpc
Date: Fri, 8 Nov 2024 12:17:46 +0800
Subject: [PATCH 1/5] update example

---
 examples/benchmark_vegas.py | 73 ++++++++++++++++++++++++-------------
 src/mc_test.py              |  6 +--
 src/vegas_test.py           | 18 ++++-----
 3 files changed, 59 insertions(+), 38 deletions(-)

diff --git a/examples/benchmark_vegas.py b/examples/benchmark_vegas.py
index d830c5d..292afb0 100644
--- a/examples/benchmark_vegas.py
+++ b/examples/benchmark_vegas.py
@@ -1,7 +1,6 @@
 import vegas
 import numpy as np
 import gvar
-import torch
 
 dim = 4
 nitn = 10
@@ -14,14 +13,11 @@ def f_batch(x):
     for d in range(dim):
         dx2 += (x[:, d] - 0.5) ** 2
     return np.exp(-200 * dx2)
-    # ans = np.empty((x.shape[0], 3), float)
-    # dx2 = 0.0
-    # for d in range(dim):
-    #     dx2 += (x[:, d] - 0.5) ** 2
-    # ans[:, 0] = np.exp(-200 * dx2)
-    # ans[:, 1] = x[:, 0] * ans[:, 0]
-    # ans[:, 2] = x[:, 0] ** 2 * ans[:, 0]
-    # return ans
+
+
+@vegas.lbatchintegrand
+def f1_batch(x):
+    return np.log(x[:, 0]) / np.sqrt(x[:, 0])
 
 
 def smc(f, map, neval, dim):
@@ -41,26 +37,53 @@ def mc(f, neval, dim):
     return (np.average(fy), np.std(fy) / neval**0.5)
 
 
-m = vegas.AdaptiveMap(dim * [[0, 1]], ninc=ninc)
-ny = 20000
-# torch.manual_seed(0)
-# y = torch.rand((ny, dim), dtype=torch.float64).numpy()
-y = np.random.uniform(0.0, 1.0, (ny, dim))  # 1000 random y's
+def vegas_map(f, dim, ny, alpha=0.5, niter=10):
+    m = vegas.AdaptiveMap(dim * [[0, 1]], ninc=ninc)
+    y = np.random.uniform(0.0, 1.0, (ny, dim))  # ny random y's
+
+    x = np.empty(y.shape, float)  # work space
+    jac = np.empty(y.shape[0], float)
+    f2 = np.empty(y.shape[0], float)
+
+    for itn in range(niter):  # niter iterations to adapt
+        m.map(y, x, jac)  # compute x's and jac
+
+        f2 = (jac * f(x)) ** 2
+        m.add_training_data(y, f2)  # adapt
+        m.adapt(alpha=alpha)
+    return m
+
 
-x = np.empty(y.shape, float)  # work space
-jac = np.empty(y.shape[0], float)
-f2 = np.empty(y.shape[0], float)
+alpha = 2.0
+ny = 100000
+m = vegas_map(f1_batch, 1, ny, alpha=alpha)
+
+neval = 1000000
+# with map
+r = smc(f1_batch, m, neval, dim)
+print("   SMC + map:", f"{r[0]} +- {r[1]}")
+
+# without map
+r = mc(f1_batch, neval, dim)
+print("SMC (no map):", f"{r[0]} +- {r[1]}")
+
+# vegas with adaptive stratified sampling
+print("VEGAS using adaptive stratified sampling")
+integ = vegas.Integrator(dim * [[0, 1]])
+training = integ(f1_batch, nitn=10, neval=ny, alpha=alpha)  # adapt grid
+# print(integ.map)
+r = smc(f1_batch, integ.map, neval, dim)
+print("   SMC + map:", f"{r[0]} +- {r[1]}")
 
-for itn in range(10):  # 5 iterations to adapt
-    m.map(y, x, jac)  # compute x's and jac
+result = integ(f1_batch, nitn=10, neval=ny, adapt=False)
+# print(result)
+print("       vegas:", f"{result.mean} +- {result.sdev}")
 
-    f2 = (jac * f_batch(x)) ** 2
-    m.add_training_data(y, f2)  # adapt
-    # if itn == 0:
-    #     print(np.array(memoryview(m.sum_f)))
-    #     print(np.array(memoryview(m.n_f)))
-    m.adapt(alpha=0.5)
+### benchmark exp(-200 * (x1^2 + x2^2 + x3^2 + x4^2)) integral
+print("\nCalculate the integral h(X) in the bounds [0, 1]^4")
+print("h(X) = exp(-200 * (x1^2 + x2^2 + x3^2 + x4^2))")
+m = vegas_map(f_batch, dim, ny)
 
 # with map
 r = smc(f_batch, m, 50_000, dim)
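For reference, the `vegas_map` helper added above follows the documented `vegas.AdaptiveMap` training pattern: map a batch of y's into x-space, feed (jacobian * f)^2 back as training data, and damp each grid update with `adapt(alpha)`. A minimal self-contained sketch of the same pattern (the Gaussian integrand and all parameter values here are illustrative, not part of this patch):

    import numpy as np
    import vegas

    def gauss(x):  # example batch integrand, for illustration only
        return np.exp(-200 * np.sum((x - 0.5) ** 2, axis=1))

    m = vegas.AdaptiveMap(2 * [[0, 1]], ninc=100)  # start from the identity map
    y = np.random.uniform(0.0, 1.0, (10000, 2))    # training points in y-space
    x = np.empty(y.shape, float)                   # work space for mapped points
    jac = np.empty(y.shape[0], float)              # jacobian of the map at each y
    for _ in range(10):
        m.map(y, x, jac)                           # fills x and jac in place
        m.add_training_data(y, (jac * gauss(x)) ** 2)
        m.adapt(alpha=0.5)                         # damped grid refinement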
diff --git a/src/mc_test.py b/src/mc_test.py
index e038226..48b7d24 100644
--- a/src/mc_test.py
+++ b/src/mc_test.py
@@ -29,10 +29,8 @@ def two_integrands(x, f):
     return f.mean(dim=-1)
 
 
-def sharp_integrands(x, f, dim=4):
-    f.zero_()
-    for d in range(dim):
-        f[:, 0] += (x[:, d] - 0.5) ** 2
+def sharp_integrands(x, f):
+    f[:, 0] = torch.sum((x - 0.5) ** 2, dim=-1)
     f[:, 0] *= -200
     f[:, 0].exp_()
     f[:, 1] = f[:, 0] * x[:, 0]

diff --git a/src/vegas_test.py b/src/vegas_test.py
index a1e847e..822ffaf 100644
--- a/src/vegas_test.py
+++ b/src/vegas_test.py
@@ -34,21 +34,22 @@ def func(x, f):
     return f[:, 0]
 
 
+alpha = 2.0
 ninc = 1000
-n_eval = 50000
-n_batch = 10000
+n_eval = 1000000
+n_batch = 20000
 n_therm = 10
 
 print("\nCalculate the integral log(x)/x^0.5 in the bounds [0, 1]")
 
 print("train VEGAS map")
 vegas_map = Vegas([(0, 1)], device=device, ninc=ninc)
-vegas_map.train(20000, func, epoch=10, alpha=0.5)
+vegas_map.train(100000, func, epoch=10, alpha=alpha)
 
 vegas_integrator = MonteCarlo(
     maps=vegas_map,
-    neval=1000000,
-    nbatch=n_batch,
+    neval=n_eval,
+    # nbatch=n_batch,
     device=device,
 )
 res = vegas_integrator(func)
@@ -56,7 +57,7 @@ def func(x, f):
 
 vegasmcmc_integrator = MCMC(
     maps=vegas_map,
-    neval=1000000,
+    neval=n_eval,
     nbatch=n_batch,
     nburnin=n_therm,
     device=device,
@@ -72,14 +73,13 @@ def func(x, f):
 bounds = [(0, 1)] * 4
 vegas_map = Vegas(bounds, device=device, ninc=ninc)
 print("train VEGAS map for h(X)...")
-vegas_map.train(20000, sharp_peak, epoch=10, alpha=0.5)
+vegas_map.train(20000, sharp_peak, epoch=10, alpha=alpha)
 # print(vegas_map.extract_grid())
 
 print("VEGAS Integral results:")
 vegas_integrator = MonteCarlo(
     maps=vegas_map,
-    neval=n_eval,
-    nbatch=n_batch,
+    neval=50000,
     device=device,
 )
 res = vegas_integrator(sharp_integrands, f_dim=3)
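The `sharp_integrands` change in src/mc_test.py trades a per-dimension Python loop for a single vectorized reduction; the two forms agree, as a quick check shows (shapes follow the tests: x is (nbatch, dim), f is (nbatch, 3)):

    import torch

    x = torch.rand(8, 4)
    f_loop = torch.zeros(8, 3)
    for d in range(4):                         # old accumulation, one dim at a time
        f_loop[:, 0] += (x[:, d] - 0.5) ** 2
    f_vec = torch.sum((x - 0.5) ** 2, dim=-1)  # new one-shot reduction
    assert torch.allclose(f_loop[:, 0], f_vec)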
From 9212d5bc12d527c9114de603e5e7082ca501b1f4 Mon Sep 17 00:00:00 2001
From: houpc
Date: Fri, 15 Nov 2024 22:10:44 +0800
Subject: [PATCH 2/5] update statistics in integrators

---
 src/integrators.py | 230 ++++++++++++++++++++++++++-------------------
 1 file changed, 134 insertions(+), 96 deletions(-)

diff --git a/src/integrators.py b/src/integrators.py
index ef00893..18748a9 100644
--- a/src/integrators.py
+++ b/src/integrators.py
@@ -109,6 +109,13 @@ def __init__(
         super().__init__(maps, bounds, q0, neval, nbatch, device, dtype)
 
     def __call__(self, f: Callable, f_dim: int = 1, multigpu=False, **kwargs):
+        if multigpu:
+            rank = dist.get_rank()
+            world_size = dist.get_world_size()
+        else:
+            rank = 0
+            world_size = 1
+
         x, _ = self.sample(self.nbatch)
         fx = torch.empty((self.nbatch, f_dim), dtype=self.dtype, device=self.device)
@@ -116,6 +123,7 @@ def __call__(self, f: Callable, f_dim: int = 1, multigpu=False, **kwargs):
         integ_values = torch.zeros(
             (self.nbatch, f_dim), dtype=self.dtype, device=self.device
         )
+        results = np.array([RAvg() for _ in range(f_dim)])
 
         for _ in range(epoch):
             x, log_detJ = self.sample(self.nbatch)
@@ -123,38 +131,52 @@ def __call__(self, f: Callable, f_dim: int = 1, multigpu=False, **kwargs):
             fx.mul_(log_detJ.exp_().unsqueeze_(1))
             integ_values += fx / epoch
 
-        results = np.array([RAvg() for _ in range(f_dim)])
-        if multigpu:
-            self.multi_gpu_statistic(integ_values, f_dim, results)
+        results = self.statistics(integ_values, results, rank, world_size)
+        if rank == 0:
+            return results
+
+    def statistics(
+        self,
+        values,
+        results,
+        rank=0,
+        world_size=1,
+    ):
+        f_dim = values.shape[1]
+        _mean = values.mean(dim=0)
+        _var = values.var(dim=0) / self.nbatch
+
+        if world_size > 1:
+            # Gather mean and variance statistics to rank 0
+            if rank == 0:
+                gathered_means = [
+                    torch.zeros(f_dim, dtype=self.dtype, device=self.device)
+                    for _ in range(world_size)
+                ]
+                gathered_vars = [
+                    torch.zeros(f_dim, dtype=self.dtype, device=self.device)
+                    for _ in range(world_size)
+                ]
+            dist.gather(_mean, gathered_means if rank == 0 else None, dst=0)
+            dist.gather(_var, gathered_vars if rank == 0 else None, dst=0)
+
+            if rank == 0:
+                for ngpu in range(world_size):
+                    for i in range(f_dim):
+                        results[i].update(
+                            gathered_means[ngpu][i].item(),
+                            gathered_vars[ngpu][i].item(),
+                            self.neval,
+                        )
         else:
             for i in range(f_dim):
-                _mean = integ_values[:, i].mean().item()
-                _var = integ_values[:, i].var().item() / self.nbatch
-                results[i].update(_mean, _var, self.neval)
-        if f_dim == 1:
-            return results[0]
-        else:
-            return results
+                results[i].update(_mean[i].item(), _var[i].item(), self.neval)
 
-    def multi_gpu_statistic(self, values, f_dim, results):
-        _mean = torch.zeros(f_dim, device=self.device, dtype=self.dtype)
-        _total_mean = torch.zeros(f_dim, device=self.device, dtype=self.dtype)
-        _var = torch.zeros(f_dim, device=self.device, dtype=self.dtype)
-        for i in range(f_dim):
-            _total_mean[i] = values[:, i].mean()
-            _mean[i] = _total_mean[i]
-            _var = values[:, i].var() / self.nbatch
-
-        dist.all_reduce(_total_mean, op=dist.ReduceOp.SUM)
-        _total_mean /= dist.get_world_size()
-        _var_between_batch = torch.square(_mean - _total_mean)
-        dist.all_reduce(_var_between_batch, op=dist.ReduceOp.SUM)
-        _var_between_batch /= dist.get_world_size()
-        dist.all_reduce(_var, op=dist.ReduceOp.SUM)
-        _var /= dist.get_world_size()
-        _var = _var + _var_between_batch
-        for i in range(f_dim):
-            results[i].update(_total_mean[i].item(), _var[i].item(), self.neval)
+        if rank == 0:
+            if f_dim == 1:
+                return results[0]
+            else:
+                return results
 
 
 def random_walk(dim, bounds, device, dtype, u, **kwargs):
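The per-rank (mean, variance) pairs funneled into `RAvg.update` above are then combined as a precision-weighted average; a sketch of that convention (assuming gvar/vegas-style `RAvg` semantics — `utils.RAvg` itself is not shown in this patch):

    import numpy as np

    def combine(means, variances):  # weight each estimate by 1/variance
        w = 1.0 / np.asarray(variances)
        return np.sum(w * means) / np.sum(w), 1.0 / np.sum(w)

    print(combine([1.02, 0.98], [0.01, 0.01]))  # -> (1.0, 0.005)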
@@ -205,6 +227,13 @@ def __call__(
         multigpu=False,
         **kwargs,
     ):
+        if multigpu:
+            rank = dist.get_rank()
+            world_size = dist.get_world_size()
+        else:
+            rank = 0
+            world_size = 1
+
         epsilon = 1e-16  # Small value to ensure numerical stability
         epoch = self.neval // self.nbatch
         current_y, current_jac = self.q0.sample(self.nbatch)
@@ -212,11 +241,10 @@ def __call__(
         current_jac += detJ
         current_jac.exp_()
         fx = torch.empty((self.nbatch, f_dim), dtype=self.dtype, device=self.device)
-        fx_weight = torch.empty(self.nbatch, dtype=self.dtype, device=self.device)
-        fx_weight[:] = f(current_x, fx)
-        fx_weight.abs_()
-        current_weight = mix_rate / current_jac + (1 - mix_rate) * fx_weight
+        current_weight = (
+            mix_rate / current_jac + (1 - mix_rate) * f(current_x, fx).abs_()
+        )
         current_weight.masked_fill_(current_weight < epsilon, epsilon)
 
         n_meas = epoch // meas_freq
@@ -228,9 +256,7 @@ def one_step(current_y, current_x, current_weight, current_jac):
             proposed_x, new_jac = self.maps.forward(proposed_y)
             new_jac.exp_()
 
-            fx_weight[:] = f(proposed_x, fx)
-            fx_weight.abs_()
-            new_weight = mix_rate / new_jac + (1 - mix_rate) * fx_weight
+            new_weight = mix_rate / new_jac + (1 - mix_rate) * f(proposed_x, fx).abs_()
             new_weight.masked_fill_(new_weight < epsilon, epsilon)
 
             acceptance_probs = new_weight / current_weight * new_jac / current_jac
@@ -244,79 +270,91 @@ def one_step(current_y, current_x, current_weight, current_jac):
             current_x = torch.where(accept.unsqueeze(1), proposed_x, current_x)
             current_weight = torch.where(accept, new_weight, current_weight)
             current_jac = torch.where(accept, new_jac, current_jac)
-            return current_y, current_x, current_weight, current_jac
 
         for _ in range(self.nburnin):
-            current_y, current_x, current_weight, current_jac = one_step(
-                current_y, current_x, current_weight, current_jac
-            )
+            one_step(current_y, current_x, current_weight, current_jac)
 
         values = torch.zeros((self.nbatch, f_dim), dtype=self.dtype, device=self.device)
         refvalues = torch.zeros(self.nbatch, dtype=self.dtype, device=self.device)
+        results_unnorm = np.array([RAvg() for _ in range(f_dim)])
+        results_ref = RAvg()
 
         for _ in range(n_meas):
            for _ in range(meas_freq):
-                current_y, current_x, current_weight, current_jac = one_step(
-                    current_y, current_x, current_weight, current_jac
-                )
+                one_step(current_y, current_x, current_weight, current_jac)
             f(current_x, fx)
             fx.div_(current_weight.unsqueeze(1))
             values += fx / n_meas
             refvalues += 1 / (current_jac * current_weight) / n_meas
 
-        results = np.array([RAvg() for _ in range(f_dim)])
-        results_ref = RAvg()
-        if multigpu:
-            self.multi_gpu_statistic(values, refvalues, results, results_ref, f_dim)
-        else:
-            mean_ref = refvalues.mean().item()
-            var_ref = refvalues.var().item() / self.nbatch
-            results_ref.update(mean_ref, var_ref, self.neval)
-            for i in range(f_dim):
-                _mean = values[:, i].mean().item()
-                _var = values[:, i].var().item() / self.nbatch
-                results[i].update(_mean, _var, self.neval)
-
-        if f_dim == 1:
-            res = results[0] / results_ref * self._rangebounds.prod()
-            result = RAvg(itn_results=[res], sum_neval=self.neval)
-            return result
-        else:
-            return results / results_ref * self._rangebounds.prod().item()
-
-    def multi_gpu_statistic(self, values, refvalues, results, results_ref, f_dim):
-        # collect multigpu statistics for values
-        _mean = torch.zeros(f_dim, device=self.device, dtype=self.dtype)
-        _total_mean = torch.zeros(f_dim, device=self.device, dtype=self.dtype)
-        _var = torch.zeros(f_dim, device=self.device, dtype=self.dtype)
-        for i in range(f_dim):
-            _total_mean[i] = values[:, i].mean()
-            _mean[i] = _total_mean[i]
-            _var = values[:, i].var() / self.nbatch
-
-        dist.all_reduce(_total_mean, op=dist.ReduceOp.SUM)
-        _total_mean /= dist.get_world_size()
-        _var_between_batch = torch.square(_mean - _total_mean)
-        dist.all_reduce(_var_between_batch, op=dist.ReduceOp.SUM)
-        _var_between_batch /= dist.get_world_size()
-        dist.all_reduce(_var, op=dist.ReduceOp.SUM)
-        _var /= dist.get_world_size()
-        _var = _var + _var_between_batch
-        for i in range(f_dim):
-            results[i].update(_total_mean[i].item(), _var[i].item(), self.neval)
-
-        # collect multigpu statistics for refvalues
+        results = self.statistics(
+            values, refvalues, results_unnorm, results_ref, rank, world_size
+        )
+        if rank == 0:
+            return results
+
+    def statistics(
+        self,
+        values,
+        refvalues,
+        results_unnorm,
+        results_ref,
+        rank=0,
+        world_size=1,
+    ):
+        f_dim = values.shape[1]
         _mean_ref = refvalues.mean()
-        _total_mean_ref = _mean_ref.clone().detach()
         _var_ref = refvalues.var() / self.nbatch
-        dist.all_reduce(_total_mean_ref, op=dist.ReduceOp.SUM)
-        _total_mean_ref /= dist.get_world_size()
-        _var_ref_between_batch = torch.square(_mean_ref - _total_mean_ref)
-        dist.all_reduce(_var_ref_between_batch, op=dist.ReduceOp.SUM)
-        _var_ref_between_batch /= dist.get_world_size()
-        dist.all_reduce(_var_ref, op=dist.ReduceOp.SUM)
-        _var_ref /= dist.get_world_size()
-        _var_ref = _var_ref + _var_ref_between_batch
-        results_ref.update(_total_mean_ref.item(), _var_ref.item(), self.neval)
-        # return results, results_ref
+        _mean = values.mean(dim=0)
+        _var = values.var(dim=0) / self.nbatch
+
+        if world_size > 1:
+            # Gather mean and variance statistics to rank 0
+            if rank == 0:
+                gathered_means = [
+                    torch.zeros(f_dim, dtype=self.dtype, device=self.device)
+                    for _ in range(world_size)
+                ]
+                gathered_vars = [
+                    torch.zeros(f_dim, dtype=self.dtype, device=self.device)
+                    for _ in range(world_size)
+                ]
+                gathered_means_ref = [
+                    torch.zeros(1, dtype=self.dtype, device=self.device)
+                    for _ in range(world_size)
+                ]
+                gathered_vars_ref = [
+                    torch.zeros(1, dtype=self.dtype, device=self.device)
+                    for _ in range(world_size)
+                ]
+            dist.gather(_mean, gathered_means if rank == 0 else None, dst=0)
+            dist.gather(_var, gathered_vars if rank == 0 else None, dst=0)
+            dist.gather(_mean_ref, gathered_means_ref if rank == 0 else None, dst=0)
+            dist.gather(_var_ref, gathered_vars_ref if rank == 0 else None, dst=0)
+
+            if rank == 0:
+                for ngpu in range(world_size):
+                    for i in range(f_dim):
+                        results_unnorm[i].update(
+                            gathered_means[ngpu][i].item(),
+                            gathered_vars[ngpu][i].item(),
+                            self.neval,
+                        )
+                    results_ref.update(
+                        gathered_means_ref[ngpu].item(),
+                        gathered_vars_ref[ngpu].item(),
+                        self.neval,
+                    )
+        else:
+            for i in range(f_dim):
+                results_unnorm[i].update(_mean[i].item(), _var[i].item(), self.neval)
+            results_ref.update(_mean_ref.item(), _var_ref.item(), self.neval)
+
+        if rank == 0:
+            if f_dim == 1:
+                res = results_unnorm[0] / results_ref * self._rangebounds.prod()
+                result = RAvg(itn_results=[res], sum_neval=self.neval)
+                return result
+            else:
+                return results_unnorm / results_ref * self._rangebounds.prod().item()
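The `results_unnorm` / `results_ref` pair implements a self-normalized estimator: the chain average of f/w divided by the chain average of the reference weight cancels the unknown normalization of w. A toy version with an explicit density in place of the Markov chain (illustrative only, not the patch's API; V is the volume of the integration region):

    import numpy as np

    rng = np.random.default_rng(0)
    y = np.sqrt(rng.uniform(size=1_000_000))  # y ~ w(y) = 2y on [0, 1]
    f, w, V = y ** 2, 2 * y, 1.0
    I = (f / w).mean() / (1 / w).mean() * V   # w cancels in the ratio
    print(I)  # ~ 1/3, the exact integral of y^2 on [0, 1]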
From 34bc8181e951992c43c680e3e9001b2592a6b3a3 Mon Sep 17 00:00:00 2001
From: houpc
Date: Fri, 15 Nov 2024 22:14:37 +0800
Subject: [PATCH 3/5] set constants in base.py

---
 src/base.py        | 5 +++++
 src/integrators.py | 8 +++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/base.py b/src/base.py
index 687e8df..1d70aa1 100644
--- a/src/base.py
+++ b/src/base.py
@@ -1,6 +1,11 @@
 import torch
 from torch import nn
 import numpy as np
+import sys
+
+MINVAL = 10 ** (sys.float_info.min_10_exp + 50)
+MAXVAL = 10 ** (sys.float_info.max_10_exp - 50)
+EPSILON = 1e-16  # Small value to ensure numerical stability
 
 
 class BaseDistribution(nn.Module):

diff --git a/src/integrators.py b/src/integrators.py
index 18748a9..d195133 100644
--- a/src/integrators.py
+++ b/src/integrators.py
@@ -2,8 +2,7 @@
 import torch
 from utils import RAvg
 from maps import Linear
-from base import Uniform
-import gvar
+from base import Uniform, EPSILON
 import numpy as np
 import os
@@ -233,17 +232,16 @@ def __call__(
             rank = 0
             world_size = 1
 
-        epsilon = 1e-16  # Small value to ensure numerical stability
         epoch = self.neval // self.nbatch
         current_y, current_jac = self.q0.sample(self.nbatch)
         current_x, detJ = self.maps.forward(current_y)
         current_jac += detJ
         current_jac.exp_()
         fx = torch.empty((self.nbatch, f_dim), dtype=self.dtype, device=self.device)
 
         current_weight = (
             mix_rate / current_jac + (1 - mix_rate) * f(current_x, fx).abs_()
         )
-        current_weight.masked_fill_(current_weight < epsilon, epsilon)
+        current_weight.masked_fill_(current_weight < EPSILON, EPSILON)
 
         n_meas = epoch // meas_freq
@@ -257,7 +255,7 @@ def one_step(current_y, current_x, current_weight, current_jac):
             proposed_x, new_jac = self.maps.forward(proposed_y)
             new_jac.exp_()
 
             new_weight = mix_rate / new_jac + (1 - mix_rate) * f(proposed_x, fx).abs_()
-            new_weight.masked_fill_(new_weight < epsilon, epsilon)
+            new_weight.masked_fill_(new_weight < EPSILON, EPSILON)
 
             acceptance_probs = new_weight / current_weight * new_jac / current_jac
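With float64, `sys.float_info` reports min_10_exp = -307 and max_10_exp = 308, so the new guard constants evaluate as below (quick check; values assume IEEE-754 doubles):

    import sys

    print(10 ** (sys.float_info.min_10_exp + 50))  # MINVAL = 1e-257
    print(10 ** (sys.float_info.max_10_exp - 50))  # MAXVAL = 1e+258

Keeping both 50 decades away from the underflow/overflow thresholds leaves headroom for intermediate products such as jacobian * weight.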
From 1ffacd4c0c14e55441a787d92ddc15bcdb012516 Mon Sep 17 00:00:00 2001
From: houpc
Date: Sat, 16 Nov 2024 12:16:40 +0800
Subject: [PATCH 4/5] bugfix in mcmc

---
 src/integrators.py      |  24 ++++----
 src/mc_multigpu_test.py | 120 +++++++++++++++++++++-------------------
 2 files changed, 74 insertions(+), 70 deletions(-)

diff --git a/src/integrators.py b/src/integrators.py
index d195133..03c19ea 100644
--- a/src/integrators.py
+++ b/src/integrators.py
@@ -212,6 +212,8 @@ def __init__(
     ):
         super().__init__(maps, bounds, q0, neval, nbatch, device, dtype)
         self.nburnin = nburnin
+
+        # If no transformation maps are provided, use a linear map as default
         if maps is None:
             self.maps = Linear([(0, 1)] * self.dim, device=device)
             self._rangebounds = self.bounds[:, 1] - self.bounds[:, 0]
@@ -310,21 +312,13 @@ def statistics(
         if world_size > 1:
             # Gather mean and variance statistics to rank 0
             if rank == 0:
-                gathered_means = [
-                    torch.zeros(f_dim, dtype=self.dtype, device=self.device)
-                    for _ in range(world_size)
-                ]
-                gathered_vars = [
-                    torch.zeros(f_dim, dtype=self.dtype, device=self.device)
-                    for _ in range(world_size)
-                ]
+                gathered_means = [torch.zeros_like(_mean) for _ in range(world_size)]
+                gathered_vars = [torch.zeros_like(_var) for _ in range(world_size)]
                 gathered_means_ref = [
-                    torch.zeros(1, dtype=self.dtype, device=self.device)
-                    for _ in range(world_size)
+                    torch.zeros_like(_mean_ref) for _ in range(world_size)
                 ]
                 gathered_vars_ref = [
-                    torch.zeros(1, dtype=self.dtype, device=self.device)
-                    for _ in range(world_size)
+                    torch.zeros_like(_var_ref) for _ in range(world_size)
                 ]
             dist.gather(_mean, gathered_means if rank == 0 else None, dst=0)
             dist.gather(_var, gathered_vars if rank == 0 else None, dst=0)
@@ -351,8 +345,10 @@ def statistics(
 
         if rank == 0:
             if f_dim == 1:
-                res = results_unnorm[0] / results_ref * self._rangebounds.prod()
+                # res = results_unnorm[0] / results_ref * self._rangebounds.prod()
+                res = results_unnorm[0] / results_ref
                 result = RAvg(itn_results=[res], sum_neval=self.neval)
                 return result
             else:
-                return results_unnorm / results_ref * self._rangebounds.prod().item()
+                # return results_unnorm / results_ref * self._rangebounds.prod().item()
+                return results_unnorm / results_ref

diff --git a/src/mc_multigpu_test.py b/src/mc_multigpu_test.py
index 94b9aae..ec8f183 100644
--- a/src/mc_multigpu_test.py
+++ b/src/mc_multigpu_test.py
@@ -119,27 +119,31 @@ def sharp_integrands(x, f, dim=4):
 print("\nCalculate the integral [f(x1, x2), g(x1, x2)/2] in the bounds [-1, 1]^2")
 # Two integrands
 res = mc_integrator(two_integrands, f_dim=2, multigpu=True)
-print("Plain MC Integral results:")
-print("  Integral 1: ", res[0])
-print("  Integral 2: ", res[1])
+if res is not None:
+    print("Plain MC Integral results:")
+    print("  Integral 1: ", res[0])
+    print("  Integral 2: ", res[1])
 
 res = mcmc_integrator(two_integrands, f_dim=2, mix_rate=0.5, multigpu=True)
-print("MCMC Integral results:")
-print(f"  Integral 1: ", res[0])
-print(f"  Integral 2: ", res[1])
+if res is not None:
+    print("MCMC Integral results:")
+    print(f"  Integral 1: ", res[0])
+    print(f"  Integral 2: ", res[1])
 
 # print("VEAGS map is trained for g(x1, x2)")
 vegas_map.make_uniform()
 vegas_map.train(20000, two_integrands, f_dim=2, epoch=10, alpha=0.5, multigpu=True)
 res = vegas_integrator(two_integrands, f_dim=2, multigpu=True)
-print("VEGAS Integral results:")
-print("  Integral 1: ", res[0])
-print("  Integral 2: ", res[1])
+if res is not None:
+    print("VEGAS Integral results:")
+    print("  Integral 1: ", res[0])
+    print("  Integral 2: ", res[1])
 
 res = vegasmcmc_integrator(two_integrands, f_dim=2, mix_rate=0.5, multigpu=True)
-print("VEGAS-MCMC Integral results:")
-print("  Integral 1: ", res[0])
-print("  Integral 2: ", res[1])
+if res is not None:
+    print("VEGAS-MCMC Integral results:")
+    print("  Integral 1: ", res[0])
+    print("  Integral 2: ", res[1])
 
 print("\nCalculate the integral [h(X), x1 * h(X), x1^2 * h(X)] in the bounds [0, 1]^4")
 print("h(X) = exp(-200 * (x1^2 + x2^2 + x3^2 + x4^2))")
@@ -155,37 +159,38 @@ def sharp_integrands(x, f, dim=4):
 mcmc_integrator = MCMC(
     bounds=bounds, neval=n_eval, nbatch=n_batch, nburnin=n_therm, device=device
 )
-print("Plain MC Integral results:")
 res = mc_integrator(sharp_integrands, f_dim=3, multigpu=True)
-print(
-    "  I[0] =",
-    res[0],
-    "  I[1] =",
-    res[1],
-    "  I[2] =",
-    res[2],
-    "  I[1]/I[0] =",
-    res[1] / res[0],
-)
-print("MCMC Integral results:")
+if res is not None:
+    print("Plain MC Integral results:")
+    print(
+        "  I[0] =",
+        res[0],
+        "  I[1] =",
+        res[1],
+        "  I[2] =",
+        res[2],
+        "  I[1]/I[0] =",
+        res[1] / res[0],
+    )
 res = mcmc_integrator(sharp_integrands, f_dim=3, mix_rate=0.5, multigpu=True)
-print(
-    "  I[0] =",
-    res[0],
-    "  I[1] =",
-    res[1],
-    "  I[2] =",
-    res[2],
-    "  I[1]/I[0] =",
-    res[1] / res[0],
-)
+if res is not None:
+    print("MCMC Integral results:")
+    print(
+        "  I[0] =",
+        res[0],
+        "  I[1] =",
+        res[1],
+        "  I[2] =",
+        res[2],
+        "  I[1]/I[0] =",
+        res[1] / res[0],
+    )
 
 vegas_map = Vegas(bounds, device=device)
 print("train VEGAS map for h(X)...")
 # vegas_map.train(20000, sharp_peak, epoch=10, alpha=0.5)
 vegas_map.train(20000, sharp_integrands, f_dim=3, epoch=10, alpha=0.5, multigpu=True)
 
-print("VEGAS Integral results:")
 vegas_integrator = MonteCarlo(
     maps=vegas_map,
     neval=n_eval,
@@ -194,18 +199,19 @@ def sharp_integrands(x, f, dim=4):
     device=device,
 )
 res = vegas_integrator(sharp_integrands, f_dim=3, multigpu=True)
-print(
-    "  I[0] =",
-    res[0],
-    "  I[1] =",
-    res[1],
-    "  I[2] =",
-    res[2],
-    "  I[1]/I[0] =",
-    res[1] / res[0],
-)
+if res is not None:
+    print("VEGAS Integral results:")
+    print(
+        "  I[0] =",
+        res[0],
+        "  I[1] =",
+        res[1],
+        "  I[2] =",
+        res[2],
+        "  I[1]/I[0] =",
+        res[1] / res[0],
+    )
 
-print("VEGAS-MCMC Integral results:")
 vegasmcmc_integrator = MCMC(
     maps=vegas_map,
     neval=n_eval,
@@ -214,13 +220,15 @@ def sharp_integrands(x, f, dim=4):
     device=device,
 )
 res = vegasmcmc_integrator(sharp_integrands, f_dim=3, mix_rate=0.5, multigpu=True)
-print(
-    "  I[0] =",
-    res[0],
-    "  I[1] =",
-    res[1],
-    "  I[2] =",
-    res[2],
-    "  I[1]/I[0] =",
-    res[1] / res[0],
-)
+if res is not None:
+    print("VEGAS-MCMC Integral results:")
+    print(
+        "  I[0] =",
+        res[0],
+        "  I[1] =",
+        res[1],
+        "  I[2] =",
+        res[2],
+        "  I[1]/I[0] =",
+        res[1] / res[0],
+    )
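The `torch.zeros_like` buffers above feed `torch.distributed.gather`, which only accepts a `gather_list` on the destination rank; every other rank must pass None, hence the `... if rank == 0 else None` arguments. The pattern in isolation (a sketch; assumes an already-initialized process group):

    import torch
    import torch.distributed as dist

    def gather_to_root(t):
        rank, ws = dist.get_rank(), dist.get_world_size()
        buf = [torch.zeros_like(t) for _ in range(ws)] if rank == 0 else None
        dist.gather(t, gather_list=buf, dst=0)
        return buf  # per-rank tensors on rank 0, None elsewhere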
From ad5364bea0b612d071d4d70dd069cef7ee3775f4 Mon Sep 17 00:00:00 2001
From: houpc
Date: Sat, 16 Nov 2024 13:54:47 +0800
Subject: [PATCH 5/5] bugfix in one_step

---
 src/integrators.py      | 15 +++++++--------
 src/mc_multigpu_test.py | 28 ++++++++++++++++----------
 2 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/src/integrators.py b/src/integrators.py
index 03c19ea..813306d 100644
--- a/src/integrators.py
+++ b/src/integrators.py
@@ -266,10 +266,11 @@ def one_step(current_y, current_x, current_weight, current_jac):
                 <= acceptance_probs
             )
 
-            current_y = torch.where(accept.unsqueeze(1), proposed_y, current_y)
-            current_x = torch.where(accept.unsqueeze(1), proposed_x, current_x)
-            current_weight = torch.where(accept, new_weight, current_weight)
-            current_jac = torch.where(accept, new_jac, current_jac)
+            accept_expanded = accept.unsqueeze(1)
+            current_y.mul_(~accept_expanded).add_(proposed_y * accept_expanded)
+            current_x.mul_(~accept_expanded).add_(proposed_x * accept_expanded)
+            current_weight.mul_(~accept).add_(new_weight * accept)
+            current_jac.mul_(~accept).add_(new_jac * accept)
 
         for _ in range(self.nburnin):
             one_step(current_y, current_x, current_weight, current_jac)
@@ -345,10 +346,8 @@ def statistics(
         if rank == 0:
             if f_dim == 1:
-                # res = results_unnorm[0] / results_ref * self._rangebounds.prod()
-                res = results_unnorm[0] / results_ref
+                res = results_unnorm[0] / results_ref * self._rangebounds.prod()
                 result = RAvg(itn_results=[res], sum_neval=self.neval)
                 return result
             else:
-                # return results_unnorm / results_ref * self._rangebounds.prod().item()
-                return results_unnorm / results_ref
+                return results_unnorm / results_ref * self._rangebounds.prod().item()
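The switch from `torch.where` to masked in-place updates is the actual bugfix: once `one_step` stopped returning its arguments (PATCH 2/5), rebinding the parameter names inside the closure no longer reached the caller's tensors, so the chain never moved. In-place ops mutate the shared storage instead; a minimal demonstration of the difference:

    import torch

    def flip_where(t):
        t = torch.where(t > 0, -t, t)  # new tensor; rebinds the local name only

    def flip_inplace(t):
        t.mul_(-1)                     # mutates the caller's tensor

    a = torch.ones(3)
    flip_where(a)
    print(a)  # tensor([1., 1., 1.]) -- caller unchanged
    flip_inplace(a)
    print(a)  # tensor([-1., -1., -1.])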
diff --git a/src/mc_multigpu_test.py b/src/mc_multigpu_test.py
index ec8f183..89a0bc7 100644
--- a/src/mc_multigpu_test.py
+++ b/src/mc_multigpu_test.py
@@ -68,10 +68,12 @@ def sharp_integrands(x, f, dim=4):
 print("Calculate the area of the unit circle f(x1, x2) in the bounds [-1, 1]^2...")
 
 res = mc_integrator(unit_circle_integrand, multigpu=True)
-print("Plain MC Integral results: ", res)
+if res is not None:
+    print("Plain MC Integral results: ", res)
 
 res = mcmc_integrator(unit_circle_integrand, mix_rate=0.5, multigpu=True)
-print("MCMC Integral results: ", res)
+if res is not None:
+    print("MCMC Integral results: ", res)
 
 vegas_map.train(20000, unit_circle_integrand, alpha=0.5, multigpu=True)
 vegas_integrator = MonteCarlo(
@@ -82,7 +84,8 @@ def sharp_integrands(x, f, dim=4):
     device=device,
 )
 res = vegas_integrator(unit_circle_integrand, multigpu=True)
-print("VEGAS Integral results: ", res)
+if res is not None:
+    print("VEGAS Integral results: ", res)
 
 vegasmcmc_integrator = MCMC(
     maps=vegas_map,
@@ -92,7 +95,8 @@ def sharp_integrands(x, f, dim=4):
     device=device,
 )
 res = vegasmcmc_integrator(unit_circle_integrand, mix_rate=0.5, multigpu=True)
-print("VEGAS-MCMC Integral results: ", res, "\n")
+if res is not None:
+    print("VEGAS-MCMC Integral results: ", res, "\n")
 
 
 print(
@@ -100,20 +104,24 @@ def sharp_integrands(x, f, dim=4):
 )
 
 res = mc_integrator(half_sphere_integrand, multigpu=True)
-print("Plain MC Integral results: ", res)
+if res is not None:
+    print("Plain MC Integral results: ", res)
 
 res = mcmc_integrator(half_sphere_integrand, mix_rate=0.5, multigpu=True)
-print("MCMC Integral results:", res)
+if res is not None:
+    print("MCMC Integral results:", res)
 
 vegas_map.make_uniform()
 # train the vegas map
 vegas_map.train(20000, half_sphere_integrand, epoch=10, alpha=0.5, multigpu=True)
 
 res = vegas_integrator(half_sphere_integrand, multigpu=True)
-print("VEGAS Integral results: ", res)
+if res is not None:
+    print("VEGAS Integral results: ", res)
 
 res = vegasmcmc_integrator(half_sphere_integrand, mix_rate=0.5, multigpu=True)
-print("VEGAS-MCMC Integral results: ", res)
+if res is not None:
+    print("VEGAS-MCMC Integral results: ", res)
 
 
 print("\nCalculate the integral [f(x1, x2), g(x1, x2)/2] in the bounds [-1, 1]^2")
@@ -127,8 +135,8 @@ def sharp_integrands(x, f, dim=4):
 res = mcmc_integrator(two_integrands, f_dim=2, mix_rate=0.5, multigpu=True)
 if res is not None:
     print("MCMC Integral results:")
-    print(f"  Integral 1: ", res[0])
-    print(f"  Integral 2: ", res[1])
+    print("  Integral 1: ", res[0])
+    print("  Integral 2: ", res[1])
 
 # print("VEAGS map is trained for g(x1, x2)")
 vegas_map.make_uniform()
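With the rank-0 guards in place, every result prints once per job rather than once per process. A typical launch for the multi-GPU test (assuming the script initializes the default process group itself, which these patches do not show):

    torchrun --nproc_per_node=4 src/mc_multigpu_test.py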