diff --git a/python/misc_pymod.cc b/python/misc_pymod.cc index 243287f7..52a7ef85 100644 --- a/python/misc_pymod.cc +++ b/python/misc_pymod.cc @@ -797,7 +797,6 @@ py::array Py_get_deflected_angles2(const py::array &theta_, if (calc_rotation) MR_assert(e_r.x > 0, "for the poles fix alpha=0 case in calc_rotation first "); double dphi = 2*pi/nphi(iring); - double sin_aoa, twohav_aod, cos_a; for (size_t iphi=0; iphi0); } + +size_t get_nalm(size_t spin, SHT_mode mode) + { return (spin==0) ? 1 : ((mode==STANDARD) ? 2 : 1); } + py::array Py_GL_weights(size_t nlat, size_t nlon) { auto res = make_Pyarr({nlat}); @@ -185,8 +199,9 @@ size_t min_mapdim(const cmav &nphi, const cmav &ringstart, template py::array Py2_alm2leg(const py::array &alm_, size_t spin, size_t lmax, const py::object &mval_, const py::object &mstart_, ptrdiff_t lstride, const py::array &theta_, size_t nthreads, - py::object &leg__) + py::object &leg__, const string &mode_) { + auto mode = get_mode(mode_); auto alm = to_cmav,2>(alm_); auto theta = to_cmav(theta_); vmav mval, mstart; @@ -194,91 +209,64 @@ template py::array Py2_alm2leg(const py::array &alm_, size_t spin, MR_assert(alm.shape(1)>=min_almdim(lmax, mval, mstart, lstride), "bad a_lm array size"); auto leg_ = get_optional_Pyarr>(leg__, - {alm.shape(0),theta.shape(0),mval.shape(0)}); + {get_nmaps(spin,mode),theta.shape(0),mval.shape(0)}); auto leg = to_vmav,3>(leg_); { py::gil_scoped_release release; - alm2leg(alm, leg, spin, lmax, mval, mstart, lstride, theta, nthreads, ALM2MAP); + alm2leg(alm, leg, spin, lmax, mval, mstart, lstride, theta, nthreads, mode); } return leg_; } py::array Py_alm2leg(const py::array &alm, size_t lmax, const py::array &theta, size_t spin, const py::object &mval, const py::object &mstart, - ptrdiff_t lstride, size_t nthreads, py::object &leg) + ptrdiff_t lstride, size_t nthreads, py::object &leg, const string &mode) { if (isPyarr>(alm)) return Py2_alm2leg(alm, spin, lmax, mval, mstart, lstride, theta, - nthreads, leg); + nthreads, leg, mode); if (isPyarr>(alm)) return Py2_alm2leg(alm, spin, lmax, mval, mstart, lstride, theta, - nthreads, leg); + nthreads, leg, mode); MR_fail("type matching failed: 'alm' has neither type 'c8' nor 'c16'"); } -template py::array Py2_alm2leg_deriv1(const py::array &alm_, - size_t lmax, const py::object &mval_, const py::object &mstart_, - ptrdiff_t lstride, const py::array &theta_, size_t nthreads, - py::object &leg__) - { - auto alm = to_cmav,2>(alm_); - auto theta = to_cmav(theta_); - vmav mval, mstart; - getmstuff(lmax, mval_, mstart_, mval, mstart); - MR_assert(alm.shape(0)==1, "need exactly 1 a_lm component"); - MR_assert(alm.shape(1)>=min_almdim(lmax, mval, mstart, lstride), - "bad a_lm array size"); - auto leg_ = get_optional_Pyarr>(leg__, - {2,theta.shape(0),mval.shape(0)}); - auto leg = to_vmav,3>(leg_); - { - py::gil_scoped_release release; - alm2leg(alm, leg, 0, lmax, mval, mstart, lstride, theta, nthreads, - ALM2MAP_DERIV1); - } - return leg_; - } py::array Py_alm2leg_deriv1(const py::array &alm, size_t lmax, const py::array &theta, const py::object &mval, const py::object &mstart, ptrdiff_t lstride, size_t nthreads, py::object &leg) { - if (isPyarr>(alm)) - return Py2_alm2leg_deriv1(alm, lmax, mval, mstart, lstride, theta, - nthreads, leg); - if (isPyarr>(alm)) - return Py2_alm2leg_deriv1(alm, lmax, mval, mstart, lstride, theta, - nthreads, leg); - MR_fail("type matching failed: 'alm' has neither type 'c8' nor 'c16'"); + return Py_alm2leg(alm, lmax, theta, 1, mval, mstart, lstride, nthreads, leg, 
"DERIV1"); } template py::array Py2_leg2alm(const py::array &leg_, const py::array &theta_, size_t spin, size_t lmax, const py::object &mval_, const py::object &mstart_, ptrdiff_t lstride, size_t nthreads, - py::object &alm__) + py::object &alm__,const string &mode_) { + auto mode = get_mode(mode_); auto leg = to_cmav,3>(leg_); auto theta = to_cmav(theta_); MR_assert(leg.shape(1)==theta.shape(0), "bad leg array size"); vmav mval, mstart; getmstuff(lmax, mval_, mstart_, mval, mstart); auto alm_ = get_optional_Pyarr_minshape>(alm__, - {leg.shape(0),min_almdim(lmax, mval, mstart, lstride)}); + {get_nalm(spin,mode),min_almdim(lmax, mval, mstart, lstride)}); auto alm = to_vmav,2>(alm_); MR_assert(alm.shape(0)==leg.shape(0), "bad number of components in a_lm array"); { py::gil_scoped_release release; - leg2alm(alm, leg, spin, lmax, mval, mstart, lstride, theta, nthreads); + leg2alm(alm, leg, spin, lmax, mval, mstart, lstride, theta, nthreads, mode); } return alm_; } py::array Py_leg2alm(const py::array &leg, size_t lmax, const py::array &theta, size_t spin, const py::object &mval, const py::object &mstart, - ptrdiff_t lstride, size_t nthreads, py::object &alm) + ptrdiff_t lstride, size_t nthreads, py::object &alm, const string &mode) { if (isPyarr>(leg)) return Py2_leg2alm(leg, theta, spin, lmax, mval, mstart, lstride, - nthreads, alm); + nthreads, alm, mode); if (isPyarr>(leg)) return Py2_leg2alm(leg, theta, spin, lmax, mval, mstart, lstride, - nthreads, alm); + nthreads, alm, mode); MR_fail("type matching failed: 'leg' has neither type 'c8' nor 'c16'"); } template py::array Py2_map2leg(const py::array &map_, @@ -352,8 +340,10 @@ template py::array Py2_synthesis(const py::array &alm_, const py::array &theta_, const py::array &nphi_, const py::array &phi0_, const py::array &ringstart_, - ptrdiff_t pixstride, size_t nthreads, const py::object &mmax_) + ptrdiff_t pixstride, size_t nthreads, const py::object &mmax_, + const string &mode_) { + auto mode = get_mode(mode_); auto mstart = get_mstart(lmax, mmax_, mstart_); auto theta = to_cmav(theta_); auto phi0 = to_cmav(phi0_); @@ -364,10 +354,10 @@ template py::array Py2_synthesis(const py::array &alm_, vector mapshp(alm_.ndim()); for(size_t i=0; i(map__, mapshp); auto map = to_vmav_with_optional_leading_dimensions(map_); MR_assert(map.shape(0)==alm.shape(0), "bad number of components in map array"); - MR_assert(map.shape(1)==alm.shape(1), "bad number of components in map array"); size_t nthreads_outer=1; if (alm.shape(0)>nthreads) // parallelize over entire transforms { nthreads_outer=nthreads; nthreads=1; } @@ -381,7 +371,7 @@ template py::array Py2_synthesis(const py::array &alm_, auto subalm = subarray<2>(alm, {{itrans},{},{}}); auto submap = subarray<2>(map, {{itrans},{},{}}); synthesis(subalm, submap, spin, lmax, mstart, lstride, theta, nphi, - phi0, ringstart, pixstride, nthreads, ALM2MAP); + phi0, ringstart, pixstride, nthreads, mode); } }); } @@ -392,71 +382,24 @@ py::array Py_synthesis(const py::array &alm, const py::array &theta, const py::array &nphi, const py::array &phi0, const py::array &ringstart, size_t spin, ptrdiff_t lstride, ptrdiff_t pixstride, size_t nthreads, py::object &map, - const py::object &mmax_) + const py::object &mmax_, const string &mode) { if (isPyarr>(alm)) return Py2_synthesis(alm, map, spin, lmax, mstart, lstride, theta, - nphi, phi0, ringstart, pixstride, nthreads, mmax_); + nphi, phi0, ringstart, pixstride, nthreads, mmax_, mode); else if (isPyarr>(alm)) return Py2_synthesis(alm, map, spin, lmax, mstart, 
lstride, theta, - nphi, phi0, ringstart, pixstride, nthreads, mmax_); + nphi, phi0, ringstart, pixstride, nthreads, mmax_, mode); MR_fail("type matching failed: 'alm' has neither type 'c8' nor 'c16'"); } -template py::array Py2_synthesis_deriv1(const py::array &alm_, - py::object &map__, size_t lmax, - const py::object &mstart_, ptrdiff_t lstride, - const py::array &theta_, - const py::array &nphi_, - const py::array &phi0_, const py::array &ringstart_, - ptrdiff_t pixstride, size_t nthreads, const py::object &mmax_) - { - auto mstart = get_mstart(lmax, mmax_, mstart_); - auto theta = to_cmav(theta_); - auto phi0 = to_cmav(phi0_); - auto nphi = to_cmav(nphi_); - auto ringstart = to_cmav(ringstart_); - MR_assert((alm_.ndim()>=2)&&(alm_.ndim()<=3), "alm must be a 2D or 3D array"); - auto alm = to_cmav_with_optional_leading_dimensions,3>(alm_); - vector mapshp(alm_.ndim()); - for(size_t i=0; i(map__, mapshp); - auto map = to_vmav_with_optional_leading_dimensions(map_); - MR_assert(map.shape(0)==alm.shape(0), "bad number of components in map array"); - size_t nthreads_outer=1; - if (alm.shape(0)>nthreads) // parallelize over entire transforms - { nthreads_outer=nthreads; nthreads=1; } - { - py::gil_scoped_release release; - execDynamic(alm.shape(0), nthreads_outer, 1, [&](Scheduler &sched) - { - while (auto rng=sched.getNext()) - for(auto itrans=rng.lo; itrans(alm, {{itrans},{},{}}); - auto submap = subarray<2>(map, {{itrans},{},{}}); - synthesis(subalm, submap, 1, lmax, mstart, lstride, theta, nphi, phi0, - ringstart, pixstride, nthreads, ALM2MAP_DERIV1); - } - }); - } - return map_; - } py::array Py_synthesis_deriv1(const py::array &alm, const py::array &theta, size_t lmax, const py::object &mstart, const py::array &nphi, const py::array &phi0, const py::array &ringstart, ptrdiff_t lstride, ptrdiff_t pixstride, size_t nthreads, py::object &map, const py::object &mmax_) { - if (isPyarr>(alm)) - return Py2_synthesis_deriv1(alm, map, lmax, mstart, lstride, theta, - nphi, phi0, ringstart, pixstride, nthreads, mmax_); - else if (isPyarr>(alm)) - return Py2_synthesis_deriv1(alm, map, lmax, mstart, lstride, theta, - nphi, phi0, ringstart, pixstride, nthreads, mmax_); - MR_fail("type matching failed: 'alm' has neither type 'c8' nor 'c16'"); + return Py_synthesis(alm, theta, lmax, mstart, nphi, phi0, ringstart, 1, lstride, + pixstride, nthreads, map, mmax_, "DERIV1"); } @@ -501,92 +444,71 @@ template py::array_t> check_build_alm template py::array Py2_synthesis_2d(const py::array &alm_, size_t spin, size_t lmax, const string &geometry, const py::object & ntheta, - const py::object &nphi, size_t mmax, size_t nthreads, py::object &map__) + const py::object &nphi, size_t mmax, size_t nthreads, py::object &map__, const string &mode_) { + auto mode = get_mode(mode_); auto alm = to_cmav,2>(alm_); - auto map_ = check_build_map(map__, alm.shape(0), ntheta, nphi); + auto map_ = check_build_map(map__, get_nmaps(spin,mode), ntheta, nphi); auto map = to_vmav(map_); - MR_assert(map.shape(0)==alm.shape(0), "bad number of components in map array"); { py::gil_scoped_release release; - synthesis_2d(alm, map, spin, lmax, mmax, geometry, nthreads); + synthesis_2d(alm, map, spin, lmax, mmax, geometry, nthreads, mode); } return map_; } py::array Py_synthesis_2d(const py::array &alm, size_t spin, size_t lmax, const string &geometry, const py::object &ntheta, const py::object &nphi, - const py::object &mmax_, size_t nthreads, py::object &map) + const py::object &mmax_, size_t nthreads, py::object &map, const string &mode) 
{ size_t mmax = mmax_.is_none() ? lmax : mmax_.cast(); if (isPyarr>(alm)) return Py2_synthesis_2d(alm, spin, lmax, geometry, ntheta, nphi, - mmax, nthreads, map); + mmax, nthreads, map, mode); else if (isPyarr>(alm)) return Py2_synthesis_2d(alm, spin, lmax, geometry, ntheta, nphi, - mmax, nthreads, map); + mmax, nthreads, map, mode); MR_fail("type matching failed: 'alm' has neither type 'c8' nor 'c16'"); } template py::array Py2_adjoint_synthesis_2d( const py::array &map_, size_t spin, size_t lmax, const string &geometry, - size_t mmax, size_t nthreads, py::object &alm__) + size_t mmax, size_t nthreads, py::object &alm__, const string &mode_) { + auto mode = get_mode(mode_); auto map = to_cmav(map_); - auto alm_ = check_build_alm(alm__, map.shape(0), lmax, mmax); + auto alm_ = check_build_alm(alm__, get_nalm(spin,mode), lmax, mmax); auto alm = to_vmav,2>(alm_); - MR_assert(map.shape(0)==alm.shape(0), "bad number of components in map array"); { py::gil_scoped_release release; - adjoint_synthesis_2d(alm, map, spin, lmax, mmax, geometry, nthreads); + adjoint_synthesis_2d(alm, map, spin, lmax, mmax, geometry, nthreads, mode); } return alm_; } py::array Py_adjoint_synthesis_2d( const py::array &map, size_t spin, size_t lmax, const string &geometry, - const py::object &mmax_, size_t nthreads, py::object &alm) + const py::object &mmax_, size_t nthreads, py::object &alm, const string &mode) { size_t mmax = mmax_.is_none() ? lmax : mmax_.cast(); if (isPyarr(map)) return Py2_adjoint_synthesis_2d(map, spin, lmax, geometry, mmax, - nthreads, alm); + nthreads, alm,mode); else if (isPyarr(map)) return Py2_adjoint_synthesis_2d(map, spin, lmax, geometry, mmax, - nthreads, alm); + nthreads, alm, mode); MR_fail("type matching failed: 'alm' has neither type 'c8' nor 'c16'"); } -template py::array Py2_synthesis_2d_deriv1(const py::array &alm_, - size_t lmax, const string &geometry, const py::object & ntheta, - const py::object &nphi, size_t mmax, size_t nthreads, py::object &map__) - { - auto alm = to_cmav,2>(alm_); - auto map_ = check_build_map(map__, 2, ntheta, nphi); - auto map = to_vmav(map_); - MR_assert((map.shape(0)==2) && (alm.shape(0)==1), - "incorrect number of components"); - { - py::gil_scoped_release release; - synthesis_2d(alm, map, 1, lmax, mmax, geometry, nthreads, ALM2MAP_DERIV1); - } - return map_; - } py::array Py_synthesis_2d_deriv1(const py::array &alm, size_t lmax, const string &geometry, const py::object &ntheta, const py::object &nphi, const py::object &mmax_, size_t nthreads, py::object &map) { - size_t mmax = mmax_.is_none() ? 
lmax : mmax_.cast(); - if (isPyarr>(alm)) - return Py2_synthesis_2d_deriv1(alm, lmax, geometry, ntheta, nphi, - mmax, nthreads, map); - else if (isPyarr>(alm)) - return Py2_synthesis_2d_deriv1(alm, lmax, geometry, ntheta, nphi, - mmax, nthreads, map); - MR_fail("type matching failed: 'alm' has neither type 'c8' nor 'c16'"); + return Py_synthesis_2d(alm, 1, lmax, geometry, ntheta, nphi, mmax_, nthreads, map, "DERIV1"); } template py::array Py2_adjoint_synthesis(py::object &alm__, size_t lmax, const py::object &mstart_, ptrdiff_t lstride, const py::array &map_, const py::array &theta_, const py::array &phi0_, const py::array &nphi_, const py::array &ringstart_, size_t spin, - ptrdiff_t pixstride, size_t nthreads, const py::object &mmax_) + ptrdiff_t pixstride, size_t nthreads, const py::object &mmax_, const string &mode_) { + auto mode = get_mode(mode_); auto mstart = get_mstart(lmax, mmax_, mstart_); auto theta = to_cmav(theta_); auto phi0 = to_cmav(phi0_); @@ -600,10 +522,10 @@ template py::array Py2_adjoint_synthesis(py::object &alm__, vector almshp(map_.ndim()); for(size_t i=0; i>(alm__, almshp); auto alm = to_vmav_with_optional_leading_dimensions,3>(alm_); MR_assert(map.shape(0)==alm.shape(0), "bad number of components in alm array"); - MR_assert(map.shape(1)==alm.shape(1), "bad number of components in alm array"); size_t nthreads_outer=1; if (map.shape(0)>nthreads) // parallelize over entire transforms { nthreads_outer=nthreads; nthreads=1; } @@ -617,7 +539,7 @@ template py::array Py2_adjoint_synthesis(py::object &alm__, auto submap = subarray<2>(map, {{itrans},{},{}}); auto subalm = subarray<2>(alm, {{itrans},{},{}}); adjoint_synthesis(subalm, submap, spin, lmax, mstart, lstride, theta, - nphi, phi0, ringstart, pixstride, nthreads); + nphi, phi0, ringstart, pixstride, nthreads, mode); } }); } @@ -630,14 +552,15 @@ py::array Py_adjoint_synthesis(const py::array &map, const py::array &theta, const py::array &phi0, const py::array &ringstart, size_t spin, ptrdiff_t lstride, ptrdiff_t pixstride, size_t nthreads, - py::object &alm, const py::object &mmax_) + py::object &alm, const py::object &mmax_, + const string &mode) { if (isPyarr(map)) return Py2_adjoint_synthesis(alm, lmax, mstart, lstride, map, theta, - phi0, nphi, ringstart, spin, pixstride, nthreads, mmax_); + phi0, nphi, ringstart, spin, pixstride, nthreads, mmax_, mode); else if (isPyarr(map)) return Py2_adjoint_synthesis(alm, lmax, mstart, lstride, map, theta, - phi0, nphi, ringstart, spin, pixstride, nthreads, mmax_); + phi0, nphi, ringstart, spin, pixstride, nthreads, mmax_, mode); MR_fail("type matching failed: 'alm' has neither type 'c8' nor 'c16'"); } template py::object Py2_pseudo_analysis(py::object &alm__, @@ -781,32 +704,32 @@ py::array Py_adjoint_analysis_2d(const py::array &alm, size_t spin, size_t lmax, template py::array Py2_synthesis_general(const py::array &alm_, size_t spin, size_t lmax, const py::array &loc_, double epsilon, size_t mmax, - size_t nthreads, py::object &map__, double sigma_min, double sigma_max) + size_t nthreads, py::object &map__, double sigma_min, double sigma_max, const string &mode_) { + auto mode = get_mode(mode_); auto alm = to_cmav,2>(alm_); auto loc = to_cmav(loc_); MR_assert(loc.shape(1)==2, "last dimension of loc must have size 2"); - size_t ncomp = (spin==0) ? 
1 : 2; - MR_assert(alm.shape(0)==ncomp, "number of components mismatch in alm"); - auto map_ = get_optional_Pyarr(map__, {alm.shape(0), loc.shape(0)}); + MR_assert(alm.shape(0)==get_nalm(spin,mode), "number of components mismatch in alm"); + auto map_ = get_optional_Pyarr(map__, {get_nmaps(spin,mode), loc.shape(0)}); auto map = to_vmav(map_); { py::gil_scoped_release release; - synthesis_general(alm, map, spin, lmax, mmax, loc, epsilon, sigma_min, sigma_max, nthreads); + synthesis_general(alm, map, spin, lmax, mmax, loc, epsilon, sigma_min, sigma_max, nthreads, mode); } return map_; } py::array Py_synthesis_general(const py::array &alm, size_t spin, size_t lmax, const py::array &loc, double epsilon, const py::object &mmax_, - size_t nthreads, py::object &map, double sigma_min, double sigma_max) + size_t nthreads, py::object &map, double sigma_min, double sigma_max, const string &mode) { size_t mmax = mmax_.is_none() ? lmax : mmax_.cast(); if (isPyarr(loc)) { if (isPyarr>(alm)) - return Py2_synthesis_general(alm, spin, lmax, loc, epsilon, mmax, nthreads, map, sigma_min, sigma_max); + return Py2_synthesis_general(alm, spin, lmax, loc, epsilon, mmax, nthreads, map, sigma_min, sigma_max, mode); else if (isPyarr>(alm)) - return Py2_synthesis_general(alm, spin, lmax, loc, epsilon, mmax, nthreads, map, sigma_min, sigma_max); + return Py2_synthesis_general(alm, spin, lmax, loc, epsilon, mmax, nthreads, map, sigma_min, sigma_max, mode); } #if 0 else if (isPyarr(loc)) @@ -823,33 +746,33 @@ py::array Py_synthesis_general(const py::array &alm, size_t spin, size_t lmax, template py::array Py2_adjoint_synthesis_general(const py::array &map_, size_t spin, size_t lmax, const py::array &loc_, double epsilon, size_t mmax, - size_t nthreads, py::object &alm__, double sigma_min, double sigma_max) + size_t nthreads, py::object &alm__, double sigma_min, double sigma_max, const string &mode_) { + auto mode = get_mode(mode_); auto map = to_cmav(map_); auto loc = to_cmav(loc_); MR_assert(loc.shape(1)==2, "last dimension of loc must have size 2"); - size_t ncomp = (spin==0) ? 1 : 2; - MR_assert(map.shape(0)==ncomp, "number of components mismatch in map"); - auto alm_ = check_build_alm(alm__, map.shape(0), lmax, mmax); + MR_assert(map.shape(0)==get_nmaps(spin,mode), "number of components mismatch in map"); + auto alm_ = check_build_alm(alm__, get_nalm(spin,mode), lmax, mmax); auto alm = to_vmav,2>(alm_); { py::gil_scoped_release release; - adjoint_synthesis_general(alm, map, spin, lmax, mmax, loc, epsilon, sigma_min, sigma_max, nthreads); + adjoint_synthesis_general(alm, map, spin, lmax, mmax, loc, epsilon, sigma_min, sigma_max, nthreads, mode); } return alm_; } py::array Py_adjoint_synthesis_general(const py::array &map, size_t spin, size_t lmax, const py::array &loc, double epsilon, const py::object &mmax_, - size_t nthreads, py::object &alm, double sigma_min, double sigma_max) + size_t nthreads, py::object &alm, double sigma_min, double sigma_max, const string &mode) { size_t mmax = mmax_.is_none() ? 
lmax : mmax_.cast(); if (isPyarr(loc)) { if (isPyarr(map)) - return Py2_adjoint_synthesis_general(map, spin, lmax, loc, epsilon, mmax, nthreads, alm, sigma_min, sigma_max); + return Py2_adjoint_synthesis_general(map, spin, lmax, loc, epsilon, mmax, nthreads, alm, sigma_min, sigma_max, mode); else if (isPyarr(map)) - return Py2_adjoint_synthesis_general(map, spin, lmax, loc, epsilon, mmax, nthreads, alm, sigma_min, sigma_max); + return Py2_adjoint_synthesis_general(map, spin, lmax, loc, epsilon, mmax, nthreads, alm, sigma_min, sigma_max, mode); } #if 0 else if (isPyarr(loc)) @@ -1019,13 +942,13 @@ template class Py_sharpjob ringstart(rs) = size_t(base.Npix() - startpix - ringpix); } vmav mr(map.data(), {1, size_t(npix_)}, {0, map.stride(0)}); - synthesis(ar, mr, 0, lmax_, mstart, 1, theta, nphi, phi0, ringstart, 1, nthreads); + synthesis(ar, mr, 0, lmax_, mstart, 1, theta, nphi, phi0, ringstart, 1, nthreads, STANDARD); } else { vmav mr(map.data(), {1, size_t(ntheta_), size_t(nphi_)}, {0, ptrdiff_t(map.stride(0)*nphi_), map.stride(0)}); - synthesis_2d(ar, mr, 0, lmax_, mmax_, geom, nthreads); + synthesis_2d(ar, mr, 0, lmax_, mmax_, geom, nthreads, STANDARD); } return map_; } @@ -1059,13 +982,13 @@ template class Py_sharpjob ringstart(rs) = size_t(base.Npix() - startpix - ringpix); } cmav mr(map.data(), {1, npix_}, {0, map.stride(0)}); - adjoint_synthesis(ar, mr, 0, lmax_, mstart, 1, theta, nphi, phi0, ringstart, 1, nthreads); + adjoint_synthesis(ar, mr, 0, lmax_, mstart, 1, theta, nphi, phi0, ringstart, 1, nthreads,STANDARD); } else { cmav mr(map.data(), {1, size_t(ntheta_), size_t(nphi_)}, {0, ptrdiff_t(map.stride(0)*nphi_), map.stride(0)}); - adjoint_synthesis_2d(ar, mr, 0, lmax_, mmax_, geom, nthreads); + adjoint_synthesis_2d(ar, mr, 0, lmax_, mmax_, geom, nthreads, STANDARD); } return alm_; } @@ -1111,13 +1034,13 @@ template class Py_sharpjob ringstart(r) = size_t(startpix); ringstart(rs) = size_t(base.Npix() - startpix - ringpix); } - synthesis(alm, map, spin, lmax_, mstart, 1, theta, nphi, phi0, ringstart, 1, nthreads); + synthesis(alm, map, spin, lmax_, mstart, 1, theta, nphi, phi0, ringstart, 1, nthreads, STANDARD); } else { vmav mr(map.data(), {2, ntheta_, nphi_}, {map.stride(0), ptrdiff_t(map.stride(1)*nphi_), map.stride(1)}); - synthesis_2d(alm, mr, spin, lmax_, mmax_, geom, nthreads); + synthesis_2d(alm, mr, spin, lmax_, mmax_, geom, nthreads, STANDARD); } return map_; } @@ -1185,17 +1108,15 @@ dependent on theta and m. Parameters ---------- -alm: numpy.ndarray((ncomp, x), dtype=numpy.complex64 or numpy.complex128) +alm: numpy.ndarray((nalm, x), dtype=numpy.complex64 or numpy.complex128) the set of spherical harmonic coefficients. - ncomp must be 1 if spin is 0, else 2. The second dimension must be large enough to accommodate all entries, which are stored according to the parameters `lmax`, 'mval`, `mstart`, and `lstride`. 
-leg: None or numpy.ndarray((ncomp, ntheta, nm), same dtype as `alm`) +leg: None or numpy.ndarray((nleg, ntheta, nm), same dtype as `alm`) output array containing the Legendre coefficients if `None`, a new suitable array is allocated spin: int >= 0 the spin to use for the transform - if spin==0, ncomp must be 1, otherwise 2 lmax: int >= 0 the maximum l moment of the transform (inclusive) mval: numpy.ndarray((nm,), dtype = numpy.uint64) @@ -1212,11 +1133,22 @@ theta: numpy.ndarray((ntheta,), dtype=numpy.float64) nthreads: int >= 0 the number of threads to use for the computation if 0, use as many threads as there are hardware threads available on the system +mode: str + the transform mode + | "STANDARD": standard transform + | "GRAD_ONLY": only valid for spin>0, curl a_lm are assumed to be zero + | "DERIV1": same as "GRAD_ONLY", but spin is assumed to be 1 and a_lm are + muliplied by sqrt(l*(l+1)) Returns ------- -numpy.ndarray((ncomp, ntheta, nm), same dtype as `alm`) +numpy.ndarray((nleg, ntheta, nm), same dtype as `alm`) the Legendre coefficients. If `leg` was supplied, this will be the same object. + +Notes +----- +nleg = 1 if spin == 0 else 2 +nalm = 1 if spin == 0 else (2 if mode == "STANDARD" else 1) )"""; constexpr const char *alm2leg_deriv1_DS = R"""( @@ -1263,16 +1195,14 @@ Transforms a set of Legendre coefficients to spherical harmonic coefficients Parameters ---------- -leg: numpy.ndarray((ncomp, ntheta, nm), dtype=numpy.complex64 or numpy.complex128) - ncomp must be 1 if spin is 0, else 2 -alm: None or numpy.ndarray((ncomp, x), same dtype as `leg`) +leg: numpy.ndarray((nleg, ntheta, nm), dtype=numpy.complex64 or numpy.complex128) +alm: None or numpy.ndarray((nalm, x), same dtype as `leg`) the set of spherical harmonic coefficients. The second dimension must be large enough to accommodate all entries, which are stored according to the parameters `lmax`, 'mval`, `mstart`, and `lstride`. if `None`, a new suitable array is allocated spin: int >= 0 the spin to use for the transform - if spin==0, ncomp must be 1, otherwise 2 lmax: int >= 0 the maximum l moment of the transform (inclusive) mval: numpy.ndarray((nm,), dtype = numpy.uint64) @@ -1289,13 +1219,22 @@ theta: numpy.ndarray((ntheta,), dtype=numpy.float64) nthreads: int >= 0 the number of threads to use for the computation if 0, use as many threads as there are hardware threads available on the system +mode: str + the transform mode + | "STANDARD": standard transform + | "GRAD_ONLY": only valid for spin>0, curl a_lm are not computed Returns ------- -numpy.ndarray((ncomp, *), same dtype as `leg`) +numpy.ndarray((nalm, *), same dtype as `leg`) the Legendre coefficients. if `alm` was supplied, this will be the same object If newly allocated, the smallest possible second dimensions will be chosen. + +Notes +----- +nleg = 1 if spin == 0 else 2 +nalm = 1 if spin == 0 else (2 if mode == "STANDARD" else 1) )"""; constexpr const char *map2leg_DS = R"""( @@ -1388,11 +1327,11 @@ Transforms one or two sets of spherical harmonic coefficients to 2D maps. Parameters ---------- -alm: numpy.ndarray((ncomp, x), dtype=numpy.complex64 or numpy.complex128) +alm: numpy.ndarray((nalm, x), dtype=numpy.complex64 or numpy.complex128) the set of spherical harmonic coefficients. The second dimension must be large enough to accommodate all entries, which are stored according to the healpy convention. 
-map: numpy.ndarray((ncomp, ntheta, nphi), dtype=numpy.float of same accuracy as alm) +map: numpy.ndarray((nmaps, ntheta, nphi), dtype=numpy.float of same accuracy as alm) storage for the output map. If not supplied, a new array is allocated. ntheta, nphi: int > 0 @@ -1401,7 +1340,6 @@ ntheta, nphi: int > 0 If supplied, and `map` is also supplied, must match with the array dimensions spin: int >= 0 the spin to use for the transform. - If spin==0, ncomp must be 1, otherwise 2 lmax: int >= 0 the maximum l moment of the transform (inclusive). mmax: int >= 0 and <= lmax @@ -1424,12 +1362,23 @@ geometry: one of "CC", "F1", "MW", "MWflip", "GL", "DH", "F2" nthreads: int >= 0 the number of threads to use for the computation. If 0, use as many threads as there are hardware threads available on the system +mode: str + the transform mode + | "STANDARD": standard transform + | "GRAD_ONLY": only valid for spin>0, curl a_lm are assumed to be zero + | "DERIV1": same as "GRAD_ONLY", but spin is assumed to be 1 and a_lm are + muliplied by sqrt(l*(l+1)) Returns ------- -numpy.ndarray((ncomp, ntheta, nphi), dtype=numpy.float of same accuracy as alm) +numpy.ndarray((nmaps, ntheta, nphi), dtype=numpy.float of same accuracy as alm) the computed map. If the map parameter was specified, this is identical with map. + +Notes +----- +nmaps = 1 if spin == 0 else 2 +nalm = 1 if spin == 0 else (2 if mode == "STANDARD" else 1) )"""; constexpr const char *synthesis_2d_deriv1_DS = R"""( @@ -1474,7 +1423,7 @@ nthreads: int >= 0 Returns ------- -numpy.ndarray((ncomp, ntheta, nphi), dtype=numpy.float of same accuracy as alm) +numpy.ndarray((2, ntheta, nphi), dtype=numpy.float of same accuracy as alm) the maps containing the derivatives with respect to theta and phi. If the map parameter was specified, this is identical with map. )"""; @@ -1485,16 +1434,15 @@ This is the adjoint operation of `synthesis_2D`. Parameters ---------- -alm: numpy.ndarray((ncomp, x), dtype=numpy.complex64 or numpy.complex128) +alm: numpy.ndarray((nalm, x), dtype=numpy.complex64 or numpy.complex128) storage for the spherical harmonic coefficients. The second dimension must be large enough to accommodate all entries, which are stored according to the healpy convention. If not supplied, a new array is allocated. -map: numpy.ndarray((ncomp, ntheta, nphi), dtype=numpy.float of same accuracy as alm) +map: numpy.ndarray((nmaps, ntheta, nphi), dtype=numpy.float of same accuracy as alm) The input map. spin: int >= 0 the spin to use for the transform. - If spin==0, ncomp must be 1, otherwise 2 lmax: int >= 0 the maximum l (and m) moment of the transform (inclusive) mmax: int >= 0 and <= lmax @@ -1517,12 +1465,21 @@ geometry: one of "CC", "F1", "MW", "MWflip", "GL", "DH", "F2" nthreads: int >= 0 the number of threads to use for the computation. If 0, use as many threads as there are hardware threads available on the system +mode: str + the transform mode + | "STANDARD": standard transform + | "GRAD_ONLY": only valid for spin>0, curl a_lm are not computed Returns ------- -numpy.ndarray((ncomp, x), dtype=numpy.complex64 or numpy.complex128) +numpy.ndarray((nalm, x), dtype=numpy.complex64 or numpy.complex128) the computed spherical harmonic coefficients If the `alm` parameter was specified, this is identical to `alm`. 
+ +Notes +----- +nmaps = 1 if spin == 0 else 2 +nalm = 1 if spin == 0 else (2 if mode == "STANDARD" else 1) )"""; constexpr const char *analysis_2d_DS = R"""( @@ -1648,12 +1605,11 @@ Transforms (sets of) one or two sets of spherical harmonic coefficients to maps Parameters ---------- -alm: numpy.ndarray(([ntrans,] ncomp, x), dtype=numpy.complex64 or numpy.complex128) +alm: numpy.ndarray(([ntrans,] nalm, x), dtype=numpy.complex64 or numpy.complex128) the set(s) of spherical harmonic coefficients. - ncomp must be 1 if spin is 0, else 2. The last dimension must be large enough to accommodate all entries, which are stored according to the parameters `lmax`, 'mmax`, `mstart`, and `lstride`. -map: None or numpy.ndarray(([ntrans,] ncomp, x), dtype=numpy.float of same accuracy as `alm` +map: None or numpy.ndarray(([ntrans,] nmaps, x), dtype=numpy.float of same accuracy as `alm` the map pixel data. The last dimension must be large enough to accommodate all pixels, which are stored according to the parameters `nphi`, 'ringstart`, and `pixstride`. @@ -1682,18 +1638,28 @@ nthreads: int >= 0 if 0, use as many threads as there are hardware threads available on the system spin: int >= 0 the spin to use for the transform. - If spin==0, ncomp must be 1, otherwise 2 lmax: int >= 0 the maximum l moment of the transform (inclusive). mmax: int >= 0 <= lmax the maximum m moment of the transform (inclusive). +mode: str + the transform mode + | "STANDARD": standard transform + | "GRAD_ONLY": only valid for spin>0, curl a_lm are assumed to be zero + | "DERIV1": same as "GRAD_ONLY", but spin is assumed to be 1 and a_lm are + muliplied by sqrt(l*(l+1)) Returns ------- -numpy.ndarray(([ntrans,] ncomp, x), dtype=numpy.float of same accuracy as `alm`) +numpy.ndarray(([ntrans,] nmaps, x), dtype=numpy.float of same accuracy as `alm`) the map pixel data. If `map` was supplied, this will be the same object If newly allocated, the smallest possible last dimension will be chosen. + +Notes +----- +nmaps = 1 if spin == 0 else 2 +nalm = 1 if spin == 0 else (2 if mode == "STANDARD" else 1) )"""; constexpr const char *adjoint_synthesis_DS = R"""( @@ -1702,12 +1668,12 @@ This is the adjoint operation of `synthesis`. Parameters ---------- -alm: None or numpy.ndarray(([ntrans,] ncomp, x), dtype=numpy.complex of same precision as `map`) +alm: None or numpy.ndarray(([ntrans,] nalm, x), dtype=numpy.complex of same precision as `map`) the set of spherical harmonic coefficients. The last dimension must be large enough to accommodate all entries, which are stored according to the parameters `lmax`, 'mmax`, `mstart`, and `lstride`. if `None`, a new suitable array is allocated -map: numpy.ndarray(([ntrans,] ncomp, x), dtype=numpy.float32 or numpy.float64 +map: numpy.ndarray(([ntrans,] nmaps, x), dtype=numpy.float32 or numpy.float64 The last dimension must be large enough to accommodate all pixels, which are stored according to the parameters `nphi`, `ringstart`, and `pixstride`. theta: numpy.ndarray((ntheta,), dtype=numpy.float64) @@ -1734,18 +1700,26 @@ nthreads: int >= 0 if 0, use as many threads as there are hardware threads available on the system spin: int >= 0 the spin to use for the transform. - If spin==0, ncomp must be 1, otherwise 2 lmax: int >= 0 the maximum l moment of the transform (inclusive). mmax: int >= 0 <= lmax the maximum m moment of the transform (inclusive). 
+mode: str + the transform mode + | "STANDARD": standard transform + | "GRAD_ONLY": only valid for spin>0, curl a_lm are not computed Returns ------- -numpy.ndarray(([ntrans,] ncomp, x), dtype=numpy.complex of same accuracy as `map`) +numpy.ndarray(([ntrans,] nalm, x), dtype=numpy.complex of same accuracy as `map`) the set(s) of spherical harmonic coefficients. If `alm` was supplied, this will be the same object If newly allocated, the smallest possible last dimension will be chosen. + +Notes +----- +nmaps = 1 if spin == 0 else 2 +nalm = 1 if spin == 0 else (2 if mode == "STANDARD" else 1) )"""; constexpr const char *pseudo_analysis_DS = R"""( @@ -1875,13 +1849,12 @@ Evaluate a_lm at arbitrary positions on the sphere Parameters ---------- -alm: numpy.ndarray((ncomp, x), dtype=numpy.complex64 or numpy.complex128) +alm: numpy.ndarray((nalm, x), dtype=numpy.complex64 or numpy.complex128) the set(s) of spherical harmonic coefficients. The last dimension must be large enough to accommodate all entries, which are stored according to the healpy convention. spin: int >= 0 the spin to use for the transform. - If spin==0, ncomp must be 1, otherwise 2 lmax: int >= 0 the maximum l moment of the transform (inclusive). mmax: int >= 0 and <= lmax @@ -1898,29 +1871,39 @@ epsilon : float nthreads: int >= 0 the number of threads to use for the computation if 0, use as many threads as there are hardware threads available on the system -map: None or numpy.ndarray((ncomp, npix), dtype=numpy.float of same accuracy as `alm` +map: None or numpy.ndarray((nmaps, npix), dtype=numpy.float of same accuracy as `alm` the map pixel data. If `None`, a new suitable array is allocated. sigma_min, sigma_max: float minimum and maximum allowed oversampling factors for the NUFFT component 1.2 <= sigma_min < sigma_max <= 2.5 +mode: str + the transform mode + | "STANDARD": standard transform + | "GRAD_ONLY": only valid for spin>0, curl a_lm are assumed to be zero + | "DERIV1": same as "GRAD_ONLY", but spin is assumed to be 1 and a_lm are + muliplied by sqrt(l*(l+1)) Returns ------- -numpy.ndarray((ncomp, npix), dtype=numpy.float of same accuracy as `alm` +numpy.ndarray((nmaps, npix), dtype=numpy.float of same accuracy as `alm` the pixel values at the locations specified by `loc`. If the map parameter was specified, this is identical with map. + +Notes +----- +nmaps = 1 if spin == 0 else 2 +nalm = 1 if spin == 0 else (2 if mode == "STANDARD" else 1) )"""; constexpr const char *adjoint_synthesis_general_DS = R"""( This is the adjoint operation of `synthesis_general`. Parameters ---------- -map: numpy.ndarray((ncomp, npix), dtype=numpy.float32 or numpy.float64 +map: numpy.ndarray((nmaps, npix), dtype=numpy.float32 or numpy.float64 The pixel values at the locations specified by `loc`. spin: int >= 0 the spin to use for the transform. - If spin==0, ncomp must be 1, otherwise 2 lmax: int >= 0 the maximum l moment of the transform (inclusive). mmax: int >= 0 and <= lmax @@ -1937,7 +1920,7 @@ epsilon : float nthreads: int >= 0 the number of threads to use for the computation if 0, use as many threads as there are hardware threads available on the system -alm: None or numpy.ndarray((ncomp, x), dtype=complex, same accuracy as `map`) +alm: None or numpy.ndarray((nalm, x), dtype=complex, same accuracy as `map`) the set(s) of spherical harmonic coefficients. The last dimension must be large enough to accommodate all entries, which are stored according to the healpy convention. 
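At the Python level, the grad-only variants of the general (arbitrary-location) transforms are used as in the following minimal sketch, which mirrors the adjointness test added in python/test/test_sht.py; lmax, npix, epsilon and the random inputs are illustrative and not taken from that test:

# Grad-only synthesis at arbitrary locations and its adjoint (illustrative sketch).
import numpy as np
import ducc0

rng = np.random.default_rng(42)
lmax = mmax = 50
spin = 2
npix = 1000
nalm = (lmax + 1) * (lmax + 2) // 2   # triangular a_lm storage with mmax == lmax

# random locations on the sphere: colatitude in [0, pi), longitude in [0, 2*pi)
loc = rng.uniform(0., 1., (npix, 2))
loc[:, 0] *= np.pi
loc[:, 1] *= 2 * np.pi

# one a_lm component in (gradient part only), two map components out
# (for a physically meaningful field the m == 0 modes would be real and
#  modes with l < spin zero; for this shape demonstration it does not matter)
alm = rng.standard_normal((1, nalm)) + 1j * rng.standard_normal((1, nalm))
maps = ducc0.sht.experimental.synthesis_general(
    alm=alm, spin=spin, lmax=lmax, mmax=mmax, loc=loc, epsilon=1e-6,
    nthreads=2, mode="GRAD_ONLY")
assert maps.shape == (2, npix)

# the adjoint maps the two map components back onto a single a_lm component
alm_adj = ducc0.sht.experimental.adjoint_synthesis_general(
    map=maps, spin=spin, lmax=lmax, mmax=mmax, loc=loc, epsilon=1e-6,
    nthreads=2, mode="GRAD_ONLY")
assert alm_adj.shape == (1, nalm)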
@@ -1945,12 +1928,21 @@ alm: None or numpy.ndarray((ncomp, x), dtype=complex, same accuracy as `map`) sigma_min, sigma_max: float minimum and maximum allowed oversampling factors for the NUFFT component 1.2 <= sigma_min < sigma_max <= 2.5 +mode: str + the transform mode + | "STANDARD": standard transform + | "GRAD_ONLY": only valid for spin>0, curl a_lm are not computed Returns ------- -numpy.ndarray((ncomp, x), dtype=complex, same accuracy as `map`) +numpy.ndarray((nalm, x), dtype=complex, same accuracy as `map`) the computed spherical harmonic coefficients If the `alm` parameter was specified, this is identical to `alm`. + +Notes +----- +nmaps = 1 if spin == 0 else 2 +nalm = 1 if spin == 0 else (2 if mode == "STANDARD" else 1) )"""; constexpr const char *pseudo_analysis_general_DS = R"""( @@ -2033,10 +2025,10 @@ void add_sht(py::module_ &msup) m2.def("synthesis", &Py_synthesis, synthesis_DS, py::kw_only(), "alm"_a, "theta"_a, "lmax"_a, "mstart"_a=None, "nphi"_a, "phi0"_a, "ringstart"_a, "spin"_a, - "lstride"_a=1, "pixstride"_a=1, "nthreads"_a=1, "map"_a=None, "mmax"_a=None); + "lstride"_a=1, "pixstride"_a=1, "nthreads"_a=1, "map"_a=None, "mmax"_a=None, "mode"_a="STANDARD"); m2.def("adjoint_synthesis", &Py_adjoint_synthesis, adjoint_synthesis_DS, py::kw_only(), "map"_a, "theta"_a, "lmax"_a, "mstart"_a=None, "nphi"_a, "phi0"_a, "ringstart"_a, "spin"_a, - "lstride"_a=1, "pixstride"_a=1, "nthreads"_a=1, "alm"_a=None, "mmax"_a=None); + "lstride"_a=1, "pixstride"_a=1, "nthreads"_a=1, "alm"_a=None, "mmax"_a=None, "mode"_a="STANDARD"); m2.def("pseudo_analysis", &Py_pseudo_analysis, pseudo_analysis_DS, py::kw_only(), "map"_a, "theta"_a, "lmax"_a, "mstart"_a=None, "nphi"_a, "phi0"_a, "ringstart"_a, "spin"_a, "lstride"_a=1, "pixstride"_a=1, "nthreads"_a=1, "alm"_a=None, "maxiter"_a, "epsilon"_a, "mmax"_a=None); @@ -2044,23 +2036,23 @@ void add_sht(py::module_ &msup) "lmax"_a, "mstart"_a=None, "nphi"_a, "phi0"_a, "ringstart"_a, "lstride"_a=1, "pixstride"_a=1, "nthreads"_a=1, "map"_a=None, "mmax"_a=None); - m2.def("synthesis_2d", &Py_synthesis_2d, synthesis_2d_DS, py::kw_only(), "alm"_a, "spin"_a, "lmax"_a, "geometry"_a, "ntheta"_a=None, "nphi"_a=None, "mmax"_a=None, "nthreads"_a=1, "map"_a=None); - m2.def("adjoint_synthesis_2d", &Py_adjoint_synthesis_2d, adjoint_synthesis_2d_DS, py::kw_only(), "map"_a, "spin"_a, "lmax"_a, "geometry"_a, "mmax"_a=None, "nthreads"_a=1, "alm"_a=None); + m2.def("synthesis_2d", &Py_synthesis_2d, synthesis_2d_DS, py::kw_only(), "alm"_a, "spin"_a, "lmax"_a, "geometry"_a, "ntheta"_a=None, "nphi"_a=None, "mmax"_a=None, "nthreads"_a=1, "map"_a=None, "mode"_a="STANDARD"); + m2.def("adjoint_synthesis_2d", &Py_adjoint_synthesis_2d, adjoint_synthesis_2d_DS, py::kw_only(), "map"_a, "spin"_a, "lmax"_a, "geometry"_a, "mmax"_a=None, "nthreads"_a=1, "alm"_a=None, "mode"_a="STANDARD"); m2.def("synthesis_2d_deriv1", &Py_synthesis_2d_deriv1, synthesis_2d_deriv1_DS, py::kw_only(), "alm"_a, "lmax"_a, "geometry"_a, "ntheta"_a=None, "nphi"_a=None, "mmax"_a=None, "nthreads"_a=1, "map"_a=None); m2.def("analysis_2d", &Py_analysis_2d, analysis_2d_DS, py::kw_only(), "map"_a, "spin"_a, "lmax"_a, "geometry"_a, "mmax"_a=None, "nthreads"_a=1, "alm"_a=None); m2.def("adjoint_analysis_2d", &Py_adjoint_analysis_2d, adjoint_analysis_2d_DS, py::kw_only(), "alm"_a, "spin"_a, "lmax"_a, "geometry"_a, "ntheta"_a=None, "nphi"_a=None, "mmax"_a=None, "nthreads"_a=1, "map"_a=None); // FIXME: maybe add mstart, lstride - m2.def("synthesis_general", &Py_synthesis_general, synthesis_general_DS, py::kw_only(), "alm"_a, 
"spin"_a, "lmax"_a, "loc"_a, "epsilon"_a=1e-5, "mmax"_a=None, "nthreads"_a=1, "map"_a=None, "sigma_min"_a=1.1, "sigma_max"_a=2.6); - m2.def("adjoint_synthesis_general", &Py_adjoint_synthesis_general, adjoint_synthesis_general_DS, py::kw_only(), "map"_a, "spin"_a, "lmax"_a, "loc"_a, "epsilon"_a=1e-5, "mmax"_a=None, "nthreads"_a=1, "alm"_a=None, "sigma_min"_a=1.1, "sigma_max"_a=2.6); + m2.def("synthesis_general", &Py_synthesis_general, synthesis_general_DS, py::kw_only(), "alm"_a, "spin"_a, "lmax"_a, "loc"_a, "epsilon"_a=1e-5, "mmax"_a=None, "nthreads"_a=1, "map"_a=None, "sigma_min"_a=1.1, "sigma_max"_a=2.6, "mode"_a="STANDARD"); + m2.def("adjoint_synthesis_general", &Py_adjoint_synthesis_general, adjoint_synthesis_general_DS, py::kw_only(), "map"_a, "spin"_a, "lmax"_a, "loc"_a, "epsilon"_a=1e-5, "mmax"_a=None, "nthreads"_a=1, "alm"_a=None, "sigma_min"_a=1.1, "sigma_max"_a=2.6, "mode"_a="STANDARD"); m2.def("pseudo_analysis_general", &Py_pseudo_analysis_general, pseudo_analysis_general_DS, py::kw_only(), "lmax"_a, "map"_a, "loc"_a, "spin"_a, "nthreads"_a, "maxiter"_a, "epsilon"_a=1e-5, "sigma_min"_a=1.1, "sigma_max"_a=2.6, "mmax"_a=None, "alm"_a=None); m2.def("GL_weights",&Py_GL_weights, "nlat"_a, "nlon"_a); m2.def("GL_thetas",&Py_GL_thetas, "nlat"_a); m2.def("get_gridweights", &Py_get_gridweights, "type"_a, "ntheta"_a); - m2.def("alm2leg", &Py_alm2leg, alm2leg_DS, py::kw_only(), "alm"_a, "lmax"_a, "theta"_a, "spin"_a=0, "mval"_a=None, "mstart"_a=None, "lstride"_a=1, "nthreads"_a=1, "leg"_a=None); + m2.def("alm2leg", &Py_alm2leg, alm2leg_DS, py::kw_only(), "alm"_a, "lmax"_a, "theta"_a, "spin"_a=0, "mval"_a=None, "mstart"_a=None, "lstride"_a=1, "nthreads"_a=1, "leg"_a=None, "mode"_a="STANDARD"); m2.def("alm2leg_deriv1", &Py_alm2leg_deriv1, alm2leg_deriv1_DS, py::kw_only(), "alm"_a, "lmax"_a, "theta"_a, "mval"_a=None, "mstart"_a=None, "lstride"_a=1, "nthreads"_a=1, "leg"_a=None); - m2.def("leg2alm", &Py_leg2alm, leg2alm_DS, py::kw_only(), "leg"_a, "lmax"_a, "theta"_a, "spin"_a=0, "mval"_a=None, "mstart"_a=None, "lstride"_a=1, "nthreads"_a=1, "alm"_a=None); + m2.def("leg2alm", &Py_leg2alm, leg2alm_DS, py::kw_only(), "leg"_a, "lmax"_a, "theta"_a, "spin"_a=0, "mval"_a=None, "mstart"_a=None, "lstride"_a=1, "nthreads"_a=1, "alm"_a=None, "mode"_a="STANDARD"); m2.def("map2leg", &Py_map2leg, map2leg_DS, py::kw_only(), "map"_a, "nphi"_a, "phi0"_a, "ringstart"_a, "mmax"_a, "pixstride"_a=1, "nthreads"_a=1, "leg"_a=None); m2.def("leg2map", &Py_leg2map, leg2map_DS, py::kw_only(), "leg"_a, "nphi"_a, "phi0"_a, "ringstart"_a, "pixstride"_a=1, "nthreads"_a=1, "map"_a=None); m.def("rotate_alm", &Py_rotate_alm, rotate_alm_DS, "alm"_a, "lmax"_a, "psi"_a, "theta"_a, diff --git a/python/test/test_sht.py b/python/test/test_sht.py index 4202bcdf..c3c8c244 100644 --- a/python/test/test_sht.py +++ b/python/test/test_sht.py @@ -131,6 +131,20 @@ def test_2d_adjoint(lmmax, geometry, spin, nthreads): v1 = np.sum([myalmdot(alm0[i], alm1[i], lmax) for i in range(ncomp)]) assert_(np.abs((v1-v2)/v1)<1e-10) + if spin > 0: + # test adjointness between synthesis and adjoint_synthesis (gradient only) + map1 = ducc0.sht.experimental.synthesis_2d(alm=alm0[:1], lmax=lmax, mmax=mmax, spin=spin, ntheta=nrings, nphi=nphi, nthreads=nthreads, geometry=geometry, mode="GRAD_ONLY") + v2 = np.sum([ducc0.misc.vdot(map0[i], map1[i]) for i in range(ncomp)]) + # compare grad-only alm2map with full alm2map where curl is set to zero + almx = alm0.copy() + almx[1] = 0 + map1b = ducc0.sht.experimental.synthesis_2d(alm=almx, lmax=lmax, mmax=mmax, 
spin=spin, ntheta=nrings, nphi=nphi, nthreads=nthreads, geometry=geometry) + assert_(ducc0.misc.l2error(map1, map1b)<1e-10) + del almx, map1 + alm1 = ducc0.sht.experimental.adjoint_synthesis_2d(lmax=lmax, mmax=mmax, spin=spin, map=map0, nthreads=nthreads, geometry=geometry, mode="GRAD_ONLY") + v1 = np.sum([myalmdot(alm0[i], alm1[i], lmax) for i in range(1)]) + assert_(np.abs((v1-v2)/v1)<1e-10) + # test adjointness between analysis and adjoint_analysis # naive version @@ -189,6 +203,14 @@ def test_healpix_adjoint(lmax, nside, spin, mmaxhalf, nthreads): v1 = np.sum([myalmdot(alm0[i], alm1[i], lmax) for i in range(ncomp)]) assert_(np.abs((v1-v2)/v1)<1e-10) + if spin > 0: + map1 = ducc0.sht.experimental.synthesis(alm=alm0[:1], lmax=lmax, spin=spin, nthreads=nthreads, mmax=mmax, mode="GRAD_ONLY", **geom) + v2 = np.sum([ducc0.misc.vdot(map0[i], map1[i]) for i in range(ncomp)]) + del map1 + alm1 = ducc0.sht.experimental.adjoint_synthesis(lmax=lmax, spin=spin, map=map0, nthreads=nthreads, mmax=mmax, mode="GRAD_ONLY", **geom) + v1 = np.sum([myalmdot(alm0[i], alm1[i], lmax) for i in range(1)]) + assert_(np.abs((v1-v2)/v1)<1e-10) + @pmp("lmax", tuple(range(0,70,3))) @pmp("nthreads", (0,1,2)) @@ -243,15 +265,21 @@ def test_adjointness_general(lmmax, npix, spin, nthreads): loc = rng.uniform(0., 1., (npix,2)) loc[:, 0] *= np.pi loc[:, 1] *= 2*np.pi - points1 = ducc0.sht.experimental.synthesis_general(lmax=lmax, mmax=mmax, alm=slm1, loc=loc, spin=spin, epsilon=epsilon, nthreads=nthreads) points2 = rng.uniform(-0.5, 0.5, (loc.shape[0],ncomp)).T + points1 = ducc0.sht.experimental.synthesis_general(lmax=lmax, mmax=mmax, alm=slm1, loc=loc, spin=spin, epsilon=epsilon, nthreads=nthreads) slm2 = ducc0.sht.experimental.adjoint_synthesis_general(lmax=lmax, mmax=mmax, map=points2, loc=loc, spin=spin, epsilon=epsilon, nthreads=nthreads) - print(slm1.shape, slm2.shape) v1 = np.sum([myalmdot(slm1[c, :], slm2[c, :], lmax) for c in range(ncomp)]) v2 = ducc0.misc.vdot(points2.real, points1.real) + ducc0.misc.vdot(points2.imag, points1.imag) assert_allclose(v1, v2, rtol=1e-9) + if spin > 0: + points1 = ducc0.sht.experimental.synthesis_general(lmax=lmax, mmax=mmax, alm=slm1[:1], loc=loc, spin=spin, epsilon=epsilon, nthreads=nthreads, mode="GRAD_ONLY") + slm2 = ducc0.sht.experimental.adjoint_synthesis_general(lmax=lmax, mmax=mmax, map=points2, loc=loc, spin=spin, epsilon=epsilon, nthreads=nthreads, mode="GRAD_ONLY") + v1 = np.sum([myalmdot(slm1[c, :], slm2[c, :], lmax) + for c in range(1)]) + v2 = ducc0.misc.vdot(points2.real, points1.real) + ducc0.misc.vdot(points2.imag, points1.imag) + assert_allclose(v1, v2, rtol=1e-9) @pmp('spin', (0, 1, 2)) @pmp('nthreads', (1, 4)) diff --git a/src/ducc0/sht/sht.cc b/src/ducc0/sht/sht.cc index 110d7b94..2d7222af 100644 --- a/src/ducc0/sht/sht.cc +++ b/src/ducc0/sht/sht.cc @@ -1133,7 +1133,7 @@ DUCC0_NOINLINE static void calc_map2alm_spin (dcmplx * DUCC0_RESTRICT alm, } -DUCC0_NOINLINE static void alm2map_deriv1_kernel(sxdata_v & DUCC0_RESTRICT d, +DUCC0_NOINLINE static void alm2map_spin_gradonly_kernel(sxdata_v & DUCC0_RESTRICT d, const vector &fx, const dcmplx * DUCC0_RESTRICT alm, size_t l, size_t lmax, size_t nv2) { @@ -1177,7 +1177,7 @@ DUCC0_NOINLINE static void alm2map_deriv1_kernel(sxdata_v & DUCC0_RESTRICT d, } } -DUCC0_NOINLINE static void calc_alm2map_deriv1(const dcmplx * DUCC0_RESTRICT alm, +DUCC0_NOINLINE static void calc_alm2map_spin_gradonly(const dcmplx * DUCC0_RESTRICT alm, const Ylmgen &gen, sxdata_v & DUCC0_RESTRICT d, size_t nth) { size_t l,lmax=gen.lmax; @@ 
-1240,7 +1240,7 @@ DUCC0_NOINLINE static void calc_alm2map_deriv1(const dcmplx * DUCC0_RESTRICT alm d.l1m[i] *= d.cfm[i]; d.l2m[i] *= d.cfm[i]; } - alm2map_deriv1_kernel(d, fx, alm, l, lmax, nv2); + alm2map_spin_gradonly_kernel(d, fx, alm, l, lmax, nv2); for (size_t i=0; i &fx, dcmplx * DUCC0_RESTRICT alm, + size_t l, size_t lmax, size_t nv2) + { + size_t lsave=l; + while (l<=lmax) + { + Tv fx10=fx[l+1].a,fx11=fx[l+1].b; + Tv fx20=fx[l+2].a,fx21=fx[l+2].b; + Tv agr1=0, agi1=0; + Tv agr2=0, agi2=0; + for (size_t i=0; ilmax) return; + + const auto &fx = gen.coef; + bool full_ieee=true; + for (size_t i=0; i=0) && + all_of(d.scm[i]>=0); + } + for (size_t i=0; i=0); + if (rescale(d.l1m[i], d.l2m[i], d.scm[i], sharp_ftol)) + getCorfac(d.scm[i], d.cfm[i]); + full_ieee &= all_of(d.scm[i]>=0); + } + vhsum_cmplx_special (agr1,agi1,agr2,agi2,&alm[l]); + l+=2; + } + if (l>lmax) return; + + for (size_t i=0; i DUCC0_NOINLINE static void inner_loop_a2m(SHT_mode mode, vmav,2> &almtmp, @@ -1374,9 +1485,10 @@ template DUCC0_NOINLINE static void inner_loop_a2m(SHT_mode mode, for (size_t i=0; i DUCC0_NOINLINE static void inner_loop_a2m(SHT_mode mode, } } -template DUCC0_NOINLINE static void inner_loop_m2a( +template DUCC0_NOINLINE static void inner_loop_m2a(SHT_mode mode, vmav,2> &almtmp, const cmav,3> &phase, const vector &rdata, Ylmgen &gen, size_t mi) @@ -1508,15 +1620,16 @@ template DUCC0_NOINLINE static void inner_loop_m2a( d.s.p1pr[i]=d.s.p1pi[i]=d.s.p2pr[i]=d.s.p2pi[i]=0.; d.s.p1mr[i]=d.s.p1mi[i]=d.s.p2mr[i]=d.s.p2mi[i]=0.; } - calc_map2alm_spin(almtmp.data(), gen, d.v, nth); + if (mode==STANDARD) + calc_map2alm_spin(almtmp.data(), gen, d.v, nth); + else + calc_map2alm_spin_gradonly(almtmp.data(), gen, d.v, nth); } } //adjust the a_lm for the new algorithm for (size_t l=gen.mhi; l<=gen.lmax; ++l) - { - almtmp(l,0)*=gen.alpha[l]; - almtmp(l,1)*=gen.alpha[l]; - } + for (size_t i=0; i void alm2leg( // associated Legendre transform MR_assert(nm==leg.shape(2), "nm mismatch"); auto nalm=alm.shape(0); auto mmax = get_mmax(mval, lmax); - if (mode==ALM2MAP_DERIV1) + if (mode==DERIV1) { spin=1; MR_assert(nalm==1, "need one a_lm component"); MR_assert(leg.shape(0)==2, "need two Legendre components"); } + else if (mode==GRAD_ONLY) + { + MR_assert(spin>0, "spin must be positive for grad-only SHTs"); + MR_assert(nalm==1, "need one a_lm component"); + MR_assert(leg.shape(0)==2, "need two Legendre components"); + } else { size_t ncomp = (spin==0) ? 1 : 2; @@ -1838,8 +1957,8 @@ template void alm2leg( // associated Legendre transform return; } - auto norm_l = (mode==ALM2MAP_DERIV1) ? Ylmgen::get_d1norm (lmax) : - Ylmgen::get_norm (lmax, spin); + auto norm_l = (mode==DERIV1) ? Ylmgen::get_d1norm (lmax) : + Ylmgen::get_norm (lmax, spin); auto rdata = make_ringdata(theta, lmax, spin); YlmBase base(lmax, mmax, spin); @@ -1875,7 +1994,8 @@ template void leg2alm( // associated Legendre transform const cmav &mstart, // (nm) ptrdiff_t lstride, const cmav &theta, // (nrings) - size_t nthreads) + size_t nthreads, + SHT_mode mode) { // sanity checks auto nrings=theta.shape(0); @@ -1884,9 +2004,21 @@ template void leg2alm( // associated Legendre transform MR_assert(nm==mstart.shape(0), "nm mismatch"); MR_assert(nm==leg.shape(2), "nm mismatch"); auto mmax = get_mmax(mval, lmax); - size_t ncomp = (spin==0) ? 
1 : 2; - MR_assert(alm.shape(0)==ncomp, "incorrect number of a_lm components"); - MR_assert(leg.shape(0)==ncomp, "incorrect number of Legendre components"); + auto nalm = alm.shape(0); + if (mode==DERIV1) + MR_fail("DERIV1 mode not supported in map->alm direction"); + else if (mode==GRAD_ONLY) + { + MR_assert(spin>0, "spin must be positive for grad-only SHTs"); + MR_assert(nalm==1, "need one a_lm component"); + MR_assert(leg.shape(0)==2, "need two Legendre components"); + } + else + { + size_t ncomp = (spin==0) ? 1 : 2; + MR_assert(nalm==ncomp, "incorrect number of a_lm components"); + MR_assert(leg.shape(0)==ncomp, "incorrect number of Legendre components"); + } bool npi, spi; size_t ntheta_tmp; @@ -1897,7 +2029,7 @@ template void leg2alm( // associated Legendre transform theta_tmp(i) = i*pi/(ntheta_tmp-1); auto leg_tmp(vmav,3>::build_noncritical({leg.shape(0), ntheta_tmp, leg.shape(2)}, UNINITIALIZED)); resample_theta(leg, npi, spi, leg_tmp, true, true, spin, nthreads, true); - leg2alm(alm, leg_tmp, spin, lmax, mval, mstart, lstride, theta_tmp, nthreads); + leg2alm(alm, leg_tmp, spin, lmax, mval, mstart, lstride, theta_tmp, nthreads, mode); return; } @@ -1908,22 +2040,22 @@ template void leg2alm( // associated Legendre transform ducc0::execDynamic(nm, nthreads, 1, [&](ducc0::Scheduler &sched) { Ylmgen gen(base); - vmav,2> almtmp({lmax+2,ncomp}, UNINITIALIZED); + vmav,2> almtmp({lmax+2,nalm}, UNINITIALIZED); while (auto rng=sched.getNext()) for(auto mi=rng.lo; mi(almtmp(l,ialm)*norm_l[l]); } }); /* end of parallel region */ @@ -2329,13 +2461,19 @@ void sanity_checks( (nphi.shape(0)==nrings) && (ringstart.shape(0)==nrings), "inconsistency in the number of rings"); - size_t ncomp = 1+(spin>0); - if (mode==ALM2MAP_DERIV1) + if ((mode==DERIV1) || (mode==GRAD_ONLY)) + { + MR_assert(spin>0, "DERIV and GRAD_ONLY modes require spin>0"); +cout << alm.shape(0) <<" " << map.shape(0)<< endl; MR_assert((alm.shape(0)==1) && (map.shape(0)==2), "inconsistent number of components"); + } else + { + size_t ncomp = 1+(spin>0); MR_assert((alm.shape(0)==ncomp) && (map.shape(0)==ncomp), "inconsistent number of components"); + } } template void synthesis( @@ -2462,9 +2600,10 @@ template void adjoint_synthesis( const cmav &phi0, // (nrings) const cmav &ringstart, // (nrings) ptrdiff_t pixstride, - size_t nthreads) + size_t nthreads, + SHT_mode mode) { - sanity_checks(alm, lmax, mstart, map, theta, phi0, nphi, ringstart, spin, MAP2ALM); + sanity_checks(alm, lmax, mstart, map, theta, phi0, nphi, ringstart, spin, mode); vmav mval({mstart.shape(0)}, UNINITIALIZED); for (size_t i=0; i void adjoint_synthesis( auto lego(subarray<3>(leg, {{},{0,ntheta_tmp},{}})); map2leg(map, legi, nphi, phi0, ringstart, pixstride, nthreads); resample_theta(legi, npi, spi, lego, true, true, spin, nthreads, true); - leg2alm(alm, lego, spin, lmax, mval, mstart, lstride, theta_tmp, nthreads); + leg2alm(alm, lego, spin, lmax, mval, mstart, lstride, theta_tmp, nthreads,mode); } else { - auto leg(vmav,3>::build_noncritical({alm.shape(0),theta.shape(0),mstart.shape(0)}, UNINITIALIZED)); + auto leg(vmav,3>::build_noncritical({map.shape(0),theta.shape(0),mstart.shape(0)}, UNINITIALIZED)); map2leg(map, leg, nphi, phi0, ringstart, pixstride, nthreads); - leg2alm(alm, leg, spin, lmax, mval, mstart, lstride, theta, nthreads); + leg2alm(alm, leg, spin, lmax, mval, mstart, lstride, theta, nthreads, mode); } } template tuple pseudo_analysis( @@ -2509,12 +2648,12 @@ template tuple pseudo_analysis( auto op = [&](const cmav,2> &xalm, vmav &xmap) { 
synthesis(xalm, xmap, spin, lmax, mstart, lstride, theta, nphi, phi0, - ringstart, pixstride, nthreads, ALM2MAP); + ringstart, pixstride, nthreads, STANDARD); }; auto op_adj = [&](const cmav &xmap, vmav,2> &xalm) { adjoint_synthesis(xalm, xmap, spin, lmax, mstart, lstride, theta, nphi, - phi0, ringstart, pixstride, nthreads); + phi0, ringstart, pixstride, nthreads, STANDARD); }; auto mapnorm = [&](const cmav &xmap) { @@ -2585,7 +2724,7 @@ template tuple pseudo_analysis( template void adjoint_synthesis_2d(vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, - const string &geometry, size_t nthreads) + const string &geometry, size_t nthreads, SHT_mode mode) { auto nphi = cmav::build_uniform({map.shape(1)}, map.shape(2)); auto phi0 = cmav::build_uniform({map.shape(1)}, 0.); @@ -2604,14 +2743,14 @@ template void adjoint_synthesis_2d(vmav,2> &alm, {map.stride(0), 1}); vmav theta({map.shape(1)}, UNINITIALIZED); get_ringtheta_2d(geometry, theta); - adjoint_synthesis(alm, map2, spin, lmax, mstart, 1, theta, nphi, phi0, ringstart, pixstride, nthreads); + adjoint_synthesis(alm, map2, spin, lmax, mstart, 1, theta, nphi, phi0, ringstart, pixstride, nthreads, mode); } template void adjoint_synthesis_2d(vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, - const string &geometry, size_t nthreads); + const string &geometry, size_t nthreads, SHT_mode mode); template void adjoint_synthesis_2d(vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, - const string &geometry, size_t nthreads); + const string &geometry, size_t nthreads, SHT_mode mode); template void analysis_2d( vmav,2> &alm, // (ncomp, *) @@ -2642,7 +2781,7 @@ template void analysis_2d( mval(i) = i; vmav theta({nphi.shape(0)}, UNINITIALIZED); get_ringtheta_2d(geometry, theta); - sanity_checks(alm, lmax, mstart, map, theta, phi0, nphi, ringstart, spin, MAP2ALM); + sanity_checks(alm, lmax, mstart, map, theta, phi0, nphi, ringstart, spin, STANDARD); if ((geometry=="CC")||(geometry=="F1")||(geometry=="MW")||(geometry=="MWflip")) { bool npi, spi; @@ -2672,7 +2811,7 @@ template void analysis_2d( vmav newtheta({ntheta_leg}, UNINITIALIZED); for (size_t i=0; i void analysis_2d( for (size_t k=0; k void adjoint_analysis_2d( mval(i) = i; vmav theta({nphi.shape(0)}, UNINITIALIZED); get_ringtheta_2d(geometry, theta); - sanity_checks(alm, lmax, mstart, map, theta, phi0, nphi, ringstart, spin, MAP2ALM); + sanity_checks(alm, lmax, mstart, map, theta, phi0, nphi, ringstart, spin, STANDARD); if ((geometry=="CC")||(geometry=="F1")||(geometry=="MW")||(geometry=="MWflip")) { bool npo, spo; @@ -2770,7 +2909,7 @@ template void adjoint_analysis_2d( vmav theta_tmp({ntheta_leg}, UNINITIALIZED); for (size_t i=0; i void adjoint_analysis_2d( { auto wgt = get_gridweights(geometry, theta.shape(0)); auto leg(vmav,3>::build_noncritical({map.shape(0), theta.shape(0), mstart.shape(0)}, UNINITIALIZED)); - alm2leg(alm, leg, spin, lmax, mval, mstart, lstride, theta, nthreads); + alm2leg(alm, leg, spin, lmax, mval, mstart, lstride, theta, nthreads, STANDARD); for (size_t i=0; i,2> &alm, vmav void synthesis_general( const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, size_t mmax, const cmav &loc, - double epsilon, double sigma_min, double sigma_max, size_t nthreads) + double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode) { MR_assert(loc.shape(1)==2, "last dimension of loc must have size 2"); - size_t ncomp = (spin==0) ? 
1 : 2; - MR_assert(alm.shape(0)==ncomp, "number of components mismatch in alm"); + size_t nalm = (spin==0) ? 1 : ((mode==STANDARD) ? 2 : 1); + size_t nmaps = (spin==0) ? 1 : 2; + MR_assert(alm.shape(0)==nalm, "number of components mismatch in alm"); SphereInterpol inter(lmax, mmax, spin, loc.shape(0), sigma_min, sigma_max, epsilon, nthreads); - auto planes_ = vmav,3>::build_noncritical({ncomp, inter.Ntheta(), (inter.Nphi()+1)/2}, UNINITIALIZED); + auto planes_ = vmav,3>::build_noncritical({nmaps, inter.Ntheta(), (inter.Nphi()+1)/2}, UNINITIALIZED); vmav planes(reinterpret_cast(planes_.data()), - {ncomp, inter.Ntheta(), inter.Nphi()}, + {nmaps, inter.Ntheta(), inter.Nphi()}, {2*planes_.stride(0), 2*planes_.stride(1), 1}); - inter.getPlane(alm, planes); + inter.getPlane(alm, planes, mode); auto xtheta = subarray<1>(loc, {{},{0}}); auto xphi = subarray<1>(loc, {{},{1}}); inter.interpol(planes, 0, 0, xtheta, xphi, map); @@ -2849,41 +2989,41 @@ template void synthesis_general( template void synthesis_general( const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, size_t mmax, const cmav &loc, - double epsilon, double sigma_min, double sigma_max, size_t nthreads); + double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode); template void synthesis_general( const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, size_t mmax, const cmav &loc, - double epsilon, double sigma_min, double sigma_max, size_t nthreads); + double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode); template void adjoint_synthesis_general( vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, const cmav &loc, - double epsilon, double sigma_min, double sigma_max, size_t nthreads) + double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode) { MR_assert(loc.shape(1)==2, "last dimension of loc must have size 2"); - size_t ncomp = (spin==0) ? 1 : 2; - MR_assert(map.shape(0)==ncomp, "number of components mismatch in map"); + size_t nmaps = (spin==0) ? 
1 : 2; + MR_assert(map.shape(0)==nmaps, "number of components mismatch in map"); SphereInterpol inter(lmax, mmax, spin, loc.shape(0), sigma_min, sigma_max, epsilon, nthreads); - auto planes_ = vmav,3>::build_noncritical({ncomp, inter.Ntheta(), (inter.Nphi()+1)/2}, UNINITIALIZED); + auto planes_ = vmav,3>::build_noncritical({nmaps, inter.Ntheta(), (inter.Nphi()+1)/2}, UNINITIALIZED); vmav planes(reinterpret_cast(planes_.data()), - {ncomp, inter.Ntheta(), inter.Nphi()}, + {nmaps, inter.Ntheta(), inter.Nphi()}, {2*planes_.stride(0), 2*planes_.stride(1), 1}); mav_apply([](auto &v){v=0;}, nthreads, planes); auto xtheta = subarray<1>(loc, {{},{0}}); auto xphi = subarray<1>(loc, {{},{1}}); inter.deinterpol(planes, 0, 0, xtheta, xphi, map); - inter.updateAlm(alm, planes); + inter.updateAlm(alm, planes, mode); } template void adjoint_synthesis_general( vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, const cmav &loc, - double epsilon, double sigma_min, double sigma_max, size_t nthreads); + double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode); template void adjoint_synthesis_general( vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, const cmav &loc, - double epsilon, double sigma_min, double sigma_max, size_t nthreads); + double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode); template tuple pseudo_analysis_general( vmav,2> &alm, // (ncomp, *) @@ -2907,12 +3047,12 @@ template tuple pseudo_analysis_gener auto op = [&](const cmav,2> &xalm, vmav &xmap) { synthesis_general(xalm, xmap, spin, lmax, mmax, loc, 1e-1*epsilon, - sigma_min, sigma_max, nthreads); + sigma_min, sigma_max, nthreads, STANDARD); }; auto op_adj = [&](const cmav &xmap, vmav,2> &xalm) { adjoint_synthesis_general(xalm, xmap, spin, lmax, mmax, loc, 1e-1*epsilon, - sigma_min, sigma_max, nthreads); + sigma_min, sigma_max, nthreads, STANDARD); }; auto mapnorm = [&](const cmav &xmap) { diff --git a/src/ducc0/sht/sht.h b/src/ducc0/sht/sht.h index a1d89a2a..3f7d06a2 100644 --- a/src/ducc0/sht/sht.h +++ b/src/ducc0/sht/sht.h @@ -37,10 +37,7 @@ namespace detail_sht { using namespace std; -enum SHT_mode { MAP2ALM, - ALM2MAP, - ALM2MAP_DERIV1 - }; +enum SHT_mode { STANDARD, GRAD_ONLY, DERIV1 }; void get_gridweights(const string &type, vmav &wgt); vmav get_gridweights(const string &type, size_t nrings); @@ -55,7 +52,7 @@ template void alm2leg( // associated Legendre transform ptrdiff_t lstride, const cmav &theta, // (nrings) size_t nthreads, - SHT_mode mode=ALM2MAP); + SHT_mode mode); template void leg2alm( // associated Legendre transform vmav,2> &alm, // (ncomp, lmidx) const cmav,3> &leg, // (ncomp, nrings, nm) @@ -65,7 +62,8 @@ template void leg2alm( // associated Legendre transform const cmav &mstart, // (nm) ptrdiff_t lstride, const cmav &theta, // (nrings) - size_t nthreads); + size_t nthreads, + SHT_mode mode); template void map2leg( // FFT const cmav &map, // (ncomp, pix) @@ -97,7 +95,7 @@ template void synthesis( const cmav &ringstart, // (nrings) ptrdiff_t pixstride, size_t nthreads, - SHT_mode mode=ALM2MAP); + SHT_mode mode); template void adjoint_synthesis( vmav,2> &alm, // (ncomp, *) @@ -111,7 +109,8 @@ template void adjoint_synthesis( const cmav &phi0, // (nrings) const cmav &ringstart, // (nrings) ptrdiff_t pixstride, - size_t nthreads); + size_t nthreads, + SHT_mode mode); template tuple pseudo_analysis( vmav,2> &alm, // (ncomp, *) @@ -127,14 +126,14 @@ template tuple pseudo_analysis( ptrdiff_t pixstride, size_t nthreads, size_t 
maxiter, - double epsilom); + double epsilon); template void synthesis_2d(const cmav,2> &alm, vmav &map, - size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads, SHT_mode mode=ALM2MAP); + size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads, SHT_mode mode); template void adjoint_synthesis_2d(vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, - const string &geometry, size_t nthreads); + const string &geometry, size_t nthreads, SHT_mode mode); template void analysis_2d(vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, @@ -147,12 +146,12 @@ template void adjoint_analysis_2d(const cmav,2> &alm, template void synthesis_general( const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, size_t mmax, const cmav &loc, - double epsilon, double sigma_min, double sigma_max, size_t nthreads); + double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode); template void adjoint_synthesis_general( vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, const cmav &loc, - double epsilon, double sigma_min, double sigma_max, size_t nthreads); + double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode); template tuple pseudo_analysis_general( vmav,2> &alm, // (ncomp, *) @@ -168,9 +167,9 @@ template tuple pseudo_analysis_gener } using detail_sht::SHT_mode; -using detail_sht::ALM2MAP; -using detail_sht::ALM2MAP_DERIV1; -using detail_sht::MAP2ALM; +using detail_sht::STANDARD; +using detail_sht::GRAD_ONLY; +using detail_sht::DERIV1; using detail_sht::get_gridweights; using detail_sht::alm2leg; using detail_sht::leg2alm; diff --git a/src/ducc0/sht/sphere_interpol.h b/src/ducc0/sht/sphere_interpol.h index 1a1cc4c2..af6c2a51 100644 --- a/src/ducc0/sht/sphere_interpol.h +++ b/src/ducc0/sht/sphere_interpol.h @@ -643,11 +643,9 @@ template class SphereInterpol return res; } - void getPlane(const cmav,2> &valm, vmav &planes) const + void getPlane(const cmav,2> &valm, vmav &planes, SHT_mode mode) const { size_t nplanes=1+(spin>0); - auto ncomp = valm.shape(0); - MR_assert(ncomp==nplanes, "number of components mismatch"); Alm_Base ialm(lmax, mmax); MR_assert(ialm.Num_Alms()==valm.shape(1), "bad array dimension"); MR_assert(planes.conformable({nplanes, Ntheta(), Nphi()}), "bad planes shape"); @@ -659,7 +657,7 @@ template class SphereInterpol MR_assert((planes.stride(0)&1)==0, "stride must be even"); MR_assert(2*(mmax+1)<=nphi_b, "aargh"); vmav,3> leg(reinterpret_cast *>(&planes(0,nbtheta,nbphi-1)), - {ncomp, ntheta_s, mmax+1}, {subplanes.stride(0)/2, subplanes.stride(1)/2, 1}); + {nplanes, ntheta_s, mmax+1}, {subplanes.stride(0)/2, subplanes.stride(1)/2, 1}); vmav theta({ntheta_s}, UNINITIALIZED); for (size_t i=0; i class SphereInterpol mstart(i) = ofs-i; ofs += lmax+1-i; } - alm2leg(valm, leg, spin, lmax, mval, mstart, 1, theta, nthreads); + alm2leg(valm, leg, spin, lmax, mval, mstart, 1, theta, nthreads, mode); // make halfcomplex for (size_t iplane=0; iplane class SphereInterpol deinterpolx(kernel->support(), cube, itheta0, iphi0, theta, phi, signal); } - void updateAlm(vmav,2> &valm, vmav &planes) const + void updateAlm(vmav,2> &valm, vmav &planes, SHT_mode mode) const { size_t nplanes=1+(spin>0); - auto ncomp = valm.shape(0); - MR_assert(ncomp>0, "need at least one component"); Alm_Base ialm(lmax, mmax); MR_assert(ialm.Num_Alms()==valm.shape(1), "bad array dimension"); MR_assert(planes.conformable({nplanes, Ntheta(), Nphi()}), "bad planes shape"); @@ -785,7 +781,7 @@ 
template class SphereInterpol MR_assert((planes.stride(0)&1)==0, "stride must be even"); MR_assert(2*(mmax+1)<=nphi_b, "aargh"); vmav,3> leg(reinterpret_cast *>(&planes(0,nbtheta,nbphi-1)), - {ncomp, ntheta_s, mmax+1}, {subplanes.stride(0)/2, subplanes.stride(1)/2, 1}); + {nplanes, ntheta_s, mmax+1}, {subplanes.stride(0)/2, subplanes.stride(1)/2, 1}); vmav theta({ntheta_s}, UNINITIALIZED); for (size_t i=0; i class SphereInterpol planes(iplane, nbtheta+itheta, nbphi-1) = planes(iplane, nbtheta+itheta, nbphi); planes(iplane, nbtheta+itheta, nbphi) = T(0); } - leg2alm(valm, leg, spin, lmax, mval, mstart, 1, theta, nthreads); + leg2alm(valm, leg, spin, lmax, mval, mstart, 1, theta, nthreads, mode); } - void updateAlm(vmav,1> &alm, vmav &planes) const + void updateAlm(vmav,1> &alm, vmav &planes, SHT_mode mode) const { vmav,2> valm(alm.data(), {1,alm.shape(0)}, {0,alm.stride(0)}); - updateAlm(valm, planes); + updateAlm(valm, planes, mode); } }; diff --git a/src/ducc0/sht/totalconvolve.h b/src/ducc0/sht/totalconvolve.h index 58b9e4f0..d240c9c6 100644 --- a/src/ducc0/sht/totalconvolve.h +++ b/src/ducc0/sht/totalconvolve.h @@ -543,7 +543,7 @@ template class ConvolverPlan } } auto subplanes=subarray<3>(planes,{{0, nplanes}, {nbtheta, nbtheta+ntheta_s}, {nbphi, nbphi+nphi_s}}); - synthesis_2d(aarr, subplanes, mbeam, lmax, lmax, "CC", nthreads); + synthesis_2d(aarr, subplanes, mbeam, lmax, lmax, "CC", nthreads, STANDARD); for (size_t iplane=0; iplane(planes, {{iplane},{nbtheta, nbtheta+ntheta_b}, {nbphi, nbphi+nphi_b}}); @@ -659,7 +659,7 @@ template class ConvolverPlan auto subplanes=subarray<3>(planes, {{0, nplanes}, {nbtheta, nbtheta+ntheta_s}, {nbphi,nbphi+nphi_s}}); vmav,2> aarr({nplanes, base.Num_Alms()}, UNINITIALIZED); - adjoint_synthesis_2d(aarr, subplanes, mbeam, lmax, lmax, "CC", nthreads); + adjoint_synthesis_2d(aarr, subplanes, mbeam, lmax, lmax, "CC", nthreads, STANDARD); for (size_t m=0; m<=lmax; ++m) for (size_t l=m; l<=lmax; ++l) if (l>=mbeam)
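The following is a minimal standalone sketch, not part of the patch, restating the component-count convention implied by the new SHT_mode values (STANDARD, GRAD_ONLY, DERIV1) as enforced in sanity_checks() above: STANDARD keeps the previous behaviour (one component for spin 0, two for spin>0 on both the a_lm and map sides), while GRAD_ONLY and DERIV1 require spin>0 and pair a single a_lm component with two map/Legendre components. The helper names below are invented for illustration only.

#include <cassert>
#include <cstddef>

enum SHT_mode { STANDARD, GRAD_ONLY, DERIV1 };

// a_lm components expected on the harmonic side of a transform
size_t expected_nalm(size_t spin, SHT_mode mode)
  { return (mode==STANDARD) ? 1+(spin>0) : 1; }

// map (or Legendre) components expected on the spatial side
size_t expected_nmaps(size_t spin, SHT_mode mode)
  { return (mode==STANDARD) ? 1+(spin>0) : 2; }

int main()
  {
  // STANDARD: one component at spin 0, two (e.g. E/B and Q/U) at spin>0.
  assert(expected_nalm(0, STANDARD)==1 && expected_nmaps(0, STANDARD)==1);
  assert(expected_nalm(2, STANDARD)==2 && expected_nmaps(2, STANDARD)==2);
  // GRAD_ONLY and DERIV1 require spin>0: one a_lm set, two output maps.
  assert(expected_nalm(2, GRAD_ONLY)==1 && expected_nmaps(2, GRAD_ONLY)==2);
  assert(expected_nalm(1, DERIV1)==1 && expected_nmaps(1, DERIV1)==2);
  return 0;
  }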