diff --git a/.gitignore b/.gitignore
index e2a2e99043c..9640e0f33a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,8 @@ config/sample-runs-build.log
 doc/CodeDocumentation.conf
 doc/CodeDocumentation.html
 doc/CodeDocumentation
+doc/undoc.log
+doc/warnings.log
 
 # Temporary files created by the tests.
 *.stderr
diff --git a/.travis.yml b/.travis.yml
index b678d918093..4adc20c5514 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,8 +11,6 @@
 
 language: cpp
 
-sudo: false
-
 stages:
   - checks
   - tests
@@ -370,8 +368,10 @@ script:
    # Compiler
    - if [ $MPI == "YES" ]; then
         export MYCXX=mpic++;
+        export MAKE_CXX_FLAG=MPICXX=$MYCXX;
      else
         export MYCXX="$CXX";
+        export MAKE_CXX_FLAG=CXX=$MYCXX;
      fi
 
    # Print the compiler version
@@ -384,12 +384,9 @@ script:
      if [ "$CODECOV" == "YES" ]; then
         CPPFLAGS="--coverage -g";
      fi;
-     if [ "$CXX" == "clang++" ]; then
-        export MFEM_PERF_SW=clang;
-     fi
 
    # Configure the library
-   - make config MFEM_USE_MPI=$MPI MFEM_DEBUG=$DEBUG MFEM_CXX="$MYCXX"
+   - make config MFEM_USE_MPI=$MPI MFEM_DEBUG=$DEBUG $MAKE_CXX_FLAG
         MFEM_MPI_NP=$NPROCS CPPFLAGS="$CPPFLAGS"
    # Show the configuration
    - make info
diff --git a/CHANGELOG b/CHANGELOG
index 907f91922a9..8d5d626105a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -23,6 +23,27 @@ Meshing improvements
   Hessian for r-adaptivity using discrete fields, and allows use of skewness
   and orientation based metrics.
 
+- Added support for r-adaptivity with more than one discrete field. This allows
+  the user to specify different discrete functions for controlling the
+  size, aspect-ratio, orientation, and skew of elements in the mesh.
+
+- Added TMOP capability for approximate tangential mesh relaxation.
+
+- Added support for reading periodic meshes in Gmsh format (version 2.2). See
+  for example the periodic-annulus-sector and periodic-torus-sector files in
+  the data directory.
+
+Performance improvements
+------------------------
+- Added support for explicit vectorization in the high-performance templated
+  code, which can now take advantage of specific intrinsics classes on the
+  following architectures:
+    - x86 (SSE/AVX/AVX2/AVX512),
+    - Power8 & Power9 (VSX),
+    - BG/Q (QPX).
+  These are now enabled by default, and can be disabled with MFEM_USE_SIMD=NO.
+  See the new file linalg/simd.hpp and the new directory linalg/simd.
+
 Improved GPU capabilities
 -------------------------
 - Added support for Chebyshev accelerated polynomial smoother on GPU.
@@ -35,6 +56,24 @@ Discretization improvements
 
 - Added support for simplices in GSLIB-FindPoints.
 
+- Added support for H1 and L2 element matrix assembly in the mass, convection,
+  diffusion, transpose, and the face DG trace integrators. This is compatible
+  with GPU device execution and is illustrated in Example 9/9p, see the option
+  '-ea'. When enabled, this level of assembly stores independent dense matrices
+  for the elements, and independent dense matrices for the faces in the DG case.
+
+- Added new partial assembly kernels for H(div) bilinear forms, as well as
+  VectorFEDivergenceIntegrator.
+
+- Improved the documentation of the GridFunction GetValue and GetVectorValue
+  methods. Expanded the GetValue and GetVectorValue methods which accept an
+  ElementTransformation argument to support evaluation on boundary elements
+  and, in the continuous field case, arbitrary mesh edges and faces.
+
+- Added new coefficient and vector coefficient classes for QuadratureFunctions.
+  Additionally, new LinearForm integrators were also added which make use of
+  these new QuadratureFunction coefficient classes.
+
 Linear and nonlinear solvers
 ----------------------------
 - Added power method to iteratively estimate the largest eigenvalue and the
@@ -43,6 +82,17 @@ Linear and nonlinear solvers
 - Added initial support for h- and p-multigrid solvers and preconditioners for
   matrix-based and matrix-free discretizations with basic GPU capability.
 
+- Added a new IterativeSolverMonitor class that allows the user to monitor the
+  residual and solution during the solving process of an IterativeSolver after
+  every iteration.
+
+- Block arrays of parallel matrices can now be merged into a single parallel
+  matrix with the function HypreParMatrixFromBlocks. This could be useful for
+  solving block systems with parallel direct solvers such as STRUMPACK.
+
+- In SLISolver, changed the residual inner product from (Br,r) to (Br,Br) so the
+  solver can work with a non-SPD preconditioner B.
+
 New and updated examples and miniapps
 -------------------------------------
 - Added a new example, Example 25/25p, to demonstrate the use of a Perfectly
@@ -65,6 +115,12 @@ New and updated examples and miniapps
 - Added a new meshing miniapp, Minimal Surface, which solves Plateau's problem:
   the Dirichlet problem for the minimal surface equation.
 
+- Added partial assembly support to examples 4/4p and 5/5p, with diagonal
+  preconditioning.
+
+- Added a new test problem in example 24/24p, demonstrating a mixed bilinear
+  form for H(div) and L_2, with partial assembly support.
+
 Improved testing
 ----------------
 - Added a GitLab pipeline that automates PR testing on supercomputing systems
@@ -74,15 +130,17 @@ Improved testing
 
 Miscellaneous
 -------------
-- In SLISolver, changed the residual inner product from (Br,r) to (Br,Br) so the
-  solver can work with non-SPD preconditioner B.
-
 - Added support for ADIOS2 for parallel I/O with ParaView visualization. The
   classes adios2stream and ADIOS2DataCollection are introduced in mfem as the
   interfaces to generate ADIOS2 Binary Pack (BP4) directory datasets for the
   entire spatial and temporal data. In addition, ADIOS2 allows for setting a
   user-defined number of data substreams/subfiles. See examples 5, 9, 12, 16.
 
+- The integration order used in the ComputeLpError and ComputeElementLpError
+  methods of class GridFunction has been increased.
+
+- Various other simplifications, extensions, and bugfixes in the code.
+
 
 Version 4.1, released on March 10, 2020
 =======================================
diff --git a/INSTALL b/INSTALL
index ddc911225b8..46a714f7c3e 100644
--- a/INSTALL
+++ b/INSTALL
@@ -396,6 +396,12 @@ MFEM_USE_SIDRE = YES/NO
    blueprint specification. When enabled, this option requires installation of
    HDF5 (see also MFEM_USE_NETCDF), Conduit and LLNL's axom project.
 
+MFEM_USE_SIMD = YES/NO
+   Enables the high performance templated classes to use architecture dependent
+   SIMD intrinsics instead of the generic implementation of class AutoSIMD in
+   linalg/simd/auto.hpp. This option should be combined with suitable
+   compiler options, such as -march=native, to enable optimal vectorization.
+
 MFEM_USE_CONDUIT = YES/NO
    Enables support for converting MFEM Mesh and Grid Function objects to and
    from Conduit Mesh Blueprint Descriptions (https://github.com/LLNL/conduit/)
@@ -426,6 +432,8 @@ MFEM_USE_PUMI = YES/NO
    data management system that is capable of handling general non-manifold
    models and effectively supports automated adaptive analysis. PUMI enables
    support for parallel unstructured mesh modifications in MFEM.
+   The develop branch of the PUMI repository (https://github.com/SCOREC/core)
+   should be used for the most up-to-date features.
 
 MFEM_USE_UMPIRE = YES/NO
    Enables support for Umpire, a resource management library that allows the
@@ -609,8 +617,9 @@ The specific libraries and their options are:
 
 - PUMI (optional), used when MFEM_USE_PUMI = YES.
   URL: https://scorec.rpi.edu/pumi
+       https://github.com/SCOREC/core
   Options: PUMI_OPT, PUMI_LIB.
-  Versions: PUMI >= 2.2.0.
+  Versions: PUMI >= 2.2.3.
 
 - HiOp (optional), used when MFEM_USE_HIOP = YES.
   URL: https://github.com/LLNL/hiop
diff --git a/config/cmake/MFEMConfig.cmake.in b/config/cmake/MFEMConfig.cmake.in
index 896e1c35105..7e98372647a 100644
--- a/config/cmake/MFEMConfig.cmake.in
+++ b/config/cmake/MFEMConfig.cmake.in
@@ -47,6 +47,7 @@ set(MFEM_USE_OCCA @MFEM_USE_OCCA@)
 set(MFEM_USE_RAJA @MFEM_USE_RAJA@)
 set(MFEM_USE_CEED @MFEM_USE_CEED@)
 set(MFEM_USE_UMPIRE @MFEM_USE_UMPIRE@)
+set(MFEM_USE_SIMD @MFEM_USE_SIMD@)
 set(MFEM_USE_ADIOS2 @MFEM_USE_ADIOS2@)
 
 set(MFEM_CXX_COMPILER "@CMAKE_CXX_COMPILER@")
diff --git a/config/cmake/config.hpp.in b/config/cmake/config.hpp.in
index 26327ae59b4..4bcbf59d885 100644
--- a/config/cmake/config.hpp.in
+++ b/config/cmake/config.hpp.in
@@ -107,6 +107,9 @@
 // Enable MFEM functionality based on the Sidre library
 #cmakedefine MFEM_USE_SIDRE
 
+// Enable the use of SIMD in the high performance templated classes
+#cmakedefine MFEM_USE_SIMD
+
 // Enable MFEM functionality based on Conduit
 #cmakedefine MFEM_USE_CONDUIT
 
diff --git a/config/cmake/modules/MfemCmakeUtilities.cmake b/config/cmake/modules/MfemCmakeUtilities.cmake
index 6f3428933c1..850479680fb 100644
--- a/config/cmake/modules/MfemCmakeUtilities.cmake
+++ b/config/cmake/modules/MfemCmakeUtilities.cmake
@@ -733,7 +733,7 @@ function(mfem_export_mk_files)
       MFEM_USE_SUPERLU MFEM_USE_STRUMPACK MFEM_USE_GNUTLS
       MFEM_USE_GSLIB MFEM_USE_NETCDF MFEM_USE_PETSC MFEM_USE_MPFR MFEM_USE_SIDRE
       MFEM_USE_CONDUIT MFEM_USE_PUMI MFEM_USE_CUDA MFEM_USE_OCCA MFEM_USE_RAJA
-      MFEM_USE_UMPIRE)
+      MFEM_USE_UMPIRE MFEM_USE_SIMD MFEM_USE_ADIOS2)
   foreach(var ${CONFIG_MK_BOOL_VARS})
     if (${var})
       set(${var} YES)
@@ -743,6 +743,7 @@ function(mfem_export_mk_files)
   endforeach()
   # TODO: Add support for MFEM_USE_CUDA=YES
   set(MFEM_CXX ${CMAKE_CXX_COMPILER})
+  set(MFEM_HOST_CXX ${MFEM_CXX})
   set(MFEM_CPPFLAGS "")
   string(STRIP "${CMAKE_CXX_FLAGS_${BUILD_TYPE}} ${CMAKE_CXX_FLAGS}"
          MFEM_CXXFLAGS)
diff --git a/config/config.hpp.in b/config/config.hpp.in
index d71cfcfbe0b..56a8cee7da2 100644
--- a/config/config.hpp.in
+++ b/config/config.hpp.in
@@ -106,6 +106,9 @@
 // Enable Sidre support
 // #define MFEM_USE_SIDRE
 
+// Enable the use of SIMD in the high performance templated classes
+// #define MFEM_USE_SIMD
+
 // Enable Conduit support
 // #define MFEM_USE_CONDUIT
 
diff --git a/config/config.mk.in b/config/config.mk.in
index 4d7c34affcc..99d0c822b8e 100644
--- a/config/config.mk.in
+++ b/config/config.mk.in
@@ -49,10 +49,12 @@ MFEM_USE_RAJA          = @MFEM_USE_RAJA@
 MFEM_USE_OCCA          = @MFEM_USE_OCCA@
 MFEM_USE_CEED          = @MFEM_USE_CEED@
 MFEM_USE_UMPIRE        = @MFEM_USE_UMPIRE@
+MFEM_USE_SIMD          = @MFEM_USE_SIMD@
 MFEM_USE_ADIOS2        = @MFEM_USE_ADIOS2@
 
 # Compiler, compile options, and link options
 MFEM_CXX       = @MFEM_CXX@
+MFEM_HOST_CXX  = @MFEM_HOST_CXX@
 MFEM_CPPFLAGS  = @MFEM_CPPFLAGS@
 MFEM_CXXFLAGS  = @MFEM_CXXFLAGS@
 MFEM_TPLFLAGS  = @MFEM_TPLFLAGS@
diff --git a/config/defaults.cmake b/config/defaults.cmake
index 5854e6916e2..5703860e106 100644
--- a/config/defaults.cmake
+++ b/config/defaults.cmake
@@ -49,6 +49,7 @@ option(MFEM_USE_OCCA "Enable OCCA" OFF)
 option(MFEM_USE_RAJA "Enable RAJA" OFF)
 option(MFEM_USE_CEED "Enable CEED" OFF)
 option(MFEM_USE_UMPIRE "Enable Umpire" OFF)
+option(MFEM_USE_SIMD "Enable use of SIMD intrinsics" ON)
 option(MFEM_USE_ADIOS2 "Enable ADIOS2" OFF)
 
 set(MFEM_MPI_NP 4 CACHE STRING "Number of processes used for MPI tests")
diff --git a/config/defaults.mk b/config/defaults.mk
index 490fa4f0e30..ed5ab20f563 100644
--- a/config/defaults.mk
+++ b/config/defaults.mk
@@ -137,6 +137,7 @@ MFEM_USE_RAJA          = NO
 MFEM_USE_OCCA          = NO
 MFEM_USE_CEED          = NO
 MFEM_USE_UMPIRE        = NO
+MFEM_USE_SIMD          = YES
 MFEM_USE_ADIOS2        = NO
 
 # Compile and link options for zlib.
diff --git a/config/tconfig.hpp b/config/tconfig.hpp
index 59cc77ecd98..a10ae8f4b68 100644
--- a/config/tconfig.hpp
+++ b/config/tconfig.hpp
@@ -29,8 +29,20 @@
 #define MFEM_ALWAYS_INLINE
 #endif
 
+// --- MFEM_VECTORIZE_LOOP (disabled)
+#if (__cplusplus >= 201103L) && !defined(MFEM_DEBUG) && defined(__GNUC__)
+//#define MFEM_VECTORIZE_LOOP _Pragma("GCC ivdep")
+#define MFEM_VECTORIZE_LOOP
+#else
+#define MFEM_VECTORIZE_LOOP
+#endif
+
+// MFEM_TEMPLATE_BLOCK_SIZE is the block size used by the template matrix-matrix
+// multiply, Mult_AB, defined in tmatrix.hpp. This parameter will generally
+// require tuning to determine a good value. It is probably highly influenced by
+// the SIMD width when Mult_AB is used with a SIMD type like AutoSIMD.
 #define MFEM_TEMPLATE_BLOCK_SIZE 4
-#define MFEM_SIMD_SIZE 32
+
 #define MFEM_TEMPLATE_ENABLE_SERIALIZE
 
 // #define MFEM_TEMPLATE_ELTRANS_HAS_NODE_DOFS
@@ -38,11 +50,6 @@
 // #define MFEM_TEMPLATE_FIELD_EVAL_DATA_HAS_DOFS
 #define MFEM_TEMPLATE_INTRULE_COEFF_PRECOMP
 
-// derived macros
-#define MFEM_ROUNDUP(val,base) ((((val)+(base)-1)/(base))*(base))
-#define MFEM_ALIGN_SIZE(size,type) \
-   MFEM_ROUNDUP(size,(MFEM_SIMD_SIZE)/sizeof(type))
-
 #ifdef MFEM_COUNT_FLOPS
 namespace mfem
 {
diff --git a/data/periodic-annulus-sector.geo b/data/periodic-annulus-sector.geo
new file mode 100644
index 00000000000..655b41bddf3
--- /dev/null
+++ b/data/periodic-annulus-sector.geo
@@ -0,0 +1,37 @@
+SetFactory("OpenCASCADE");
+
+R1 = 1.0;
+R2 = 2.0;
+
+Point(1) = {0.0, 0, 0, 1.0};
+Point(2) = {R1, 0, 0, 1.0};
+Point(3) = {R2, 0, 0, 1.0};
+Point(4) = {R1*Cos(Pi/3), R1*Sin(Pi/3), 0, 1.0};
+Point(5) = {R2*Cos(Pi/3), R2*Sin(Pi/3), 0, 1.0};
+Line(1) = {2, 3};
+Line(2) = {4, 5};
+Circle(3) = {2, 1, 4};
+Circle(4) = {3, 1, 5};
+Curve Loop(5) = {1, 4, -2, -3};
+Plane Surface(1) = {5};
+
+Transfinite Curve{1} = 7;
+Transfinite Curve{2} = 7;
+Transfinite Curve{3} = 4;
+Transfinite Curve{4} = 10;
+
+// Set a rotation periodicity constraint:
+Periodic Line{1} = {2} Rotate{{0,0,1}, {0,0,0}, -Pi/3};
+
+// Tag curves and surfaces with positive integers
+Physical Curve(1) = {3};
+Physical Curve(2) = {4};
+Physical Curve(3) = {1};
+Physical Curve(4) = {2};
+Physical Surface(1) = {1};
+
+// Generate 2D mesh
+Mesh 2;
+Mesh.MshFileVersion = 2.2;
+
+Save "periodic-annulus-sector.msh";
diff --git a/data/periodic-annulus-sector.msh b/data/periodic-annulus-sector.msh
new file mode 100644
index 00000000000..8c70fef1f69
--- /dev/null
+++ b/data/periodic-annulus-sector.msh
@@ -0,0 +1,185 @@
+$MeshFormat
+2.2 0 8
+$EndMeshFormat
+$Nodes
+55
+1 1 0 0
+2 2 0 0
+3 0.5000000000000001 0.8660254037844386 0
+4 1 1.732050807568877 0
+5 1.166666666666667 0 0
+6 1.333333333333333 0 0
+7 1.5 0 0
+8 1.666666666666667 0 0
+9 1.833333333333333 0 0
+10 0.5833333333333335 1.010362971081845 0
+11 0.6666666666666667 1.154700538379251 0
+12 0.7500000000000002 1.299038105676658 0
+13 0.8333333333333335 1.443375672974064 0
+14 0.9166666666666669 1.587713240271471 0
+15 0.9396926207859085 0.3420201433256683 0
+16 0.7660444431189786 0.6427876096865386 0
+17 1.986476715483886 0.2321858282504602 0
+18 1.946089741159648 0.4612317414848793 0
+19 1.879385241571817 0.6840402866513365 0
+20 1.787265280646825 0.8975983604009234 0
+21 1.670975622825874 1.09901795614161 0
+22 1.532088886237958 1.285575219373077 0
+23 1.372483275737469 1.454747283146095 0
+24 1.194317183405575 1.604246385510085 0
+25 1.425989114816062 0.1915326920916892 0
+26 0.8788667344146573 1.13917645290495 0
+27 1.630372059110754 0.7154531062316609 0
+28 1.436395769298814 1.053728612482506 0
+29 1.081023776188756 0.6241293681829633 0
+30 1.168737372335971 1.428012728596308 0
+31 1.821063986059922 0.298149890497067 0
+32 1.234707097211386 0.3469796339295647 0
+33 1.377747393186519 0.6200150626754309 0
+34 1.457047681210906 0.3890895843559762 0
+35 0.917846726184522 0.8957978954532204 0
+36 1.218335619030348 0.9017812086952638 0
+37 1.066623110765233 1.061857005744772 0
+38 1.587029716281926 0.1355955181472859 0
+39 1.744445799211916 0.1441515753740107 0
+40 1.25 0.1443375672974065 0
+41 1.453660070628011 0.8435769396609902 0
+42 1.741367044061892 0.499612708014486 0
+43 1.30550638526547 1.257610469847477 0
+44 1.118213276932792 0.1666674689105279 0
+45 0.9109440214958271 1.306610291787315 0
+46 0.9970618258753989 1.438658589955562 0
+47 0.7499999999999998 1.010362971081845 0
+48 0.7034449005273667 0.8850673702175776 0
+49 1.605449512513618 0.9269067082200894 0
+50 1.561654019115059 0.5298592532912715 0
+51 1.229782222487711 1.096820457143683 0
+52 1.617066998712459 0.3090202662210922 0
+53 1.079645953234324 1.246963713711438 0
+54 1.877063966817811 0.1348974588243076 0
+55 1.055356609656722 1.558136350380461 0
+$EndNodes
+$Elements
+108
+1 1 2 3 1 1 5
+2 1 2 3 1 5 6
+3 1 2 3 1 6 7
+4 1 2 3 1 7 8
+5 1 2 3 1 8 9
+6 1 2 3 1 9 2
+7 1 2 4 2 3 10
+8 1 2 4 2 10 11
+9 1 2 4 2 11 12
+10 1 2 4 2 12 13
+11 1 2 4 2 13 14
+12 1 2 4 2 14 4
+13 1 2 1 3 1 15
+14 1 2 1 3 15 16
+15 1 2 1 3 16 3
+16 1 2 2 4 2 17
+17 1 2 2 4 17 18
+18 1 2 2 4 18 19
+19 1 2 2 4 19 20
+20 1 2 2 4 20 21
+21 1 2 2 4 21 22
+22 1 2 2 4 22 23
+23 1 2 2 4 23 24
+24 1 2 2 4 24 4
+25 2 2 1 1 32 40 25
+26 2 2 1 1 25 34 32
+27 2 2 1 1 33 41 36
+28 2 2 1 1 38 52 25
+29 2 2 1 1 33 36 29
+30 2 2 1 1 26 47 35
+31 2 2 1 1 35 37 26
+32 2 2 1 1 25 52 34
+33 2 2 1 1 32 44 40
+34 2 2 1 1 15 32 29
+35 2 2 1 1 15 29 16
+36 2 2 1 1 36 41 28
+37 2 2 1 1 32 33 29
+38 2 2 1 1 50 52 42
+39 2 2 1 1 32 34 33
+40 2 2 1 1 42 52 31
+41 2 2 1 1 43 53 51
+42 2 2 1 1 27 41 33
+43 2 2 1 1 26 53 45
+44 2 2 1 1 18 31 17
+45 2 2 1 1 29 35 16
+46 2 2 1 1 29 36 35
+47 2 2 1 1 24 30 23
+48 2 2 1 1 30 53 43
+49 2 2 1 1 17 54 2
+50 2 2 1 1 4 55 24
+51 2 2 1 1 28 51 36
+52 2 2 1 1 47 48 35
+53 2 2 1 1 36 37 35
+54 2 2 1 1 37 53 26
+55 2 2 1 1 22 28 21
+56 2 2 1 1 20 27 19
+57 2 2 1 1 33 50 27
+58 2 2 1 1 15 44 32
+59 2 2 1 1 18 42 31
+60 2 2 1 1 30 43 23
+61 2 2 1 1 35 48 16
+62 2 2 1 1 31 54 17
+63 2 2 1 1 9 39 8
+64 2 2 1 1 8 38 7
+65 2 2 1 1 7 25 6
+66 2 2 1 1 22 43 28
+67 2 2 1 1 23 43 22
+68 2 2 1 1 39 54 31
+69 2 2 1 1 19 42 18
+70 2 2 1 1 24 55 30
+71 2 2 1 1 27 42 19
+72 2 2 1 1 13 46 14
+73 2 2 1 1 51 53 37
+74 2 2 1 1 39 52 38
+75 2 2 1 1 6 40 5
+76 2 2 1 1 34 52 50
+77 2 2 1 1 12 45 13
+78 2 2 1 1 30 55 46
+79 2 2 1 1 10 47 11
+80 2 2 1 1 8 39 38
+81 2 2 1 1 28 49 21
+82 2 2 1 1 7 38 25
+83 2 2 1 1 41 49 28
+84 2 2 1 1 20 49 27
+85 2 2 1 1 11 26 12
+86 2 2 1 1 27 49 41
+87 2 2 1 1 31 52 39
+88 2 2 1 1 25 40 6
+89 2 2 1 1 2 54 9
+90 2 2 1 1 14 55 4
+91 2 2 1 1 45 53 46
+92 2 2 1 1 45 46 13
+93 2 2 1 1 5 44 1
+94 2 2 1 1 21 49 20
+95 2 2 1 1 46 53 30
+96 2 2 1 1 3 48 10
+97 2 2 1 1 34 50 33
+98 2 2 1 1 36 51 37
+99 2 2 1 1 26 45 12
+100 2 2 1 1 11 47 26
+101 2 2 1 1 27 50 42
+102 2 2 1 1 40 44 5
+103 2 2 1 1 43 51 28
+104 2 2 1 1 10 48 47
+105 2 2 1 1 9 54 39
+106 2 2 1 1 46 55 14
+107 2 2 1 1 1 44 15
+108 2 2 1 1 16 48 3
+$EndElements
+$Periodic
+1
+1 1 2
+Affine 0.5000000000000001 0.8660254037844386 0 0 -0.8660254037844386 0.5000000000000001 0 0 0 0 1 0 0 0 0 1
+7
+9 14
+6 11
+8 13
+5 10
+7 12
+2 4
+1 3
+$EndPeriodic
diff --git a/data/periodic-torus-sector.geo b/data/periodic-torus-sector.geo
new file mode 100644
index 00000000000..05eacae9089
--- /dev/null
+++ b/data/periodic-torus-sector.geo
@@ -0,0 +1,25 @@
+SetFactory("OpenCASCADE");
+
+R = 1.5;
+r = 0.5;
+
+Torus(1) = {0,0,0, R, r, Pi/3};
+
+pts() = PointsOf{ Volume{1}; };
+
+Characteristic Length{ pts() } = 0.25;
+
+// Set a rotation periodicity constraint:
+Periodic Surface{3} = {2} Rotate{{0,0,1}, {0,0,0}, Pi/3};
+
+// Tag surfaces and volumes with positive integers
+Physical Surface(1) = {1};
+Physical Surface(2) = {2};
+Physical Surface(3) = {3};
+Physical Volume(1) = {1};
+
+// Generate 3D mesh
+Mesh 3;
+
+Mesh.MshFileVersion = 2.2;
+Save "periodic-torus-sector.msh";
diff --git a/data/periodic-torus-sector.msh b/data/periodic-torus-sector.msh
new file mode 100644
index 00000000000..2bb34fd344f
--- /dev/null
+++ b/data/periodic-torus-sector.msh
@@ -0,0 +1,1056 @@
+$MeshFormat
+2.2 0 8
+$EndMeshFormat
+$Nodes
+178
+1 1 1.732050807568877 -1.224646799147353e-16
+2 2 0 -1.224646799147353e-16
+3 1.986476715483886 0.2321858282504604 -1.224646799147353e-16
+4 1.946089741159648 0.4612317414848803 -1.224646799147353e-16
+5 1.879385241571817 0.6840402866513373 -1.224646799147353e-16
+6 1.787265280646825 0.8975983604009242 -1.224646799147353e-16
+7 1.670975622825873 1.099017956141612 -1.224646799147353e-16
+8 1.532088886237956 1.285575219373079 -1.224646799147353e-16
+9 1.372483275737467 1.454747283146097 -1.224646799147353e-16
+10 1.194317183405573 1.604246385510087 -1.224646799147353e-16
+11 0.9713640064107713 1.682451811747116 0.2323615860315311
+12 0.8920161844090693 1.545017352570237 0.4114919360856992
+13 0.7801341700627819 1.351232019269317 0.4963544370492818
+14 0.6613487770564798 1.145489683385365 0.467508120445488
+15 0.5628723129571309 0.9749234442155601 0.3315613291201853
+16 0.5072645455148519 0.8786079657100585 0.1196578310999915
+17 0.5072645456434897 0.8786079659328657 -0.1196578321437994
+18 0.5628723130955459 0.9749234444553019 -0.3315613294326618
+19 0.6613487782393664 1.145489685434185 -0.4675081213427075
+20 0.7801341701356748 1.351232019395571 -0.4963544370315802
+21 0.8920161866827891 1.545017356508435 -0.4114919329468283
+22 0.9713640064234279 1.682451811769038 -0.2323615859833005
+23 1.942728012826605 0 0.2323615860218843
+24 1.784032373365578 0 0.4114919329468282
+25 1.560268340127662 0 0.496354437049027
+26 1.322697556478732 0 0.4675081213427075
+27 1.12574462591445 0 0.3315613291203977
+28 1.014529091286974 0 0.119657832143779
+29 1.014529091286974 0 -0.1196578321437787
+30 1.125744625914449 0 -0.3315613291203975
+31 1.322697556478732 0 -0.4675081213427074
+32 1.560268340127661 0 -0.496354437049027
+33 1.784032373365578 0 -0.4114919329468283
+34 1.942728012826605 0 -0.2323615860218846
+35 0.9094092833839373 1.140104827634976 0.4982645365438731
+36 0.9145403631597188 1.127554721496875 -0.4976726875517566
+37 1.439543163169036 0.2066277058988615 0.4979068470620236
+38 1.438046243463446 0.2043628722395764 -0.4977381412060781
+39 1.696686878481083 0.9795826260215588 0.1979072290649842
+40 1.705046974097277 0.9627338322458719 -0.2004271899713759
+41 1.425045163821982 1.344460771417579 0.1979072290649849
+42 1.426940472289519 1.3461283575362 -0.1919478807927782
+43 1.881724626638111 0.5593582007143618 -0.1885115572979801
+44 1.876859764350238 0.561894927701204 0.197907229064985
+45 0.6673803921678336 0.7447234565204733 0.002192541055854352
+46 1.050984583067059 0.1682970125584115 0.2454183754604045
+47 1.04172076116343 0.2047537321359636 -0.240523499060071
+48 1.113311450848127 1.513972982303749 0.3258377535851458
+49 1.113311453634618 1.513972982601351 -0.3258377513847104
+50 1.867794790374375 0.2071695099506198 0.3258377513964845
+51 1.867794790374375 0.2071695099506223 -0.3258377513964847
+52 0.699980884447671 0.7923257843216313 0.2322978976561902
+53 0.8143742402024221 0.6091370533569924 0.1292055331351144
+54 0.9234512163817853 0.6950259596839291 0.362644934140704
+55 0.8149471563398465 0.6014811214666318 -0.1127408743463922
+56 0.9089882740735398 0.4178664121885499 0.02087403616690475
+57 0.8036391734156695 0.9174735877750071 0.4140227221617467
+58 1.04528975873691 0.9197904008254907 0.488274495579655
+59 1.132701372162777 0.6901822877229037 0.4688993254694935
+60 1.277010042388928 0.8802828084812561 0.4973904977528614
+61 1.353438092664882 0.6544862687155961 0.4999885855347576
+62 1.206606309387528 1.137304388082402 0.4743397986991702
+63 1.203861688615928 0.4781648664665324 0.456198716333284
+64 1.420536892021458 1.019694143898059 0.4338017162656254
+65 0.9513629883317927 0.4940253704652492 0.2584636609340892
+66 0.706186918123033 0.8068591038835513 -0.258902571664125
+67 0.8997137668513249 0.6778946202619701 -0.332423851739337
+68 0.8754534158794457 0.8896155968337219 -0.431929083733448
+69 1.068014665374453 0.7835782676774907 -0.4682372295982862
+70 1.094885326848669 0.5587005809631567 -0.420314909746101
+71 1.278914902324639 0.6814395891060806 -0.4974057052434921
+72 1.253859094818866 0.9188999749034799 -0.4970184215451375
+73 1.467592256574374 0.8234236412687718 -0.4653816350826456
+74 1.493455917091833 0.5922523984885606 -0.4885036149289653
+75 1.427903945844154 1.053079485024634 -0.4180895076610792
+76 1.192654033939925 1.176482818815241 -0.468273221931405
+77 1.664532079158557 0.7347462962335058 -0.3846173119001672
+78 1.684193467076019 0.4966353289223597 -0.4295574916595252
+79 0.9581284611377014 0.4449942507914502 -0.2307371775066812
+80 1.060510603934137 1.323992179178645 0.4598289463295096
+81 1.20965670082534 0.1918462924921296 -0.4174342094103178
+82 1.306569861240723 0.4081890859478611 -0.4824924751602535
+83 1.222973879657449 0.2262967791144349 0.429334321634214
+84 1.804078223352526 0.7663783517359284 -0.1956990807188041
+85 1.57086252165904 1.155773912505517 -0.2174559627111528
+86 1.574823294802248 1.167621289969356 0.1948731313716022
+87 1.799061725861226 0.7760394771228164 0.1975920088688854
+88 1.736321519153448 0.6195058019401377 0.3633011133312876
+89 1.680111207596174 0.2053409065696982 0.4614111573472731
+90 1.585805226690124 0.4095866832898705 0.4806230171620565
+91 1.033769170861085 1.346420377509085 -0.4593379913029474
+92 1.686177954337897 0.2595089698405005 -0.4555780070333795
+93 0.9805825388808332 0.1961068189584932 0
+94 1.267064046804422 1.349831092695157 0.3557435462640699
+95 1.479801458923729 0.8235937501426396 0.4610179491301168
+96 1.806713163893442 0.41291149496386 0.3538099754989008
+97 1.660304769719565 0.8158013561953625 0.3571657671790813
+98 1.272522679585459 1.496027967309635 0.1862154784961512
+99 1.273193945142967 1.495820795927453 -0.1855231731675253
+100 1.262667463342245 1.368719575871037 -0.3447086919841124
+101 1.931859564293957 0.3540229855991108 0.1862154775733828
+102 1.934684648085233 0.3534998953881237 -0.1793804689566357
+103 1.044602110326715 0.9883306626754986 -0.496147496375881
+104 1.531662137703001 0.3834487538500693 -0.4937306551344234
+105 1.551796568960546 0.6139737231494753 0.4706294779813407
+106 1.581724214084509 0.9482897902504869 -0.3626575574879038
+107 1.096782839124086 0.35444480392985 -0.3596337488065346
+108 1.387291936348416 0.4077234230541955 0.497071738421552
+109 1.08924964123622 0.3634521195547355 0.355383901525496
+110 1.441115266930556 1.203974457102059 0.3274445044904424
+111 1.395080190203861 1.235707600010656 -0.3431511175769478
+112 1.133587364788978 1.610473031723204 -0.1721570661176043
+113 1.961520334484167 0.1716255200510413 0.1732789072467822
+114 1.961422513597609 0.1736228878529459 -0.1730685535438358
+115 1.82696920620189 0.4017949294223938 -0.3356092840765813
+116 1.131073087671027 1.61183028132059 0.1730685525791028
+117 1.580661987779667 0.9904905866376601 0.3413397946090942
+118 1.053798949591161 0.5492367868134354 0.3909838746147544
+119 0.7242610698965356 0.986066622565408 -0.4165714295663997
+120 1.776913423346694 0.6100558234668472 -0.3264522078233214
+121 0.976815381999997 0.3286770548430039 0.1723113781768278
+122 0.9684597721183544 0.3086573347446573 -0.1272227140227045
+123 1.598657272365354 0 -0.2601378477606267
+124 1.336856475131923 0 0.2435465079686098
+125 1.264366353304017 0 -0.1342496457412713
+126 1.708248762070371 0 0.1844923042412902
+127 1.770132970054707 0 -0.06658179719356594
+128 1.627974743938903 0 0.3374416670744554
+129 1.518381683403412 0 0.189367998345964
+130 1.388026045801723 0 0.02759910094603191
+131 1.57082733902861 0 -0.01324440776767719
+132 1.205770154876862 0 0.09185560579891296
+133 1.448748593218543 0 -0.1456951042254292
+134 1.386747123568126 0 -0.3059177475399098
+135 1.773887657153061 0 -0.2426432909807264
+136 1.855277436237921 0 0.08756802326740212
+137 1.456499046249551 0 0.3582625863623187
+138 0.7993286361826776 1.384477809813136 -0.2601378477606267
+139 0.6684282375659617 1.157751668677965 0.2435465079686098
+140 0.6321831766520087 1.09497338165157 -0.1342496457412713
+141 0.8541243810351855 1.47938682393626 0.1844923042412902
+142 0.885066485027354 1.532980120143776 -0.06658179719356594
+143 0.8139873719694519 1.409867484970557 0.3374416670744554
+144 0.7591908417017061 1.314957110468335 0.189367998345964
+145 0.6940130229008615 1.202065816778755 0.02759910094603191
+146 0.7854136695143054 1.360376380557887 -0.01324440776767719
+147 0.6028850774384313 1.04422758524846 0.09185560579891296
+148 0.7243742966092718 1.254653085424226 -0.1456951042254292
+149 0.6933735617840632 1.200958237634995 -0.3059177475399098
+150 0.8869438285765309 1.536231774554212 -0.2426432909807264
+151 0.9276387181189605 1.606717390850103 0.08756802326740212
+152 0.7282495231247758 1.261365174639917 0.3582625863623187
+153 1.411918130602168 0.496166241357142 -0.00157291651681768
+154 1.137417628047573 0.9720715936541285 -0.004558233820192492
+155 1.450923907048078 0.835854081603992 0.008303709726106997
+156 1.195281628759975 0.6892135661795487 0.1379139557683655
+157 1.667647997439524 0.3046155345731573 -0.01063985748379645
+158 1.095077566932522 1.30506262456116 0.008783669059526909
+159 1.449746181951948 0.2899752883377874 -0.2128942904574569
+160 1.425404656176498 0.2838646166858353 0.2090378240215375
+161 1.237422074642534 0.6934452933460877 -0.1740536126458837
+162 0.9265939870131148 1.089233825798041 0.146036662270623
+163 0.9141871309006279 1.035605401598253 -0.1819109563473867
+164 1.230559284884559 0.2965650718118637 -0.0489211473166503
+165 1.633700383529476 0.5873280387970766 -0.1078820007088586
+166 1.381887166693965 0.5422592078649243 0.2595533680109433
+167 1.628614112376115 0.5120437923677087 0.1405317590914031
+168 1.296790159421439 0.9296724900104604 0.2103100611208717
+169 1.43040652357321 0.5390860673292487 -0.2497801650023363
+170 1.187552760136346 0.4469311325853691 0.1497889600434833
+171 1.136555246481813 0.9253132713408032 -0.2607675417720481
+172 1.169875547734923 1.157176703594696 0.209783121723046
+173 1.252256239782287 1.161539312959809 -0.1396154626225087
+174 0.9854410532992778 1.295312212838649 0.232826117690098
+175 0.8998494176549744 0.8641570369932714 0.01526132714743774
+176 1.047849560177692 0.8788161627502054 0.2215760845022245
+177 1.402089433100673 1.077521710164203 0.04039637986273873
+178 1.03525188898502 1.249979305919021 -0.2281501268865102
+$EndNodes
+$Elements
+835
+1 2 2 1 1 10 112 1
+2 2 2 1 1 1 116 10
+3 2 2 1 1 11 116 1
+4 2 2 1 1 1 112 22
+5 2 2 1 1 3 113 2
+6 2 2 1 1 2 114 3
+7 2 2 1 1 2 113 23
+8 2 2 1 1 34 114 2
+9 2 2 1 1 4 101 3
+10 2 2 1 1 3 102 4
+11 2 2 1 1 101 113 3
+12 2 2 1 1 3 114 102
+13 2 2 1 1 4 43 5
+14 2 2 1 1 5 44 4
+15 2 2 1 1 4 102 43
+16 2 2 1 1 44 101 4
+17 2 2 1 1 5 84 6
+18 2 2 1 1 6 87 5
+19 2 2 1 1 43 84 5
+20 2 2 1 1 5 87 44
+21 2 2 1 1 7 39 6
+22 2 2 1 1 6 40 7
+23 2 2 1 1 39 87 6
+24 2 2 1 1 6 84 40
+25 2 2 1 1 7 85 8
+26 2 2 1 1 8 86 7
+27 2 2 1 1 7 86 39
+28 2 2 1 1 40 85 7
+29 2 2 1 1 9 41 8
+30 2 2 1 1 8 42 9
+31 2 2 1 1 41 86 8
+32 2 2 1 1 8 85 42
+33 2 2 1 1 10 98 9
+34 2 2 1 1 9 99 10
+35 2 2 1 1 9 98 41
+36 2 2 1 1 42 99 9
+37 2 2 1 1 10 116 98
+38 2 2 1 1 99 112 10
+39 2 2 1 1 23 50 24
+40 2 2 1 1 23 113 50
+41 2 2 1 1 24 89 25
+42 2 2 1 1 50 89 24
+43 2 2 1 1 25 37 26
+44 2 2 1 1 25 89 37
+45 2 2 1 1 26 83 27
+46 2 2 1 1 37 83 26
+47 2 2 1 1 27 46 28
+48 2 2 1 1 27 83 46
+49 2 2 1 1 28 93 29
+50 2 2 1 1 46 93 28
+51 2 2 1 1 29 47 30
+52 2 2 1 1 29 93 47
+53 2 2 1 1 30 81 31
+54 2 2 1 1 47 81 30
+55 2 2 1 1 31 38 32
+56 2 2 1 1 31 81 38
+57 2 2 1 1 32 92 33
+58 2 2 1 1 38 92 32
+59 2 2 1 1 33 51 34
+60 2 2 1 1 33 92 51
+61 2 2 1 1 51 114 34
+62 2 2 1 1 12 48 11
+63 2 2 1 1 48 116 11
+64 2 2 1 1 13 80 12
+65 2 2 1 1 12 80 48
+66 2 2 1 1 14 35 13
+67 2 2 1 1 35 80 13
+68 2 2 1 1 15 57 14
+69 2 2 1 1 14 57 35
+70 2 2 1 1 16 52 15
+71 2 2 1 1 52 57 15
+72 2 2 1 1 17 45 16
+73 2 2 1 1 45 52 16
+74 2 2 1 1 18 66 17
+75 2 2 1 1 17 66 45
+76 2 2 1 1 19 119 18
+77 2 2 1 1 18 119 66
+78 2 2 1 1 20 36 19
+79 2 2 1 1 36 119 19
+80 2 2 1 1 21 91 20
+81 2 2 1 1 20 91 36
+82 2 2 1 1 22 49 21
+83 2 2 1 1 49 91 21
+84 2 2 1 1 22 112 49
+85 2 2 1 1 57 58 35
+86 2 2 1 1 58 62 35
+87 2 2 1 1 62 80 35
+88 2 2 1 1 36 103 68
+89 2 2 1 1 68 119 36
+90 2 2 1 1 36 91 76
+91 2 2 1 1 76 103 36
+92 2 2 1 1 37 108 83
+93 2 2 1 1 89 90 37
+94 2 2 1 1 90 108 37
+95 2 2 1 1 81 82 38
+96 2 2 1 1 82 104 38
+97 2 2 1 1 38 104 92
+98 2 2 1 1 86 117 39
+99 2 2 1 1 39 97 87
+100 2 2 1 1 39 117 97
+101 2 2 1 1 40 84 77
+102 2 2 1 1 77 106 40
+103 2 2 1 1 40 106 85
+104 2 2 1 1 41 110 86
+105 2 2 1 1 41 98 94
+106 2 2 1 1 94 110 41
+107 2 2 1 1 85 111 42
+108 2 2 1 1 42 100 99
+109 2 2 1 1 42 111 100
+110 2 2 1 1 43 120 84
+111 2 2 1 1 102 115 43
+112 2 2 1 1 115 120 43
+113 2 2 1 1 87 88 44
+114 2 2 1 1 88 96 44
+115 2 2 1 1 96 101 44
+116 2 2 1 1 45 53 52
+117 2 2 1 1 45 55 53
+118 2 2 1 1 45 66 55
+119 2 2 1 1 83 109 46
+120 2 2 1 1 46 121 93
+121 2 2 1 1 109 121 46
+122 2 2 1 1 79 107 47
+123 2 2 1 1 47 122 79
+124 2 2 1 1 47 107 81
+125 2 2 1 1 93 122 47
+126 2 2 1 1 80 94 48
+127 2 2 1 1 94 98 48
+128 2 2 1 1 98 116 48
+129 2 2 1 1 49 100 91
+130 2 2 1 1 99 100 49
+131 2 2 1 1 49 112 99
+132 2 2 1 1 50 96 89
+133 2 2 1 1 50 101 96
+134 2 2 1 1 50 113 101
+135 2 2 1 1 92 115 51
+136 2 2 1 1 102 114 51
+137 2 2 1 1 51 115 102
+138 2 2 1 1 53 54 52
+139 2 2 1 1 54 57 52
+140 2 2 1 1 53 65 54
+141 2 2 1 1 55 56 53
+142 2 2 1 1 56 65 53
+143 2 2 1 1 54 58 57
+144 2 2 1 1 54 59 58
+145 2 2 1 1 54 118 59
+146 2 2 1 1 65 118 54
+147 2 2 1 1 55 79 56
+148 2 2 1 1 66 67 55
+149 2 2 1 1 67 79 55
+150 2 2 1 1 56 121 65
+151 2 2 1 1 79 122 56
+152 2 2 1 1 93 121 56
+153 2 2 1 1 56 122 93
+154 2 2 1 1 59 60 58
+155 2 2 1 1 60 62 58
+156 2 2 1 1 59 61 60
+157 2 2 1 1 59 63 61
+158 2 2 1 1 59 118 63
+159 2 2 1 1 61 95 60
+160 2 2 1 1 60 64 62
+161 2 2 1 1 60 95 64
+162 2 2 1 1 63 108 61
+163 2 2 1 1 61 105 95
+164 2 2 1 1 61 108 105
+165 2 2 1 1 64 110 62
+166 2 2 1 1 62 94 80
+167 2 2 1 1 62 110 94
+168 2 2 1 1 83 108 63
+169 2 2 1 1 63 109 83
+170 2 2 1 1 63 118 109
+171 2 2 1 1 95 117 64
+172 2 2 1 1 64 117 110
+173 2 2 1 1 109 118 65
+174 2 2 1 1 65 121 109
+175 2 2 1 1 66 68 67
+176 2 2 1 1 66 119 68
+177 2 2 1 1 68 69 67
+178 2 2 1 1 69 70 67
+179 2 2 1 1 70 79 67
+180 2 2 1 1 68 103 69
+181 2 2 1 1 69 71 70
+182 2 2 1 1 69 72 71
+183 2 2 1 1 69 103 72
+184 2 2 1 1 71 82 70
+185 2 2 1 1 70 107 79
+186 2 2 1 1 82 107 70
+187 2 2 1 1 72 73 71
+188 2 2 1 1 73 74 71
+189 2 2 1 1 74 82 71
+190 2 2 1 1 72 75 73
+191 2 2 1 1 72 76 75
+192 2 2 1 1 72 103 76
+193 2 2 1 1 73 77 74
+194 2 2 1 1 75 106 73
+195 2 2 1 1 73 106 77
+196 2 2 1 1 77 78 74
+197 2 2 1 1 78 104 74
+198 2 2 1 1 74 104 82
+199 2 2 1 1 76 111 75
+200 2 2 1 1 85 106 75
+201 2 2 1 1 75 111 85
+202 2 2 1 1 91 100 76
+203 2 2 1 1 100 111 76
+204 2 2 1 1 77 120 78
+205 2 2 1 1 84 120 77
+206 2 2 1 1 92 104 78
+207 2 2 1 1 78 115 92
+208 2 2 1 1 78 120 115
+209 2 2 1 1 81 107 82
+210 2 2 1 1 110 117 86
+211 2 2 1 1 87 97 88
+212 2 2 1 1 90 96 88
+213 2 2 1 1 88 105 90
+214 2 2 1 1 97 105 88
+215 2 2 1 1 89 96 90
+216 2 2 1 1 105 108 90
+217 2 2 1 1 95 105 97
+218 2 2 1 1 97 117 95
+219 2 2 2 2 23 136 2
+220 2 2 2 2 2 127 34
+221 2 2 2 2 2 136 127
+222 2 2 2 2 24 126 23
+223 2 2 2 2 126 136 23
+224 2 2 2 2 25 128 24
+225 2 2 2 2 24 128 126
+226 2 2 2 2 26 137 25
+227 2 2 2 2 25 137 128
+228 2 2 2 2 27 124 26
+229 2 2 2 2 124 137 26
+230 2 2 2 2 28 132 27
+231 2 2 2 2 27 132 124
+232 2 2 2 2 29 132 28
+233 2 2 2 2 30 125 29
+234 2 2 2 2 125 132 29
+235 2 2 2 2 31 134 30
+236 2 2 2 2 30 134 125
+237 2 2 2 2 32 134 31
+238 2 2 2 2 33 123 32
+239 2 2 2 2 123 134 32
+240 2 2 2 2 34 135 33
+241 2 2 2 2 33 135 123
+242 2 2 2 2 127 135 34
+243 2 2 2 2 127 131 123
+244 2 2 2 2 123 135 127
+245 2 2 2 2 131 133 123
+246 2 2 2 2 133 134 123
+247 2 2 2 2 124 130 129
+248 2 2 2 2 129 137 124
+249 2 2 2 2 124 132 130
+250 2 2 2 2 130 132 125
+251 2 2 2 2 125 133 130
+252 2 2 2 2 125 134 133
+253 2 2 2 2 126 131 127
+254 2 2 2 2 127 136 126
+255 2 2 2 2 128 129 126
+256 2 2 2 2 129 131 126
+257 2 2 2 2 128 137 129
+258 2 2 2 2 130 131 129
+259 2 2 2 2 130 133 131
+260 2 2 3 3 11 1 151
+261 2 2 3 3 1 22 142
+262 2 2 3 3 1 142 151
+263 2 2 3 3 12 11 141
+264 2 2 3 3 141 11 151
+265 2 2 3 3 13 12 143
+266 2 2 3 3 12 141 143
+267 2 2 3 3 14 13 152
+268 2 2 3 3 13 143 152
+269 2 2 3 3 15 14 139
+270 2 2 3 3 139 14 152
+271 2 2 3 3 16 15 147
+272 2 2 3 3 15 139 147
+273 2 2 3 3 17 16 147
+274 2 2 3 3 18 17 140
+275 2 2 3 3 140 17 147
+276 2 2 3 3 19 18 149
+277 2 2 3 3 18 140 149
+278 2 2 3 3 20 19 149
+279 2 2 3 3 21 20 138
+280 2 2 3 3 138 20 149
+281 2 2 3 3 22 21 150
+282 2 2 3 3 21 138 150
+283 2 2 3 3 142 22 150
+284 2 2 3 3 142 138 146
+285 2 2 3 3 138 142 150
+286 2 2 3 3 146 138 148
+287 2 2 3 3 148 138 149
+288 2 2 3 3 139 144 145
+289 2 2 3 3 144 139 152
+290 2 2 3 3 139 145 147
+291 2 2 3 3 145 140 147
+292 2 2 3 3 140 145 148
+293 2 2 3 3 140 148 149
+294 2 2 3 3 141 142 146
+295 2 2 3 3 142 141 151
+296 2 2 3 3 143 141 144
+297 2 2 3 3 144 141 146
+298 2 2 3 3 143 144 152
+299 2 2 3 3 145 144 146
+300 2 2 3 3 145 146 148
+301 4 2 1 1 106 161 155 171
+302 4 2 1 1 50 157 126 160
+303 4 2 1 1 56 161 156 170
+304 4 2 1 1 157 159 130 160
+305 4 2 1 1 50 96 157 160
+306 4 2 1 1 106 73 161 171
+307 4 2 1 1 106 155 75 171
+308 4 2 1 1 75 171 155 173
+309 4 2 1 1 126 129 128 160
+310 4 2 1 1 79 164 161 170
+311 4 2 1 1 55 156 56 161
+312 4 2 1 1 130 160 159 164
+313 4 2 1 1 56 79 161 170
+314 4 2 1 1 85 173 155 177
+315 4 2 1 1 126 128 89 160
+316 4 2 1 1 97 166 155 167
+317 4 2 1 1 106 75 73 171
+318 4 2 1 1 115 159 157 165
+319 4 2 1 1 50 126 89 160
+320 4 2 1 1 96 157 160 167
+321 4 2 1 1 75 155 85 173
+322 4 2 1 1 156 161 55 175
+323 4 2 1 1 50 89 96 160
+324 4 2 1 1 131 130 157 159
+325 4 2 1 1 75 106 85 155
+326 4 2 1 1 155 77 165 169
+327 4 2 1 1 115 78 159 165
+328 4 2 1 1 56 164 79 170
+329 4 2 1 1 106 77 155 169
+330 4 2 1 1 137 128 129 160
+331 4 2 1 1 106 73 77 169
+332 4 2 1 1 131 157 130 160
+333 4 2 1 1 107 161 164 169
+334 4 2 1 1 87 155 6 165
+335 4 2 1 1 87 155 165 167
+336 4 2 1 1 131 133 130 159
+337 4 2 1 1 97 155 87 167
+338 4 2 1 1 153 159 157 160
+339 4 2 1 1 161 164 153 170
+340 4 2 1 1 129 126 157 160
+341 4 2 1 1 56 122 79 164
+342 4 2 1 1 145 148 146 163
+343 4 2 1 1 79 55 56 161
+344 4 2 1 1 137 89 128 160
+345 4 2 1 1 159 160 153 164
+346 4 2 1 1 107 161 79 164
+347 4 2 1 1 127 51 135 157
+348 4 2 1 1 55 53 56 156
+349 4 2 1 1 159 165 78 169
+350 4 2 1 1 130 159 133 164
+351 4 2 1 1 107 164 159 169
+352 4 2 1 1 55 53 156 175
+353 4 2 1 1 61 166 156 168
+354 4 2 1 1 114 135 127 51
+355 4 2 1 1 67 161 171 175
+356 4 2 1 1 127 135 123 157
+357 4 2 1 1 61 95 166 168
+358 4 2 1 1 148 146 163 178
+359 4 2 1 1 156 166 155 168
+360 4 2 1 1 136 126 113 157
+361 4 2 1 1 153 155 166 167
+362 4 2 1 1 87 6 5 165
+363 4 2 1 1 153 164 161 169
+364 4 2 1 1 9 41 158 177
+365 4 2 1 1 129 157 131 160
+366 4 2 1 1 67 55 161 175
+367 4 2 1 1 101 50 96 157
+368 4 2 1 1 156 161 153 170
+369 4 2 1 1 155 171 154 173
+370 4 2 1 1 46 160 164 170
+371 4 2 1 1 59 61 156 168
+372 4 2 1 1 3 136 113 157
+373 4 2 1 1 150 49 112 178
+374 4 2 1 1 115 157 43 165
+375 4 2 1 1 158 162 146 163
+376 4 2 1 1 97 39 87 155
+377 4 2 1 1 145 146 162 163
+378 4 2 1 1 102 43 115 157
+379 4 2 1 1 117 155 97 168
+380 4 2 1 1 154 155 173 177
+381 4 2 1 1 155 156 153 166
+382 4 2 1 1 77 40 155 165
+383 4 2 1 1 129 131 130 160
+384 4 2 1 1 9 158 173 177
+385 4 2 1 1 158 163 146 178
+386 4 2 1 1 150 112 142 178
+387 4 2 1 1 154 161 156 175
+388 4 2 1 1 41 172 158 177
+389 4 2 1 1 133 131 123 159
+390 4 2 1 1 126 50 113 157
+391 4 2 1 1 117 97 95 168
+392 4 2 1 1 56 156 65 170
+393 4 2 1 1 3 127 136 157
+394 4 2 1 1 125 130 133 164
+395 4 2 1 1 101 113 50 157
+396 4 2 1 1 51 102 115 157
+397 4 2 1 1 125 159 81 164
+398 4 2 1 1 153 155 165 169
+399 4 2 1 1 153 165 155 167
+400 4 2 1 1 87 165 5 167
+401 4 2 1 1 87 39 6 155
+402 4 2 1 1 37 89 137 160
+403 4 2 1 1 107 70 161 169
+404 4 2 1 1 46 160 132 164
+405 4 2 1 1 40 106 77 155
+406 4 2 1 1 125 133 159 164
+407 4 2 1 1 67 171 163 175
+408 4 2 1 1 59 60 61 168
+409 4 2 1 1 131 157 123 159
+410 4 2 1 1 127 114 51 157
+411 4 2 1 1 118 156 59 166
+412 4 2 1 1 117 39 97 155
+413 4 2 1 1 116 141 151 158
+414 4 2 1 1 101 157 96 167
+415 4 2 1 1 72 161 73 171
+416 4 2 1 1 53 65 56 156
+417 4 2 1 1 67 55 79 161
+418 4 2 1 1 96 160 90 167
+419 4 2 1 1 126 131 129 157
+420 4 2 1 1 43 157 4 165
+421 4 2 1 1 105 166 97 167
+422 4 2 1 1 157 159 153 165
+423 4 2 1 1 153 156 155 161
+424 4 2 1 1 59 156 61 166
+425 4 2 1 1 155 161 154 171
+426 4 2 1 1 107 159 82 169
+427 4 2 1 1 72 73 75 171
+428 4 2 1 1 107 81 159 164
+429 4 2 1 1 145 140 163 175
+430 4 2 1 1 95 97 105 166
+431 4 2 1 1 153 161 155 169
+432 4 2 1 1 156 166 118 170
+433 4 2 1 1 115 78 92 159
+434 4 2 1 1 10 116 151 158
+435 4 2 1 1 63 166 160 170
+436 4 2 1 1 96 89 90 160
+437 4 2 1 1 43 102 4 157
+438 4 2 1 1 124 132 46 160
+439 4 2 1 1 102 51 114 157
+440 4 2 1 1 4 165 157 167
+441 4 2 1 1 124 46 83 160
+442 4 2 1 1 145 163 162 175
+443 4 2 1 1 130 132 160 164
+444 4 2 1 1 61 60 95 168
+445 4 2 1 1 85 106 40 155
+446 4 2 1 1 154 171 161 175
+447 4 2 1 1 151 142 10 158
+448 4 2 1 1 83 160 46 170
+449 4 2 1 1 137 124 83 160
+450 4 2 1 1 47 125 81 164
+451 4 2 1 1 136 127 126 157
+452 4 2 1 1 107 70 79 161
+453 4 2 1 1 85 155 7 177
+454 4 2 1 1 9 41 98 158
+455 4 2 1 1 77 84 40 165
+456 4 2 1 1 39 155 117 177
+457 4 2 1 1 58 162 172 176
+458 4 2 1 1 5 165 4 167
+459 4 2 1 1 104 159 78 169
+460 4 2 1 1 9 8 41 177
+461 4 2 1 1 59 168 156 176
+462 4 2 1 1 98 158 41 172
+463 4 2 1 1 155 156 154 161
+464 4 2 1 1 78 115 120 165
+465 4 2 1 1 112 158 142 178
+466 4 2 1 1 109 83 46 170
+467 4 2 1 1 112 99 158 178
+468 4 2 1 1 112 49 99 178
+469 4 2 1 1 125 134 81 159
+470 4 2 1 1 63 118 59 166
+471 4 2 1 1 101 4 157 167
+472 4 2 1 1 154 162 158 163
+473 4 2 1 1 156 175 53 176
+474 4 2 1 1 71 73 72 161
+475 4 2 1 1 123 131 127 157
+476 4 2 1 1 155 168 117 177
+477 4 2 1 1 153 165 159 169
+478 4 2 1 1 137 83 37 160
+479 4 2 1 1 73 161 71 169
+480 4 2 1 1 66 67 163 175
+481 4 2 1 1 9 173 8 177
+482 4 2 1 1 46 164 121 170
+483 4 2 1 1 153 159 164 169
+484 4 2 1 1 48 116 98 174
+485 4 2 1 1 88 105 97 167
+486 4 2 1 1 116 158 98 174
+487 4 2 1 1 139 57 52 162
+488 4 2 1 1 160 166 90 167
+489 4 2 1 1 103 163 171 178
+490 4 2 1 1 145 147 140 175
+491 4 2 1 1 153 160 157 167
+492 4 2 1 1 104 92 78 159
+493 4 2 1 1 140 66 163 175
+494 4 2 1 1 101 44 4 167
+495 4 2 1 1 124 130 132 160
+496 4 2 1 1 113 101 3 157
+497 4 2 1 1 158 172 98 174
+498 4 2 1 1 154 163 158 178
+499 4 2 1 1 58 162 35 172
+500 4 2 1 1 67 68 163 171
+501 4 2 1 1 141 158 116 174
+502 4 2 1 1 118 166 63 170
+503 4 2 1 1 152 162 144 174
+504 4 2 1 1 3 114 127 157
+505 4 2 1 1 107 82 70 169
+506 4 2 1 1 120 115 43 165
+507 4 2 1 1 107 47 81 164
+508 4 2 1 1 108 160 63 166
+509 4 2 1 1 85 40 7 155
+510 4 2 1 1 35 162 152 174
+511 4 2 1 1 172 173 158 177
+512 4 2 1 1 139 52 147 162
+513 4 2 1 1 44 5 4 167
+514 4 2 1 1 107 81 82 159
+515 4 2 1 1 78 165 120 169
+516 4 2 1 1 6 84 5 165
+517 4 2 1 1 144 152 139 162
+518 4 2 1 1 54 156 53 176
+519 4 2 1 1 35 172 162 174
+520 4 2 1 1 98 172 94 174
+521 4 2 1 1 99 9 158 173
+522 4 2 1 1 7 6 39 155
+523 4 2 1 1 140 45 66 175
+524 4 2 1 1 151 141 142 158
+525 4 2 1 1 124 129 130 160
+526 4 2 1 1 5 43 4 165
+527 4 2 1 1 54 65 53 156
+528 4 2 1 1 145 162 147 175
+529 4 2 1 1 69 67 161 171
+530 4 2 1 1 129 124 137 160
+531 4 2 1 1 86 39 117 177
+532 4 2 1 1 148 145 140 163
+533 4 2 1 1 102 114 3 157
+534 4 2 1 1 143 35 152 174
+535 4 2 1 1 154 156 155 168
+536 4 2 1 1 149 148 163 178
+537 4 2 1 1 70 67 79 161
+538 4 2 1 1 153 164 160 170
+539 4 2 1 1 54 59 118 156
+540 4 2 1 1 7 155 39 177
+541 4 2 1 1 101 4 3 157
+542 4 2 1 1 59 60 168 176
+543 4 2 1 1 152 139 57 14
+544 4 2 1 1 125 133 134 159
+545 4 2 1 1 99 173 158 178
+546 4 2 1 1 56 121 164 170
+547 4 2 1 1 142 138 150 178
+548 4 2 1 1 146 162 158 174
+549 4 2 1 1 99 42 9 173
+550 4 2 1 1 112 10 142 158
+551 4 2 1 1 58 57 162 176
+552 4 2 1 1 49 150 91 178
+553 4 2 1 1 144 162 146 174
+554 4 2 1 1 4 102 3 157
+555 4 2 1 1 46 93 121 164
+556 4 2 1 1 38 123 92 159
+557 4 2 1 1 103 36 163 178
+558 4 2 1 1 83 63 108 160
+559 4 2 1 1 65 156 118 170
+560 4 2 1 1 68 66 67 163
+561 4 2 1 1 147 45 140 175
+562 4 2 1 1 54 118 65 156
+563 4 2 1 1 154 158 173 178
+564 4 2 1 1 128 137 25 89
+565 4 2 1 1 71 161 72 171
+566 4 2 1 1 52 175 162 176
+567 4 2 1 1 48 98 94 174
+568 4 2 1 1 69 67 70 161
+569 4 2 1 1 52 162 57 176
+570 4 2 1 1 103 171 76 178
+571 4 2 1 1 123 38 134 159
+572 4 2 1 1 142 158 146 178
+573 4 2 1 1 86 117 168 177
+574 4 2 1 1 98 41 94 172
+575 4 2 1 1 131 126 127 157
+576 4 2 1 1 88 97 87 167
+577 4 2 1 1 112 99 10 158
+578 4 2 1 1 56 93 122 164
+579 4 2 1 1 9 42 8 173
+580 4 2 1 1 91 150 138 178
+581 4 2 1 1 147 162 52 175
+582 4 2 1 1 99 100 173 178
+583 4 2 1 1 75 76 171 173
+584 4 2 1 1 138 146 148 178
+585 4 2 1 1 10 99 9 158
+586 4 2 1 1 157 165 153 167
+587 4 2 1 1 144 145 146 162
+588 4 2 1 1 77 120 165 169
+589 4 2 1 1 149 138 148 178
+590 4 2 1 1 137 26 83 124
+591 4 2 1 1 149 163 36 178
+592 4 2 1 1 87 5 44 167
+593 4 2 1 1 153 166 160 167
+594 4 2 1 1 154 173 163 178
+595 4 2 1 1 71 69 161 171
+596 4 2 1 1 70 71 161 169
+597 4 2 1 1 153 160 166 170
+598 4 2 1 1 154 175 156 176
+599 4 2 1 1 61 95 105 166
+600 4 2 1 1 48 141 116 174
+601 4 2 1 1 10 98 116 158
+602 4 2 1 1 66 55 67 175
+603 4 2 1 1 90 166 105 167
+604 4 2 1 1 56 121 93 164
+605 4 2 1 1 156 153 166 170
+606 4 2 1 1 9 98 10 158
+607 4 2 1 1 7 40 6 155
+608 4 2 1 1 58 57 35 162
+609 4 2 1 1 163 173 171 178
+610 4 2 1 1 46 132 93 164
+611 4 2 1 1 111 75 85 173
+612 4 2 1 1 139 57 15 52
+613 4 2 1 1 171 173 76 178
+614 4 2 1 1 62 58 35 172
+615 4 2 1 1 29 125 47 164
+616 4 2 1 1 80 172 35 174
+617 4 2 1 1 17 66 140 45
+618 4 2 1 1 77 78 120 169
+619 4 2 1 1 138 142 146 178
+620 4 2 1 1 119 140 66 163
+621 4 2 1 1 90 89 37 160
+622 4 2 1 1 86 7 39 177
+623 4 2 1 1 73 71 74 169
+624 4 2 1 1 134 38 81 159
+625 4 2 1 1 149 36 138 178
+626 4 2 1 1 59 156 54 176
+627 4 2 1 1 154 171 163 173
+628 4 2 1 1 117 110 86 168
+629 4 2 1 1 63 59 61 166
+630 4 2 1 1 123 134 133 159
+631 4 2 1 1 141 146 158 174
+632 4 2 1 1 75 111 76 173
+633 4 2 1 1 86 168 110 177
+634 4 2 1 1 36 91 138 178
+635 4 2 1 1 82 159 104 169
+636 4 2 1 1 93 29 47 164
+637 4 2 1 1 158 172 154 173
+638 4 2 1 1 81 47 30 125
+639 4 2 1 1 37 26 83 137
+640 4 2 1 1 52 53 175 176
+641 4 2 1 1 154 173 172 177
+642 4 2 1 1 146 142 141 158
+643 4 2 1 1 90 160 108 166
+644 4 2 1 1 70 82 71 169
+645 4 2 1 1 163 171 154 175
+646 4 2 1 1 44 101 96 167
+647 4 2 1 1 73 74 77 169
+648 4 2 1 1 158 162 154 172
+649 4 2 1 1 152 57 35 14
+650 4 2 1 1 71 72 69 171
+651 4 2 1 1 143 152 144 174
+652 4 2 1 1 62 60 58 168
+653 4 2 1 1 92 51 135 33
+654 4 2 1 1 48 143 141 174
+655 4 2 1 1 91 49 21 150
+656 4 2 1 1 42 85 8 173
+657 4 2 1 1 99 49 100 178
+658 4 2 1 1 130 125 132 164
+659 4 2 1 1 41 110 172 177
+660 4 2 1 1 15 147 139 52
+661 4 2 1 1 64 117 95 168
+662 4 2 1 1 13 143 35 152
+663 4 2 1 1 136 3 2 127
+664 4 2 1 1 58 168 60 176
+665 4 2 1 1 103 76 36 178
+666 4 2 1 1 10 1 142 151
+667 4 2 1 1 64 110 168 172
+668 4 2 1 1 138 91 21 150
+669 4 2 1 1 52 54 53 176
+670 4 2 1 1 38 82 81 159
+671 4 2 1 1 67 69 68 171
+672 4 2 1 1 138 91 36 20
+673 4 2 1 1 92 135 123 33
+674 4 2 1 1 48 80 143 174
+675 4 2 1 1 8 173 85 177
+676 4 2 1 1 27 46 83 124
+677 4 2 1 1 70 71 69 161
+678 4 2 1 1 103 163 68 171
+679 4 2 1 1 119 149 140 163
+680 4 2 1 1 147 17 140 45
+681 4 2 1 1 137 37 25 89
+682 4 2 1 1 168 172 110 177
+683 4 2 1 1 46 121 109 170
+684 4 2 1 1 149 119 36 163
+685 4 2 1 1 154 168 155 177
+686 4 2 1 1 64 168 62 172
+687 4 2 1 1 38 92 104 159
+688 4 2 1 1 109 118 63 170
+689 4 2 1 1 59 58 60 176
+690 4 2 1 1 75 76 72 171
+691 4 2 1 1 38 104 82 159
+692 4 2 1 1 99 100 42 173
+693 4 2 1 1 143 80 35 174
+694 4 2 1 1 162 163 154 175
+695 4 2 1 1 138 36 149 20
+696 4 2 1 1 117 64 110 168
+697 4 2 1 1 36 68 103 163
+698 4 2 1 1 90 105 88 167
+699 4 2 1 1 80 62 35 172
+700 4 2 1 1 139 145 144 162
+701 4 2 1 1 147 52 45 175
+702 4 2 1 1 141 144 146 174
+703 4 2 1 1 56 65 121 170
+704 4 2 1 1 108 90 37 160
+705 4 2 1 1 134 81 30 125
+706 4 2 1 1 154 156 168 176
+707 4 2 1 1 148 140 149 163
+708 4 2 1 1 123 32 38 92
+709 4 2 1 1 116 11 151 141
+710 4 2 1 1 119 18 66 140
+711 4 2 1 1 113 136 23 126
+712 4 2 1 1 83 108 37 160
+713 4 2 1 1 145 139 147 162
+714 4 2 1 1 96 90 88 167
+715 4 2 1 1 68 36 119 163
+716 4 2 1 1 76 173 100 178
+717 4 2 1 1 64 62 110 172
+718 4 2 1 1 64 60 62 168
+719 4 2 1 1 64 95 60 168
+720 4 2 1 1 12 143 141 48
+721 4 2 1 1 74 104 78 169
+722 4 2 1 1 48 12 143 80
+723 4 2 1 1 154 172 168 177
+724 4 2 1 1 52 57 54 176
+725 4 2 1 1 77 74 78 169
+726 4 2 1 1 119 66 68 163
+727 4 2 1 1 43 5 84 165
+728 4 2 1 1 162 154 172 176
+729 4 2 1 1 77 120 84 165
+730 4 2 1 1 38 32 123 134
+731 4 2 1 1 47 122 93 164
+732 4 2 1 1 42 111 85 173
+733 4 2 1 1 100 76 111 173
+734 4 2 1 1 66 45 55 175
+735 4 2 1 1 127 3 2 114
+736 4 2 1 1 162 172 158 174
+737 4 2 1 1 168 172 154 176
+738 4 2 1 1 41 86 110 177
+739 4 2 1 1 109 65 118 170
+740 4 2 1 1 1 10 142 112
+741 4 2 1 1 87 44 88 167
+742 4 2 1 1 74 71 82 169
+743 4 2 1 1 41 8 86 177
+744 4 2 1 1 46 27 132 124
+745 4 2 1 1 43 84 120 165
+746 4 2 1 1 13 143 80 35
+747 4 2 1 1 8 85 7 177
+748 4 2 1 1 82 104 74 169
+749 4 2 1 1 91 36 76 178
+750 4 2 1 1 103 72 76 171
+751 4 2 1 1 80 94 172 174
+752 4 2 1 1 132 93 28 46
+753 4 2 1 1 116 11 141 48
+754 4 2 1 1 126 23 113 50
+755 4 2 1 1 94 41 110 172
+756 4 2 1 1 69 103 68 171
+757 4 2 1 1 45 147 16 52
+758 4 2 1 1 26 27 83 124
+759 4 2 1 1 80 48 94 174
+760 4 2 1 1 15 57 139 14
+761 4 2 1 1 24 23 126 50
+762 4 2 1 1 90 108 105 166
+763 4 2 1 1 91 100 49 178
+764 4 2 1 1 38 81 31 134
+765 4 2 1 1 105 108 61 166
+766 4 2 1 1 150 22 142 112
+767 4 2 1 1 138 21 91 20
+768 4 2 1 1 59 54 58 176
+769 4 2 1 1 114 127 135 34
+770 4 2 1 1 2 3 136 113
+771 4 2 1 1 1 10 116 151
+772 4 2 1 1 154 162 175 176
+773 4 2 1 1 119 149 18 140
+774 4 2 1 1 17 147 16 45
+775 4 2 1 1 61 108 63 166
+776 4 2 1 1 29 30 47 125
+777 4 2 1 1 92 123 32 33
+778 4 2 1 1 76 100 91 178
+779 4 2 1 1 93 28 29 132
+780 4 2 1 1 121 65 109 170
+781 4 2 1 1 58 54 57 176
+782 4 2 1 1 100 111 42 173
+783 4 2 1 1 140 18 66 17
+784 4 2 1 1 21 49 22 150
+785 4 2 1 1 8 7 86 177
+786 4 2 1 1 94 110 62 172
+787 4 2 1 1 12 141 11 48
+788 4 2 1 1 96 88 44 167
+789 4 2 1 1 80 94 62 172
+790 4 2 1 1 49 22 150 112
+791 4 2 1 1 45 53 55 175
+792 4 2 1 1 135 114 34 51
+793 4 2 1 1 45 52 53 175
+794 4 2 1 1 13 143 12 80
+795 4 2 1 1 22 1 142 112
+796 4 2 1 1 37 25 26 137
+797 4 2 1 1 114 2 127 34
+798 4 2 1 1 19 149 36 20
+799 4 2 1 1 81 30 31 134
+800 4 2 1 1 119 19 149 36
+801 4 2 1 1 69 72 103 171
+802 4 2 1 1 15 16 147 52
+803 4 2 1 1 128 25 24 89
+804 4 2 1 1 51 34 135 33
+805 4 2 1 1 144 141 143 174
+806 4 2 1 1 11 1 116 151
+807 4 2 1 1 38 31 32 134
+808 4 2 1 1 136 23 2 113
+809 4 2 1 1 28 27 132 46
+810 4 2 1 1 35 13 152 14
+811 4 2 1 1 149 18 19 119
+812 4 2 1 1 161 106 169 73
+813 4 2 1 1 161 169 106 155
+814 4 2 1 1 168 97 166 155
+815 4 2 1 1 168 166 97 95
+816 4 2 1 1 159 51 115 157
+817 4 2 1 1 159 115 51 92
+818 4 2 1 1 165 40 6 84
+819 4 2 1 1 165 6 40 155
+820 4 2 1 1 126 89 24 50
+821 4 2 1 1 24 89 126 128
+822 4 2 1 1 170 83 63 109
+823 4 2 1 1 170 63 83 160
+824 4 2 1 1 135 92 159 51
+825 4 2 1 1 159 92 135 123
+826 4 2 1 1 159 157 135 51
+827 4 2 1 1 135 157 159 123
+828 4 2 1 1 152 57 162 35
+829 4 2 1 1 152 162 57 139
+830 4 2 1 1 164 29 132 93
+831 4 2 1 1 164 132 29 125
+832 4 2 1 1 172 58 168 62
+833 4 2 1 1 172 168 58 176
+834 4 2 1 1 79 47 164 122
+835 4 2 1 1 79 164 47 107
+$EndElements
+$Periodic
+1
+2 3 2
+Affine 0.5000000000000001 -0.8660254037844386 0 0 0.8660254037844386 0.5000000000000001 0 0 0 0 1 0 0 0 0 1
+28
+152 137
+141 126
+147 132
+150 135
+149 134
+145 130
+139 124
+144 129
+143 128
+140 125
+138 123
+148 133
+142 127
+151 136
+146 131
+11 23
+12 24
+13 25
+16 28
+14 26
+17 29
+15 27
+18 30
+21 33
+19 31
+22 34
+20 32
+1 2
+$EndPeriodic
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 8a48e4045ae..57ef3f97ab2 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -16,36 +16,21 @@ if (DOXYGEN_FOUND)
   configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CodeDocumentation.conf.in
     ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation.conf @ONLY)
 
-  if (UNIX)
-    # Only create symlinks if UNIX operating system
-    add_custom_target(doc
-      COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation.conf
-      COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation.html
-      COMMAND ${CMAKE_COMMAND} -E create_symlink
-      ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation/html/index.html
-      ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation.html
-      BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation/html/index.html
-      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-      COMMENT "Generating API documentation with Doxygen to CodeDocumentation.html"
-      VERBATIM)
 
-    add_custom_target(clean-doc
-      COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation.html
-      COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation
-      COMMENT "Removing API documentation"
-      VERBATIM)
+  add_custom_target(doc # HTML docs plus the CodeDocumentation.html redirect stub
+    COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation.conf
+    COMMAND ${CMAKE_COMMAND} -E echo "<meta http-equiv=\"REFRESH\" content=\"0;URL=CodeDocumentation/html/index.html\">" > ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation.html
+    BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation/html/index.html
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    COMMENT "Generating API documentation with Doxygen to CodeDocumentation.html"
+    VERBATIM)
+
+  add_custom_target(clean-doc
+    COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation.html
+    COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/warnings.log
+    COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation
+    COMMENT "Removing API documentation"
+    VERBATIM)
 
-  else (UNIX)
-    add_custom_target(doc
-      COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation.conf
-      BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation/html/index.html
-      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-      COMMENT "Generating API documentation with Doxygen to CodeDocumentation/html/index.html"
-      VERBATIM)
 
-    add_custom_target(clean-doc
-      COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_CURRENT_BINARY_DIR}/CodeDocumentation
-      COMMENT "Removing API documentation"
-      VERBATIM)
-  endif (UNIX)
 endif (DOXYGEN_FOUND)
diff --git a/doc/CodeDocumentation.conf.in b/doc/CodeDocumentation.conf.in
index 8b2f2284063..20bd4d2b104 100644
--- a/doc/CodeDocumentation.conf.in
+++ b/doc/CodeDocumentation.conf.in
@@ -51,7 +51,7 @@ PROJECT_BRIEF          = "Finite element discretization library"
 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
 # the logo to the output directory.
 
-PROJECT_LOGO           =
+PROJECT_LOGO           = web/logo-small.png
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
 # into which the generated documentation will be written. If a relative path is
@@ -746,7 +746,7 @@ WARN_FORMAT            = "$file:$line: $text"
 # messages should be written. If left blank the output is written to standard
 # error (stderr).
 
-WARN_LOGFILE           =
+WARN_LOGFILE           = warnings.log
 
 #---------------------------------------------------------------------------
 # Configuration options related to the input files
@@ -1470,7 +1470,7 @@ MATHJAX_FORMAT         = HTML-CSS
 # The default value is: http://cdn.mathjax.org/mathjax/latest.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-MATHJAX_RELPATH        = https://cdn.llnl.gov/mathjax/2.7.2
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
 
 # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
 # extension names that should be enabled during MathJax rendering. For example
diff --git a/doc/makefile b/doc/makefile
index f8e111f1f3e..9a6ff290352 100644
--- a/doc/makefile
+++ b/doc/makefile
@@ -9,18 +9,25 @@
 # terms of the BSD-3 license. We welcome feedback and contributions, see file
 # CONTRIBUTING.md for details.
 
+SHELL = /bin/bash
 MFEM_DIR ?= ..
 DOXYGEN_CONF = CodeDocumentation.conf
 
+
 # doxygen uses: graphviz, latex
 html: $(DOXYGEN_CONF)
-	doxygen $(DOXYGEN_CONF)
-	rm -f CodeDocumentation.html
-	ln -s CodeDocumentation/html/index.html CodeDocumentation.html
+	@# Generate the html documentation
+	@doxygen $(DOXYGEN_CONF)
+	@echo "<meta http-equiv=\"REFRESH\" content=\"0;URL=CodeDocumentation/html/index.html\">" > CodeDocumentation.html
+	@cat warnings.log
+	@# Generate the log of undocumented methods (all doxygen output is discarded)
+	@( cat $(DOXYGEN_CONF) ; echo "GENERATE_HTML=NO" ; echo "EXTRACT_ALL=NO" ; echo "WARN_LOGFILE=undoc.log" ; echo "QUIET=YES" ) | doxygen - > /dev/null 2>&1
 
 clean:
 	rm -rf $(DOXYGEN_CONF) CodeDocumentation CodeDocumentation.html *~
+	rm -rf  undoc.log warnings.log
 
 $(DOXYGEN_CONF): $(MFEM_DIR)/doc/$(DOXYGEN_CONF).in
-	sed -e 's%@MFEM_SOURCE_DIR@%$(MFEM_DIR)%g' $(<) \
+	@sed -e 's%@MFEM_SOURCE_DIR@%$(MFEM_DIR)%g' $(<) \
 	  > $(DOXYGEN_CONF)
+
diff --git a/doc/web/logo-small.png b/doc/web/logo-small.png
new file mode 100644
index 00000000000..021cbcfe8a7
Binary files /dev/null and b/doc/web/logo-small.png differ
diff --git a/examples/ex1.cpp b/examples/ex1.cpp
index c5e240c1b6d..50bbd175c5e 100644
--- a/examples/ex1.cpp
+++ b/examples/ex1.cpp
@@ -9,6 +9,8 @@
 //               ex1 -m ../data/fichera.mesh
 //               ex1 -m ../data/fichera-mixed.mesh
 //               ex1 -m ../data/toroid-wedge.mesh
+//               ex1 -m ../data/periodic-annulus-sector.msh
+//               ex1 -m ../data/periodic-torus-sector.msh
 //               ex1 -m ../data/square-disc-p2.vtk -o 2
 //               ex1 -m ../data/square-disc-p3.mesh -o 3
 //               ex1 -m ../data/square-disc-nurbs.mesh -o -1
diff --git a/examples/ex11p.cpp b/examples/ex11p.cpp
index 1dc807f8596..de1e7ce6893 100644
--- a/examples/ex11p.cpp
+++ b/examples/ex11p.cpp
@@ -8,6 +8,8 @@
 //               mpirun -np 4 ex11p -m ../data/escher.mesh
 //               mpirun -np 4 ex11p -m ../data/fichera.mesh
 //               mpirun -np 4 ex11p -m ../data/fichera-mixed.mesh
+//               mpirun -np 4 ex11p -m ../data/periodic-annulus-sector.msh
+//               mpirun -np 4 ex11p -m ../data/periodic-torus-sector.msh -rs 1
 //               mpirun -np 4 ex11p -m ../data/toroid-wedge.mesh -o 2
 //               mpirun -np 4 ex11p -m ../data/square-disc-p2.vtk -o 2
 //               mpirun -np 4 ex11p -m ../data/square-disc-p3.mesh -o 3
diff --git a/examples/ex14p.cpp b/examples/ex14p.cpp
index 39eb63f7825..471d9c94101 100644
--- a/examples/ex14p.cpp
+++ b/examples/ex14p.cpp
@@ -35,6 +35,38 @@
 using namespace std;
 using namespace mfem;
 
+/// Monitor that streams the current iterate to the server at localhost:19916.
+class CustomSolverMonitor : public IterativeSolverMonitor
+{
+public:
+   CustomSolverMonitor(const ParMesh *m, ParGridFunction *f)
+      : pmesh(m), pgf(f) {}
+
+   /// Load x into the grid function and send mesh + field, titled by i.
+   void MonitorSolution(int i, double norm, const Vector &x, bool final)
+   {
+      char vishost[] = "localhost";
+      int  visport   = 19916;
+
+      MPI_Comm comm = pmesh->GetComm();
+      int num_procs, myid;
+      MPI_Comm_size(comm, &num_procs);
+      MPI_Comm_rank(comm, &myid);
+
+      pgf->SetFromTrueDofs(x);
+      socketstream sol_sock(vishost, visport);
+      sol_sock << "parallel " << num_procs << " " << myid << "\n";
+      sol_sock.precision(8);
+      sol_sock << "solution\n" << *pmesh << *pgf;
+      sol_sock << "window_title 'Iteration no " << i << "'";
+      sol_sock << "keys rRjlc\n" << flush;
+   }
+
+private:
+   const ParMesh *pmesh;
+   ParGridFunction *pgf;
+};
+
 int main(int argc, char *argv[])
 {
    // 1. Initialize MPI.
@@ -188,6 +220,7 @@ int main(int argc, char *argv[])
    }
    else
    {
+      CustomSolverMonitor monitor(pmesh, &x);
       GMRESSolver gmres(MPI_COMM_WORLD);
       gmres.SetAbsTol(0.0);
       gmres.SetRelTol(1e-12);
@@ -196,6 +229,7 @@ int main(int argc, char *argv[])
       gmres.SetPrintLevel(1);
       gmres.SetOperator(*A);
       gmres.SetPreconditioner(*amg);
+      gmres.SetMonitor(monitor);
       gmres.Mult(*B, *X);
    }
    delete amg;
diff --git a/examples/ex18.hpp b/examples/ex18.hpp
index fe0e38ffc2a..75fa5e885bc 100644
--- a/examples/ex18.hpp
+++ b/examples/ex18.hpp
@@ -418,7 +418,7 @@ void FaceIntegrator::AssembleFaceVector(const FiniteElement &el1,
    {
       intorder++;
    }
-   const IntegrationRule *ir = &IntRules.Get(Tr.FaceGeom, intorder);
+   const IntegrationRule *ir = &IntRules.Get(Tr.GetGeometryType(), intorder);
 
    for (int i = 0; i < ir->GetNPoints(); i++)
    {
@@ -435,10 +435,10 @@ void FaceIntegrator::AssembleFaceVector(const FiniteElement &el1,
       elfun1_mat.MultTranspose(shape1, funval1);
       elfun2_mat.MultTranspose(shape2, funval2);
 
-      Tr.Face->SetIntPoint(&ip);
+      Tr.SetIntPoint(&ip);
 
       // Get the normal vector and the flux on the face
-      CalcOrtho(Tr.Face->Jacobian(), nor);
+      CalcOrtho(Tr.Jacobian(), nor);
       const double mcs = rsolver.Eval(funval1, funval2, nor, fluxN);
 
       // Update max char speed
diff --git a/examples/ex19.cpp b/examples/ex19.cpp
index 16db0a167ef..0e70d483c29 100644
--- a/examples/ex19.cpp
+++ b/examples/ex19.cpp
@@ -38,6 +38,42 @@
 using namespace std;
 using namespace mfem;
 
+/// Residual monitor printing "<prefix> iteration <it> : ||r|| = <norm>" lines.
+class GeneralResidualMonitor : public IterativeSolverMonitor
+{
+public:
+   /// print_lvl: 1 prints every iteration, 3 only iteration 0 and the final one.
+   GeneralResidualMonitor(const std::string& prefix_, int print_lvl)
+      : prefix(prefix_), print_level(print_lvl), norm0(0.0) { }
+
+   virtual void MonitorResidual(int it, double norm, const Vector &r, bool final);
+
+private:
+   const std::string prefix;
+   int print_level;
+   double norm0; // norm recorded at iteration 0; zero until then, never garbage
+};
+
+// Print the residual line; iteration 0 records the reference norm norm0.
+void GeneralResidualMonitor::MonitorResidual(int it, double norm,
+                                             const Vector &r, bool final)
+{
+   if (print_level == 1 || (print_level == 3 && (final || it == 0)))
+   {
+      mfem::out << prefix << " iteration " << setw(2) << it
+                << " : ||r|| = " << norm;
+      if (it > 0)
+      {
+         mfem::out << ",  ||r||/||r_0|| = " << norm/norm0;
+      }
+      else
+      {
+         norm0 = norm;
+      }
+      mfem::out << '\n';
+   }
+}
+
 // Custom block preconditioner for the Jacobian of the incompressible nonlinear
 // elasticity operator. It has the form
 //
@@ -103,9 +139,11 @@ class RubberOperator : public Operator
 
    // Newton solver for the hyperelastic operator
    NewtonSolver newton_solver;
+   GeneralResidualMonitor newton_monitor;
 
    // Solver for the Jacobian solve in the Newton method
    Solver *j_solver;
+   GeneralResidualMonitor j_monitor;
 
    // Preconditioner for the Jacobian
    Solver *j_prec;
@@ -410,7 +448,8 @@ RubberOperator::RubberOperator(Array<FiniteElementSpace *> &fes,
                                int iter,
                                Coefficient &c_mu)
    : Operator(fes[0]->GetVSize() + fes[1]->GetVSize()),
-     newton_solver(), mu(c_mu), block_offsets(offsets)
+     newton_solver(), newton_monitor("Newton", 1),
+     j_monitor("  GMRES", 3), mu(c_mu), block_offsets(offsets)
 {
    Array<Vector *> rhs(2);
    rhs = NULL; // Set all entries in the array
@@ -446,7 +485,8 @@ RubberOperator::RubberOperator(Array<FiniteElementSpace *> &fes,
    j_gmres->SetRelTol(1e-12);
    j_gmres->SetAbsTol(1e-12);
    j_gmres->SetMaxIter(300);
-   j_gmres->SetPrintLevel(0);
+   j_gmres->SetPrintLevel(-1);
+   j_gmres->SetMonitor(j_monitor);
    j_gmres->SetPreconditioner(*j_prec);
    j_solver = j_gmres;
 
@@ -454,7 +494,8 @@ RubberOperator::RubberOperator(Array<FiniteElementSpace *> &fes,
    newton_solver.iterative_mode = true;
    newton_solver.SetSolver(*j_solver);
    newton_solver.SetOperator(*this);
-   newton_solver.SetPrintLevel(1);
+   newton_solver.SetPrintLevel(-1);
+   newton_solver.SetMonitor(newton_monitor);
    newton_solver.SetRelTol(rel_tol);
    newton_solver.SetAbsTol(abs_tol);
    newton_solver.SetMaxIter(iter);
diff --git a/examples/ex19p.cpp b/examples/ex19p.cpp
index 3372531e276..f2382f14247 100644
--- a/examples/ex19p.cpp
+++ b/examples/ex19p.cpp
@@ -38,6 +38,56 @@
 using namespace std;
 using namespace mfem;
 
+// Monitor that prints the residual norm reported by an iterative solver. Only
+// rank 0 of 'comm' keeps the requested print level; every other rank is set
+// to -1, which MonitorResidual treats as silent. Level 1 prints on every
+// call, level 3 only at iteration 0 and at the final iteration.
+class GeneralResidualMonitor : public IterativeSolverMonitor
+{
+public:
+   GeneralResidualMonitor(MPI_Comm comm, const std::string& prefix_,
+                          int print_lvl)
+      : prefix(prefix_), print_level(-1), norm0(0.0)
+   {
+#ifndef MFEM_USE_MPI
+      print_level = print_lvl;
+#else
+      int rank;
+      MPI_Comm_rank(comm, &rank);
+      // Non-root ranks keep the silent default set in the initializer list.
+      if (rank == 0) { print_level = print_lvl; }
+#endif
+   }
+
+   // Reports iteration 'it' with residual norm 'norm' according to print_level.
+   virtual void MonitorResidual(int it, double norm, const Vector &r, bool final);
+
+private:
+   const std::string prefix; // text printed at the start of every report
+   int print_level;          // 1, 3, or a silent value such as -1
+   // Residual norm at iteration 0; zero until MonitorResidual stores it.
+   mutable double norm0;
+};
+
+void GeneralResidualMonitor::MonitorResidual(int it, double norm,
+                                             const Vector &r, bool final)
+{
+   // Level 1 reports every call; level 3 only iteration 0 and the final one.
+   const bool every_call = (print_level == 1);
+   const bool ends_only = (print_level == 3) && (final || it == 0);
+   if (!every_call && !ends_only) { return; }
+
+   mfem::out << prefix << " iteration " << setw(2) << it
+             << " : ||r|| = " << norm;
+   if (it > 0)
+   {
+      // Relative to the norm stored when iteration 0 was reported.
+      mfem::out << ",  ||r||/||r_0|| = " << norm/norm0;
+   }
+   else { norm0 = norm; }
+   mfem::out << '\n';
+}
+
 // Custom block preconditioner for the Jacobian of the incompressible nonlinear
 // elasticity operator. It has the form
 //
@@ -103,9 +153,11 @@ class RubberOperator : public Operator
 
    // Newton solver for the hyperelastic operator
    NewtonSolver newton_solver;
+   GeneralResidualMonitor newton_monitor;
 
    // Solver for the Jacobian solve in the Newton method
    Solver *j_solver;
+   GeneralResidualMonitor j_monitor;
 
    // Preconditioner for the Jacobian
    Solver *j_prec;
@@ -459,7 +511,10 @@ RubberOperator::RubberOperator(Array<ParFiniteElementSpace *> &fes,
                                int iter,
                                Coefficient &c_mu)
    : Operator(fes[0]->TrueVSize() + fes[1]->TrueVSize()),
-     newton_solver(fes[0]->GetComm()), mu(c_mu), block_trueOffsets(trueOffsets)
+     newton_solver(fes[0]->GetComm()),
+     newton_monitor(fes[0]->GetComm(), "Newton", 1),
+     j_monitor(fes[0]->GetComm(), "  GMRES", 3),
+     mu(c_mu), block_trueOffsets(trueOffsets)
 {
    Array<Vector *> rhs(2);
    rhs = NULL; // Set all entries in the array
@@ -499,7 +554,8 @@ RubberOperator::RubberOperator(Array<ParFiniteElementSpace *> &fes,
    j_gmres->SetRelTol(1e-12);
    j_gmres->SetAbsTol(1e-12);
    j_gmres->SetMaxIter(300);
-   j_gmres->SetPrintLevel(0);
+   j_gmres->SetPrintLevel(-1);
+   j_gmres->SetMonitor(j_monitor);
    j_gmres->SetPreconditioner(*j_prec);
    j_solver = j_gmres;
 
@@ -507,7 +563,8 @@ RubberOperator::RubberOperator(Array<ParFiniteElementSpace *> &fes,
    newton_solver.iterative_mode = true;
    newton_solver.SetSolver(*j_solver);
    newton_solver.SetOperator(*this);
-   newton_solver.SetPrintLevel(1);
+   newton_solver.SetPrintLevel(-1);
+   newton_solver.SetMonitor(newton_monitor);
    newton_solver.SetRelTol(rel_tol);
    newton_solver.SetAbsTol(abs_tol);
    newton_solver.SetMaxIter(iter);
diff --git a/examples/ex1p.cpp b/examples/ex1p.cpp
index 83ede5910ae..649ecbf1172 100644
--- a/examples/ex1p.cpp
+++ b/examples/ex1p.cpp
@@ -9,6 +9,8 @@
 //               mpirun -np 4 ex1p -m ../data/fichera.mesh
 //               mpirun -np 4 ex1p -m ../data/fichera-mixed.mesh
 //               mpirun -np 4 ex1p -m ../data/toroid-wedge.mesh
+//               mpirun -np 4 ex1p -m ../data/periodic-annulus-sector.msh
+//               mpirun -np 4 ex1p -m ../data/periodic-torus-sector.msh
 //               mpirun -np 4 ex1p -m ../data/square-disc-p2.vtk -o 2
 //               mpirun -np 4 ex1p -m ../data/square-disc-p3.mesh -o 3
 //               mpirun -np 4 ex1p -m ../data/square-disc-nurbs.mesh -o -1
diff --git a/examples/ex24.cpp b/examples/ex24.cpp
index a39809e401d..23f7cc361bd 100644
--- a/examples/ex24.cpp
+++ b/examples/ex24.cpp
@@ -6,6 +6,7 @@
 //               ex24 -m ../data/square-disc.mesh -o 2
 //               ex24 -m ../data/beam-tet.mesh
 //               ex24 -m ../data/beam-hex.mesh -o 2 -pa
+//               ex24 -m ../data/beam-hex.mesh -o 2 -pa -p 1
 //               ex24 -m ../data/escher.mesh
 //               ex24 -m ../data/escher.mesh -o 2
 //               ex24 -m ../data/fichera.mesh
@@ -23,11 +24,15 @@
 //               ex24 -m ../data/beam-hex.mesh -pa -d cuda
 //
 // Description:  This example code illustrates usage of mixed finite element
-//               spaces. Using two different approaches, we project a gradient
-//               of a function in H^1 to H(curl). Other spaces and example
-//               computations are to be added in the future.
+//               spaces, with two variants:
 //
-//               We recommend viewing examples 1 and 3 before viewing this
+//               1) (grad p, u) for p in H^1 tested against u in H(curl)
+//               2) (div v, q) for v in H(div) tested against q in L_2
+//
+//               Using different approaches, we project the gradient or
+//               divergence to the appropriate space.
+//
+//               We recommend viewing examples 1, 3, and 5 before viewing this
 //               example.
 
 #include "mfem.hpp"
@@ -39,6 +44,7 @@ using namespace mfem;
 
 double p_exact(const Vector &x);
 void gradp_exact(const Vector &, Vector &);
+double div_gradp_exact(const Vector &x);
 
 int dim;
 
@@ -47,6 +53,7 @@ int main(int argc, char *argv[])
    // 1. Parse command-line options.
    const char *mesh_file = "../data/beam-hex.mesh";
    int order = 1;
+   int prob = 0;
    bool static_cond = false;
    bool pa = false;
    const char *device_config = "cpu";
@@ -57,6 +64,8 @@ int main(int argc, char *argv[])
                   "Mesh file to use.");
    args.AddOption(&order, "-o", "--order",
                   "Finite element order (polynomial degree).");
+   args.AddOption(&prob, "-p", "--problem-type",
+                  "Choose between 0: H(Curl) or 1: H(Div)");
    args.AddOption(&static_cond, "-sc", "--static-condensation", "-no-sc",
                   "--no-static-condensation", "Enable static condensation.");
    args.AddOption(&pa, "-pa", "--partial-assembly", "-no-pa",
@@ -100,72 +109,107 @@ int main(int argc, char *argv[])
    }
    mesh->ReorientTetMesh();
 
-   // 5. Define a parallel finite element space on the parallel mesh. Here we
-   //    use the Nedelec finite elements of the specified order.
-   FiniteElementCollection *fec = new ND_FECollection(order, dim);
-   FiniteElementCollection *H1fec = new H1_FECollection(order, dim);
-   FiniteElementSpace *fespace = new FiniteElementSpace(mesh, fec);
-   FiniteElementSpace *H1fespace = new FiniteElementSpace(mesh, H1fec);
-
-   int size = fespace->GetTrueVSize();
-   int H1size = H1fespace->GetTrueVSize();
-   cout << "Number of Nedelec finite element unknowns: " << size << endl;
-   cout << "Number of H1 finite element unknowns: " << H1size << endl;
-
-   // 6. Define the solution vector x as a parallel finite element grid function
-   //    corresponding to fespace. Initialize x by projecting the exact
-   //    solution. Note that only values from the boundary edges will be used
-   //    when eliminating the non-homogeneous boundary condition to modify the
-   //    r.h.s. vector b.
-   GridFunction x(fespace);
-   FunctionCoefficient p_coef(p_exact);
-   GridFunction p(H1fespace);
-   p.ProjectCoefficient(p_coef);
-   p.SetTrueVector();
-   p.SetFromTrueVector();
+   // 5. Define a finite element space on the mesh. Here we use Nedelec or
+   //    Raviart-Thomas finite elements of the specified order.
+   FiniteElementCollection *trial_fec = NULL;
+   FiniteElementCollection *test_fec = NULL;
+
+   if (prob == 0)
+   {
+      trial_fec = new H1_FECollection(order, dim);
+      test_fec = new ND_FECollection(order, dim);
+   }
+   else
+   {
+      trial_fec = new RT_FECollection(order - 1, dim);
+      test_fec = new L2_FECollection(order - 1, dim);
+   }
+
+   FiniteElementSpace trial_fes(mesh, trial_fec);
+   FiniteElementSpace test_fes(mesh, test_fec);
+
+   int trial_size = trial_fes.GetTrueVSize();
+   int test_size = test_fes.GetTrueVSize();
+
+   if (prob == 0)
+   {
+      cout << "Number of Nedelec finite element unknowns: " << test_size << endl;
+      cout << "Number of H1 finite element unknowns: " << trial_size << endl;
+   }
+   else
+   {
+      cout << "Number of Raviart-Thomas finite element unknowns: "
+           << trial_size << endl;
+      cout << "Number of L2 finite element unknowns: " << test_size << endl;
+   }
 
+   // 6. Define the input grid function in the trial fespace and the solution
+   //    vector x as a finite element grid function in the test fespace.
+   GridFunction gftest(&test_fes);
+   GridFunction gftrial(&trial_fes);
+   GridFunction x(&test_fes);
+   FunctionCoefficient p_coef(p_exact);
    VectorFunctionCoefficient gradp_coef(sdim, gradp_exact);
+   FunctionCoefficient divgradp_coef(div_gradp_exact);
 
-   // 7. Set up the bilinear forms.
-   Coefficient *muinv = new ConstantCoefficient(1.0);
-   Coefficient *sigma = new ConstantCoefficient(1.0);
-   BilinearForm *a = new BilinearForm(fespace);
-   MixedBilinearForm *a_NDH1 = new MixedBilinearForm(H1fespace, fespace);
+   if (prob == 0)
+   {
+      gftrial.ProjectCoefficient(p_coef);
+   }
+   else
+   {
+      gftrial.ProjectCoefficient(gradp_coef);
+   }
+
+   gftrial.SetTrueVector();
+   gftrial.SetFromTrueVector();
+
+   // 7. Set up the bilinear forms for L2 projection.
+   ConstantCoefficient one(1.0);
+   BilinearForm a(&test_fes);
+   MixedBilinearForm a_mixed(&trial_fes, &test_fes);
    if (pa)
    {
-      a->SetAssemblyLevel(AssemblyLevel::PARTIAL);
-      a_NDH1->SetAssemblyLevel(AssemblyLevel::PARTIAL);
+      a.SetAssemblyLevel(AssemblyLevel::PARTIAL);
+      a_mixed.SetAssemblyLevel(AssemblyLevel::PARTIAL);
    }
 
-   // First approach: L2 projection
-   a->AddDomainIntegrator(new VectorFEMassIntegrator(*sigma));
-   a_NDH1->AddDomainIntegrator(new MixedVectorGradientIntegrator(*muinv));
+   if (prob == 0)
+   {
+      a.AddDomainIntegrator(new VectorFEMassIntegrator(one));
+      a_mixed.AddDomainIntegrator(new MixedVectorGradientIntegrator(one));
+   }
+   else
+   {
+      a.AddDomainIntegrator(new MassIntegrator(one));
+      a_mixed.AddDomainIntegrator(new VectorFEDivergenceIntegrator(one));
+   }
 
-   // 8. Assemble the parallel bilinear form and the corresponding linear
-   //    system, applying any necessary transformations such as: parallel
-   //    assembly, eliminating boundary conditions, applying conforming
-   //    constraints for non-conforming AMR, static condensation, etc.
-   if (static_cond) { a->EnableStaticCondensation(); }
+   // 8. Assemble the bilinear form and the corresponding linear system,
+   //    applying any necessary transformations such as: eliminating boundary
+   //    conditions, applying conforming constraints for non-conforming AMR,
+   //    static condensation, etc.
+   if (static_cond) { a.EnableStaticCondensation(); }
 
-   a->Assemble();
-   if (!pa) { a->Finalize(); }
+   a.Assemble();
+   if (!pa) { a.Finalize(); }
 
-   a_NDH1->Assemble();
-   if (!pa) { a_NDH1->Finalize(); }
+   a_mixed.Assemble();
+   if (!pa) { a_mixed.Finalize(); }
 
    if (pa)
    {
-      a_NDH1->Mult(p, x);
+      a_mixed.Mult(gftrial, x);
    }
    else
    {
-      SparseMatrix& NDH1 = a_NDH1->SpMat();
-      NDH1.Mult(p, x);
+      SparseMatrix& mixed = a_mixed.SpMat();
+      mixed.Mult(gftrial, x);
    }
 
    // 9. Define and apply a PCG solver for Ax = b with Jacobi preconditioner.
    {
-      GridFunction rhs(fespace);
+      GridFunction rhs(&test_fes);
       rhs = x;
       x = 0.0;
 
@@ -176,15 +220,15 @@ int main(int argc, char *argv[])
       if (pa)
       {
          Array<int> ess_tdof_list; // empty
-         OperatorJacobiSmoother Jacobi(*a, ess_tdof_list);
+         OperatorJacobiSmoother Jacobi(a, ess_tdof_list);
 
-         cg.SetOperator(*a);
+         cg.SetOperator(a);
          cg.SetPreconditioner(Jacobi);
          cg.Mult(rhs, x);
       }
       else
       {
-         SparseMatrix& Amat = a->SpMat();
+         SparseMatrix& Amat = a.SpMat();
          DSmoother Jacobi(Amat);
 
          cg.SetOperator(Amat);
@@ -193,33 +237,68 @@ int main(int argc, char *argv[])
       }
    }
 
-   // 10. Second approach: compute the same solution by applying
-   //     GradientInterpolator in H(curl).
-   DiscreteLinearOperator grad(H1fespace, fespace);
-   grad.AddDomainInterpolator(new GradientInterpolator());
-   grad.Assemble();
+   // 10. Compute the same field by applying a DiscreteInterpolator.
+   GridFunction discreteInterpolant(&test_fes);
+   DiscreteLinearOperator dlo(&trial_fes, &test_fes);
+   if (prob == 0)
+   {
+      dlo.AddDomainInterpolator(new GradientInterpolator());
+   }
+   else
+   {
+      dlo.AddDomainInterpolator(new DivergenceInterpolator());
+   }
 
-   GridFunction gradp(fespace);
-   grad.Mult(p, gradp);
+   dlo.Assemble();
+   dlo.Mult(gftrial, discreteInterpolant);
 
-   // 11. Compute the projection of the exact grad p.
-   GridFunction exact_gradp(fespace);
-   exact_gradp.ProjectCoefficient(gradp_coef);
-   exact_gradp.SetTrueVector();
-   exact_gradp.SetFromTrueVector();
+   // 11. Compute the projection of the exact field.
+   GridFunction exact_proj(&test_fes);
+   if (prob == 0)
+   {
+      exact_proj.ProjectCoefficient(gradp_coef);
+   }
+   else
+   {
+      exact_proj.ProjectCoefficient(divgradp_coef);
+   }
 
-   // 12. Compute and print the L^2 norm of the error.
+   exact_proj.SetTrueVector();
+   exact_proj.SetFromTrueVector();
+
+   // 12. Compute and print the L_2 norm of the error.
+   if (prob == 0)
    {
       double errSol = x.ComputeL2Error(gradp_coef);
-      double errInterp = gradp.ComputeL2Error(gradp_coef);
-      double errProj = exact_gradp.ComputeL2Error(gradp_coef);
+      double errInterp = discreteInterpolant.ComputeL2Error(gradp_coef);
+      double errProj = exact_proj.ComputeL2Error(gradp_coef);
 
       cout << "\n Solution of (E_h,v) = (grad p_h,v) for E_h and v in H(curl): "
-           "|| E_h - grad p ||_{L^2} = " << errSol << '\n' << endl;
+           "|| E_h - grad p ||_{L_2} = " << errSol << '\n' << endl;
       cout << " Gradient interpolant E_h = grad p_h in H(curl): || E_h - grad p"
-           "||_{L^2} = " << errInterp << '\n' << endl;
+           "||_{L_2} = " << errInterp << '\n' << endl;
       cout << " Projection E_h of exact grad p in H(curl): || E_h - grad p "
-           "||_{L^2} = " << errProj << '\n' << endl;
+           "||_{L_2} = " << errProj << '\n' << endl;
+   }
+   else
+   {
+      int order_quad = max(2, 2*order+1);
+      const IntegrationRule *irs[Geometry::NumGeom];
+      for (int i=0; i < Geometry::NumGeom; ++i)
+      {
+         irs[i] = &(IntRules.Get(i, order_quad));
+      }
+
+      double errSol = x.ComputeL2Error(divgradp_coef, irs);
+      double errInterp = discreteInterpolant.ComputeL2Error(divgradp_coef, irs);
+      double errProj = exact_proj.ComputeL2Error(divgradp_coef, irs);
+
+      cout << "\n Solution of (f_h,q) = (div v_h,q) for f_h and q in L_2: "
+           "|| f_h - div v ||_{L_2} = " << errSol << '\n' << endl;
+      cout << " Divergence interpolant f_h = div v_h in L_2: || f_h - div v"
+           "||_{L_2} = " << errInterp << '\n' << endl;
+      cout << " Projection f_h of exact div v in L_2: || f_h - div v "
+           "||_{L_2} = " << errProj << '\n' << endl;
    }
 
    // 13. Save the refined mesh and the solution. This output can be viewed
@@ -242,14 +321,8 @@ int main(int argc, char *argv[])
    }
 
    // 15. Free the used memory.
-   delete a;
-   delete a_NDH1;
-   delete sigma;
-   delete muinv;
-   delete fespace;
-   delete H1fespace;
-   delete fec;
-   delete H1fec;
+   delete trial_fec;
+   delete test_fec;
    delete mesh;
 
    return 0;
@@ -284,3 +357,17 @@ void gradp_exact(const Vector &x, Vector &f)
       if (x.Size() == 3) { f(2) = 0.0; }
    }
 }
+
+double div_gradp_exact(const Vector &x)
+{
+   // The formula is chosen by the global 'dim'; other values of dim give zero.
+   switch (dim)
+   {
+      case 3:
+         return -3.0 * sin(x(0)) * sin(x(1)) * sin(x(2));
+      case 2:
+         return -2.0 * sin(x(0)) * sin(x(1));
+      default:
+         return 0.0;
+   }
+}
diff --git a/examples/ex24p.cpp b/examples/ex24p.cpp
index 5bcd4afa853..b1bcccfd227 100644
--- a/examples/ex24p.cpp
+++ b/examples/ex24p.cpp
@@ -6,6 +6,7 @@
 //               mpirun -np 4 ex24p -m ../data/square-disc.mesh -o 2
 //               mpirun -np 4 ex24p -m ../data/beam-tet.mesh
 //               mpirun -np 4 ex24p -m ../data/beam-hex.mesh -o 2 -pa
+//               mpirun -np 4 ex24p -m ../data/beam-hex.mesh -o 2 -p 1 -pa
 //               mpirun -np 4 ex24p -m ../data/escher.mesh
 //               mpirun -np 4 ex24p -m ../data/escher.mesh -o 2
 //               mpirun -np 4 ex24p -m ../data/fichera.mesh
@@ -23,11 +24,15 @@
 //               mpirun -np 4 ex24p -m ../data/beam-hex.mesh -pa -d cuda
 //
 // Description:  This example code illustrates usage of mixed finite element
-//               spaces. Using two different approaches, we project a gradient
-//               of a function in H^1 to H(curl). Other spaces and example
-//               computations are to be added in the future.
+//               spaces, with two variants:
 //
-//               We recommend viewing examples 1 and 3 before viewing this
+//               1) (grad p, u) for p in H^1 tested against u in H(curl)
+//               2) (div v, q) for v in H(div) tested against q in L_2
+//
+//               Using different approaches, we project the gradient or
+//               divergence to the appropriate space.
+//
+//               We recommend viewing examples 1, 3, and 5 before viewing this
 //               example.
 
 #include "mfem.hpp"
@@ -39,6 +44,7 @@ using namespace mfem;
 
 double p_exact(const Vector &x);
 void gradp_exact(const Vector &, Vector &);
+double div_gradp_exact(const Vector &x);
 
 int dim;
 
@@ -53,6 +59,7 @@ int main(int argc, char *argv[])
    // 2. Parse command-line options.
    const char *mesh_file = "../data/beam-hex.mesh";
    int order = 1;
+   int prob = 0;
    bool static_cond = false;
    bool pa = false;
    const char *device_config = "cpu";
@@ -63,6 +70,8 @@ int main(int argc, char *argv[])
                   "Mesh file to use.");
    args.AddOption(&order, "-o", "--order",
                   "Finite element order (polynomial degree).");
+   args.AddOption(&prob, "-p", "--problem-type",
+                  "Choose between 0: H(Curl) or 1: H(Div)");
    args.AddOption(&static_cond, "-sc", "--static-condensation", "-no-sc",
                   "--no-static-condensation", "Enable static condensation.");
    args.AddOption(&pa, "-pa", "--partial-assembly", "-no-pa",
@@ -129,80 +138,115 @@ int main(int argc, char *argv[])
    pmesh->ReorientTetMesh();
 
    // 7. Define a parallel finite element space on the parallel mesh. Here we
-   //    use the Nedelec finite elements of the specified order.
-   FiniteElementCollection *fec = new ND_FECollection(order, dim);
-   FiniteElementCollection *H1fec = new H1_FECollection(order, dim);
-   ParFiniteElementSpace *fespace = new ParFiniteElementSpace(pmesh, fec);
-   ParFiniteElementSpace *H1fespace = new ParFiniteElementSpace(pmesh, H1fec);
-   HYPRE_Int size = fespace->GlobalTrueVSize();
-   HYPRE_Int H1size = H1fespace->GlobalTrueVSize();
+   //    use Nedelec or Raviart-Thomas finite elements of the specified order.
+   FiniteElementCollection *trial_fec = NULL;
+   FiniteElementCollection *test_fec = NULL;
+
+   if (prob == 0)
+   {
+      trial_fec = new H1_FECollection(order, dim);
+      test_fec = new ND_FECollection(order, dim);
+   }
+   else
+   {
+      trial_fec = new RT_FECollection(order - 1, dim);
+      test_fec = new L2_FECollection(order - 1, dim);
+   }
+
+   ParFiniteElementSpace trial_fes(pmesh, trial_fec);
+   ParFiniteElementSpace test_fes(pmesh, test_fec);
+
+   HYPRE_Int trial_size = trial_fes.GlobalTrueVSize();
+   HYPRE_Int test_size = test_fes.GlobalTrueVSize();
+
    if (myid == 0)
    {
-      cout << "Number of Nedelec finite element unknowns: " << size << endl;
-      cout << "Number of H1 finite element unknowns: " << H1size << endl;
+      if (prob == 0)
+      {
+         cout << "Number of Nedelec finite element unknowns: " << test_size << endl;
+         cout << "Number of H1 finite element unknowns: " << trial_size << endl;
+      }
+      else
+      {
+         cout << "Number of Raviart-Thomas finite element unknowns: "
+              << trial_size << endl;
+         cout << "Number of L2 finite element unknowns: " << test_size << endl;
+      }
    }
 
-   // 8. Define the solution vector x as a parallel finite element grid function
-   //    corresponding to fespace. Initialize x by projecting the exact
-   //    solution. Note that only values from the boundary edges will be used
-   //    when eliminating the non-homogeneous boundary condition to modify the
-   //    r.h.s. vector b.
-   ParGridFunction x(fespace);
+   // 8. Define the input grid function in the trial fespace and the solution
+   //    vector x as a parallel grid function in the test fespace.
+   ParGridFunction gftest(&test_fes);
+   ParGridFunction gftrial(&trial_fes);
+   ParGridFunction x(&test_fes);
    FunctionCoefficient p_coef(p_exact);
-   ParGridFunction p(H1fespace);
-   p.ProjectCoefficient(p_coef);
-   p.SetTrueVector();
-   p.SetFromTrueVector();
-
    VectorFunctionCoefficient gradp_coef(sdim, gradp_exact);
+   FunctionCoefficient divgradp_coef(div_gradp_exact);
+
+   if (prob == 0)
+   {
+      gftrial.ProjectCoefficient(p_coef);
+   }
+   else
+   {
+      gftrial.ProjectCoefficient(gradp_coef);
+   }
+
+   gftrial.SetTrueVector();
+   gftrial.SetFromTrueVector();
 
-   // 9. Set up the parallel bilinear forms.
-   Coefficient *muinv = new ConstantCoefficient(1.0);
-   Coefficient *sigma = new ConstantCoefficient(1.0);
-   ParBilinearForm *a = new ParBilinearForm(fespace);
-   ParMixedBilinearForm *a_NDH1 = new ParMixedBilinearForm(H1fespace, fespace);
+   // 9. Set up the parallel bilinear forms for L2 projection.
+   ConstantCoefficient one(1.0);
+   ParBilinearForm a(&test_fes);
+   ParMixedBilinearForm a_mixed(&trial_fes, &test_fes);
    if (pa)
    {
-      a->SetAssemblyLevel(AssemblyLevel::PARTIAL);
-      a_NDH1->SetAssemblyLevel(AssemblyLevel::PARTIAL);
+      a.SetAssemblyLevel(AssemblyLevel::PARTIAL);
+      a_mixed.SetAssemblyLevel(AssemblyLevel::PARTIAL);
    }
 
-   // First approach: L2 projection
-   a->AddDomainIntegrator(new VectorFEMassIntegrator(*sigma));
-   a_NDH1->AddDomainIntegrator(new MixedVectorGradientIntegrator(*muinv));
+   if (prob == 0)
+   {
+      a.AddDomainIntegrator(new VectorFEMassIntegrator(one));
+      a_mixed.AddDomainIntegrator(new MixedVectorGradientIntegrator(one));
+   }
+   else
+   {
+      a.AddDomainIntegrator(new MassIntegrator(one));
+      a_mixed.AddDomainIntegrator(new VectorFEDivergenceIntegrator(one));
+   }
 
    // 10. Assemble the parallel bilinear form and the corresponding linear
    //     system, applying any necessary transformations such as: parallel
    //     assembly, eliminating boundary conditions, applying conforming
    //     constraints for non-conforming AMR, static condensation, etc.
-   if (static_cond) { a->EnableStaticCondensation(); }
+   if (static_cond) { a.EnableStaticCondensation(); }
 
-   a->Assemble();
-   if (!pa) { a->Finalize(); }
+   a.Assemble();
+   if (!pa) { a.Finalize(); }
 
-   a_NDH1->Assemble();
-   if (!pa) { a_NDH1->Finalize(); }
+   a_mixed.Assemble();
+   if (!pa) { a_mixed.Finalize(); }
 
-   Vector B(fespace->GetTrueVSize());
-   Vector X(fespace->GetTrueVSize());
+   Vector B(test_fes.GetTrueVSize());
+   Vector X(test_fes.GetTrueVSize());
 
    if (pa)
    {
-      ParLinearForm *b = new ParLinearForm(fespace); // used as a vector
-      a_NDH1->Mult(p, *b); // process-local multiplication
-      b->ParallelAssemble(B);
-      delete b;
+      ParLinearForm b(&test_fes); // used as a vector
+      a_mixed.Mult(gftrial, b); // process-local multiplication
+      b.ParallelAssemble(B);
    }
    else
    {
-      HypreParMatrix *NDH1 = a_NDH1->ParallelAssemble();
+      HypreParMatrix *mixed = a_mixed.ParallelAssemble();
 
-      Vector P(H1fespace->GetTrueVSize());
-      p.GetTrueDofs(P);
+      Vector P(trial_fes.GetTrueVSize());
+      gftrial.GetTrueDofs(P);
 
-      NDH1->Mult(P,B);
+      mixed->Mult(P,B);
 
-      delete NDH1;
+      delete mixed;
    }
 
    // 11. Define and apply a parallel PCG solver for AX=B with Jacobi
@@ -212,9 +256,9 @@ int main(int argc, char *argv[])
       Array<int> ess_tdof_list; // empty
 
       OperatorPtr A;
-      a->FormSystemMatrix(ess_tdof_list, A);
+      a.FormSystemMatrix(ess_tdof_list, A);
 
-      OperatorJacobiSmoother Jacobi(*a, ess_tdof_list);
+      OperatorJacobiSmoother Jacobi(a, ess_tdof_list);
 
       CGSolver cg(MPI_COMM_WORLD);
       cg.SetRelTol(1e-12);
@@ -227,7 +271,7 @@ int main(int argc, char *argv[])
    }
    else
    {
-      HypreParMatrix *Amat = a->ParallelAssemble();
+      HypreParMatrix *Amat = a.ParallelAssemble();
       HypreDiagScale Jacobi(*Amat);
       HyprePCG pcg(*Amat);
       pcg.SetTol(1e-12);
@@ -242,35 +286,73 @@ int main(int argc, char *argv[])
 
    x.SetFromTrueDofs(X);
 
-   // 12. Second approach: compute the same solution by applying
-   //     GradientInterpolator in H(curl).
-   ParDiscreteLinearOperator grad(H1fespace, fespace);
-   grad.AddDomainInterpolator(new GradientInterpolator());
-   grad.Assemble();
+   // 12. Compute the same field by applying a DiscreteInterpolator.
+   ParGridFunction discreteInterpolant(&test_fes);
+   ParDiscreteLinearOperator dlo(&trial_fes, &test_fes);
+   if (prob == 0)
+   {
+      dlo.AddDomainInterpolator(new GradientInterpolator());
+   }
+   else
+   {
+      dlo.AddDomainInterpolator(new DivergenceInterpolator());
+   }
+
+   dlo.Assemble();
+   dlo.Mult(gftrial, discreteInterpolant);
 
-   ParGridFunction gradp(fespace);
-   grad.Mult(p, gradp);
+   // 13. Compute the projection of the exact field.
+   ParGridFunction exact_proj(&test_fes);
+   if (prob == 0)
+   {
+      exact_proj.ProjectCoefficient(gradp_coef);
+   }
+   else
+   {
+      exact_proj.ProjectCoefficient(divgradp_coef);
+   }
 
-   // 13. Compute the projection of the exact grad p.
-   ParGridFunction exact_gradp(fespace);
-   exact_gradp.ProjectCoefficient(gradp_coef);
-   exact_gradp.SetTrueVector();
-   exact_gradp.SetFromTrueVector();
+   exact_proj.SetTrueVector();
+   exact_proj.SetFromTrueVector();
 
-   // 14. Compute and print the L^2 norm of the error.
+   // 14. Compute and print the L_2 norm of the error.
+   if (prob == 0)
    {
       double errSol = x.ComputeL2Error(gradp_coef);
-      double errInterp = gradp.ComputeL2Error(gradp_coef);
-      double errProj = exact_gradp.ComputeL2Error(gradp_coef);
+      double errInterp = discreteInterpolant.ComputeL2Error(gradp_coef);
+      double errProj = exact_proj.ComputeL2Error(gradp_coef);
 
       if (myid == 0)
       {
-         cout << "\n Solution of (E_h,v) = (grad p_h,v) for E_h and v in "
-              "H(curl): || E_h - grad p ||_{L^2} = " << errSol << '\n' << endl;
-         cout << " Gradient interpolant E_h = grad p_h in H(curl): || E_h - "
-              "grad p ||_{L^2} = " << errInterp << '\n' << endl;
+         cout << "\n Solution of (E_h,v) = (grad p_h,v) for E_h and v in H(curl): "
+              "|| E_h - grad p ||_{L_2} = " << errSol << '\n' << endl;
+         cout << " Gradient interpolant E_h = grad p_h in H(curl): || E_h - grad p"
+              "||_{L_2} = " << errInterp << '\n' << endl;
          cout << " Projection E_h of exact grad p in H(curl): || E_h - grad p "
-              "||_{L^2} = " << errProj << '\n' << endl;
+              "||_{L_2} = " << errProj << '\n' << endl;
+      }
+   }
+   else
+   {
+      int order_quad = max(2, 2*order+1);
+      const IntegrationRule *irs[Geometry::NumGeom];
+      for (int i=0; i < Geometry::NumGeom; ++i)
+      {
+         irs[i] = &(IntRules.Get(i, order_quad));
+      }
+
+      double errSol = x.ComputeL2Error(divgradp_coef, irs);
+      double errInterp = discreteInterpolant.ComputeL2Error(divgradp_coef, irs);
+      double errProj = exact_proj.ComputeL2Error(divgradp_coef, irs);
+
+      if (myid == 0)
+      {
+         cout << "\n Solution of (f_h,q) = (div v_h,q) for f_h and q in L_2: "
+              "|| f_h - div v ||_{L_2} = " << errSol << '\n' << endl;
+         cout << " Divergence interpolant f_h = div v_h in L_2: || f_h - div v"
+              "||_{L_2} = " << errInterp << '\n' << endl;
+         cout << " Projection f_h of exact div v in L_2: || f_h - div v "
+              "||_{L_2} = " << errProj << '\n' << endl;
       }
    }
 
@@ -302,14 +384,8 @@ int main(int argc, char *argv[])
    }
 
    // 17. Free the used memory.
-   delete a;
-   delete a_NDH1;
-   delete sigma;
-   delete muinv;
-   delete fespace;
-   delete H1fespace;
-   delete fec;
-   delete H1fec;
+   delete trial_fec;
+   delete test_fec;
    delete pmesh;
 
    MPI_Finalize();
@@ -346,3 +422,17 @@ void gradp_exact(const Vector &x, Vector &f)
       if (x.Size() == 3) { f(2) = 0.0; }
    }
 }
+
+double div_gradp_exact(const Vector &x)
+{
+   // The formula is chosen by the global 'dim'; other values of dim give zero.
+   switch (dim)
+   {
+      case 3:
+         return -3.0 * sin(x(0)) * sin(x(1)) * sin(x(2));
+      case 2:
+         return -2.0 * sin(x(0)) * sin(x(1));
+      default:
+         return 0.0;
+   }
+}
diff --git a/examples/ex4.cpp b/examples/ex4.cpp
index 6b2d656a00f..6e6c43129c4 100644
--- a/examples/ex4.cpp
+++ b/examples/ex4.cpp
@@ -6,6 +6,7 @@
 //               ex4 -m ../data/star.mesh
 //               ex4 -m ../data/beam-tet.mesh
 //               ex4 -m ../data/beam-hex.mesh
+//               ex4 -m ../data/beam-hex.mesh -o 2 -pa
 //               ex4 -m ../data/escher.mesh
 //               ex4 -m ../data/fichera.mesh -o 2 -hb
 //               ex4 -m ../data/fichera-q2.vtk
@@ -20,6 +21,12 @@
 //               ex4 -m ../data/fichera-amr.mesh -o 2 -sc
 //               ex4 -m ../data/star-surf.mesh -o 1
 //
+// Device sample runs:
+//               ex4 -m ../data/star.mesh -pa -d cuda
+//               ex4 -m ../data/star.mesh -pa -d raja-cuda
+//               ex4 -m ../data/star.mesh -pa -d raja-omp
+//               ex4 -m ../data/beam-hex.mesh -pa -d cuda
+//
 // Description:  This example code solves a simple 2D/3D H(div) diffusion
 //               problem corresponding to the second order definite equation
 //               -grad(alpha div F) + beta F = f with boundary condition F dot n
@@ -55,6 +62,8 @@ int main(int argc, char *argv[])
    bool set_bc = true;
    bool static_cond = false;
    bool hybridization = false;
+   bool pa = false;
+   const char *device_config = "cpu";
    bool visualization = 1;
 
    OptionsParser args(argc, argv);
@@ -70,6 +79,10 @@ int main(int argc, char *argv[])
                   "--no-static-condensation", "Enable static condensation.");
    args.AddOption(&hybridization, "-hb", "--hybridization", "-no-hb",
                   "--no-hybridization", "Enable hybridization.");
+   args.AddOption(&pa, "-pa", "--partial-assembly", "-no-pa",
+                  "--no-partial-assembly", "Enable Partial Assembly.");
+   args.AddOption(&device_config, "-d", "--device",
+                  "Device configuration string, see Device::Configure().");
    args.AddOption(&visualization, "-vis", "--visualization", "-no-vis",
                   "--no-visualization",
                   "Enable or disable GLVis visualization.");
@@ -82,14 +95,19 @@ int main(int argc, char *argv[])
    args.PrintOptions(cout);
    kappa = freq * M_PI;
 
-   // 2. Read the mesh from the given mesh file. We can handle triangular,
+   // 2. Enable hardware devices such as GPUs, and programming models such as
+   //    CUDA, OCCA, RAJA and OpenMP based on command line options.
+   Device device(device_config);
+   device.Print();
+
+   // 3. Read the mesh from the given mesh file. We can handle triangular,
    //    quadrilateral, tetrahedral, hexahedral, surface and volume, as well as
    //    periodic meshes with the same code.
    Mesh *mesh = new Mesh(mesh_file, 1, 1);
    int dim = mesh->Dimension();
    int sdim = mesh->SpaceDimension();
 
-   // 3. Refine the mesh to increase the resolution. In this example we do
+   // 4. Refine the mesh to increase the resolution. In this example we do
    //    'ref_levels' of uniform refinement. We choose 'ref_levels' to be the
    //    largest number that gives a final mesh with no more than 25,000
    //    elements.
@@ -102,14 +120,14 @@ int main(int argc, char *argv[])
       }
    }
 
-   // 4. Define a finite element space on the mesh. Here we use the
+   // 5. Define a finite element space on the mesh. Here we use the
    //    Raviart-Thomas finite elements of the specified order.
    FiniteElementCollection *fec = new RT_FECollection(order-1, dim);
    FiniteElementSpace *fespace = new FiniteElementSpace(mesh, fec);
    cout << "Number of finite element unknowns: "
         << fespace->GetTrueVSize() << endl;
 
-   // 5. Determine the list of true (i.e. conforming) essential boundary dofs.
+   // 6. Determine the list of true (i.e. conforming) essential boundary dofs.
    //    In this example, the boundary conditions are defined by marking all
    //    the boundary attributes from the mesh as essential (Dirichlet) and
    //    converting them to a list of true dofs.
@@ -121,7 +139,7 @@ int main(int argc, char *argv[])
       fespace->GetEssentialTrueDofs(ess_bdr, ess_tdof_list);
    }
 
-   // 6. Set up the linear form b(.) which corresponds to the right-hand side
+   // 7. Set up the linear form b(.) which corresponds to the right-hand side
    //    of the FEM linear system, which in this case is (f,phi_i) where f is
    //    given by the function f_exact and phi_i are the basis functions in the
    //    finite element fespace.
@@ -130,7 +148,7 @@ int main(int argc, char *argv[])
    b->AddDomainIntegrator(new VectorFEDomainLFIntegrator(f));
    b->Assemble();
 
-   // 7. Define the solution vector x as a finite element grid function
+   // 8. Define the solution vector x as a finite element grid function
    //    corresponding to fespace. Initialize x by projecting the exact
    //    solution. Note that only values from the boundary faces will be used
    //    when eliminating the non-homogeneous boundary condition to modify the
@@ -139,16 +157,17 @@ int main(int argc, char *argv[])
    VectorFunctionCoefficient F(sdim, F_exact);
    x.ProjectCoefficient(F);
 
-   // 8. Set up the bilinear form corresponding to the H(div) diffusion operator
+   // 9. Set up the bilinear form corresponding to the H(div) diffusion operator
    //    grad alpha div + beta I, by adding the div-div and the mass domain
    //    integrators.
    Coefficient *alpha = new ConstantCoefficient(1.0);
    Coefficient *beta  = new ConstantCoefficient(1.0);
    BilinearForm *a = new BilinearForm(fespace);
+   if (pa) { a->SetAssemblyLevel(AssemblyLevel::PARTIAL); }
    a->AddDomainIntegrator(new DivDivIntegrator(*alpha));
    a->AddDomainIntegrator(new VectorFEMassIntegrator(*beta));
 
-   // 9. Assemble the bilinear form and the corresponding linear system,
+   // 10. Assemble the bilinear form and the corresponding linear system,
    //    applying any necessary transformations such as: eliminating boundary
    //    conditions, applying conforming constraints for non-conforming AMR,
    //    static condensation, hybridization, etc.
@@ -167,32 +186,47 @@ int main(int argc, char *argv[])
    }
    a->Assemble();
 
-   SparseMatrix A;
+   OperatorPtr A;
    Vector B, X;
    a->FormLinearSystem(ess_tdof_list, x, *b, A, X, B);
 
-   cout << "Size of linear system: " << A.Height() << endl;
+   cout << "Size of linear system: " << A->Height() << endl;
 
+   // 11. Solve the linear system A X = B.
+   if (!pa)
+   {
 #ifndef MFEM_USE_SUITESPARSE
-   // 10. Define a simple symmetric Gauss-Seidel preconditioner and use it to
-   //     solve the system A X = B with PCG.
-   GSSmoother M(A);
-   PCG(A, M, B, X, 1, 10000, 1e-20, 0.0);
+      // A holds a SparseMatrix here; use symmetric Gauss-Seidel with PCG.
+      GSSmoother M(*A.As<SparseMatrix>());
+      PCG(*A, M, B, X, 1, 10000, 1e-20, 0.0);
 #else
-   // 10. If compiled with SuiteSparse support, use UMFPACK to solve the system.
-   UMFPackSolver umf_solver;
-   umf_solver.Control[UMFPACK_ORDERING] = UMFPACK_ORDERING_METIS;
-   umf_solver.SetOperator(A);
-   umf_solver.Mult(B, X);
+      // If MFEM was compiled with SuiteSparse, use UMFPACK to solve the system.
+      UMFPackSolver umf_solver;
+      umf_solver.Control[UMFPACK_ORDERING] = UMFPACK_ORDERING_METIS;
+      umf_solver.SetOperator(*A);
+      umf_solver.Mult(B, X);
 #endif
+   }
+   else // Partial assembly: Jacobi PCG on tensor bases, plain CG otherwise
+   {
+      if (UsesTensorBasis(*fespace))
+      {
+         OperatorJacobiSmoother M(*a, ess_tdof_list);
+         PCG(*A, M, B, X, 1, 10000, 1e-20, 0.0);
+      }
+      else
+      {
+         CG(*A, B, X, 1, 10000, 1e-20, 0.0);
+      }
+   }
 
-   // 11. Recover the solution as a finite element grid function.
+   // 12. Recover the solution as a finite element grid function.
    a->RecoverFEMSolution(X, *b, x);
 
-   // 12. Compute and print the L^2 norm of the error.
+   // 13. Compute and print the L^2 norm of the error.
    cout << "\n|| F_h - F ||_{L^2} = " << x.ComputeL2Error(F) << '\n' << endl;
 
-   // 13. Save the refined mesh and the solution. This output can be viewed
+   // 14. Save the refined mesh and the solution. This output can be viewed
    //     later using GLVis: "glvis -m refined.mesh -g sol.gf".
    {
       ofstream mesh_ofs("refined.mesh");
@@ -203,7 +237,7 @@ int main(int argc, char *argv[])
       x.Save(sol_ofs);
    }
 
-   // 14. Send the solution by socket to a GLVis server.
+   // 15. Send the solution by socket to a GLVis server.
    if (visualization)
    {
       char vishost[] = "localhost";
@@ -213,7 +247,7 @@ int main(int argc, char *argv[])
       sol_sock << "solution\n" << *mesh << x << flush;
    }
 
-   // 15. Free the used memory.
+   // 16. Free the used memory.
    delete hfes;
    delete hfec;
    delete a;
@@ -235,7 +269,7 @@ void F_exact(const Vector &p, Vector &F)
 
    double x = p(0);
    double y = p(1);
-   // double z = (dim == 3) ? p(2) : 0.0;
+   // double z = (dim == 3) ? p(2) : 0.0; // Uncomment if F is changed to depend on z
 
    F(0) = cos(kappa*x)*sin(kappa*y);
    F(1) = cos(kappa*y)*sin(kappa*x);
@@ -252,7 +286,7 @@ void f_exact(const Vector &p, Vector &f)
 
    double x = p(0);
    double y = p(1);
-   // double z = (dim == 3) ? p(2) : 0.0;
+   // double z = (dim == 3) ? p(2) : 0.0; // Uncomment if f is changed to depend on z
 
    double temp = 1 + 2*kappa*kappa;
 
diff --git a/examples/ex4p.cpp b/examples/ex4p.cpp
index ee81d7c4e6b..282ebb4fa3c 100644
--- a/examples/ex4p.cpp
+++ b/examples/ex4p.cpp
@@ -6,6 +6,7 @@
 //               mpirun -np 4 ex4p -m ../data/star.mesh
 //               mpirun -np 4 ex4p -m ../data/beam-tet.mesh
 //               mpirun -np 4 ex4p -m ../data/beam-hex.mesh
+//               mpirun -np 4 ex4p -m ../data/beam-hex.mesh -o 2 -pa
 //               mpirun -np 4 ex4p -m ../data/escher.mesh -o 2 -sc
 //               mpirun -np 4 ex4p -m ../data/fichera.mesh -o 2 -hb
 //               mpirun -np 4 ex4p -m ../data/fichera-q2.vtk
@@ -19,6 +20,12 @@
 //               mpirun -np 4 ex4p -m ../data/amr-hex.mesh -o 2 -hb
 //               mpirun -np 4 ex4p -m ../data/star-surf.mesh -o 3 -hb
 //
+// Device sample runs:
+//               mpirun -np 4 ex4p -m ../data/star.mesh -pa -d cuda
+//               mpirun -np 4 ex4p -m ../data/star.mesh -pa -d raja-cuda
+//               mpirun -np 4 ex4p -m ../data/star.mesh -pa -d raja-omp
+//               mpirun -np 4 ex4p -m ../data/beam-hex.mesh -pa -d cuda
+//
 // Description:  This example code solves a simple 2D/3D H(div) diffusion
 //               problem corresponding to the second order definite equation
 //               -grad(alpha div F) + beta F = f with boundary condition F dot n
@@ -60,6 +67,8 @@ int main(int argc, char *argv[])
    bool set_bc = true;
    bool static_cond = false;
    bool hybridization = false;
+   bool pa = false;
+   const char *device_config = "cpu";
    bool visualization = 1;
 
    OptionsParser args(argc, argv);
@@ -75,6 +84,10 @@ int main(int argc, char *argv[])
                   "--no-static-condensation", "Enable static condensation.");
    args.AddOption(&hybridization, "-hb", "--hybridization", "-no-hb",
                   "--no-hybridization", "Enable hybridization.");
+   args.AddOption(&pa, "-pa", "--partial-assembly", "-no-pa",
+                  "--no-partial-assembly", "Enable Partial Assembly.");
+   args.AddOption(&device_config, "-d", "--device",
+                  "Device configuration string, see Device::Configure().");
    args.AddOption(&visualization, "-vis", "--visualization", "-no-vis",
                   "--no-visualization",
                   "Enable or disable GLVis visualization.");
@@ -94,14 +107,19 @@ int main(int argc, char *argv[])
    }
    kappa = freq * M_PI;
 
-   // 3. Read the (serial) mesh from the given mesh file on all processors.  We
+   // 3. Enable hardware devices such as GPUs, and programming models such as
+   //    CUDA, OCCA, RAJA and OpenMP based on command line options.
+   Device device(device_config);
+   if (myid == 0) { device.Print(); }
+
+   // 4. Read the (serial) mesh from the given mesh file on all processors.  We
    //    can handle triangular, quadrilateral, tetrahedral, hexahedral, surface
    //    and volume, as well as periodic meshes with the same code.
    Mesh *mesh = new Mesh(mesh_file, 1, 1);
    int dim = mesh->Dimension();
    int sdim = mesh->SpaceDimension();
 
-   // 4. Refine the serial mesh on all processors to increase the resolution. In
+   // 5. Refine the serial mesh on all processors to increase the resolution. In
    //    this example we do 'ref_levels' of uniform refinement. We choose
    //    'ref_levels' to be the largest number that gives a final mesh with no
    //    more than 1,000 elements.
@@ -114,7 +132,7 @@ int main(int argc, char *argv[])
       }
    }
 
-   // 5. Define a parallel mesh by a partitioning of the serial mesh. Refine
+   // 6. Define a parallel mesh by a partitioning of the serial mesh. Refine
    //    this mesh further in parallel to increase the resolution. Once the
    //    parallel mesh is defined, the serial mesh can be deleted. Tetrahedral
    //    meshes need to be reoriented before we can define high-order Nedelec
@@ -130,7 +148,7 @@ int main(int argc, char *argv[])
    }
    pmesh->ReorientTetMesh();
 
-   // 6. Define a parallel finite element space on the parallel mesh. Here we
+   // 7. Define a parallel finite element space on the parallel mesh. Here we
    //    use the Raviart-Thomas finite elements of the specified order.
    FiniteElementCollection *fec = new RT_FECollection(order-1, dim);
    ParFiniteElementSpace *fespace = new ParFiniteElementSpace(pmesh, fec);
@@ -140,7 +158,7 @@ int main(int argc, char *argv[])
       cout << "Number of finite element unknowns: " << size << endl;
    }
 
-   // 7. Determine the list of true (i.e. parallel conforming) essential
+   // 8. Determine the list of true (i.e. parallel conforming) essential
    //    boundary dofs. In this example, the boundary conditions are defined
    //    by marking all the boundary attributes from the mesh as essential
    //    (Dirichlet) and converting them to a list of true dofs.
@@ -152,7 +170,7 @@ int main(int argc, char *argv[])
       fespace->GetEssentialTrueDofs(ess_bdr, ess_tdof_list);
    }
 
-   // 8. Set up the parallel linear form b(.) which corresponds to the
+   // 9. Set up the parallel linear form b(.) which corresponds to the
    //    right-hand side of the FEM linear system, which in this case is
    //    (f,phi_i) where f is given by the function f_exact and phi_i are the
    //    basis functions in the finite element fespace.
@@ -161,7 +179,7 @@ int main(int argc, char *argv[])
    b->AddDomainIntegrator(new VectorFEDomainLFIntegrator(f));
    b->Assemble();
 
-   // 9. Define the solution vector x as a parallel finite element grid function
+   // 10. Define the solution vector x as a parallel finite element grid function
    //    corresponding to fespace. Initialize x by projecting the exact
    //    solution. Note that only values from the boundary faces will be used
    //    when eliminating the non-homogeneous boundary condition to modify the
@@ -170,16 +188,17 @@ int main(int argc, char *argv[])
    VectorFunctionCoefficient F(sdim, F_exact);
    x.ProjectCoefficient(F);
 
-   // 10. Set up the parallel bilinear form corresponding to the H(div)
+   // 11. Set up the parallel bilinear form corresponding to the H(div)
    //     diffusion operator grad alpha div + beta I, by adding the div-div and
    //     the mass domain integrators.
    Coefficient *alpha = new ConstantCoefficient(1.0);
    Coefficient *beta  = new ConstantCoefficient(1.0);
    ParBilinearForm *a = new ParBilinearForm(fespace);
+   if (pa) { a->SetAssemblyLevel(AssemblyLevel::PARTIAL); }
    a->AddDomainIntegrator(new DivDivIntegrator(*alpha));
    a->AddDomainIntegrator(new VectorFEMassIntegrator(*beta));
 
-   // 11. Assemble the parallel bilinear form and the corresponding linear
+   // 12. Assemble the parallel bilinear form and the corresponding linear
    //     system, applying any necessary transformations such as: parallel
    //     assembly, eliminating boundary conditions, applying conforming
    //     constraints for non-conforming AMR, static condensation,
@@ -199,41 +218,43 @@ int main(int argc, char *argv[])
    }
    a->Assemble();
 
-   HypreParMatrix A;
+   OperatorPtr A;
    Vector B, X;
    a->FormLinearSystem(ess_tdof_list, x, *b, A, X, B);
 
-   HYPRE_Int glob_size = A.GetGlobalNumRows();
-   if (myid == 0)
+   if (myid == 0 && !pa)
    {
-      cout << "Size of linear system: " << glob_size << endl;
+      cout << "Size of linear system: "
+           << A.As<HypreParMatrix>()->GetGlobalNumRows() << endl;
    }
 
-   // 12. Define and apply a parallel PCG solver for A X = B with the 2D AMS or
+   // 13. Define and apply a parallel PCG solver for A X = B with the 2D AMS or
    //     the 3D ADS preconditioners from hypre. If using hybridization, the
-   //     system is preconditioned with hypre's BoomerAMG.
-   HypreSolver *prec = NULL;
-   CGSolver *pcg = new CGSolver(A.GetComm());
-   pcg->SetOperator(A);
+   //     system is preconditioned with hypre's BoomerAMG. Partial assembly is
+   //     checked first and uses Jacobi; A is cast to HypreParMatrix otherwise.
+   Solver *prec = NULL;
+   CGSolver *pcg = new CGSolver(MPI_COMM_WORLD);
+   pcg->SetOperator(*A);
    pcg->SetRelTol(1e-12);
-   pcg->SetMaxIter(500);
+   pcg->SetMaxIter(2000);
    pcg->SetPrintLevel(1);
-   if (hybridization) { prec = new HypreBoomerAMG(A); }
+   if (pa) { prec = new OperatorJacobiSmoother(*a, ess_tdof_list); }
+   else if (hybridization) { prec = new HypreBoomerAMG(*A.As<HypreParMatrix>()); }
    else
    {
       ParFiniteElementSpace *prec_fespace =
          (a->StaticCondensationIsEnabled() ? a->SCParFESpace() : fespace);
-      if (dim == 2)   { prec = new HypreAMS(A, prec_fespace); }
-      else            { prec = new HypreADS(A, prec_fespace); }
+      if (dim == 2)   { prec = new HypreAMS(*A.As<HypreParMatrix>(), prec_fespace); }
+      else            { prec = new HypreADS(*A.As<HypreParMatrix>(), prec_fespace); }
    }
    pcg->SetPreconditioner(*prec);
    pcg->Mult(B, X);
 
-   // 13. Recover the parallel grid function corresponding to X. This is the
+   // 14. Recover the parallel grid function corresponding to X. This is the
    //     local finite element solution on each processor.
    a->RecoverFEMSolution(X, *b, x);
 
-   // 14. Compute and print the L^2 norm of the error.
+   // 15. Compute and print the L^2 norm of the error.
    {
       double err = x.ComputeL2Error(F);
       if (myid == 0)
@@ -242,7 +263,7 @@ int main(int argc, char *argv[])
       }
    }
 
-   // 15. Save the refined mesh and the solution in parallel. This output can
+   // 16. Save the refined mesh and the solution in parallel. This output can
    //     be viewed later using GLVis: "glvis -np <np> -m mesh -g sol".
    {
       ostringstream mesh_name, sol_name;
@@ -258,7 +279,7 @@ int main(int argc, char *argv[])
       x.Save(sol_ofs);
    }
 
-   // 16. Send the solution by socket to a GLVis server.
+   // 17. Send the solution by socket to a GLVis server.
    if (visualization)
    {
       char vishost[] = "localhost";
@@ -269,7 +290,7 @@ int main(int argc, char *argv[])
       sol_sock << "solution\n" << *pmesh << x << flush;
    }
 
-   // 17. Free the used memory.
+   // 18. Free the used memory.
    delete pcg;
    delete prec;
    delete hfes;
@@ -295,7 +316,7 @@ void F_exact(const Vector &p, Vector &F)
 
    double x = p(0);
    double y = p(1);
-   // double z = (dim == 3) ? p(2) : 0.0;
+   // double z = (dim == 3) ? p(2) : 0.0; // Uncomment if F is changed to depend on z
 
    F(0) = cos(kappa*x)*sin(kappa*y);
    F(1) = cos(kappa*y)*sin(kappa*x);
@@ -312,7 +333,7 @@ void f_exact(const Vector &p, Vector &f)
 
    double x = p(0);
    double y = p(1);
-   // double z = (dim == 3) ? p(2) : 0.0;
+   // double z = (dim == 3) ? p(2) : 0.0; // Uncomment if f is changed to depend on z
 
    double temp = 1 + 2*kappa*kappa;
 
diff --git a/examples/ex5.cpp b/examples/ex5.cpp
index 596157e9c52..b08ecb7aff1 100644
--- a/examples/ex5.cpp
+++ b/examples/ex5.cpp
@@ -4,8 +4,10 @@
 //
 // Sample runs:  ex5 -m ../data/square-disc.mesh
 //               ex5 -m ../data/star.mesh
+//               ex5 -m ../data/star.mesh -pa
 //               ex5 -m ../data/beam-tet.mesh
 //               ex5 -m ../data/beam-hex.mesh
+//               ex5 -m ../data/beam-hex.mesh -pa
 //               ex5 -m ../data/escher.mesh
 //               ex5 -m ../data/fichera.mesh
 //
@@ -47,6 +49,7 @@ int main(int argc, char *argv[])
    // 1. Parse command-line options.
    const char *mesh_file = "../data/star.mesh";
    int order = 1;
+   bool pa = false;
    bool visualization = 1;
 
    OptionsParser args(argc, argv);
@@ -54,6 +57,8 @@ int main(int argc, char *argv[])
                   "Mesh file to use.");
    args.AddOption(&order, "-o", "--order",
                   "Finite element order (polynomial degree).");
+   args.AddOption(&pa, "-pa", "--partial-assembly", "-no-pa",
+                  "--no-partial-assembly", "Enable Partial Assembly.");
    args.AddOption(&visualization, "-vis", "--visualization", "-no-vis",
                   "--no-visualization",
                   "Enable or disable GLVis visualization.");
@@ -146,22 +151,39 @@ int main(int argc, char *argv[])
    BilinearForm *mVarf(new BilinearForm(R_space));
    MixedBilinearForm *bVarf(new MixedBilinearForm(R_space, W_space));
 
+   if (pa) { mVarf->SetAssemblyLevel(AssemblyLevel::PARTIAL); }
    mVarf->AddDomainIntegrator(new VectorFEMassIntegrator(k));
    mVarf->Assemble();
-   mVarf->Finalize();
-   SparseMatrix &M(mVarf->SpMat());
+   if (!pa) { mVarf->Finalize(); }
 
+   if (pa) { bVarf->SetAssemblyLevel(AssemblyLevel::PARTIAL); }
    bVarf->AddDomainIntegrator(new VectorFEDivergenceIntegrator);
    bVarf->Assemble();
-   bVarf->Finalize();
-   SparseMatrix & B(bVarf->SpMat());
-   B *= -1.;
-   SparseMatrix *BT = Transpose(B);
+   if (!pa) { bVarf->Finalize(); }
 
-   BlockMatrix darcyMatrix(block_offsets);
-   darcyMatrix.SetBlock(0,0, &M);
-   darcyMatrix.SetBlock(0,1, BT);
-   darcyMatrix.SetBlock(1,0, &B);
+   BlockOperator darcyOp(block_offsets);
+
+   TransposeOperator *Bt = NULL;
+
+   if (pa)
+   {
+      Bt = new TransposeOperator(bVarf);
+
+      darcyOp.SetBlock(0,0, mVarf);
+      darcyOp.SetBlock(0,1, Bt, -1.0);
+      darcyOp.SetBlock(1,0, bVarf, -1.0);
+   }
+   else
+   {
+      SparseMatrix &M(mVarf->SpMat());
+      SparseMatrix &B(bVarf->SpMat());
+      B *= -1.;
+      Bt = new TransposeOperator(&B);
+
+      darcyOp.SetBlock(0,0, &M);
+      darcyOp.SetBlock(0,1, Bt);
+      darcyOp.SetBlock(1,0, &B);
+   }
 
    // 9. Construct the operators for preconditioner
    //
@@ -170,27 +192,57 @@ int main(int argc, char *argv[])
    //
    //     Here we use Symmetric Gauss-Seidel to approximate the inverse of the
    //     pressure Schur Complement
-   SparseMatrix *MinvBt = Transpose(B);
-   Vector Md(M.Height());
-   M.GetDiag(Md);
-   for (int i = 0; i < Md.Size(); i++)
+   SparseMatrix *MinvBt = NULL;
+   Vector Md(mVarf->Height());
+
+   BlockDiagonalPreconditioner darcyPrec(block_offsets);
+   Solver *invM, *invS;
+   SparseMatrix *S = NULL;
+
+   if (pa)
    {
-      MinvBt->ScaleRow(i, 1./Md(i));
+      mVarf->AssembleDiagonal(Md);
+      Vector invMd(mVarf->Height());
+      for (int i=0; i<mVarf->Height(); ++i)
+      {
+         invMd(i) = 1.0 / Md(i);
+      }
+
+      Vector BMBt_diag(bVarf->Height());
+      bVarf->AssembleDiagonal_ADAt(invMd, BMBt_diag);
+
+      Array<int> ess_tdof_list;  // empty
+
+      invM = new OperatorJacobiSmoother(Md, ess_tdof_list);
+      invS = new OperatorJacobiSmoother(BMBt_diag, ess_tdof_list);
    }
-   SparseMatrix *S = Mult(B, *MinvBt);
+   else
+   {
+      SparseMatrix &M(mVarf->SpMat());
+      M.GetDiag(Md);
+
+      SparseMatrix &B(bVarf->SpMat());
+      MinvBt = Transpose(B);
+
+      for (int i = 0; i < Md.Size(); i++)
+      {
+         MinvBt->ScaleRow(i, 1./Md(i));
+      }
+
+      S = Mult(B, *MinvBt);
+
+      invM = new DSmoother(M);
 
-   Solver *invM, *invS;
-   invM = new DSmoother(M);
 #ifndef MFEM_USE_SUITESPARSE
-   invS = new GSSmoother(*S);
+      invS = new GSSmoother(*S);
 #else
-   invS = new UMFPackSolver(*S);
+      invS = new UMFPackSolver(*S);
 #endif
+   }
 
    invM->iterative_mode = false;
    invS->iterative_mode = false;
 
-   BlockDiagonalPreconditioner darcyPrec(block_offsets);
    darcyPrec.SetDiagonalBlock(0, invM);
    darcyPrec.SetDiagonalBlock(1, invS);
 
@@ -206,7 +258,7 @@ int main(int argc, char *argv[])
    solver.SetAbsTol(atol);
    solver.SetRelTol(rtol);
    solver.SetMaxIter(maxIter);
-   solver.SetOperator(darcyMatrix);
+   solver.SetOperator(darcyOp);
    solver.SetPreconditioner(darcyPrec);
    solver.SetPrintLevel(1);
    x = 0.0;
@@ -295,8 +347,8 @@ int main(int argc, char *argv[])
    delete invM;
    delete invS;
    delete S;
+   delete Bt;
    delete MinvBt;
-   delete BT;
    delete mVarf;
    delete bVarf;
    delete W_space;
diff --git a/examples/ex5p.cpp b/examples/ex5p.cpp
index 389dcb6b363..546a1df3a5a 100644
--- a/examples/ex5p.cpp
+++ b/examples/ex5p.cpp
@@ -4,8 +4,10 @@
 //
 // Sample runs:  mpirun -np 4 ex5p -m ../data/square-disc.mesh
 //               mpirun -np 4 ex5p -m ../data/star.mesh
+//               mpirun -np 4 ex5p -m ../data/star.mesh -r 2 -pa
 //               mpirun -np 4 ex5p -m ../data/beam-tet.mesh
 //               mpirun -np 4 ex5p -m ../data/beam-hex.mesh
+//               mpirun -np 4 ex5p -m ../data/beam-hex.mesh -pa
 //               mpirun -np 4 ex5p -m ../data/escher.mesh
 //               mpirun -np 4 ex5p -m ../data/fichera.mesh
 //
@@ -54,19 +56,25 @@ int main(int argc, char *argv[])
 
    // 2. Parse command-line options.
    const char *mesh_file = "../data/star.mesh";
+   int ref_levels = -1;
    int order = 1;
    bool par_format = false;
+   bool pa = false;
    bool visualization = 1;
    bool adios2 = false;
 
    OptionsParser args(argc, argv);
    args.AddOption(&mesh_file, "-m", "--mesh",
                   "Mesh file to use.");
+   args.AddOption(&ref_levels, "-r", "--refine",
+                  "Number of times to refine the mesh uniformly.");
    args.AddOption(&order, "-o", "--order",
                   "Finite element order (polynomial degree).");
    args.AddOption(&par_format, "-pf", "--parallel-format", "-sf",
                   "--serial-format",
                   "Format to use when saving the results for VisIt.");
+   args.AddOption(&pa, "-pa", "--partial-assembly", "-no-pa",
+                  "--no-partial-assembly", "Enable Partial Assembly.");
    args.AddOption(&visualization, "-vis", "--visualization", "-no-vis",
                   "--no-visualization",
                   "Enable or disable GLVis visualization.");
@@ -97,10 +105,13 @@ int main(int argc, char *argv[])
    // 4. Refine the serial mesh on all processors to increase the resolution. In
    //    this example we do 'ref_levels' of uniform refinement. We choose
    //    'ref_levels' to be the largest number that gives a final mesh with no
-   //    more than 10,000 elements.
+   //    more than 10,000 elements, unless the user specifies it as input.
    {
-      int ref_levels =
-         (int)floor(log(10000./mesh->GetNE())/log(2.)/dim);
+      if (ref_levels == -1)
+      {
+         ref_levels = (int)floor(log(10000./mesh->GetNE())/log(2.)/dim);
+      }
+
       for (int l = 0; l < ref_levels; l++)
       {
          mesh->UniformRefinement();
@@ -196,25 +207,47 @@ int main(int argc, char *argv[])
    ParBilinearForm *mVarf(new ParBilinearForm(R_space));
    ParMixedBilinearForm *bVarf(new ParMixedBilinearForm(R_space, W_space));
 
-   HypreParMatrix *M, *B;
+   HypreParMatrix *M = NULL;
+   HypreParMatrix *B = NULL;
 
+   if (pa) { mVarf->SetAssemblyLevel(AssemblyLevel::PARTIAL); }
    mVarf->AddDomainIntegrator(new VectorFEMassIntegrator(k));
    mVarf->Assemble();
-   mVarf->Finalize();
-   M = mVarf->ParallelAssemble();
+   if (!pa) { mVarf->Finalize(); }
 
+   if (pa) { bVarf->SetAssemblyLevel(AssemblyLevel::PARTIAL); }
    bVarf->AddDomainIntegrator(new VectorFEDivergenceIntegrator);
    bVarf->Assemble();
-   bVarf->Finalize();
-   B = bVarf->ParallelAssemble();
-   (*B) *= -1;
-
-   HypreParMatrix *BT = B->Transpose();
+   if (!pa) { bVarf->Finalize(); }
 
    BlockOperator *darcyOp = new BlockOperator(block_trueOffsets);
-   darcyOp->SetBlock(0,0, M);
-   darcyOp->SetBlock(0,1, BT);
-   darcyOp->SetBlock(1,0, B);
+
+   Array<int> empty_tdof_list;  // empty
+   OperatorPtr opM, opB;
+
+   TransposeOperator *Bt = NULL;
+
+   if (pa)
+   {
+      mVarf->FormSystemMatrix(empty_tdof_list, opM);
+      bVarf->FormRectangularSystemMatrix(empty_tdof_list, empty_tdof_list, opB);
+      Bt = new TransposeOperator(opB.Ptr());
+
+      darcyOp->SetBlock(0,0, opM.Ptr());
+      darcyOp->SetBlock(0,1, Bt, -1.0);
+      darcyOp->SetBlock(1,0, opB.Ptr(), -1.0);
+   }
+   else
+   {
+      M = mVarf->ParallelAssemble();
+      B = bVarf->ParallelAssemble();
+      (*B) *= -1;
+      Bt = new TransposeOperator(B);
+
+      darcyOp->SetBlock(0,0, M);
+      darcyOp->SetBlock(0,1, Bt);
+      darcyOp->SetBlock(1,0, B);
+   }
 
    // 11. Construct the operators for preconditioner
    //
@@ -223,17 +256,43 @@ int main(int argc, char *argv[])
    //
    //     Here we use Symmetric Gauss-Seidel to approximate the inverse of the
    //     pressure Schur Complement.
-   HypreParMatrix *MinvBt = B->Transpose();
-   HypreParVector *Md = new HypreParVector(MPI_COMM_WORLD, M->GetGlobalNumRows(),
-                                           M->GetRowStarts());
-   M->GetDiag(*Md);
+   HypreParMatrix *MinvBt = NULL;
+   HypreParVector *Md = NULL;
+   HypreParMatrix *S = NULL;
+   Vector Md_PA;
+   Solver *invM, *invS;
+
+   if (pa)
+   {
+      Md_PA.SetSize(R_space->GetTrueVSize());
+      mVarf->AssembleDiagonal(Md_PA);
+      Vector invMd(Md_PA.Size());
+      for (int i=0; i<Md_PA.Size(); ++i)
+      {
+         invMd(i) = 1.0 / Md_PA(i);
+      }
+
+      Vector BMBt_diag(W_space->GetTrueVSize());
+      bVarf->AssembleDiagonal_ADAt(invMd, BMBt_diag);
 
-   MinvBt->InvScaleRows(*Md);
-   HypreParMatrix *S = ParMult(B, MinvBt);
+      Array<int> ess_tdof_list;  // empty
 
-   HypreSolver *invM, *invS;
-   invM = new HypreDiagScale(*M);
-   invS = new HypreBoomerAMG(*S);
+      invM = new OperatorJacobiSmoother(Md_PA, ess_tdof_list);
+      invS = new OperatorJacobiSmoother(BMBt_diag, ess_tdof_list);
+   }
+   else
+   {
+      Md = new HypreParVector(MPI_COMM_WORLD, M->GetGlobalNumRows(),
+                              M->GetRowStarts());
+      M->GetDiag(*Md);
+
+      MinvBt = B->Transpose();
+      MinvBt->InvScaleRows(*Md);
+      S = ParMult(B, MinvBt);
+
+      invM = new HypreDiagScale(*M);
+      invS = new HypreBoomerAMG(*S);
+   }
 
    invM->iterative_mode = false;
    invS->iterative_mode = false;
@@ -245,7 +304,7 @@ int main(int argc, char *argv[])
 
    // 12. Solve the linear system with MINRES.
    //     Check the norm of the unpreconditioned residual.
-   int maxIter(500);
+   int maxIter(pa ? 1000 : 500);
    double rtol(1.e-6);
    double atol(1.e-10);
 
@@ -395,7 +454,7 @@ int main(int argc, char *argv[])
    delete S;
    delete Md;
    delete MinvBt;
-   delete BT;
+   delete Bt;
    delete B;
    delete M;
    delete mVarf;
diff --git a/examples/ex9.cpp b/examples/ex9.cpp
index 91175dc0215..03a73f8bd66 100644
--- a/examples/ex9.cpp
+++ b/examples/ex9.cpp
@@ -19,6 +19,7 @@
 //
 // Device sample runs:
 //    ex9 -pa
+//    ex9 -ea
 //    ex9 -pa -m ../data/periodic-cube.mesh
 //    ex9 -pa -m ../data/periodic-cube.mesh -d cuda
 //
@@ -142,6 +143,7 @@ int main(int argc, char *argv[])
    int ref_levels = 2;
    int order = 3;
    bool pa = false;
+   bool ea = false;
    const char *device_config = "cpu";
    int ode_solver_type = 4;
    double t_final = 10.0;
@@ -166,6 +168,8 @@ int main(int argc, char *argv[])
                   "Order (degree) of the finite elements.");
    args.AddOption(&pa, "-pa", "--partial-assembly", "-no-pa",
                   "--no-partial-assembly", "Enable Partial Assembly.");
+   args.AddOption(&ea, "-ea", "--element-assembly", "-no-ea",
+                  "--no-element-assembly", "Enable Element Assembly.");
    args.AddOption(&device_config, "-d", "--device",
                   "Device configuration string, see Device::Configure().");
    args.AddOption(&ode_solver_type, "-s", "--ode-solver",
@@ -269,6 +273,11 @@ int main(int argc, char *argv[])
       m.SetAssemblyLevel(AssemblyLevel::PARTIAL);
       k.SetAssemblyLevel(AssemblyLevel::PARTIAL);
    }
+   else if (ea)
+   {
+      m.SetAssemblyLevel(AssemblyLevel::ELEMENT);
+      k.SetAssemblyLevel(AssemblyLevel::ELEMENT);
+   }
    m.AddDomainIntegrator(new MassIntegrator);
    k.AddDomainIntegrator(new ConvectionIntegrator(velocity, -1.0));
    k.AddInteriorFaceIntegrator(
@@ -429,8 +438,9 @@ FE_Evolution::FE_Evolution(BilinearForm &_M, BilinearForm &_K, const Vector &_b)
    : TimeDependentOperator(_M.Height()), M(_M), K(_K), b(_b), z(_M.Height())
 {
    bool pa = M.GetAssemblyLevel() == AssemblyLevel::PARTIAL;
+   bool ea = M.GetAssemblyLevel() == AssemblyLevel::ELEMENT;
    Array<int> ess_tdof_list;
-   if (pa)
+   if (pa || ea)
    {
       M_prec = new OperatorJacobiSmoother(M, ess_tdof_list);
       M_solver.SetOperator(M);
diff --git a/examples/ex9p.cpp b/examples/ex9p.cpp
index b81e8c3471e..68143e1861b 100644
--- a/examples/ex9p.cpp
+++ b/examples/ex9p.cpp
@@ -19,6 +19,7 @@
 //
 // Device sample runs:
 //    mpirun -np 4 ex9p -pa
+//    mpirun -np 4 ex9p -ea
 //    mpirun -np 4 ex9p -pa -m ../data/periodic-cube.mesh
 //    mpirun -np 4 ex9p -pa -m ../data/periodic-cube.mesh -d cuda
 //
@@ -161,6 +162,7 @@ int main(int argc, char *argv[])
    int par_ref_levels = 0;
    int order = 3;
    bool pa = false;
+   bool ea = false;
    const char *device_config = "cpu";
    int ode_solver_type = 4;
    double t_final = 10.0;
@@ -188,6 +190,8 @@ int main(int argc, char *argv[])
                   "Order (degree) of the finite elements.");
    args.AddOption(&pa, "-pa", "--partial-assembly", "-no-pa",
                   "--no-partial-assembly", "Enable Partial Assembly.");
+   args.AddOption(&ea, "-ea", "--element-assembly", "-no-ea",
+                  "--no-element-assembly", "Enable Element Assembly.");
    args.AddOption(&device_config, "-d", "--device",
                   "Device configuration string, see Device::Configure().");
    args.AddOption(&ode_solver_type, "-s", "--ode-solver",
@@ -319,6 +323,11 @@ int main(int argc, char *argv[])
       m->SetAssemblyLevel(AssemblyLevel::PARTIAL);
       k->SetAssemblyLevel(AssemblyLevel::PARTIAL);
    }
+   else if (ea)
+   {
+      m->SetAssemblyLevel(AssemblyLevel::ELEMENT);
+      k->SetAssemblyLevel(AssemblyLevel::ELEMENT);
+   }
    m->AddDomainIntegrator(new MassIntegrator);
    k->AddDomainIntegrator(new ConvectionIntegrator(velocity, -1.0));
    k->AddInteriorFaceIntegrator(
@@ -556,8 +565,9 @@ FE_Evolution::FE_Evolution(ParBilinearForm &_M, ParBilinearForm &_K,
      z(_M.Height())
 {
    bool pa = _M.GetAssemblyLevel()==AssemblyLevel::PARTIAL;
+   bool ea = _M.GetAssemblyLevel()==AssemblyLevel::ELEMENT;
 
-   if (pa)
+   if (pa || ea)
    {
       M.Reset(&_M, false);
       K.Reset(&_K, false);
@@ -571,7 +581,7 @@ FE_Evolution::FE_Evolution(ParBilinearForm &_M, ParBilinearForm &_K,
    M_solver.SetOperator(*M);
 
    Array<int> ess_tdof_list;
-   if (pa)
+   if (pa || ea)
    {
       M_prec = new OperatorJacobiSmoother(_M, ess_tdof_list);
       dg_solver = NULL;
diff --git a/examples/pumi/ex1.cpp b/examples/pumi/ex1.cpp
index 95def29a732..c72d5ab5baf 100644
--- a/examples/pumi/ex1.cpp
+++ b/examples/pumi/ex1.cpp
@@ -32,6 +32,13 @@
 //               is used for the Finite Element order and "-go" is used for the
 //               geometry order. Note that they can be used independently, i.e.
 //               "-o 8 -go 3" solves for 8th order FE on a third order geometry.
+//
+// NOTE:         Model/Mesh files for this example are in the (large) data file
+//               repository of MFEM here https://github.com/mfem/data under the
+//               folder named "pumi", which consists of the following sub-folders:
+//               a) geom -->  model files
+//               b) parallel --> parallel pumi mesh files
+//               c) serial --> serial pumi mesh files
 
 #include "mfem.hpp"
 #include <fstream>
diff --git a/examples/pumi/ex1p.cpp b/examples/pumi/ex1p.cpp
index c17a9aee8dd..acf6b9832fa 100644
--- a/examples/pumi/ex1p.cpp
+++ b/examples/pumi/ex1p.cpp
@@ -36,6 +36,14 @@
 //               option "-o" is used for the Finite Element order and "-go" for
 //               the geometry order. Note that they can be used independently:
 //               "-o 8 -go 3" solves for 8th order FE on third order geometry.
+//
+// NOTE:         Model/Mesh files for this example are in the (large) data file
+//               repository of MFEM here https://github.com/mfem/data under the
+//               folder named "pumi", which consists of the following sub-folders:
+//               a) geom -->  model files
+//               b) parallel --> parallel pumi mesh files
+//               c) serial --> serial pumi mesh files
+
 
 #include "mfem.hpp"
 #include <fstream>
diff --git a/examples/pumi/ex2.cpp b/examples/pumi/ex2.cpp
index f7b54c195d9..e294b0e4b93 100644
--- a/examples/pumi/ex2.cpp
+++ b/examples/pumi/ex2.cpp
@@ -43,6 +43,14 @@
 //               also illustrated.
 //
 //               We recommend viewing Example 1 before viewing this example.
+//
+// NOTE:         Model/Mesh files for this example are in the (large) data file
+//               repository of MFEM here https://github.com/mfem/data under the
+//               folder named "pumi", which consists of the following sub-folders:
+//               a) geom -->  model files
+//               b) parallel --> parallel pumi mesh files
+//               c) serial --> serial pumi mesh files
+
 
 #include "mfem.hpp"
 #include <fstream>
diff --git a/examples/pumi/ex6p.cpp b/examples/pumi/ex6p.cpp
index fc9fb5386e8..70b6cd9554f 100644
--- a/examples/pumi/ex6p.cpp
+++ b/examples/pumi/ex6p.cpp
@@ -1,7 +1,7 @@
 //                       MFEM Example 6 - Parallel Version
 //                              PUMI Modification
 //
-// Compile with: make ex1p
+// Compile with: make ex6p
 //
 // Sample runs:  mpirun -np 8 ex6p
 //
@@ -18,6 +18,13 @@
 //               is added to modify the "adapt_ratio" which is the fraction of
 //               allowable error that scales the output size field of the error
 //               estimator.
+//
+// NOTE:         Model/Mesh files for this example are in the (large) data file
+//               repository of MFEM here https://github.com/mfem/data under the
+//               folder named "pumi", which consists of the following sub-folders:
+//               a) geom -->  model files
+//               b) parallel --> parallel pumi mesh files
+//               c) serial --> serial pumi mesh files
 
 #include "mfem.hpp"
 #include <fstream>
@@ -332,7 +339,6 @@ int main(int argc, char *argv[])
 
       apf::destroyField(Tmag_field);
       apf::destroyField(ipfield);
-      apf::destroyNumbering(pumi_mesh->findNumbering("LocalVertexNumbering"));
 
       // 18. Perform MesAdapt.
       ma::Input* erinput = ma::configure(pumi_mesh, sizefield);
diff --git a/fem/CMakeLists.txt b/fem/CMakeLists.txt
index 1a6417b8db6..4b7dc7bdbda 100644
--- a/fem/CMakeLists.txt
+++ b/fem/CMakeLists.txt
@@ -13,13 +13,20 @@ set(SRCS
   bilinearform.cpp
   bilinearform_ext.cpp
   bilininteg.cpp
-  bilininteg_convection.cpp
-  bilininteg_dgtrace.cpp
-  bilininteg_diffusion.cpp
+  bilininteg_convection_pa.cpp
+  bilininteg_convection_ea.cpp
+  bilininteg_dgtrace_pa.cpp
+  bilininteg_dgtrace_ea.cpp
+  bilininteg_diffusion_pa.cpp
+  bilininteg_diffusion_ea.cpp
   bilininteg_divergence.cpp
   bilininteg_hcurl.cpp
+  bilininteg_hdiv.cpp
+  bilininteg_vectorfe.cpp
   bilininteg_gradient.cpp
-  bilininteg_mass.cpp
+  bilininteg_mass_pa.cpp
+  bilininteg_mass_ea.cpp
+  bilininteg_transpose_ea.cpp
   bilininteg_vecdiffusion.cpp
   bilininteg_vecmass.cpp
   coefficient.cpp
diff --git a/fem/adios2datacollection.cpp b/fem/adios2datacollection.cpp
index 8c2c8e64259..51703c01cc0 100644
--- a/fem/adios2datacollection.cpp
+++ b/fem/adios2datacollection.cpp
@@ -15,6 +15,8 @@
 
 #include "adios2datacollection.hpp"
 
+#ifdef MFEM_USE_ADIOS2
+
 namespace mfem
 {
 
@@ -87,4 +89,4 @@ noexcept
 
 } //end namespace mfem
 
-
+#endif // MFEM_USE_ADIOS2
diff --git a/fem/adios2datacollection.hpp b/fem/adios2datacollection.hpp
index c611f736755..dd05b4c4787 100644
--- a/fem/adios2datacollection.hpp
+++ b/fem/adios2datacollection.hpp
@@ -17,6 +17,9 @@
 #define MFEM_ADIOS2DATACOLLECTION
 
 #include "../config/config.hpp"
+
+#ifdef MFEM_USE_ADIOS2
+
 #include "../general/adios2stream.hpp"
 #include "datacollection.hpp"
 
@@ -85,4 +88,6 @@ class ADIOS2DataCollection : public DataCollection
 
 }  // namespace mfem
 
+#endif // MFEM_USE_ADIOS2
+
 #endif /* MFEM_ADIOS2DATACOLLECTION */
diff --git a/fem/bilinearform.cpp b/fem/bilinearform.cpp
index 5dd7f08244f..2629834bf68 100644
--- a/fem/bilinearform.cpp
+++ b/fem/bilinearform.cpp
@@ -126,8 +126,7 @@ void BilinearForm::SetAssemblyLevel(AssemblyLevel assembly_level)
          // Use the original BilinearForm implementation for now
          break;
       case AssemblyLevel::ELEMENT:
-         mfem_error("Element assembly not supported yet... stay tuned!");
-         // ext = new EABilinearFormExtension(this);
+         ext = new EABilinearFormExtension(this);
          break;
       case AssemblyLevel::PARTIAL:
          ext = new PABilinearFormExtension(this);
@@ -1432,6 +1431,54 @@ void MixedBilinearForm::Assemble (int skip_zeros)
    }
 }
 
+void MixedBilinearForm::AssembleDiagonal_ADAt(const Vector &D,
+                                              Vector &diag) const
+{
+   // Assemble diag(A D A^t) on true dofs by delegating to the extension.
+   // As the asserts below check, D has the trial space true-dof size and diag
+   // has the test space true-dof size.
+   if (ext)
+   {
+      MFEM_ASSERT(diag.Size() == test_fes->GetTrueVSize(),
+                  "Vector for holding diagonal has wrong size!");
+      MFEM_ASSERT(D.Size() == trial_fes->GetTrueVSize(),
+                  "Vector holding the diagonal matrix D has wrong size!");
+      const Operator *P_trial = trial_fes->GetProlongationMatrix();
+      const Operator *P_test = test_fes->GetProlongationMatrix();
+
+      // Apply P_trial to D unless that prolongation is the identity, in which
+      // case D is handed to the extension unchanged.
+      Vector local_D;
+      const Vector *ext_D = &D;
+      if (!IsIdentityProlongation(P_trial))
+      {
+         local_D.SetSize(P_trial->Height());
+         P_trial->Mult(D, local_D);
+         ext_D = &local_D;
+      }
+
+      if (!IsIdentityProlongation(P_test))
+      {
+         // The extension fills a vector of size P_test->Height(); P_test^t
+         // then maps that result into diag.
+         Vector local_diag(P_test->Height());
+         ext->AssembleDiagonal_ADAt(*ext_D, local_diag);
+         P_test->MultTranspose(local_diag, diag);
+      }
+      else
+      {
+         // Identity test prolongation: the extension writes directly to diag.
+         ext->AssembleDiagonal_ADAt(*ext_D, diag);
+      }
+   }
+   else
+   {
+      // Without an extension this operation has no implementation.
+      MFEM_ABORT("Not implemented. Maybe assemble your bilinear form into a "
+                 "matrix and use SparseMatrix functions?");
+   }
+}
+
 void MixedBilinearForm::ConformingAssemble()
 {
    if (assembly != AssemblyLevel::FULL)
diff --git a/fem/bilinearform.hpp b/fem/bilinearform.hpp
index d616a3dcafd..179ff64f596 100644
--- a/fem/bilinearform.hpp
+++ b/fem/bilinearform.hpp
@@ -25,8 +25,8 @@
 namespace mfem
 {
 
-/// Enumeration defining the assembly level for bilinear and nonlinear form
-/// classes derived from Operator.
+/** @brief Enumeration defining the assembly level for bilinear and nonlinear
+    form classes derived from Operator. */
 enum class AssemblyLevel
 {
    /// Fully assembled form, i.e. a global sparse matrix in MFEM, Hypre or PETSC
@@ -44,15 +44,19 @@ enum class AssemblyLevel
 };
 
 
-/** Class for bilinear form - "Matrix" with associated FE space and
-    BLFIntegrators. */
+/** @brief A "square matrix" operator for the associated FE space and
+    BLFIntegrators. The sum of all the BLFIntegrators can be used to form the
+    matrix M. This class also supports other assembly levels specified via the
+    SetAssemblyLevel() function. */
 class BilinearForm : public Matrix
 {
 protected:
-   /// Sparse matrix to be associated with the form. Owned.
+   /// Sparse matrix \f$ M \f$ to be associated with the form. Owned.
    SparseMatrix *mat;
 
-   /// Matrix used to eliminate b.c. Owned.
+   /** @brief Sparse Matrix \f$ M_e \f$ used to store the eliminations
+        from the b.c.  Owned.
+       \f$ M + M_e = M_{original} \f$ */
    SparseMatrix *mat_e;
 
    /// FE space on which the form lives. Not owned.
@@ -62,12 +66,12 @@ class BilinearForm : public Matrix
    AssemblyLevel assembly;
    /// Element batch size used in the form action (1, 8, num_elems, etc.)
    int batch;
-   /** Extension for supporting Full Assembly (FA), Element Assembly (EA),
+   /** @brief Extension for supporting Full Assembly (FA), Element Assembly (EA),
        Partial Assembly (PA), or Matrix Free assembly (MF). */
    BilinearFormExtension *ext;
 
-   /// Indicates the Mesh::sequence corresponding to the current state of the
-   /// BilinearForm.
+   /** @brief Indicates the Mesh::sequence corresponding to the current state of
+       the BilinearForm. */
    long sequence;
 
    /** @brief Indicates the BilinearFormIntegrator%s stored in #dbfi, #bbfi,
@@ -147,35 +151,43 @@ class BilinearForm : public Matrix
    /// Get the size of the BilinearForm as a square matrix.
    int Size() const { return height; }
 
-   /// Set the desired assembly level. The default is AssemblyLevel::FULL.
-   /** This method must be called before assembly. */
+   /// Set the desired assembly level.
+   /** Valid choices are:
+
+       - AssemblyLevel::FULL  (default)
+       - AssemblyLevel::PARTIAL
+       - AssemblyLevel::ELEMENT
+       - AssemblyLevel::NONE
+
+       This method must be called before assembly. */
    void SetAssemblyLevel(AssemblyLevel assembly_level);
 
    /// Returns the assembly level
    AssemblyLevel GetAssemblyLevel() const { return assembly; }
 
-   /** Enable the use of static condensation. For details see the description
-       for class StaticCondensation in fem/staticcond.hpp This method should be
-       called before assembly. If the number of unknowns after static
+   /** @brief Enable the use of static condensation. For details see the
+       description for class StaticCondensation in fem/staticcond.hpp This method
+       should be called before assembly. If the number of unknowns after static
        condensation is not reduced, it is not enabled. */
    void EnableStaticCondensation();
 
-   /** Check if static condensation was actually enabled by a previous call to
-       EnableStaticCondensation(). */
+   /** @brief Check if static condensation was actually enabled by a previous
+       call to EnableStaticCondensation(). */
    bool StaticCondensationIsEnabled() const { return static_cond; }
 
    /// Return the trace FE space associated with static condensation.
    FiniteElementSpace *SCFESpace() const
    { return static_cond ? static_cond->GetTraceFESpace() : NULL; }
 
-   /** Enable hybridization; for details see the description for class
+   /// Enable hybridization.
+   /** For details see the description for class
        Hybridization in fem/hybridization.hpp. This method should be called
        before assembly. */
    void EnableHybridization(FiniteElementSpace *constr_space,
                             BilinearFormIntegrator *constr_integ,
                             const Array<int> &ess_tdof_list);
 
-   /** For scalar FE spaces, precompute the sparsity pattern of the matrix
+   /** @brief For scalar FE spaces, precompute the sparsity pattern of the matrix
        (assuming dense element matrices) based on the types of integrators
        present in the bilinear form. */
    void UsePrecomputedSparsity(int ps = 1) { precompute_sparsity = ps; }
@@ -194,15 +206,16 @@ class BilinearForm : public Matrix
    /// Use the sparsity of @a A to allocate the internal SparseMatrix.
    void UseSparsity(SparseMatrix &A);
 
-   /** Pre-allocate the internal SparseMatrix before assembly. If the flag
-       'precompute sparsity' is set, the matrix is allocated in CSR format (i.e.
+   /// Pre-allocate the internal SparseMatrix before assembly.
+   /**  If the flag 'precompute sparsity'
+       is set, the matrix is allocated in CSR format (i.e.
        finalized) and the entries are initialized with zeros. */
    void AllocateMatrix() { if (mat == NULL) { AllocMat(); } }
 
-   /// Access all integrators added with AddDomainIntegrator().
+   /// Access all the integrators added with AddDomainIntegrator().
    Array<BilinearFormIntegrator*> *GetDBFI() { return &dbfi; }
 
-   /// Access all integrators added with AddBoundaryIntegrator().
+   /// Access all the integrators added with AddBoundaryIntegrator().
    Array<BilinearFormIntegrator*> *GetBBFI() { return &bbfi; }
    /** @brief Access all boundary markers added with AddBoundaryIntegrator().
        If no marker was specified when the integrator was added, the
@@ -219,64 +232,85 @@ class BilinearForm : public Matrix
        corresponding pointer (to Array<int>) will be NULL. */
    Array<Array<int>*> *GetBFBFI_Marker() { return &bfbfi_marker; }
 
+   /// Returns a constant reference to: \f$ M_{ij} \f$
    const double &operator()(int i, int j) { return (*mat)(i,j); }
 
-   /// Returns reference to a_{ij}.
+   /// Returns a reference to: \f$ M_{ij} \f$
    virtual double &Elem(int i, int j);
 
-   /// Returns constant reference to a_{ij}.
+   /// Returns constant reference to: \f$ M_{ij} \f$
    virtual const double &Elem(int i, int j) const;
 
-   /// Matrix vector multiplication.
+   /// Matrix vector multiplication:  \f$ y = M x \f$
    virtual void Mult(const Vector &x, Vector &y) const;
 
+   /** @brief Matrix vector multiplication with the original uneliminated
+       matrix.  The original matrix is \f$ M + M_e \f$ so we have:
+       \f$ y = M x + M_e x \f$ */
    void FullMult(const Vector &x, Vector &y) const
    { mat->Mult(x, y); mat_e->AddMult(x, y); }
 
+   /// Add the matrix vector multiple to a vector:  \f$ y += a M x \f$
    virtual void AddMult(const Vector &x, Vector &y, const double a = 1.0) const
    { mat -> AddMult (x, y, a); }
 
+   /** @brief Add the original uneliminated matrix vector multiple to a vector.
+       The original matrix is \f$ M + M_e \f$ so we have:
+       \f$ y += M x + M_e x \f$ */
    void FullAddMult(const Vector &x, Vector &y) const
    { mat->AddMult(x, y); mat_e->AddMult(x, y); }
 
+   /// Add the matrix transpose vector multiplication:  \f$ y += a M^T x \f$
    virtual void AddMultTranspose(const Vector & x, Vector & y,
                                  const double a = 1.0) const
    { mat->AddMultTranspose(x, y, a); }
 
+   /** @brief Add the original uneliminated matrix transpose vector
+       multiple to a vector. The original matrix is \f$ M + M_e \f$
+       so we have: \f$ y += M^T x + {M_e}^T x \f$ */
    void FullAddMultTranspose(const Vector & x, Vector & y) const
    { mat->AddMultTranspose(x, y); mat_e->AddMultTranspose(x, y); }
 
+   /// Matrix transpose vector multiplication:  \f$ y = M^T x \f$
    virtual void MultTranspose(const Vector & x, Vector & y) const
    { y = 0.0; AddMultTranspose (x, y); }
 
+   /// Compute \f$ y^T M x \f$
    double InnerProduct(const Vector &x, const Vector &y) const
    { return mat->InnerProduct (x, y); }
 
-   /// Returns a pointer to (approximation) of the matrix inverse.
+   /// Returns a pointer to (approximation) of the matrix inverse:  \f$ M^{-1} \f$
    virtual MatrixInverse *Inverse() const;
 
    /// Finalizes the matrix initialization.
    virtual void Finalize(int skip_zeros = 1);
 
-   /// Returns a reference to the sparse matrix
+   /// Returns a const reference to the sparse matrix.
    const SparseMatrix &SpMat() const
    {
       MFEM_VERIFY(mat, "mat is NULL and can't be dereferenced");
       return *mat;
    }
+
+   /// Returns a reference to the sparse matrix:  \f$ M \f$
    SparseMatrix &SpMat()
    {
       MFEM_VERIFY(mat, "mat is NULL and can't be dereferenced");
       return *mat;
    }
+
+   /**  @brief Nullifies the internal matrix \f$ M \f$ and returns a pointer
+        to it.  Used for transferring ownership. */
    SparseMatrix *LoseMat() { SparseMatrix *tmp = mat; mat = NULL; return tmp; }
 
-   /// Returns a reference to the sparse matrix of eliminated b.c.
+   /// Returns a const reference to the sparse matrix of eliminated b.c.: \f$ M_e \f$
    const SparseMatrix &SpMatElim() const
    {
       MFEM_VERIFY(mat_e, "mat_e is NULL and can't be dereferenced");
       return *mat_e;
    }
+
+   /// Returns a reference to the sparse matrix of eliminated b.c.: \f$ M_e \f$
    SparseMatrix &SpMatElim()
    {
       MFEM_VERIFY(mat_e, "mat_e is NULL and can't be dereferenced");
@@ -311,6 +345,7 @@ class BilinearForm : public Matrix
    void AddBdrFaceIntegrator(BilinearFormIntegrator *bfi,
                              Array<int> &bdr_marker);
 
+   /// Sets all sparse values of \f$ M \f$ and \f$ M_e \f$ to 'a'.
    void operator=(const double a)
    {
       if (mat != NULL) { *mat = a; }
@@ -328,10 +363,10 @@ class BilinearForm : public Matrix
        for an AMR mesh. */
    void AssembleDiagonal(Vector &diag) const;
 
-   /// Get the finite element space prolongation matrix
+   /// Get the finite element space prolongation operator.
    virtual const Operator *GetProlongation() const
    { return fes->GetConformingProlongation(); }
-   /// Get the finite element space restriction matrix
+   /// Get the finite element space restriction operator
    virtual const Operator *GetRestriction() const
    { return fes->GetConformingRestriction(); }
    /// Get the output finite element space prolongation matrix
@@ -491,10 +526,12 @@ class BilinearForm : public Matrix
                                  double value);
 
    /// Eliminate the given @a vdofs. NOTE: here, @a vdofs is a list of DOFs.
+   /** In this case the eliminations are applied to the internal \f$ M \f$
+       and @a rhs without storing the elimination matrix \f$ M_e \f$. */
    void EliminateVDofs(const Array<int> &vdofs, const Vector &sol, Vector &rhs,
                        DiagonalPolicy dpolicy = DIAG_ONE);
 
-   /// Eliminate the given @a vdofs, storing the eliminated part internally.
+   /// Eliminate the given @a vdofs, storing the eliminated part internally in \f$ M_e \f$.
    /** This method works in conjunction with EliminateVDofsInRHS() and allows
        elimination of boundary conditions in multiple right-hand sides. In this
        method, @a vdofs is a list of DOFs. */
@@ -523,9 +560,11 @@ class BilinearForm : public Matrix
    void EliminateVDofsInRHS(const Array<int> &vdofs, const Vector &x,
                             Vector &b);
 
+   /// Compute inner product for full uneliminated matrix \f$ y^T M x + y^T M_e x \f$
    double FullInnerProduct(const Vector &x, const Vector &y) const
    { return mat->InnerProduct(x, y) + mat_e->InnerProduct(x, y); }
 
+   /// Update the @a FiniteElementSpace and delete all data associated with the old one.
    virtual void Update(FiniteElementSpace *nfes = NULL);
 
    /// (DEPRECATED) Return the FE space associated with the BilinearForm.
@@ -537,7 +576,13 @@ class BilinearForm : public Matrix
    /// Read-only access to the associated FiniteElementSpace.
    const FiniteElementSpace *FESpace() const { return fes; }
 
-   /// Sets diagonal policy used upon construction of the linear system
+   /// Sets diagonal policy used upon construction of the linear system.
+   /** Policies include:
+
+       - DIAG_ZERO (Set the diagonal values to zero)
+       - DIAG_ONE  (Set the diagonal values to one)
+       - DIAG_KEEP (Keep the diagonal values)
+   */
    void SetDiagonalPolicy(DiagonalPolicy policy);
 
    /// Indicate that integrators are not owned by the BilinearForm
@@ -550,16 +595,16 @@ class BilinearForm : public Matrix
 
 /**
    Class for assembling of bilinear forms `a(u,v)` defined on different
-   trial and test spaces. The assembled matrix `A` is such that
+   trial and test spaces. The assembled matrix `M` is such that
 
-       a(u,v) = V^t A U
+       a(u,v) = V^t M U
 
    where `U` and `V` are the vectors representing the functions `u` and `v`,
    respectively.  The first argument, `u`, of `a(,)` is in the trial space
    and the second argument, `v`, is in the test space. Thus,
 
-       # of rows of A = dimension of the test space and
-       # of cols of A = dimension of the trial space.
+       # of rows of M = dimension of the test space and
+       # of cols of M = dimension of the trial space.
 
    Both trial and test spaces should be defined on the same mesh.
 */
@@ -628,11 +673,15 @@ class MixedBilinearForm : public Matrix
                      FiniteElementSpace *te_fes,
                      MixedBilinearForm *mbf);
 
+   /// Returns a reference to: \f$ M_{ij} \f$
    virtual double &Elem(int i, int j);
 
+   /// Returns constant reference to: \f$ M_{ij} \f$
    virtual const double &Elem(int i, int j) const;
 
+   /// Matrix multiplication: \f$ y = M x \f$
    virtual void Mult(const Vector & x, Vector & y) const;
+
    virtual void AddMult(const Vector & x, Vector & y,
                         const double a = 1.0) const;
 
@@ -642,6 +691,7 @@ class MixedBilinearForm : public Matrix
 
    virtual MatrixInverse *Inverse() const;
 
+   /// Finalizes the matrix initialization.
    virtual void Finalize(int skip_zeros = 1);
 
    /** Extract the associated matrix as SparseMatrix blocks. The number of
@@ -649,8 +699,14 @@ class MixedBilinearForm : public Matrix
        test and trial spaces, respectively. */
    void GetBlocks(Array2D<SparseMatrix *> &blocks) const;
 
+   /// Returns a const reference to the sparse matrix:  \f$ M \f$
    const SparseMatrix &SpMat() const { return *mat; }
+
+   /// Returns a reference to the sparse matrix:  \f$ M \f$
    SparseMatrix &SpMat() { return *mat; }
+
+   /**  @brief Nullifies the internal matrix \f$ M \f$ and returns a pointer
+        to it.  Used for transferring ownership. */
    SparseMatrix *LoseMat() { SparseMatrix *tmp = mat; mat = NULL; return tmp; }
 
    /// Adds a domain integrator. Assumes ownership of @a bfi.
@@ -697,6 +753,7 @@ class MixedBilinearForm : public Matrix
        corresponding pointer (to Array<int>) will be NULL. */
    Array<Array<int>*> *GetBTFBFI_Marker() { return &btfbfi_marker; }
 
+   /// Sets all sparse values of \f$ M \f$ to @a a.
    void operator=(const double a) { *mat = a; }
 
    /// Set the desired assembly level. The default is AssemblyLevel::FULL.
@@ -705,6 +762,10 @@ class MixedBilinearForm : public Matrix
 
    void Assemble(int skip_zeros = 1);
 
+   /** @brief Assemble the diagonal of ADA^T into diag, where A is this mixed
+       bilinear form and D is a diagonal. */
+   void AssembleDiagonal_ADAt(const Vector &D, Vector &diag) const;
+
    /// Get the input finite element space prolongation matrix
    virtual const Operator *GetProlongation() const
    { return trial_fes->GetProlongationMatrix(); }
diff --git a/fem/bilinearform_ext.cpp b/fem/bilinearform_ext.cpp
index 00e57b093f5..a758370a491 100644
--- a/fem/bilinearform_ext.cpp
+++ b/fem/bilinearform_ext.cpp
@@ -47,7 +47,7 @@ PABilinearFormExtension::PABilinearFormExtension(BilinearForm *form)
    bdr_face_restrict_lex = NULL;
 }
 
-void PABilinearFormExtension::SetupRestrictionOperators()
+void PABilinearFormExtension::SetupRestrictionOperators(const L2FaceValues m)
 {
    ElementDofOrdering ordering = UsesTensorBasis(*a->FESpace())?
                                  ElementDofOrdering::LEXICOGRAPHIC:
@@ -65,7 +65,8 @@ void PABilinearFormExtension::SetupRestrictionOperators()
    if (int_face_restrict_lex == NULL && a->GetFBFI()->Size() > 0)
    {
       int_face_restrict_lex = trialFes->GetFaceRestriction(
-                                 ElementDofOrdering::LEXICOGRAPHIC, FaceType::Interior);
+                                 ElementDofOrdering::LEXICOGRAPHIC,
+                                 FaceType::Interior);
       faceIntX.SetSize(int_face_restrict_lex->Height(), Device::GetMemoryType());
       faceIntY.SetSize(int_face_restrict_lex->Height(), Device::GetMemoryType());
       faceIntY.UseDevice(true); // ensure 'faceIntY = 0.0' is done on device
@@ -74,7 +75,9 @@ void PABilinearFormExtension::SetupRestrictionOperators()
    if (bdr_face_restrict_lex == NULL && a->GetBFBFI()->Size() > 0)
    {
       bdr_face_restrict_lex = trialFes->GetFaceRestriction(
-                                 ElementDofOrdering::LEXICOGRAPHIC, FaceType::Boundary);
+                                 ElementDofOrdering::LEXICOGRAPHIC,
+                                 FaceType::Boundary,
+                                 m);
       faceBdrX.SetSize(bdr_face_restrict_lex->Height(), Device::GetMemoryType());
       faceBdrY.SetSize(bdr_face_restrict_lex->Height(), Device::GetMemoryType());
       faceBdrY.UseDevice(true); // ensure 'faceBoundY = 0.0' is done on device
@@ -83,7 +86,7 @@ void PABilinearFormExtension::SetupRestrictionOperators()
 
 void PABilinearFormExtension::Assemble()
 {
-   SetupRestrictionOperators();
+   SetupRestrictionOperators(L2FaceValues::DoubleValued);
 
    Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
    const int integratorCount = integrators.Size();
@@ -287,6 +290,311 @@ void PABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const
    }
 }
 
+// Element-assembled bilinear forms: construction is delegated to the PA base.
+EABilinearFormExtension::EABilinearFormExtension(BilinearForm *form)
+   : PABilinearFormExtension(form)
+{
+}
+
+void EABilinearFormExtension::Assemble()
+{
+   SetupRestrictionOperators(L2FaceValues::SingleValued);
+   // ea_data holds one dense elemDofs x elemDofs block per mesh element.
+   ne = trialFes->GetMesh()->GetNE();
+   elemDofs = trialFes->GetFE(0)->GetDof(); // taken from element 0 only
+   // Start from zero: each integrator adds its contribution to ea_data.
+   ea_data.SetSize(ne*elemDofs*elemDofs, Device::GetMemoryType());
+   ea_data.UseDevice(true);
+   ea_data = 0.0;
+
+   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
+   const int integratorCount = integrators.Size();
+   for (int i = 0; i < integratorCount; ++i)
+   {
+      integrators[i]->AssembleEA(*a->FESpace(), ea_data);
+   }
+   // Face blocks are sized from the trace element for face 0's geometry.
+   faceDofs = trialFes ->
+              GetTraceElement(0, trialFes->GetMesh()->GetFaceBaseGeometry(0)) ->
+              GetDof();
+   // Interior faces: ea_data_int and ea_data_ext each store 2 blocks per face.
+   Array<BilinearFormIntegrator*> &intFaceIntegrators = *a->GetFBFI();
+   const int intFaceIntegratorCount = intFaceIntegrators.Size();
+   if (intFaceIntegratorCount>0)
+   {
+      nf_int = trialFes->GetNFbyType(FaceType::Interior);
+      ea_data_int.SetSize(2*nf_int*faceDofs*faceDofs, Device::GetMemoryType());
+      ea_data_ext.SetSize(2*nf_int*faceDofs*faceDofs, Device::GetMemoryType());
+      ea_data_int = 0.0;
+      ea_data_ext = 0.0;
+   }
+   for (int i = 0; i < intFaceIntegratorCount; ++i)
+   {
+      intFaceIntegrators[i]->AssembleEAInteriorFaces(*a->FESpace(),
+                                                     ea_data_int,
+                                                     ea_data_ext);
+   }
+   // Boundary faces: ea_data_bdr stores one faceDofs x faceDofs block per face.
+   Array<BilinearFormIntegrator*> &bdrFaceIntegrators = *a->GetBFBFI();
+   const int boundFaceIntegratorCount = bdrFaceIntegrators.Size();
+   if (boundFaceIntegratorCount>0)
+   {
+      nf_bdr = trialFes->GetNFbyType(FaceType::Boundary);
+      ea_data_bdr.SetSize(nf_bdr*faceDofs*faceDofs, Device::GetMemoryType());
+      ea_data_bdr = 0.0;
+   }
+   for (int i = 0; i < boundFaceIntegratorCount; ++i)
+   {
+      bdrFaceIntegrators[i]->AssembleEABoundaryFaces(*a->FESpace(),ea_data_bdr);
+   }
+}
+
+void EABilinearFormExtension::Mult(const Vector &x, Vector &y) const
+{
+   // Element part of y = A*x. The restriction is skipped with CEED or if null.
+   const bool useRestrict = !DeviceCanUseCeed() && elem_restrict;
+   if (!useRestrict)
+   {
+      y.UseDevice(true); // typically this is a large vector, so store on device
+      y = 0.0;
+   }
+   else
+   {
+      elem_restrict->Mult(x, localX);
+      localY = 0.0;
+   }
+   // Apply the element matrices: Y(j,e) += sum_i A(i,j,e)*X(i,e)
+   const int NDOFS = elemDofs;
+   auto X = Reshape(useRestrict?localX.Read():x.Read(), NDOFS, ne);
+   auto Y = Reshape(useRestrict?localY.ReadWrite():y.ReadWrite(), NDOFS, ne);
+   auto A = Reshape(ea_data.Read(), NDOFS, NDOFS, ne);
+   MFEM_FORALL(glob_j, ne*NDOFS,
+   {
+      const int e = glob_j/NDOFS;
+      const int j = glob_j%NDOFS;
+      double res = 0.0;
+      for (int i = 0; i < NDOFS; i++)
+      {
+         res += A(i, j, e)*X(i, e);
+      }
+      Y(j, e) += res;
+   });
+   // Apply the Element Restriction transposed
+   if (useRestrict)
+   {
+      elem_restrict->MultTranspose(localY, y);
+   }
+
+   // Interior faces: faceIntX/faceIntY hold NDOFS values for each of 2 sides.
+   Array<BilinearFormIntegrator*> &intFaceIntegrators = *a->GetFBFI();
+   const int iFISz = intFaceIntegrators.Size();
+   if (int_face_restrict_lex && iFISz>0)
+   {
+      // Apply the Interior Face Restriction
+      int_face_restrict_lex->Mult(x, faceIntX);
+      if (faceIntX.Size()>0)
+      {
+         faceIntY = 0.0;
+         // A_int couples each side of a face with itself (side s -> side s)
+         const int NDOFS = faceDofs;
+         auto X = Reshape(faceIntX.Read(), NDOFS, 2, nf_int);
+         auto Y = Reshape(faceIntY.ReadWrite(), NDOFS, 2, nf_int);
+         auto A_int = Reshape(ea_data_int.Read(), NDOFS, NDOFS, 2, nf_int);
+         MFEM_FORALL(glob_j, nf_int*NDOFS,
+         {
+            const int f = glob_j/NDOFS;
+            const int j = glob_j%NDOFS;
+            double res = 0.0;
+            for (int i = 0; i < NDOFS; i++)
+            {
+               res += A_int(i, j, 0, f)*X(i, 0, f);
+            }
+            Y(j, 0, f) += res;
+            res = 0.0;
+            for (int i = 0; i < NDOFS; i++)
+            {
+               res += A_int(i, j, 1, f)*X(i, 1, f);
+            }
+            Y(j, 1, f) += res;
+         });
+         auto A_ext = Reshape(ea_data_ext.Read(), NDOFS, NDOFS, 2, nf_int);
+         MFEM_FORALL(glob_j, nf_int*NDOFS,
+         {
+            const int f = glob_j/NDOFS;
+            const int j = glob_j%NDOFS;
+            double res = 0.0;
+            for (int i = 0; i < NDOFS; i++)
+            {
+               res += A_ext(i, j, 0, f)*X(i, 0, f);
+            }
+            Y(j, 1, f) += res; // A_ext block 0: side 0 -> side 1
+            res = 0.0;
+            for (int i = 0; i < NDOFS; i++)
+            {
+               res += A_ext(i, j, 1, f)*X(i, 1, f);
+            }
+            Y(j, 0, f) += res; // A_ext block 1: side 1 -> side 0
+         });
+         // Apply the Interior Face Restriction transposed
+         int_face_restrict_lex->MultTranspose(faceIntY, y);
+      }
+   }
+
+   // Treatment of boundary faces
+   Array<BilinearFormIntegrator*> &bdrFaceIntegrators = *a->GetBFBFI();
+   const int bFISz = bdrFaceIntegrators.Size();
+   if (bdr_face_restrict_lex && bFISz>0)
+   {
+      // Apply the Boundary Face Restriction
+      bdr_face_restrict_lex->Mult(x, faceBdrX);
+      if (faceBdrX.Size()>0)
+      {
+         faceBdrY = 0.0;
+         // Apply the boundary face matrices: Y(j,f) += sum_i A(i,j,f)*X(i,f)
+         const int NDOFS = faceDofs;
+         auto X = Reshape(faceBdrX.Read(), NDOFS, nf_bdr);
+         auto Y = Reshape(faceBdrY.ReadWrite(), NDOFS, nf_bdr);
+         auto A = Reshape(ea_data_bdr.Read(), NDOFS, NDOFS, nf_bdr);
+         MFEM_FORALL(glob_j, nf_bdr*NDOFS,
+         {
+            const int f = glob_j/NDOFS;
+            const int j = glob_j%NDOFS;
+            double res = 0.0;
+            for (int i = 0; i < NDOFS; i++)
+            {
+               res += A(i, j, f)*X(i, f);
+            }
+            Y(j, f) += res;
+         });
+         // Apply the Boundary Face Restriction transposed
+         bdr_face_restrict_lex->MultTranspose(faceBdrY, y);
+      }
+   }
+}
+
+void EABilinearFormExtension::MultTranspose(const Vector &x, Vector &y) const
+{
+   // Element part of y = A^T*x; the restriction choice must match Mult().
+   const bool useRestrict = !DeviceCanUseCeed() && elem_restrict;
+   if (!useRestrict)
+   {
+      y.UseDevice(true); // typically this is a large vector, so store on device
+      y = 0.0;
+   }
+   else
+   {
+      elem_restrict->Mult(x, localX);
+      localY = 0.0;
+   }
+   // Apply the transposed element matrices: Y(j,e) += sum_i A(j,i,e)*X(i,e)
+   const int NDOFS = elemDofs;
+   auto X = Reshape(useRestrict?localX.Read():x.Read(), NDOFS, ne);
+   auto Y = Reshape(useRestrict?localY.ReadWrite():y.ReadWrite(), NDOFS, ne);
+   auto A = Reshape(ea_data.Read(), NDOFS, NDOFS, ne);
+   MFEM_FORALL(glob_j, ne*NDOFS,
+   {
+      const int e = glob_j/NDOFS;
+      const int j = glob_j%NDOFS;
+      double res = 0.0;
+      for (int i = 0; i < NDOFS; i++)
+      {
+         res += A(j, i, e)*X(i, e);
+      }
+      Y(j, e) += res;
+   });
+   // Apply the Element Restriction transposed
+   if (useRestrict)
+   {
+      elem_restrict->MultTranspose(localY, y);
+   }
+
+   // Interior faces: faceIntX/faceIntY hold NDOFS values for each of 2 sides.
+   Array<BilinearFormIntegrator*> &intFaceIntegrators = *a->GetFBFI();
+   const int iFISz = intFaceIntegrators.Size();
+   if (int_face_restrict_lex && iFISz>0)
+   {
+      // Apply the Interior Face Restriction
+      int_face_restrict_lex->Mult(x, faceIntX);
+      if (faceIntX.Size()>0)
+      {
+         faceIntY = 0.0;
+         // A_int is side-diagonal, so its transpose stays on the same side
+         const int NDOFS = faceDofs;
+         auto X = Reshape(faceIntX.Read(), NDOFS, 2, nf_int);
+         auto Y = Reshape(faceIntY.ReadWrite(), NDOFS, 2, nf_int);
+         auto A_int = Reshape(ea_data_int.Read(), NDOFS, NDOFS, 2, nf_int);
+         MFEM_FORALL(glob_j, nf_int*NDOFS,
+         {
+            const int f = glob_j/NDOFS;
+            const int j = glob_j%NDOFS;
+            double res = 0.0;
+            for (int i = 0; i < NDOFS; i++)
+            {
+               res += A_int(j, i, 0, f)*X(i, 0, f);
+            }
+            Y(j, 0, f) += res;
+            res = 0.0;
+            for (int i = 0; i < NDOFS; i++)
+            {
+               res += A_int(j, i, 1, f)*X(i, 1, f);
+            }
+            Y(j, 1, f) += res;
+         });
+         auto A_ext = Reshape(ea_data_ext.Read(), NDOFS, NDOFS, 2, nf_int);
+         MFEM_FORALL(glob_j, nf_int*NDOFS,
+         {
+            const int f = glob_j/NDOFS;
+            const int j = glob_j%NDOFS;
+            double res = 0.0;
+            for (int i = 0; i < NDOFS; i++)
+            {
+               res += A_ext(j, i, 1, f)*X(i, 0, f);
+            }
+            Y(j, 1, f) += res; // block 1 transposed: side 0 -> side 1
+            res = 0.0;
+            for (int i = 0; i < NDOFS; i++)
+            {
+               res += A_ext(j, i, 0, f)*X(i, 1, f);
+            }
+            Y(j, 0, f) += res; // block 0 transposed: side 1 -> side 0
+         });
+         // Apply the Interior Face Restriction transposed
+         int_face_restrict_lex->MultTranspose(faceIntY, y);
+      }
+   }
+
+   // Treatment of boundary faces
+   Array<BilinearFormIntegrator*> &bdrFaceIntegrators = *a->GetBFBFI();
+   const int bFISz = bdrFaceIntegrators.Size();
+   if (bdr_face_restrict_lex && bFISz>0)
+   {
+      // Apply the Boundary Face Restriction
+      bdr_face_restrict_lex->Mult(x, faceBdrX);
+      if (faceBdrX.Size()>0)
+      {
+         faceBdrY = 0.0;
+         // Transposed boundary matrices: Y(j,f) += sum_i A(j,i,f)*X(i,f)
+         const int NDOFS = faceDofs;
+         auto X = Reshape(faceBdrX.Read(), NDOFS, nf_bdr);
+         auto Y = Reshape(faceBdrY.ReadWrite(), NDOFS, nf_bdr);
+         auto A = Reshape(ea_data_bdr.Read(), NDOFS, NDOFS, nf_bdr);
+         MFEM_FORALL(glob_j, nf_bdr*NDOFS,
+         {
+            const int f = glob_j/NDOFS;
+            const int j = glob_j%NDOFS;
+            double res = 0.0;
+            for (int i = 0; i < NDOFS; i++)
+            {
+               res += A(j, i, f)*X(i, f);
+            }
+            Y(j, f) += res;
+         });
+         // Apply the Boundary Face Restriction transposed
+         bdr_face_restrict_lex->MultTranspose(faceBdrY, y);
+      }
+   }
+}
+
 MixedBilinearFormExtension::MixedBilinearFormExtension(MixedBilinearForm *form)
    : Operator(form->Height(), form->Width()), a(form)
 {
@@ -487,4 +795,68 @@ void PAMixedBilinearFormExtension::AddMultTranspose(const Vector &x, Vector &y,
    }
 }
 
+void PAMixedBilinearFormExtension::AssembleDiagonal_ADAt(const Vector &D,
+                                                         Vector &diag) const
+{
+   Array<BilinearFormIntegrator*> &integrators = *a->GetDBFI();
+   // Assembles diag(A D A^T) by summing the domain integrators' contributions.
+   const int iSz = integrators.Size();
+   // Restrict D into localTrial; ElementRestriction uses its unsigned variant.
+   if (elem_restrict_trial)
+   {
+      const ElementRestriction* H1elem_restrict_trial =
+         dynamic_cast<const ElementRestriction*>(elem_restrict_trial);
+      if (H1elem_restrict_trial)
+      {
+         H1elem_restrict_trial->MultUnsigned(D, localTrial);
+      }
+      else
+      {
+         elem_restrict_trial->Mult(D, localTrial);
+      }
+   }
+   // With a test restriction: accumulate in localTest, then scatter into diag.
+   if (elem_restrict_test)
+   {
+      localTest = 0.0;
+      for (int i = 0; i < iSz; ++i)
+      {
+         if (elem_restrict_trial)
+         {
+            integrators[i]->AssembleDiagonalPA_ADAt(localTrial, localTest);
+         }
+         else
+         {
+            integrators[i]->AssembleDiagonalPA_ADAt(D, localTest);
+         }
+      }
+      const ElementRestriction* H1elem_restrict_test =
+         dynamic_cast<const ElementRestriction*>(elem_restrict_test);
+      if (H1elem_restrict_test)
+      {
+         H1elem_restrict_test->MultTransposeUnsigned(localTest, diag);
+      }
+      else
+      {
+         elem_restrict_test->MultTranspose(localTest, diag);
+      }
+   }
+   else // no test restriction: the integrators add directly into diag
+   {
+      diag.UseDevice(true); // typically this is a large vector, so store on device
+      diag = 0.0;
+      for (int i = 0; i < iSz; ++i)
+      {
+         if (elem_restrict_trial)
+         {
+            integrators[i]->AssembleDiagonalPA_ADAt(localTrial, diag);
+         }
+         else
+         {
+            integrators[i]->AssembleDiagonalPA_ADAt(D, diag);
+         }
+      }
+   }
+}
+
 } // namespace mfem
diff --git a/fem/bilinearform_ext.hpp b/fem/bilinearform_ext.hpp
index d14eae739c0..57fbc027870 100644
--- a/fem/bilinearform_ext.hpp
+++ b/fem/bilinearform_ext.hpp
@@ -22,9 +22,12 @@ namespace mfem
 class BilinearForm;
 class MixedBilinearForm;
 
-
-/** @brief Class extending the BilinearForm class to support the different
-    AssemblyLevel%s. */
+/// Class extending the BilinearForm class to support different AssemblyLevels.
+/**  FA - Full Assembly
+     PA - Partial Assembly
+     EA - Element Assembly
+     MF - Matrix Free
+*/
 class BilinearFormExtension : public Operator
 {
 protected:
@@ -42,6 +45,7 @@ class BilinearFormExtension : public Operator
    /// Get the finite element space restriction matrix
    virtual const Operator *GetRestriction() const;
 
+   /// Assemble at the level given for the BilinearFormExtension subclass
    virtual void Assemble() = 0;
 
    virtual void AssembleDiagonal(Vector &diag) const
@@ -58,7 +62,8 @@ class BilinearFormExtension : public Operator
    virtual void Update() = 0;
 };
 
-/// Data and methods for fully-assembled bilinear forms
+/** @brief Data and methods for fully-assembled bilinear forms.
+    Not yet implemented!  Use the BilinearForm Class instead. */
 class FABilinearFormExtension : public BilinearFormExtension
 {
 public:
@@ -78,26 +83,6 @@ class FABilinearFormExtension : public BilinearFormExtension
    ~FABilinearFormExtension() {}
 };
 
-/// Data and methods for element-assembled bilinear forms
-class EABilinearFormExtension : public BilinearFormExtension
-{
-public:
-   EABilinearFormExtension(BilinearForm *form)
-      : BilinearFormExtension(form) { }
-
-   /// TODO
-   void Assemble() {}
-   void FormSystemMatrix(const Array<int> &ess_tdof_list, OperatorHandle &A) {}
-   void FormLinearSystem(const Array<int> &ess_tdof_list,
-                         Vector &x, Vector &b,
-                         OperatorHandle &A, Vector &X, Vector &B,
-                         int copy_interior = 0) {}
-   void Mult(const Vector &x, Vector &y) const {}
-   void MultTranspose(const Vector &x, Vector &y) const {}
-   void Update() {}
-   ~EABilinearFormExtension() {}
-};
-
 /// Data and methods for partially-assembled bilinear forms
 class PABilinearFormExtension : public BilinearFormExtension
 {
@@ -113,7 +98,6 @@ class PABilinearFormExtension : public BilinearFormExtension
 public:
    PABilinearFormExtension(BilinearForm*);
 
-   void SetupRestrictionOperators();
    void Assemble();
    void AssembleDiagonal(Vector &diag) const;
    void FormSystemMatrix(const Array<int> &ess_tdof_list, OperatorHandle &A);
@@ -121,14 +105,34 @@ class PABilinearFormExtension : public BilinearFormExtension
                          Vector &x, Vector &b,
                          OperatorHandle &A, Vector &X, Vector &B,
                          int copy_interior = 0);
-
    void Mult(const Vector &x, Vector &y) const;
    void MultTranspose(const Vector &x, Vector &y) const;
    void Update();
+
+protected:
+   void SetupRestrictionOperators(const L2FaceValues m);
 };
 
+/// Data and methods for element-assembled bilinear forms
+class EABilinearFormExtension : public PABilinearFormExtension
+{
+protected:
+   int ne;           ///< Number of mesh elements, set in Assemble().
+   int elemDofs;     ///< Dof count of the FE of element 0, set in Assemble().
+   Vector ea_data;   ///< Element matrices: ne blocks of elemDofs x elemDofs.
+   int nf_int, nf_bdr; // Face counts; set only when face integrators exist.
+   int faceDofs;     ///< Dof count of the trace element, set in Assemble().
+   Vector ea_data_int, ea_data_ext, ea_data_bdr; // Face matrices, see Mult().
+
+public:
+   EABilinearFormExtension(BilinearForm *form);
+
+   void Assemble(); ///< Assemble the element and face matrices.
+   void Mult(const Vector &x, Vector &y) const; ///< y = A*x
+   void MultTranspose(const Vector &x, Vector &y) const; ///< y = A^T*x
+};
 
-/// Data and methods for matrix-free bilinear forms
+/// Data and methods for matrix-free bilinear forms. Not yet implemented!
 class MFBilinearFormExtension : public BilinearFormExtension
 {
 public:
@@ -148,8 +152,12 @@ class MFBilinearFormExtension : public BilinearFormExtension
    ~MFBilinearFormExtension() {}
 };
 
-/** @brief Class extending the MixedBilinearForm class to support the different
-    AssemblyLevel%s. */
+/// Class extending the MixedBilinearForm class to support different AssemblyLevels.
+/**  FA - Full Assembly
+     PA - Partial Assembly
+     EA - Element Assembly
+     MF - Matrix Free
+*/
 class MixedBilinearFormExtension : public Operator
 {
 protected:
@@ -186,6 +194,8 @@ class MixedBilinearFormExtension : public Operator
    virtual void AddMultTranspose(const Vector &x, Vector &y,
                                  const double c=1.0) const = 0;
 
+   virtual void AssembleDiagonal_ADAt(const Vector &D, Vector &diag) const = 0;
+
    virtual void Update() = 0;
 };
 
@@ -236,6 +246,9 @@ class PAMixedBilinearFormExtension : public MixedBilinearFormExtension
    void MultTranspose(const Vector &x, Vector &y) const;
    /// y += c*A^T*x
    void AddMultTranspose(const Vector &x, Vector &y, const double c=1.0) const;
+   /// Assemble the diagonal of ADA^T for a diagonal vector D.
+   void AssembleDiagonal_ADAt(const Vector &D, Vector &diag) const;
+
    /// Update internals for when a new MixedBilinearForm is given to this class
    void Update();
 };
diff --git a/fem/bilininteg.cpp b/fem/bilininteg.cpp
index f145ed596a2..f10e14959a5 100644
--- a/fem/bilininteg.cpp
+++ b/fem/bilininteg.cpp
@@ -47,7 +47,37 @@ void BilinearFormIntegrator::AssemblePABoundaryFaces(const FiniteElementSpace&)
 
 void BilinearFormIntegrator::AssembleDiagonalPA(Vector &)
 {
-   MFEM_ABORT("BilinearFormIntegrator::AssembleDiagonalPA(...)\n"
+   mfem_error ("BilinearFormIntegrator::AssembleDiagonalPA(...)\n"
+               "   is not implemented for this class.");
+}
+
+void BilinearFormIntegrator::AssembleEA(const FiniteElementSpace &fes,
+                                        Vector &emat)
+{  // Base-class fallback: reports an error unless overridden.
+   mfem_error ("BilinearFormIntegrator::AssembleEA(...)\n"
+               "   is not implemented for this class.");
+}
+
+void BilinearFormIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace
+                                                     &fes,
+                                                     Vector &ea_data_int,
+                                                     Vector &ea_data_ext)
+{  // Base-class fallback: reports an error unless overridden.
+   mfem_error ("BilinearFormIntegrator::AssembleEAInteriorFaces(...)\n"
+               "   is not implemented for this class.");
+}
+
+void BilinearFormIntegrator::AssembleEABoundaryFaces(const FiniteElementSpace
+                                                     &fes,
+                                                     Vector &ea_data_bdr)
+{  // Base-class fallback: reports an error unless overridden.
+   mfem_error ("BilinearFormIntegrator::AssembleEABoundaryFaces(...)\n"
+               "   is not implemented for this class.");
+}
+
+void BilinearFormIntegrator::AssembleDiagonalPA_ADAt(const Vector &, Vector &)
+{
+   MFEM_ABORT("BilinearFormIntegrator::AssembleDiagonalPA_ADAt(...)\n"
               "   is not implemented for this class.");
 }
 
@@ -889,7 +919,7 @@ void BoundaryMassIntegrator::AssembleFaceMatrix(
    {
       int order = 2 * el1.GetOrder();
 
-      ir = &IntRules.Get(Trans.FaceGeom, order);
+      ir = &IntRules.Get(Trans.GetGeometryType(), order);
    }
 
    elmat = 0.0;
@@ -900,11 +930,11 @@ void BoundaryMassIntegrator::AssembleFaceMatrix(
       Trans.Loc1.Transform(ip, eip);
       el1.CalcShape(eip, shape);
 
-      Trans.Face->SetIntPoint(&ip);
-      w = Trans.Face->Weight() * ip.weight;
+      Trans.SetIntPoint(&ip);
+      w = Trans.Weight() * ip.weight;
       if (Q)
       {
-         w *= Q -> Eval(*Trans.Face, ip);
+         w *= Q -> Eval(Trans, ip);
       }
 
       AddMult_a_VVt(w, shape, elmat);
@@ -1974,7 +2004,7 @@ void VectorFEMassIntegrator::AssembleElementMatrix2(
       D.SetSize(VQ ? VQ->GetVDim() : 0);
       K.SetSize(MQ ? MQ->GetVDim() : 0, MQ ? MQ->GetVDim() : 0);
 #endif
-      DenseMatrix tmp(trial_vshape.Height(), K.Width());
+      DenseMatrix tmp(test_vshape.Height(), K.Width());
 
       elmat.SetSize (test_dof, trial_dof);
 
@@ -2535,7 +2565,7 @@ void DGTraceIntegrator::AssembleFaceMatrix(const FiniteElement &el1,
       {
          order++;
       }
-      ir = &IntRules.Get(Trans.FaceGeom, order);
+      ir = &IntRules.Get(Trans.GetGeometryType(), order);
    }
 
    for (int p = 0; p < ir->GetNPoints(); p++)
@@ -2549,8 +2579,7 @@ void DGTraceIntegrator::AssembleFaceMatrix(const FiniteElement &el1,
       }
       el1.CalcShape(eip1, shape1);
 
-      Trans.Face->SetIntPoint(&ip);
-      Trans.Elem1->SetIntPoint(&eip1);
+      Trans.SetIntPoint(&ip);
 
       u->Eval(vu, *Trans.Elem1, eip1);
 
@@ -2560,7 +2589,7 @@ void DGTraceIntegrator::AssembleFaceMatrix(const FiniteElement &el1,
       }
       else
       {
-         CalcOrtho(Trans.Face->Jacobian(), nor);
+         CalcOrtho(Trans.Jacobian(), nor);
       }
 
       un = vu * nor;
@@ -2575,7 +2604,6 @@ void DGTraceIntegrator::AssembleFaceMatrix(const FiniteElement &el1,
          double rho_p;
          if (un >= 0.0 && ndof2)
          {
-            Trans.Elem2->SetIntPoint(&eip2);
             rho_p = rho->Eval(*Trans.Elem2, eip2);
          }
          else
@@ -2691,7 +2719,7 @@ void DGDiffusionIntegrator::AssembleFaceMatrix(
       {
          order = 2*el1.GetOrder();
       }
-      ir = &IntRules.Get(Trans.FaceGeom, order);
+      ir = &IntRules.Get(Trans.GetGeometryType(), order);
    }
 
    // assemble: < {(Q \nabla u).n},[v] >      --> elmat
@@ -2702,19 +2730,18 @@ void DGDiffusionIntegrator::AssembleFaceMatrix(
       IntegrationPoint eip1, eip2;
 
       Trans.Loc1.Transform(ip, eip1);
-      Trans.Face->SetIntPoint(&ip);
+      Trans.SetIntPoint(&ip);
       if (dim == 1)
       {
          nor(0) = 2*eip1.x - 1.0;
       }
       else
       {
-         CalcOrtho(Trans.Face->Jacobian(), nor);
+         CalcOrtho(Trans.Jacobian(), nor);
       }
 
       el1.CalcShape(eip1, shape1);
       el1.CalcDShape(eip1, dshape1);
-      Trans.Elem1->SetIntPoint(&eip1);
       w = ip.weight/Trans.Elem1->Weight();
       if (ndof2)
       {
@@ -2763,7 +2790,6 @@ void DGDiffusionIntegrator::AssembleFaceMatrix(
          Trans.Loc2.Transform(ip, eip2);
          el2.CalcShape(eip2, shape2);
          el2.CalcDShape(eip2, dshape2);
-         Trans.Elem2->SetIntPoint(&eip2);
          w = ip.weight/2/Trans.Elem2->Weight();
          if (!MQ)
          {
@@ -2973,7 +2999,7 @@ void DGElasticityIntegrator::AssembleFaceMatrix(
    {
       // a simple choice for the integration order; is this OK?
       const int order = 2 * max(el1.GetOrder(), ndofs2 ? el2.GetOrder() : 0);
-      ir = &IntRules.Get(Trans.FaceGeom, order);
+      ir = &IntRules.Get(Trans.GetGeometryType(), order);
    }
 
    for (int pind = 0; pind < ir->GetNPoints(); ++pind)
@@ -2981,8 +3007,7 @@ void DGElasticityIntegrator::AssembleFaceMatrix(
       const IntegrationPoint &ip = ir->IntPoint(pind);
       IntegrationPoint eip1, eip2; // integration point in the reference space
       Trans.Loc1.Transform(ip, eip1);
-      Trans.Face->SetIntPoint(&ip);
-      Trans.Elem1->SetIntPoint(&eip1);
+      Trans.SetIntPoint(&ip);
 
       el1.CalcShape(eip1, shape1);
       el1.CalcDShape(eip1, dshape1);
@@ -2996,14 +3021,13 @@ void DGElasticityIntegrator::AssembleFaceMatrix(
       }
       else
       {
-         CalcOrtho(Trans.Face->Jacobian(), nor);
+         CalcOrtho(Trans.Jacobian(), nor);
       }
 
       double w, wLM;
       if (ndofs2)
       {
          Trans.Loc2.Transform(ip, eip2);
-         Trans.Elem2->SetIntPoint(&eip2);
          el2.CalcShape(eip2, shape2);
          el2.CalcDShape(eip2, dshape2);
          CalcAdjugate(Trans.Elem2->Jacobian(), adjJ);
@@ -3133,9 +3157,9 @@ void TraceJumpIntegrator::AssembleFaceMatrix(
       order += trial_face_fe.GetOrder();
       if (trial_face_fe.GetMapType() == FiniteElement::VALUE)
       {
-         order += Trans.Face->OrderW();
+         order += Trans.OrderW();
       }
-      ir = &IntRules.Get(Trans.FaceGeom, order);
+      ir = &IntRules.Get(Trans.GetGeometryType(), order);
    }
 
    for (int p = 0; p < ir->GetNPoints(); p++)
@@ -3143,23 +3167,21 @@ void TraceJumpIntegrator::AssembleFaceMatrix(
       const IntegrationPoint &ip = ir->IntPoint(p);
       IntegrationPoint eip1, eip2;
       // Trace finite element shape function
-      Trans.Face->SetIntPoint(&ip);
+      Trans.SetIntPoint(&ip);
       trial_face_fe.CalcShape(ip, face_shape);
       // Side 1 finite element shape function
       Trans.Loc1.Transform(ip, eip1);
       test_fe1.CalcShape(eip1, shape1);
-      Trans.Elem1->SetIntPoint(&eip1);
       if (ndof2)
       {
          // Side 2 finite element shape function
          Trans.Loc2.Transform(ip, eip2);
          test_fe2.CalcShape(eip2, shape2);
-         Trans.Elem2->SetIntPoint(&eip2);
       }
       w = ip.weight;
       if (trial_face_fe.GetMapType() == FiniteElement::VALUE)
       {
-         w *= Trans.Face->Weight();
+         w *= Trans.Weight();
       }
       face_shape *= w;
       for (i = 0; i < ndof1; i++)
@@ -3224,7 +3246,7 @@ void NormalTraceJumpIntegrator::AssembleFaceMatrix(
          order = test_fe1.GetOrder() - 1;
       }
       order += trial_face_fe.GetOrder();
-      ir = &IntRules.Get(Trans.FaceGeom, order);
+      ir = &IntRules.Get(Trans.GetGeometryType(), order);
    }
 
    for (int p = 0; p < ir->GetNPoints(); p++)
diff --git a/fem/bilininteg.hpp b/fem/bilininteg.hpp
index 98eb04c53ce..a7469fe438d 100644
--- a/fem/bilininteg.hpp
+++ b/fem/bilininteg.hpp
@@ -57,6 +57,9 @@ class BilinearFormIntegrator : public NonlinearFormIntegrator
    /// Assemble diagonal and add it to Vector @a diag.
    virtual void AssembleDiagonalPA(Vector &diag);
 
+   /// Assemble diagonal of ADA^T (A is this integrator) and add it to @a diag.
+   virtual void AssembleDiagonalPA_ADAt(const Vector &D, Vector &diag);
+
    /// Method for partially assembled action.
    /** Perform the action of integrator on the input @a x and add the result to
        the output @a y. Both @a x and @a y are E-vectors, i.e. they represent
@@ -75,6 +78,22 @@ class BilinearFormIntegrator : public NonlinearFormIntegrator
        called. */
    virtual void AddMultTransposePA(const Vector &x, Vector &y) const;
 
+   /// Method defining element assembly.
+   /** The result of the element assembly is added and stored in the @a emat
+       Vector. */
+   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
+   /** Used with BilinearFormIntegrators that have different spaces. */
+   // virtual void AssembleEA(const FiniteElementSpace &trial_fes,
+   //                         const FiniteElementSpace &test_fes,
+   //                         Vector &emat);
+
+   virtual void AssembleEAInteriorFaces(const FiniteElementSpace &fes,
+                                        Vector &ea_data_int,
+                                        Vector &ea_data_ext);
+
+   virtual void AssembleEABoundaryFaces(const FiniteElementSpace &fes,
+                                        Vector &ea_data_bdr);
+
    /// Given a particular Finite Element computes the element matrix elmat.
    virtual void AssembleElementMatrix(const FiniteElement &el,
                                       ElementTransformation &Trans,
@@ -180,6 +199,8 @@ class BilinearFormIntegrator : public NonlinearFormIntegrator
    virtual ~BilinearFormIntegrator() { }
 };
 
+/** Wraps a given @a BilinearFormIntegrator and transposes the resulting element
+    matrices. See for example ex9, ex9p. */
 class TransposeIntegrator : public BilinearFormIntegrator
 {
 private:
@@ -234,6 +255,15 @@ class TransposeIntegrator : public BilinearFormIntegrator
       bfi->AddMultTransposePA(x, y);
    }
 
+   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
+
+   virtual void AssembleEAInteriorFaces(const FiniteElementSpace &fes,
+                                        Vector &ea_data_int,
+                                        Vector &ea_data_ext);
+
+   virtual void AssembleEABoundaryFaces(const FiniteElementSpace &fes,
+                                        Vector &ea_data_bdr);
+
    virtual ~TransposeIntegrator() { if (own_bfi) { delete bfi; } }
 };
 
@@ -1885,6 +1915,8 @@ class DiffusionIntegrator: public BilinearFormIntegrator
 
    virtual void AssemblePA(const FiniteElementSpace &fes);
 
+   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
+
    virtual void AssembleDiagonalPA(Vector &diag);
 
    virtual void AddMultPA(const Vector&, Vector&) const;
@@ -1958,6 +1990,8 @@ class MassIntegrator: public BilinearFormIntegrator
 
    virtual void AssemblePA(const FiniteElementSpace &fes);
 
+   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
+
    virtual void AssembleDiagonalPA(Vector &diag);
 
    virtual void AddMultPA(const Vector&, Vector&) const;
@@ -1969,6 +2003,7 @@ class MassIntegrator: public BilinearFormIntegrator
    void SetupPA(const FiniteElementSpace &fes, const bool force = false);
 };
 
+/** Mass integrator (u, v) restricted to the boundary of a domain */
 class BoundaryMassIntegrator : public MassIntegrator
 {
 public:
@@ -2011,6 +2046,8 @@ class ConvectionIntegrator : public BilinearFormIntegrator
 
    virtual void AssemblePA(const FiniteElementSpace&);
 
+   virtual void AssembleEA(const FiniteElementSpace &fes, Vector &emat);
+
    virtual void AddMultPA(const Vector&, Vector&) const;
 
    static const IntegrationRule &GetRule(const FiniteElement &el,
@@ -2110,11 +2147,25 @@ class VectorFEDivergenceIntegrator : public BilinearFormIntegrator
 protected:
    Coefficient *Q;
 
+   using BilinearFormIntegrator::AssemblePA;
+   virtual void AssemblePA(const FiniteElementSpace &trial_fes,
+                           const FiniteElementSpace &test_fes);
+
+   virtual void AddMultPA(const Vector&, Vector&) const;
+   virtual void AddMultTransposePA(const Vector&, Vector&) const;
+
 private:
 #ifndef MFEM_THREAD_SAFE
    Vector divshape, shape;
 #endif
 
+   // PA extension
+   Vector pa_data;
+   const DofToQuad *mapsO;         ///< Not owned. DOF-to-quad map, open.
+   const DofToQuad *L2mapsO;       ///< Not owned. DOF-to-quad map, open.
+   const DofToQuad *mapsC;         ///< Not owned. DOF-to-quad map, closed.
+   int dim, ne, dofs1D, L2dofs1D, quad1D;
+
 public:
    VectorFEDivergenceIntegrator() { Q = NULL; }
    VectorFEDivergenceIntegrator(Coefficient &q) { Q = &q; }
@@ -2125,6 +2176,8 @@ class VectorFEDivergenceIntegrator : public BilinearFormIntegrator
                                        const FiniteElement &test_fe,
                                        ElementTransformation &Trans,
                                        DenseMatrix &elmat);
+
+   virtual void AssembleDiagonalPA_ADAt(const Vector &D, Vector &diag);
 };
 
 
@@ -2308,7 +2361,7 @@ class VectorFEMassIntegrator: public BilinearFormIntegrator
    const DofToQuad *mapsO;         ///< Not owned. DOF-to-quad map, open.
    const DofToQuad *mapsC;         ///< Not owned. DOF-to-quad map, closed.
    const GeometricFactors *geom;   ///< Not owned
-   int dim, ne, nq, dofs1D, quad1D;
+   int dim, ne, nq, dofs1D, quad1D, fetype;
 
 public:
    VectorFEMassIntegrator() { Init(NULL, NULL, NULL); }
@@ -2387,11 +2440,23 @@ class DivDivIntegrator: public BilinearFormIntegrator
 protected:
    Coefficient *Q;
 
+   using BilinearFormIntegrator::AssemblePA;
+   virtual void AssemblePA(const FiniteElementSpace &fes);
+   virtual void AddMultPA(const Vector &x, Vector &y) const;
+   virtual void AssembleDiagonalPA(Vector& diag);
+
 private:
 #ifndef MFEM_THREAD_SAFE
    Vector divshape;
 #endif
 
+   // PA extension
+   Vector pa_data;
+   const DofToQuad *mapsO;         ///< Not owned. DOF-to-quad map, open.
+   const DofToQuad *mapsC;         ///< Not owned. DOF-to-quad map, closed.
+   const GeometricFactors *geom;   ///< Not owned
+   int dim, ne, dofs1D, quad1D;
+
 public:
    DivDivIntegrator() { Q = NULL; }
    DivDivIntegrator(Coefficient &q) : Q(&q) { }
@@ -2544,6 +2609,13 @@ class DGTraceIntegrator : public BilinearFormIntegrator
 
    virtual void AddMultPA(const Vector&, Vector&) const;
 
+   virtual void AssembleEAInteriorFaces(const FiniteElementSpace& fes,
+                                        Vector &ea_data_int,
+                                        Vector &ea_data_ext);
+
+   virtual void AssembleEABoundaryFaces(const FiniteElementSpace& fes,
+                                        Vector &ea_data_bdr);
+
    static const IntegrationRule &GetRule(Geometry::Type geom, int order,
                                          FaceElementTransformations &T);
 
diff --git a/fem/bilininteg_convection_ea.cpp b/fem/bilininteg_convection_ea.cpp
new file mode 100644
index 00000000000..90630f7fa6c
--- /dev/null
+++ b/fem/bilininteg_convection_ea.cpp
@@ -0,0 +1,258 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#include "../general/forall.hpp"
+#include "bilininteg.hpp"
+#include "gridfunc.hpp"
+
+namespace mfem
+{
+
+/// Element-assembly kernel for the 1D convection operator. For each element e:
+///    A(i1,j1,e) = sum_k b(k,j1) * padata(k,e) * g(k,i1),
+/// with @a b and @a g viewed as Q1D x D1D, @a padata as Q1D x NE and the
+/// output @a eadata as D1D x D1D x NE. Nonzero T_D1D/T_Q1D override d1d/q1d.
+template<int T_D1D = 0, int T_Q1D = 0>
+static void EAConvectionAssemble1D(const int NE,
+                                   const Array<double> &b,
+                                   const Array<double> &g,
+                                   const Vector &padata,
+                                   Vector &eadata,
+                                   const int d1d = 0,
+                                   const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d;
+   const int Q1D = T_Q1D ? T_Q1D : q1d;
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto B = Reshape(b.Read(), Q1D, D1D);
+   auto G = Reshape(g.Read(), Q1D, D1D);
+   auto D = Reshape(padata.Read(), Q1D, NE);
+   auto A = Reshape(eadata.Write(), D1D, D1D, NE);
+   MFEM_FORALL_3D(e, NE, D1D, D1D, 1,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      // The tables are indexed by the loop indices i1/j1 rather than cached by
+      // MFEM_THREAD_ID: the two agree only when a thread executes exactly one
+      // (i1,j1) pair. Where MFEM_FOREACH_THREAD is a plain loop (host backends,
+      // MFEM_THREAD_ID == 0) a per-thread cache would hold column 0 only.
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(j1,y,D1D)
+         {
+            double val = 0.0;
+            for (int k1 = 0; k1 < Q1D; ++k1)
+            {
+               val += B(k1,j1) * D(k1, e) * G(k1,i1);
+            }
+            A(i1, j1, e) = val;
+         }
+      }
+   });
+}
+
+/// Element-assembly kernel for the 2D convection operator: expands the
+/// quadrature data @a padata (viewed as Q1D x Q1D x 2 x NE) into dense element
+/// matrices written to @a eadata (viewed as D1D x D1D x D1D x D1D x NE), using
+/// the 1D tables @a b and @a g (each viewed as Q1D x D1D). Nonzero T_D1D/T_Q1D
+/// take precedence over the run-time sizes @a d1d / @a q1d.
+template<int T_D1D = 0, int T_Q1D = 0>
+static void EAConvectionAssemble2D(const int NE, const Array<double> &b,
+                                   const Array<double> &g, const Vector &padata,
+                                   Vector &eadata,
+                                   const int d1d = 0, const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d;
+   const int Q1D = T_Q1D ? T_Q1D : q1d;
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto B = Reshape(b.Read(), Q1D, D1D);
+   auto G = Reshape(g.Read(), Q1D, D1D);
+   auto D = Reshape(padata.Read(), Q1D, Q1D, 2, NE);
+   auto A = Reshape(eadata.Write(), D1D, D1D, D1D, D1D, NE);
+   MFEM_FORALL_3D(e, NE, D1D, D1D, 1,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
+      double regB[MQ1][MD1];
+      double regG[MQ1][MD1];
+      for (int q = 0; q < Q1D; q++)
+      {
+         for (int d = 0; d < D1D; d++) { regB[q][d] = B(q,d); }
+         for (int d = 0; d < D1D; d++) { regG[q][d] = G(q,d); }
+      }
+      MFEM_SHARED double shD[MQ1][MQ1][2];
+      MFEM_FOREACH_THREAD(q1,x,Q1D)
+      {
+         MFEM_FOREACH_THREAD(q2,y,Q1D)
+         {
+            for (int c = 0; c < 2; c++) { shD[q1][q2][c] = D(q1,q2,c,e); }
+         }
+      }
+      MFEM_SYNC_THREAD;
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(i2,y,D1D)
+         {
+            for (int j1 = 0; j1 < D1D; ++j1)
+            {
+               for (int j2 = 0; j2 < D1D; ++j2)
+               {
+                  double acc = 0.0;
+                  for (int q1 = 0; q1 < Q1D; ++q1)
+                  {
+                     const double g1 = regG[q1][i1];
+                     const double b1 = regB[q1][i1];
+                     for (int q2 = 0; q2 < Q1D; ++q2)
+                     {
+                        const double dx = g1 * regB[q2][i2] * shD[q1][q2][0];
+                        const double dy = b1 * regG[q2][i2] * shD[q1][q2][1];
+                        acc += (dx + dy) * regB[q1][j1] * regB[q2][j2];
+                     }
+                  }
+                  A(i1, i2, j1, j2, e) = acc;
+               }
+            }
+         }
+      }
+   });
+}
+
+/// Element-assembly kernel for the 3D convection operator: expands @a padata
+/// (viewed as Q1D x Q1D x Q1D x 3 x NE) into dense element matrices written to
+/// @a eadata (six D1D-sized indices, then NE) using the tables @a b and @a g
+/// (Q1D x D1D each). Nonzero T_D1D/T_Q1D override the run-time d1d/q1d.
+template<int T_D1D = 0, int T_Q1D = 0>
+static void EAConvectionAssemble3D(const int NE, const Array<double> &b,
+                                   const Array<double> &g, const Vector &padata,
+                                   Vector &eadata,
+                                   const int d1d = 0, const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d;
+   const int Q1D = T_Q1D ? T_Q1D : q1d;
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto B = Reshape(b.Read(), Q1D, D1D);
+   auto G = Reshape(g.Read(), Q1D, D1D);
+   auto D = Reshape(padata.Read(), Q1D, Q1D, Q1D, 3, NE);
+   auto A = Reshape(eadata.Write(), D1D, D1D, D1D, D1D, D1D, D1D, NE);
+   MFEM_FORALL_3D(e, NE, D1D, D1D, D1D,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
+      double rB[MQ1][MD1];
+      double rG[MQ1][MD1];
+      for (int q = 0; q < Q1D; q++)
+      {
+         for (int d = 0; d < D1D; d++) { rB[q][d] = B(q,d); }
+         for (int d = 0; d < D1D; d++) { rG[q][d] = G(q,d); }
+      }
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(i2,y,D1D)
+         {
+            MFEM_FOREACH_THREAD(i3,z,D1D)
+            {
+               for (int j1 = 0; j1 < D1D; ++j1)
+               {
+                  for (int j2 = 0; j2 < D1D; ++j2)
+                  {
+                     for (int j3 = 0; j3 < D1D; ++j3)
+                     {
+                        double acc = 0.0;
+                        for (int q1 = 0; q1 < Q1D; ++q1)
+                        {
+                           for (int q2 = 0; q2 < Q1D; ++q2)
+                           {
+                              const double gb = rG[q1][i1] * rB[q2][i2];
+                              const double bg = rB[q1][i1] * rG[q2][i2];
+                              const double bb = rB[q1][i1] * rB[q2][i2];
+                              for (int q3 = 0; q3 < Q1D; ++q3)
+                              {
+                                 const double b3 = rB[q3][i3];
+                                 const double g3 = rG[q3][i3];
+                                 const double s = gb * b3 * D(q1,q2,q3,0,e)
+                                                  + bg * b3 * D(q1,q2,q3,1,e)
+                                                  + bb * g3 * D(q1,q2,q3,2,e);
+                                 acc += s * rB[q1][j1] * rB[q2][j2] * rB[q3][j3];
+                              }
+                           }
+                        }
+                        A(i1, i2, i3, j1, j2, j3, e) = acc;
+                     }
+                  }
+               }
+            }
+         }
+      }
+   });
+}
+
+/// Element assembly: runs AssemblePA(fes), then expands pa_data into ea_data.
+void ConvectionIntegrator::AssembleEA(const FiniteElementSpace &fes,
+                                      Vector &ea_data)
+{
+   AssemblePA(fes);
+   const int NE = fes.GetMesh()->GetNE();
+   const Array<double> &B = maps->B, &G = maps->G;
+   if (dim == 1)
+   {
+      switch ((dofs1D << 4) | quad1D)
+      {
+         case 0x22: return EAConvectionAssemble1D<2,2>(NE,B,G,pa_data,ea_data);
+         case 0x33: return EAConvectionAssemble1D<3,3>(NE,B,G,pa_data,ea_data);
+         case 0x44: return EAConvectionAssemble1D<4,4>(NE,B,G,pa_data,ea_data);
+         case 0x55: return EAConvectionAssemble1D<5,5>(NE,B,G,pa_data,ea_data);
+         case 0x66: return EAConvectionAssemble1D<6,6>(NE,B,G,pa_data,ea_data);
+         case 0x77: return EAConvectionAssemble1D<7,7>(NE,B,G,pa_data,ea_data);
+         case 0x88: return EAConvectionAssemble1D<8,8>(NE,B,G,pa_data,ea_data);
+         case 0x99: return EAConvectionAssemble1D<9,9>(NE,B,G,pa_data,ea_data);
+      }
+      return EAConvectionAssemble1D(NE,B,G,pa_data,ea_data,dofs1D,quad1D);
+   }
+   if (dim == 2)
+   {
+      switch ((dofs1D << 4) | quad1D)
+      {
+         case 0x22: return EAConvectionAssemble2D<2,2>(NE,B,G,pa_data,ea_data);
+         case 0x33: return EAConvectionAssemble2D<3,3>(NE,B,G,pa_data,ea_data);
+         case 0x44: return EAConvectionAssemble2D<4,4>(NE,B,G,pa_data,ea_data);
+         case 0x55: return EAConvectionAssemble2D<5,5>(NE,B,G,pa_data,ea_data);
+         case 0x66: return EAConvectionAssemble2D<6,6>(NE,B,G,pa_data,ea_data);
+         case 0x77: return EAConvectionAssemble2D<7,7>(NE,B,G,pa_data,ea_data);
+         case 0x88: return EAConvectionAssemble2D<8,8>(NE,B,G,pa_data,ea_data);
+         case 0x99: return EAConvectionAssemble2D<9,9>(NE,B,G,pa_data,ea_data);
+      }
+      return EAConvectionAssemble2D(NE,B,G,pa_data,ea_data,dofs1D,quad1D);
+   }
+   if (dim == 3)
+   {
+      switch ((dofs1D << 4) | quad1D)
+      {
+         case 0x23: return EAConvectionAssemble3D<2,3>(NE,B,G,pa_data,ea_data);
+         case 0x34: return EAConvectionAssemble3D<3,4>(NE,B,G,pa_data,ea_data);
+         case 0x45: return EAConvectionAssemble3D<4,5>(NE,B,G,pa_data,ea_data);
+         case 0x56: return EAConvectionAssemble3D<5,6>(NE,B,G,pa_data,ea_data);
+         case 0x67: return EAConvectionAssemble3D<6,7>(NE,B,G,pa_data,ea_data);
+         case 0x78: return EAConvectionAssemble3D<7,8>(NE,B,G,pa_data,ea_data);
+         case 0x89: return EAConvectionAssemble3D<8,9>(NE,B,G,pa_data,ea_data);
+      }
+      return EAConvectionAssemble3D(NE,B,G,pa_data,ea_data,dofs1D,quad1D);
+   }
+   MFEM_ABORT("Unknown kernel.");
+}
+
+}
diff --git a/fem/bilininteg_convection.cpp b/fem/bilininteg_convection_pa.cpp
similarity index 100%
rename from fem/bilininteg_convection.cpp
rename to fem/bilininteg_convection_pa.cpp
diff --git a/fem/bilininteg_dgtrace_ea.cpp b/fem/bilininteg_dgtrace_ea.cpp
new file mode 100644
index 00000000000..e3a9bc93b9e
--- /dev/null
+++ b/fem/bilininteg_dgtrace_ea.cpp
@@ -0,0 +1,414 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#include "../general/forall.hpp"
+#include "bilininteg.hpp"
+#include "gridfunc.hpp"
+
+namespace mfem
+{
+
+/// 1D interior-face element assembly for the DG trace operator: accumulates the
+/// 2 x 2 block stored per face in @a padata (viewed as 2 x 2 x NF) into
+/// @a eadata_int (block entries (0,0), (1,1)) and @a eadata_ext (entries (0,1),
+/// (1,0)), both viewed as 2 x NF. @a basis is not used by this kernel.
+static void EADGTraceAssemble1DInt(const int NF,
+                                   const Array<double> &basis,
+                                   const Vector &padata,
+                                   Vector &eadata_int,
+                                   Vector &eadata_ext)
+{
+   auto faceD = Reshape(padata.Read(), 2, 2, NF);
+   auto intA = Reshape(eadata_int.ReadWrite(), 2, NF);
+   auto extA = Reshape(eadata_ext.ReadWrite(), 2, NF);
+   MFEM_FORALL(f, NF,
+   {
+      // Entries are added to the outputs, never overwritten.
+      intA(0, f) += faceD(0, 0, f);
+      intA(1, f) += faceD(1, 1, f);
+      extA(0, f) += faceD(0, 1, f);
+      extA(1, f) += faceD(1, 0, f);
+   });
+}
+
+/// 1D boundary-face element assembly for the DG trace operator: adds entry
+/// (0,0) of each face block of @a padata (2 x 2 x NF) to @a eadata_bdr (NF).
+static void EADGTraceAssemble1DBdr(const int NF, const Array<double> &basis,
+                                   const Vector &padata, Vector &eadata_bdr)
+{
+   auto faceD = Reshape(padata.Read(), 2, 2, NF);
+   auto bdrA = Reshape(eadata_bdr.ReadWrite(), NF);
+   MFEM_FORALL(f, NF,
+   {
+      bdrA(f) += faceD(0, 0, f);
+   });
+}
+
+/// 2D interior-face element assembly for the DG trace operator: per face f,
+/// adds sum_q B(q,i1)*B(q,j1)*D(q,a,b,f) to eadata_int (a==b) or eadata_ext.
+template<int T_D1D = 0, int T_Q1D = 0>
+static void EADGTraceAssemble2DInt(const int NF, const Array<double> &basis,
+                                   const Vector &padata,
+                                   Vector &eadata_int, Vector &eadata_ext,
+                                   const int d1d = 0, const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d;
+   const int Q1D = T_Q1D ? T_Q1D : q1d;
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto B = Reshape(basis.Read(), Q1D, D1D);
+   auto D = Reshape(padata.Read(), Q1D, 2, 2, NF);
+   auto A_int = Reshape(eadata_int.ReadWrite(), D1D, D1D, 2, NF);
+   auto A_ext = Reshape(eadata_ext.ReadWrite(), D1D, D1D, 2, NF);
+   MFEM_FORALL_3D(f, NF, D1D, D1D, 1,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(j1,y,D1D)
+         {
+            double sum00 = 0.0;
+            double sum01 = 0.0;
+            double sum10 = 0.0;
+            double sum11 = 0.0;
+            for (int q = 0; q < Q1D; ++q)
+            {
+               const double bb = B(q,i1) * B(q,j1);
+               sum00 += bb * D(q, 0, 0, f);
+               sum01 += bb * D(q, 0, 1, f);
+               sum10 += bb * D(q, 1, 0, f);
+               sum11 += bb * D(q, 1, 1, f);
+            }
+            A_int(i1, j1, 0, f) += sum00;
+            A_int(i1, j1, 1, f) += sum11;
+            A_ext(i1, j1, 0, f) += sum01;
+            A_ext(i1, j1, 1, f) += sum10;
+         }
+      }
+   });
+}
+
+/// 2D boundary-face element assembly for the DG trace operator: per face f,
+/// adds sum_q B(q,i1)*B(q,j1)*D(q,0,0,f) to entry (i1,j1,f) of @a eadata_bdr.
+template<int T_D1D = 0, int T_Q1D = 0>
+static void EADGTraceAssemble2DBdr(const int NF, const Array<double> &basis,
+                                   const Vector &padata, Vector &eadata_bdr,
+                                   const int d1d = 0, const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d;
+   const int Q1D = T_Q1D ? T_Q1D : q1d;
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto B = Reshape(basis.Read(), Q1D, D1D);
+   auto D = Reshape(padata.Read(), Q1D, 2, 2, NF);
+   auto A_bdr = Reshape(eadata_bdr.ReadWrite(), D1D, D1D, NF);
+   MFEM_FORALL_3D(f, NF, D1D, D1D, 1,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(j1,y,D1D)
+         {
+            double sum = 0.0;
+            for (int q = 0; q < Q1D; ++q)
+            {
+               const double bb = B(q,i1) * B(q,j1);
+               sum += bb * D(q, 0, 0, f);
+            }
+            A_bdr(i1, j1, f) += sum;
+         }
+      }
+   });
+}
+
+/// 3D interior-face element assembly for the DG trace operator. For each face f
+/// and index pairs (i1,i2), (j1,j2) it forms
+///    sum_{q1,q2} B(q1,i1)*B(q1,j1)*B(q2,i2)*B(q2,j2) * D(q1,q2,a,b,f)
+/// and adds the a==b sums to @a eadata_int and the a!=b sums to @a eadata_ext.
+/// @a padata is viewed as Q1D x Q1D x 2 x 2 x NF, each output as
+/// D1D x D1D x D1D x D1D x 2 x NF. Nonzero T_D1D/T_Q1D override d1d/q1d.
+template<int T_D1D = 0, int T_Q1D = 0>
+static void EADGTraceAssemble3DInt(const int NF,
+                                   const Array<double> &basis,
+                                   const Vector &padata,
+                                   Vector &eadata_int,
+                                   Vector &eadata_ext,
+                                   const int d1d = 0,
+                                   const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d;
+   const int Q1D = T_Q1D ? T_Q1D : q1d;
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto B = Reshape(basis.Read(), Q1D, D1D);
+   auto D = Reshape(padata.Read(), Q1D, Q1D, 2, 2, NF);
+   auto A_int = Reshape(eadata_int.ReadWrite(), D1D, D1D, D1D, D1D, 2, NF);
+   auto A_ext = Reshape(eadata_ext.ReadWrite(), D1D, D1D, D1D, D1D, 2, NF);
+   MFEM_FORALL_3D(f, NF, D1D, D1D, 1,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
+      double rB[MQ1][MD1];
+      for (int q = 0; q < Q1D; q++)
+      {
+         for (int d = 0; d < D1D; d++)
+         {
+            rB[q][d] = B(q,d);
+         }
+      }
+      // Stage the 2 x 2 entries of this face's data in the MFEM_SHARED array.
+      MFEM_SHARED double sD[MQ1][MQ1][2][2];
+      MFEM_FOREACH_THREAD(q1,x,Q1D)
+      {
+         MFEM_FOREACH_THREAD(q2,y,Q1D)
+         {
+            for (int a = 0; a < 2; a++)
+            {
+               sD[q1][q2][a][0] = D(q1,q2,a,0,f);
+               sD[q1][q2][a][1] = D(q1,q2,a,1,f);
+            }
+         }
+      }
+      MFEM_SYNC_THREAD;
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(i2,y,D1D)
+         {
+            for (int j1 = 0; j1 < D1D; ++j1)
+            {
+               for (int j2 = 0; j2 < D1D; ++j2)
+               {
+                  double sum00 = 0.0;
+                  double sum11 = 0.0;
+                  double sum01 = 0.0;
+                  double sum10 = 0.0;
+                  for (int q1 = 0; q1 < Q1D; ++q1)
+                  {
+                     for (int q2 = 0; q2 < Q1D; ++q2)
+                     {
+                        // Basis weight shared by all four accumulators.
+                        const double w = rB[q1][i1] * rB[q1][j1]
+                                         * rB[q2][i2] * rB[q2][j2];
+                        sum00 += w * sD[q1][q2][0][0];
+                        sum11 += w * sD[q1][q2][1][1];
+                        sum01 += w * sD[q1][q2][0][1];
+                        sum10 += w * sD[q1][q2][1][0];
+                     }
+                  }
+                  A_int(i1, i2, j1, j2, 0, f) += sum00;
+                  A_int(i1, i2, j1, j2, 1, f) += sum11;
+                  A_ext(i1, i2, j1, j2, 0, f) += sum01;
+                  A_ext(i1, i2, j1, j2, 1, f) += sum10;
+               }
+            }
+         }
+      }
+   });
+}
+
+/// 3D boundary-face element assembly for the DG trace operator. For each face f
+/// and index pairs (i1,i2), (j1,j2) it adds
+///    sum_{q1,q2} B(q1,i1)*B(q1,j1)*B(q2,i2)*B(q2,j2) * D(q1,q2,0,0,f)
+/// to @a eadata_bdr, viewed as D1D x D1D x D1D x D1D x NF.
+template<int T_D1D = 0, int T_Q1D = 0>
+static void EADGTraceAssemble3DBdr(const int NF, const Array<double> &basis,
+                                   const Vector &padata, Vector &eadata_bdr,
+                                   const int d1d = 0, const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d;
+   const int Q1D = T_Q1D ? T_Q1D : q1d;
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto B = Reshape(basis.Read(), Q1D, D1D);
+   auto D = Reshape(padata.Read(), Q1D, Q1D, 2, 2, NF);
+   auto A_bdr = Reshape(eadata_bdr.ReadWrite(), D1D, D1D, D1D, D1D, NF);
+   MFEM_FORALL_3D(f, NF, D1D, D1D, 1,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
+      double rB[MQ1][MD1];
+      for (int q = 0; q < Q1D; q++)
+      {
+         for (int d = 0; d < D1D; d++)
+         {
+            rB[q][d] = B(q,d);
+         }
+      }
+      // All four 2 x 2 entries are staged, although only [0][0] is read below.
+      MFEM_SHARED double sD[MQ1][MQ1][2][2];
+      MFEM_FOREACH_THREAD(q1,x,Q1D)
+      {
+         MFEM_FOREACH_THREAD(q2,y,Q1D)
+         {
+            for (int a = 0; a < 2; a++)
+            {
+               sD[q1][q2][a][0] = D(q1,q2,a,0,f);
+               sD[q1][q2][a][1] = D(q1,q2,a,1,f);
+            }
+         }
+      }
+      MFEM_SYNC_THREAD;
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(i2,y,D1D)
+         {
+            for (int j1 = 0; j1 < D1D; ++j1)
+            {
+               for (int j2 = 0; j2 < D1D; ++j2)
+               {
+                  double sum = 0.0;
+                  for (int q1 = 0; q1 < Q1D; ++q1)
+                  {
+                     for (int q2 = 0; q2 < Q1D; ++q2)
+                     {
+                        const double w = rB[q1][i1] * rB[q1][j1]
+                                         * rB[q2][i2] * rB[q2][j2];
+                        sum += w * sD[q1][q2][0][0];
+                     }
+                  }
+                  A_bdr(i1, i2, j1, j2, f) += sum;
+               }
+            }
+         }
+      }
+   });
+}
+
+/// Element assembly of the interior-face terms: runs SetupPA() for interior
+/// faces, then accumulates pa_data into ea_data_int and ea_data_ext.
+void DGTraceIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace& fes,
+                                                Vector &ea_data_int,
+                                                Vector &ea_data_ext)
+{
+   SetupPA(fes, FaceType::Interior);
+   nf = fes.GetNFbyType(FaceType::Interior);
+   if (nf == 0) { return; }
+   const Array<double> &basis = maps->B;
+   if (dim == 1)
+   {
+      return EADGTraceAssemble1DInt(nf,basis,pa_data,ea_data_int,ea_data_ext);
+   }
+   if (dim == 2)
+   {
+      switch ((dofs1D << 4) | quad1D)
+      {
+         case 0x22:
+            return EADGTraceAssemble2DInt<2,2>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x33:
+            return EADGTraceAssemble2DInt<3,3>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x44:
+            return EADGTraceAssemble2DInt<4,4>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x55:
+            return EADGTraceAssemble2DInt<5,5>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x66:
+            return EADGTraceAssemble2DInt<6,6>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x77:
+            return EADGTraceAssemble2DInt<7,7>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x88:
+            return EADGTraceAssemble2DInt<8,8>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x99:
+            return EADGTraceAssemble2DInt<9,9>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+      }
+      return EADGTraceAssemble2DInt(nf,basis,pa_data,ea_data_int,
+                                    ea_data_ext,dofs1D,quad1D);
+   }
+   if (dim == 3)
+   {
+      switch ((dofs1D << 4) | quad1D)
+      {
+         case 0x23:
+            return EADGTraceAssemble3DInt<2,3>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x34:
+            return EADGTraceAssemble3DInt<3,4>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x45:
+            return EADGTraceAssemble3DInt<4,5>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x56:
+            return EADGTraceAssemble3DInt<5,6>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x67:
+            return EADGTraceAssemble3DInt<6,7>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x78:
+            return EADGTraceAssemble3DInt<7,8>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+         case 0x89:
+            return EADGTraceAssemble3DInt<8,9>(nf,basis,pa_data,ea_data_int,
+                                               ea_data_ext);
+      }
+      return EADGTraceAssemble3DInt(nf,basis,pa_data,ea_data_int,
+                                    ea_data_ext,dofs1D,quad1D);
+   }
+   MFEM_ABORT("Unknown kernel.");
+}
+
+/// Element assembly of the boundary-face terms: runs SetupPA() for boundary
+/// faces, then accumulates pa_data into ea_data_bdr.
+void DGTraceIntegrator::AssembleEABoundaryFaces(const FiniteElementSpace& fes,
+                                                Vector &ea_data_bdr)
+{
+   SetupPA(fes, FaceType::Boundary);
+   nf = fes.GetNFbyType(FaceType::Boundary);
+   if (nf == 0) { return; }
+   const Array<double> &B = maps->B;
+   if (dim == 1)
+   {
+      return EADGTraceAssemble1DBdr(nf,B,pa_data,ea_data_bdr);
+   }
+   if (dim == 2)
+   {
+      switch ((dofs1D << 4) | quad1D)
+      {
+         case 0x22: return EADGTraceAssemble2DBdr<2,2>(nf,B,pa_data,ea_data_bdr);
+         case 0x33: return EADGTraceAssemble2DBdr<3,3>(nf,B,pa_data,ea_data_bdr);
+         case 0x44: return EADGTraceAssemble2DBdr<4,4>(nf,B,pa_data,ea_data_bdr);
+         case 0x55: return EADGTraceAssemble2DBdr<5,5>(nf,B,pa_data,ea_data_bdr);
+         case 0x66: return EADGTraceAssemble2DBdr<6,6>(nf,B,pa_data,ea_data_bdr);
+         case 0x77: return EADGTraceAssemble2DBdr<7,7>(nf,B,pa_data,ea_data_bdr);
+         case 0x88: return EADGTraceAssemble2DBdr<8,8>(nf,B,pa_data,ea_data_bdr);
+         case 0x99: return EADGTraceAssemble2DBdr<9,9>(nf,B,pa_data,ea_data_bdr);
+      }
+      return EADGTraceAssemble2DBdr(nf,B,pa_data,ea_data_bdr,dofs1D,quad1D);
+   }
+   if (dim == 3)
+   {
+      switch ((dofs1D << 4) | quad1D)
+      {
+         case 0x23: return EADGTraceAssemble3DBdr<2,3>(nf,B,pa_data,ea_data_bdr);
+         case 0x34: return EADGTraceAssemble3DBdr<3,4>(nf,B,pa_data,ea_data_bdr);
+         case 0x45: return EADGTraceAssemble3DBdr<4,5>(nf,B,pa_data,ea_data_bdr);
+         case 0x56: return EADGTraceAssemble3DBdr<5,6>(nf,B,pa_data,ea_data_bdr);
+         case 0x67: return EADGTraceAssemble3DBdr<6,7>(nf,B,pa_data,ea_data_bdr);
+         case 0x78: return EADGTraceAssemble3DBdr<7,8>(nf,B,pa_data,ea_data_bdr);
+         case 0x89: return EADGTraceAssemble3DBdr<8,9>(nf,B,pa_data,ea_data_bdr);
+      }
+      return EADGTraceAssemble3DBdr(nf,B,pa_data,ea_data_bdr,dofs1D,quad1D);
+   }
+   MFEM_ABORT("Unknown kernel.");
+}
+
+}
diff --git a/fem/bilininteg_dgtrace.cpp b/fem/bilininteg_dgtrace_pa.cpp
similarity index 100%
rename from fem/bilininteg_dgtrace.cpp
rename to fem/bilininteg_dgtrace_pa.cpp
diff --git a/fem/bilininteg_diffusion_ea.cpp b/fem/bilininteg_diffusion_ea.cpp
new file mode 100644
index 00000000000..2b40a7bb0cd
--- /dev/null
+++ b/fem/bilininteg_diffusion_ea.cpp
@@ -0,0 +1,275 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#include "../general/forall.hpp"
+#include "bilininteg.hpp"
+#include "gridfunc.hpp"
+
+namespace mfem
+{
+
+// Element assembly (EA) kernel for the 1D diffusion operator.
+// Writes A(i,j,e) = sum_q G(q,j) * D(q,e) * G(q,i) for every element e, with
+// G = g viewed as Q1D x D1D, D = padata as Q1D x NE and A = eadata as
+// D1D x D1D x NE. b is not used by this kernel. D1D and Q1D are taken from
+// T_D1D/T_Q1D when nonzero, otherwise from d1d/q1d.
+template<int T_D1D = 0, int T_Q1D = 0>
+static void EADiffusionAssemble1D(const int NE,
+                                  const Array<double> &b,
+                                  const Array<double> &g,
+                                  const Vector &padata,
+                                  Vector &eadata,
+                                  const int d1d = 0,
+                                  const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d;
+   const int Q1D = T_Q1D ? T_Q1D : q1d;
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto G = Reshape(g.Read(), Q1D, D1D);
+   auto D = Reshape(padata.Read(), Q1D, NE);
+   auto A = Reshape(eadata.Write(), D1D, D1D, NE);
+   MFEM_FORALL_3D(e, NE, D1D, D1D, 1,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(j1,y,D1D)
+         {
+            // Index G with the loop variables, not with MFEM_THREAD_ID: the
+            // thread id only equals (i1,j1) when each thread handles a single
+            // entry, which is not the case when the loops run on the host.
+            double val = 0.0;
+            for (int k1 = 0; k1 < Q1D; ++k1)
+            {
+               val += G(k1, j1) * D(k1, e) * G(k1, i1);
+            }
+            A(i1, j1, e) = val;
+         }
+      }
+   });
+}
+
+// Element assembly (EA) kernel for the 2D diffusion operator.
+// Fills A(i1,i2,j1,j2,e) for every element e from the 1D tables b and g (each
+// viewed as Q1D x D1D) and from padata, viewed as three values per quadrature
+// point and element (the off-diagonal value is used for both D01 and D10).
+template<int T_D1D = 0, int T_Q1D = 0>
+static void EADiffusionAssemble2D(
+   const int NE, const Array<double> &b, const Array<double> &g,
+   const Vector &padata, Vector &eadata, const int d1d = 0, const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d;
+   const int Q1D = T_Q1D ? T_Q1D : q1d;
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto b1d = Reshape(b.Read(), Q1D, D1D);
+   auto g1d = Reshape(g.Read(), Q1D, D1D);
+   auto qd = Reshape(padata.Read(), Q1D, Q1D, 3, NE);
+   auto ea = Reshape(eadata.Write(), D1D, D1D, D1D, D1D, NE);
+   MFEM_FORALL_3D(e, NE, D1D, D1D, 1,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
+      double Bloc[MQ1][MD1];
+      double Gloc[MQ1][MD1];
+      for (int q = 0; q < Q1D; q++)
+      {
+         for (int d = 0; d < D1D; d++)
+         {
+            Bloc[q][d] = b1d(q,d);
+            Gloc[q][d] = g1d(q,d);
+         }
+      }
+      MFEM_SYNC_THREAD;
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(i2,y,D1D)
+         {
+            for (int j1 = 0; j1 < D1D; ++j1)
+            {
+               for (int j2 = 0; j2 < D1D; ++j2)
+               {
+                  double sum = 0.0;
+                  for (int q1 = 0; q1 < Q1D; ++q1)
+                  {
+                     for (int q2 = 0; q2 < Q1D; ++q2)
+                     {
+                        const double gb_i = Gloc[q1][i1] * Bloc[q2][i2];
+                        const double bg_i = Bloc[q1][i1] * Gloc[q2][i2];
+                        const double gb_j = Gloc[q1][j1] * Bloc[q2][j2];
+                        const double bg_j = Bloc[q1][j1] * Gloc[q2][j2];
+                        const double d00 = qd(q1,q2,0,e);
+                        const double d10 = qd(q1,q2,1,e);
+                        const double d01 = d10;
+                        const double d11 = qd(q1,q2,2,e);
+                        sum += gb_i * d00 * gb_j
+                               + bg_i * d01 * gb_j
+                               + gb_i * d10 * bg_j
+                               + bg_i * d11 * bg_j;
+                     }
+                  }
+                  ea(i1, i2, j1, j2, e) = sum;
+               }
+            }
+         }
+      }
+   });
+}
+
+// Element assembly (EA) kernel for the 3D diffusion operator.
+// b and g are each viewed as Q1D x D1D and padata as six values per quadrature
+// point (D01 = D10, D02 = D20, D12 = D21). b precedes g, as in the 1D/2D
+// kernels, because AssembleEA passes (maps->B, maps->G) positionally.
+template<int T_D1D = 0, int T_Q1D = 0>
+static void EADiffusionAssemble3D(
+   const int NE, const Array<double> &b, const Array<double> &g,
+   const Vector &padata, Vector &eadata, const int d1d = 0, const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d;
+   const int Q1D = T_Q1D ? T_Q1D : q1d;
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto B = Reshape(b.Read(), Q1D, D1D);
+   auto G = Reshape(g.Read(), Q1D, D1D);
+   auto D = Reshape(padata.Read(), Q1D, Q1D, Q1D, 6, NE);
+   auto A = Reshape(eadata.Write(), D1D, D1D, D1D, D1D, D1D, D1D, NE);
+   MFEM_FORALL_3D(e, NE, D1D, D1D, D1D,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
+      double r_B[MQ1][MD1];
+      double r_G[MQ1][MD1];
+      for (int d = 0; d < D1D; d++)
+      {
+         for (int q = 0; q < Q1D; q++)
+         {
+            r_B[q][d] = B(q,d);
+            r_G[q][d] = G(q,d);
+         }
+      }
+      MFEM_SYNC_THREAD;
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(i2,y,D1D)
+         {
+            MFEM_FOREACH_THREAD(i3,z,D1D)
+            {
+               for (int j1 = 0; j1 < D1D; ++j1)
+               {
+                  for (int j2 = 0; j2 < D1D; ++j2)
+                  {
+                     for (int j3 = 0; j3 < D1D; ++j3)
+                     {
+                        double val = 0.0;
+                        for (int k1 = 0; k1 < Q1D; ++k1)
+                        {
+                           for (int k2 = 0; k2 < Q1D; ++k2)
+                           {
+                              for (int k3 = 0; k3 < Q1D; ++k3)
+                              {
+                                 double bbgi = r_G[k1][i1] * r_B[k2][i2] * r_B[k3][i3];
+                                 double bgbi = r_B[k1][i1] * r_G[k2][i2] * r_B[k3][i3];
+                                 double gbbi = r_B[k1][i1] * r_B[k2][i2] * r_G[k3][i3];
+                                 double bbgj = r_G[k1][j1] * r_B[k2][j2] * r_B[k3][j3];
+                                 double bgbj = r_B[k1][j1] * r_G[k2][j2] * r_B[k3][j3];
+                                 double gbbj = r_B[k1][j1] * r_B[k2][j2] * r_G[k3][j3];
+                                 double D00 = D(k1,k2,k3,0,e);
+                                 double D10 = D(k1,k2,k3,1,e);
+                                 double D20 = D(k1,k2,k3,2,e);
+                                 double D01 = D10;
+                                 double D11 = D(k1,k2,k3,3,e);
+                                 double D21 = D(k1,k2,k3,4,e);
+                                 double D02 = D20;
+                                 double D12 = D21;
+                                 double D22 = D(k1,k2,k3,5,e);
+                                 val += bbgi * D00 * bbgj
+                                        + bgbi * D10 * bbgj
+                                        + gbbi * D20 * bbgj
+                                        + bbgi * D01 * bgbj
+                                        + bgbi * D11 * bgbj
+                                        + gbbi * D21 * bgbj
+                                        + bbgi * D02 * gbbj
+                                        + bgbi * D12 * gbbj
+                                        + gbbi * D22 * gbbj;
+                              }
+                           }
+                        }
+                        A(i1, i2, i3, j1, j2, j3, e) = val;
+                     }
+                  }
+               }
+            }
+         }
+      }
+   });
+}
+
+// Runs AssemblePA(fes), then fills ea_data using the element-assembly kernel
+// selected by dim and by the (dofs1D, quad1D) pair; aborts for any other dim.
+void DiffusionIntegrator::AssembleEA(const FiniteElementSpace &fes,
+                                     Vector &ea_data)
+{
+   AssemblePA(fes);
+   const int ne = fes.GetMesh()->GetNE();
+   const Array<double> &B = maps->B;
+   const Array<double> &G = maps->G;
+   const int id = (dofs1D << 4) | quad1D; // e.g. 0x34: dofs1D=3, quad1D=4
+   switch (dim)
+   {
+      case 1:
+         switch (id)
+         {
+            case 0x22: return EADiffusionAssemble1D<2,2>(ne,B,G,pa_data,ea_data);
+            case 0x33: return EADiffusionAssemble1D<3,3>(ne,B,G,pa_data,ea_data);
+            case 0x44: return EADiffusionAssemble1D<4,4>(ne,B,G,pa_data,ea_data);
+            case 0x55: return EADiffusionAssemble1D<5,5>(ne,B,G,pa_data,ea_data);
+            case 0x66: return EADiffusionAssemble1D<6,6>(ne,B,G,pa_data,ea_data);
+            case 0x77: return EADiffusionAssemble1D<7,7>(ne,B,G,pa_data,ea_data);
+            case 0x88: return EADiffusionAssemble1D<8,8>(ne,B,G,pa_data,ea_data);
+            case 0x99: return EADiffusionAssemble1D<9,9>(ne,B,G,pa_data,ea_data);
+            default:   return EADiffusionAssemble1D(ne,B,G,pa_data,ea_data,dofs1D,quad1D);
+         }
+      case 2:
+         switch (id)
+         {
+            case 0x22: return EADiffusionAssemble2D<2,2>(ne,B,G,pa_data,ea_data);
+            case 0x33: return EADiffusionAssemble2D<3,3>(ne,B,G,pa_data,ea_data);
+            case 0x44: return EADiffusionAssemble2D<4,4>(ne,B,G,pa_data,ea_data);
+            case 0x55: return EADiffusionAssemble2D<5,5>(ne,B,G,pa_data,ea_data);
+            case 0x66: return EADiffusionAssemble2D<6,6>(ne,B,G,pa_data,ea_data);
+            case 0x77: return EADiffusionAssemble2D<7,7>(ne,B,G,pa_data,ea_data);
+            case 0x88: return EADiffusionAssemble2D<8,8>(ne,B,G,pa_data,ea_data);
+            case 0x99: return EADiffusionAssemble2D<9,9>(ne,B,G,pa_data,ea_data);
+            default:   return EADiffusionAssemble2D(ne,B,G,pa_data,ea_data,dofs1D,quad1D);
+         }
+      case 3:
+         switch (id)
+         {
+            case 0x23: return EADiffusionAssemble3D<2,3>(ne,B,G,pa_data,ea_data);
+            case 0x34: return EADiffusionAssemble3D<3,4>(ne,B,G,pa_data,ea_data);
+            case 0x45: return EADiffusionAssemble3D<4,5>(ne,B,G,pa_data,ea_data);
+            case 0x56: return EADiffusionAssemble3D<5,6>(ne,B,G,pa_data,ea_data);
+            case 0x67: return EADiffusionAssemble3D<6,7>(ne,B,G,pa_data,ea_data);
+            case 0x78: return EADiffusionAssemble3D<7,8>(ne,B,G,pa_data,ea_data);
+            case 0x89: return EADiffusionAssemble3D<8,9>(ne,B,G,pa_data,ea_data);
+            default:   return EADiffusionAssemble3D(ne,B,G,pa_data,ea_data,dofs1D,quad1D);
+         }
+   }
+   MFEM_ABORT("Unknown kernel.");
+}
+
+}
diff --git a/fem/bilininteg_diffusion.cpp b/fem/bilininteg_diffusion_pa.cpp
similarity index 100%
rename from fem/bilininteg_diffusion.cpp
rename to fem/bilininteg_diffusion_pa.cpp
diff --git a/fem/bilininteg_hcurl.cpp b/fem/bilininteg_hcurl.cpp
index 3ee6f63aed3..3dd3f8209bc 100644
--- a/fem/bilininteg_hcurl.cpp
+++ b/fem/bilininteg_hcurl.cpp
@@ -24,12 +24,12 @@ constexpr int HCURL_MAX_D1D = 5;
 constexpr int HCURL_MAX_Q1D = 6;
 
 // PA H(curl) Mass Assemble 2D kernel
-static void PAHcurlSetup2D(const int Q1D,
-                           const int NE,
-                           const Array<double> &w,
-                           const Vector &j,
-                           Vector &_coeff,
-                           Vector &op)
+void PAHcurlSetup2D(const int Q1D,
+                    const int NE,
+                    const Array<double> &w,
+                    const Vector &j,
+                    Vector &_coeff,
+                    Vector &op)
 {
    const int NQ = Q1D*Q1D;
    auto W = w.Read();
@@ -55,12 +55,12 @@ static void PAHcurlSetup2D(const int Q1D,
 }
 
 // PA H(curl) Mass Assemble 3D kernel
-static void PAHcurlSetup3D(const int Q1D,
-                           const int NE,
-                           const Array<double> &w,
-                           const Vector &j,
-                           Vector &_coeff,
-                           Vector &op)
+void PAHcurlSetup3D(const int Q1D,
+                    const int NE,
+                    const Array<double> &w,
+                    const Vector &j,
+                    Vector &_coeff,
+                    Vector &op)
 {
    const int NQ = Q1D*Q1D*Q1D;
    auto W = w.Read();
@@ -106,78 +106,16 @@ static void PAHcurlSetup3D(const int Q1D,
    });
 }
 
-void VectorFEMassIntegrator::AssemblePA(const FiniteElementSpace &fes)
-{
-   // Assumes tensor-product elements
-   Mesh *mesh = fes.GetMesh();
-   const FiniteElement *fel = fes.GetFE(0);
-
-   const VectorTensorFiniteElement *el =
-      dynamic_cast<const VectorTensorFiniteElement*>(fel);
-   MFEM_VERIFY(el != NULL, "Only VectorTensorFiniteElement is supported!");
-
-   const IntegrationRule *ir
-      = IntRule ? IntRule : &MassIntegrator::GetRule(*el, *el,
-                                                     *mesh->GetElementTransformation(0));
-   const int dims = el->GetDim();
-   MFEM_VERIFY(dims == 2 || dims == 3, "");
-
-   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
-   const int nq = ir->GetNPoints();
-   dim = mesh->Dimension();
-   MFEM_VERIFY(dim == 2 || dim == 3, "");
-
-   ne = fes.GetNE();
-   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
-   mapsC = &el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-   mapsO = &el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-   dofs1D = mapsC->ndof;
-   quad1D = mapsC->nqpt;
-
-   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
-
-   pa_data.SetSize(symmDims * nq * ne, Device::GetMemoryType());
-
-   Vector coeff(ne * nq);
-   coeff = 1.0;
-   if (Q)
-   {
-      for (int e=0; e<ne; ++e)
-      {
-         ElementTransformation *tr = mesh->GetElementTransformation(e);
-         for (int p=0; p<nq; ++p)
-         {
-            coeff[p + (e * nq)] = Q->Eval(*tr, ir->IntPoint(p));
-         }
-      }
-   }
-
-   if (el->GetDerivType() == mfem::FiniteElement::CURL && dim == 3)
-   {
-      PAHcurlSetup3D(quad1D, ne, ir->GetWeights(), geom->J,
-                     coeff, pa_data);
-   }
-   else if (el->GetDerivType() == mfem::FiniteElement::CURL && dim == 2)
-   {
-      PAHcurlSetup2D(quad1D, ne, ir->GetWeights(), geom->J,
-                     coeff, pa_data);
-   }
-   else
-   {
-      MFEM_ABORT("Unknown kernel.");
-   }
-}
-
-static void PAHcurlMassApply2D(const int D1D,
-                               const int Q1D,
-                               const int NE,
-                               const Array<double> &_Bo,
-                               const Array<double> &_Bc,
-                               const Array<double> &_Bot,
-                               const Array<double> &_Bct,
-                               const Vector &_op,
-                               const Vector &_x,
-                               Vector &_y)
+void PAHcurlMassApply2D(const int D1D,
+                        const int Q1D,
+                        const int NE,
+                        const Array<double> &_Bo,
+                        const Array<double> &_Bc,
+                        const Array<double> &_Bot,
+                        const Array<double> &_Bct,
+                        const Vector &_op,
+                        const Vector &_x,
+                        Vector &_y)
 {
    constexpr static int VDIM = 2;
 
@@ -294,13 +232,13 @@ static void PAHcurlMassApply2D(const int D1D,
    }); // end of element loop
 }
 
-static void PAHcurlMassAssembleDiagonal2D(const int D1D,
-                                          const int Q1D,
-                                          const int NE,
-                                          const Array<double> &_Bo,
-                                          const Array<double> &_Bc,
-                                          const Vector &_op,
-                                          Vector &_diag)
+void PAHcurlMassAssembleDiagonal2D(const int D1D,
+                                   const int Q1D,
+                                   const int NE,
+                                   const Array<double> &_Bo,
+                                   const Array<double> &_Bc,
+                                   const Vector &_op,
+                                   Vector &_diag)
 {
    constexpr static int VDIM = 2;
 
@@ -348,15 +286,17 @@ static void PAHcurlMassAssembleDiagonal2D(const int D1D,
    }); // end of element loop
 }
 
-template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-static void PAHcurlMassAssembleDiagonal3D(const int D1D,
-                                          const int Q1D,
-                                          const int NE,
-                                          const Array<double> &_Bo,
-                                          const Array<double> &_Bc,
-                                          const Vector &_op,
-                                          Vector &_diag)
+void PAHcurlMassAssembleDiagonal3D(const int D1D,
+                                   const int Q1D,
+                                   const int NE,
+                                   const Array<double> &_Bo,
+                                   const Array<double> &_Bc,
+                                   const Vector &_op,
+                                   Vector &_diag)
 {
+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
+
    MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
    MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
    constexpr static int VDIM = 3;
@@ -416,28 +356,20 @@ static void PAHcurlMassAssembleDiagonal3D(const int D1D,
    }); // end of element loop
 }
 
-void VectorFEMassIntegrator::AssembleDiagonalPA(Vector& diag)
+void PAHcurlMassApply3D(const int D1D,
+                        const int Q1D,
+                        const int NE,
+                        const Array<double> &_Bo,
+                        const Array<double> &_Bc,
+                        const Array<double> &_Bot,
+                        const Array<double> &_Bct,
+                        const Vector &_op,
+                        const Vector &_x,
+                        Vector &_y)
 {
-   if (dim == 3)
-      PAHcurlMassAssembleDiagonal3D(dofs1D, quad1D, ne,
-                                    mapsO->B, mapsC->B, pa_data, diag);
-   else
-      PAHcurlMassAssembleDiagonal2D(dofs1D, quad1D, ne,
-                                    mapsO->B, mapsC->B, pa_data, diag);
-}
+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
 
-template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-static void PAHcurlMassApply3D(const int D1D,
-                               const int Q1D,
-                               const int NE,
-                               const Array<double> &_Bo,
-                               const Array<double> &_Bc,
-                               const Array<double> &_Bot,
-                               const Array<double> &_Bct,
-                               const Vector &_op,
-                               const Vector &_x,
-                               Vector &_y)
-{
    MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
    MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
    constexpr static int VDIM = 3;
@@ -615,20 +547,6 @@ static void PAHcurlMassApply3D(const int D1D,
    }); // end of element loop
 }
 
-void VectorFEMassIntegrator::AddMultPA(const Vector &x, Vector &y) const
-{
-   if (dim == 3)
-   {
-      PAHcurlMassApply3D(dofs1D, quad1D, ne, mapsO->B, mapsC->B, mapsO->Bt,
-                         mapsC->Bt, pa_data, x, y);
-   }
-   else
-   {
-      PAHcurlMassApply2D(dofs1D, quad1D, ne, mapsO->B, mapsC->B, mapsO->Bt,
-                         mapsC->Bt, pa_data, x, y);
-   }
-}
-
 // PA H(curl) curl-curl assemble 2D kernel
 static void PACurlCurlSetup2D(const int Q1D,
                               const int NE,
@@ -1678,92 +1596,25 @@ void CurlCurlIntegrator::AssembleDiagonalPA(Vector& diag)
    }
 }
 
-void MixedVectorGradientIntegrator::AssemblePA(const FiniteElementSpace
-                                               &trial_fes,
-                                               const FiniteElementSpace &test_fes)
-{
-   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
-   Mesh *mesh = trial_fes.GetMesh();
-   const FiniteElement *trial_fel = trial_fes.GetFE(0);
-   const FiniteElement *test_fel = test_fes.GetFE(0);
-
-   const NodalTensorFiniteElement *trial_el =
-      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
-   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
-
-   const VectorTensorFiniteElement *test_el =
-      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
-   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
-
-   const IntegrationRule *ir
-      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
-                                                     *mesh->GetElementTransformation(0));
-   const int dims = trial_el->GetDim();
-   MFEM_VERIFY(dims == 2 || dims == 3, "");
-
-   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
-   const int nq = ir->GetNPoints();
-   dim = mesh->Dimension();
-   MFEM_VERIFY(dim == 2 || dim == 3, "");
-
-   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
-
-   ne = trial_fes.GetNE();
-   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
-   mapsC = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
-   mapsO = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
-   dofs1D = mapsC->ndof;
-   quad1D = mapsC->nqpt;
-
-   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
-
-   pa_data.SetSize(symmDims * nq * ne, Device::GetMemoryType());
-
-   Vector coeff(ne * nq);
-   coeff = 1.0;
-   if (Q)
-   {
-      for (int e=0; e<ne; ++e)
-      {
-         ElementTransformation *tr = mesh->GetElementTransformation(e);
-         for (int p=0; p<nq; ++p)
-         {
-            coeff[p + (e * nq)] = Q->Eval(*tr, ir->IntPoint(p));
-         }
-      }
-   }
-
-   // Use the same setup functions as VectorFEMassIntegrator.
-   if (test_el->GetDerivType() == mfem::FiniteElement::CURL && dim == 3)
-   {
-      PAHcurlSetup3D(quad1D, ne, ir->GetWeights(), geom->J,
-                     coeff, pa_data);
-   }
-   else if (test_el->GetDerivType() == mfem::FiniteElement::CURL && dim == 2)
-   {
-      PAHcurlSetup2D(quad1D, ne, ir->GetWeights(), geom->J,
-                     coeff, pa_data);
-   }
-   else
-   {
-      MFEM_ABORT("Unknown kernel.");
-   }
-}
-
 // Apply to x corresponding to DOF's in H^1 (trial), whose gradients are integrated
 // against H(curl) test functions corresponding to y.
-template<int MAX_D1D = HCURL_MAX_D1D, int MAX_Q1D = HCURL_MAX_Q1D>
-static void PAHcurlH1Apply3D(const int D1D,
-                             const int Q1D,
-                             const int NE,
-                             const Array<double> &_Bc,
-                             const Array<double> &_Gc,
-                             const Array<double> &_Bot,
-                             const Array<double> &_Bct,
-                             const Vector &_op,
-                             const Vector &_x,
-                             Vector &_y)
+void PAHcurlH1Apply3D(const int D1D,
+                      const int Q1D,
+                      const int NE,
+                      const Array<double> &_Bc,
+                      const Array<double> &_Gc,
+                      const Array<double> &_Bot,
+                      const Array<double> &_Bct,
+                      const Vector &_op,
+                      const Vector &_x,
+                      Vector &_y)
 {
+   constexpr static int MAX_D1D = HCURL_MAX_D1D;
+   constexpr static int MAX_Q1D = HCURL_MAX_Q1D;
+
+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");
+
    constexpr static int VDIM = 3;
 
    auto Bc = Reshape(_Bc.Read(), Q1D, D1D);
@@ -1937,16 +1788,16 @@ static void PAHcurlH1Apply3D(const int D1D,
 
 // Apply to x corresponding to DOF's in H^1 (trial), whose gradients are integrated
 // against H(curl) test functions corresponding to y.
-static void PAHcurlH1Apply2D(const int D1D,
-                             const int Q1D,
-                             const int NE,
-                             const Array<double> &_Bc,
-                             const Array<double> &_Gc,
-                             const Array<double> &_Bot,
-                             const Array<double> &_Bct,
-                             const Vector &_op,
-                             const Vector &_x,
-                             Vector &_y)
+void PAHcurlH1Apply2D(const int D1D,
+                      const int Q1D,
+                      const int NE,
+                      const Array<double> &_Bc,
+                      const Array<double> &_Gc,
+                      const Array<double> &_Bot,
+                      const Array<double> &_Bct,
+                      const Vector &_op,
+                      const Vector &_x,
+                      Vector &_y)
 {
    constexpr static int VDIM = 2;
 
@@ -2057,18 +1908,4 @@ static void PAHcurlH1Apply2D(const int D1D,
    }); // end of element loop
 }
 
-void MixedVectorGradientIntegrator::AddMultPA(const Vector &x, Vector &y) const
-{
-   if (dim == 3)
-      PAHcurlH1Apply3D(dofs1D, quad1D, ne, mapsC->B, mapsC->G,
-                       mapsO->Bt, mapsC->Bt, pa_data, x, y);
-   else if (dim == 2)
-      PAHcurlH1Apply2D(dofs1D, quad1D, ne, mapsC->B, mapsC->G,
-                       mapsO->Bt, mapsC->Bt, pa_data, x, y);
-   else
-   {
-      MFEM_ABORT("Unsupported dimension!");
-   }
-}
-
 } // namespace mfem
diff --git a/fem/bilininteg_hdiv.cpp b/fem/bilininteg_hdiv.cpp
new file mode 100644
index 00000000000..59c14d1c293
--- /dev/null
+++ b/fem/bilininteg_hdiv.cpp
@@ -0,0 +1,2022 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license.  We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#include "../general/forall.hpp"
+#include "bilininteg.hpp"
+#include "gridfunc.hpp"
+#include "libceed/mass.hpp"
+
+using namespace std;
+
+
+// Piola transformation in H(div): w = (1 / det (dF)) dF \hat{w}
+// div w = (1 / det (dF)) \hat{div} \hat{w}
+
+namespace mfem
+{
+
+// Local maximum size of dofs and quads in 1D: compile-time bounds for the scratch arrays of the 3D kernels in this file
+constexpr int HDIV_MAX_D1D = 5;  // upper bound on D1D, checked with MFEM_VERIFY by the 3D kernels
+constexpr int HDIV_MAX_Q1D = 6;  // upper bound on Q1D, checked with MFEM_VERIFY by the 3D kernels
+
+
+// PA H(div) Mass Assemble 2D kernel: writes to op the 3 unique entries of the symmetric 2x2 matrix (W*coeff/detJ) J^T J at every quadrature point
+void PAHdivSetup2D(const int Q1D,           // quadrature points per direction (NQ = Q1D*Q1D)
+                   const int NE,            // number of elements
+                   const Array<double> &w,  // quadrature weights, indexed by point q
+                   const Vector &j,         // Jacobian entries, read as an NQ x 2 x 2 x NE array
+                   Vector &_coeff,          // coefficient values, read as an NQ x NE array (only read here)
+                   Vector &op)              // output, written as an NQ x 3 x NE array
+{
+   const int NQ = Q1D*Q1D;
+   auto W = w.Read();
+
+   auto J = Reshape(j.Read(), NQ, 2, 2, NE);
+   auto coeff = Reshape(_coeff.Read(), NQ, NE);
+   auto y = Reshape(op.Write(), NQ, 3, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      for (int q = 0; q < NQ; ++q)
+      {
+         const double J11 = J(q,0,0,e);
+         const double J21 = J(q,1,0,e);
+         const double J12 = J(q,0,1,e);
+         const double J22 = J(q,1,1,e);
+         const double c_detJ = W[q] * coeff(q, e) / ((J11*J22)-(J21*J12)); // weight * coefficient / det(J)
+         // (c/detJ) J^T J
+         y(q,0,e) = c_detJ * (J11*J11 + J21*J21); // 1,1
+         y(q,1,e) = c_detJ * (J11*J12 + J21*J22); // 1,2
+         y(q,2,e) = c_detJ * (J12*J12 + J22*J22); // 2,2
+      }
+   });
+}
+
+// PA H(div) Mass Assemble 3D kernel: writes to op the 6 unique entries of the symmetric 3x3 matrix (W*coeff/detJ) J^T J at every quadrature point
+void PAHdivSetup3D(const int Q1D,           // quadrature points per direction (NQ = Q1D*Q1D*Q1D)
+                   const int NE,            // number of elements
+                   const Array<double> &w,  // quadrature weights, indexed by point q
+                   const Vector &j,         // Jacobian entries, read as an NQ x 3 x 3 x NE array
+                   Vector &_coeff,          // coefficient values, read as an NQ x NE array (only read here)
+                   Vector &op)              // output, written as an NQ x 6 x NE array
+{
+   const int NQ = Q1D*Q1D*Q1D;
+   auto W = w.Read();
+   auto J = Reshape(j.Read(), NQ, 3, 3, NE);
+   auto coeff = Reshape(_coeff.Read(), NQ, NE);
+   auto y = Reshape(op.Write(), NQ, 6, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      for (int q = 0; q < NQ; ++q)
+      {
+         const double J11 = J(q,0,0,e);
+         const double J21 = J(q,1,0,e);
+         const double J31 = J(q,2,0,e);
+         const double J12 = J(q,0,1,e);
+         const double J22 = J(q,1,1,e);
+         const double J32 = J(q,2,1,e);
+         const double J13 = J(q,0,2,e);
+         const double J23 = J(q,1,2,e);
+         const double J33 = J(q,2,2,e);
+         const double detJ = J11 * (J22 * J33 - J32 * J23) -
+         /* */               J21 * (J12 * J33 - J32 * J13) +
+         /* */               J31 * (J12 * J23 - J22 * J13);
+         const double c_detJ = W[q] * coeff(q, e) / detJ; // weight * coefficient / det(J)
+         // (c/detJ) J^T J
+         y(q,0,e) = c_detJ * (J11*J11 + J21*J21 + J31*J31); // 1,1
+         y(q,1,e) = c_detJ * (J12*J11 + J22*J21 + J32*J31); // 2,1
+         y(q,2,e) = c_detJ * (J13*J11 + J23*J21 + J33*J31); // 3,1
+         y(q,3,e) = c_detJ * (J12*J12 + J22*J22 + J32*J32); // 2,2
+         y(q,4,e) = c_detJ * (J13*J12 + J23*J22 + J33*J32); // 3,2
+         y(q,5,e) = c_detJ * (J13*J13 + J23*J23 + J33*J33); // 3,3
+      }
+   });
+}
+
+void PAHdivMassApply2D(const int D1D,              // 1D size of the Bc basis; Bo has D1D-1 functions
+                       const int Q1D,              // quadrature points per direction
+                       const int NE,               // number of elements
+                       const Array<double> &_Bo,   // read as a Q1D x (D1D-1) array
+                       const Array<double> &_Bc,   // read as a Q1D x D1D array
+                       const Array<double> &_Bot,  // read as a (D1D-1) x Q1D array
+                       const Array<double> &_Bct,  // read as a D1D x Q1D array
+                       const Vector &_op,          // 3 unique entries of a symmetric 2x2 matrix per point: Q1D x Q1D x 3 x NE
+                       const Vector &_x,           // input, 2*(D1D-1)*D1D values per element
+                       Vector &_y)                 // output, same layout as _x; results are added to its contents
+{  // 2D H(div) mass action per element: interpolate x to the quadrature points, apply op, project back into y.
+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D"); // guards massX[MAX_D1D] in the projection stage
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D"); // guards the MAX_Q1D-sized scratch arrays
+   constexpr static int VDIM = 2;
+   auto Bo = Reshape(_Bo.Read(), Q1D, D1D-1);
+   auto Bc = Reshape(_Bc.Read(), Q1D, D1D);
+   auto Bot = Reshape(_Bot.Read(), D1D-1, Q1D);
+   auto Bct = Reshape(_Bct.Read(), D1D, Q1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, 3, NE);
+   auto x = Reshape(_x.Read(), 2*(D1D-1)*D1D, NE);
+   auto y = Reshape(_y.ReadWrite(), 2*(D1D-1)*D1D, NE);
+   MFEM_FORALL(e, NE,
+   {
+      double mass[MAX_Q1D][MAX_Q1D][VDIM]; // both vector components at every quadrature point
+
+      for (int qy = 0; qy < Q1D; ++qy)
+      {
+         for (int qx = 0; qx < Q1D; ++qx)
+         {
+            for (int c = 0; c < VDIM; ++c)
+            {
+               mass[qy][qx][c] = 0.0;
+            }
+         }
+      }
+
+      int osc = 0; // offset of component c inside the element's dof vector
+
+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
+      {
+         const int D1Dx = (c == 1) ? D1D - 1 : D1D; // component 0: D1D in x, D1D-1 in y; component 1: the reverse
+         const int D1Dy = (c == 0) ? D1D - 1 : D1D;
+
+         for (int dy = 0; dy < D1Dy; ++dy)
+         {
+            double massX[MAX_Q1D];
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               massX[qx] = 0.0;
+            }
+
+            for (int dx = 0; dx < D1Dx; ++dx)
+            {
+               const double t = x(dx + (dy * D1Dx) + osc, e);
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  massX[qx] += t * ((c == 0) ? Bc(qx,dx) : Bo(qx,dx));
+               }
+            }
+
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               const double wy = (c == 1) ? Bc(qy,dy) : Bo(qy,dy);
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  mass[qy][qx][c] += massX[qx] * wy;
+               }
+            }
+         }
+
+         osc += D1Dx * D1Dy;
+      }  // loop (c) over components
+
+      // Apply D operator.
+      for (int qy = 0; qy < Q1D; ++qy)
+      {
+         for (int qx = 0; qx < Q1D; ++qx)
+         {
+            const double O11 = op(qx,qy,0,e);
+            const double O12 = op(qx,qy,1,e);
+            const double O22 = op(qx,qy,2,e);
+            const double massX = mass[qy][qx][0];
+            const double massY = mass[qy][qx][1];
+            mass[qy][qx][0] = (O11*massX)+(O12*massY);
+            mass[qy][qx][1] = (O12*massX)+(O22*massY);
+         }
+      }
+
+      for (int qy = 0; qy < Q1D; ++qy) // projection stage, one row of quadrature points at a time
+      {
+         osc = 0;
+
+         for (int c = 0; c < VDIM; ++c)  // loop over x, y components
+         {
+            const int D1Dx = (c == 1) ? D1D - 1 : D1D;
+            const int D1Dy = (c == 0) ? D1D - 1 : D1D;
+
+            double massX[MAX_D1D];
+            for (int dx = 0; dx < D1Dx; ++dx)
+            {
+               massX[dx] = 0;
+            }
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  massX[dx] += mass[qy][qx][c] * ((c == 0) ? Bct(dx,qx) :
+                                                  Bot(dx,qx));
+               }
+            }
+
+            for (int dy = 0; dy < D1Dy; ++dy)
+            {
+               const double wy = (c == 1) ? Bct(dy,qy) : Bot(dy,qy);
+
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  y(dx + (dy * D1Dx) + osc, e) += massX[dx] * wy;
+               }
+            }
+
+            osc += D1Dx * D1Dy;
+         }  // loop c
+      }  // loop qy
+   }); // end of element loop
+}
+
+void PAHdivMassAssembleDiagonal2D(const int D1D,             // 1D size of the Bc basis; Bo has D1D-1 functions
+                                  const int Q1D,             // quadrature points per direction
+                                  const int NE,              // number of elements
+                                  const Array<double> &_Bo,  // read as a Q1D x (D1D-1) array
+                                  const Array<double> &_Bc,  // read as a Q1D x D1D array
+                                  const Vector &_op,         // read as Q1D x Q1D x 3 x NE; only entries 0 and 2 are used
+                                  Vector &_diag)             // 2*(D1D-1)*D1D values per element; results are added to its contents
+{  // Adds, for every dof, the sum over quadrature points of (squared 1D basis values) * (diagonal op entry of the dof's component).
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D"); // guards mass[MAX_Q1D] below
+   constexpr static int VDIM = 2;
+
+   auto Bo = Reshape(_Bo.Read(), Q1D, D1D-1);
+   auto Bc = Reshape(_Bc.Read(), Q1D, D1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, 3, NE);
+   auto diag = Reshape(_diag.ReadWrite(), 2*(D1D-1)*D1D, NE);
+   MFEM_FORALL(e, NE,
+   {
+      int osc = 0; // offset of component c inside the element's dof vector
+
+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
+      {
+         const int D1Dx = (c == 1) ? D1D - 1 : D1D;
+         const int D1Dy = (c == 0) ? D1D - 1 : D1D;
+
+         for (int dy = 0; dy < D1Dy; ++dy)
+         {
+            double mass[MAX_Q1D]; // y-direction sum for each qx, reused for all dx
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               mass[qx] = 0.0;
+               for (int qy = 0; qy < Q1D; ++qy)
+               {
+                  const double wy = (c == 1) ? Bc(qy,dy) : Bo(qy,dy);
+                  mass[qx] += wy*wy*((c == 0) ? op(qx,qy,0,e) : op(qx,qy,2,e));
+               }
+            }
+
+            for (int dx = 0; dx < D1Dx; ++dx)
+            {
+               double val = 0.0;
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  const double wx = (c == 0) ? Bc(qx,dx) : Bo(qx,dx);
+                  val += mass[qx] * wx * wx;
+               }
+               diag(dx + (dy * D1Dx) + osc, e) += val;
+            }
+         }
+
+         osc += D1Dx * D1Dy;
+      }  // loop (c) over components
+   }); // end of element loop
+}
+
+void PAHdivMassAssembleDiagonal3D(const int D1D,             // 1D size of the Bc basis; Bo has D1D-1 functions
+                                  const int Q1D,             // quadrature points per direction
+                                  const int NE,              // number of elements
+                                  const Array<double> &_Bo,  // read as a Q1D x (D1D-1) array
+                                  const Array<double> &_Bc,  // read as a Q1D x D1D array
+                                  const Vector &_op,         // read as Q1D x Q1D x Q1D x 6 x NE; only entries 0, 3 and 5 are used
+                                  Vector &_diag)             // 3*(D1D-1)*(D1D-1)*D1D values per element; results are added to its contents
+{  // Adds, for every dof, the sum over quadrature points of (squared 1D basis values) * (op entry selected by opc).
+   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
+   MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
+   constexpr static int VDIM = 3;
+
+   auto Bo = Reshape(_Bo.Read(), Q1D, D1D-1);
+   auto Bc = Reshape(_Bc.Read(), Q1D, D1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, Q1D, 6, NE);
+   auto diag = Reshape(_diag.ReadWrite(), 3*(D1D-1)*(D1D-1)*D1D, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      int osc = 0; // offset of component c inside the element's dof vector
+
+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
+      {
+         const int D1Dz = (c == 2) ? D1D : D1D - 1; // component c has D1D dofs along direction c and D1D-1 along the others
+         const int D1Dy = (c == 1) ? D1D : D1D - 1;
+         const int D1Dx = (c == 0) ? D1D : D1D - 1;
+
+         const int opc = (c == 0) ? 0 : ((c == 1) ? 3 : 5); // op entry used for component c
+
+         double mass[HDIV_MAX_Q1D]; // y/z-direction sum for each qx, reused for all dx
+
+         for (int dz = 0; dz < D1Dz; ++dz)
+         {
+            for (int dy = 0; dy < D1Dy; ++dy)
+            {
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  mass[qx] = 0.0;
+                  for (int qy = 0; qy < Q1D; ++qy)
+                  {
+                     const double wy = (c == 1) ? Bc(qy,dy) : Bo(qy,dy);
+                     for (int qz = 0; qz < Q1D; ++qz)
+                     {
+                        const double wz = (c == 2) ? Bc(qz,dz) : Bo(qz,dz);
+                        mass[qx] += wy * wy * wz * wz * op(qx,qy,qz,opc,e);
+                     }
+                  }
+               }
+
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  double val = 0.0;
+                  for (int qx = 0; qx < Q1D; ++qx)
+                  {
+                     const double wx = (c == 0) ? Bc(qx,dx) : Bo(qx,dx);
+                     val += mass[qx] * wx * wx;
+                  }
+                  diag(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += val;
+               }
+            }
+         }
+
+         osc += D1Dx * D1Dy * D1Dz;
+      }  // loop c
+   }); // end of element loop
+}
+
+void PAHdivMassApply3D(const int D1D,              // 1D size of the Bc basis; Bo has D1D-1 functions
+                       const int Q1D,              // quadrature points per direction
+                       const int NE,               // number of elements
+                       const Array<double> &_Bo,   // read as a Q1D x (D1D-1) array
+                       const Array<double> &_Bc,   // read as a Q1D x D1D array
+                       const Array<double> &_Bot,  // read as a (D1D-1) x Q1D array
+                       const Array<double> &_Bct,  // read as a D1D x Q1D array
+                       const Vector &_op,          // 6 unique entries of a symmetric 3x3 matrix per point: Q1D x Q1D x Q1D x 6 x NE
+                       const Vector &_x,           // input, 3*(D1D-1)*(D1D-1)*D1D values per element
+                       Vector &_y)                 // output, same layout as _x; results are added to its contents
+{  // 3D H(div) mass action per element: interpolate x to the quadrature points, apply op, project back into y.
+   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
+   MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
+   constexpr static int VDIM = 3;
+
+   auto Bo = Reshape(_Bo.Read(), Q1D, D1D-1);
+   auto Bc = Reshape(_Bc.Read(), Q1D, D1D);
+   auto Bot = Reshape(_Bot.Read(), D1D-1, Q1D);
+   auto Bct = Reshape(_Bct.Read(), D1D, Q1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, Q1D, 6, NE);
+   auto x = Reshape(_x.Read(), 3*(D1D-1)*(D1D-1)*D1D, NE);
+   auto y = Reshape(_y.ReadWrite(), 3*(D1D-1)*(D1D-1)*D1D, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      double mass[HDIV_MAX_Q1D][HDIV_MAX_Q1D][HDIV_MAX_Q1D][VDIM]; // all three components at every quadrature point
+
+      for (int qz = 0; qz < Q1D; ++qz)
+      {
+         for (int qy = 0; qy < Q1D; ++qy)
+         {
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               for (int c = 0; c < VDIM; ++c)
+               {
+                  mass[qz][qy][qx][c] = 0.0;
+               }
+            }
+         }
+      }
+
+      int osc = 0; // offset of component c inside the element's dof vector
+
+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
+      {
+         const int D1Dz = (c == 2) ? D1D : D1D - 1; // component c has D1D dofs along direction c and D1D-1 along the others
+         const int D1Dy = (c == 1) ? D1D : D1D - 1;
+         const int D1Dx = (c == 0) ? D1D : D1D - 1;
+
+         for (int dz = 0; dz < D1Dz; ++dz)
+         {
+            double massXY[HDIV_MAX_Q1D][HDIV_MAX_Q1D]; // x and y directions already interpolated for this dz
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  massXY[qy][qx] = 0.0;
+               }
+            }
+
+            for (int dy = 0; dy < D1Dy; ++dy)
+            {
+               double massX[HDIV_MAX_Q1D]; // x direction interpolated for this (dy, dz)
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  massX[qx] = 0.0;
+               }
+
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  const double t = x(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
+                  for (int qx = 0; qx < Q1D; ++qx)
+                  {
+                     massX[qx] += t * ((c == 0) ? Bc(qx,dx) : Bo(qx,dx));
+                  }
+               }
+
+               for (int qy = 0; qy < Q1D; ++qy)
+               {
+                  const double wy = (c == 1) ? Bc(qy,dy) : Bo(qy,dy);
+                  for (int qx = 0; qx < Q1D; ++qx)
+                  {
+                     const double wx = massX[qx];
+                     massXY[qy][qx] += wx * wy;
+                  }
+               }
+            }
+
+            for (int qz = 0; qz < Q1D; ++qz)
+            {
+               const double wz = (c == 2) ? Bc(qz,dz) : Bo(qz,dz);
+               for (int qy = 0; qy < Q1D; ++qy)
+               {
+                  for (int qx = 0; qx < Q1D; ++qx)
+                  {
+                     mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
+                  }
+               }
+            }
+         }
+
+         osc += D1Dx * D1Dy * D1Dz;
+      }  // loop (c) over components
+
+      // Apply D operator.
+      for (int qz = 0; qz < Q1D; ++qz)
+      {
+         for (int qy = 0; qy < Q1D; ++qy)
+         {
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               const double O11 = op(qx,qy,qz,0,e);
+               const double O12 = op(qx,qy,qz,1,e);
+               const double O13 = op(qx,qy,qz,2,e);
+               const double O22 = op(qx,qy,qz,3,e);
+               const double O23 = op(qx,qy,qz,4,e);
+               const double O33 = op(qx,qy,qz,5,e);
+               const double massX = mass[qz][qy][qx][0];
+               const double massY = mass[qz][qy][qx][1];
+               const double massZ = mass[qz][qy][qx][2];
+               mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
+               mass[qz][qy][qx][1] = (O12*massX)+(O22*massY)+(O23*massZ);
+               mass[qz][qy][qx][2] = (O13*massX)+(O23*massY)+(O33*massZ);
+            }
+         }
+      }
+
+      for (int qz = 0; qz < Q1D; ++qz) // projection stage, one qz plane at a time
+      {
+         double massXY[HDIV_MAX_D1D][HDIV_MAX_D1D];
+
+         osc = 0;
+
+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
+         {
+            const int D1Dz = (c == 2) ? D1D : D1D - 1;
+            const int D1Dy = (c == 1) ? D1D : D1D - 1;
+            const int D1Dx = (c == 0) ? D1D : D1D - 1;
+
+            for (int dy = 0; dy < D1Dy; ++dy)
+            {
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  massXY[dy][dx] = 0;
+               }
+            }
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               double massX[HDIV_MAX_D1D];
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  massX[dx] = 0;
+               }
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  for (int dx = 0; dx < D1Dx; ++dx)
+                  {
+                     massX[dx] += mass[qz][qy][qx][c] *
+                                  ((c == 0) ? Bct(dx,qx) : Bot(dx,qx));
+                  }
+               }
+               for (int dy = 0; dy < D1Dy; ++dy)
+               {
+                  const double wy = (c == 1) ? Bct(dy,qy) : Bot(dy,qy);
+                  for (int dx = 0; dx < D1Dx; ++dx)
+                  {
+                     massXY[dy][dx] += massX[dx] * wy;
+                  }
+               }
+            }
+
+            for (int dz = 0; dz < D1Dz; ++dz)
+            {
+               const double wz = (c == 2) ? Bct(dz,qz) : Bot(dz,qz);
+               for (int dy = 0; dy < D1Dy; ++dy)
+               {
+                  for (int dx = 0; dx < D1Dx; ++dx)
+                  {
+                     y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) +=
+                        massXY[dy][dx] * wz;
+                  }
+               }
+            }
+
+            osc += D1Dx * D1Dy * D1Dz;
+         }  // loop c
+      }  // loop qz
+   }); // end of element loop
+}
+
+// PA H(div) div-div assemble 2D kernel: stores the scalar W[q] * coeff(q,e) / det(J) for every quadrature point of every element
+// NOTE(review): the original note read "this is identical to PACurlCurlSetup3D"; this kernel reads 2x2 Jacobians, so the 2D curl-curl setup kernel was presumably meant -- confirm
+static void PADivDivSetup2D(const int Q1D,           // quadrature points per direction (NQ = Q1D*Q1D)
+                            const int NE,            // number of elements
+                            const Array<double> &w,  // quadrature weights, indexed by point q
+                            const Vector &j,         // Jacobian entries, read as an NQ x 2 x 2 x NE array
+                            Vector &_coeff,          // coefficient values, read as an NQ x NE array (only read here)
+                            Vector &op)              // output, written as an NQ x NE array
+{
+   const int NQ = Q1D*Q1D;
+   auto W = w.Read();
+   auto J = Reshape(j.Read(), NQ, 2, 2, NE);
+   auto coeff = Reshape(_coeff.Read(), NQ, NE);
+   auto y = Reshape(op.Write(), NQ, NE);
+   MFEM_FORALL(e, NE,
+   {
+      for (int q = 0; q < NQ; ++q)
+      {
+         const double J11 = J(q,0,0,e);
+         const double J21 = J(q,1,0,e);
+         const double J12 = J(q,0,1,e);
+         const double J22 = J(q,1,1,e);
+         const double detJ = (J11*J22)-(J21*J12);
+         y(q,e) = W[q] * coeff(q,e) / detJ;
+      }
+   });
+}
+
+static void PADivDivSetup3D(const int Q1D,           // quadrature points per direction (NQ = Q1D*Q1D*Q1D)
+                            const int NE,            // number of elements
+                            const Array<double> &w,  // quadrature weights, indexed by point q
+                            const Vector &j,         // Jacobian entries, read as an NQ x 3 x 3 x NE array
+                            Vector &_coeff,          // coefficient values, read as an NQ x NE array (only read here)
+                            Vector &op)              // output, written as an NQ x NE array
+{  // PA H(div) div-div assemble 3D kernel: stores the scalar W[q] * coeff(q,e) / det(J) for every quadrature point.
+   const int NQ = Q1D*Q1D*Q1D;
+   auto W = w.Read();
+   auto J = Reshape(j.Read(), NQ, 3, 3, NE);
+   auto coeff = Reshape(_coeff.Read(), NQ, NE);
+   auto y = Reshape(op.Write(), NQ, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      for (int q = 0; q < NQ; ++q)
+      {
+         const double J11 = J(q,0,0,e);
+         const double J21 = J(q,1,0,e);
+         const double J31 = J(q,2,0,e);
+         const double J12 = J(q,0,1,e);
+         const double J22 = J(q,1,1,e);
+         const double J32 = J(q,2,1,e);
+         const double J13 = J(q,0,2,e);
+         const double J23 = J(q,1,2,e);
+         const double J33 = J(q,2,2,e);
+         const double detJ = J11 * (J22 * J33 - J32 * J23) -
+         /* */               J21 * (J12 * J33 - J32 * J13) +
+         /* */               J31 * (J12 * J23 - J22 * J13);
+         y(q,e) = W[q] * coeff(q, e) / detJ;
+      }
+   });
+}
+
+static void PADivDivApply2D(const int D1D,              // 1D size of the Gc basis; Bo has D1D-1 functions
+                            const int Q1D,              // quadrature points per direction
+                            const int NE,               // number of elements
+                            const Array<double> &_Bo,   // read as a Q1D x (D1D-1) array
+                            const Array<double> &_Gc,   // read as a Q1D x D1D array (used where the derivative is taken)
+                            const Array<double> &_Bot,  // read as a (D1D-1) x Q1D array
+                            const Array<double> &_Gct,  // read as a D1D x Q1D array
+                            const Vector &_op,          // one scalar per quadrature point, Q1D x Q1D x NE
+                            const Vector &_x,           // input, 2*(D1D-1)*D1D values per element
+                            Vector &_y)                 // output, same layout as _x; results are added to its contents
+{  // 2D div-div action per element: evaluate div(x) at the quadrature points, scale by op, project back into y.
+   MFEM_VERIFY(D1D <= MAX_D1D, "Error: D1D > MAX_D1D"); // guards gradX[MAX_D1D] in the projection stage
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D"); // guards the MAX_Q1D-sized scratch arrays
+   constexpr static int VDIM = 2;
+   auto Bo = Reshape(_Bo.Read(), Q1D, D1D-1);
+   auto Bot = Reshape(_Bot.Read(), D1D-1, Q1D);
+   auto Gc = Reshape(_Gc.Read(), Q1D, D1D);
+   auto Gct = Reshape(_Gct.Read(), D1D, Q1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, NE);
+   auto x = Reshape(_x.Read(), 2*(D1D-1)*D1D, NE);
+   auto y = Reshape(_y.ReadWrite(), 2*(D1D-1)*D1D, NE);
+   MFEM_FORALL(e, NE,
+   {
+      double div[MAX_Q1D][MAX_Q1D];
+
+      // div[qy][qx] will be computed as du_x/dx + du_y/dy
+
+      for (int qy = 0; qy < Q1D; ++qy)
+      {
+         for (int qx = 0; qx < Q1D; ++qx)
+         {
+            div[qy][qx] = 0;
+         }
+      }
+
+      int osc = 0; // offset of component c inside the element's dof vector
+
+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
+      {
+         const int D1Dx = (c == 1) ? D1D - 1 : D1D;
+         const int D1Dy = (c == 0) ? D1D - 1 : D1D;
+
+         for (int dy = 0; dy < D1Dy; ++dy)
+         {
+            double gradX[MAX_Q1D];
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               gradX[qx] = 0;
+            }
+
+            for (int dx = 0; dx < D1Dx; ++dx)
+            {
+               const double t = x(dx + (dy * D1Dx) + osc, e);
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  gradX[qx] += t * ((c == 0) ? Gc(qx,dx) : Bo(qx,dx));
+               }
+            }
+
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               const double wy = (c == 0) ? Bo(qy,dy) : Gc(qy,dy);
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  div[qy][qx] += gradX[qx] * wy;
+               }
+            }
+         }
+
+         osc += D1Dx * D1Dy;
+      }  // loop (c) over components
+
+      // Apply D operator.
+      for (int qy = 0; qy < Q1D; ++qy)
+      {
+         for (int qx = 0; qx < Q1D; ++qx)
+         {
+            div[qy][qx] *= op(qx,qy,e);
+         }
+      }
+
+      for (int qy = 0; qy < Q1D; ++qy) // projection stage, one row of quadrature points at a time
+      {
+         osc = 0;
+
+         for (int c = 0; c < VDIM; ++c)  // loop over x, y components
+         {
+            const int D1Dx = (c == 1) ? D1D - 1 : D1D;
+            const int D1Dy = (c == 0) ? D1D - 1 : D1D;
+
+            double gradX[MAX_D1D];
+            for (int dx = 0; dx < D1Dx; ++dx)
+            {
+               gradX[dx] = 0;
+            }
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  gradX[dx] += div[qy][qx] * (c == 0 ? Gct(dx,qx) : Bot(dx,qx));
+               }
+            }
+            for (int dy = 0; dy < D1Dy; ++dy)
+            {
+               const double wy = (c == 0) ? Bot(dy,qy) : Gct(dy,qy);
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  y(dx + (dy * D1Dx) + osc, e) += gradX[dx] * wy;
+               }
+            }
+
+            osc += D1Dx * D1Dy;
+         }  // loop c
+      }  // loop qy
+   }); // end of element loop
+}
+
+static void PADivDivApply3D(const int D1D,              // 1D size of the Gc basis; Bo has D1D-1 functions
+                            const int Q1D,              // quadrature points per direction
+                            const int NE,               // number of elements
+                            const Array<double> &_Bo,   // read as a Q1D x (D1D-1) array
+                            const Array<double> &_Gc,   // read as a Q1D x D1D array (used along the direction of component c)
+                            const Array<double> &_Bot,  // read as a (D1D-1) x Q1D array
+                            const Array<double> &_Gct,  // read as a D1D x Q1D array
+                            const Vector &_op,          // one scalar per quadrature point, Q1D x Q1D x Q1D x NE
+                            const Vector &_x,           // input, 3*(D1D-1)*(D1D-1)*D1D values per element
+                            Vector &_y)                 // output, same layout as _x; results are added to its contents
+{  // 3D div-div action per element: accumulate the three component contributions into div, scale by op, project back into y.
+   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
+   MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
+   constexpr static int VDIM = 3;
+
+   auto Bo = Reshape(_Bo.Read(), Q1D, D1D-1);
+   auto Gc = Reshape(_Gc.Read(), Q1D, D1D);
+   auto Bot = Reshape(_Bot.Read(), D1D-1, Q1D);
+   auto Gct = Reshape(_Gct.Read(), D1D, Q1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, Q1D, NE);
+   auto x = Reshape(_x.Read(), 3*(D1D-1)*(D1D-1)*D1D, NE);
+   auto y = Reshape(_y.ReadWrite(), 3*(D1D-1)*(D1D-1)*D1D, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      double div[HDIV_MAX_Q1D][HDIV_MAX_Q1D][HDIV_MAX_Q1D]; // sum over components c of the Gc-in-direction-c tensor contraction
+
+      for (int qz = 0; qz < Q1D; ++qz)
+      {
+         for (int qy = 0; qy < Q1D; ++qy)
+         {
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               div[qz][qy][qx] = 0.0;
+            }
+         }
+      }
+
+      int osc = 0; // offset of component c inside the element's dof vector
+
+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
+      {
+         const int D1Dz = (c == 2) ? D1D : D1D - 1; // component c has D1D dofs along direction c and D1D-1 along the others
+         const int D1Dy = (c == 1) ? D1D : D1D - 1;
+         const int D1Dx = (c == 0) ? D1D : D1D - 1;
+
+         for (int dz = 0; dz < D1Dz; ++dz)
+         {
+            double aXY[HDIV_MAX_Q1D][HDIV_MAX_Q1D]; // x and y directions already contracted for this dz
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  aXY[qy][qx] = 0.0;
+               }
+            }
+
+            for (int dy = 0; dy < D1Dy; ++dy)
+            {
+               double aX[HDIV_MAX_Q1D]; // x direction contracted for this (dy, dz)
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  aX[qx] = 0.0;
+               }
+
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  const double t = x(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
+                  for (int qx = 0; qx < Q1D; ++qx)
+                  {
+                     aX[qx] += t * ((c == 0) ? Gc(qx,dx) : Bo(qx,dx));
+                  }
+               }
+
+               for (int qy = 0; qy < Q1D; ++qy)
+               {
+                  const double wy = (c == 1) ? Gc(qy,dy) : Bo(qy,dy);
+                  for (int qx = 0; qx < Q1D; ++qx)
+                  {
+                     const double wx = aX[qx];
+                     aXY[qy][qx] += wx * wy;
+                  }
+               }
+            }
+
+            for (int qz = 0; qz < Q1D; ++qz)
+            {
+               const double wz = (c == 2) ? Gc(qz,dz) : Bo(qz,dz);
+               for (int qy = 0; qy < Q1D; ++qy)
+               {
+                  for (int qx = 0; qx < Q1D; ++qx)
+                  {
+                     div[qz][qy][qx] += aXY[qy][qx] * wz;
+                  }
+               }
+            }
+         }
+
+         osc += D1Dx * D1Dy * D1Dz;
+      }  // loop (c) over components
+
+      // Apply D operator.
+      for (int qz = 0; qz < Q1D; ++qz)
+      {
+         for (int qy = 0; qy < Q1D; ++qy)
+         {
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               div[qz][qy][qx] *= op(qx,qy,qz,e);
+            }
+         }
+      }
+
+      for (int qz = 0; qz < Q1D; ++qz) // projection stage, one qz plane at a time
+      {
+         double aXY[HDIV_MAX_D1D][HDIV_MAX_D1D];
+
+         osc = 0;
+
+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
+         {
+            const int D1Dz = (c == 2) ? D1D : D1D - 1;
+            const int D1Dy = (c == 1) ? D1D : D1D - 1;
+            const int D1Dx = (c == 0) ? D1D : D1D - 1;
+
+            for (int dy = 0; dy < D1Dy; ++dy)
+            {
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  aXY[dy][dx] = 0;
+               }
+            }
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               double aX[HDIV_MAX_D1D];
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  aX[dx] = 0;
+               }
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  for (int dx = 0; dx < D1Dx; ++dx)
+                  {
+                     aX[dx] += div[qz][qy][qx] *
+                               (c == 0 ? Gct(dx,qx) : Bot(dx,qx));
+                  }
+               }
+               for (int dy = 0; dy < D1Dy; ++dy)
+               {
+                  const double wy = (c == 1) ? Gct(dy,qy) : Bot(dy,qy);
+                  for (int dx = 0; dx < D1Dx; ++dx)
+                  {
+                     aXY[dy][dx] += aX[dx] * wy;
+                  }
+               }
+            }
+
+            for (int dz = 0; dz < D1Dz; ++dz)
+            {
+               const double wz = (c == 2) ? Gct(dz,qz) : Bot(dz,qz);
+               for (int dy = 0; dy < D1Dy; ++dy)
+               {
+                  for (int dx = 0; dx < D1Dx; ++dx)
+                  {
+                     y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) +=
+                        aXY[dy][dx] * wz;
+                  }
+               }
+            }
+
+            osc += D1Dx * D1Dy * D1Dz;
+         }  // loop c
+      }  // loop qz
+   }); // end of element loop
+}
+
+void DivDivIntegrator::AssemblePA(const FiniteElementSpace &fes) // fills pa_data and caches the sizes/basis tables that AddMultPA reads
+{
+   // Assumes tensor-product elements
+   Mesh *mesh = fes.GetMesh();
+   const FiniteElement *fel = fes.GetFE(0); // element 0 is taken as representative of the whole space
+
+   const VectorTensorFiniteElement *el =
+      dynamic_cast<const VectorTensorFiniteElement*>(fel);
+   MFEM_VERIFY(el != NULL, "Only VectorTensorFiniteElement is supported!");
+
+   const IntegrationRule *ir = IntRule ? IntRule : &MassIntegrator::GetRule
+                               (*el, *el, *mesh->GetElementTransformation(0)); // falls back to the MassIntegrator rule when IntRule is not set
+
+   const int dims = el->GetDim();
+   MFEM_VERIFY(dims == 2 || dims == 3, "");
+
+   const int nq = ir->GetNPoints();
+   dim = mesh->Dimension();
+   MFEM_VERIFY(dim == 2 || dim == 3, "");
+
+   ne = fes.GetNE();
+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
+   mapsC = &el->GetDofToQuad(*ir, DofToQuad::TENSOR);
+   mapsO = &el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
+   dofs1D = mapsC->ndof;
+   quad1D = mapsC->nqpt;
+
+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, ""); // mapsO must have one dof fewer and the same quadrature size
+
+   pa_data.SetSize(nq * ne, Device::GetMemoryType()); // one scalar per quadrature point per element
+
+   Vector coeff(ne * nq);
+   coeff = 1.0; // used as-is when no coefficient Q is set
+   if (Q)
+   {
+      for (int e=0; e<ne; ++e)
+      {
+         ElementTransformation *tr = mesh->GetElementTransformation(e);
+         for (int p=0; p<nq; ++p)
+         {
+            coeff[p + (e * nq)] = Q->Eval(*tr, ir->IntPoint(p)); // point index runs fastest, matching the NQ x NE layout read by the setup kernels
+         }
+      }
+   }
+
+   if (el->GetDerivType() == mfem::FiniteElement::DIV && dim == 3)
+   {
+      PADivDivSetup3D(quad1D, ne, ir->GetWeights(), geom->J, coeff, pa_data);
+   }
+   else if (el->GetDerivType() == mfem::FiniteElement::DIV && dim == 2)
+   {
+      PADivDivSetup2D(quad1D, ne, ir->GetWeights(), geom->J, coeff, pa_data);
+   }
+   else
+   {
+      MFEM_ABORT("Unknown kernel."); // reached when the element's derivative type is not DIV
+   }
+}
+
+void DivDivIntegrator::AddMultPA(const Vector &x, Vector &y) const // adds the div-div action on x to y using the data stored by AssemblePA
+{
+   if (dim == 3) // dispatch on the mesh dimension recorded in AssemblePA
+      PADivDivApply3D(dofs1D, quad1D, ne, mapsO->B, mapsC->G,
+                      mapsO->Bt, mapsC->Gt, pa_data, x, y);
+   else if (dim == 2)
+      PADivDivApply2D(dofs1D, quad1D, ne, mapsO->B, mapsC->G,
+                      mapsO->Bt, mapsC->Gt, pa_data, x, y);
+   else
+   {
+      MFEM_ABORT("Unsupported dimension!");
+   }
+}
+
+static void PADivDivAssembleDiagonal2D(const int D1D,             // 1D size of the Gc basis; Bo has D1D-1 functions
+                                       const int Q1D,             // quadrature points per direction
+                                       const int NE,              // number of elements
+                                       const Array<double> &_Bo,  // read as a Q1D x (D1D-1) array
+                                       const Array<double> &_Gc,  // read as a Q1D x D1D array
+                                       const Vector &_op,         // one scalar per quadrature point, Q1D x Q1D x NE
+                                       Vector &_diag)             // 2*(D1D-1)*D1D values per element; results are added to its contents
+{  // Adds, for every dof, the sum over quadrature points of (squared 1D basis/derivative values) * op.
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D"); // guards div[MAX_Q1D] below
+   constexpr static int VDIM = 2;
+
+   auto Bo = Reshape(_Bo.Read(), Q1D, D1D-1);
+   auto Gc = Reshape(_Gc.Read(), Q1D, D1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, NE);
+   auto diag = Reshape(_diag.ReadWrite(), 2*(D1D-1)*D1D, NE);
+   MFEM_FORALL(e, NE,
+   {
+      int osc = 0; // offset of component c inside the element's dof vector
+
+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
+      {
+         const int D1Dx = (c == 1) ? D1D - 1 : D1D;
+         const int D1Dy = (c == 0) ? D1D - 1 : D1D;
+
+         double div[MAX_Q1D]; // y-direction sum for each qx, recomputed for every dy
+
+         for (int dy = 0; dy < D1Dy; ++dy)
+         {
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               div[qx] = 0.0;
+               for (int qy = 0; qy < Q1D; ++qy)
+               {
+                  const double wy = (c == 0) ? Bo(qy,dy) : Gc(qy,dy);
+                  div[qx] += wy * wy * op(qx,qy,e);
+               }
+            }
+
+            for (int dx = 0; dx < D1Dx; ++dx)
+            {
+               double val = 0.0;
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  const double wx = (c == 0) ? Gc(qx,dx) : Bo(qx,dx);
+                  val += div[qx] * wx * wx;
+               }
+               diag(dx + (dy * D1Dx) + osc, e) += val;
+            }
+         }
+
+         osc += D1Dx * D1Dy;
+      }  // loop c
+   });
+}
+
+static void PADivDivAssembleDiagonal3D(const int D1D,  // 1D dof count of _Gc
+                                       const int Q1D,  // 1D quadrature points
+                                       const int NE,   // number of elements
+                                       const Array<double> &_Bo,  // Q1D x (D1D-1)
+                                       const Array<double> &_Gc,  // Q1D x D1D
+                                       const Vector &_op,  // Q1D x Q1D x Q1D x NE
+                                       Vector &_diag)  // 3*(D1D-1)^2*D1D x NE
+{  // Adds, per element dof, sum_q op(q) * (squared 1D basis factors) to _diag.
+   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
+   MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
+   constexpr static int VDIM = 3;
+
+   auto Bo = Reshape(_Bo.Read(), Q1D, D1D-1);
+   auto Gc = Reshape(_Gc.Read(), Q1D, D1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, Q1D, NE);
+   auto diag = Reshape(_diag.ReadWrite(), 3*(D1D-1)*(D1D-1)*D1D, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      int osc = 0;  // offset of component c's dofs within the element
+
+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
+      {
+         const int D1Dz = (c == 2) ? D1D : D1D - 1;
+         const int D1Dy = (c == 1) ? D1D : D1D - 1;
+         const int D1Dx = (c == 0) ? D1D : D1D - 1;
+         // Component c uses Gc along direction c and Bo along the other two.
+         for (int dz = 0; dz < D1Dz; ++dz)
+         {
+            for (int dy = 0; dy < D1Dy; ++dy)
+            {
+               double a[HDIV_MAX_Q1D];  // y/z-contracted weights, one per qx
+
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  a[qx] = 0.0;
+                  for (int qy = 0; qy < Q1D; ++qy)
+                  {
+                     const double wy = (c == 1) ? Gc(qy,dy) : Bo(qy,dy);
+
+                     for (int qz = 0; qz < Q1D; ++qz)
+                     {
+                        const double wz = (c == 2) ? Gc(qz,dz) : Bo(qz,dz);
+                        a[qx] += wy * wy * wz * wz * op(qx,qy,qz,e);
+                     }
+                  }
+               }
+
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  double val = 0.0;
+                  for (int qx = 0; qx < Q1D; ++qx)
+                  {
+                     const double wx = (c == 0) ? Gc(qx,dx) : Bo(qx,dx);
+                     val += a[qx] * wx * wx;
+                  }
+                  diag(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += val;
+               }
+            }
+         }
+
+         osc += D1Dx * D1Dy * D1Dz;
+      }  // loop c
+   }); // end of element loop
+}
+
+void DivDivIntegrator::AssembleDiagonalPA(Vector& diag)
+{  // Dispatch on dim; anything other than 2 or 3 aborts, as in AddMultPA.
+   if (dim == 3)
+      PADivDivAssembleDiagonal3D(dofs1D, quad1D, ne,
+                                 mapsO->B, mapsC->G, pa_data, diag);
+   else if (dim == 2)
+      PADivDivAssembleDiagonal2D(dofs1D, quad1D, ne,
+                                 mapsO->B, mapsC->G, pa_data, diag);
+   else
+   {
+      MFEM_ABORT("Unsupported dimension!");
+   }
+}
+
+// 2D setup kernel for PA H(div)-L2 (div u, p): op(q,e) = w[q] * _coeff(q,e).
+static void PADivL2Setup2D(const int Q1D,
+                           const int NE,
+                           const Array<double> &w,
+                           Vector &_coeff,
+                           Vector &op)
+{
+   const int numPts = Q1D*Q1D;  // quadrature points per element
+   auto weights = w.Read();
+   auto scale = Reshape(_coeff.Read(), numPts, NE);
+   auto data = Reshape(op.Write(), numPts, NE);
+   MFEM_FORALL(el, NE,
+   {
+      for (int pt = 0; pt < numPts; ++pt)
+      {
+         data(pt,el) = weights[pt] * scale(pt,el);
+      }
+   });
+}
+
+static void PADivL2Setup3D(const int Q1D,
+                           const int NE,
+                           const Array<double> &w,
+                           Vector &_coeff,
+                           Vector &op)
+{
+   // 3D counterpart of the 2D setup: op(q,e) = w[q] * _coeff(q,e).
+   const int numPts = Q1D*Q1D*Q1D;  // quadrature points per element
+   auto weights = w.Read();
+   auto scale = Reshape(_coeff.Read(), numPts, NE);
+   auto data = Reshape(op.Write(), numPts, NE);
+   MFEM_FORALL(el, NE,
+   {
+      for (int pt = 0; pt < numPts; ++pt)
+      {
+         data(pt,el) = weights[pt] * scale(pt, el);
+      }
+   });
+}
+
+void
+VectorFEDivergenceIntegrator::AssemblePA(const FiniteElementSpace &trial_fes,
+                                         const FiniteElementSpace &test_fes)
+{
+   // Assumes tensor-product elements, with a vector trial space and
+   // scalar test space.
+   Mesh *mesh = trial_fes.GetMesh();
+   const FiniteElement *trial_fel = trial_fes.GetFE(0);  // only element 0 is inspected
+   const FiniteElement *test_fel = test_fes.GetFE(0);
+
+   const VectorTensorFiniteElement *trial_el =
+      dynamic_cast<const VectorTensorFiniteElement*>(trial_fel);
+   MFEM_VERIFY(trial_el != NULL, "Only VectorTensorFiniteElement is supported!");
+
+   const NodalTensorFiniteElement *test_el =
+      dynamic_cast<const NodalTensorFiniteElement*>(test_fel);
+   MFEM_VERIFY(test_el != NULL, "Only NodalTensorFiniteElement is supported!");
+   // Use the caller-supplied rule if set, otherwise the mass-integrator default.
+   const IntegrationRule *ir = IntRule ? IntRule : &MassIntegrator::GetRule(
+                                  *trial_el, *trial_el,
+                                  *mesh->GetElementTransformation(0));
+
+   const int dims = trial_el->GetDim();  // element dimension (checked only)
+   MFEM_VERIFY(dims == 2 || dims == 3, "");
+
+   const int nq = ir->GetNPoints();
+   dim = mesh->Dimension();  // cached; the apply/diagonal methods dispatch on it
+   MFEM_VERIFY(dim == 2 || dim == 3, "");
+
+   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder() + 1, "");
+
+   ne = trial_fes.GetNE();
+   mapsC = &trial_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
+   mapsO = &trial_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
+   dofs1D = mapsC->ndof;
+   quad1D = mapsC->nqpt;
+
+   L2mapsO = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR);
+   L2dofs1D = L2mapsO->ndof;
+   // The kernels assume one fewer open dof per direction and a tensor rule.
+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
+   if (dim == 2)
+   {
+      MFEM_VERIFY(nq == quad1D * quad1D, "");
+   }
+   else
+   {
+      MFEM_VERIFY(nq == quad1D * quad1D * quad1D, "");
+   }
+
+   pa_data.SetSize(nq * ne, Device::GetMemoryType());
+
+   Vector coeff(ne * nq);
+   coeff = 1.0;  // default when no coefficient Q is set
+   if (Q)
+   {
+      for (int e=0; e<ne; ++e)
+      {
+         ElementTransformation *tr = mesh->GetElementTransformation(e);
+         for (int p=0; p<nq; ++p)
+         {
+            coeff[p + (e * nq)] = Q->Eval(*tr, ir->IntPoint(p));
+         }
+      }
+   }
+   // Fill pa_data with (quadrature weight * coeff) via the setup kernel.
+   if (trial_el->GetDerivType() == mfem::FiniteElement::DIV && dim == 3)
+   {
+      PADivL2Setup3D(quad1D, ne, ir->GetWeights(), coeff, pa_data);
+   }
+   else if (trial_el->GetDerivType() == mfem::FiniteElement::DIV && dim == 2)
+   {
+      PADivL2Setup2D(quad1D, ne, ir->GetWeights(), coeff, pa_data);
+   }
+   else
+   {
+      MFEM_ABORT("Unknown kernel.");
+   }
+}
+
+// Apply to x corresponding to DOF's in H(div) (trial), whose divergence is
+// integrated against L_2 test functions corresponding to y. Adds into y.
+static void PAHdivL2Apply3D(const int D1D,    // 1D dof count of _Gc
+                            const int Q1D,    // 1D quadrature points
+                            const int L2D1D,  // 1D dof count of the L2 basis
+                            const int NE,     // number of elements
+                            const Array<double> &_Bo,     // Q1D x (D1D-1)
+                            const Array<double> &_Gc,     // Q1D x D1D
+                            const Array<double> &_L2Bot,  // L2D1D x Q1D
+                            const Vector &_op,  // Q1D x Q1D x Q1D x NE
+                            const Vector &_x,   // 3*(D1D-1)^2*D1D x NE
+                            Vector &_y)         // L2D1D^3 x NE, accumulated
+{
+   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
+   MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
+   constexpr static int VDIM = 3;
+
+   auto Bo = Reshape(_Bo.Read(), Q1D, D1D-1);
+   auto Gc = Reshape(_Gc.Read(), Q1D, D1D);
+   auto L2Bot = Reshape(_L2Bot.Read(), L2D1D, Q1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, Q1D, NE);
+   auto x = Reshape(_x.Read(), 3*(D1D-1)*(D1D-1)*D1D, NE);
+   auto y = Reshape(_y.ReadWrite(), L2D1D, L2D1D, L2D1D, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      double div[HDIV_MAX_Q1D][HDIV_MAX_Q1D][HDIV_MAX_Q1D];  // summed over c
+
+      for (int qz = 0; qz < Q1D; ++qz)
+      {
+         for (int qy = 0; qy < Q1D; ++qy)
+         {
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               div[qz][qy][qx] = 0.0;
+            }
+         }
+      }
+
+      int osc = 0;  // offset of component c's dofs within the element
+
+      for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
+      {
+         const int D1Dz = (c == 2) ? D1D : D1D - 1;
+         const int D1Dy = (c == 1) ? D1D : D1D - 1;
+         const int D1Dx = (c == 0) ? D1D : D1D - 1;
+         // Contract x, then y, then z; direction c uses Gc, the others Bo.
+         for (int dz = 0; dz < D1Dz; ++dz)
+         {
+            double aXY[HDIV_MAX_Q1D][HDIV_MAX_Q1D];
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  aXY[qy][qx] = 0.0;
+               }
+            }
+
+            for (int dy = 0; dy < D1Dy; ++dy)
+            {
+               double aX[HDIV_MAX_Q1D];
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  aX[qx] = 0.0;
+               }
+
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  const double t = x(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
+                  for (int qx = 0; qx < Q1D; ++qx)
+                  {
+                     aX[qx] += t * ((c == 0) ? Gc(qx,dx) : Bo(qx,dx));
+                  }
+               }
+
+               for (int qy = 0; qy < Q1D; ++qy)
+               {
+                  const double wy = (c == 1) ? Gc(qy,dy) : Bo(qy,dy);
+                  for (int qx = 0; qx < Q1D; ++qx)
+                  {
+                     aXY[qy][qx] += aX[qx] * wy;
+                  }
+               }
+            }
+
+            for (int qz = 0; qz < Q1D; ++qz)
+            {
+               const double wz = (c == 2) ? Gc(qz,dz) : Bo(qz,dz);
+               for (int qy = 0; qy < Q1D; ++qy)
+               {
+                  for (int qx = 0; qx < Q1D; ++qx)
+                  {
+                     div[qz][qy][qx] += aXY[qy][qx] * wz;
+                  }
+               }
+            }
+         }
+
+         osc += D1Dx * D1Dy * D1Dz;
+      }  // loop (c) over components
+
+      // Apply D operator.
+      for (int qz = 0; qz < Q1D; ++qz)
+      {
+         for (int qy = 0; qy < Q1D; ++qy)
+         {
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               div[qz][qy][qx] *= op(qx,qy,qz,e);
+            }
+         }
+      }
+      // Project the scaled values onto the L2 dofs with L2Bot, plane by plane.
+      for (int qz = 0; qz < Q1D; ++qz)
+      {
+         double aXY[HDIV_MAX_D1D][HDIV_MAX_D1D];  // indexed by L2 dofs here
+
+         for (int dy = 0; dy < L2D1D; ++dy)
+         {
+            for (int dx = 0; dx < L2D1D; ++dx)
+            {
+               aXY[dy][dx] = 0;
+            }
+         }
+         for (int qy = 0; qy < Q1D; ++qy)
+         {
+            double aX[HDIV_MAX_D1D];
+            for (int dx = 0; dx < L2D1D; ++dx)
+            {
+               aX[dx] = 0;
+            }
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               for (int dx = 0; dx < L2D1D; ++dx)
+               {
+                  aX[dx] += div[qz][qy][qx] * L2Bot(dx,qx);
+               }
+            }
+            for (int dy = 0; dy < L2D1D; ++dy)
+            {
+               const double wy = L2Bot(dy,qy);
+               for (int dx = 0; dx < L2D1D; ++dx)
+               {
+                  aXY[dy][dx] += aX[dx] * wy;
+               }
+            }
+         }
+
+         for (int dz = 0; dz < L2D1D; ++dz)
+         {
+            const double wz = L2Bot(dz,qz);
+            for (int dy = 0; dy < L2D1D; ++dy)
+            {
+               for (int dx = 0; dx < L2D1D; ++dx)
+               {
+                  y(dx,dy,dz,e) += aXY[dy][dx] * wz;
+               }
+            }
+         }
+      }  // loop qz
+   }); // end of element loop
+}
+
+// Apply to x corresponding to DOF's in H(div) (trial), whose divergence is
+// integrated against L_2 test functions corresponding to y. Adds into y.
+static void PAHdivL2Apply2D(const int D1D,    // 1D dof count of _Gc
+                            const int Q1D,    // 1D quadrature points
+                            const int L2D1D,  // 1D dof count of the L2 basis
+                            const int NE,     // number of elements
+                            const Array<double> &_Bo,     // Q1D x (D1D-1)
+                            const Array<double> &_Gc,     // Q1D x D1D
+                            const Array<double> &_L2Bot,  // L2D1D x Q1D
+                            const Vector &_op,  // Q1D x Q1D x NE
+                            const Vector &_x,   // 2*(D1D-1)*D1D x NE
+                            Vector &_y)         // L2D1D x L2D1D x NE, accumulated
+{
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");      // div[], aX[]
+   MFEM_VERIFY(L2D1D <= MAX_D1D, "Error: L2D1D > MAX_D1D");  // output aX[]
+   constexpr static int VDIM = 2;
+   auto Bo = Reshape(_Bo.Read(), Q1D, D1D-1);
+   auto Gc = Reshape(_Gc.Read(), Q1D, D1D);
+   auto L2Bot = Reshape(_L2Bot.Read(), L2D1D, Q1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, NE);
+   auto x = Reshape(_x.Read(), 2*(D1D-1)*D1D, NE);
+   auto y = Reshape(_y.ReadWrite(), L2D1D, L2D1D, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      double div[MAX_Q1D][MAX_Q1D];  // summed over both components
+      for (int qy = 0; qy < Q1D; ++qy)
+      {
+         for (int qx = 0; qx < Q1D; ++qx)
+         {
+            div[qy][qx] = 0.0;
+         }
+      }
+
+      int osc = 0;  // offset of component c's dofs within the element
+
+      for (int c = 0; c < VDIM; ++c)  // loop over x, y components
+      {
+         const int D1Dy = (c == 1) ? D1D : D1D - 1;
+         const int D1Dx = (c == 0) ? D1D : D1D - 1;
+         // Contract x, then y; direction c uses Gc, the other one Bo.
+         for (int dy = 0; dy < D1Dy; ++dy)
+         {
+            double aX[MAX_Q1D];
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               aX[qx] = 0.0;
+            }
+
+            for (int dx = 0; dx < D1Dx; ++dx)
+            {
+               const double t = x(dx + (dy * D1Dx) + osc, e);
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  aX[qx] += t * ((c == 0) ? Gc(qx,dx) : Bo(qx,dx));
+               }
+            }
+
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               const double wy = (c == 1) ? Gc(qy,dy) : Bo(qy,dy);
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  div[qy][qx] += aX[qx] * wy;
+               }
+            }
+         }
+
+         osc += D1Dx * D1Dy;
+      }  // loop (c) over components
+
+      // Apply D operator.
+      for (int qy = 0; qy < Q1D; ++qy)
+      {
+         for (int qx = 0; qx < Q1D; ++qx)
+         {
+            div[qy][qx] *= op(qx,qy,e);
+         }
+      }
+      // Project the scaled values onto the L2 dofs with L2Bot, row by row.
+      for (int qy = 0; qy < Q1D; ++qy)
+      {
+         double aX[MAX_D1D];
+         for (int dx = 0; dx < L2D1D; ++dx)
+         {
+            aX[dx] = 0;
+         }
+         for (int qx = 0; qx < Q1D; ++qx)
+         {
+            for (int dx = 0; dx < L2D1D; ++dx)
+            {
+               aX[dx] += div[qy][qx] * L2Bot(dx,qx);
+            }
+         }
+         for (int dy = 0; dy < L2D1D; ++dy)
+         {
+            const double wy = L2Bot(dy,qy);
+            for (int dx = 0; dx < L2D1D; ++dx)
+            {
+               y(dx,dy,e) += aX[dx] * wy;
+            }
+         }
+      }
+   }); // end of element loop
+}
+
+static void PAHdivL2ApplyTranspose3D(const int D1D,    // 1D dof count of _Gct
+                                     const int Q1D,    // 1D quadrature points
+                                     const int L2D1D,  // 1D L2 dof count
+                                     const int NE,     // number of elements
+                                     const Array<double> &_L2Bo,  // Q1D x L2D1D
+                                     const Array<double> &_Gct,   // D1D x Q1D
+                                     const Array<double> &_Bot,   // (D1D-1) x Q1D
+                                     const Vector &_op,  // Q1D x Q1D x Q1D x NE
+                                     const Vector &_x,   // L2D1D^3 x NE
+                                     Vector &_y)  // 3*(D1D-1)^2*D1D x NE, added to
+{
+   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
+   MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
+   constexpr static int VDIM = 3;
+
+   auto L2Bo = Reshape(_L2Bo.Read(), Q1D, L2D1D);
+   auto Gct = Reshape(_Gct.Read(), D1D, Q1D);
+   auto Bot = Reshape(_Bot.Read(), D1D-1, Q1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, Q1D, NE);
+   auto x = Reshape(_x.Read(), L2D1D, L2D1D, L2D1D, NE);
+   auto y = Reshape(_y.ReadWrite(), 3*(D1D-1)*(D1D-1)*D1D, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      double div[HDIV_MAX_Q1D][HDIV_MAX_Q1D][HDIV_MAX_Q1D];  // x at quad points
+
+      for (int qz = 0; qz < Q1D; ++qz)
+      {
+         for (int qy = 0; qy < Q1D; ++qy)
+         {
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               div[qz][qy][qx] = 0.0;
+            }
+         }
+      }
+      // Interpolate the L2 input to the quadrature points with L2Bo.
+      for (int dz = 0; dz < L2D1D; ++dz)
+      {
+         double aXY[HDIV_MAX_Q1D][HDIV_MAX_Q1D];
+         for (int qy = 0; qy < Q1D; ++qy)
+         {
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               aXY[qy][qx] = 0.0;
+            }
+         }
+
+         for (int dy = 0; dy < L2D1D; ++dy)
+         {
+            double aX[HDIV_MAX_Q1D];
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               aX[qx] = 0.0;
+            }
+
+            for (int dx = 0; dx < L2D1D; ++dx)
+            {
+               const double t = x(dx,dy,dz,e);
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  aX[qx] += t * L2Bo(qx,dx);
+               }
+            }
+
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               const double wy = L2Bo(qy,dy);
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  aXY[qy][qx] += aX[qx] * wy;
+               }
+            }
+         }
+
+         for (int qz = 0; qz < Q1D; ++qz)
+         {
+            const double wz = L2Bo(qz,dz);
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  div[qz][qy][qx] += aXY[qy][qx] * wz;
+               }
+            }
+         }
+      }
+
+      // Apply D operator.
+      for (int qz = 0; qz < Q1D; ++qz)
+      {
+         for (int qy = 0; qy < Q1D; ++qy)
+         {
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               div[qz][qy][qx] *= op(qx,qy,qz,e);
+            }
+         }
+      }
+      // Contract back onto the H(div) dofs; direction c uses Gct, others Bot.
+      for (int qz = 0; qz < Q1D; ++qz)
+      {
+         double aXY[HDIV_MAX_D1D][HDIV_MAX_D1D];
+
+         int osc = 0;  // offset of component c's dofs within the element
+         for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
+         {
+            const int D1Dz = (c == 2) ? D1D : D1D - 1;
+            const int D1Dy = (c == 1) ? D1D : D1D - 1;
+            const int D1Dx = (c == 0) ? D1D : D1D - 1;
+
+            for (int dy = 0; dy < D1Dy; ++dy)
+            {
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  aXY[dy][dx] = 0;
+               }
+            }
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               double aX[HDIV_MAX_D1D];
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  aX[dx] = 0;
+               }
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  for (int dx = 0; dx < D1Dx; ++dx)
+                  {
+                     aX[dx] += div[qz][qy][qx] * ((c == 0) ? Gct(dx,qx) :
+                                                  Bot(dx,qx));
+                  }
+               }
+               for (int dy = 0; dy < D1Dy; ++dy)
+               {
+                  const double wy = (c == 1) ? Gct(dy,qy) : Bot(dy,qy);
+                  for (int dx = 0; dx < D1Dx; ++dx)
+                  {
+                     aXY[dy][dx] += aX[dx] * wy;
+                  }
+               }
+            }
+
+            for (int dz = 0; dz < D1Dz; ++dz)
+            {
+               const double wz = (c == 2) ? Gct(dz,qz) : Bot(dz,qz);
+               for (int dy = 0; dy < D1Dy; ++dy)
+               {
+                  for (int dx = 0; dx < D1Dx; ++dx)
+                  {
+                     y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) +=
+                        aXY[dy][dx] * wz;
+                  }
+               }
+            }
+
+            osc += D1Dx * D1Dy * D1Dz;
+         }  // loop c
+      }  // loop qz
+   }); // end of element loop
+}
+
+static void PAHdivL2ApplyTranspose2D(const int D1D,    // 1D dof count of _Gct
+                                     const int Q1D,    // 1D quadrature points
+                                     const int L2D1D,  // 1D L2 dof count
+                                     const int NE,     // number of elements
+                                     const Array<double> &_L2Bo,  // Q1D x L2D1D
+                                     const Array<double> &_Gct,   // D1D x Q1D
+                                     const Array<double> &_Bot,   // (D1D-1) x Q1D
+                                     const Vector &_op,  // Q1D x Q1D x NE
+                                     const Vector &_x,   // L2D1D x L2D1D x NE
+                                     Vector &_y)  // 2*(D1D-1)*D1D x NE, added to
+{
+   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");  // output aX[]
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "Error: Q1D > MAX_Q1D");            // div[], aX[]
+   constexpr static int VDIM = 2;
+   auto L2Bo = Reshape(_L2Bo.Read(), Q1D, L2D1D);
+   auto Gct = Reshape(_Gct.Read(), D1D, Q1D);
+   auto Bot = Reshape(_Bot.Read(), D1D-1, Q1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, NE);
+   auto x = Reshape(_x.Read(), L2D1D, L2D1D, NE);
+   auto y = Reshape(_y.ReadWrite(), 2*(D1D-1)*D1D, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      double div[MAX_Q1D][MAX_Q1D];  // x at quadrature points
+
+      for (int qy = 0; qy < Q1D; ++qy)
+      {
+         for (int qx = 0; qx < Q1D; ++qx)
+         {
+            div[qy][qx] = 0.0;
+         }
+      }
+      // Interpolate the L2 input to the quadrature points with L2Bo.
+      for (int dy = 0; dy < L2D1D; ++dy)
+      {
+         double aX[MAX_Q1D];
+         for (int qx = 0; qx < Q1D; ++qx)
+         {
+            aX[qx] = 0.0;
+         }
+
+         for (int dx = 0; dx < L2D1D; ++dx)
+         {
+            const double t = x(dx,dy,e);
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               aX[qx] += t * L2Bo(qx,dx);
+            }
+         }
+
+         for (int qy = 0; qy < Q1D; ++qy)
+         {
+            const double wy = L2Bo(qy,dy);
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               div[qy][qx] += aX[qx] * wy;
+            }
+         }
+      }
+
+      // Apply D operator.
+      for (int qy = 0; qy < Q1D; ++qy)
+      {
+         for (int qx = 0; qx < Q1D; ++qx)
+         {
+            div[qy][qx] *= op(qx,qy,e);
+         }
+      }
+      // Contract back onto the H(div) dofs; direction c uses Gct, other Bot.
+      for (int qy = 0; qy < Q1D; ++qy)
+      {
+         double aX[HDIV_MAX_D1D];
+
+         int osc = 0;  // offset of component c's dofs within the element
+         for (int c = 0; c < VDIM; ++c)  // loop over x, y components
+         {
+            const int D1Dy = (c == 1) ? D1D : D1D - 1;
+            const int D1Dx = (c == 0) ? D1D : D1D - 1;
+
+            for (int dx = 0; dx < D1Dx; ++dx)
+            {
+               aX[dx] = 0;
+            }
+            for (int qx = 0; qx < Q1D; ++qx)
+            {
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  aX[dx] += div[qy][qx] * ((c == 0) ? Gct(dx,qx) : Bot(dx,qx));
+               }
+            }
+            for (int dy = 0; dy < D1Dy; ++dy)
+            {
+               const double wy = (c == 0) ? Bot(dy,qy) : Gct(dy,qy);
+               for (int dx = 0; dx < D1Dx; ++dx)
+               {
+                  y(dx + (dy * D1Dx) + osc, e) += aX[dx] * wy;
+               }
+            }
+
+            osc += D1Dx * D1Dy;
+         }  // loop c
+      }  // loop qy
+   }); // end of element loop
+}
+
+void VectorFEDivergenceIntegrator::AddMultPA(const Vector &x, Vector &y) const
+{
+   // Kernels exist only for 2D and 3D meshes.
+   if (dim != 2 && dim != 3)
+   {
+      MFEM_ABORT("Unsupported dimension!");
+   }
+   // Both kernels share one signature, so select one and call it once.
+   // x holds the H(div) (trial) dofs; the L2 (test) result is added into y.
+   auto *apply = (dim == 3) ? &PAHdivL2Apply3D : &PAHdivL2Apply2D;
+   apply(dofs1D, quad1D, L2dofs1D, ne, mapsO->B, mapsC->G,
+         L2mapsO->Bt, pa_data, x, y);
+}
+
+void VectorFEDivergenceIntegrator::AddMultTransposePA(const Vector &x,
+                                                      Vector &y) const
+{
+   // Transposed action: x is indexed by L2 dofs, y by H(div) dofs (added to).
+   const bool is3D = (dim == 3);
+   if (!is3D && dim != 2)
+   {
+      MFEM_ABORT("Unsupported dimension!");
+   }
+   auto *applyT = is3D ? &PAHdivL2ApplyTranspose3D
+                  : &PAHdivL2ApplyTranspose2D;
+   applyT(dofs1D, quad1D, L2dofs1D, ne, L2mapsO->B,
+          mapsC->Gt, mapsO->Bt, pa_data, x, y);
+}
+
+static void PAHdivL2AssembleDiagonal_ADAt_3D(const int D1D,    // 1D dofs of _Gct
+                                             const int Q1D,    // 1D quad points
+                                             const int L2D1D,  // 1D L2 dofs
+                                             const int NE,     // elements
+                                             const Array<double> &_L2Bo,  // Q1D x L2D1D
+                                             const Array<double> &_Gct,   // D1D x Q1D
+                                             const Array<double> &_Bot,   // (D1D-1) x Q1D
+                                             const Vector &_op,  // Q1D^3 x NE
+                                             const Vector &_D,   // 3*(D1D-1)^2*D1D x NE
+                                             Vector &_diag)  // L2D1D^3 x NE, added to
+{  // For each L2 dof r, adds sum_i row_r[i]^2 * D(i,e) to diag(r,e).
+   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");
+   MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");
+   constexpr static int VDIM = 3;
+
+   auto L2Bo = Reshape(_L2Bo.Read(), Q1D, L2D1D);
+   auto Gct = Reshape(_Gct.Read(), D1D, Q1D);
+   auto Bot = Reshape(_Bot.Read(), D1D-1, Q1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, Q1D, NE);
+   auto D = Reshape(_D.Read(), 3*(D1D-1)*(D1D-1)*D1D, NE);
+   auto diag = Reshape(_diag.ReadWrite(), L2D1D, L2D1D, L2D1D, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      for (int rz = 0; rz < L2D1D; ++rz)
+      {
+         for (int ry = 0; ry < L2D1D; ++ry)
+         {
+            for (int rx = 0; rx < L2D1D; ++rx)
+            {
+               // Compute row (rx,ry,rz), assuming all contributions are from
+               // a single element.
+
+               double row[3*HDIV_MAX_D1D*(HDIV_MAX_D1D-1)*(HDIV_MAX_D1D-1)];
+               double div[HDIV_MAX_Q1D][HDIV_MAX_Q1D][HDIV_MAX_Q1D];
+
+               for (int i=0; i<3*D1D*(D1D - 1)*(D1D - 1); ++i)
+               {
+                  row[i] = 0;
+               }
+               // L2 basis function (rx,ry,rz) at each point, scaled by op.
+               for (int qz = 0; qz < Q1D; ++qz)
+               {
+                  for (int qy = 0; qy < Q1D; ++qy)
+                  {
+                     for (int qx = 0; qx < Q1D; ++qx)
+                     {
+                        div[qz][qy][qx] = op(qx,qy,qz,e) * L2Bo(qx,rx) *
+                                          L2Bo(qy,ry) * L2Bo(qz,rz);
+                     }
+                  }
+               }
+               // Contract onto the H(div) dofs, as in the transpose kernel.
+               for (int qz = 0; qz < Q1D; ++qz)
+               {
+                  double aXY[HDIV_MAX_D1D][HDIV_MAX_D1D];
+
+                  int osc = 0;  // offset of component c's dofs in row[]
+                  for (int c = 0; c < VDIM; ++c)  // loop over x, y, z components
+                  {
+                     const int D1Dz = (c == 2) ? D1D : D1D - 1;
+                     const int D1Dy = (c == 1) ? D1D : D1D - 1;
+                     const int D1Dx = (c == 0) ? D1D : D1D - 1;
+
+                     for (int dy = 0; dy < D1Dy; ++dy)
+                     {
+                        for (int dx = 0; dx < D1Dx; ++dx)
+                        {
+                           aXY[dy][dx] = 0;
+                        }
+                     }
+                     for (int qy = 0; qy < Q1D; ++qy)
+                     {
+                        double aX[HDIV_MAX_D1D];
+                        for (int dx = 0; dx < D1Dx; ++dx)
+                        {
+                           aX[dx] = 0;
+                        }
+                        for (int qx = 0; qx < Q1D; ++qx)
+                        {
+                           for (int dx = 0; dx < D1Dx; ++dx)
+                           {
+                              aX[dx] += div[qz][qy][qx] * ((c == 0) ? Gct(dx,qx)
+                                                           : Bot(dx,qx));
+                           }
+                        }
+                        for (int dy = 0; dy < D1Dy; ++dy)
+                        {
+                           const double wy = (c == 1) ? Gct(dy,qy) : Bot(dy,qy);
+                           for (int dx = 0; dx < D1Dx; ++dx)
+                           {
+                              aXY[dy][dx] += aX[dx] * wy;
+                           }
+                        }
+                     }
+
+                     for (int dz = 0; dz < D1Dz; ++dz)
+                     {
+                        const double wz = (c == 2) ? Gct(dz,qz) : Bot(dz,qz);
+                        for (int dy = 0; dy < D1Dy; ++dy)
+                        {
+                           for (int dx = 0; dx < D1Dx; ++dx)
+                           {
+                              row[dx + ((dy + (dz * D1Dy)) * D1Dx) + osc] +=
+                                 aXY[dy][dx] * wz;
+                           }
+                        }
+                     }
+
+                     osc += D1Dx * D1Dy * D1Dz;
+                  }  // loop c
+               }  // loop qz
+
+               double val = 0.0;  // sum_i row[i]^2 * D(i,e)
+               for (int i=0; i<3*D1D*(D1D - 1)*(D1D - 1); ++i)
+               {
+                  val += row[i] * row[i] * D(i,e);
+               }
+               diag(rx,ry,rz,e) += val;
+            }  // loop rx
+         }  // loop ry
+      }  // loop rz
+   }); // end of element loop
+}
+
+static void PAHdivL2AssembleDiagonal_ADAt_2D(const int D1D,    // 1D dofs of _Gct
+                                             const int Q1D,    // 1D quad points
+                                             const int L2D1D,  // 1D L2 dofs
+                                             const int NE,     // elements
+                                             const Array<double> &_L2Bo,  // Q1D x L2D1D
+                                             const Array<double> &_Gct,   // D1D x Q1D
+                                             const Array<double> &_Bot,   // (D1D-1) x Q1D
+                                             const Vector &_op,  // Q1D x Q1D x NE
+                                             const Vector &_D,   // 2*(D1D-1)*D1D x NE
+                                             Vector &_diag)  // L2D1D^2 x NE, added to
+{  // For each L2 dof r, adds sum_i row_r[i]^2 * D(i,e) to diag(r,e).
+   MFEM_VERIFY(D1D <= HDIV_MAX_D1D, "Error: D1D > HDIV_MAX_D1D");  // row[], aX[]
+   MFEM_VERIFY(Q1D <= HDIV_MAX_Q1D, "Error: Q1D > HDIV_MAX_Q1D");  // div[][]
+   constexpr static int VDIM = 2;
+   auto L2Bo = Reshape(_L2Bo.Read(), Q1D, L2D1D);
+   auto Gct = Reshape(_Gct.Read(), D1D, Q1D);
+   auto Bot = Reshape(_Bot.Read(), D1D-1, Q1D);
+   auto op = Reshape(_op.Read(), Q1D, Q1D, NE);
+   auto D = Reshape(_D.Read(), 2*(D1D-1)*D1D, NE);
+   auto diag = Reshape(_diag.ReadWrite(), L2D1D, L2D1D, NE);
+
+   MFEM_FORALL(e, NE,
+   {
+      for (int ry = 0; ry < L2D1D; ++ry)
+      {
+         for (int rx = 0; rx < L2D1D; ++rx)
+         {
+            // Compute row (rx,ry), assuming all contributions are from
+            // a single element.
+
+            double row[2*HDIV_MAX_D1D*(HDIV_MAX_D1D-1)];
+            double div[HDIV_MAX_Q1D][HDIV_MAX_Q1D];
+
+            for (int i=0; i<2*D1D*(D1D - 1); ++i)
+            {
+               row[i] = 0;
+            }
+            // L2 basis function (rx,ry) at each point, scaled by op.
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               for (int qx = 0; qx < Q1D; ++qx)
+               {
+                  div[qy][qx] = op(qx,qy,e) * L2Bo(qx,rx) * L2Bo(qy,ry);
+               }
+            }
+            // Contract onto the H(div) dofs; direction c uses Gct, other Bot.
+            for (int qy = 0; qy < Q1D; ++qy)
+            {
+               int osc = 0;  // offset of component c's dofs in row[]
+               for (int c = 0; c < VDIM; ++c)  // loop over x, y components
+               {
+                  const int D1Dy = (c == 1) ? D1D : D1D - 1;
+                  const int D1Dx = (c == 0) ? D1D : D1D - 1;
+
+                  double aX[HDIV_MAX_D1D];
+                  for (int dx = 0; dx < D1Dx; ++dx)
+                  {
+                     aX[dx] = 0;
+                  }
+                  for (int qx = 0; qx < Q1D; ++qx)
+                  {
+                     for (int dx = 0; dx < D1Dx; ++dx)
+                     {
+                        aX[dx] += div[qy][qx] * ((c == 0) ? Gct(dx,qx) :
+                                                 Bot(dx,qx));
+                     }
+                  }
+
+                  for (int dy = 0; dy < D1Dy; ++dy)
+                  {
+                     const double wy = (c == 1) ? Gct(dy,qy) : Bot(dy,qy);
+
+                     for (int dx = 0; dx < D1Dx; ++dx)
+                     {
+                        row[dx + (dy * D1Dx) + osc] += aX[dx] * wy;
+                     }
+                  }
+
+                  osc += D1Dx * D1Dy;
+               }  // loop c
+            }  // loop qy
+
+            double val = 0.0;  // sum_i row[i]^2 * D(i,e)
+            for (int i=0; i<2*D1D*(D1D - 1); ++i)
+            {
+               val += row[i] * row[i] * D(i,e);
+            }
+            diag(rx,ry,e) += val;
+         }  // loop rx
+      }  // loop ry
+   }); // end of element loop
+}
+
+void VectorFEDivergenceIntegrator::AssembleDiagonalPA_ADAt(const Vector &D,
+                                                           Vector &diag)
+{
+   // Select the kernel for the cached mesh dimension; both take identical
+   // arguments, so the call itself is written only once.
+   auto *kernel = (dim == 3) ? &PAHdivL2AssembleDiagonal_ADAt_3D :
+                  (dim == 2) ? &PAHdivL2AssembleDiagonal_ADAt_2D : nullptr;
+   if (!kernel)
+   {
+      MFEM_ABORT("Unsupported dimension!");
+   }
+   kernel(dofs1D, quad1D, L2dofs1D, ne, L2mapsO->B,
+          mapsC->Gt, mapsO->Bt, pa_data, D, diag);
+}
+
+} // namespace mfem
diff --git a/fem/bilininteg_mass_ea.cpp b/fem/bilininteg_mass_ea.cpp
new file mode 100644
index 00000000000..16aad03eefc
--- /dev/null
+++ b/fem/bilininteg_mass_ea.cpp
@@ -0,0 +1,255 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#include "../general/forall.hpp"
+#include "bilininteg.hpp"
+#include "gridfunc.hpp"
+
+namespace mfem
+{
+
+/// Element-assembly (EA) mass kernel in 1D. For every element e it fills the
+/// dense matrix M(i,j,e) = sum_q B(q,i) * B(q,j) * D(q,e), where B is @a basis
+/// viewed as Q1D x D1D, D is @a padata viewed as Q1D x NE, and M is @a eadata
+/// viewed as D1D x D1D x NE. Nonzero T_D1D/T_Q1D fix the sizes at compile
+/// time; otherwise the run-time values @a d1d / @a q1d are used.
+template<int T_D1D = 0, int T_Q1D = 0>
+static void EAMassAssemble1D(const int NE,
+                             const Array<double> &basis,
+                             const Vector &padata,
+                             Vector &eadata,
+                             const int d1d = 0,
+                             const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d;
+   const int Q1D = T_Q1D ? T_Q1D : q1d;
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto B = Reshape(basis.Read(), Q1D, D1D);
+   auto D = Reshape(padata.Read(), Q1D, NE);
+   auto M = Reshape(eadata.Write(), D1D, D1D, NE);
+   MFEM_FORALL_3D(e, NE, D1D, D1D, 1,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      // Read B with the loop indices i1/j1. Caching B(:,MFEM_THREAD_ID(x|y))
+      // ahead of the loops is only right when every (i1,j1) pair is its own
+      // thread; where the thread loops run serially it does not follow i1/j1.
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(j1,y,D1D)
+         {
+            double val = 0.0;
+            for (int k1 = 0; k1 < Q1D; ++k1)
+            {
+               val += B(k1,i1) * B(k1,j1) * D(k1, e);
+            }
+            M(i1, j1, e) = val;
+         }
+      }
+   });
+}
+
+template<int T_D1D = 0, int T_Q1D = 0> // nonzero values fix the sizes statically
+static void EAMassAssemble2D(const int NE,
+                             const Array<double> &basis,
+                             const Vector &padata,
+                             Vector &eadata,
+                             const int d1d = 0,
+                             const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d; // template value wins over d1d
+   const int Q1D = T_Q1D ? T_Q1D : q1d; // template value wins over q1d
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto B = Reshape(basis.Read(), Q1D, D1D); // basis viewed as Q1D x D1D
+   auto D = Reshape(padata.Read(), Q1D, Q1D, NE); // PA data, Q1D x Q1D x NE
+   auto M = Reshape(eadata.Write(), D1D, D1D, D1D, D1D, NE); // output matrices
+   MFEM_FORALL_3D(e, NE, D1D, D1D, 1,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D; // static bound for arrays
+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D; // static bound for arrays
+      double r_B[MQ1][MD1]; // local copy of the whole basis table
+      for (int d = 0; d < D1D; d++)
+      {
+         for (int q = 0; q < Q1D; q++)
+         {
+            r_B[q][d] = B(q,d);
+         }
+      }
+      MFEM_SHARED double s_D[MQ1][MQ1]; // copy of this element's D(:,:,e)
+      MFEM_FOREACH_THREAD(k1,x,Q1D)
+      {
+         MFEM_FOREACH_THREAD(k2,y,Q1D)
+         {
+            s_D[k1][k2] = D(k1,k2,e);
+         }
+      }
+      MFEM_SYNC_THREAD; // presumably a barrier so s_D is complete before use
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(i2,y,D1D)
+         {
+            for (int j1 = 0; j1 < D1D; ++j1)
+            {
+               for (int j2 = 0; j2 < D1D; ++j2)
+               {
+                  double val = 0.0; // entry for row (i1,i2), column (j1,j2)
+                  for (int k1 = 0; k1 < Q1D; ++k1)
+                  {
+                     for (int k2 = 0; k2 < Q1D; ++k2)
+                     {
+                        val += r_B[k1][i1] * r_B[k1][j1]
+                               * r_B[k2][i2] * r_B[k2][j2]
+                               * s_D[k1][k2];
+                     }
+                  }
+                  M(i1, i2, j1, j2, e) = val; // assigned, not accumulated
+               }
+            }
+         }
+      }
+   });
+}
+
+template<int T_D1D = 0, int T_Q1D = 0> // nonzero values fix the sizes statically
+static void EAMassAssemble3D(const int NE,
+                             const Array<double> &basis,
+                             const Vector &padata,
+                             Vector &eadata,
+                             const int d1d = 0,
+                             const int q1d = 0)
+{
+   const int D1D = T_D1D ? T_D1D : d1d; // template value wins over d1d
+   const int Q1D = T_Q1D ? T_Q1D : q1d; // template value wins over q1d
+   MFEM_VERIFY(D1D <= MAX_D1D, "");
+   MFEM_VERIFY(Q1D <= MAX_Q1D, "");
+   auto B = Reshape(basis.Read(), Q1D, D1D); // basis viewed as Q1D x D1D
+   auto D = Reshape(padata.Read(), Q1D, Q1D, Q1D, NE); // PA data, Q1D^3 x NE
+   auto M = Reshape(eadata.Write(), D1D, D1D, D1D, D1D, D1D, D1D, NE); // output
+   MFEM_FORALL_3D(e, NE, D1D, D1D, D1D,
+   {
+      const int D1D = T_D1D ? T_D1D : d1d;
+      const int Q1D = T_Q1D ? T_Q1D : q1d;
+      constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D; // static bound for arrays
+      constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D; // static bound for arrays
+      double r_B[MQ1][MD1]; // local copy of the whole basis table
+      for (int d = 0; d < D1D; d++)
+      {
+         for (int q = 0; q < Q1D; q++)
+         {
+            r_B[q][d] = B(q,d);
+         }
+      }
+      MFEM_SHARED double s_D[MQ1][MQ1][MQ1]; // copy of this element's D(:,:,:,e)
+      MFEM_FOREACH_THREAD(k1,x,Q1D)
+      {
+         MFEM_FOREACH_THREAD(k2,y,Q1D)
+         {
+            MFEM_FOREACH_THREAD(k3,z,Q1D)
+            {
+               s_D[k1][k2][k3] = D(k1,k2,k3,e);
+            }
+         }
+      }
+      MFEM_SYNC_THREAD; // presumably a barrier so s_D is complete before use
+      MFEM_FOREACH_THREAD(i1,x,D1D)
+      {
+         MFEM_FOREACH_THREAD(i2,y,D1D)
+         {
+            MFEM_FOREACH_THREAD(i3,z,D1D)
+            {
+               for (int j1 = 0; j1 < D1D; ++j1)
+               {
+                  for (int j2 = 0; j2 < D1D; ++j2)
+                  {
+                     for (int j3 = 0; j3 < D1D; ++j3)
+                     {
+                        double val = 0.0; // row (i1,i2,i3), column (j1,j2,j3)
+                        for (int k1 = 0; k1 < Q1D; ++k1)
+                        {
+                           for (int k2 = 0; k2 < Q1D; ++k2)
+                           {
+                              for (int k3 = 0; k3 < Q1D; ++k3)
+                              {
+                                 val += r_B[k1][i1] * r_B[k1][j1]
+                                        * r_B[k2][i2] * r_B[k2][j2]
+                                        * r_B[k3][i3] * r_B[k3][j3]
+                                        * s_D[k1][k2][k3];
+                              }
+                           }
+                        }
+                        M(i1, i2, i3, j1, j2, j3, e) = val; // assigned, not added
+                     }
+                  }
+               }
+            }
+         }
+      }
+   });
+}
+
+void MassIntegrator::AssembleEA(const FiniteElementSpace &fes,
+                                Vector &ea_data)
+{
+   AssemblePA(fes); // presumably sets dim, dofs1D, quad1D, maps, pa_data - confirm
+   const int ne = fes.GetMesh()->GetNE();
+   const Array<double> &B = maps->B;
+   if (dim == 1)
+   {
+      switch ((dofs1D << 4 ) | quad1D) // key 0xDQ: D = dofs1D, Q = quad1D
+      {
+         case 0x22: return EAMassAssemble1D<2,2>(ne,B,pa_data,ea_data);
+         case 0x33: return EAMassAssemble1D<3,3>(ne,B,pa_data,ea_data);
+         case 0x44: return EAMassAssemble1D<4,4>(ne,B,pa_data,ea_data);
+         case 0x55: return EAMassAssemble1D<5,5>(ne,B,pa_data,ea_data);
+         case 0x66: return EAMassAssemble1D<6,6>(ne,B,pa_data,ea_data);
+         case 0x77: return EAMassAssemble1D<7,7>(ne,B,pa_data,ea_data);
+         case 0x88: return EAMassAssemble1D<8,8>(ne,B,pa_data,ea_data);
+         case 0x99: return EAMassAssemble1D<9,9>(ne,B,pa_data,ea_data);
+         default:   return EAMassAssemble1D(ne,B,pa_data,ea_data,dofs1D,quad1D);
+      }
+   }
+   else if (dim == 2)
+   {
+      switch ((dofs1D << 4 ) | quad1D) // same key; specialized for Q == D
+      {
+         case 0x22: return EAMassAssemble2D<2,2>(ne,B,pa_data,ea_data);
+         case 0x33: return EAMassAssemble2D<3,3>(ne,B,pa_data,ea_data);
+         case 0x44: return EAMassAssemble2D<4,4>(ne,B,pa_data,ea_data);
+         case 0x55: return EAMassAssemble2D<5,5>(ne,B,pa_data,ea_data);
+         case 0x66: return EAMassAssemble2D<6,6>(ne,B,pa_data,ea_data);
+         case 0x77: return EAMassAssemble2D<7,7>(ne,B,pa_data,ea_data);
+         case 0x88: return EAMassAssemble2D<8,8>(ne,B,pa_data,ea_data);
+         case 0x99: return EAMassAssemble2D<9,9>(ne,B,pa_data,ea_data);
+         default:   return EAMassAssemble2D(ne,B,pa_data,ea_data,dofs1D,quad1D);
+      }
+   }
+   else if (dim == 3)
+   {
+      switch ((dofs1D << 4 ) | quad1D) // same key; specialized for Q == D + 1
+      {
+         case 0x23: return EAMassAssemble3D<2,3>(ne,B,pa_data,ea_data);
+         case 0x34: return EAMassAssemble3D<3,4>(ne,B,pa_data,ea_data);
+         case 0x45: return EAMassAssemble3D<4,5>(ne,B,pa_data,ea_data);
+         case 0x56: return EAMassAssemble3D<5,6>(ne,B,pa_data,ea_data);
+         case 0x67: return EAMassAssemble3D<6,7>(ne,B,pa_data,ea_data);
+         case 0x78: return EAMassAssemble3D<7,8>(ne,B,pa_data,ea_data);
+         case 0x89: return EAMassAssemble3D<8,9>(ne,B,pa_data,ea_data);
+         default:   return EAMassAssemble3D(ne,B,pa_data,ea_data,dofs1D,quad1D);
+      }
+   }
+   MFEM_ABORT("Unknown kernel."); // reached only when dim is not 1, 2 or 3
+}
+
+}
diff --git a/fem/bilininteg_mass.cpp b/fem/bilininteg_mass_pa.cpp
similarity index 100%
rename from fem/bilininteg_mass.cpp
rename to fem/bilininteg_mass_pa.cpp
diff --git a/fem/bilininteg_transpose_ea.cpp b/fem/bilininteg_transpose_ea.cpp
new file mode 100644
index 00000000000..645efe1cc05
--- /dev/null
+++ b/fem/bilininteg_transpose_ea.cpp
@@ -0,0 +1,103 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#include "../general/forall.hpp"
+#include "bilininteg.hpp"
+
+namespace mfem
+{
+
+/// Adds the transpose of each element matrix assembled by bfi into ea_data.
+void TransposeIntegrator::AssembleEA(const FiniteElementSpace &fes,
+                                     Vector &ea_data)
+{
+   Vector ea_data_tmp(ea_data.Size()); // zeroed scratch for the untransposed data
+   ea_data_tmp = 0.0;
+   bfi->AssembleEA(fes, ea_data_tmp);
+   const int ne = fes.GetNE();
+   if (ne == 0) { return; }
+   const int dofs = fes.GetFE(0)->GetDof(); // element 0's size used for all
+   auto A = Reshape(ea_data_tmp.Read(), dofs, dofs, ne); // only read below
+   auto AT = Reshape(ea_data.ReadWrite(), dofs, dofs, ne); // accumulated into
+   MFEM_FORALL(e, ne,
+   {
+      for (int i = 0; i < dofs; i++)
+      {
+         for (int j = 0; j < dofs; j++)
+         {
+            AT(j, i, e) += A(i, j, e);
+         }
+      }
+   });
+}
+
+void TransposeIntegrator::AssembleEAInteriorFaces(const FiniteElementSpace& fes,
+                                                  Vector &ea_data_int,
+                                                  Vector &ea_data_ext)
+{
+   const int nf = fes.GetNFbyType(FaceType::Interior);
+   if (nf == 0) { return; } // no interior faces: outputs are left untouched
+   Vector ea_data_int_tmp(ea_data_int.Size()); // scratch, same size as output
+   Vector ea_data_ext_tmp(ea_data_ext.Size()); // scratch, same size as output
+   ea_data_int_tmp = 0.0;
+   ea_data_ext_tmp = 0.0;
+   bfi->AssembleEAInteriorFaces(fes, ea_data_int_tmp, ea_data_ext_tmp);
+   const int faceDofs = fes.GetTraceElement(0, // face 0's size used for all
+                                            fes.GetMesh()->GetFaceBaseGeometry(0))->GetDof();
+   auto A_int = Reshape(ea_data_int_tmp.Read(), faceDofs, faceDofs, 2, nf);
+   auto A_ext = Reshape(ea_data_ext_tmp.Read(), faceDofs, faceDofs, 2, nf);
+   auto AT_int = Reshape(ea_data_int.ReadWrite(), faceDofs, faceDofs, 2, nf);
+   auto AT_ext = Reshape(ea_data_ext.ReadWrite(), faceDofs, faceDofs, 2, nf);
+   MFEM_FORALL(f, nf,
+   {
+      for (int i = 0; i < faceDofs; i++)
+      {
+         for (int j = 0; j < faceDofs; j++)
+         {
+            const double a_int0 = A_int(i, j, 0, f);
+            const double a_int1 = A_int(i, j, 1, f);
+            const double a_ext0 = A_ext(i, j, 0, f);
+            const double a_ext1 = A_ext(i, j, 1, f);
+            AT_int(j, i, 0, f) += a_int0; // interior blocks: transposed in place
+            AT_int(j, i, 1, f) += a_int1;
+            AT_ext(j, i, 0, f) += a_ext1; // exterior blocks: transposed and
+            AT_ext(j, i, 1, f) += a_ext0; // exchanged between slots 0 and 1
+         }
+      }
+   });
+}
+
+void TransposeIntegrator::AssembleEABoundaryFaces(const FiniteElementSpace& fes,
+                                                  Vector &ea_data_bdr)
+{
+   const int nf = fes.GetNFbyType(FaceType::Boundary);
+   if (nf == 0) { return; } // no boundary faces: output is left untouched
+   Vector ea_data_bdr_tmp(ea_data_bdr.Size()); // scratch, same size as output
+   ea_data_bdr_tmp = 0.0;
+   bfi->AssembleEABoundaryFaces(fes, ea_data_bdr_tmp);
+   const int faceDofs = fes.GetTraceElement(0, // face 0's size used for all
+                                            fes.GetMesh()->GetFaceBaseGeometry(0))->GetDof();
+   auto A_bdr = Reshape(ea_data_bdr_tmp.Read(), faceDofs, faceDofs, nf);
+   auto AT_bdr = Reshape(ea_data_bdr.ReadWrite(), faceDofs, faceDofs, nf);
+   MFEM_FORALL(f, nf,
+   {
+      for (int i = 0; i < faceDofs; i++)
+      {
+         for (int j = 0; j < faceDofs; j++)
+         {
+            const double a_bdr = A_bdr(i, j, f);
+            AT_bdr(j, i, f) += a_bdr; // add the transposed entry to the output
+         }
+      }
+   });
+}
+
+}
diff --git a/fem/bilininteg_vectorfe.cpp b/fem/bilininteg_vectorfe.cpp
new file mode 100644
index 00000000000..9059bdd9178
--- /dev/null
+++ b/fem/bilininteg_vectorfe.cpp
@@ -0,0 +1,379 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#include "bilininteg.hpp"
+
+namespace mfem
+{
+
+void PAHcurlSetup2D(const int Q1D,
+                    const int NE,
+                    const Array<double> &w,
+                    const Vector &j,
+                    Vector &_coeff,
+                    Vector &op);
+
+void PAHcurlSetup3D(const int Q1D,
+                    const int NE,
+                    const Array<double> &w,
+                    const Vector &j,
+                    Vector &_coeff,
+                    Vector &op);
+
+void PAHcurlMassAssembleDiagonal2D(const int D1D,
+                                   const int Q1D,
+                                   const int NE,
+                                   const Array<double> &_Bo,
+                                   const Array<double> &_Bc,
+                                   const Vector &_op,
+                                   Vector &_diag);
+
+void PAHcurlMassAssembleDiagonal3D(const int D1D,
+                                   const int Q1D,
+                                   const int NE,
+                                   const Array<double> &_Bo,
+                                   const Array<double> &_Bc,
+                                   const Vector &_op,
+                                   Vector &_diag);
+
+void PAHcurlMassApply2D(const int D1D,
+                        const int Q1D,
+                        const int NE,
+                        const Array<double> &_Bo,
+                        const Array<double> &_Bc,
+                        const Array<double> &_Bot,
+                        const Array<double> &_Bct,
+                        const Vector &_op,
+                        const Vector &_x,
+                        Vector &_y);
+
+void PAHcurlMassApply3D(const int D1D,
+                        const int Q1D,
+                        const int NE,
+                        const Array<double> &_Bo,
+                        const Array<double> &_Bc,
+                        const Array<double> &_Bot,
+                        const Array<double> &_Bct,
+                        const Vector &_op,
+                        const Vector &_x,
+                        Vector &_y);
+
+void PAHdivSetup2D(const int Q1D,
+                   const int NE,
+                   const Array<double> &w,
+                   const Vector &j,
+                   Vector &_coeff,
+                   Vector &op);
+
+void PAHdivSetup3D(const int Q1D,
+                   const int NE,
+                   const Array<double> &w,
+                   const Vector &j,
+                   Vector &_coeff,
+                   Vector &op);
+
+void PAHcurlH1Apply2D(const int D1D,
+                      const int Q1D,
+                      const int NE,
+                      const Array<double> &_Bc,
+                      const Array<double> &_Gc,
+                      const Array<double> &_Bot,
+                      const Array<double> &_Bct,
+                      const Vector &_op,
+                      const Vector &_x,
+                      Vector &_y);
+
+void PAHcurlH1Apply3D(const int D1D,
+                      const int Q1D,
+                      const int NE,
+                      const Array<double> &_Bc,
+                      const Array<double> &_Gc,
+                      const Array<double> &_Bot,
+                      const Array<double> &_Bct,
+                      const Vector &_op,
+                      const Vector &_x,
+                      Vector &_y);
+
+void PAHdivMassAssembleDiagonal2D(const int D1D,
+                                  const int Q1D,
+                                  const int NE,
+                                  const Array<double> &_Bo,
+                                  const Array<double> &_Bc,
+                                  const Vector &_op,
+                                  Vector &_diag);
+
+void PAHdivMassAssembleDiagonal3D(const int D1D,
+                                  const int Q1D,
+                                  const int NE,
+                                  const Array<double> &_Bo,
+                                  const Array<double> &_Bc,
+                                  const Vector &_op,
+                                  Vector &_diag);
+
+void PAHdivMassApply2D(const int D1D,
+                       const int Q1D,
+                       const int NE,
+                       const Array<double> &_Bo,
+                       const Array<double> &_Bc,
+                       const Array<double> &_Bot,
+                       const Array<double> &_Bct,
+                       const Vector &_op,
+                       const Vector &_x,
+                       Vector &_y);
+
+void PAHdivMassApply3D(const int D1D,
+                       const int Q1D,
+                       const int NE,
+                       const Array<double> &_Bo,
+                       const Array<double> &_Bc,
+                       const Array<double> &_Bot,
+                       const Array<double> &_Bct,
+                       const Vector &_op,
+                       const Vector &_x,
+                       Vector &_y);
+
+void VectorFEMassIntegrator::AssemblePA(const FiniteElementSpace &fes)
+{
+   // PA setup. Assumes tensor-product elements; element 0 is used as the model.
+   Mesh *mesh = fes.GetMesh();
+   const FiniteElement *fel = fes.GetFE(0);
+
+   const VectorTensorFiniteElement *el =
+      dynamic_cast<const VectorTensorFiniteElement*>(fel);
+   MFEM_VERIFY(el != NULL, "Only VectorTensorFiniteElement is supported!");
+
+   const IntegrationRule *ir // user-set IntRule if any, else the mass rule
+      = IntRule ? IntRule : &MassIntegrator::GetRule(*el, *el,
+                                                     *mesh->GetElementTransformation(0));
+   const int dims = el->GetDim();
+   MFEM_VERIFY(dims == 2 || dims == 3, "");
+
+   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
+   const int nq = ir->GetNPoints();
+   dim = mesh->Dimension();
+   MFEM_VERIFY(dim == 2 || dim == 3, "");
+
+   ne = fes.GetNE();
+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
+   mapsC = &el->GetDofToQuad(*ir, DofToQuad::TENSOR);
+   mapsO = &el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
+   dofs1D = mapsC->ndof; // 1D sizes are taken from the mapsC tables
+   quad1D = mapsC->nqpt;
+
+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
+
+   pa_data.SetSize(symmDims * nq * ne, Device::GetMemoryType());
+
+   Vector coeff(ne * nq); // Q at every quadrature point; all 1.0 when Q is null
+   coeff = 1.0;
+   if (Q)
+   {
+      for (int e=0; e<ne; ++e)
+      {
+         ElementTransformation *tr = mesh->GetElementTransformation(e);
+         for (int p=0; p<nq; ++p)
+         {
+            coeff[p + (e * nq)] = Q->Eval(*tr, ir->IntPoint(p));
+         }
+      }
+   }
+
+   fetype = el->GetDerivType(); // read later by AssembleDiagonalPA / AddMultPA
+
+   if (el->GetDerivType() == mfem::FiniteElement::CURL && dim == 3)
+   {
+      PAHcurlSetup3D(quad1D, ne, ir->GetWeights(), geom->J,
+                     coeff, pa_data);
+   }
+   else if (el->GetDerivType() == mfem::FiniteElement::CURL && dim == 2)
+   {
+      PAHcurlSetup2D(quad1D, ne, ir->GetWeights(), geom->J,
+                     coeff, pa_data);
+   }
+   else if (el->GetDerivType() == mfem::FiniteElement::DIV && dim == 3)
+   {
+      PAHdivSetup3D(quad1D, ne, ir->GetWeights(), geom->J,
+                    coeff, pa_data);
+   }
+   else if (el->GetDerivType() == mfem::FiniteElement::DIV && dim == 2)
+   {
+      PAHdivSetup2D(quad1D, ne, ir->GetWeights(), geom->J,
+                    coeff, pa_data);
+   }
+   else
+   {
+      MFEM_ABORT("Unknown kernel."); // derivative type is neither CURL nor DIV
+   }
+}
+
+void VectorFEMassIntegrator::AssembleDiagonalPA(Vector& diag)
+{  // Dispatches on the dim and fetype recorded by AssemblePA().
+   if (dim == 3)
+   {
+      if (fetype == mfem::FiniteElement::CURL)
+      {
+         PAHcurlMassAssembleDiagonal3D(dofs1D, quad1D, ne,
+                                       mapsO->B, mapsC->B, pa_data, diag);
+      }
+      else if (fetype == mfem::FiniteElement::DIV)
+      {
+         PAHdivMassAssembleDiagonal3D(dofs1D, quad1D, ne,
+                                      mapsO->B, mapsC->B, pa_data, diag);
+      }
+      else
+      {
+         MFEM_ABORT("Unknown kernel.");
+      }
+   }
+   else // every dim other than 3 takes the 2D kernels
+   {
+      if (fetype == mfem::FiniteElement::CURL)
+      {
+         PAHcurlMassAssembleDiagonal2D(dofs1D, quad1D, ne,
+                                       mapsO->B, mapsC->B, pa_data, diag);
+      }
+      else if (fetype == mfem::FiniteElement::DIV)
+      {
+         PAHdivMassAssembleDiagonal2D(dofs1D, quad1D, ne,
+                                      mapsO->B, mapsC->B, pa_data, diag);
+      }
+      else
+      {
+         MFEM_ABORT("Unknown kernel.");
+      }
+   }
+}
+
+void VectorFEMassIntegrator::AddMultPA(const Vector &x, Vector &y) const
+{  // Dispatches on the dim and fetype recorded by AssemblePA().
+   if (dim == 3)
+   {
+      if (fetype == mfem::FiniteElement::CURL)
+      {
+         PAHcurlMassApply3D(dofs1D, quad1D, ne, mapsO->B, mapsC->B, mapsO->Bt,
+                            mapsC->Bt, pa_data, x, y);
+      }
+      else if (fetype == mfem::FiniteElement::DIV)
+      {
+         PAHdivMassApply3D(dofs1D, quad1D, ne, mapsO->B, mapsC->B, mapsO->Bt,
+                           mapsC->Bt, pa_data, x, y);
+      }
+      else
+      {
+         MFEM_ABORT("Unknown kernel.");
+      }
+   }
+   else // every dim other than 3 takes the 2D kernels
+   {
+      if (fetype == mfem::FiniteElement::CURL)
+      {
+         PAHcurlMassApply2D(dofs1D, quad1D, ne, mapsO->B, mapsC->B, mapsO->Bt,
+                            mapsC->Bt, pa_data, x, y);
+      }
+      else if (fetype == mfem::FiniteElement::DIV)
+      {
+         PAHdivMassApply2D(dofs1D, quad1D, ne, mapsO->B, mapsC->B, mapsO->Bt,
+                           mapsC->Bt, pa_data, x, y);
+      }
+      else
+      {
+         MFEM_ABORT("Unknown kernel.");
+      }
+   }
+}
+
+void MixedVectorGradientIntegrator::AssemblePA(const FiniteElementSpace
+                                               &trial_fes,
+                                               const FiniteElementSpace &test_fes)
+{
+   // Assumes tensor-product elements, with a vector test space and H^1 trial space.
+   Mesh *mesh = trial_fes.GetMesh();
+   const FiniteElement *trial_fel = trial_fes.GetFE(0); // element 0 as the model
+   const FiniteElement *test_fel = test_fes.GetFE(0);
+
+   const NodalTensorFiniteElement *trial_el =
+      dynamic_cast<const NodalTensorFiniteElement*>(trial_fel);
+   MFEM_VERIFY(trial_el != NULL, "Only NodalTensorFiniteElement is supported!");
+
+   const VectorTensorFiniteElement *test_el =
+      dynamic_cast<const VectorTensorFiniteElement*>(test_fel);
+   MFEM_VERIFY(test_el != NULL, "Only VectorTensorFiniteElement is supported!");
+
+   const IntegrationRule *ir // user-set IntRule if any, else the mass rule
+      = IntRule ? IntRule : &MassIntegrator::GetRule(*trial_el, *trial_el,
+                                                     *mesh->GetElementTransformation(0));
+   const int dims = trial_el->GetDim();
+   MFEM_VERIFY(dims == 2 || dims == 3, "");
+
+   const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
+   const int nq = ir->GetNPoints();
+   dim = mesh->Dimension();
+   MFEM_VERIFY(dim == 2 || dim == 3, "");
+
+   MFEM_VERIFY(trial_el->GetOrder() == test_el->GetOrder(), "");
+
+   ne = trial_fes.GetNE();
+   geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
+   mapsC = &test_el->GetDofToQuad(*ir, DofToQuad::TENSOR); // from the test element
+   mapsO = &test_el->GetDofToQuadOpen(*ir, DofToQuad::TENSOR);
+   dofs1D = mapsC->ndof;
+   quad1D = mapsC->nqpt;
+
+   MFEM_VERIFY(dofs1D == mapsO->ndof + 1 && quad1D == mapsO->nqpt, "");
+
+   pa_data.SetSize(symmDims * nq * ne, Device::GetMemoryType());
+
+   Vector coeff(ne * nq); // Q at every quadrature point; all 1.0 when Q is null
+   coeff = 1.0;
+   if (Q)
+   {
+      for (int e=0; e<ne; ++e)
+      {
+         ElementTransformation *tr = mesh->GetElementTransformation(e);
+         for (int p=0; p<nq; ++p)
+         {
+            coeff[p + (e * nq)] = Q->Eval(*tr, ir->IntPoint(p));
+         }
+      }
+   }
+
+   // Use the same setup functions as VectorFEMassIntegrator.
+   if (test_el->GetDerivType() == mfem::FiniteElement::CURL && dim == 3)
+   {
+      PAHcurlSetup3D(quad1D, ne, ir->GetWeights(), geom->J,
+                     coeff, pa_data);
+   }
+   else if (test_el->GetDerivType() == mfem::FiniteElement::CURL && dim == 2)
+   {
+      PAHcurlSetup2D(quad1D, ne, ir->GetWeights(), geom->J,
+                     coeff, pa_data);
+   }
+   else
+   {
+      MFEM_ABORT("Unknown kernel."); // only CURL-type test elements are handled
+   }
+}
+
+void MixedVectorGradientIntegrator::AddMultPA(const Vector &x, Vector &y) const
+{  // Forwards the stored PA state plus x and y to the kernel for this dim.
+   if (dim == 3)
+      PAHcurlH1Apply3D(dofs1D, quad1D, ne, mapsC->B, mapsC->G,
+                       mapsO->Bt, mapsC->Bt, pa_data, x, y);
+   else if (dim == 2)
+      PAHcurlH1Apply2D(dofs1D, quad1D, ne, mapsC->B, mapsC->G,
+                       mapsO->Bt, mapsC->Bt, pa_data, x, y);
+   else
+   {
+      MFEM_ABORT("Unsupported dimension!"); // only dim 2 and 3 are dispatched
+   }
+}
+
+} // namespace mfem
diff --git a/fem/coefficient.cpp b/fem/coefficient.cpp
index 1f1d3069196..4ecddeb634f 100644
--- a/fem/coefficient.cpp
+++ b/fem/coefficient.cpp
@@ -49,7 +49,7 @@ double FunctionCoefficient::Eval(ElementTransformation & T,
 double GridFunctionCoefficient::Eval (ElementTransformation &T,
                                       const IntegrationPoint &ip)
 {
-   return GridF -> GetValue (T.ElementNo, ip, Component);
+   return GridF -> GetValue (T, ip, Component);
 }
 
 double TransformedCoefficient::Eval(ElementTransformation &T,
@@ -160,13 +160,13 @@ void VectorArrayCoefficient::Eval(Vector &V, ElementTransformation &T,
 }
 
 VectorGridFunctionCoefficient::VectorGridFunctionCoefficient (
-   GridFunction *gf)
+   const GridFunction *gf)
    : VectorCoefficient ((gf) ? gf -> VectorDim() : 0)
 {
    GridFunc = gf;
 }
 
-void VectorGridFunctionCoefficient::SetGridFunction(GridFunction *gf)
+void VectorGridFunctionCoefficient::SetGridFunction(const GridFunction *gf)
 {
    GridFunc = gf; vdim = (gf) ? gf -> VectorDim() : 0;
 }
@@ -174,24 +174,7 @@ void VectorGridFunctionCoefficient::SetGridFunction(GridFunction *gf)
 void VectorGridFunctionCoefficient::Eval(Vector &V, ElementTransformation &T,
                                          const IntegrationPoint &ip)
 {
-   Mesh *mesh = GridFunc->FESpace()->GetMesh();
-   if (mesh->Dimension() == T.GetDimension())
-   {
-      GridFunc->GetVectorValue(T.ElementNo, ip, V);
-   }
-   else // Assuming T is a boundary element transformation
-   {
-      int el_id, el_info;
-      mesh->GetBdrElementAdjacentElement(T.ElementNo, el_id, el_info);
-      IntegrationPointTransformation loc_T;
-      mesh->GetLocalFaceTransformation(mesh->GetBdrElementType(T.ElementNo),
-                                       mesh->GetElementType(el_id),
-                                       loc_T.Transf,
-                                       el_info);
-      IntegrationPoint eip;
-      loc_T.Transform(ip, eip);
-      GridFunc->GetVectorValue(el_id, eip, V);
-   }
+   GridFunc->GetVectorValue(T, ip, V);
 }
 
 void VectorGridFunctionCoefficient::Eval(
@@ -201,14 +184,14 @@ void VectorGridFunctionCoefficient::Eval(
 }
 
 GradientGridFunctionCoefficient::GradientGridFunctionCoefficient (
-   GridFunction *gf)
+   const GridFunction *gf)
    : VectorCoefficient((gf) ?
                        gf -> FESpace() -> GetMesh() -> SpaceDimension() : 0)
 {
    GridFunc = gf;
 }
 
-void GradientGridFunctionCoefficient::SetGridFunction(GridFunction *gf)
+void GradientGridFunctionCoefficient::SetGridFunction(const GridFunction *gf)
 {
    GridFunc = gf; vdim = (gf) ?
                          gf -> FESpace() -> GetMesh() -> SpaceDimension() : 0;
@@ -227,14 +210,14 @@ void GradientGridFunctionCoefficient::Eval(
 }
 
 CurlGridFunctionCoefficient::CurlGridFunctionCoefficient (
-   GridFunction *gf)
+   const GridFunction *gf)
    : VectorCoefficient ((gf) ?
                         gf -> FESpace() -> GetMesh() -> SpaceDimension() : 0)
 {
    GridFunc = gf;
 }
 
-void CurlGridFunctionCoefficient::SetGridFunction(GridFunction *gf)
+void CurlGridFunctionCoefficient::SetGridFunction(const GridFunction *gf)
 {
    GridFunc = gf; vdim = (gf) ?
                          gf -> FESpace() -> GetMesh() -> SpaceDimension() : 0;
@@ -247,7 +230,7 @@ void CurlGridFunctionCoefficient::Eval(Vector &V, ElementTransformation &T,
 }
 
 DivergenceGridFunctionCoefficient::DivergenceGridFunctionCoefficient (
-   GridFunction *gf) : Coefficient()
+   const GridFunction *gf) : Coefficient()
 {
    GridFunc = gf;
 }
@@ -775,4 +758,61 @@ double ComputeGlobalLpNorm(double p, VectorCoefficient &coeff, ParMesh &pmesh,
 }
 #endif
 
+VectorQuadratureFunctionCoefficient::VectorQuadratureFunctionCoefficient(
+   QuadratureFunction &qf) // vdim starts as qf.GetVDim(), component offset 0
+   : VectorCoefficient(qf.GetVDim()), QuadF(qf), index(0) { }
+
+void VectorQuadratureFunctionCoefficient::SetComponent(int _index, int _length)
+{  // Restricts Eval() to _length components starting at component _index.
+   MFEM_VERIFY(_index >= 0, "Index must be >= 0");
+   MFEM_VERIFY(_index < QuadF.GetVDim(),
+               "Index must be < QuadratureFunction length");
+   index = _index; // stored first: the length check below uses it
+
+   MFEM_VERIFY(_length > 0, "Length must be > 0");
+   MFEM_VERIFY(_length <= QuadF.GetVDim() - index,
+               "Length must be <= (QuadratureFunction length - index)");
+
+   vdim = _length; // number of components Eval() writes into V
+}
+
+void VectorQuadratureFunctionCoefficient::Eval(Vector &V,
+                                               ElementTransformation &T,
+                                               const IntegrationPoint &ip)
+{
+   QuadF.HostRead(); // presumably makes the values readable on the host
+
+   if (index == 0 && vdim == QuadF.GetVDim()) // all components requested
+   {
+      QuadF.GetElementValues(T.ElementNo, ip.index, V);
+   }
+   else // copy only components [index, index + vdim) into V
+   {
+      Vector temp;
+      QuadF.GetElementValues(T.ElementNo, ip.index, temp);
+      V.SetSize(vdim);
+      for (int i = 0; i < vdim; i++)
+      {
+         V(i) = temp(index + i);
+      }
+   }
+
+   return;
+}
+
+QuadratureFunctionCoefficient::QuadratureFunctionCoefficient(
+   QuadratureFunction &qf) : QuadF(qf)
+{  // Scalar coefficient: rejects quadrature functions with vdim other than 1.
+   MFEM_VERIFY(qf.GetVDim() == 1, "QuadratureFunction's vdim must be 1");
+}
+
+double QuadratureFunctionCoefficient::Eval(ElementTransformation &T,
+                                           const IntegrationPoint &ip)
+{
+   QuadF.HostRead(); // presumably makes the values readable on the host
+   Vector temp(1);
+   QuadF.GetElementValues(T.ElementNo, ip.index, temp); // ip.index picks the point
+   return temp[0]; // the single component (vdim is verified to be 1 in the ctor)
+}
+
 }
diff --git a/fem/coefficient.hpp b/fem/coefficient.hpp
index a50b1235156..ccae771d370 100644
--- a/fem/coefficient.hpp
+++ b/fem/coefficient.hpp
@@ -27,7 +27,10 @@ class ParMesh;
 #endif
 
 
-/// Base class Coefficient that may optionally depend on time.
+/** @brief Base class for Coefficients that optionally depend on space and time.
+    These are used by the BilinearFormIntegrator, LinearFormIntegrator, and
+    NonlinearFormIntegrator classes to represent the physical coefficients in
+    the PDEs that are being discretized. */
 class Coefficient
 {
 protected:
@@ -36,7 +39,10 @@ class Coefficient
 public:
    Coefficient() { time = 0.; }
 
+   /// Set the time for time dependent coefficients
    void SetTime(double t) { time = t; }
+
+   /// Get the time for time dependent coefficients
    double GetTime() { return time; }
 
    /** @brief Evaluate the coefficient in the element described by @a T at the
@@ -63,7 +69,7 @@ class Coefficient
 };
 
 
-/// Subclass constant coefficient.
+/// A coefficient that is constant across space and time
 class ConstantCoefficient : public Coefficient
 {
 public:
@@ -72,13 +78,14 @@ class ConstantCoefficient : public Coefficient
    /// c is value of constant function
    explicit ConstantCoefficient(double c = 1.0) { constant=c; }
 
-   /// Evaluate the coefficient
+   /// Evaluate the coefficient at @a ip.
    virtual double Eval(ElementTransformation &T,
                        const IntegrationPoint &ip)
    { return (constant); }
 };
 
-/// class for piecewise constant coefficient
+/** @brief A piecewise constant coefficient with the constants keyed
+    off the element attribute numbers. */
 class PWConstCoefficient : public Coefficient
 {
 private:
@@ -90,30 +97,31 @@ class PWConstCoefficient : public Coefficient
    explicit PWConstCoefficient(int NumOfSubD = 0) : constants(NumOfSubD)
    { constants = 0.0; }
 
-   /** c should be a vector defined by attributes, so for region with
-       attribute i  c[i-1] is the coefficient in that region */
+   /// Construct a piecewise constant coefficient using a vector of constants.
+   /** @a c should be a vector defined by attributes, so for region with
+       attribute @a i @a c[i-1] is the coefficient in that region */
    PWConstCoefficient(Vector &c)
    { constants.SetSize(c.Size()); constants=c; }
 
-   /// Update constants
+   /// Update the constants with vector @a c.
    void UpdateConstants(Vector &c) { constants.SetSize(c.Size()); constants=c; }
 
-   /// Member function to access or modify the value of the i-th constant
+   /// Return a reference to the i-th constant
    double &operator()(int i) { return constants(i-1); }
 
-   /// Set domain constants equal to the same constant c
+   /// Set the constants for all attributes to constant @a c.
    void operator=(double c) { constants = c; }
 
-   /// Returns the number of constants
+   /// Returns the number of constants representing different attributes.
    int GetNConst() { return constants.Size(); }
 
-   /// Evaluate the coefficient function
+   /// Evaluate the coefficient.
    virtual double Eval(ElementTransformation &T,
                        const IntegrationPoint &ip);
 };
 
 
-/// class for C-function coefficient
+/// A general C-function coefficient
 class FunctionCoefficient : public Coefficient
 {
 protected:
@@ -121,14 +129,14 @@ class FunctionCoefficient : public Coefficient
    double (*TDFunction)(const Vector &, double);
 
 public:
-   /// Define a time-independent coefficient from a C-function
+   /// Define a time-independent coefficient from a pointer to a C-function
    FunctionCoefficient(double (*f)(const Vector &))
    {
       Function = f;
       TDFunction = NULL;
    }
 
-   /// Define a time-dependent coefficient from a C-function
+   /// Define a time-dependent coefficient from a pointer to a C-function
    FunctionCoefficient(double (*tdf)(const Vector &, double))
    {
       Function = NULL;
@@ -153,7 +161,7 @@ class FunctionCoefficient : public Coefficient
       TDFunction = reinterpret_cast<double(*)(const Vector&,double)>(tdf);
    }
 
-   /// Evaluate coefficient
+   /// Evaluate the coefficient at @a ip.
    virtual double Eval(ElementTransformation &T,
                        const IntegrationPoint &ip);
 };
@@ -164,23 +172,34 @@ class GridFunction;
 class GridFunctionCoefficient : public Coefficient
 {
 private:
-   GridFunction *GridF;
+   const GridFunction *GridF;
    int Component;
 
 public:
    GridFunctionCoefficient() : GridF(NULL), Component(1) { }
    /** Construct GridFunctionCoefficient from a given GridFunction, and
        optionally specify a component to use if it is a vector GridFunction. */
-   GridFunctionCoefficient (GridFunction *gf, int comp = 1)
+   GridFunctionCoefficient (const GridFunction *gf, int comp = 1)
    { GridF = gf; Component = comp; }
 
-   void SetGridFunction(GridFunction *gf) { GridF = gf; }
-   GridFunction * GetGridFunction() const { return GridF; }
+   /// Set the internal GridFunction
+   void SetGridFunction(const GridFunction *gf) { GridF = gf; }
+
+   /// Get the internal GridFunction
+   const GridFunction * GetGridFunction() const { return GridF; }
 
+   /// Evaluate the coefficient at @a ip.
    virtual double Eval(ElementTransformation &T,
                        const IntegrationPoint &ip);
 };
 
+
+/** @brief A coefficient that depends on 1 or 2 parent coefficients and a
+    transformation rule represented by a C-function.
+
+    \f$ C(x,t) = T(Q1(x,t)) \f$ or \f$ C(x,t) = T(Q1(x,t), Q2(x,t)) \f$
+
+    where T is the transformation rule, and Q1/Q2 are the parent coefficients.*/
 class TransformedCoefficient : public Coefficient
 {
 private:
@@ -196,10 +215,20 @@ class TransformedCoefficient : public Coefficient
                            double (*F)(double,double))
       : Q1(q1), Q2(q2), Transform2(F) { Transform1 = 0; }
 
+   /// Evaluate the coefficient at @a ip.
    virtual double Eval(ElementTransformation &T, const IntegrationPoint &ip);
 };
 
-/// Delta function coefficient
+/** @brief Delta function coefficient optionally multiplied by a weight
+    coefficient and a scaled time dependent C-function.
+
+    \f$ F(x,t) = w(x,t) s T(t) d(x - xc) \f$
+
+    where w is the optional weight coefficient, @a s is a scale factor,
+    T is an optional time-dependent function and d is a delta function.
+
+    WARNING this cannot be used as a normal coefficient.  The usual Eval
+    method is disabled. */
 class DeltaCoefficient : public Coefficient
 {
 protected:
@@ -209,34 +238,49 @@ class DeltaCoefficient : public Coefficient
    double (*tdf)(double);
 
 public:
+
+   /// Construct a unit delta function centered at (0.0,0.0,0.0)
    DeltaCoefficient()
    {
       center[0] = center[1] = center[2] = 0.; scale = 1.; tol = 1e-12;
       weight = NULL; sdim = 0; tdf = NULL;
    }
+
+   /// Construct a delta function scaled by @a s and centered at (x,0.0,0.0)
    DeltaCoefficient(double x, double s)
    {
       center[0] = x; center[1] = 0.; center[2] = 0.; scale = s; tol = 1e-12;
       weight = NULL; sdim = 1; tdf = NULL;
    }
+
+   /// Construct a delta function scaled by @a s and centered at (x,y,0.0)
    DeltaCoefficient(double x, double y, double s)
    {
       center[0] = x; center[1] = y; center[2] = 0.; scale = s; tol = 1e-12;
       weight = NULL; sdim = 2; tdf = NULL;
    }
+
+   /// Construct a delta function scaled by @a s and centered at (x,y,z)
    DeltaCoefficient(double x, double y, double z, double s)
    {
       center[0] = x; center[1] = y; center[2] = z; scale = s; tol = 1e-12;
       weight = NULL; sdim = 3; tdf = NULL;
    }
+
+   /// Set the center location of the delta function.
    void SetDeltaCenter(const Vector& center);
+
+   /// Set the scale value multiplying the delta function.
    void SetScale(double _s) { scale = _s; }
+
    /// Set a time-dependent function that multiplies the Scale().
    void SetFunction(double (*f)(double)) { tdf = f; }
+
    /** @brief Set the tolerance used during projection onto GridFunction to
-       identifying the Mesh vertex where the Center() of the delta function
-       lies. */
+       identify the Mesh vertex where the Center() of the delta function
+       lies. (default 1e-12)*/
    void SetTol(double _tol) { tol = _tol; }
+
    /// Set a weight Coefficient that multiplies the DeltaCoefficient.
    /** The weight Coefficient multiplies the value returned by EvalDelta() but
        not the value returned by Scale().
@@ -244,16 +288,26 @@ class DeltaCoefficient : public Coefficient
        projecting the DeltaCoefficient onto a GridFunction, so that the weighted
        integral of the projection is exactly equal to the Scale(). */
    void SetWeight(Coefficient *w) { weight = w; }
+
+   /// Return a pointer to a c-array representing the center of the delta
+   /// function.
    const double *Center() { return center; }
-   /** @brief Return the scale set by SetScale() multiplied by the
-       time-dependent function specified by SetFunction(), if set. */
+
+   /** @brief Return the scale factor times the optional time dependent
+       function.  Returns \f$ s T(t) \f$ with \f$ T(t) = 1 \f$ when
+       not set by the user. */
    double Scale() { return tdf ? (*tdf)(GetTime())*scale : scale; }
-   /// See SetTol() for description of the tolerance parameter.
+
+   /// Return the tolerance used to identify the mesh vertices
    double Tol() { return tol; }
+
    /// See SetWeight() for description of the weight Coefficient.
    Coefficient *Weight() { return weight; }
+
+   /// Write the center of the delta function into @a center.
    void GetDeltaCenter(Vector& center);
-   /// Return the Scale() multiplied by the weight Coefficient, if any.
+
+   /// The value of the function assuming we are evaluating at the delta center.
    virtual double EvalDelta(ElementTransformation &T, const IntegrationPoint &ip);
    /** @brief A DeltaFunction cannot be evaluated. Calling this method will
        cause an MFEM error, terminating the application. */
@@ -262,7 +316,8 @@ class DeltaCoefficient : public Coefficient
    virtual ~DeltaCoefficient() { delete weight; }
 };
 
-/// Coefficient defined on a subset of domain or boundary attributes
+/** @brief Derived coefficient that takes the value of the parent coefficient
+    for the active attributes and is zero otherwise. */
 class RestrictedCoefficient : public Coefficient
 {
 private:
@@ -270,13 +325,18 @@ class RestrictedCoefficient : public Coefficient
    Array<int> active_attr;
 
 public:
+   /** @brief Construct with a parent coefficient and an array with
+       ones marking the attributes on which this coefficient should be
+       active. */
    RestrictedCoefficient(Coefficient &_c, Array<int> &attr)
    { c = &_c; attr.Copy(active_attr); }
 
+   /// Evaluate the coefficient at @a ip.
    virtual double Eval(ElementTransformation &T, const IntegrationPoint &ip)
    { return active_attr[T.Attribute-1] ? c->Eval(T, ip, GetTime()) : 0.0; }
 };
 
+/// Base class for vector Coefficients that optionally depend on time and space.
 class VectorCoefficient
 {
 protected:
@@ -284,9 +344,13 @@ class VectorCoefficient
    double time;
 
 public:
+   /// Initialize the VectorCoefficient with vector dimension @a vd.
    VectorCoefficient(int vd) { vdim = vd; time = 0.; }
 
+   /// Set the time for time dependent coefficients
    void SetTime(double t) { time = t; }
+
+   /// Get the time for time dependent coefficients
    double GetTime() { return time; }
 
    /// Returns dimension of the vector.
@@ -318,19 +382,27 @@ class VectorCoefficient
    virtual ~VectorCoefficient() { }
 };
 
+
+/// Vector coefficient that is constant in space and time.
 class VectorConstantCoefficient : public VectorCoefficient
 {
 private:
    Vector vec;
 public:
+   /// Construct the coefficient with constant vector @a v.
    VectorConstantCoefficient(const Vector &v)
       : VectorCoefficient(v.Size()), vec(v) { }
    using VectorCoefficient::Eval;
+
+   ///  Evaluate the vector coefficient at @a ip.
    virtual void Eval(Vector &V, ElementTransformation &T,
                      const IntegrationPoint &ip) { V = vec; }
+
+   /// Return a reference to the constant vector in this class.
    const Vector& GetVec() { return vec; }
 };
 
+/// A general C-function vector coefficient
 class VectorFunctionCoefficient : public VectorCoefficient
 {
 private:
@@ -359,13 +431,17 @@ class VectorFunctionCoefficient : public VectorCoefficient
    }
 
    using VectorCoefficient::Eval;
+   /// Evaluate the vector coefficient at @a ip.
    virtual void Eval(Vector &V, ElementTransformation &T,
                      const IntegrationPoint &ip);
 
    virtual ~VectorFunctionCoefficient() { }
 };
 
-/// Vector coefficient defined by an array of scalar coefficients.
+/** @brief Vector coefficient defined by an array of scalar coefficients.
+    Coefficients that are not set will evaluate to zero in the vector. By
+    default this object takes ownership of the coefficients passed to Set()
+    and deletes them at object destruction. */
 class VectorArrayCoefficient : public VectorCoefficient
 {
 private:
@@ -373,22 +449,27 @@ class VectorArrayCoefficient : public VectorCoefficient
    Array<bool> ownCoeff;
 
 public:
-   /// Construct vector of dim coefficients.
+   /** @brief Construct vector of dim coefficients.  The actual coefficients
+       still need to be added with Set(). */
    explicit VectorArrayCoefficient(int dim);
 
    /// Returns i'th coefficient.
    Coefficient* GetCoeff(int i) { return Coeff[i]; }
 
+   /// Returns the entire array of coefficients.
    Coefficient **GetCoeffs() { return Coeff; }
 
    /// Sets coefficient in the vector.
    void Set(int i, Coefficient *c, bool own=true);
 
-   /// Evaluates i'th component of the vector.
+   /// Evaluates i'th component of the vector of coefficients and returns the
+   /// value.
    double Eval(int i, ElementTransformation &T, const IntegrationPoint &ip)
    { return Coeff[i] ? Coeff[i]->Eval(T, ip, GetTime()) : 0.0; }
 
    using VectorCoefficient::Eval;
+   /** @brief Evaluate the coefficient. Each element of vector V comes from the
+       associated array of scalar coefficients. */
    virtual void Eval(Vector &V, ElementTransformation &T,
                      const IntegrationPoint &ip);
 
@@ -400,18 +481,31 @@ class VectorArrayCoefficient : public VectorCoefficient
 class VectorGridFunctionCoefficient : public VectorCoefficient
 {
 protected:
-   GridFunction *GridFunc;
+   const GridFunction *GridFunc;
 
 public:
+   /** @brief Construct an empty coefficient.  Calling Eval() before the grid
+       function is set will cause a segfault. */
    VectorGridFunctionCoefficient() : VectorCoefficient(0), GridFunc(NULL) { }
-   VectorGridFunctionCoefficient(GridFunction *gf);
 
-   void SetGridFunction(GridFunction *gf);
-   GridFunction * GetGridFunction() const { return GridFunc; }
+   /** @brief  Construct the coefficient with grid function @a gf.  The
+       grid function is not owned by the coefficient. */
+   VectorGridFunctionCoefficient(const GridFunction *gf);
+
+   /** @brief Set the grid function for this coefficient. Also sets the Vector
+       dimension to match that of the @a gf. */
+   void SetGridFunction(const GridFunction *gf);
+
+   ///  Returns a pointer to the grid function in this Coefficient
+   const GridFunction * GetGridFunction() const { return GridFunc; }
 
+   /// Evaluate the vector coefficient at @a ip.
    virtual void Eval(Vector &V, ElementTransformation &T,
                      const IntegrationPoint &ip);
 
+   /** @brief Evaluate the vector coefficients at all of the locations in the
+       integration rule and write the vectors into the columns of matrix @a
+       M. */
    virtual void Eval(DenseMatrix &M, ElementTransformation &T,
                      const IntegrationRule &ir);
 
@@ -422,17 +516,27 @@ class VectorGridFunctionCoefficient : public VectorCoefficient
 class GradientGridFunctionCoefficient : public VectorCoefficient
 {
 protected:
-   GridFunction *GridFunc;
+   const GridFunction *GridFunc;
 
 public:
-   GradientGridFunctionCoefficient(GridFunction *gf);
 
-   void SetGridFunction(GridFunction *gf);
-   GridFunction * GetGridFunction() const { return GridFunc; }
+   /** @brief Construct the coefficient with a scalar grid function @a gf. The
+       grid function is not owned by the coefficient. */
+   GradientGridFunctionCoefficient(const GridFunction *gf);
 
+   /// Set the scalar grid function.
+   void SetGridFunction(const GridFunction *gf);
+
+   /// Get the scalar grid function.
+   const GridFunction * GetGridFunction() const { return GridFunc; }
+
+   /// Evaluate the gradient vector coefficient at @a ip.
    virtual void Eval(Vector &V, ElementTransformation &T,
                      const IntegrationPoint &ip);
 
+   /** @brief Evaluate the gradient vector coefficient at all of the locations
+       in the integration rule and write the vectors into columns of matrix @a
+       M. */
    virtual void Eval(DenseMatrix &M, ElementTransformation &T,
                      const IntegrationRule &ir);
 
@@ -443,15 +547,21 @@ class GradientGridFunctionCoefficient : public VectorCoefficient
 class CurlGridFunctionCoefficient : public VectorCoefficient
 {
 protected:
-   GridFunction *GridFunc;
+   const GridFunction *GridFunc;
 
 public:
-   CurlGridFunctionCoefficient(GridFunction *gf);
+   /** @brief Construct the coefficient with a vector grid function @a gf. The
+       grid function is not owned by the coefficient. */
+   CurlGridFunctionCoefficient(const GridFunction *gf);
+
+   /// Set the vector grid function.
+   void SetGridFunction(const GridFunction *gf);
 
-   void SetGridFunction(GridFunction *gf);
-   GridFunction * GetGridFunction() const { return GridFunc; }
+   /// Get the vector grid function.
+   const GridFunction * GetGridFunction() const { return GridFunc; }
 
    using VectorCoefficient::Eval;
+   /// Evaluate the vector curl coefficient at @a ip.
    virtual void Eval(Vector &V, ElementTransformation &T,
                      const IntegrationPoint &ip);
 
@@ -462,21 +572,31 @@ class CurlGridFunctionCoefficient : public VectorCoefficient
 class DivergenceGridFunctionCoefficient : public Coefficient
 {
 protected:
-   GridFunction *GridFunc;
+   const GridFunction *GridFunc;
 
 public:
-   DivergenceGridFunctionCoefficient(GridFunction *gf);
+   /** @brief Construct the coefficient with a vector grid function @a gf. The
+       grid function is not owned by the coefficient. */
+   DivergenceGridFunctionCoefficient(const GridFunction *gf);
 
-   void SetGridFunction(GridFunction *gf) { GridFunc = gf; }
-   GridFunction * GetGridFunction() const { return GridFunc; }
+   /// Set the vector grid function.
+   void SetGridFunction(const GridFunction *gf) { GridFunc = gf; }
 
+   /// Get the vector grid function.
+   const GridFunction * GetGridFunction() const { return GridFunc; }
+
+   /// Evaluate the scalar divergence coefficient at @a ip.
    virtual double Eval(ElementTransformation &T,
                        const IntegrationPoint &ip);
 
    virtual ~DivergenceGridFunctionCoefficient() { }
 };
 
-/// VectorDeltaCoefficient: DeltaCoefficient with a direction
+/** @brief Vector coefficient defined by a scalar DeltaCoefficient and a
+    constant vector direction.
+
+    WARNING this cannot be used as a normal coefficient. The usual Eval method
+    is disabled. */
 class VectorDeltaCoefficient : public VectorCoefficient
 {
 protected:
@@ -484,22 +604,36 @@ class VectorDeltaCoefficient : public VectorCoefficient
    DeltaCoefficient d;
 
 public:
+   /// Construct with a vector of dimension @a _vdim.
    VectorDeltaCoefficient(int _vdim)
       : VectorCoefficient(_vdim), dir(_vdim), d() { }
+
+   /** @brief Construct with a Vector object representing the direction and a
+       unit delta function centered at (0.0,0.0,0.0) */
    VectorDeltaCoefficient(const Vector& _dir)
       : VectorCoefficient(_dir.Size()), dir(_dir), d() { }
+
+   /** @brief Construct with a Vector object representing the direction and a
+       delta function scaled by @a s and centered at (x,0.0,0.0) */
    VectorDeltaCoefficient(const Vector& _dir, double x, double s)
       : VectorCoefficient(_dir.Size()), dir(_dir), d(x,s) { }
+
+   /** @brief Construct with a Vector object representing the direction and a
+       delta function scaled by @a s and centered at (x,y,0.0) */
    VectorDeltaCoefficient(const Vector& _dir, double x, double y, double s)
       : VectorCoefficient(_dir.Size()), dir(_dir), d(x,y,s) { }
+
+   /** @brief Construct with a Vector object representing the direction and a
+       delta function scaled by @a s and centered at (x,y,z) */
    VectorDeltaCoefficient(const Vector& _dir, double x, double y, double z,
                           double s)
       : VectorCoefficient(_dir.Size()), dir(_dir), d(x,y,z,s) { }
 
-   /// Replace the associated DeltaCoeficient with a new DeltaCoeficient.
-   /** The new DeltaCoeficient cannot have a specified weight Coefficient, i.e.
-       DeltaCoeficient::Weight() should return NULL. */
+   /// Replace the associated DeltaCoefficient with a new DeltaCoefficient.
+   /** The new DeltaCoefficient cannot have a specified weight Coefficient, i.e.
+       DeltaCoefficient::Weight() should return NULL. */
    void SetDeltaCoefficient(const DeltaCoefficient& _d) { d = _d; }
+
    /// Return the associated scalar DeltaCoefficient.
    DeltaCoefficient& GetDeltaCoefficient() { return d; }
 
@@ -514,6 +648,7 @@ class VectorDeltaCoefficient : public VectorCoefficient
        DeltaCoefficient. */
    virtual void EvalDelta(Vector &V, ElementTransformation &T,
                           const IntegrationPoint &ip);
+
    using VectorCoefficient::Eval;
    /** @brief A VectorDeltaFunction cannot be evaluated. Calling this method
        will cause an MFEM error, terminating the application. */
@@ -523,7 +658,8 @@ class VectorDeltaCoefficient : public VectorCoefficient
    virtual ~VectorDeltaCoefficient() { }
 };
 
-/// VectorCoefficient defined on a subset of domain or boundary attributes
+/** @brief Derived vector coefficient that has the value of the parent vector
+    where it is active and is zero otherwise. */
 class VectorRestrictedCoefficient : public VectorCoefficient
 {
 private:
@@ -531,18 +667,26 @@ class VectorRestrictedCoefficient : public VectorCoefficient
    Array<int> active_attr;
 
 public:
+   /** @brief Construct with a parent vector coefficient and an array of zeros
+       and ones representing the attributes for which this coefficient should be
+       active. */
    VectorRestrictedCoefficient(VectorCoefficient &vc, Array<int> &attr)
       : VectorCoefficient(vc.GetVDim())
    { c = &vc; attr.Copy(active_attr); }
 
+   /// Evaluate the vector coefficient at @a ip.
    virtual void Eval(Vector &V, ElementTransformation &T,
                      const IntegrationPoint &ip);
 
+   /** @brief Evaluate the vector coefficient at all of the locations in the
+       integration rule and write the vectors into the columns of matrix @a
+       M. */
    virtual void Eval(DenseMatrix &M, ElementTransformation &T,
                      const IntegrationRule &ir);
 };
 
 
+/// Base class for Matrix Coefficients that optionally depend on time and space.
 class MatrixCoefficient
 {
 protected:
@@ -550,16 +694,25 @@ class MatrixCoefficient
    double time;
 
 public:
+   /// Construct a dim x dim matrix coefficient.
    explicit MatrixCoefficient(int dim) { height = width = dim; time = 0.; }
 
+   /// Construct a h x w matrix coefficient.
    MatrixCoefficient(int h, int w) : height(h), width(w), time(0.) { }
 
+   /// Set the time for time dependent coefficients
    void SetTime(double t) { time = t; }
+
+   /// Get the time for time dependent coefficients
    double GetTime() { return time; }
 
+   /// Get the height of the matrix.
    int GetHeight() const { return height; }
+
+   /// Get the width of the matrix.
    int GetWidth() const { return width; }
-   // For backward compatibility
+
+   /// For backward compatibility, get the width of the matrix.
    int GetVDim() const { return width; }
 
    /** @brief Evaluate the matrix coefficient in the element described by @a T
@@ -573,18 +726,26 @@ class MatrixCoefficient
    virtual ~MatrixCoefficient() { }
 };
 
+
+/// A matrix coefficient that is constant in space and time.
 class MatrixConstantCoefficient : public MatrixCoefficient
 {
 private:
    DenseMatrix mat;
 public:
+   /// Construct using matrix @a m for the constant.
    MatrixConstantCoefficient(const DenseMatrix &m)
       : MatrixCoefficient(m.Height(), m.Width()), mat(m) { }
    using MatrixCoefficient::Eval;
+   /// Evaluate the matrix coefficient at @a ip.
    virtual void Eval(DenseMatrix &M, ElementTransformation &T,
                      const IntegrationPoint &ip) { M = mat; }
 };
 
+
+/** @brief A matrix coefficient with an optional scalar coefficient multiplier
+    \a q.  The matrix function can either be represented by a C-function or a
+    constant matrix provided when constructing this object.  */
 class MatrixFunctionCoefficient : public MatrixCoefficient
 {
 private:
@@ -594,7 +755,8 @@ class MatrixFunctionCoefficient : public MatrixCoefficient
    DenseMatrix mat;
 
 public:
-   /// Construct a time-independent square matrix coefficient from a C-function
+   /// Construct a square matrix coefficient from a C-function without time
+   /// dependence.
    MatrixFunctionCoefficient(int dim, void (*F)(const Vector &, DenseMatrix &),
                              Coefficient *q = NULL)
       : MatrixCoefficient(dim), Q(q)
@@ -613,7 +775,8 @@ class MatrixFunctionCoefficient : public MatrixCoefficient
       mat = m;
    }
 
-   /// Construct a time-dependent square matrix coefficient from a C-function
+   /// Construct a square matrix coefficient from a C-function with
+   /// time-dependence.
    MatrixFunctionCoefficient(int dim,
                              void (*TDF)(const Vector &, double, DenseMatrix &),
                              Coefficient *q = NULL)
@@ -624,12 +787,18 @@ class MatrixFunctionCoefficient : public MatrixCoefficient
       mat.SetSize(0);
    }
 
+   /// Evaluate the matrix coefficient at @a ip.
    virtual void Eval(DenseMatrix &K, ElementTransformation &T,
                      const IntegrationPoint &ip);
 
    virtual ~MatrixFunctionCoefficient() { }
 };
 
+
+
+/** @brief Matrix coefficient defined by a matrix of scalar coefficients.
+    Coefficients that are not set will evaluate to zero in the matrix. The
+    coefficient is stored as a flat Array with indexing (i,j) -> i*width+j. */
 class MatrixArrayCoefficient : public MatrixCoefficient
 {
 private:
@@ -637,23 +806,33 @@ class MatrixArrayCoefficient : public MatrixCoefficient
    Array<bool> ownCoeff;
 
 public:
-
+   /** @brief Construct a coefficient matrix of dimensions @a dim * @a dim. The
+       actual coefficients still need to be added with Set(). */
    explicit MatrixArrayCoefficient (int dim);
 
+   /// Get the coefficient located at (i,j) in the matrix.
    Coefficient* GetCoeff (int i, int j) { return Coeff[i*width+j]; }
 
+   /** @brief Set the coefficient located at (i,j) in the matrix.  By default
+       this will take ownership of the Coefficient passed in, but this can be
+       overridden with the @a own parameter. */
    void Set(int i, int j, Coefficient * c, bool own=true);
 
+   /// Evaluate coefficient located at (i,j) in the matrix using integration
+   /// point @a ip.
    double Eval(int i, int j, ElementTransformation &T, const IntegrationPoint &ip)
    { return Coeff[i*width+j] ? Coeff[i*width+j] -> Eval(T, ip, GetTime()) : 0.0; }
 
+   /// Evaluate the matrix coefficient at @a ip.
    virtual void Eval(DenseMatrix &K, ElementTransformation &T,
                      const IntegrationPoint &ip);
 
    virtual ~MatrixArrayCoefficient();
 };
 
-/// MatrixCoefficient defined on a subset of domain or boundary attributes
+
+/** @brief Derived matrix coefficient that has the value of the parent matrix
+    coefficient where it is active and is zero otherwise. */
 class MatrixRestrictedCoefficient : public MatrixCoefficient
 {
 private:
@@ -661,10 +840,14 @@ class MatrixRestrictedCoefficient : public MatrixCoefficient
    Array<int> active_attr;
 
 public:
+   /** @brief Construct with a parent matrix coefficient and an array of zeros
+       and ones representing the attributes for which this coefficient should be
+       active. */
    MatrixRestrictedCoefficient(MatrixCoefficient &mc, Array<int> &attr)
       : MatrixCoefficient(mc.GetHeight(), mc.GetWidth())
    { c = &mc; attr.Copy(active_attr); }
 
+   /// Evaluate the matrix coefficient at @a ip.
    virtual void Eval(DenseMatrix &K, ElementTransformation &T,
                      const IntegrationPoint &ip);
 };
@@ -682,12 +865,12 @@ class SumCoefficient : public Coefficient
    double beta;
 
 public:
-   // Result is _alpha * A + _beta * B
+   /// Construct with the two coefficients.  Result is _alpha * A + _beta * B.
    SumCoefficient(Coefficient &A, Coefficient &B,
                   double _alpha = 1.0, double _beta = 1.0)
       : a(&A), b(&B), alpha(_alpha), beta(_beta) { }
 
-   /// Evaluate the coefficient
+   /// Evaluate the coefficient at @a ip.
    virtual double Eval(ElementTransformation &T,
                        const IntegrationPoint &ip)
    { return alpha * a->Eval(T, ip) + beta * b->Eval(T, ip); }
@@ -701,10 +884,11 @@ class ProductCoefficient : public Coefficient
    Coefficient * b;
 
 public:
+   /// Construct with the two coefficients.  Result is A * B.
    ProductCoefficient(Coefficient &A, Coefficient &B)
       : a(&A), b(&B) { }
 
-   /// Evaluate the coefficient
+   /// Evaluate the coefficient at @a ip.
    virtual double Eval(ElementTransformation &T,
                        const IntegrationPoint &ip)
    { return a->Eval(T, ip) * b->Eval(T, ip); }
@@ -719,16 +903,17 @@ class PowerCoefficient : public Coefficient
    double p;
 
 public:
-   // Result is A^p
+   /// Construct with a coefficient and a constant power @a _p.  Result is A^p.
    PowerCoefficient(Coefficient &A, double _p)
       : a(&A), p(_p) { }
 
-   /// Evaluate the coefficient
+   /// Evaluate the coefficient at @a ip.
    virtual double Eval(ElementTransformation &T,
                        const IntegrationPoint &ip)
    { return pow(a->Eval(T, ip), p); }
 };
 
+
 /// Scalar coefficient defined as the inner product of two vector coefficients
 class InnerProductCoefficient : public Coefficient
 {
@@ -739,14 +924,15 @@ class InnerProductCoefficient : public Coefficient
    mutable Vector va;
    mutable Vector vb;
 public:
+   /// Construct with the two vector coefficients.  Result is \f$ A \cdot B \f$.
    InnerProductCoefficient(VectorCoefficient &A, VectorCoefficient &B);
 
-   /// Evaluate the coefficient
+   /// Evaluate the coefficient at @a ip.
    virtual double Eval(ElementTransformation &T,
                        const IntegrationPoint &ip);
 };
 
-/// Scalar coefficient defined as a cross product of two vectors in 2D
+/// Scalar coefficient defined as a cross product of two vectors in the xy-plane.
 class VectorRotProductCoefficient : public Coefficient
 {
 private:
@@ -757,8 +943,10 @@ class VectorRotProductCoefficient : public Coefficient
    mutable Vector vb;
 
 public:
+   /// Construct with the two vector coefficients.  Result is \f$ A_x B_y - A_y B_x \f$.
    VectorRotProductCoefficient(VectorCoefficient &A, VectorCoefficient &B);
 
+   /// Evaluate the coefficient at @a ip.
    virtual double Eval(ElementTransformation &T,
                        const IntegrationPoint &ip);
 };
@@ -772,9 +960,10 @@ class DeterminantCoefficient : public Coefficient
    mutable DenseMatrix ma;
 
 public:
+   /// Construct with the matrix.
    DeterminantCoefficient(MatrixCoefficient &A);
 
-   /// Evaluate the coefficient
+   /// Evaluate the determinant coefficient at @a ip.
    virtual double Eval(ElementTransformation &T,
                        const IntegrationPoint &ip);
 };
@@ -792,17 +981,17 @@ class VectorSumCoefficient : public VectorCoefficient
    mutable Vector va;
 
 public:
-   // Result is _alpha * A + _beta * B
+   /// Construct with the two vector coefficients.  Result is _alpha * A + _beta * B.
    VectorSumCoefficient(VectorCoefficient &A, VectorCoefficient &B,
                         double _alpha = 1.0, double _beta = 1.0);
 
-   /// Evaluate the coefficient
+   /// Evaluate the coefficient at @a ip.
    virtual void Eval(Vector &V, ElementTransformation &T,
                      const IntegrationPoint &ip);
    using VectorCoefficient::Eval;
 };
 
-/// Vector coefficient defined as a product of a scalar and a vector
+/// Vector coefficient defined as a product of scalar and vector coefficients.
 class ScalarVectorProductCoefficient : public VectorCoefficient
 {
 private:
@@ -810,8 +999,10 @@ class ScalarVectorProductCoefficient : public VectorCoefficient
    VectorCoefficient * b;
 
 public:
+   /// Construct with the two coefficients.  Result is A * B.
    ScalarVectorProductCoefficient(Coefficient &A, VectorCoefficient &B);
 
+   /// Evaluate the coefficient at @a ip.
    virtual void Eval(Vector &V, ElementTransformation &T,
                      const IntegrationPoint &ip);
    using VectorCoefficient::Eval;
@@ -828,14 +1019,17 @@ class VectorCrossProductCoefficient : public VectorCoefficient
    mutable Vector vb;
 
 public:
+   /// Construct with the two coefficients.  Result is A x B.
    VectorCrossProductCoefficient(VectorCoefficient &A, VectorCoefficient &B);
 
+   /// Evaluate the coefficient at @a ip.
    virtual void Eval(Vector &V, ElementTransformation &T,
                      const IntegrationPoint &ip);
    using VectorCoefficient::Eval;
 };
 
-/// Vector coefficient defined as a matrix vector product
+/** @brief Vector coefficient defined as a product of a matrix coefficient and
+    a vector coefficient. */
 class MatVecCoefficient : public VectorCoefficient
 {
 private:
@@ -846,28 +1040,32 @@ class MatVecCoefficient : public VectorCoefficient
    mutable Vector vb;
 
 public:
+   /// Construct with the two coefficients.  Result is A*B.
    MatVecCoefficient(MatrixCoefficient &A, VectorCoefficient &B);
 
+   /// Evaluate the vector coefficient at @a ip.
    virtual void Eval(Vector &V, ElementTransformation &T,
                      const IntegrationPoint &ip);
    using VectorCoefficient::Eval;
 };
 
-/// Matrix coefficient defined as the identity of dimension d
+/// Constant matrix coefficient defined as the identity of dimension d.
 class IdentityMatrixCoefficient : public MatrixCoefficient
 {
 private:
    int dim;
 
 public:
+   /// Construct with the dimension of the square identity matrix.
    IdentityMatrixCoefficient(int d)
       : MatrixCoefficient(d, d), dim(d) { }
 
+   /// Evaluate the matrix coefficient at @a ip.
    virtual void Eval(DenseMatrix &M, ElementTransformation &T,
                      const IntegrationPoint &ip);
 };
 
-/// Matrix coefficient defined as the sum of two matrix coefficients
+/// Matrix coefficient defined as the sum of two matrix coefficients.
 class MatrixSumCoefficient : public MatrixCoefficient
 {
 private:
@@ -880,16 +1078,17 @@ class MatrixSumCoefficient : public MatrixCoefficient
    mutable DenseMatrix ma;
 
 public:
-   // Result is _alpha * A + _beta * B
+   /// Construct with the two coefficients.  Result is _alpha * A + _beta * B.
    MatrixSumCoefficient(MatrixCoefficient &A, MatrixCoefficient &B,
                         double _alpha = 1.0, double _beta = 1.0);
 
-   /// Evaluate the coefficient
+   /// Evaluate the matrix coefficient at @a ip.
    virtual void Eval(DenseMatrix &M, ElementTransformation &T,
                      const IntegrationPoint &ip);
 };
 
-/// Matrix coefficient defined as a product of a scalar and a matrix
+/** @brief Matrix coefficient defined as a product of a scalar coefficient and a
+    matrix coefficient.*/
 class ScalarMatrixProductCoefficient : public MatrixCoefficient
 {
 private:
@@ -897,39 +1096,45 @@ class ScalarMatrixProductCoefficient : public MatrixCoefficient
    MatrixCoefficient * b;
 
 public:
+   /// Construct with the two coefficients.  Result is A*B.
    ScalarMatrixProductCoefficient(Coefficient &A, MatrixCoefficient &B);
 
+   /// Evaluate the matrix coefficient at @a ip.
    virtual void Eval(DenseMatrix &M, ElementTransformation &T,
                      const IntegrationPoint &ip);
 };
 
-/// Matrix coefficient defined as the transpose a matrix
+/// Matrix coefficient defined as the transpose of a matrix coefficient.
 class TransposeMatrixCoefficient : public MatrixCoefficient
 {
 private:
    MatrixCoefficient * a;
 
 public:
+   /// Construct with the matrix coefficient.  Result is \f$ A^T \f$.
    TransposeMatrixCoefficient(MatrixCoefficient &A);
 
+   /// Evaluate the matrix coefficient at @a ip.
    virtual void Eval(DenseMatrix &M, ElementTransformation &T,
                      const IntegrationPoint &ip);
 };
 
-/// Matrix coefficient defined as the inverse a matrix
+/// Matrix coefficient defined as the inverse of a matrix coefficient.
 class InverseMatrixCoefficient : public MatrixCoefficient
 {
 private:
    MatrixCoefficient * a;
 
 public:
+   /// Construct with the matrix coefficient.  Result is \f$ A^{-1} \f$.
    InverseMatrixCoefficient(MatrixCoefficient &A);
 
+   /// Evaluate the matrix coefficient at @a ip.
    virtual void Eval(DenseMatrix &M, ElementTransformation &T,
                      const IntegrationPoint &ip);
 };
 
-/// Matrix coefficient defined as the outer product of two vectors
+/// Matrix coefficient defined as the outer product of two vector coefficients.
 class OuterProductCoefficient : public MatrixCoefficient
 {
 private:
@@ -940,29 +1145,80 @@ class OuterProductCoefficient : public MatrixCoefficient
    mutable Vector vb;
 
 public:
+   /// Construct with two vector coefficients.  Result is \f$ A B^T \f$.
    OuterProductCoefficient(VectorCoefficient &A, VectorCoefficient &B);
 
+   /// Evaluate the matrix coefficient at @a ip.
    virtual void Eval(DenseMatrix &M, ElementTransformation &T,
                      const IntegrationPoint &ip);
 };
 
-/** Compute the Lp norm of a function f.
+
+class QuadratureFunction;
+
+/** @brief Vector quadrature function coefficient which requires that the
+    quadrature rules used for this vector coefficient be the same as those that
+    live within the supplied QuadratureFunction. */
+class VectorQuadratureFunctionCoefficient : public VectorCoefficient
+{
+private:
+   const QuadratureFunction &QuadF; //do not own
+   int index;
+
+public:
+   /// Constructor with a quadrature function as input
+   VectorQuadratureFunctionCoefficient(QuadratureFunction &qf);
+
+   /** Set the starting index within the QuadFunc that'll be used to project
+       outwards as well as the corresponding length. The projected length should
+       have the bounds of 1 <= length <= (length QuadFunc - index). */
+   void SetComponent(int _index, int _length);
+
+   const QuadratureFunction& GetQuadFunction() const { return QuadF; }
+
+   using VectorCoefficient::Eval;
+   virtual void Eval(Vector &V, ElementTransformation &T,
+                     const IntegrationPoint &ip);
+
+   virtual ~VectorQuadratureFunctionCoefficient() { }
+};
+
+/** @brief Quadrature function coefficient which requires that the quadrature
+    rules used for this coefficient be the same as those that live within the
+    supplied QuadratureFunction. */
+class QuadratureFunctionCoefficient : public Coefficient
+{
+private:
+   const QuadratureFunction &QuadF;
+
+public:
+   /// Constructor with a quadrature function as input
+   QuadratureFunctionCoefficient(QuadratureFunction &qf);
+
+   const QuadratureFunction& GetQuadFunction() const { return QuadF; }
+
+   virtual double Eval(ElementTransformation &T, const IntegrationPoint &ip);
+
+   virtual ~QuadratureFunctionCoefficient() { }
+};
+
+/** @brief Compute the Lp norm of a function f.
     \f$ \| f \|_{Lp} = ( \int_\Omega | f |^p d\Omega)^{1/p} \f$ */
 double ComputeLpNorm(double p, Coefficient &coeff, Mesh &mesh,
                      const IntegrationRule *irs[]);
 
-/** Compute the Lp norm of a vector function f = {f_i}_i=1...N.
+/** @brief Compute the Lp norm of a vector function f = {f_i}_i=1...N.
     \f$ \| f \|_{Lp} = ( \sum_i \| f_i \|_{Lp}^p )^{1/p} \f$ */
 double ComputeLpNorm(double p, VectorCoefficient &coeff, Mesh &mesh,
                      const IntegrationRule *irs[]);
 
 #ifdef MFEM_USE_MPI
-/** Compute the global Lp norm of a function f.
+/** @brief Compute the global Lp norm of a function f.
     \f$ \| f \|_{Lp} = ( \int_\Omega | f |^p d\Omega)^{1/p} \f$ */
 double ComputeGlobalLpNorm(double p, Coefficient &coeff, ParMesh &pmesh,
                            const IntegrationRule *irs[]);
 
-/** Compute the global Lp norm of a vector function f = {f_i}_i=1...N.
+/** @brief Compute the global Lp norm of a vector function f = {f_i}_i=1...N.
     \f$ \| f \|_{Lp} = ( \sum_i \| f_i \|_{Lp}^p )^{1/p} \f$ */
 double ComputeGlobalLpNorm(double p, VectorCoefficient &coeff, ParMesh &pmesh,
                            const IntegrationRule *irs[]);
diff --git a/fem/datacollection.cpp b/fem/datacollection.cpp
index 0e8d1aca8d1..20aee629d2a 100644
--- a/fem/datacollection.cpp
+++ b/fem/datacollection.cpp
@@ -739,12 +739,6 @@ ParaViewDataCollection::ParaViewDataCollection(const std::string&
 #endif
 }
 
-void ParaViewDataCollection::RegisterField(const std::string& field_name,
-                                           mfem::GridFunction *gf)
-{
-   DataCollection::RegisterField(field_name,gf);
-}
-
 void ParaViewDataCollection::SetLevelsOfDetail(int levels_of_detail_)
 {
    levels_of_detail = levels_of_detail_;
@@ -815,7 +809,7 @@ void ParaViewDataCollection::Save()
    // the directory is created
 
    // create pvd file if needed
-   if (!pvd_stream.is_open())
+   if (myid == 0 && !pvd_stream.is_open())
    {
       std::string dpath=GenerateCollectionPath();
       std::string pvdname=dpath+"/"+GeneratePVDFileName();
diff --git a/fem/datacollection.hpp b/fem/datacollection.hpp
index 5153ced9e81..0ed925b6ebc 100644
--- a/fem/datacollection.hpp
+++ b/fem/datacollection.hpp
@@ -501,10 +501,6 @@ class ParaViewDataCollection : public DataCollection
    ParaViewDataCollection(const std::string& collection_name,
                           mfem::Mesh *mesh_ = NULL);
 
-   /// Add a grid function to the collection
-   virtual void RegisterField(const std::string& field_name,
-                              mfem::GridFunction *gf) override;
-
    /// Set refinement levels - every element is uniformly split based on
    /// levels_of_detail_
    void SetLevelsOfDetail(int levels_of_detail_);
diff --git a/fem/eltrans.cpp b/fem/eltrans.cpp
index 26419b25322..a0b1c5dda9d 100644
--- a/fem/eltrans.cpp
+++ b/fem/eltrans.cpp
@@ -19,6 +19,7 @@ namespace mfem
 ElementTransformation::ElementTransformation()
    : IntPoint(static_cast<IntegrationPoint *>(NULL)),
      EvalState(0),
+     geom(Geometry::INVALID),
      Attribute(-1),
      ElementNo(-1)
 { }
@@ -551,4 +552,76 @@ void IntegrationPointTransformation::Transform (const IntegrationRule &ir1,
    }
 }
 
+void FaceElementTransformations::SetIntPoint(const IntegrationPoint *ip)
+{
+   IsoparametricTransformation::SetIntPoint(ip);
+
+   if (Elem1)
+   {
+      Loc1.Transform(*ip, eip1);
+      Elem1->SetIntPoint(&eip1);
+   }
+   if (Elem2)
+   {
+      Loc2.Transform(*ip, eip2);
+      Elem2->SetIntPoint(&eip2);
+   }
+}
+
+ElementTransformation &
+FaceElementTransformations::GetElement1Transformation()
+{
+   MFEM_VERIFY(mask & 1 && Elem1 != NULL, "The ElementTransformation "
+               "for the element has not been configured for side 1.");
+   return *Elem1;
+}
+
+ElementTransformation &
+FaceElementTransformations::GetElement2Transformation()
+{
+   MFEM_VERIFY(mask & 2 && Elem2 != NULL, "The ElementTransformation "
+               "for the element has not been configured for side 2.");
+   return *Elem2;
+}
+
+IntegrationPointTransformation &
+FaceElementTransformations::GetIntPoint1Transformation()
+{
+   MFEM_VERIFY(mask & 4, "The IntegrationPointTransformation "
+               "for the element has not been configured for side 1.");
+   return Loc1;
+}
+
+IntegrationPointTransformation &
+FaceElementTransformations::GetIntPoint2Transformation()
+{
+   MFEM_VERIFY(mask & 8, "The IntegrationPointTransformation "
+               "for the element has not been configured for side 2.");
+   return Loc2;
+}
+
+void FaceElementTransformations::Transform(const IntegrationPoint &ip,
+                                           Vector &trans)
+{
+   MFEM_VERIFY(mask & 16, "The ElementTransformation "
+               "for the face has not been configured.");
+   IsoparametricTransformation::Transform(ip, trans);
+}
+
+void FaceElementTransformations::Transform(const IntegrationRule &ir,
+                                           DenseMatrix &tr)
+{
+   MFEM_VERIFY(mask & 16, "The ElementTransformation "
+               "for the face has not been configured.");
+   IsoparametricTransformation::Transform(ir, tr);
+}
+
+void FaceElementTransformations::Transform(const DenseMatrix &matrix,
+                                           DenseMatrix &result)
+{
+   MFEM_VERIFY(mask & 16, "The ElementTransformation "
+               "for the face has not been configured.");
+   IsoparametricTransformation::Transform(matrix, result);
+}
+
 }
diff --git a/fem/eltrans.hpp b/fem/eltrans.hpp
index 78daf9a733c..6fe35ee25f5 100644
--- a/fem/eltrans.hpp
+++ b/fem/eltrans.hpp
@@ -38,9 +38,12 @@ class ElementTransformation
    };
    Geometry::Type geom;
 
-   // Evaluate the Jacobian of the transformation at the IntPoint and store it
-   // in dFdx.
+   /** @brief Evaluate the Jacobian of the transformation at the IntPoint and
+       store it in dFdx. */
    virtual const DenseMatrix &EvalJacobian() = 0;
+
+   /** @brief Evaluate the Hessian of the transformation at the IntPoint and
+       store it in d2Fdx2. */
    virtual const DenseMatrix &EvalHessian() = 0;
 
    double EvalWeight();
@@ -48,18 +51,53 @@ class ElementTransformation
    const DenseMatrix &EvalInverseJ();
 
 public:
-   int Attribute, ElementNo;
+
+   /** This enumeration declares the values stored in
+       ElementTransformation::ElementType and indicates which group of objects
+       the index stored in ElementTransformation::ElementNo refers to:
+
+       | ElementType | Range of ElementNo
+       | ----------- | -----------------------
+       | ELEMENT     | [0, Mesh::GetNE()     )
+       | BDR_ELEMENT | [0, Mesh::GetNBE()    )
+       | EDGE        | [0, Mesh::GetNEdges() )
+       | FACE        | [0, Mesh::GetNFaces() )
+       | BDR_FACE    | [0, Mesh::GetNBE()    )
+   */
+   enum
+   {
+      ELEMENT     = 1,
+      BDR_ELEMENT = 2,
+      EDGE        = 3,
+      FACE        = 4,
+      BDR_FACE    = 5
+   };
+
+   int Attribute, ElementNo, ElementType;
 
    ElementTransformation();
 
+   /** @brief Set the integration point @a ip that weights and Jacobians will
+       be evaluated at. */
    void SetIntPoint(const IntegrationPoint *ip)
    { IntPoint = ip; EvalState = 0; }
+
+   /** @brief Get a const reference to the currently set integration point.  An
+       integration point must be set first; the stored pointer is dereferenced. */
    const IntegrationPoint &GetIntPoint() { return *IntPoint; }
 
+   /** @brief Transform integration point from reference coordinates to
+       physical coordinates and store them in the vector. */
    virtual void Transform(const IntegrationPoint &, Vector &) = 0;
+
+   /** @brief Transform all the integration points from the integration rule
+       from reference coordinates to physical
+       coordinates and store them as column vectors in the matrix. */
    virtual void Transform(const IntegrationRule &, DenseMatrix &) = 0;
 
-   /// Transform columns of 'matrix', store result in 'result'.
+   /** @brief Transform all the integration points from the column vectors
+       of @a matrix from reference coordinates to physical
+       coordinates and store them as column vectors in @a result. */
    virtual void Transform(const DenseMatrix &matrix, DenseMatrix &result) = 0;
 
    /** @brief Return the Jacobian matrix of the transformation at the currently
@@ -70,27 +108,44 @@ class ElementTransformation
    const DenseMatrix &Jacobian()
    { return (EvalState & JACOBIAN_MASK) ? dFdx : EvalJacobian(); }
 
+
+   /** @brief Return the Hessian matrix of the transformation at the currently
+       set IntegrationPoint, using the method SetIntPoint(). */
    const DenseMatrix &Hessian()
    { return (EvalState & HESSIAN_MASK) ? d2Fdx2 : EvalHessian(); }
 
+   /** @brief Return the weight of the Jacobian matrix of the transformation
+       at the currently set IntegrationPoint.
+       The Weight evaluates to \f$ \sqrt{\lvert J^T J \rvert} \f$. */
    double Weight() { return (EvalState & WEIGHT_MASK) ? Wght : EvalWeight(); }
 
+   /** @brief Return the adjugate of the Jacobian matrix of the transformation
+        at the currently set IntegrationPoint. */
    const DenseMatrix &AdjugateJacobian()
    { return (EvalState & ADJUGATE_MASK) ? adjJ : EvalAdjugateJ(); }
 
+   /** @brief Return the inverse of the Jacobian matrix of the transformation
+        at the currently set IntegrationPoint. */
    const DenseMatrix &InverseJacobian()
    { return (EvalState & INVERSE_MASK) ? invJ : EvalInverseJ(); }
 
+   /// Return the order of the current element we are using for the transformation.
    virtual int Order() const = 0;
+
+   /// Return the order of the elements of the Jacobian of the transformation.
    virtual int OrderJ() const = 0;
+
+   /** @brief Return the order of the determinant of the Jacobian (weight)
+       of the transformation. */
    virtual int OrderW() const = 0;
-   /// Order of adj(J)^t.grad(fi)
+
+   /// Return the order of \f$ adj(J)^T \nabla fi \f$
    virtual int OrderGrad(const FiniteElement *fe) const = 0;
 
    /// Return the Geometry::Type of the reference element.
    Geometry::Type GetGeometryType() const { return geom; }
 
-   /// Return the dimension of the reference element.
+   /// Return the topological dimension of the reference element.
    int GetDimension() const { return Geometry::Dimension[geom]; }
 
    /// Get the dimension of the target (physical) space.
@@ -286,7 +341,7 @@ class InverseElementTransformation
    virtual int Transform(const Vector &pt, IntegrationPoint &ip);
 };
 
-
+/// A standard isoparametric element transformation
 class IsoparametricTransformation : public ElementTransformation
 {
 private:
@@ -296,26 +351,29 @@ class IsoparametricTransformation : public ElementTransformation
    const FiniteElement *FElem;
    DenseMatrix PointMat; // dim x dof
 
-   // Evaluate the Jacobian of the transformation at the IntPoint and store it
-   // in dFdx.
+   /** @brief Evaluate the Jacobian of the transformation at the IntPoint and
+       store it in dFdx. */
    virtual const DenseMatrix &EvalJacobian();
    // Evaluate the Hessian of the transformation at the IntPoint and store it
    // in d2Fdx2.
    virtual const DenseMatrix &EvalHessian();
 public:
+   /// Set the element that will be used to compute the transformations
    void SetFE(const FiniteElement *FE) { FElem = FE; geom = FE->GetGeomType(); }
+
+   /// Get the current element used to compute the transformations
    const FiniteElement* GetFE() const { return FElem; }
 
    /// @brief Set the underlying point matrix describing the transformation.
    /** The dimensions of the matrix are space-dim x dof. The transformation is
        defined as
+           \f$ x = F( \hat x ) = P \phi( \hat x ) \f$
 
-           x = F(xh) = P . phi(xh),
-
-       where xh (x hat) is the reference point, x is the corresponding physical
-       point, P is the point matrix, and phi(xh) is the column-vector of all
-       basis functions evaluated at xh. The columns of P represent the control
-       points in physical space defining the transformation. */
+       where \f$ \hat x \f$  is the reference point, @a x is the corresponding
+       physical point, @a P is the point matrix, and \f$ \phi( \hat x ) \f$ is
+       the column-vector of all basis functions evaluated at \f$ \hat x \f$ .
+       The columns of @a P represent the control points in physical space
+       defining the transformation. */
    void SetPointMat(const DenseMatrix &pm) { PointMat = pm; }
 
    /// Return the stored point matrix.
@@ -324,19 +382,44 @@ class IsoparametricTransformation : public ElementTransformation
    /// Write access to the stored point matrix. Use with caution.
    DenseMatrix &GetPointMat() { return PointMat; }
 
+   /// Set the FiniteElement Geometry for the reference elements being used.
    void SetIdentityTransformation(Geometry::Type GeomType);
 
+   /** @brief Transform integration point from reference coordinates to
+       physical coordinates and store them in the vector. */
    virtual void Transform(const IntegrationPoint &, Vector &);
+
+   /** @brief Transform all the integration points from the integration rule
+       from reference coordinates to physical
+       coordinates and store them as column vectors in the matrix. */
    virtual void Transform(const IntegrationRule &, DenseMatrix &);
+
+   /** @brief Transform all the integration points from the column vectors
+       of @a matrix from reference coordinates to physical
+       coordinates and store them as column vectors in @a result. */
    virtual void Transform(const DenseMatrix &matrix, DenseMatrix &result);
 
+   /// Return the order of the current element we are using for the transformation.
    virtual int Order() const { return FElem->GetOrder(); }
+
+   /// Return the order of the elements of the Jacobian of the transformation.
    virtual int OrderJ() const;
+
+   /** @brief Return the order of the determinant of the Jacobian (weight)
+       of the transformation. */
    virtual int OrderW() const;
+
+   /// Return the order of \f$ adj(J)^T \nabla fi \f$
    virtual int OrderGrad(const FiniteElement *fe) const;
 
    virtual int GetSpaceDim() const { return PointMat.Height(); }
 
+   /** @brief Transform a point @a v from physical space to a point @a ip in
+       reference space. */
+   /** Attempt to find the IntegrationPoint that is transformed into the given
+       point in physical space. If the inversion fails a non-zero value is
+       returned. This method is not 100 percent reliable for non-linear
+       transformations. */
    virtual int TransformBack(const Vector & v, IntegrationPoint & ip)
    {
       InverseElementTransformation inv_tr(this);
@@ -356,15 +439,62 @@ class IntegrationPointTransformation
    void Transform (const IntegrationRule  &, IntegrationRule  &);
 };
 
-class FaceElementTransformations
+
+class FaceElementTransformations : public IsoparametricTransformation
 {
+private:
+   int mask;
+
+   IntegrationPoint eip1, eip2;
+
 public:
-   int Elem1No, Elem2No, FaceGeom;
-   ElementTransformation *Elem1, *Elem2, *Face;
+   int Elem1No, Elem2No;
+   Geometry::Type &FaceGeom; ///< @deprecated Use GetGeometryType instead
+   ElementTransformation *Elem1, *Elem2;
+   ElementTransformation *Face; ///< @deprecated No longer necessary
    IntegrationPointTransformation Loc1, Loc2;
+
+   FaceElementTransformations() : FaceGeom(geom), Face(this) {}
+
+   /** @brief Method to set the geometry type of the face.
+
+       @note This method is designed to be used when
+       [Par]Mesh::GetFaceTransformation will not be called i.e. when the face
+       transformation will not be needed but the neighboring element
+       transformations will be.  Using this method to override the GeometryType
+       should only be done with great care.
+   */
+   void SetGeometryType(Geometry::Type g) { geom = g; }
+
+   /// Set the mask indicating which portions of the object have been setup
+   /** The argument @a m is a bitmask used in
+       Mesh::GetFaceElementTransformations to indicate which portions of the
+       FaceElement Transformations object have been configured.
+
+       mask &  1: Elem1 is configured
+       mask &  2: Elem2 is configured
+       mask &  4: Loc1 is configured
+       mask &  8: Loc2 is configured
+       mask & 16: The Face transformation itself is configured
+   */
+   void SetConfigurationMask(int m) { mask = m; }
+   int  GetConfigurationMask() const { return mask; }
+
+   /** @brief Set the integration point in the Face and the two neighboring
+       elements, if present. */
+   void SetIntPoint(const IntegrationPoint *ip);
+
+   virtual void Transform(const IntegrationPoint &, Vector &);
+   virtual void Transform(const IntegrationRule &, DenseMatrix &);
+   virtual void Transform(const DenseMatrix &matrix, DenseMatrix &result);
+
+   ElementTransformation & GetElement1Transformation();
+   ElementTransformation & GetElement2Transformation();
+   IntegrationPointTransformation & GetIntPoint1Transformation();
+   IntegrationPointTransformation & GetIntPoint2Transformation();
 };
 
-/*                 Elem1(Loc1(x)) = Face(x) = Elem2(Loc2(x))
+/**                Elem1(Loc1(x)) = Face(x) = Elem2(Loc2(x))
 
 
                                 Physical Space
diff --git a/fem/estimators.cpp b/fem/estimators.cpp
index 47851c4e82f..a0289b5ccab 100644
--- a/fem/estimators.cpp
+++ b/fem/estimators.cpp
@@ -50,4 +50,21 @@ void L2ZienkiewiczZhuEstimator::ComputeEstimates()
 
 #endif // MFEM_USE_MPI
 
+void LpErrorEstimator::ComputeEstimates()
+{
+   MFEM_VERIFY(coef != NULL || vcoef != NULL,
+               "LpErrorEstimator has no coefficient!  Call SetCoef first.");
+
+   error_estimates.SetSize(sol->FESpace()->GetMesh()->GetNE());
+   if (coef)
+   {
+      sol->ComputeElementLpErrors(local_norm_p, *coef, error_estimates);
+   }
+   else
+   {
+      sol->ComputeElementLpErrors(local_norm_p, *vcoef, error_estimates);
+   }
+   current_sequence = sol->FESpace()->GetMesh()->GetSequence();
+}
+
 } // namespace mfem
diff --git a/fem/estimators.hpp b/fem/estimators.hpp
index 75c4eba8256..16f66024dee 100644
--- a/fem/estimators.hpp
+++ b/fem/estimators.hpp
@@ -45,6 +45,7 @@ class ErrorEstimator : public AbstractErrorEstimator
    /// Force recomputation of the estimates on the next call to GetLocalErrors.
    virtual void Reset() = 0;
 
+   /// Destruct the error estimator
    virtual ~ErrorEstimator() { }
 };
 
@@ -66,6 +67,14 @@ class AnisotropicErrorEstimator : public ErrorEstimator
 /** @brief The ZienkiewiczZhuEstimator class implements the Zienkiewicz-Zhu
     error estimation procedure.
 
+    Zienkiewicz, O.C. and Zhu, J.Z., The superconvergent patch recovery
+    and a posteriori error estimates. Part 1: The recovery technique.
+    Int. J. Num. Meth. Engng. 33, 1331-1364 (1992).
+
+    Zienkiewicz, O.C. and Zhu, J.Z., The superconvergent patch recovery
+    and a posteriori error estimates. Part 2: Error estimates and adaptivity.
+    Int. J. Num. Meth. Engng. 33, 1365-1382 (1992).
+
     The required BilinearFormIntegrator must implement the methods
     ComputeElementFlux() and ComputeFluxEnergy().
  */
@@ -217,6 +226,7 @@ class L2ZienkiewiczZhuEstimator : public ErrorEstimator
       class when needed.*/
    bool own_flux_fes; ///< Ownership flag for flux_space and smooth_flux_space.
 
+   /// Initialize with the integrator, solution, and flux finite element spaces.
    void Init(BilinearFormIntegrator &integ,
              ParGridFunction &sol,
              ParFiniteElementSpace *flux_fes,
@@ -304,6 +314,88 @@ class L2ZienkiewiczZhuEstimator : public ErrorEstimator
 
 #endif // MFEM_USE_MPI
 
+/** @brief The LpErrorEstimator class compares the solution to a known
+    coefficient.
+
+    This class can be used, for example, to adapt a mesh to a non-trivial
+    initial condition in a time-dependent simulation. It can also be used to
+    force refinement in the neighborhood of small features before switching to a
+    more traditional error estimator.
+
+    The LpErrorEstimator supports either scalar or vector coefficients and works
+    both in serial and in parallel.
+*/
+class LpErrorEstimator : public ErrorEstimator
+{
+protected:
+   long current_sequence;
+   int local_norm_p;
+   Vector error_estimates;
+
+   Coefficient * coef;
+   VectorCoefficient * vcoef;
+   GridFunction * sol;
+
+   /// Check if the mesh of the solution was modified.
+   bool MeshIsModified()
+   {
+      long mesh_sequence = sol->FESpace()->GetMesh()->GetSequence();
+      MFEM_ASSERT(mesh_sequence >= current_sequence, "");
+      return (mesh_sequence > current_sequence);
+   }
+
+   /// Compute the element error estimates.
+   void ComputeEstimates();
+
+public:
+   /** @brief Construct a new LpErrorEstimator object without a coefficient.
+       @param p    Integer which selects which Lp norm to use.
+       @param sol  The GridFunction representation of the field.
+       @note A coefficient must be set with SetCoef() before use.
+   */
+   LpErrorEstimator(int p, GridFunction &sol)
+      : current_sequence(-1), local_norm_p(p),
+        error_estimates(0), coef(NULL), vcoef(NULL), sol(&sol) { }
+
+   /** @brief Construct a new LpErrorEstimator object for a scalar field.
+       @param p    Integer which selects which Lp norm to use.
+       @param coef The scalar Coefficient to compare to the solution.
+       @param sol  The GridFunction representation of the scalar field.
+   */
+   LpErrorEstimator(int p, Coefficient &coef, GridFunction &sol)
+      : current_sequence(-1), local_norm_p(p),
+        error_estimates(0), coef(&coef), vcoef(NULL), sol(&sol) { }
+
+   /** @brief Construct a new LpErrorEstimator object for a vector field.
+       @param p    Integer which selects which Lp norm to use.
+       @param coef The VectorCoefficient to compare to the solution.
+       @param sol  The GridFunction representation of the vector field.
+   */
+   LpErrorEstimator(int p, VectorCoefficient &coef, GridFunction &sol)
+      : current_sequence(-1), local_norm_p(p),
+        error_estimates(0), coef(NULL), vcoef(&coef), sol(&sol) { }
+
+   /** @brief Set the exponent, p, of the Lp norm used for computing the local
+       element errors. */
+   void SetLocalErrorNormP(int p) { local_norm_p = p; }
+
+   void SetCoef(Coefficient &A) { coef = &A; }
+   void SetCoef(VectorCoefficient &A) { vcoef = &A; }
+
+   /// Reset the error estimator.
+   virtual void Reset() { current_sequence = -1; }
+
+   /// Get a Vector with all element errors.
+   virtual const Vector &GetLocalErrors()
+   {
+      if (MeshIsModified()) { ComputeEstimates(); }
+      return error_estimates;
+   }
+
+   /// Destructor
+   virtual ~LpErrorEstimator() {}
+};
+
 } // namespace mfem
 
 #endif // MFEM_ERROR_ESTIMATORS
diff --git a/fem/fe.cpp b/fem/fe.cpp
index 5a3f16f78f4..46e657a2794 100644
--- a/fem/fe.cpp
+++ b/fem/fe.cpp
@@ -25,15 +25,15 @@ using namespace std;
 FiniteElement::FiniteElement(int D, Geometry::Type G, int Do, int O, int F)
    : Nodes(Do)
 {
-   Dim = D ; GeomType = G ; Dof = Do ; Order = O ; FuncSpace = F;
-   RangeType = SCALAR;
-   MapType = VALUE;
-   DerivType = NONE;
-   DerivRangeType = SCALAR;
-   DerivMapType = VALUE;
-   for (int i = 0; i < Geometry::MaxDim; i++) { Orders[i] = -1; }
+   dim = D ; geom_type = G ; dof = Do ; order = O ; func_space = F;
+   range_type = SCALAR;
+   map_type = VALUE;
+   deriv_type = NONE;
+   deriv_range_type = SCALAR;
+   deriv_map_type = VALUE;
+   for (int i = 0; i < Geometry::MaxDim; i++) { orders[i] = -1; }
 #ifndef MFEM_THREAD_SAFE
-   vshape.SetSize(Dof, Dim);
+   vshape.SetSize(dof, dim);
 #endif
 }
 
@@ -75,12 +75,12 @@ void FiniteElement::CalcCurlShape(const IntegrationPoint &ip,
 void FiniteElement::CalcPhysCurlShape(ElementTransformation &Trans,
                                       DenseMatrix &curl_shape) const
 {
-   switch (Dim)
+   switch (dim)
    {
       case 3:
       {
 #ifdef MFEM_THREAD_SAFE
-         DenseMatrix vshape(Dof, Dim);
+         DenseMatrix vshape(dof, dim);
 #endif
          CalcCurlShape(Trans.GetIntPoint(), vshape);
          MultABt(vshape, Trans.Jacobian(), curl_shape);
@@ -93,7 +93,7 @@ void FiniteElement::CalcPhysCurlShape(ElementTransformation &Trans,
          curl_shape *= (1.0 / Trans.Weight());
          break;
       default:
-         MFEM_ABORT("Invalid dimension, Dim = " << Dim);
+         MFEM_ABORT("Invalid dimension, Dim = " << dim);
    }
 }
 
@@ -186,7 +186,7 @@ void FiniteElement::CalcPhysShape(ElementTransformation &Trans,
                                   Vector &shape) const
 {
    CalcShape(Trans.GetIntPoint(), shape);
-   if (MapType == INTEGRAL)
+   if (map_type == INTEGRAL)
    {
       shape /= Trans.Weight();
    }
@@ -195,9 +195,9 @@ void FiniteElement::CalcPhysShape(ElementTransformation &Trans,
 void FiniteElement::CalcPhysDShape(ElementTransformation &Trans,
                                    DenseMatrix &dshape) const
 {
-   MFEM_ASSERT(MapType == VALUE, "");
+   MFEM_ASSERT(map_type == VALUE, "");
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
+   DenseMatrix vshape(dof, dim);
 #endif
    CalcDShape(Trans.GetIntPoint(), vshape);
    Mult(vshape, Trans.InverseJacobian(), dshape);
@@ -206,7 +206,7 @@ void FiniteElement::CalcPhysDShape(ElementTransformation &Trans,
 void FiniteElement::CalcPhysLaplacian(ElementTransformation &Trans,
                                       Vector &Laplacian) const
 {
-   MFEM_ASSERT(MapType == VALUE, "");
+   MFEM_ASSERT(map_type == VALUE, "");
 
    // Simpler routine if mapping is affine
    if (Trans.Hessian().FNorm2() < 1e-20)
@@ -216,27 +216,27 @@ void FiniteElement::CalcPhysLaplacian(ElementTransformation &Trans,
    }
 
    // Compute full Hessian first if non-affine
-   int size = (Dim*(Dim+1))/2;
-   DenseMatrix hess(Dof, size);
+   int size = (dim*(dim+1))/2;
+   DenseMatrix hess(dof, size);
    CalcPhysHessian(Trans,hess);
 
-   if (Dim == 3)
+   if (dim == 3)
    {
-      for (int nd = 0; nd < Dof; nd++)
+      for (int nd = 0; nd < dof; nd++)
       {
          Laplacian[nd] = hess(nd,0) + hess(nd,4) + hess(nd,5);
       }
    }
-   else if (Dim == 2)
+   else if (dim == 2)
    {
-      for (int nd = 0; nd < Dof; nd++)
+      for (int nd = 0; nd < dof; nd++)
       {
          Laplacian[nd] = hess(nd,0) + hess(nd,2);
       }
    }
    else
    {
-      for (int nd = 0; nd < Dof; nd++)
+      for (int nd = 0; nd < dof; nd++)
       {
          Laplacian[nd] = hess(nd,0);
       }
@@ -248,16 +248,16 @@ void FiniteElement::CalcPhysLaplacian(ElementTransformation &Trans,
 void FiniteElement::CalcPhysLinLaplacian(ElementTransformation &Trans,
                                          Vector &Laplacian) const
 {
-   MFEM_ASSERT(MapType == VALUE, "");
-   int size = (Dim*(Dim+1))/2;
-   DenseMatrix hess(Dof, size);
-   DenseMatrix Gij(Dim,Dim);
+   MFEM_ASSERT(map_type == VALUE, "");
+   int size = (dim*(dim+1))/2;
+   DenseMatrix hess(dof, size);
+   DenseMatrix Gij(dim,dim);
    Vector scale(size);
 
    CalcHessian (Trans.GetIntPoint(), hess);
    MultAAt(Trans.InverseJacobian(), Gij);
 
-   if (Dim == 3)
+   if (dim == 3)
    {
       scale[0] =   Gij(0,0);
       scale[1] = 2*Gij(0,1);
@@ -268,7 +268,7 @@ void FiniteElement::CalcPhysLinLaplacian(ElementTransformation &Trans,
 
       scale[5] =   Gij(1,1);
    }
-   else if (Dim == 2)
+   else if (dim == 2)
    {
       scale[0] =   Gij(0,0);
       scale[1] = 2*Gij(0,1);
@@ -279,7 +279,7 @@ void FiniteElement::CalcPhysLinLaplacian(ElementTransformation &Trans,
       scale[0] =   Gij(0,0);
    }
 
-   for (int nd = 0; nd < Dof; nd++)
+   for (int nd = 0; nd < dof; nd++)
    {
       Laplacian[nd] = 0.0;
       for (int ii = 0; ii < size; ii++)
@@ -293,11 +293,11 @@ void FiniteElement::CalcPhysLinLaplacian(ElementTransformation &Trans,
 void  FiniteElement::CalcPhysHessian(ElementTransformation &Trans,
                                      DenseMatrix& Hessian) const
 {
-   MFEM_ASSERT(MapType == VALUE, "");
+   MFEM_ASSERT(map_type == VALUE, "");
 
    // Roll 2-Tensors in vectors and 4-Tensor in Matrix, exploiting symmetry
-   Array<int> map(Dim*Dim);
-   if (Dim == 3)
+   Array<int> map(dim*dim);
+   if (dim == 3)
    {
       map[0] = 0;
       map[1] = 1;
@@ -311,7 +311,7 @@ void  FiniteElement::CalcPhysHessian(ElementTransformation &Trans,
       map[7] = 3;
       map[8] = 4;
    }
-   else if (Dim == 2)
+   else if (dim == 2)
    {
       map[0] = 0;
       map[1] = 1;
@@ -325,16 +325,16 @@ void  FiniteElement::CalcPhysHessian(ElementTransformation &Trans,
    }
 
    // Hessian in ref coords
-   int size = (Dim*(Dim+1))/2;
-   DenseMatrix hess(Dof, size);
+   int size = (dim*(dim+1))/2;
+   DenseMatrix hess(dof, size);
    CalcHessian(Trans.GetIntPoint(), hess);
 
    // Gradient in physical coords
    if (Trans.Hessian().FNorm2() > 1e-10)
    {
-      DenseMatrix grad(Dof, Dim);
+      DenseMatrix grad(dof, dim);
       CalcPhysDShape(Trans, grad);
-      DenseMatrix gmap(Dof, size);
+      DenseMatrix gmap(dof, size);
       Mult(grad,Trans.Hessian(),gmap);
       hess -= gmap;
    }
@@ -343,15 +343,15 @@ void  FiniteElement::CalcPhysHessian(ElementTransformation &Trans,
    DenseMatrix lhm(size,size);
    DenseMatrix invJ = Trans.Jacobian();
    lhm = 0.0;
-   for (int i = 0; i < Dim; i++)
+   for (int i = 0; i < dim; i++)
    {
-      for (int j = 0; j < Dim; j++)
+      for (int j = 0; j < dim; j++)
       {
-         for (int k = 0; k < Dim; k++)
+         for (int k = 0; k < dim; k++)
          {
-            for (int l = 0; l < Dim; l++)
+            for (int l = 0; l < dim; l++)
             {
-               lhm(map[i*Dim+j],map[k*Dim+l]) += invJ(i,k)*invJ(j,l);
+               lhm(map[i*dim+j],map[k*dim+l]) += invJ(i,k)*invJ(j,l);
             }
          }
       }
@@ -359,7 +359,7 @@ void  FiniteElement::CalcPhysHessian(ElementTransformation &Trans,
    // Correct multiplicity
    Vector mult(size);
    mult = 0.0;
-   for (int i = 0; i < Dim*Dim; i++) { mult[map[i]]++; }
+   for (int i = 0; i < dim*dim; i++) { mult[map[i]]++; }
    lhm.InvRightScaling(mult);
 
    // Hessian in physical coords
@@ -389,31 +389,31 @@ void ScalarFiniteElement::NodalLocalInterpolation (
    const ScalarFiniteElement &fine_fe) const
 {
    double v[Geometry::MaxDim];
-   Vector vv (v, Dim);
+   Vector vv (v, dim);
    IntegrationPoint f_ip;
 
 #ifdef MFEM_THREAD_SAFE
-   Vector c_shape(Dof);
+   Vector c_shape(dof);
 #endif
 
-   MFEM_ASSERT(MapType == fine_fe.GetMapType(), "");
+   MFEM_ASSERT(map_type == fine_fe.GetMapType(), "");
 
-   I.SetSize(fine_fe.Dof, Dof);
-   for (int i = 0; i < fine_fe.Dof; i++)
+   I.SetSize(fine_fe.dof, dof);
+   for (int i = 0; i < fine_fe.dof; i++)
    {
       Trans.Transform(fine_fe.Nodes.IntPoint(i), vv);
-      f_ip.Set(v, Dim);
+      f_ip.Set(v, dim);
       CalcShape(f_ip, c_shape);
-      for (int j = 0; j < Dof; j++)
+      for (int j = 0; j < dof; j++)
          if (fabs(I(i,j) = c_shape(j)) < 1.0e-12)
          {
             I(i,j) = 0.0;
          }
    }
-   if (MapType == INTEGRAL)
+   if (map_type == INTEGRAL)
    {
       // assuming Trans is linear; this should be ok for all refinement types
-      Trans.SetIntPoint(&Geometries.GetCenter(GeomType));
+      Trans.SetIntPoint(&Geometries.GetCenter(geom_type));
       I *= Trans.Weight();
    }
 }
@@ -425,11 +425,11 @@ void ScalarFiniteElement::ScalarLocalInterpolation(
    // General "interpolation", defined by L2 projection
 
    double v[Geometry::MaxDim];
-   Vector vv (v, Dim);
+   Vector vv (v, dim);
    IntegrationPoint f_ip;
 
    const int fs = fine_fe.GetDof(), cs = this->GetDof();
-   I.SetSize(fs, cs);
+   I.SetSize(fs, cs );
    Vector fine_shape(fs), coarse_shape(cs);
    DenseMatrix fine_mass(fs), fine_coarse_mass(fs, cs); // initialized with 0
    const int ir_order = GetOrder() + fine_fe.GetOrder();
@@ -440,7 +440,7 @@ void ScalarFiniteElement::ScalarLocalInterpolation(
       const IntegrationPoint &ip = ir.IntPoint(i);
       fine_fe.CalcShape(ip, fine_shape);
       Trans.Transform(ip, vv);
-      f_ip.Set(v, Dim);
+      f_ip.Set(v, dim);
       this->CalcShape(f_ip, coarse_shape);
 
       AddMult_a_VVt(ip.weight, fine_shape, fine_mass);
@@ -450,10 +450,10 @@ void ScalarFiniteElement::ScalarLocalInterpolation(
    DenseMatrixInverse fine_mass_inv(fine_mass);
    fine_mass_inv.Mult(fine_coarse_mass, I);
 
-   if (MapType == INTEGRAL)
+   if (map_type == INTEGRAL)
    {
       // assuming Trans is linear; this should be ok for all refinement types
-      Trans.SetIntPoint(&Geometries.GetCenter(GeomType));
+      Trans.SetIntPoint(&Geometries.GetCenter(geom_type));
       I *= Trans.Weight();
    }
 }
@@ -474,30 +474,30 @@ const DofToQuad &ScalarFiniteElement::GetDofToQuad(const IntegrationRule &ir,
    d2q->FE = this;
    d2q->IntRule = &ir;
    d2q->mode = mode;
-   d2q->ndof = Dof;
+   d2q->ndof = dof;
    d2q->nqpt = nqpt;
-   d2q->B.SetSize(nqpt*Dof);
-   d2q->Bt.SetSize(Dof*nqpt);
-   d2q->G.SetSize(nqpt*Dim*Dof);
-   d2q->Gt.SetSize(Dof*nqpt*Dim);
+   d2q->B.SetSize(nqpt*dof);
+   d2q->Bt.SetSize(dof*nqpt);
+   d2q->G.SetSize(nqpt*dim*dof);
+   d2q->Gt.SetSize(dof*nqpt*dim);
 #ifdef MFEM_THREAD_SAFE
-   Vector c_shape(Dof);
-   DenseMatrix vshape(Dof, Dim);
+   Vector c_shape(dof);
+   DenseMatrix vshape(dof, dim);
 #endif
    for (int i = 0; i < nqpt; i++)
    {
       const IntegrationPoint &ip = ir.IntPoint(i);
       CalcShape(ip, c_shape);
-      for (int j = 0; j < Dof; j++)
+      for (int j = 0; j < dof; j++)
       {
-         d2q->B[i+nqpt*j] = d2q->Bt[j+Dof*i] = c_shape(j);
+         d2q->B[i+nqpt*j] = d2q->Bt[j+dof*i] = c_shape(j);
       }
       CalcDShape(ip, vshape);
-      for (int d = 0; d < Dim; d++)
+      for (int d = 0; d < dim; d++)
       {
-         for (int j = 0; j < Dof; j++)
+         for (int j = 0; j < dof; j++)
          {
-            d2q->G[i+nqpt*(d+Dim*j)] = d2q->Gt[j+Dof*(i+nqpt*d)] = vshape(j,d);
+            d2q->G[i+nqpt*(d+dim*j)] = d2q->Gt[j+dof*(i+nqpt*d)] = vshape(j,d);
          }
       }
    }
@@ -520,8 +520,8 @@ const DofToQuad &ScalarFiniteElement::GetTensorDofToQuad(
 
    DofToQuad *d2q = new DofToQuad;
    const Poly_1D::Basis &basis_1d = tb.GetBasis1D();
-   const int ndof = Order + 1;
-   const int nqpt = (int)floor(pow(ir.GetNPoints(), 1.0/Dim) + 0.5);
+   const int ndof = order + 1;
+   const int nqpt = (int)floor(pow(ir.GetNPoints(), 1.0/dim) + 0.5);
    d2q->FE = this;
    d2q->IntRule = &ir;
    d2q->mode = mode;
@@ -556,8 +556,8 @@ void NodalFiniteElement::ProjectCurl_2D(
 
    DenseMatrix curl_shape(fe.GetDof(), 1);
 
-   curl.SetSize(Dof, fe.GetDof());
-   for (int i = 0; i < Dof; i++)
+   curl.SetSize(dof, fe.GetDof());
+   for (int i = 0; i < dof; i++)
    {
       fe.CalcCurlShape(Nodes.IntPoint(i), curl_shape);
       for (int j = 0; j < fe.GetDof(); j++)
@@ -587,18 +587,18 @@ void NodalFiniteElement::GetLocalRestriction(ElementTransformation &Trans,
                                              DenseMatrix &R) const
 {
    IntegrationPoint ipt;
-   Vector pt(&ipt.x, Dim);
+   Vector pt(&ipt.x, dim);
 
 #ifdef MFEM_THREAD_SAFE
-   Vector c_shape(Dof);
+   Vector c_shape(dof);
 #endif
 
    Trans.SetIntPoint(&Nodes[0]);
 
-   for (int j = 0; j < Dof; j++)
+   for (int j = 0; j < dof; j++)
    {
       InvertLinearTrans(Trans, Nodes[j], pt);
-      if (Geometries.CheckPoint(GeomType, ipt)) // do we need an epsilon here?
+      if (Geometries.CheckPoint(geom_type, ipt)) // do we need an epsilon here?
       {
          CalcShape(ipt, c_shape);
          R.SetRow(j, c_shape);
@@ -615,14 +615,14 @@ void NodalFiniteElement::GetLocalRestriction(ElementTransformation &Trans,
 void NodalFiniteElement::Project (
    Coefficient &coeff, ElementTransformation &Trans, Vector &dofs) const
 {
-   for (int i = 0; i < Dof; i++)
+   for (int i = 0; i < dof; i++)
    {
       const IntegrationPoint &ip = Nodes.IntPoint(i);
       // some coefficients expect that Trans.IntPoint is the same
       // as the second argument of Eval
       Trans.SetIntPoint(&ip);
       dofs(i) = coeff.Eval (Trans, ip);
-      if (MapType == INTEGRAL)
+      if (map_type == INTEGRAL)
       {
          dofs(i) *= Trans.Weight();
       }
@@ -632,21 +632,21 @@ void NodalFiniteElement::Project (
 void NodalFiniteElement::Project (
    VectorCoefficient &vc, ElementTransformation &Trans, Vector &dofs) const
 {
-   MFEM_ASSERT(dofs.Size() == vc.GetVDim()*Dof, "");
+   MFEM_ASSERT(dofs.Size() == vc.GetVDim()*dof, "");
    Vector x(vc.GetVDim());
 
-   for (int i = 0; i < Dof; i++)
+   for (int i = 0; i < dof; i++)
    {
       const IntegrationPoint &ip = Nodes.IntPoint(i);
       Trans.SetIntPoint(&ip);
       vc.Eval (x, Trans, ip);
-      if (MapType == INTEGRAL)
+      if (map_type == INTEGRAL)
       {
          x *= Trans.Weight();
       }
       for (int j = 0; j < x.Size(); j++)
       {
-         dofs(Dof*j+i) = x(j);
+         dofs(dof*j+i) = x(j);
       }
    }
 }
@@ -654,20 +654,20 @@ void NodalFiniteElement::Project (
 void NodalFiniteElement::ProjectMatrixCoefficient(
    MatrixCoefficient &mc, ElementTransformation &T, Vector &dofs) const
 {
-   // (mc.height x mc.width) @ DOFs -> (Dof x mc.width x mc.height) in dofs
-   MFEM_ASSERT(dofs.Size() == mc.GetHeight()*mc.GetWidth()*Dof, "");
+   // (mc.height x mc.width) @ DOFs -> (dof x mc.width x mc.height) in dofs
+   MFEM_ASSERT(dofs.Size() == mc.GetHeight()*mc.GetWidth()*dof, "");
    DenseMatrix MQ(mc.GetHeight(), mc.GetWidth());
 
-   for (int k = 0; k < Dof; k++)
+   for (int k = 0; k < dof; k++)
    {
       T.SetIntPoint(&Nodes.IntPoint(k));
       mc.Eval(MQ, T, Nodes.IntPoint(k));
-      if (MapType == INTEGRAL) { MQ *= T.Weight(); }
+      if (map_type == INTEGRAL) { MQ *= T.Weight(); }
       for (int r = 0; r < MQ.Height(); r++)
       {
          for (int d = 0; d < MQ.Width(); d++)
          {
-            dofs(k+Dof*(d+MQ.Width()*r)) = MQ(r,d);
+            dofs(k+dof*(d+MQ.Width()*r)) = MQ(r,d);
          }
       }
    }
@@ -678,12 +678,12 @@ void NodalFiniteElement::Project(
 {
    if (fe.GetRangeType() == SCALAR)
    {
-      MFEM_ASSERT(MapType == fe.GetMapType(), "");
+      MFEM_ASSERT(map_type == fe.GetMapType(), "");
 
       Vector shape(fe.GetDof());
 
-      I.SetSize(Dof, fe.GetDof());
-      for (int k = 0; k < Dof; k++)
+      I.SetSize(dof, fe.GetDof());
+      for (int k = 0; k < dof; k++)
       {
          fe.CalcShape(Nodes.IntPoint(k), shape);
          for (int j = 0; j < shape.Size(); j++)
@@ -696,19 +696,19 @@ void NodalFiniteElement::Project(
    {
       DenseMatrix vshape(fe.GetDof(), Trans.GetSpaceDim());
 
-      I.SetSize(vshape.Width()*Dof, fe.GetDof());
-      for (int k = 0; k < Dof; k++)
+      I.SetSize(vshape.Width()*dof, fe.GetDof());
+      for (int k = 0; k < dof; k++)
       {
          Trans.SetIntPoint(&Nodes.IntPoint(k));
          fe.CalcVShape(Trans, vshape);
-         if (MapType == INTEGRAL)
+         if (map_type == INTEGRAL)
          {
             vshape *= Trans.Weight();
          }
          for (int j = 0; j < vshape.Height(); j++)
             for (int d = 0; d < vshape.Width(); d++)
             {
-               I(k+d*Dof,j) = vshape(j,d);
+               I(k+d*dof,j) = vshape(j,d);
             }
       }
    }
@@ -719,26 +719,26 @@ void NodalFiniteElement::ProjectGrad(
    DenseMatrix &grad) const
 {
    MFEM_ASSERT(fe.GetMapType() == VALUE, "");
-   MFEM_ASSERT(Trans.GetSpaceDim() == Dim, "")
+   MFEM_ASSERT(Trans.GetSpaceDim() == dim, "")
 
-   DenseMatrix dshape(fe.GetDof(), Dim), grad_k(fe.GetDof(), Dim), Jinv(Dim);
+   DenseMatrix dshape(fe.GetDof(), dim), grad_k(fe.GetDof(), dim), Jinv(dim);
 
-   grad.SetSize(Dim*Dof, fe.GetDof());
-   for (int k = 0; k < Dof; k++)
+   grad.SetSize(dim*dof, fe.GetDof());
+   for (int k = 0; k < dof; k++)
    {
       const IntegrationPoint &ip = Nodes.IntPoint(k);
       fe.CalcDShape(ip, dshape);
       Trans.SetIntPoint(&ip);
       CalcInverse(Trans.Jacobian(), Jinv);
       Mult(dshape, Jinv, grad_k);
-      if (MapType == INTEGRAL)
+      if (map_type == INTEGRAL)
       {
          grad_k *= Trans.Weight();
       }
       for (int j = 0; j < grad_k.Height(); j++)
-         for (int d = 0; d < Dim; d++)
+         for (int d = 0; d < dim; d++)
          {
-            grad(k+d*Dof,j) = grad_k(j,d);
+            grad(k+d*dof,j) = grad_k(j,d);
          }
    }
 }
@@ -750,12 +750,12 @@ void NodalFiniteElement::ProjectDiv(
    double detJ;
    Vector div_shape(fe.GetDof());
 
-   div.SetSize(Dof, fe.GetDof());
-   for (int k = 0; k < Dof; k++)
+   div.SetSize(dof, fe.GetDof());
+   for (int k = 0; k < dof; k++)
    {
       const IntegrationPoint &ip = Nodes.IntPoint(k);
       fe.CalcDivShape(ip, div_shape);
-      if (MapType == VALUE)
+      if (map_type == VALUE)
       {
          Trans.SetIntPoint(&ip);
          detJ = Trans.Weight();
@@ -778,7 +778,7 @@ void NodalFiniteElement::ProjectDiv(
 void PositiveFiniteElement::Project(
    Coefficient &coeff, ElementTransformation &Trans, Vector &dofs) const
 {
-   for (int i = 0; i < Dof; i++)
+   for (int i = 0; i < dof; i++)
    {
       const IntegrationPoint &ip = Nodes.IntPoint(i);
       Trans.SetIntPoint(&ip);
@@ -789,17 +789,17 @@ void PositiveFiniteElement::Project(
 void PositiveFiniteElement::Project(
    VectorCoefficient &vc, ElementTransformation &Trans, Vector &dofs) const
 {
-   MFEM_ASSERT(dofs.Size() == vc.GetVDim()*Dof, "");
+   MFEM_ASSERT(dofs.Size() == vc.GetVDim()*dof, "");
    Vector x(vc.GetVDim());
 
-   for (int i = 0; i < Dof; i++)
+   for (int i = 0; i < dof; i++)
    {
       const IntegrationPoint &ip = Nodes.IntPoint(i);
       Trans.SetIntPoint(&ip);
       vc.Eval (x, Trans, ip);
       for (int j = 0; j < x.Size(); j++)
       {
-         dofs(Dof*j+i) = x(j);
+         dofs(dof*j+i) = x(j);
       }
    }
 }
@@ -810,7 +810,7 @@ void PositiveFiniteElement::Project(
    const NodalFiniteElement *nfe =
       dynamic_cast<const NodalFiniteElement *>(&fe);
 
-   if (nfe && Dof == nfe->GetDof())
+   if (nfe && dof == nfe->GetDof())
    {
       nfe->Project(*this, Trans, I);
       I.Invert();
@@ -825,7 +825,7 @@ void PositiveFiniteElement::Project(
       mass_integ.AssembleElementMatrix2(fe, *this, Trans, mixed_mass);
 
       DenseMatrixInverse pos_mass_inv(pos_mass);
-      I.SetSize(Dof, fe.GetDof());
+      I.SetSize(dof, fe.GetDof());
       pos_mass_inv.Mult(mixed_mass, I);
    }
 }
@@ -847,47 +847,47 @@ void VectorFiniteElement::CalcDShape (
 
 void VectorFiniteElement::SetDerivMembers()
 {
-   switch (MapType)
+   switch (map_type)
    {
       case H_DIV:
-         DerivType = DIV;
-         DerivRangeType = SCALAR;
-         DerivMapType = INTEGRAL;
+         deriv_type = DIV;
+         deriv_range_type = SCALAR;
+         deriv_map_type = INTEGRAL;
          break;
       case H_CURL:
-         switch (Dim)
+         switch (dim)
          {
             case 3: // curl: 3D H_CURL -> 3D H_DIV
-               DerivType = CURL;
-               DerivRangeType = VECTOR;
-               DerivMapType = H_DIV;
+               deriv_type = CURL;
+               deriv_range_type = VECTOR;
+               deriv_map_type = H_DIV;
                break;
             case 2:
                // curl: 2D H_CURL -> INTEGRAL
-               DerivType = CURL;
-               DerivRangeType = SCALAR;
-               DerivMapType = INTEGRAL;
+               deriv_type = CURL;
+               deriv_range_type = SCALAR;
+               deriv_map_type = INTEGRAL;
                break;
             case 1:
-               DerivType = NONE;
-               DerivRangeType = SCALAR;
-               DerivMapType = INTEGRAL;
+               deriv_type = NONE;
+               deriv_range_type = SCALAR;
+               deriv_map_type = INTEGRAL;
                break;
             default:
-               MFEM_ABORT("Invalid dimension, Dim = " << Dim);
+               MFEM_ABORT("Invalid dimension, Dim = " << dim);
          }
          break;
       default:
-         MFEM_ABORT("Invalid MapType = " << MapType);
+         MFEM_ABORT("Invalid MapType = " << map_type);
    }
 }
 
 void VectorFiniteElement::CalcVShape_RT (
    ElementTransformation &Trans, DenseMatrix &shape) const
 {
-   MFEM_ASSERT(MapType == H_DIV, "");
+   MFEM_ASSERT(map_type == H_DIV, "");
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
+   DenseMatrix vshape(dof, dim);
 #endif
    CalcVShape(Trans.GetIntPoint(), vshape);
    MultABt(vshape, Trans.Jacobian(), shape);
@@ -897,9 +897,9 @@ void VectorFiniteElement::CalcVShape_RT (
 void VectorFiniteElement::CalcVShape_ND (
    ElementTransformation &Trans, DenseMatrix &shape) const
 {
-   MFEM_ASSERT(MapType == H_CURL, "");
+   MFEM_ASSERT(map_type == H_CURL, "");
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
+   DenseMatrix vshape(dof, dim);
 #endif
    CalcVShape(Trans.GetIntPoint(), vshape);
    Mult(vshape, Trans.InverseJacobian(), shape);
@@ -913,14 +913,14 @@ void VectorFiniteElement::Project_RT(
    const int sdim = Trans.GetSpaceDim();
    MFEM_ASSERT(vc.GetVDim() == sdim, "");
    Vector xk(vk, sdim);
-   const bool square_J = (Dim == sdim);
+   const bool square_J = (dim == sdim);
 
-   for (int k = 0; k < Dof; k++)
+   for (int k = 0; k < dof; k++)
    {
       Trans.SetIntPoint(&Nodes.IntPoint(k));
       vc.Eval(xk, Trans, Nodes.IntPoint(k));
       // dof_k = nk^t adj(J) xk
-      dofs(k) = Trans.AdjugateJacobian().InnerProduct(vk, nk + d2n[k]*Dim);
+      dofs(k) = Trans.AdjugateJacobian().InnerProduct(vk, nk + d2n[k]*dim);
       if (!square_J) { dofs(k) /= Trans.Weight(); }
    }
 }
@@ -933,22 +933,22 @@ void VectorFiniteElement::ProjectMatrixCoefficient_RT(
 
    const int sdim = T.GetSpaceDim();
    MFEM_ASSERT(mc.GetWidth() == sdim, "");
-   const bool square_J = (Dim == sdim);
+   const bool square_J = (dim == sdim);
    DenseMatrix MQ(mc.GetHeight(), mc.GetWidth());
    Vector nk_phys(sdim), dofs_k(MQ.Height());
-   MFEM_ASSERT(dofs.Size() == Dof*MQ.Height(), "");
+   MFEM_ASSERT(dofs.Size() == dof*MQ.Height(), "");
 
-   for (int k = 0; k < Dof; k++)
+   for (int k = 0; k < dof; k++)
    {
       T.SetIntPoint(&Nodes.IntPoint(k));
       mc.Eval(MQ, T, Nodes.IntPoint(k));
       // nk_phys = adj(J)^t nk
-      T.AdjugateJacobian().MultTranspose(nk + d2n[k]*Dim, nk_phys);
+      T.AdjugateJacobian().MultTranspose(nk + d2n[k]*dim, nk_phys);
       if (!square_J) { nk_phys /= T.Weight(); }
       MQ.Mult(nk_phys, dofs_k);
       for (int r = 0; r < MQ.Height(); r++)
       {
-         dofs(k+Dof*r) = dofs_k(r);
+         dofs(k+dof*r) = dofs_k(r);
       }
    }
 }
@@ -963,18 +963,18 @@ void VectorFiniteElement::Project_RT(
       Vector shape(fe.GetDof());
       int sdim = Trans.GetSpaceDim();
 
-      I.SetSize(Dof, sdim*fe.GetDof());
-      for (int k = 0; k < Dof; k++)
+      I.SetSize(dof, sdim*fe.GetDof());
+      for (int k = 0; k < dof; k++)
       {
          const IntegrationPoint &ip = Nodes.IntPoint(k);
 
          fe.CalcShape(ip, shape);
          Trans.SetIntPoint(&ip);
-         Trans.AdjugateJacobian().MultTranspose(nk + d2n[k]*Dim, vk);
+         Trans.AdjugateJacobian().MultTranspose(nk + d2n[k]*dim, vk);
          if (fe.GetMapType() == INTEGRAL)
          {
             double w = 1.0/Trans.Weight();
-            for (int d = 0; d < Dim; d++)
+            for (int d = 0; d < dim; d++)
             {
                vk[d] *= w;
             }
@@ -1004,7 +1004,7 @@ void VectorFiniteElement::ProjectGrad_RT(
    const double *nk, const Array<int> &d2n, const FiniteElement &fe,
    ElementTransformation &Trans, DenseMatrix &grad) const
 {
-   if (Dim != 2)
+   if (dim != 2)
    {
       mfem_error("VectorFiniteElement::ProjectGrad_RT works only in 2D!");
    }
@@ -1013,12 +1013,12 @@ void VectorFiniteElement::ProjectGrad_RT(
    Vector grad_k(fe.GetDof());
    double tk[2];
 
-   grad.SetSize(Dof, fe.GetDof());
-   for (int k = 0; k < Dof; k++)
+   grad.SetSize(dof, fe.GetDof());
+   for (int k = 0; k < dof; k++)
    {
       fe.CalcDShape(Nodes.IntPoint(k), dshape);
-      tk[0] = nk[d2n[k]*Dim+1];
-      tk[1] = -nk[d2n[k]*Dim];
+      tk[0] = nk[d2n[k]*dim+1];
+      tk[1] = -nk[d2n[k]*dim];
       dshape.Mult(tk, grad_k);
       for (int j = 0; j < grad_k.Size(); j++)
       {
@@ -1032,19 +1032,19 @@ void VectorFiniteElement::ProjectCurl_ND(
    ElementTransformation &Trans, DenseMatrix &curl) const
 {
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix curlshape(fe.GetDof(), Dim);
-   DenseMatrix curlshape_J(fe.GetDof(), Dim);
-   DenseMatrix J(Dim, Dim);
+   DenseMatrix curlshape(fe.GetDof(), dim);
+   DenseMatrix curlshape_J(fe.GetDof(), dim);
+   DenseMatrix J(dim, dim);
 #else
-   curlshape.SetSize(fe.GetDof(), Dim);
-   curlshape_J.SetSize(fe.GetDof(), Dim);
-   J.SetSize(Dim, Dim);
+   curlshape.SetSize(fe.GetDof(), dim);
+   curlshape_J.SetSize(fe.GetDof(), dim);
+   J.SetSize(dim, dim);
 #endif
 
    Vector curl_k(fe.GetDof());
 
-   curl.SetSize(Dof, fe.GetDof());
-   for (int k = 0; k < Dof; k++)
+   curl.SetSize(dof, fe.GetDof());
+   for (int k = 0; k < dof; k++)
    {
       const IntegrationPoint &ip = Nodes.IntPoint(k);
 
@@ -1057,7 +1057,7 @@ void VectorFiniteElement::ProjectCurl_ND(
       fe.CalcCurlShape(ip, curlshape);
       Mult(curlshape, J, curlshape_J);
 
-      curlshape_J.Mult(tk + d2t[k]*Dim, curl_k);
+      curlshape_J.Mult(tk + d2t[k]*dim, curl_k);
       for (int j = 0; j < curl_k.Size(); j++)
       {
          curl(k,j) = (fabs(curl_k(j)) < 1e-12) ? 0.0 : curl_k(j);
@@ -1069,14 +1069,14 @@ void VectorFiniteElement::ProjectCurl_RT(
    const double *nk, const Array<int> &d2n, const FiniteElement &fe,
    ElementTransformation &Trans, DenseMatrix &curl) const
 {
-   DenseMatrix curl_shape(fe.GetDof(), Dim);
+   DenseMatrix curl_shape(fe.GetDof(), dim);
    Vector curl_k(fe.GetDof());
 
-   curl.SetSize(Dof, fe.GetDof());
-   for (int k = 0; k < Dof; k++)
+   curl.SetSize(dof, fe.GetDof());
+   for (int k = 0; k < dof; k++)
    {
       fe.CalcCurlShape(Nodes.IntPoint(k), curl_shape);
-      curl_shape.Mult(nk + d2n[k]*Dim, curl_k);
+      curl_shape.Mult(nk + d2n[k]*dim, curl_k);
       for (int j = 0; j < curl_k.Size(); j++)
       {
          curl(k,j) = (fabs(curl_k(j)) < 1e-12) ? 0.0 : curl_k(j);
@@ -1091,13 +1091,13 @@ void VectorFiniteElement::Project_ND(
    double vk[Geometry::MaxDim];
    Vector xk(vk, vc.GetVDim());
 
-   for (int k = 0; k < Dof; k++)
+   for (int k = 0; k < dof; k++)
    {
       Trans.SetIntPoint(&Nodes.IntPoint(k));
 
       vc.Eval(xk, Trans, Nodes.IntPoint(k));
       // dof_k = xk^t J tk
-      dofs(k) = Trans.Jacobian().InnerProduct(tk + d2t[k]*Dim, vk);
+      dofs(k) = Trans.Jacobian().InnerProduct(tk + d2t[k]*dim, vk);
    }
 }
 
@@ -1111,18 +1111,18 @@ void VectorFiniteElement::ProjectMatrixCoefficient_ND(
    MFEM_ASSERT(mc.GetWidth() == sdim, "");
    DenseMatrix MQ(mc.GetHeight(), mc.GetWidth());
    Vector tk_phys(sdim), dofs_k(MQ.Height());
-   MFEM_ASSERT(dofs.Size() == Dof*MQ.Height(), "");
+   MFEM_ASSERT(dofs.Size() == dof*MQ.Height(), "");
 
-   for (int k = 0; k < Dof; k++)
+   for (int k = 0; k < dof; k++)
    {
       T.SetIntPoint(&Nodes.IntPoint(k));
       mc.Eval(MQ, T, Nodes.IntPoint(k));
       // tk_phys = J tk
-      T.Jacobian().Mult(tk + d2t[k]*Dim, tk_phys);
+      T.Jacobian().Mult(tk + d2t[k]*dim, tk_phys);
       MQ.Mult(tk_phys, dofs_k);
       for (int r = 0; r < MQ.Height(); r++)
       {
-         dofs(k+Dof*r) = dofs_k(r);
+         dofs(k+dof*r) = dofs_k(r);
       }
    }
 }
@@ -1137,14 +1137,14 @@ void VectorFiniteElement::Project_ND(
       double vk[Geometry::MaxDim];
       Vector shape(fe.GetDof());
 
-      I.SetSize(Dof, sdim*fe.GetDof());
-      for (int k = 0; k < Dof; k++)
+      I.SetSize(dof, sdim*fe.GetDof());
+      for (int k = 0; k < dof; k++)
       {
          const IntegrationPoint &ip = Nodes.IntPoint(k);
 
          fe.CalcShape(ip, shape);
          Trans.SetIntPoint(&ip);
-         Trans.Jacobian().Mult(tk + d2t[k]*Dim, vk);
+         Trans.Jacobian().Mult(tk + d2t[k]*dim, vk);
          if (fe.GetMapType() == INTEGRAL)
          {
             double w = 1.0/Trans.Weight();
@@ -1183,11 +1183,11 @@ void VectorFiniteElement::ProjectGrad_ND(
    DenseMatrix dshape(fe.GetDof(), fe.GetDim());
    Vector grad_k(fe.GetDof());
 
-   grad.SetSize(Dof, fe.GetDof());
-   for (int k = 0; k < Dof; k++)
+   grad.SetSize(dof, fe.GetDof());
+   for (int k = 0; k < dof; k++)
    {
       fe.CalcDShape(Nodes.IntPoint(k), dshape);
-      dshape.Mult(tk + d2t[k]*Dim, grad_k);
+      dshape.Mult(tk + d2t[k]*dim, grad_k);
       for (int j = 0; j < grad_k.Size(); j++)
       {
          grad(k,j) = (fabs(grad_k(j)) < 1e-12) ? 0.0 : grad_k(j);
@@ -1199,33 +1199,33 @@ void VectorFiniteElement::LocalInterpolation_RT(
    const VectorFiniteElement &cfe, const double *nk, const Array<int> &d2n,
    ElementTransformation &Trans, DenseMatrix &I) const
 {
-   MFEM_ASSERT(MapType == cfe.GetMapType(), "");
+   MFEM_ASSERT(map_type == cfe.GetMapType(), "");
 
    double vk[Geometry::MaxDim];
-   Vector xk(vk, Dim);
+   Vector xk(vk, dim);
    IntegrationPoint ip;
 #ifdef MFEM_THREAD_SAFE
    DenseMatrix vshape(cfe.GetDof(), cfe.GetDim());
 #else
    DenseMatrix vshape(cfe.vshape.Data(), cfe.GetDof(), cfe.GetDim());
 #endif
-   I.SetSize(Dof, vshape.Height());
+   I.SetSize(dof, vshape.Height());
 
    // assuming Trans is linear; this should be ok for all refinement types
-   Trans.SetIntPoint(&Geometries.GetCenter(GeomType));
+   Trans.SetIntPoint(&Geometries.GetCenter(geom_type));
    const DenseMatrix &adjJ = Trans.AdjugateJacobian();
-   for (int k = 0; k < Dof; k++)
+   for (int k = 0; k < dof; k++)
    {
       Trans.Transform(Nodes.IntPoint(k), xk);
       ip.Set3(vk);
       cfe.CalcVShape(ip, vshape);
       // xk = |J| J^{-t} n_k
-      adjJ.MultTranspose(nk + d2n[k]*Dim, vk);
-      // I_k = vshape_k.adj(J)^t.n_k, k=1,...,Dof
+      adjJ.MultTranspose(nk + d2n[k]*dim, vk);
+      // I_k = vshape_k.adj(J)^t.n_k, k=1,...,dof
       for (int j = 0; j < vshape.Height(); j++)
       {
          double Ikj = 0.;
-         for (int i = 0; i < Dim; i++)
+         for (int i = 0; i < dim; i++)
          {
             Ikj += vshape(j, i) * vk[i];
          }
@@ -1239,30 +1239,30 @@ void VectorFiniteElement::LocalInterpolation_ND(
    ElementTransformation &Trans, DenseMatrix &I) const
 {
    double vk[Geometry::MaxDim];
-   Vector xk(vk, Dim);
+   Vector xk(vk, dim);
    IntegrationPoint ip;
 #ifdef MFEM_THREAD_SAFE
    DenseMatrix vshape(cfe.GetDof(), cfe.GetDim());
 #else
    DenseMatrix vshape(cfe.vshape.Data(), cfe.GetDof(), cfe.GetDim());
 #endif
-   I.SetSize(Dof, vshape.Height());
+   I.SetSize(dof, vshape.Height());
 
    // assuming Trans is linear; this should be ok for all refinement types
-   Trans.SetIntPoint(&Geometries.GetCenter(GeomType));
+   Trans.SetIntPoint(&Geometries.GetCenter(geom_type));
    const DenseMatrix &J = Trans.Jacobian();
-   for (int k = 0; k < Dof; k++)
+   for (int k = 0; k < dof; k++)
    {
       Trans.Transform(Nodes.IntPoint(k), xk);
       ip.Set3(vk);
       cfe.CalcVShape(ip, vshape);
       // xk = J t_k
-      J.Mult(tk + d2t[k]*Dim, vk);
+      J.Mult(tk + d2t[k]*dim, vk);
       // I_k = vshape_k.J.t_k, k=1,...,Dof
       for (int j = 0; j < vshape.Height(); j++)
       {
          double Ikj = 0.;
-         for (int i = 0; i < Dim; i++)
+         for (int i = 0; i < dim; i++)
          {
             Ikj += vshape(j, i) * vk[i];
          }
@@ -1277,28 +1277,28 @@ void VectorFiniteElement::LocalRestriction_RT(
 {
    double pt_data[Geometry::MaxDim];
    IntegrationPoint ip;
-   Vector pt(pt_data, Dim);
+   Vector pt(pt_data, dim);
 
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
+   DenseMatrix vshape(dof, dim);
 #endif
 
-   Trans.SetIntPoint(&Geometries.GetCenter(GeomType));
+   Trans.SetIntPoint(&Geometries.GetCenter(geom_type));
    const DenseMatrix &J = Trans.Jacobian();
    const double weight = Trans.Weight();
-   for (int j = 0; j < Dof; j++)
+   for (int j = 0; j < dof; j++)
    {
       InvertLinearTrans(Trans, Nodes.IntPoint(j), pt);
-      ip.Set(pt_data, Dim);
-      if (Geometries.CheckPoint(GeomType, ip)) // do we need an epsilon here?
+      ip.Set(pt_data, dim);
+      if (Geometries.CheckPoint(geom_type, ip)) // do we need an epsilon here?
       {
          CalcVShape(ip, vshape);
-         J.MultTranspose(nk+Dim*d2n[j], pt_data);
+         J.MultTranspose(nk+dim*d2n[j], pt_data);
          pt /= weight;
-         for (int k = 0; k < Dof; k++)
+         for (int k = 0; k < dof; k++)
          {
             double R_jk = 0.0;
-            for (int d = 0; d < Dim; d++)
+            for (int d = 0; d < dim; d++)
             {
                R_jk += vshape(k,d)*pt_data[d];
             }
@@ -1320,26 +1320,26 @@ void VectorFiniteElement::LocalRestriction_ND(
 {
    double pt_data[Geometry::MaxDim];
    IntegrationPoint ip;
-   Vector pt(pt_data, Dim);
+   Vector pt(pt_data, dim);
 
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
+   DenseMatrix vshape(dof, dim);
 #endif
 
-   Trans.SetIntPoint(&Geometries.GetCenter(GeomType));
+   Trans.SetIntPoint(&Geometries.GetCenter(geom_type));
    const DenseMatrix &Jinv = Trans.InverseJacobian();
-   for (int j = 0; j < Dof; j++)
+   for (int j = 0; j < dof; j++)
    {
       InvertLinearTrans(Trans, Nodes.IntPoint(j), pt);
-      ip.Set(pt_data, Dim);
-      if (Geometries.CheckPoint(GeomType, ip)) // do we need an epsilon here?
+      ip.Set(pt_data, dim);
+      if (Geometries.CheckPoint(geom_type, ip)) // do we need an epsilon here?
       {
          CalcVShape(ip, vshape);
-         Jinv.Mult(tk+Dim*d2t[j], pt_data);
-         for (int k = 0; k < Dof; k++)
+         Jinv.Mult(tk+dim*d2t[j], pt_data);
+         for (int k = 0; k < dof; k++)
          {
             double R_jk = 0.0;
-            for (int d = 0; d < Dim; d++)
+            for (int d = 0; d < dim; d++)
             {
                R_jk += vshape(k,d)*pt_data[d];
             }
@@ -2130,7 +2130,7 @@ void H1Ser_QuadrilateralElement::GetLocalInterpolation(ElementTransformation
 {
    // For p<=4, the basis is nodal; for p>4, the quad-interior functions are
    // non-nodal.
-   if (Order <= 4)
+   if (order <= 4)
    {
       NodalLocalInterpolation(Trans, I, *this);
    }
@@ -3201,7 +3201,7 @@ void TriLinear3DFiniteElement::CalcDShape(const IntegrationPoint &ip,
 
 
 P0SegmentFiniteElement::P0SegmentFiniteElement(int Ord)
-   : NodalFiniteElement(1, Geometry::SEGMENT, 1, Ord)   // defaul Ord = 0
+   : NodalFiniteElement(1, Geometry::SEGMENT, 1, Ord)   // default Ord = 0
 {
    Nodes.IntPoint(0).x = 0.5;
 }
@@ -3323,8 +3323,8 @@ void RT0TriangleFiniteElement::GetLocalInterpolation (
 {
    int k, j;
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
-   DenseMatrix Jinv(Dim);
+   DenseMatrix vshape(dof, dim);
+   DenseMatrix Jinv(dim);
 #endif
 
 #ifdef MFEM_DEBUG
@@ -3377,7 +3377,7 @@ void RT0TriangleFiniteElement::Project (
    double vk[2];
    Vector xk (vk, 2);
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix Jinv(Dim);
+   DenseMatrix Jinv(dim);
 #endif
 
    for (int k = 0; k < 3; k++)
@@ -3438,8 +3438,8 @@ void RT0QuadFiniteElement::GetLocalInterpolation (
 {
    int k, j;
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
-   DenseMatrix Jinv(Dim);
+   DenseMatrix vshape(dof, dim);
+   DenseMatrix Jinv(dim);
 #endif
 
 #ifdef MFEM_DEBUG
@@ -3492,7 +3492,7 @@ void RT0QuadFiniteElement::Project (
    double vk[2];
    Vector xk (vk, 2);
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix Jinv(Dim);
+   DenseMatrix Jinv(dim);
 #endif
 
    for (int k = 0; k < 4; k++)
@@ -3580,8 +3580,8 @@ void RT1TriangleFiniteElement::GetLocalInterpolation (
 {
    int k, j;
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
-   DenseMatrix Jinv(Dim);
+   DenseMatrix vshape(dof, dim);
+   DenseMatrix Jinv(dim);
 #endif
 
 #ifdef MFEM_DEBUG
@@ -3633,7 +3633,7 @@ void RT1TriangleFiniteElement::Project (
    double vk[2];
    Vector xk (vk, 2);
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix Jinv(Dim);
+   DenseMatrix Jinv(dim);
 #endif
 
    for (int k = 0; k < 8; k++)
@@ -3762,8 +3762,8 @@ void RT1QuadFiniteElement::GetLocalInterpolation (
 {
    int k, j;
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
-   DenseMatrix Jinv(Dim);
+   DenseMatrix vshape(dof, dim);
+   DenseMatrix Jinv(dim);
 #endif
 
 #ifdef MFEM_DEBUG
@@ -3815,7 +3815,7 @@ void RT1QuadFiniteElement::Project (
    double vk[2];
    Vector xk (vk, 2);
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix Jinv(Dim);
+   DenseMatrix Jinv(dim);
 #endif
 
    for (int k = 0; k < 12; k++)
@@ -4214,8 +4214,8 @@ void RT2QuadFiniteElement::GetLocalInterpolation (
 {
    int k, j;
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
-   DenseMatrix Jinv(Dim);
+   DenseMatrix vshape(dof, dim);
+   DenseMatrix Jinv(dim);
 #endif
 
 #ifdef MFEM_DEBUG
@@ -4267,7 +4267,7 @@ void RT2QuadFiniteElement::Project (
    double vk[2];
    Vector xk (vk, 2);
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix Jinv(Dim);
+   DenseMatrix Jinv(dim);
 #endif
 
    for (int k = 0; k < 24; k++)
@@ -4574,9 +4574,9 @@ LagrangeHexFiniteElement::LagrangeHexFiniteElement (int degree)
 {
    if (degree == 2)
    {
-      I = new int[Dof];
-      J = new int[Dof];
-      K = new int[Dof];
+      I = new int[dof];
+      J = new int[dof];
+      K = new int[dof];
       // nodes
       I[ 0] = 0; J[ 0] = 0; K[ 0] = 0;
       I[ 1] = 1; J[ 1] = 0; K[ 1] = 0;
@@ -4611,9 +4611,9 @@ LagrangeHexFiniteElement::LagrangeHexFiniteElement (int degree)
    }
    else if (degree == 3)
    {
-      I = new int[Dof];
-      J = new int[Dof];
-      K = new int[Dof];
+      I = new int[dof];
+      J = new int[dof];
+      K = new int[dof];
       // nodes
       I[ 0] = 0; J[ 0] = 0; K[ 0] = 0;
       I[ 1] = 1; J[ 1] = 0; K[ 1] = 0;
@@ -4701,7 +4701,7 @@ LagrangeHexFiniteElement::LagrangeHexFiniteElement (int degree)
    dshape1dz.SetSize(dof1d,1);
 #endif
 
-   for (int n = 0; n < Dof; n++)
+   for (int n = 0; n < dof; n++)
    {
       Nodes.IntPoint(n).x = fe1d -> GetNodes().IntPoint(I[n]).x;
       Nodes.IntPoint(n).y = fe1d -> GetNodes().IntPoint(J[n]).x;
@@ -4724,7 +4724,7 @@ void LagrangeHexFiniteElement::CalcShape(const IntegrationPoint &ip,
    fe1d -> CalcShape(ipy, shape1dy);
    fe1d -> CalcShape(ipz, shape1dz);
 
-   for (int n = 0; n < Dof; n++)
+   for (int n = 0; n < dof; n++)
    {
       shape(n) = shape1dx(I[n]) *  shape1dy(J[n]) * shape1dz(K[n]);
    }
@@ -4750,7 +4750,7 @@ void LagrangeHexFiniteElement::CalcDShape(const IntegrationPoint &ip,
    fe1d -> CalcDShape(ipy, dshape1dy);
    fe1d -> CalcDShape(ipz, dshape1dz);
 
-   for (int n = 0; n < Dof; n++)
+   for (int n = 0; n < dof; n++)
    {
       dshape(n,0) = dshape1dx(I[n],0) * shape1dy(J[n])    * shape1dz(K[n]);
       dshape(n,1) = shape1dx(I[n])    * dshape1dy(J[n],0) * shape1dz(K[n]);
@@ -5849,7 +5849,7 @@ void Nedelec1HexFiniteElement::GetLocalInterpolation (
 {
    int k, j;
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
+   DenseMatrix vshape(dof, dim);
 #endif
 
 #ifdef MFEM_DEBUG
@@ -6015,7 +6015,7 @@ void Nedelec1TetFiniteElement::GetLocalInterpolation (
 {
    int k, j;
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
+   DenseMatrix vshape(dof, dim);
 #endif
 
 #ifdef MFEM_DEBUG
@@ -6163,8 +6163,8 @@ void RT0HexFiniteElement::GetLocalInterpolation (
 {
    int k, j;
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
-   DenseMatrix Jinv(Dim);
+   DenseMatrix vshape(dof, dim);
+   DenseMatrix Jinv(dim);
 #endif
 
 #ifdef MFEM_DEBUG
@@ -6220,7 +6220,7 @@ void RT0HexFiniteElement::Project (
    double vk[3];
    Vector xk (vk, 3);
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix Jinv(Dim);
+   DenseMatrix Jinv(dim);
 #endif
 
    for (int k = 0; k < 6; k++)
@@ -6552,8 +6552,8 @@ void RT1HexFiniteElement::GetLocalInterpolation (
 {
    int k, j;
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
-   DenseMatrix Jinv(Dim);
+   DenseMatrix vshape(dof, dim);
+   DenseMatrix Jinv(dim);
 #endif
 
 #ifdef MFEM_DEBUG
@@ -6609,7 +6609,7 @@ void RT1HexFiniteElement::Project (
    double vk[3];
    Vector xk (vk, 3);
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix Jinv(Dim);
+   DenseMatrix Jinv(dim);
 #endif
 
    for (int k = 0; k < 36; k++)
@@ -6687,8 +6687,8 @@ void RT0TetFiniteElement::GetLocalInterpolation (
 {
    int k, j;
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix vshape(Dof, Dim);
-   DenseMatrix Jinv(Dim);
+   DenseMatrix vshape(dof, dim);
+   DenseMatrix Jinv(dim);
 #endif
 
 #ifdef MFEM_DEBUG
@@ -6744,7 +6744,7 @@ void RT0TetFiniteElement::Project (
    double vk[3];
    Vector xk (vk, 3);
 #ifdef MFEM_THREAD_SAFE
-   DenseMatrix Jinv(Dim);
+   DenseMatrix Jinv(dim);
 #endif
 
    for (int k = 0; k < 4; k++)
@@ -7602,7 +7602,7 @@ H1_SegmentElement::H1_SegmentElement(const int p, const int btype)
 void H1_SegmentElement::CalcShape(const IntegrationPoint &ip,
                                   Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1);
@@ -7621,7 +7621,7 @@ void H1_SegmentElement::CalcShape(const IntegrationPoint &ip,
 void H1_SegmentElement::CalcDShape(const IntegrationPoint &ip,
                                    DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), dshape_x(p+1);
@@ -7639,7 +7639,7 @@ void H1_SegmentElement::CalcDShape(const IntegrationPoint &ip,
 
 void H1_SegmentElement::ProjectDelta(int vertex, Vector &dofs) const
 {
-   const int p = Order;
+   const int p = order;
    const double *cp = poly1d.ClosedPoints(p, b_type);
 
    switch (vertex)
@@ -7692,7 +7692,7 @@ H1_QuadrilateralElement::H1_QuadrilateralElement(const int p, const int btype)
 void H1_QuadrilateralElement::CalcShape(const IntegrationPoint &ip,
                                         Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), shape_y(p+1);
@@ -7711,7 +7711,7 @@ void H1_QuadrilateralElement::CalcShape(const IntegrationPoint &ip,
 void H1_QuadrilateralElement::CalcDShape(const IntegrationPoint &ip,
                                          DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), shape_y(p+1), dshape_x(p+1), dshape_y(p+1);
@@ -7732,7 +7732,7 @@ void H1_QuadrilateralElement::CalcDShape(const IntegrationPoint &ip,
 
 void H1_QuadrilateralElement::ProjectDelta(int vertex, Vector &dofs) const
 {
-   const int p = Order;
+   const int p = order;
    const double *cp = poly1d.ClosedPoints(p, b_type);
 
 #ifdef MFEM_THREAD_SAFE
@@ -7807,7 +7807,7 @@ H1_HexahedronElement::H1_HexahedronElement(const int p, const int btype)
 void H1_HexahedronElement::CalcShape(const IntegrationPoint &ip,
                                      Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), shape_y(p+1), shape_z(p+1);
@@ -7828,7 +7828,7 @@ void H1_HexahedronElement::CalcShape(const IntegrationPoint &ip,
 void H1_HexahedronElement::CalcDShape(const IntegrationPoint &ip,
                                       DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1),  shape_y(p+1),  shape_z(p+1);
@@ -7851,7 +7851,7 @@ void H1_HexahedronElement::CalcDShape(const IntegrationPoint &ip,
 
 void H1_HexahedronElement::ProjectDelta(int vertex, Vector &dofs) const
 {
-   const int p = Order;
+   const int p = order;
    const double *cp = poly1d.ClosedPoints(p,b_type);
 
 #ifdef MFEM_THREAD_SAFE
@@ -7955,7 +7955,7 @@ H1Pos_SegmentElement::H1Pos_SegmentElement(const int p)
 void H1Pos_SegmentElement::CalcShape(const IntegrationPoint &ip,
                                      Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1);
@@ -7975,7 +7975,7 @@ void H1Pos_SegmentElement::CalcShape(const IntegrationPoint &ip,
 void H1Pos_SegmentElement::CalcDShape(const IntegrationPoint &ip,
                                       DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), dshape_x(p+1);
@@ -8022,7 +8022,7 @@ H1Pos_QuadrilateralElement::H1Pos_QuadrilateralElement(const int p)
 void H1Pos_QuadrilateralElement::CalcShape(const IntegrationPoint &ip,
                                            Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), shape_y(p+1);
@@ -8042,7 +8042,7 @@ void H1Pos_QuadrilateralElement::CalcShape(const IntegrationPoint &ip,
 void H1Pos_QuadrilateralElement::CalcDShape(const IntegrationPoint &ip,
                                             DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), shape_y(p+1), dshape_x(p+1), dshape_y(p+1);
@@ -8092,7 +8092,7 @@ H1Pos_HexahedronElement::H1Pos_HexahedronElement(const int p)
 void H1Pos_HexahedronElement::CalcShape(const IntegrationPoint &ip,
                                         Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), shape_y(p+1), shape_z(p+1);
@@ -8113,7 +8113,7 @@ void H1Pos_HexahedronElement::CalcShape(const IntegrationPoint &ip,
 void H1Pos_HexahedronElement::CalcDShape(const IntegrationPoint &ip,
                                          DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1),  shape_y(p+1),  shape_z(p+1);
@@ -8157,9 +8157,9 @@ H1_TriangleElement::H1_TriangleElement(const int p, const int btype)
    ddshape_x.SetSize(p + 1);
    ddshape_y.SetSize(p + 1);
    ddshape_l.SetSize(p + 1);
-   u.SetSize(Dof);
-   du.SetSize(Dof, Dim);
-   ddu.SetSize(Dof, (Dim * (Dim + 1)) / 2 );
+   u.SetSize(dof);
+   du.SetSize(dof, dim);
+   ddu.SetSize(dof, (dim * (dim + 1)) / 2 );
 #else
    Vector shape_x(p + 1), shape_y(p + 1), shape_l(p + 1);
 #endif
@@ -8192,8 +8192,8 @@ H1_TriangleElement::H1_TriangleElement(const int p, const int btype)
          Nodes.IntPoint(o++).Set2(cp[i]/w, cp[j]/w);
       }
 
-   DenseMatrix T(Dof);
-   for (int k = 0; k < Dof; k++)
+   DenseMatrix T(dof);
+   for (int k = 0; k < dof; k++)
    {
       IntegrationPoint &ip = Nodes.IntPoint(k);
       poly1d.CalcBasis(p, ip.x, shape_x);
@@ -8215,10 +8215,10 @@ H1_TriangleElement::H1_TriangleElement(const int p, const int btype)
 void H1_TriangleElement::CalcShape(const IntegrationPoint &ip,
                                    Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
-   Vector shape_x(p + 1), shape_y(p + 1), shape_l(p + 1), u(Dof);
+   Vector shape_x(p + 1), shape_y(p + 1), shape_l(p + 1), u(dof);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x);
@@ -8237,12 +8237,12 @@ void H1_TriangleElement::CalcShape(const IntegrationPoint &ip,
 void H1_TriangleElement::CalcDShape(const IntegrationPoint &ip,
                                     DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector  shape_x(p + 1),  shape_y(p + 1),  shape_l(p + 1);
    Vector dshape_x(p + 1), dshape_y(p + 1), dshape_l(p + 1);
-   DenseMatrix du(Dof, Dim);
+   DenseMatrix du(dof, dim);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x, dshape_x);
@@ -8266,12 +8266,12 @@ void H1_TriangleElement::CalcDShape(const IntegrationPoint &ip,
 void H1_TriangleElement::CalcHessian(const IntegrationPoint &ip,
                                      DenseMatrix &ddshape) const
 {
-   const int p = Order;
+   const int p = order;
 #ifdef MFEM_THREAD_SAFE
    Vector   shape_x(p + 1),   shape_y(p + 1),   shape_l(p + 1);
    Vector  dshape_x(p + 1),  dshape_y(p + 1),  dshape_l(p + 1);
    Vector ddshape_x(p + 1), ddshape_y(p + 1), ddshape_l(p + 1);
-   DenseMatrix ddu(Dof, Dim);
+   DenseMatrix ddu(dof, (dim * (dim + 1)) / 2); // as in the constructor
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x, dshape_x, ddshape_x);
@@ -8315,9 +8315,9 @@ H1_TetrahedronElement::H1_TetrahedronElement(const int p, const int btype)
    ddshape_y.SetSize(p + 1);
    ddshape_z.SetSize(p + 1);
    ddshape_l.SetSize(p + 1);
-   u.SetSize(Dof);
-   du.SetSize(Dof, Dim);
-   ddu.SetSize(Dof, (Dim * (Dim + 1)) / 2);
+   u.SetSize(dof);
+   du.SetSize(dof, dim);
+   ddu.SetSize(dof, (dim * (dim + 1)) / 2);
 #else
    Vector shape_x(p + 1), shape_y(p + 1), shape_z(p + 1), shape_l(p + 1);
 #endif
@@ -8390,8 +8390,8 @@ H1_TetrahedronElement::H1_TetrahedronElement(const int p, const int btype)
             Nodes.IntPoint(o++).Set3(cp[i]/w, cp[j]/w, cp[k]/w);
          }
 
-   DenseMatrix T(Dof);
-   for (int m = 0; m < Dof; m++)
+   DenseMatrix T(dof);
+   for (int m = 0; m < dof; m++)
    {
       IntegrationPoint &ip = Nodes.IntPoint(m);
       poly1d.CalcBasis(p, ip.x, shape_x);
@@ -8415,11 +8415,11 @@ H1_TetrahedronElement::H1_TetrahedronElement(const int p, const int btype)
 void H1_TetrahedronElement::CalcShape(const IntegrationPoint &ip,
                                       Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p + 1), shape_y(p + 1), shape_z(p + 1), shape_l(p + 1);
-   Vector u(Dof);
+   Vector u(dof);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x);
@@ -8440,12 +8440,12 @@ void H1_TetrahedronElement::CalcShape(const IntegrationPoint &ip,
 void H1_TetrahedronElement::CalcDShape(const IntegrationPoint &ip,
                                        DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector  shape_x(p + 1),  shape_y(p + 1),  shape_z(p + 1),  shape_l(p + 1);
    Vector dshape_x(p + 1), dshape_y(p + 1), dshape_z(p + 1), dshape_l(p + 1);
-   DenseMatrix du(Dof, Dim);
+   DenseMatrix du(dof, dim);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x, dshape_x);
@@ -8473,13 +8473,13 @@ void H1_TetrahedronElement::CalcDShape(const IntegrationPoint &ip,
 void H1_TetrahedronElement::CalcHessian(const IntegrationPoint &ip,
                                         DenseMatrix &ddshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector   shape_x(p + 1),   shape_y(p + 1),   shape_z(p + 1),   shape_l(p + 1);
    Vector  dshape_x(p + 1),  dshape_y(p + 1),  dshape_z(p + 1),  dshape_l(p + 1);
    Vector ddshape_x(p + 1), ddshape_y(p + 1), ddshape_z(p + 1), ddshape_l(p + 1);
-   DenseMatrix ddu(Dof, ((Dim + 1) * Dim) / 2);
+   DenseMatrix ddu(dof, ((dim + 1) * dim) / 2);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x, dshape_x, ddshape_x);
@@ -8521,11 +8521,11 @@ H1Pos_TriangleElement::H1Pos_TriangleElement(const int p)
                            FunctionSpace::Pk)
 {
 #ifndef MFEM_THREAD_SAFE
-   m_shape.SetSize(Dof);
+   m_shape.SetSize(dof);
    dshape_1d.SetSize(p + 1);
-   m_dshape.SetSize(Dof, Dim);
+   m_dshape.SetSize(dof, dim);
 #endif
-   dof_map.SetSize(Dof);
+   dof_map.SetSize(dof);
 
    struct Index
    {
@@ -8634,10 +8634,10 @@ void H1Pos_TriangleElement::CalcShape(const IntegrationPoint &ip,
                                       Vector &shape) const
 {
 #ifdef MFEM_THREAD_SAFE
-   Vector m_shape(Dof);
+   Vector m_shape(dof);
 #endif
-   CalcShape(Order, ip.x, ip.y, m_shape.GetData());
-   for (int i = 0; i < Dof; i++)
+   CalcShape(order, ip.x, ip.y, m_shape.GetData());
+   for (int i = 0; i < dof; i++)
    {
       shape(dof_map[i]) = m_shape(i);
    }
@@ -8647,13 +8647,13 @@ void H1Pos_TriangleElement::CalcDShape(const IntegrationPoint &ip,
                                        DenseMatrix &dshape) const
 {
 #ifdef MFEM_THREAD_SAFE
-   Vector dshape_1d(Order + 1);
-   DenseMatrix m_dshape(Dof, Dim);
+   Vector dshape_1d(order + 1);
+   DenseMatrix m_dshape(dof, dim);
 #endif
-   CalcDShape(Order, ip.x, ip.y, dshape_1d.GetData(), m_dshape.Data());
+   CalcDShape(order, ip.x, ip.y, dshape_1d.GetData(), m_dshape.Data());
    for (int d = 0; d < 2; d++)
    {
-      for (int i = 0; i < Dof; i++)
+      for (int i = 0; i < dof; i++)
       {
          dshape(dof_map[i],d) = m_dshape(i,d);
       }
@@ -8666,11 +8666,11 @@ H1Pos_TetrahedronElement::H1Pos_TetrahedronElement(const int p)
                            ((p + 1)*(p + 2)*(p + 3))/6, p, FunctionSpace::Pk)
 {
 #ifndef MFEM_THREAD_SAFE
-   m_shape.SetSize(Dof);
+   m_shape.SetSize(dof);
    dshape_1d.SetSize(p + 1);
-   m_dshape.SetSize(Dof, Dim);
+   m_dshape.SetSize(dof, dim);
 #endif
-   dof_map.SetSize(Dof);
+   dof_map.SetSize(dof);
 
    struct Index
    {
@@ -8886,10 +8886,10 @@ void H1Pos_TetrahedronElement::CalcShape(const IntegrationPoint &ip,
                                          Vector &shape) const
 {
 #ifdef MFEM_THREAD_SAFE
-   Vector m_shape(Dof);
+   Vector m_shape(dof);
 #endif
-   CalcShape(Order, ip.x, ip.y, ip.z, m_shape.GetData());
-   for (int i = 0; i < Dof; i++)
+   CalcShape(order, ip.x, ip.y, ip.z, m_shape.GetData());
+   for (int i = 0; i < dof; i++)
    {
       shape(dof_map[i]) = m_shape(i);
    }
@@ -8899,13 +8899,13 @@ void H1Pos_TetrahedronElement::CalcDShape(const IntegrationPoint &ip,
                                           DenseMatrix &dshape) const
 {
 #ifdef MFEM_THREAD_SAFE
-   Vector dshape_1d(Order + 1);
-   DenseMatrix m_dshape(Dof, Dim);
+   Vector dshape_1d(order + 1);
+   DenseMatrix m_dshape(dof, dim);
 #endif
-   CalcDShape(Order, ip.x, ip.y, ip.z, dshape_1d.GetData(), m_dshape.Data());
+   CalcDShape(order, ip.x, ip.y, ip.z, dshape_1d.GetData(), m_dshape.Data());
    for (int d = 0; d < 3; d++)
    {
-      for (int i = 0; i < Dof; i++)
+      for (int i = 0; i < dof; i++)
       {
          dshape(dof_map[i],d) = m_dshape(i,d);
       }
@@ -8927,8 +8927,8 @@ H1_WedgeElement::H1_WedgeElement(const int p,
    s_dshape.SetSize(SegmentFE.GetDof(), 1);
 #endif
 
-   t_dof.SetSize(Dof);
-   s_dof.SetSize(Dof);
+   t_dof.SetSize(dof);
+   s_dof.SetSize(dof);
 
    // Nodal DoFs
    t_dof[0] = 0; s_dof[0] = 0;
@@ -9005,7 +9005,7 @@ H1_WedgeElement::H1_WedgeElement(const int p,
    // Define Nodes
    const IntegrationRule & t_Nodes = TriangleFE.GetNodes();
    const IntegrationRule & s_Nodes = SegmentFE.GetNodes();
-   for (int i=0; i<Dof; i++)
+   for (int i=0; i<dof; i++)
    {
       Nodes.IntPoint(i).x = t_Nodes.IntPoint(t_dof[i]).x;
       Nodes.IntPoint(i).y = t_Nodes.IntPoint(t_dof[i]).y;
@@ -9026,7 +9026,7 @@ void H1_WedgeElement::CalcShape(const IntegrationPoint &ip,
    TriangleFE.CalcShape(ip, t_shape);
    SegmentFE.CalcShape(ipz, s_shape);
 
-   for (int i=0; i<Dof; i++)
+   for (int i=0; i<dof; i++)
    {
       shape[i] = t_shape[t_dof[i]] * s_shape[s_dof[i]];
    }
@@ -9049,7 +9049,7 @@ void H1_WedgeElement::CalcDShape(const IntegrationPoint &ip,
    SegmentFE.CalcShape(ipz, s_shape);
    SegmentFE.CalcDShape(ipz, s_dshape);
 
-   for (int i=0; i<Dof; i++)
+   for (int i=0; i<dof; i++)
    {
       dshape(i, 0) = t_dshape(t_dof[i],0) * s_shape[s_dof[i]];
       dshape(i, 1) = t_dshape(t_dof[i],1) * s_shape[s_dof[i]];
@@ -9071,8 +9071,8 @@ H1Pos_WedgeElement::H1Pos_WedgeElement(const int p)
    s_dshape.SetSize(SegmentFE.GetDof(), 1);
 #endif
 
-   t_dof.SetSize(Dof);
-   s_dof.SetSize(Dof);
+   t_dof.SetSize(dof);
+   s_dof.SetSize(dof);
 
    // Nodal DoFs
    t_dof[0] = 0; s_dof[0] = 0;
@@ -9148,7 +9148,7 @@ H1Pos_WedgeElement::H1Pos_WedgeElement(const int p)
    // Define Nodes
    const IntegrationRule & t_Nodes = TriangleFE.GetNodes();
    const IntegrationRule & s_Nodes = SegmentFE.GetNodes();
-   for (int i=0; i<Dof; i++)
+   for (int i=0; i<dof; i++)
    {
       Nodes.IntPoint(i).x = t_Nodes.IntPoint(t_dof[i]).x;
       Nodes.IntPoint(i).y = t_Nodes.IntPoint(t_dof[i]).y;
@@ -9169,7 +9169,7 @@ void H1Pos_WedgeElement::CalcShape(const IntegrationPoint &ip,
    TriangleFE.CalcShape(ip, t_shape);
    SegmentFE.CalcShape(ipz, s_shape);
 
-   for (int i=0; i<Dof; i++)
+   for (int i=0; i<dof; i++)
    {
       shape[i] = t_shape[t_dof[i]] * s_shape[s_dof[i]];
    }
@@ -9192,7 +9192,7 @@ void H1Pos_WedgeElement::CalcDShape(const IntegrationPoint &ip,
    SegmentFE.CalcShape(ipz, s_shape);
    SegmentFE.CalcDShape(ipz, s_dshape);
 
-   for (int i=0; i<Dof; i++)
+   for (int i=0; i<dof; i++)
    {
       dshape(i, 0) = t_dshape(t_dof[i],0) * s_shape[s_dof[i]];
       dshape(i, 1) = t_dshape(t_dof[i],1) * s_shape[s_dof[i]];
@@ -9227,7 +9227,7 @@ void L2_SegmentElement::CalcDShape(const IntegrationPoint &ip,
                                    DenseMatrix &dshape) const
 {
 #ifdef MFEM_THREAD_SAFE
-   Vector shape_x(Dof), dshape_x(dshape.Data(), Dof);
+   Vector shape_x(dof), dshape_x(dshape.Data(), dof);
 #else
    dshape_x.SetData(dshape.Data());
 #endif
@@ -9236,7 +9236,7 @@ void L2_SegmentElement::CalcDShape(const IntegrationPoint &ip,
 
 void L2_SegmentElement::ProjectDelta(int vertex, Vector &dofs) const
 {
-   const int p = Order;
+   const int p = order;
    const double *op = poly1d.OpenPoints(p, b_type);
 
    switch (vertex)
@@ -9282,24 +9282,24 @@ L2Pos_SegmentElement::L2Pos_SegmentElement(const int p)
 void L2Pos_SegmentElement::CalcShape(const IntegrationPoint &ip,
                                      Vector &shape) const
 {
-   Poly_1D::CalcBernstein(Order, ip.x, shape);
+   Poly_1D::CalcBernstein(order, ip.x, shape);
 }
 
 void L2Pos_SegmentElement::CalcDShape(const IntegrationPoint &ip,
                                       DenseMatrix &dshape) const
 {
 #ifdef MFEM_THREAD_SAFE
-   Vector shape_x(Dof), dshape_x(dshape.Data(), Dof);
+   Vector shape_x(dof), dshape_x(dshape.Data(), dof);
 #else
    dshape_x.SetData(dshape.Data());
 #endif
-   Poly_1D::CalcBernstein(Order, ip.x, shape_x, dshape_x);
+   Poly_1D::CalcBernstein(order, ip.x, shape_x, dshape_x);
 }
 
 void L2Pos_SegmentElement::ProjectDelta(int vertex, Vector &dofs) const
 {
    dofs = 0.0;
-   dofs[vertex*Order] = 1.0;
+   dofs[vertex*order] = 1.0;
 }
 
 
@@ -9325,7 +9325,7 @@ L2_QuadrilateralElement::L2_QuadrilateralElement(const int p, const int btype)
 void L2_QuadrilateralElement::CalcShape(const IntegrationPoint &ip,
                                         Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), shape_y(p+1);
@@ -9344,7 +9344,7 @@ void L2_QuadrilateralElement::CalcShape(const IntegrationPoint &ip,
 void L2_QuadrilateralElement::CalcDShape(const IntegrationPoint &ip,
                                          DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), shape_y(p+1), dshape_x(p+1), dshape_y(p+1);
@@ -9363,7 +9363,7 @@ void L2_QuadrilateralElement::CalcDShape(const IntegrationPoint &ip,
 
 void L2_QuadrilateralElement::ProjectDelta(int vertex, Vector &dofs) const
 {
-   const int p = Order;
+   const int p = order;
    const double *op = poly1d.OpenPoints(p, b_type);
 
 #ifdef MFEM_THREAD_SAFE
@@ -9437,7 +9437,7 @@ L2Pos_QuadrilateralElement::L2Pos_QuadrilateralElement(const int p)
 void L2Pos_QuadrilateralElement::CalcShape(const IntegrationPoint &ip,
                                            Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), shape_y(p+1);
@@ -9456,7 +9456,7 @@ void L2Pos_QuadrilateralElement::CalcShape(const IntegrationPoint &ip,
 void L2Pos_QuadrilateralElement::CalcDShape(const IntegrationPoint &ip,
                                             DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), shape_y(p+1), dshape_x(p+1), dshape_y(p+1);
@@ -9475,7 +9475,7 @@ void L2Pos_QuadrilateralElement::CalcDShape(const IntegrationPoint &ip,
 
 void L2Pos_QuadrilateralElement::ProjectDelta(int vertex, Vector &dofs) const
 {
-   const int p = Order;
+   const int p = order;
 
    dofs = 0.0;
    switch (vertex)
@@ -9513,7 +9513,7 @@ L2_HexahedronElement::L2_HexahedronElement(const int p, const int btype)
 void L2_HexahedronElement::CalcShape(const IntegrationPoint &ip,
                                      Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), shape_y(p+1), shape_z(p+1);
@@ -9534,7 +9534,7 @@ void L2_HexahedronElement::CalcShape(const IntegrationPoint &ip,
 void L2_HexahedronElement::CalcDShape(const IntegrationPoint &ip,
                                       DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1),  shape_y(p+1),  shape_z(p+1);
@@ -9557,7 +9557,7 @@ void L2_HexahedronElement::CalcDShape(const IntegrationPoint &ip,
 
 void L2_HexahedronElement::ProjectDelta(int vertex, Vector &dofs) const
 {
-   const int p = Order;
+   const int p = order;
    const double *op = poly1d.OpenPoints(p, b_type);
 
 #ifdef MFEM_THREAD_SAFE
@@ -9670,7 +9670,7 @@ L2Pos_HexahedronElement::L2Pos_HexahedronElement(const int p)
 void L2Pos_HexahedronElement::CalcShape(const IntegrationPoint &ip,
                                         Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1), shape_y(p+1), shape_z(p+1);
@@ -9691,7 +9691,7 @@ void L2Pos_HexahedronElement::CalcShape(const IntegrationPoint &ip,
 void L2Pos_HexahedronElement::CalcDShape(const IntegrationPoint &ip,
                                          DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p+1),  shape_y(p+1),  shape_z(p+1);
@@ -9714,7 +9714,7 @@ void L2Pos_HexahedronElement::CalcDShape(const IntegrationPoint &ip,
 
 void L2Pos_HexahedronElement::ProjectDelta(int vertex, Vector &dofs) const
 {
-   const int p = Order;
+   const int p = order;
 
    dofs = 0.0;
    switch (vertex)
@@ -9725,8 +9725,8 @@ void L2Pos_HexahedronElement::ProjectDelta(int vertex, Vector &dofs) const
       case 3: dofs[p*(p + 1)] = 1.0; break;
       case 4: dofs[p*(p + 1)*(p + 1)] = 1.0; break;
       case 5: dofs[p + p*(p + 1)*(p + 1)] = 1.0; break;
-      case 6: dofs[Dof - 1] = 1.0; break;
-      case 7: dofs[Dof - p - 1] = 1.0; break;
+      case 6: dofs[dof - 1] = 1.0; break;
+      case 7: dofs[dof - p - 1] = 1.0; break;
    }
 }
 
@@ -9744,8 +9744,8 @@ L2_TriangleElement::L2_TriangleElement(const int p, const int btype)
    dshape_x.SetSize(p + 1);
    dshape_y.SetSize(p + 1);
    dshape_l.SetSize(p + 1);
-   u.SetSize(Dof);
-   du.SetSize(Dof, Dim);
+   u.SetSize(dof);
+   du.SetSize(dof, dim);
 #else
    Vector shape_x(p + 1), shape_y(p + 1), shape_l(p + 1);
 #endif
@@ -9757,8 +9757,8 @@ L2_TriangleElement::L2_TriangleElement(const int p, const int btype)
          Nodes.IntPoint(o++).Set2(op[i]/w, op[j]/w);
       }
 
-   DenseMatrix T(Dof);
-   for (int k = 0; k < Dof; k++)
+   DenseMatrix T(dof);
+   for (int k = 0; k < dof; k++)
    {
       IntegrationPoint &ip = Nodes.IntPoint(k);
       poly1d.CalcBasis(p, ip.x, shape_x);
@@ -9779,10 +9779,10 @@ L2_TriangleElement::L2_TriangleElement(const int p, const int btype)
 void L2_TriangleElement::CalcShape(const IntegrationPoint &ip,
                                    Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
-   Vector shape_x(p + 1), shape_y(p + 1), shape_l(p + 1), u(Dof);
+   Vector shape_x(p + 1), shape_y(p + 1), shape_l(p + 1), u(dof);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x);
@@ -9801,12 +9801,12 @@ void L2_TriangleElement::CalcShape(const IntegrationPoint &ip,
 void L2_TriangleElement::CalcDShape(const IntegrationPoint &ip,
                                     DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector  shape_x(p + 1),  shape_y(p + 1),  shape_l(p + 1);
    Vector dshape_x(p + 1), dshape_y(p + 1), dshape_l(p + 1);
-   DenseMatrix du(Dof, Dim);
+   DenseMatrix du(dof, dim);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x, dshape_x);
@@ -9832,24 +9832,24 @@ void L2_TriangleElement::ProjectDelta(int vertex, Vector &dofs) const
    switch (vertex)
    {
       case 0:
-         for (int i = 0; i < Dof; i++)
+         for (int i = 0; i < dof; i++)
          {
             const IntegrationPoint &ip = Nodes.IntPoint(i);
-            dofs[i] = pow(1.0 - ip.x - ip.y, Order);
+            dofs[i] = pow(1.0 - ip.x - ip.y, order);
          }
          break;
       case 1:
-         for (int i = 0; i < Dof; i++)
+         for (int i = 0; i < dof; i++)
          {
             const IntegrationPoint &ip = Nodes.IntPoint(i);
-            dofs[i] = pow(ip.x, Order);
+            dofs[i] = pow(ip.x, order);
          }
          break;
       case 2:
-         for (int i = 0; i < Dof; i++)
+         for (int i = 0; i < dof; i++)
          {
             const IntegrationPoint &ip = Nodes.IntPoint(i);
-            dofs[i] = pow(ip.y, Order);
+            dofs[i] = pow(ip.y, order);
          }
          break;
    }
@@ -9881,17 +9881,17 @@ L2Pos_TriangleElement::L2Pos_TriangleElement(const int p)
 void L2Pos_TriangleElement::CalcShape(const IntegrationPoint &ip,
                                       Vector &shape) const
 {
-   H1Pos_TriangleElement::CalcShape(Order, ip.x, ip.y, shape.GetData());
+   H1Pos_TriangleElement::CalcShape(order, ip.x, ip.y, shape.GetData());
 }
 
 void L2Pos_TriangleElement::CalcDShape(const IntegrationPoint &ip,
                                        DenseMatrix &dshape) const
 {
 #ifdef MFEM_THREAD_SAFE
-   Vector dshape_1d(Order + 1);
+   Vector dshape_1d(order + 1);
 #endif
 
-   H1Pos_TriangleElement::CalcDShape(Order, ip.x, ip.y, dshape_1d.GetData(),
+   H1Pos_TriangleElement::CalcDShape(order, ip.x, ip.y, dshape_1d.GetData(),
                                      dshape.Data());
 }
 
@@ -9901,8 +9901,8 @@ void L2Pos_TriangleElement::ProjectDelta(int vertex, Vector &dofs) const
    switch (vertex)
    {
       case 0: dofs[0] = 1.0; break;
-      case 1: dofs[Order] = 1.0; break;
-      case 2: dofs[Dof-1] = 1.0; break;
+      case 1: dofs[order] = 1.0; break;
+      case 2: dofs[dof-1] = 1.0; break;
    }
 }
 
@@ -9922,8 +9922,8 @@ L2_TetrahedronElement::L2_TetrahedronElement(const int p, const int btype)
    dshape_y.SetSize(p + 1);
    dshape_z.SetSize(p + 1);
    dshape_l.SetSize(p + 1);
-   u.SetSize(Dof);
-   du.SetSize(Dof, Dim);
+   u.SetSize(dof);
+   du.SetSize(dof, dim);
 #else
    Vector shape_x(p + 1), shape_y(p + 1), shape_z(p + 1), shape_l(p + 1);
 #endif
@@ -9936,8 +9936,8 @@ L2_TetrahedronElement::L2_TetrahedronElement(const int p, const int btype)
             Nodes.IntPoint(o++).Set3(op[i]/w, op[j]/w, op[k]/w);
          }
 
-   DenseMatrix T(Dof);
-   for (int m = 0; m < Dof; m++)
+   DenseMatrix T(dof);
+   for (int m = 0; m < dof; m++)
    {
       IntegrationPoint &ip = Nodes.IntPoint(m);
       poly1d.CalcBasis(p, ip.x, shape_x);
@@ -9960,11 +9960,11 @@ L2_TetrahedronElement::L2_TetrahedronElement(const int p, const int btype)
 void L2_TetrahedronElement::CalcShape(const IntegrationPoint &ip,
                                       Vector &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p + 1), shape_y(p + 1), shape_z(p + 1), shape_l(p + 1);
-   Vector u(Dof);
+   Vector u(dof);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x);
@@ -9985,12 +9985,12 @@ void L2_TetrahedronElement::CalcShape(const IntegrationPoint &ip,
 void L2_TetrahedronElement::CalcDShape(const IntegrationPoint &ip,
                                        DenseMatrix &dshape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector  shape_x(p + 1),  shape_y(p + 1),  shape_z(p + 1),  shape_l(p + 1);
    Vector dshape_x(p + 1), dshape_y(p + 1), dshape_z(p + 1), dshape_l(p + 1);
-   DenseMatrix du(Dof, Dim);
+   DenseMatrix du(dof, dim);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x, dshape_x);
@@ -10020,31 +10020,31 @@ void L2_TetrahedronElement::ProjectDelta(int vertex, Vector &dofs) const
    switch (vertex)
    {
       case 0:
-         for (int i = 0; i < Dof; i++)
+         for (int i = 0; i < dof; i++)
          {
             const IntegrationPoint &ip = Nodes.IntPoint(i);
-            dofs[i] = pow(1.0 - ip.x - ip.y - ip.z, Order);
+            dofs[i] = pow(1.0 - ip.x - ip.y - ip.z, order);
          }
          break;
       case 1:
-         for (int i = 0; i < Dof; i++)
+         for (int i = 0; i < dof; i++)
          {
             const IntegrationPoint &ip = Nodes.IntPoint(i);
-            dofs[i] = pow(ip.x, Order);
+            dofs[i] = pow(ip.x, order);
          }
          break;
       case 2:
-         for (int i = 0; i < Dof; i++)
+         for (int i = 0; i < dof; i++)
          {
             const IntegrationPoint &ip = Nodes.IntPoint(i);
-            dofs[i] = pow(ip.y, Order);
+            dofs[i] = pow(ip.y, order);
          }
          break;
       case 3:
-         for (int i = 0; i < Dof; i++)
+         for (int i = 0; i < dof; i++)
          {
             const IntegrationPoint &ip = Nodes.IntPoint(i);
-            dofs[i] = pow(ip.z, Order);
+            dofs[i] = pow(ip.z, order);
          }
          break;
    }
@@ -10077,7 +10077,7 @@ L2Pos_TetrahedronElement::L2Pos_TetrahedronElement(const int p)
 void L2Pos_TetrahedronElement::CalcShape(const IntegrationPoint &ip,
                                          Vector &shape) const
 {
-   H1Pos_TetrahedronElement::CalcShape(Order, ip.x, ip.y, ip.z,
+   H1Pos_TetrahedronElement::CalcShape(order, ip.x, ip.y, ip.z,
                                        shape.GetData());
 }
 
@@ -10085,10 +10085,10 @@ void L2Pos_TetrahedronElement::CalcDShape(const IntegrationPoint &ip,
                                           DenseMatrix &dshape) const
 {
 #ifdef MFEM_THREAD_SAFE
-   Vector dshape_1d(Order + 1);
+   Vector dshape_1d(order + 1);
 #endif
 
-   H1Pos_TetrahedronElement::CalcDShape(Order, ip.x, ip.y, ip.z,
+   H1Pos_TetrahedronElement::CalcDShape(order, ip.x, ip.y, ip.z,
                                         dshape_1d.GetData(), dshape.Data());
 }
 
@@ -10098,9 +10098,9 @@ void L2Pos_TetrahedronElement::ProjectDelta(int vertex, Vector &dofs) const
    switch (vertex)
    {
       case 0: dofs[0] = 1.0; break;
-      case 1: dofs[Order] = 1.0; break;
-      case 2: dofs[(Order*(Order+3))/2] = 1.0; break;
-      case 3: dofs[Dof-1] = 1.0; break;
+      case 1: dofs[order] = 1.0; break;
+      case 2: dofs[(order*(order+3))/2] = 1.0; break;
+      case 3: dofs[dof-1] = 1.0; break;
    }
 }
 
@@ -10118,8 +10118,8 @@ L2_WedgeElement::L2_WedgeElement(const int p, const int btype)
    s_dshape.SetSize(SegmentFE.GetDof(), 1);
 #endif
 
-   t_dof.SetSize(Dof);
-   s_dof.SetSize(Dof);
+   t_dof.SetSize(dof);
+   s_dof.SetSize(dof);
 
    // Interior DoFs
    int m=0;
@@ -10140,7 +10140,7 @@ L2_WedgeElement::L2_WedgeElement(const int p, const int btype)
    // Define Nodes
    const IntegrationRule & t_Nodes = TriangleFE.GetNodes();
    const IntegrationRule & s_Nodes = SegmentFE.GetNodes();
-   for (int i=0; i<Dof; i++)
+   for (int i=0; i<dof; i++)
    {
       Nodes.IntPoint(i).x = t_Nodes.IntPoint(t_dof[i]).x;
       Nodes.IntPoint(i).y = t_Nodes.IntPoint(t_dof[i]).y;
@@ -10161,7 +10161,7 @@ void L2_WedgeElement::CalcShape(const IntegrationPoint &ip,
    TriangleFE.CalcShape(ip, t_shape);
    SegmentFE.CalcShape(ipz, s_shape);
 
-   for (int i=0; i<Dof; i++)
+   for (int i=0; i<dof; i++)
    {
       shape[i] = t_shape[t_dof[i]] * s_shape[s_dof[i]];
    }
@@ -10184,7 +10184,7 @@ void L2_WedgeElement::CalcDShape(const IntegrationPoint &ip,
    SegmentFE.CalcShape(ipz, s_shape);
    SegmentFE.CalcDShape(ipz, s_dshape);
 
-   for (int i=0; i<Dof; i++)
+   for (int i=0; i<dof; i++)
    {
       dshape(i, 0) = t_dshape(t_dof[i],0) * s_shape[s_dof[i]];
       dshape(i, 1) = t_dshape(t_dof[i],1) * s_shape[s_dof[i]];
@@ -10206,8 +10206,8 @@ L2Pos_WedgeElement::L2Pos_WedgeElement(const int p)
    s_dshape.SetSize(SegmentFE.GetDof(), 1);
 #endif
 
-   t_dof.SetSize(Dof);
-   s_dof.SetSize(Dof);
+   t_dof.SetSize(dof);
+   s_dof.SetSize(dof);
 
    // Interior DoFs
    int m=0;
@@ -10228,7 +10228,7 @@ L2Pos_WedgeElement::L2Pos_WedgeElement(const int p)
    // Define Nodes
    const IntegrationRule & t_Nodes = TriangleFE.GetNodes();
    const IntegrationRule & s_Nodes = SegmentFE.GetNodes();
-   for (int i=0; i<Dof; i++)
+   for (int i=0; i<dof; i++)
    {
       Nodes.IntPoint(i).x = t_Nodes.IntPoint(t_dof[i]).x;
       Nodes.IntPoint(i).y = t_Nodes.IntPoint(t_dof[i]).y;
@@ -10249,7 +10249,7 @@ void L2Pos_WedgeElement::CalcShape(const IntegrationPoint &ip,
    TriangleFE.CalcShape(ip, t_shape);
    SegmentFE.CalcShape(ipz, s_shape);
 
-   for (int i=0; i<Dof; i++)
+   for (int i=0; i<dof; i++)
    {
       shape[i] = t_shape[t_dof[i]] * s_shape[s_dof[i]];
    }
@@ -10272,7 +10272,7 @@ void L2Pos_WedgeElement::CalcDShape(const IntegrationPoint &ip,
    SegmentFE.CalcShape(ipz, s_shape);
    SegmentFE.CalcDShape(ipz, s_dshape);
 
-   for (int i=0; i<Dof; i++)
+   for (int i=0; i<dof; i++)
    {
       dshape(i, 0) = t_dshape(t_dof[i],0) * s_shape[s_dof[i]];
       dshape(i, 1) = t_dshape(t_dof[i],1) * s_shape[s_dof[i]];
@@ -10287,15 +10287,15 @@ const double RT_QuadrilateralElement::nk[8] =
 RT_QuadrilateralElement::RT_QuadrilateralElement(const int p,
                                                  const int cb_type,
                                                  const int ob_type)
-   : VectorFiniteElement(2, Geometry::SQUARE, 2*(p + 1)*(p + 2), p + 1,
-                         H_DIV, FunctionSpace::Qk),
-     cbasis1d(poly1d.GetBasis(p + 1, VerifyClosed(cb_type))),
-     obasis1d(poly1d.GetBasis(p, VerifyOpen(ob_type))),
-     dof_map(Dof), dof2nk(Dof)
+   : VectorTensorFiniteElement(2, 2*(p + 1)*(p + 2), p + 1, cb_type, ob_type,
+                               H_DIV, DofMapType::L2_DOF_MAP),
+     dof2nk(dof)
 {
+   dof_map.SetSize(dof);
+
    const double *cp = poly1d.ClosedPoints(p + 1, cb_type);
    const double *op = poly1d.OpenPoints(p, ob_type);
-   const int dof2 = Dof/2;
+   const int dof2 = dof/2;
 
 #ifndef MFEM_THREAD_SAFE
    shape_cx.SetSize(p + 2);
@@ -10401,7 +10401,7 @@ RT_QuadrilateralElement::RT_QuadrilateralElement(const int p,
 void RT_QuadrilateralElement::CalcVShape(const IntegrationPoint &ip,
                                          DenseMatrix &shape) const
 {
-   const int pp1 = Order;
+   const int pp1 = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_cx(pp1 + 1), shape_ox(pp1), shape_cy(pp1 + 1), shape_oy(pp1);
@@ -10448,7 +10448,7 @@ void RT_QuadrilateralElement::CalcVShape(const IntegrationPoint &ip,
 void RT_QuadrilateralElement::CalcDivShape(const IntegrationPoint &ip,
                                            Vector &divshape) const
 {
-   const int pp1 = Order;
+   const int pp1 = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_cx(pp1 + 1), shape_ox(pp1), shape_cy(pp1 + 1), shape_oy(pp1);
@@ -10498,15 +10498,15 @@ const double RT_HexahedronElement::nk[18] =
 RT_HexahedronElement::RT_HexahedronElement(const int p,
                                            const int cb_type,
                                            const int ob_type)
-   : VectorFiniteElement(3, Geometry::CUBE, 3*(p + 1)*(p + 1)*(p + 2), p + 1,
-                         H_DIV, FunctionSpace::Qk),
-     cbasis1d(poly1d.GetBasis(p + 1, VerifyClosed(cb_type))),
-     obasis1d(poly1d.GetBasis(p, VerifyOpen(ob_type))),
-     dof_map(Dof), dof2nk(Dof)
+   : VectorTensorFiniteElement(3, 3*(p + 1)*(p + 1)*(p + 2), p + 1, cb_type,
+                               ob_type, H_DIV, DofMapType::L2_DOF_MAP),
+     dof2nk(dof)
 {
+   dof_map.SetSize(dof);
+
    const double *cp = poly1d.ClosedPoints(p + 1, cb_type);
    const double *op = poly1d.OpenPoints(p, ob_type);
-   const int dof3 = Dof/3;
+   const int dof3 = dof/3;
 
 #ifndef MFEM_THREAD_SAFE
    shape_cx.SetSize(p + 2);
@@ -10662,7 +10662,7 @@ RT_HexahedronElement::RT_HexahedronElement(const int p,
 void RT_HexahedronElement::CalcVShape(const IntegrationPoint &ip,
                                       DenseMatrix &shape) const
 {
-   const int pp1 = Order;
+   const int pp1 = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_cx(pp1 + 1), shape_ox(pp1), shape_cy(pp1 + 1), shape_oy(pp1);
@@ -10736,7 +10736,7 @@ void RT_HexahedronElement::CalcVShape(const IntegrationPoint &ip,
 void RT_HexahedronElement::CalcDivShape(const IntegrationPoint &ip,
                                         Vector &divshape) const
 {
-   const int pp1 = Order;
+   const int pp1 = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_cx(pp1 + 1), shape_ox(pp1), shape_cy(pp1 + 1), shape_oy(pp1);
@@ -10811,7 +10811,7 @@ const double RT_TriangleElement::c = 1./3.;
 RT_TriangleElement::RT_TriangleElement(const int p)
    : VectorFiniteElement(2, Geometry::TRIANGLE, (p + 1)*(p + 3), p + 1,
                          H_DIV, FunctionSpace::Pk),
-     dof2nk(Dof)
+     dof2nk(dof)
 {
    const double *iop = (p > 0) ? poly1d.OpenPoints(p - 1) : NULL;
    const double *bop = poly1d.OpenPoints(p);
@@ -10823,8 +10823,8 @@ RT_TriangleElement::RT_TriangleElement(const int p)
    dshape_x.SetSize(p + 1);
    dshape_y.SetSize(p + 1);
    dshape_l.SetSize(p + 1);
-   u.SetSize(Dof, Dim);
-   divu.SetSize(Dof);
+   u.SetSize(dof, dim);
+   divu.SetSize(dof);
 #else
    Vector shape_x(p + 1), shape_y(p + 1), shape_l(p + 1);
 #endif
@@ -10858,8 +10858,8 @@ RT_TriangleElement::RT_TriangleElement(const int p)
          dof2nk[o++] = 2;
       }
 
-   DenseMatrix T(Dof);
-   for (int k = 0; k < Dof; k++)
+   DenseMatrix T(dof);
+   for (int k = 0; k < dof; k++)
    {
       const IntegrationPoint &ip = Nodes.IntPoint(k);
       poly1d.CalcBasis(p, ip.x, shape_x);
@@ -10889,11 +10889,11 @@ RT_TriangleElement::RT_TriangleElement(const int p)
 void RT_TriangleElement::CalcVShape(const IntegrationPoint &ip,
                                     DenseMatrix &shape) const
 {
-   const int p = Order - 1;
+   const int p = order - 1;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p + 1), shape_y(p + 1), shape_l(p + 1);
-   DenseMatrix u(Dof, Dim);
+   DenseMatrix u(dof, dim);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x);
@@ -10922,12 +10922,12 @@ void RT_TriangleElement::CalcVShape(const IntegrationPoint &ip,
 void RT_TriangleElement::CalcDivShape(const IntegrationPoint &ip,
                                       Vector &divshape) const
 {
-   const int p = Order - 1;
+   const int p = order - 1;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p + 1),  shape_y(p + 1),  shape_l(p + 1);
    Vector dshape_x(p + 1), dshape_y(p + 1), dshape_l(p + 1);
-   Vector divu(Dof);
+   Vector divu(dof);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x, dshape_x);
@@ -10964,7 +10964,7 @@ const double RT_TetrahedronElement::c = 1./4.;
 RT_TetrahedronElement::RT_TetrahedronElement(const int p)
    : VectorFiniteElement(3, Geometry::TETRAHEDRON, (p + 1)*(p + 2)*(p + 4)/2,
                          p + 1, H_DIV, FunctionSpace::Pk),
-     dof2nk(Dof)
+     dof2nk(dof)
 {
    const double *iop = (p > 0) ? poly1d.OpenPoints(p - 1) : NULL;
    const double *bop = poly1d.OpenPoints(p);
@@ -10978,8 +10978,8 @@ RT_TetrahedronElement::RT_TetrahedronElement(const int p)
    dshape_y.SetSize(p + 1);
    dshape_z.SetSize(p + 1);
    dshape_l.SetSize(p + 1);
-   u.SetSize(Dof, Dim);
-   divu.SetSize(Dof);
+   u.SetSize(dof, dim);
+   divu.SetSize(dof);
 #else
    Vector shape_x(p + 1), shape_y(p + 1), shape_z(p + 1), shape_l(p + 1);
 #endif
@@ -11030,8 +11030,8 @@ RT_TetrahedronElement::RT_TetrahedronElement(const int p)
             dof2nk[o++] = 3;
          }
 
-   DenseMatrix T(Dof);
-   for (int m = 0; m < Dof; m++)
+   DenseMatrix T(dof);
+   for (int m = 0; m < dof; m++)
    {
       const IntegrationPoint &ip = Nodes.IntPoint(m);
       poly1d.CalcBasis(p, ip.x, shape_x);
@@ -11066,11 +11066,11 @@ RT_TetrahedronElement::RT_TetrahedronElement(const int p)
 void RT_TetrahedronElement::CalcVShape(const IntegrationPoint &ip,
                                        DenseMatrix &shape) const
 {
-   const int p = Order - 1;
+   const int p = order - 1;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p + 1), shape_y(p + 1), shape_z(p + 1), shape_l(p + 1);
-   DenseMatrix u(Dof, Dim);
+   DenseMatrix u(dof, dim);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x);
@@ -11102,12 +11102,12 @@ void RT_TetrahedronElement::CalcVShape(const IntegrationPoint &ip,
 void RT_TetrahedronElement::CalcDivShape(const IntegrationPoint &ip,
                                          Vector &divshape) const
 {
-   const int p = Order - 1;
+   const int p = order - 1;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_x(p + 1),  shape_y(p + 1),  shape_z(p + 1),  shape_l(p + 1);
    Vector dshape_x(p + 1), dshape_y(p + 1), dshape_z(p + 1), dshape_l(p + 1);
-   Vector divu(Dof);
+   Vector divu(dof);
 #endif
 
    poly1d.CalcBasis(p, ip.x, shape_x, dshape_x);
@@ -11149,13 +11149,13 @@ ND_HexahedronElement::ND_HexahedronElement(const int p,
                                            const int cb_type, const int ob_type)
    : VectorTensorFiniteElement(3, 3*p*(p + 1)*(p + 1), p, cb_type, ob_type,
                                H_CURL, DofMapType::L2_DOF_MAP),
-     dof2tk(Dof)
+     dof2tk(dof)
 {
-   dof_map.SetSize(Dof);
+   dof_map.SetSize(dof);
 
    const double *cp = poly1d.ClosedPoints(p, cb_type);
    const double *op = poly1d.OpenPoints(p - 1, ob_type);
-   const int dof3 = Dof/3;
+   const int dof3 = dof/3;
 
 #ifndef MFEM_THREAD_SAFE
    shape_cx.SetSize(p + 1);
@@ -11366,7 +11366,7 @@ ND_HexahedronElement::ND_HexahedronElement(const int p,
 void ND_HexahedronElement::CalcVShape(const IntegrationPoint &ip,
                                       DenseMatrix &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_cx(p + 1), shape_ox(p), shape_cy(p + 1), shape_oy(p);
@@ -11440,7 +11440,7 @@ void ND_HexahedronElement::CalcVShape(const IntegrationPoint &ip,
 void ND_HexahedronElement::CalcCurlShape(const IntegrationPoint &ip,
                                          DenseMatrix &curl_shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_cx(p + 1), shape_ox(p), shape_cy(p + 1), shape_oy(p);
@@ -11537,7 +11537,8 @@ const DofToQuad &VectorTensorFiniteElement::GetTensorDofToQuad(
 {
    MFEM_VERIFY(mode == DofToQuad::TENSOR, "invalid mode requested");
 
-   for (int i = 0; i < closed ? dof2quad_array.Size() : dof2quad_array_open.Size();
+   for (int i = 0;
+        i < (closed ? dof2quad_array.Size() : dof2quad_array_open.Size());
         i++)
    {
       const DofToQuad &d2q = closed ? *dof2quad_array[i] : *dof2quad_array_open[i];
@@ -11545,8 +11546,8 @@ const DofToQuad &VectorTensorFiniteElement::GetTensorDofToQuad(
    }
 
    DofToQuad *d2q = new DofToQuad;
-   const int ndof = closed ? Order + 1 : Order;
-   const int nqpt = (int)floor(pow(ir.GetNPoints(), 1.0/Dim) + 0.5);
+   const int ndof = closed ? order + 1 : order;
+   const int nqpt = (int)floor(pow(ir.GetNPoints(), 1.0/dim) + 0.5);
    d2q->FE = this;
    d2q->IntRule = &ir;
    d2q->mode = mode;
@@ -11590,6 +11591,14 @@ const DofToQuad &VectorTensorFiniteElement::GetTensorDofToQuad(
    return *d2q;
 }
 
+VectorTensorFiniteElement::~VectorTensorFiniteElement()
+{
+   for (int i = 0; i < dof2quad_array_open.Size(); i++)
+   {
+      delete dof2quad_array_open[i];
+   }
+}
+
 const double ND_QuadrilateralElement::tk[8] =
 { 1.,0.,  0.,1., -1.,0., 0.,-1. };
 
@@ -11598,13 +11607,13 @@ ND_QuadrilateralElement::ND_QuadrilateralElement(const int p,
                                                  const int ob_type)
    : VectorTensorFiniteElement(2, 2*p*(p + 1), p, cb_type, ob_type,
                                H_CURL, DofMapType::L2_DOF_MAP),
-     dof2tk(Dof)
+     dof2tk(dof)
 {
-   dof_map.SetSize(Dof);
+   dof_map.SetSize(dof);
 
    const double *cp = poly1d.ClosedPoints(p, cb_type);
    const double *op = poly1d.OpenPoints(p - 1, ob_type);
-   const int dof2 = Dof/2;
+   const int dof2 = dof/2;
 
 #ifndef MFEM_THREAD_SAFE
    shape_cx.SetSize(p + 1);
@@ -11685,7 +11694,7 @@ ND_QuadrilateralElement::ND_QuadrilateralElement(const int p,
 void ND_QuadrilateralElement::CalcVShape(const IntegrationPoint &ip,
                                          DenseMatrix &shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_cx(p + 1), shape_ox(p), shape_cy(p + 1), shape_oy(p);
@@ -11734,7 +11743,7 @@ void ND_QuadrilateralElement::CalcVShape(const IntegrationPoint &ip,
 void ND_QuadrilateralElement::CalcCurlShape(const IntegrationPoint &ip,
                                             DenseMatrix &curl_shape) const
 {
-   const int p = Order;
+   const int p = order;
 
 #ifdef MFEM_THREAD_SAFE
    Vector shape_cx(p + 1), shape_ox(p), shape_cy(p + 1), shape_oy(p);
@@ -11787,7 +11796,7 @@ const double ND_TetrahedronElement::c = 1./4.;
 
 ND_TetrahedronElement::ND_TetrahedronElement(const int p)
    : VectorFiniteElement(3, Geometry::TETRAHEDRON, p*(p + 2)*(p + 3)/2, p,
-                         H_CURL, FunctionSpace::Pk), dof2tk(Dof)
+                         H_CURL, FunctionSpace::Pk), dof2tk(dof)
 {
    const double *eop = poly1d.OpenPoints(p - 1);
    const double *fop = (p > 1) ? poly1d.OpenPoints(p - 2) : NULL;
@@ -11804,7 +11813,7 @@ ND_TetrahedronElement::ND_TetrahedronElement(const int p)
    dshape_y.SetSize(p);
    dshape_z.SetSize(p);
    dshape_l.SetSize(p);
-   u.SetSize(Dof, Dim);
+   u.SetSize(dof, dim);
 #else
    Vector shape_x(p), shape_y(p), shape_z(p), shape_l(p);
 #endif
@@ -11894,8 +11903,8 @@ ND_TetrahedronElement::ND_TetrahedronElement(const int p)
             dof2tk[o++] = 2;
          }
 
-   DenseMatrix T(Dof);
-   for (int m = 0; m < Dof; m++)
+   DenseMatrix T(dof);
+   for (int m = 0; m < dof; m++)
    {
       const IntegrationPoint &ip = Nodes.IntPoint(m);
       const double *tm = tk + 3*dof2tk[m];
@@ -11936,12 +11945,12 @@ ND_TetrahedronElement::ND_TetrahedronElement(const int p)
 void ND_TetrahedronElement::CalcVShape(const IntegrationPoint &ip,
                                        DenseMatrix &shape) const
 {
-   const int pm1 = Order - 1;
+   const int pm1 = order - 1;
 
 #ifdef MFEM_THREAD_SAFE
-   const int p = Order;
+   const int p = order;
    Vector shape_x(p), shape_y(p), shape_z(p), shape_l(p);
-   DenseMatrix u(Dof, Dim);
+   DenseMatrix u(dof, dim);
 #endif
 
    poly1d.CalcBasis(pm1, ip.x, shape_x);
@@ -11978,13 +11987,13 @@ void ND_TetrahedronElement::CalcVShape(const IntegrationPoint &ip,
 void ND_TetrahedronElement::CalcCurlShape(const IntegrationPoint &ip,
                                           DenseMatrix &curl_shape) const
 {
-   const int pm1 = Order - 1;
+   const int pm1 = order - 1;
 
 #ifdef MFEM_THREAD_SAFE
-   const int p = Order;
+   const int p = order;
    Vector shape_x(p), shape_y(p), shape_z(p), shape_l(p);
    Vector dshape_x(p), dshape_y(p), dshape_z(p), dshape_l(p);
-   DenseMatrix u(Dof, Dim);
+   DenseMatrix u(dof, dim);
 #endif
 
    poly1d.CalcBasis(pm1, ip.x, shape_x, dshape_x);
@@ -12050,7 +12059,7 @@ const double ND_TriangleElement::c = 1./3.;
 ND_TriangleElement::ND_TriangleElement(const int p)
    : VectorFiniteElement(2, Geometry::TRIANGLE, p*(p + 2), p,
                          H_CURL, FunctionSpace::Pk),
-     dof2tk(Dof)
+     dof2tk(dof)
 {
    const double *eop = poly1d.OpenPoints(p - 1);
    const double *iop = (p > 1) ? poly1d.OpenPoints(p - 2) : NULL;
@@ -12064,8 +12073,8 @@ ND_TriangleElement::ND_TriangleElement(const int p)
    dshape_x.SetSize(p);
    dshape_y.SetSize(p);
    dshape_l.SetSize(p);
-   u.SetSize(Dof, Dim);
-   curlu.SetSize(Dof);
+   u.SetSize(dof, dim);
+   curlu.SetSize(dof);
 #else
    Vector shape_x(p), shape_y(p), shape_l(p);
 #endif
@@ -12099,8 +12108,8 @@ ND_TriangleElement::ND_TriangleElement(const int p)
          dof2tk[n++] = 3;
       }
 
-   DenseMatrix T(Dof);
-   for (int m = 0; m < Dof; m++)
+   DenseMatrix T(dof);
+   for (int m = 0; m < dof; m++)
    {
       const IntegrationPoint &ip = Nodes.IntPoint(m);
       const double *tm = tk + 2*dof2tk[m];
@@ -12131,12 +12140,12 @@ ND_TriangleElement::ND_TriangleElement(const int p)
 void ND_TriangleElement::CalcVShape(const IntegrationPoint &ip,
                                     DenseMatrix &shape) const
 {
-   const int pm1 = Order - 1;
+   const int pm1 = order - 1;
 
 #ifdef MFEM_THREAD_SAFE
-   const int p = Order;
+   const int p = order;
    Vector shape_x(p), shape_y(p), shape_l(p);
-   DenseMatrix u(Dof, Dim);
+   DenseMatrix u(dof, dim);
 #endif
 
    poly1d.CalcBasis(pm1, ip.x, shape_x);
@@ -12165,13 +12174,13 @@ void ND_TriangleElement::CalcVShape(const IntegrationPoint &ip,
 void ND_TriangleElement::CalcCurlShape(const IntegrationPoint &ip,
                                        DenseMatrix &curl_shape) const
 {
-   const int pm1 = Order - 1;
+   const int pm1 = order - 1;
 
 #ifdef MFEM_THREAD_SAFE
-   const int p = Order;
+   const int p = order;
    Vector shape_x(p), shape_y(p), shape_l(p);
    Vector dshape_x(p), dshape_y(p), dshape_l(p);
-   Vector curlu(Dof);
+   Vector curlu(dof);
 #endif
 
    poly1d.CalcBasis(pm1, ip.x, shape_x, dshape_x);
@@ -12200,7 +12209,7 @@ void ND_TriangleElement::CalcCurlShape(const IntegrationPoint &ip,
                      (dshape_y(j)*(ip.y - c) + shape_y(j)) * shape_x(i));
    }
 
-   Vector curl2d(curl_shape.Data(),Dof);
+   Vector curl2d(curl_shape.Data(),dof);
    Ti.Mult(curlu, curl2d);
 }
 
@@ -12211,7 +12220,7 @@ ND_SegmentElement::ND_SegmentElement(const int p, const int ob_type)
    : VectorFiniteElement(1, Geometry::SEGMENT, p, p - 1,
                          H_CURL, FunctionSpace::Pk),
      obasis1d(poly1d.GetBasis(p - 1, VerifyOpen(ob_type))),
-     dof2tk(Dof)
+     dof2tk(dof)
 {
    const double *op = poly1d.OpenPoints(p - 1, ob_type);
 
@@ -12226,18 +12235,18 @@ ND_SegmentElement::ND_SegmentElement(const int p, const int ob_type)
 void ND_SegmentElement::CalcVShape(const IntegrationPoint &ip,
                                    DenseMatrix &shape) const
 {
-   Vector vshape(shape.Data(), Dof);
+   Vector vshape(shape.Data(), dof);
 
    obasis1d.Eval(ip.x, vshape);
 }
 
 void NURBS1DFiniteElement::SetOrder() const
 {
-   Order = kv[0]->GetOrder();
-   Dof = Order + 1;
+   order = kv[0]->GetOrder();
+   dof = order + 1;
 
-   weights.SetSize(Dof);
-   shape_x.SetSize(Dof);
+   weights.SetSize(dof);
+   shape_x.SetSize(dof);
 }
 
 void NURBS1DFiniteElement::CalcShape(const IntegrationPoint &ip,
@@ -12246,7 +12255,7 @@ void NURBS1DFiniteElement::CalcShape(const IntegrationPoint &ip,
    kv[0]->CalcShape(shape, ijk[0], ip.x);
 
    double sum = 0.0;
-   for (int i = 0; i <= Order; i++)
+   for (int i = 0; i <= order; i++)
    {
       sum += (shape(i) *= weights(i));
    }
@@ -12257,13 +12266,13 @@ void NURBS1DFiniteElement::CalcShape(const IntegrationPoint &ip,
 void NURBS1DFiniteElement::CalcDShape(const IntegrationPoint &ip,
                                       DenseMatrix &dshape) const
 {
-   Vector grad(dshape.Data(), Dof);
+   Vector grad(dshape.Data(), dof);
 
    kv[0]->CalcShape (shape_x, ijk[0], ip.x);
    kv[0]->CalcDShape(grad,    ijk[0], ip.x);
 
    double sum = 0.0, dsum = 0.0;
-   for (int i = 0; i <= Order; i++)
+   for (int i = 0; i <= order; i++)
    {
       sum  += (shape_x(i) *= weights(i));
       dsum += (   grad(i) *= weights(i));
@@ -12276,15 +12285,15 @@ void NURBS1DFiniteElement::CalcDShape(const IntegrationPoint &ip,
 void NURBS1DFiniteElement::CalcHessian (const IntegrationPoint &ip,
                                         DenseMatrix &hessian) const
 {
-   Vector grad(Dof);
-   Vector hess(hessian.Data(), Dof);
+   Vector grad(dof);
+   Vector hess(hessian.Data(), dof);
 
    kv[0]->CalcShape (shape_x,  ijk[0], ip.x);
    kv[0]->CalcDShape(grad,     ijk[0], ip.x);
    kv[0]->CalcD2Shape(hess,    ijk[0], ip.x);
 
    double sum = 0.0, dsum = 0.0, d2sum = 0.0;
-   for (int i = 0; i <= Order; i++)
+   for (int i = 0; i <= order; i++)
    {
       sum   += (shape_x(i) *= weights(i));
       dsum  += (   grad(i) *= weights(i));
@@ -12299,20 +12308,20 @@ void NURBS1DFiniteElement::CalcHessian (const IntegrationPoint &ip,
 
 void NURBS2DFiniteElement::SetOrder() const
 {
-   Orders[0] = kv[0]->GetOrder();
-   Orders[1] = kv[1]->GetOrder();
-   shape_x.SetSize(Orders[0]+1);
-   shape_y.SetSize(Orders[1]+1);
-   dshape_x.SetSize(Orders[0]+1);
-   dshape_y.SetSize(Orders[1]+1);
-   d2shape_x.SetSize(Orders[0]+1);
-   d2shape_y.SetSize(Orders[1]+1);
+   orders[0] = kv[0]->GetOrder();
+   orders[1] = kv[1]->GetOrder();
+   shape_x.SetSize(orders[0]+1);
+   shape_y.SetSize(orders[1]+1);
+   dshape_x.SetSize(orders[0]+1);
+   dshape_y.SetSize(orders[1]+1);
+   d2shape_x.SetSize(orders[0]+1);
+   d2shape_y.SetSize(orders[1]+1);
 
-   Order = max(Orders[0], Orders[1]);
-   Dof = (Orders[0] + 1)*(Orders[1] + 1);
-   u.SetSize(Dof);
-   du.SetSize(Dof);
-   weights.SetSize(Dof);
+   order = max(orders[0], orders[1]);
+   dof = (orders[0] + 1)*(orders[1] + 1);
+   u.SetSize(dof);
+   du.SetSize(dof);
+   weights.SetSize(dof);
 }
 
 void NURBS2DFiniteElement::CalcShape(const IntegrationPoint &ip,
@@ -12322,10 +12331,10 @@ void NURBS2DFiniteElement::CalcShape(const IntegrationPoint &ip,
    kv[1]->CalcShape(shape_y, ijk[1], ip.y);
 
    double sum = 0.0;
-   for (int o = 0, j = 0; j <= Orders[1]; j++)
+   for (int o = 0, j = 0; j <= orders[1]; j++)
    {
       const double sy = shape_y(j);
-      for (int i = 0; i <= Orders[0]; i++, o++)
+      for (int i = 0; i <= orders[0]; i++, o++)
       {
          sum += ( shape(o) = shape_x(i)*sy*weights(o) );
       }
@@ -12346,10 +12355,10 @@ void NURBS2DFiniteElement::CalcDShape(const IntegrationPoint &ip,
    kv[1]->CalcDShape(dshape_y, ijk[1], ip.y);
 
    sum = dsum[0] = dsum[1] = 0.0;
-   for (int o = 0, j = 0; j <= Orders[1]; j++)
+   for (int o = 0, j = 0; j <= orders[1]; j++)
    {
       const double sy = shape_y(j), dsy = dshape_y(j);
-      for (int i = 0; i <= Orders[0]; i++, o++)
+      for (int i = 0; i <= orders[0]; i++, o++)
       {
          sum += ( u(o) = shape_x(i)*sy*weights(o) );
 
@@ -12362,7 +12371,7 @@ void NURBS2DFiniteElement::CalcDShape(const IntegrationPoint &ip,
    dsum[0] *= sum*sum;
    dsum[1] *= sum*sum;
 
-   for (int o = 0; o < Dof; o++)
+   for (int o = 0; o < dof; o++)
    {
       dshape(o,0) = dshape(o,0)*sum - u(o)*dsum[0];
       dshape(o,1) = dshape(o,1)*sum - u(o)*dsum[1];
@@ -12385,10 +12394,10 @@ void NURBS2DFiniteElement::CalcHessian (const IntegrationPoint &ip,
 
    sum = dsum[0] = dsum[1] = 0.0;
    d2sum[0] = d2sum[1] = d2sum[2] = 0.0;
-   for (int o = 0, j = 0; j <= Orders[1]; j++)
+   for (int o = 0, j = 0; j <= orders[1]; j++)
    {
       const double sy = shape_y(j), dsy = dshape_y(j), d2sy = d2shape_y(j);
-      for (int i = 0; i <= Orders[0]; i++, o++)
+      for (int i = 0; i <= orders[0]; i++, o++)
       {
          const double sx = shape_x(i), dsx = dshape_x(i), d2sx = d2shape_x(i);
          sum += ( u(o) = sx*sy*weights(o) );
@@ -12410,7 +12419,7 @@ void NURBS2DFiniteElement::CalcHessian (const IntegrationPoint &ip,
    d2sum[1] *= sum;
    d2sum[2] *= sum;
 
-   for (int o = 0; o < Dof; o++)
+   for (int o = 0; o < dof; o++)
    {
       hessian(o,0) = hessian(o,0)*sum
                      - 2*du(o,0)*sum*dsum[0]
@@ -12430,26 +12439,26 @@ void NURBS2DFiniteElement::CalcHessian (const IntegrationPoint &ip,
 
 void NURBS3DFiniteElement::SetOrder() const
 {
-   Orders[0] = kv[0]->GetOrder();
-   Orders[1] = kv[1]->GetOrder();
-   Orders[2] = kv[2]->GetOrder();
-   shape_x.SetSize(Orders[0]+1);
-   shape_y.SetSize(Orders[1]+1);
-   shape_z.SetSize(Orders[2]+1);
+   orders[0] = kv[0]->GetOrder();
+   orders[1] = kv[1]->GetOrder();
+   orders[2] = kv[2]->GetOrder();
+   shape_x.SetSize(orders[0]+1);
+   shape_y.SetSize(orders[1]+1);
+   shape_z.SetSize(orders[2]+1);
 
-   dshape_x.SetSize(Orders[0]+1);
-   dshape_y.SetSize(Orders[1]+1);
-   dshape_z.SetSize(Orders[2]+1);
+   dshape_x.SetSize(orders[0]+1);
+   dshape_y.SetSize(orders[1]+1);
+   dshape_z.SetSize(orders[2]+1);
 
-   d2shape_x.SetSize(Orders[0]+1);
-   d2shape_y.SetSize(Orders[1]+1);
-   d2shape_z.SetSize(Orders[2]+1);
+   d2shape_x.SetSize(orders[0]+1);
+   d2shape_y.SetSize(orders[1]+1);
+   d2shape_z.SetSize(orders[2]+1);
 
-   Order = max(max(Orders[0], Orders[1]), Orders[2]);
-   Dof = (Orders[0] + 1)*(Orders[1] + 1)*(Orders[2] + 1);
-   u.SetSize(Dof);
-   du.SetSize(Dof);
-   weights.SetSize(Dof);
+   order = max(max(orders[0], orders[1]), orders[2]);
+   dof = (orders[0] + 1)*(orders[1] + 1)*(orders[2] + 1);
+   u.SetSize(dof);
+   du.SetSize(dof);
+   weights.SetSize(dof);
 }
 
 void NURBS3DFiniteElement::CalcShape(const IntegrationPoint &ip,
@@ -12460,13 +12469,13 @@ void NURBS3DFiniteElement::CalcShape(const IntegrationPoint &ip,
    kv[2]->CalcShape(shape_z, ijk[2], ip.z);
 
    double sum = 0.0;
-   for (int o = 0, k = 0; k <= Orders[2]; k++)
+   for (int o = 0, k = 0; k <= orders[2]; k++)
    {
       const double sz = shape_z(k);
-      for (int j = 0; j <= Orders[1]; j++)
+      for (int j = 0; j <= orders[1]; j++)
       {
          const double sy_sz = shape_y(j)*sz;
-         for (int i = 0; i <= Orders[0]; i++, o++)
+         for (int i = 0; i <= orders[0]; i++, o++)
          {
             sum += ( shape(o) = shape_x(i)*sy_sz*weights(o) );
          }
@@ -12490,15 +12499,15 @@ void NURBS3DFiniteElement::CalcDShape(const IntegrationPoint &ip,
    kv[2]->CalcDShape(dshape_z, ijk[2], ip.z);
 
    sum = dsum[0] = dsum[1] = dsum[2] = 0.0;
-   for (int o = 0, k = 0; k <= Orders[2]; k++)
+   for (int o = 0, k = 0; k <= orders[2]; k++)
    {
       const double sz = shape_z(k), dsz = dshape_z(k);
-      for (int j = 0; j <= Orders[1]; j++)
+      for (int j = 0; j <= orders[1]; j++)
       {
          const double  sy_sz  =  shape_y(j)* sz;
          const double dsy_sz  = dshape_y(j)* sz;
          const double  sy_dsz =  shape_y(j)*dsz;
-         for (int i = 0; i <= Orders[0]; i++, o++)
+         for (int i = 0; i <= orders[0]; i++, o++)
          {
             sum += ( u(o) = shape_x(i)*sy_sz*weights(o) );
 
@@ -12514,7 +12523,7 @@ void NURBS3DFiniteElement::CalcDShape(const IntegrationPoint &ip,
    dsum[1] *= sum*sum;
    dsum[2] *= sum*sum;
 
-   for (int o = 0; o < Dof; o++)
+   for (int o = 0; o < dof; o++)
    {
       dshape(o,0) = dshape(o,0)*sum - u(o)*dsum[0];
       dshape(o,1) = dshape(o,1)*sum - u(o)*dsum[1];
@@ -12542,13 +12551,13 @@ void NURBS3DFiniteElement::CalcHessian (const IntegrationPoint &ip,
    sum = dsum[0] = dsum[1] = dsum[2] = 0.0;
    d2sum[0] = d2sum[1] = d2sum[2] = d2sum[3] = d2sum[4] = d2sum[5] = 0.0;
 
-   for (int o = 0, k = 0; k <= Orders[2]; k++)
+   for (int o = 0, k = 0; k <= orders[2]; k++)
    {
       const double sz = shape_z(k), dsz = dshape_z(k), d2sz = d2shape_z(k);
-      for (int j = 0; j <= Orders[1]; j++)
+      for (int j = 0; j <= orders[1]; j++)
       {
          const double sy = shape_y(j), dsy = dshape_y(j), d2sy = d2shape_y(j);
-         for (int i = 0; i <= Orders[0]; i++, o++)
+         for (int i = 0; i <= orders[0]; i++, o++)
          {
             const double sx = shape_x(i), dsx = dshape_x(i), d2sx = d2shape_x(i);
             sum += ( u(o) = sx*sy*sz*weights(o) );
@@ -12582,7 +12591,7 @@ void NURBS3DFiniteElement::CalcHessian (const IntegrationPoint &ip,
    d2sum[4] *= sum;
    d2sum[5] *= sum;
 
-   for (int o = 0; o < Dof; o++)
+   for (int o = 0; o < dof; o++)
    {
       hessian(o,0) = hessian(o,0)*sum
                      - 2*du(o,0)*sum*dsum[0]
diff --git a/fem/fe.hpp b/fem/fe.hpp
index b8d1982574a..e5bd910d8ef 100644
--- a/fem/fe.hpp
+++ b/fem/fe.hpp
@@ -206,7 +206,7 @@ class DofToQuad
 };
 
 
-/// Describes the space on each element
+/// Describes the function space on each element
 class FunctionSpace
 {
 public:
@@ -228,18 +228,18 @@ class KnotVector;
 // Base and derived classes for finite elements
 
 
-/// Abstract class for Finite Elements
+/// Abstract class for all finite elements.
 class FiniteElement
 {
 protected:
-   int Dim;      ///< Dimension of reference space
-   Geometry::Type GeomType; ///< Geometry::Type of the reference element
-   int FuncSpace, RangeType, MapType,
-       DerivType, DerivRangeType, DerivMapType;
+   int dim;      ///< Dimension of reference space
+   Geometry::Type geom_type; ///< Geometry::Type of the reference element
+   int func_space, range_type, map_type,
+       deriv_type, deriv_range_type, deriv_map_type;
    mutable
-   int Dof,      ///< Number of degrees of freedom
-       Order;    ///< Order/degree of the shape functions
-   mutable int Orders[Geometry::MaxDim]; ///< Anisotropic orders
+   int dof,      ///< Number of degrees of freedom
+       order;    ///< Order/degree of the shape functions
+   mutable int orders[Geometry::MaxDim]; ///< Anisotropic orders
    IntegrationRule Nodes;
 #ifndef MFEM_THREAD_SAFE
    mutable DenseMatrix vshape; // Dof x Dim
@@ -250,51 +250,49 @@ class FiniteElement
    mutable Array<DofToQuad*> dof2quad_array;
 
 public:
-   /// Enumeration for RangeType and DerivRangeType
-   enum { SCALAR, VECTOR };
+   /// Enumeration for range_type and deriv_range_type
+   enum RangeType { SCALAR, VECTOR };
 
    /** @brief Enumeration for MapType: defines how reference functions are
        mapped to physical space.
 
-       A reference function, `uh(xh)`, can be mapped to a function, `u(x)`, on a
-       general physical element in following ways:
-
-           VALUE       u(x) = uh(xh)
-           INTEGRAL    u(x) = (1/w) * uh(xh)
-           H_DIV       u(x) = (J/w) * uh(xh)
-           H_CURL      u(x) = J^{-t} * uh(xh)           (square J)
-           H_CURL      u(x) = J*(J^t*J)^{-1} * uh(xh)   (general J)
-
-       where
-
-           x = T(xh) is the image of the reference point xh ("x hat"),
-           J = J(xh) is the Jacobian matrix of the transformation T, and
-           w = w(xh) = / det(J),           for square J,
-                       \ det(J^t*J)^{1/2}, for general J,
-                     is the transformation weight factor.
+       A reference function \f$ \hat u(\hat x) \f$ can be mapped to a function
+       \f$ u(x) \f$ on a general physical element as listed below, where:
+       - \f$ x = T(\hat x) \f$ is the image of the reference point \f$ \hat x \f$
+       - \f$ J = J(\hat x) \f$ is the Jacobian matrix of the transformation T
+       - \f$ w = w(\hat x) = det(J) \f$ is the transformation weight factor for square J
+       - \f$ w = w(\hat x) = det(J^t J)^{1/2} \f$ is the transformation weight factor in general
    */
-   enum { VALUE,     ///< For scalar fields; preserves point values
-          INTEGRAL,  ///< For scalar fields; preserves volume integrals
-          H_DIV,     /**< For vector fields; preserves surface integrals of the
-                          normal component */
-          H_CURL     /**< For vector fields; preserves line integrals of the
-                          tangential component */
-        };
+   enum MapType
+   {
+      VALUE,     /**< For scalar fields; preserves point values
+                          \f$ u(x) = \hat u(\hat x) \f$ */
+      INTEGRAL,  /**< For scalar fields; preserves volume integrals
+                          \f$ u(x) = (1/w) \hat u(\hat x) \f$ */
+      H_DIV,     /**< For vector fields; preserves surface integrals of the
+                          normal component \f$ u(x) = (J/w) \hat u(\hat x) \f$ */
+      H_CURL     /**< For vector fields; preserves line integrals of the
+                          tangential component
+                          \f$ u(x) = J^{-t} \hat u(\hat x) \f$ (square J),
+                          \f$ u(x) = J(J^t J)^{-1} \hat u(\hat x) \f$ (general J) */
+   };
 
    /** @brief Enumeration for DerivType: defines which derivative method
        is implemented.
 
-       Each FiniteElement class implements only one type of derivative.  The
+       Each FiniteElement class implements up to one type of derivative.  The
        value returned by GetDerivType() indicates which derivative method is
        implemented.
    */
-   enum { NONE, ///< No derivatives implemented
-          GRAD, ///< Implements CalcDShape methods
-          DIV,  ///< Implements CalcDivShape methods
-          CURL  ///< Implements CalcCurlShape methods
-        };
+   enum DerivType
+   {
+      NONE, ///< No derivatives implemented
+      GRAD, ///< Implements CalcDShape methods
+      DIV,  ///< Implements CalcDivShape methods
+      CURL  ///< Implements CalcCurlShape methods
+   };
 
-   /** Construct FiniteElement with given
+   /** @brief Construct FiniteElement with given
        @param D    Reference space dimension
        @param G    Geometry type (of type Geometry::Type)
        @param Do   Number of degrees of freedom in the FiniteElement
@@ -305,53 +303,66 @@ class FiniteElement
                  int F = FunctionSpace::Pk);
 
    /// Returns the reference space dimension for the finite element
-   int GetDim() const { return Dim; }
+   int GetDim() const { return dim; }
 
    /// Returns the Geometry::Type of the reference element
-   Geometry::Type GetGeomType() const { return GeomType; }
+   Geometry::Type GetGeomType() const { return geom_type; }
 
    /// Returns the number of degrees of freedom in the finite element
-   int GetDof() const { return Dof; }
+   int GetDof() const { return dof; }
 
    /** @brief Returns the order of the finite element. In the case of
        anisotropic orders, returns the maximum order. */
-   int GetOrder() const { return Order; }
+   int GetOrder() const { return order; }
 
    /** @brief Returns true if the FiniteElement basis *may be using* different
        orders/degrees in different spatial directions. */
-   bool HasAnisotropicOrders() const { return Orders[0] != -1; }
+   bool HasAnisotropicOrders() const { return orders[0] != -1; }
 
    /// Returns an array containing the anisotropic orders/degrees.
-   const int *GetAnisotropicOrders() const { return Orders; }
+   const int *GetAnisotropicOrders() const { return orders; }
+
+   /// Returns the type of FunctionSpace on the element.
+   int Space() const { return func_space; }
 
-   /// Returns the type of space on each element
-   int Space() const { return FuncSpace; }
+   /// Returns the FiniteElement::RangeType of the element, one of {SCALAR, VECTOR}.
+   int GetRangeType() const { return range_type; }
 
-   int GetRangeType() const { return RangeType; }
+   /** @brief Returns the FiniteElement::RangeType of the element derivative, either
+       SCALAR or VECTOR. */
+   int GetDerivRangeType() const { return deriv_range_type; }
 
-   int GetDerivRangeType() const { return DerivRangeType; }
+   /** @brief Returns the FiniteElement::MapType of the element describing how reference
+       functions are mapped to physical space, one of {VALUE, INTEGRAL,
+       H_DIV, H_CURL}. */
+   int GetMapType() const { return map_type; }
 
-   int GetMapType() const { return MapType; }
 
-   int GetDerivType() const { return DerivType; }
+   /** @brief Returns the FiniteElement::DerivType of the element describing the
+       spatial derivative method implemented, one of {NONE, GRAD,
+       DIV, CURL}. */
+   int GetDerivType() const { return deriv_type; }
 
-   int GetDerivMapType() const { return DerivMapType; }
+   /** @brief Returns the FiniteElement::MapType of the element describing how
+       reference function derivatives are mapped to physical space, one of {VALUE,
+       INTEGRAL, H_DIV, H_CURL}. */
+   int GetDerivMapType() const { return deriv_map_type; }
 
    /** @brief Evaluate the values of all shape functions of a scalar finite
        element in reference space at the given point @a ip. */
-   /** The size (#Dof) of the result Vector @a shape must be set in advance. */
+   /** The size (#dof) of the result Vector @a shape must be set in advance. */
    virtual void CalcShape(const IntegrationPoint &ip,
                           Vector &shape) const = 0;
 
    /** @brief Evaluate the values of all shape functions of a scalar finite
        element in physical space at the point described by @a Trans. */
-   /** The size (#Dof) of the result Vector @a shape must be set in advance. */
+   /** The size (#dof) of the result Vector @a shape must be set in advance. */
    void CalcPhysShape(ElementTransformation &Trans, Vector &shape) const;
 
    /** @brief Evaluate the gradients of all shape functions of a scalar finite
        element in reference space at the given point @a ip. */
    /** Each row of the result DenseMatrix @a dshape contains the derivatives of
-       one shape function. The size (#Dof x #Dim) of @a dshape must be set in
+       one shape function. The size (#dof x #dim) of @a dshape must be set in
        advance.  */
    virtual void CalcDShape(const IntegrationPoint &ip,
                            DenseMatrix &dshape) const = 0;
@@ -359,11 +370,12 @@ class FiniteElement
    /** @brief Evaluate the gradients of all shape functions of a scalar finite
        element in physical space at the point described by @a Trans. */
    /** Each row of the result DenseMatrix @a dshape contains the derivatives of
-       one shape function. The size (#Dof x SDim) of @a dshape must be set in
-       advance, where SDim >= #Dim is the physical space dimension as described
+       one shape function. The size (#dof x SDim) of @a dshape must be set in
+       advance, where SDim >= #dim is the physical space dimension as described
        by @a Trans. */
    void CalcPhysDShape(ElementTransformation &Trans, DenseMatrix &dshape) const;
 
+   /// Get a const reference to the nodes of the element
    const IntegrationRule & GetNodes() const { return Nodes; }
 
    // virtual functions for finite elements on vector spaces
@@ -371,7 +383,7 @@ class FiniteElement
    /** @brief Evaluate the values of all shape functions of a *vector* finite
        element in reference space at the given point @a ip. */
    /** Each row of the result DenseMatrix @a shape contains the components of
-       one vector shape function. The size (#Dof x #Dim) of @a shape must be set
+       one vector shape function. The size (#dof x #dim) of @a shape must be set
        in advance. */
    virtual void CalcVShape(const IntegrationPoint &ip,
                            DenseMatrix &shape) const;
@@ -379,8 +391,8 @@ class FiniteElement
    /** @brief Evaluate the values of all shape functions of a *vector* finite
        element in physical space at the point described by @a Trans. */
    /** Each row of the result DenseMatrix @a shape contains the components of
-       one vector shape function. The size (#Dof x SDim) of @a shape must be set
-       in advance, where SDim >= #Dim is the physical space dimension as
+       one vector shape function. The size (#dof x SDim) of @a shape must be set
+       in advance, where SDim >= #dim is the physical space dimension as
        described by @a Trans. */
    virtual void CalcVShape(ElementTransformation &Trans,
                            DenseMatrix &shape) const;
@@ -391,35 +403,39 @@ class FiniteElement
 
    /** @brief Evaluate the divergence of all shape functions of a *vector*
        finite element in reference space at the given point @a ip. */
-   /** The size (#Dof) of the result Vector @a divshape must be set in advance.
+   /** The size (#dof) of the result Vector @a divshape must be set in advance.
     */
    virtual void CalcDivShape(const IntegrationPoint &ip,
                              Vector &divshape) const;
 
    /** @brief Evaluate the divergence of all shape functions of a *vector*
        finite element in physical space at the point described by @a Trans. */
-   /** The size (#Dof) of the result Vector @a divshape must be set in advance.
+   /** The size (#dof) of the result Vector @a divshape must be set in advance.
     */
    void CalcPhysDivShape(ElementTransformation &Trans, Vector &divshape) const;
 
    /** @brief Evaluate the curl of all shape functions of a *vector* finite
        element in reference space at the given point @a ip. */
    /** Each row of the result DenseMatrix @a curl_shape contains the components
-       of the curl of one vector shape function. The size (#Dof x CDim) of
-       @a curl_shape must be set in advance, where CDim = 3 for #Dim = 3 and
-       CDim = 1 for #Dim = 2. */
+       of the curl of one vector shape function. The size (#dof x CDim) of
+       @a curl_shape must be set in advance, where CDim = 3 for #dim = 3 and
+       CDim = 1 for #dim = 2. */
    virtual void CalcCurlShape(const IntegrationPoint &ip,
                               DenseMatrix &curl_shape) const;
 
    /** @brief Evaluate the curl of all shape functions of a *vector* finite
        element in physical space at the point described by @a Trans. */
    /** Each row of the result DenseMatrix @a curl_shape contains the components
-       of the curl of one vector shape function. The size (#Dof x CDim) of
-       @a curl_shape must be set in advance, where CDim = 3 for #Dim = 3 and
-       CDim = 1 for #Dim = 2. */
+       of the curl of one vector shape function. The size (#dof x CDim) of
+       @a curl_shape must be set in advance, where CDim = 3 for #dim = 3 and
+       CDim = 1 for #dim = 2. */
    void CalcPhysCurlShape(ElementTransformation &Trans,
                           DenseMatrix &curl_shape) const;
 
+   /** @brief Get the dofs associated with the given @a face.
+       @a *dofs is set to an internal array of the local dofs on the
+       face, while *ndofs is set to the number of dofs on that face.
+   */
    virtual void GetFaceDofs(int face, int **dofs, int *ndofs) const;
 
    /** @brief Evaluate the Hessians of all shape functions of a scalar finite
@@ -427,19 +443,19 @@ class FiniteElement
    /** Each row of the result DenseMatrix @a Hessian contains upper triangular
        part of the Hessian of one shape function.
        The order in 2D is {u_xx, u_xy, u_yy}.
-       The size (#Dof x (#Dim (#Dim-1)/2) of @a Hessian must be set in advance.*/
+       The size (#dof x (#dim (#dim+1)/2)) of @a Hessian must be set in advance.*/
    virtual void CalcHessian (const IntegrationPoint &ip,
                              DenseMatrix &Hessian) const;
 
    /** @brief Evaluate the Hessian of all shape functions of a scalar finite
        element in reference space at the given point @a ip. */
-   /** The size (#Dof, #Dim*(#Dim+1)/2) of @a Hessian must be set in advance. */
+   /** The size (#dof, #dim*(#dim+1)/2) of @a Hessian must be set in advance. */
    virtual void CalcPhysHessian(ElementTransformation &Trans,
                                 DenseMatrix& Hessian) const;
 
    /** @brief Evaluate the Laplacian of all shape functions of a scalar finite
        element in reference space at the given point @a ip. */
-   /** The size (#Dof) of @a Laplacian must be set in advance. */
+   /** The size (#dof) of @a Laplacian must be set in advance. */
    virtual void CalcPhysLaplacian(ElementTransformation &Trans,
                                   Vector& Laplacian) const;
 
@@ -477,69 +493,71 @@ class FiniteElement
        allowing the "coarse" FiniteElement to be different from the "fine"
        FiniteElement as when h-refinement is combined with p-refinement or
        p-derefinement. It is assumed that both finite elements use the same
-       MapType. */
+       FiniteElement::MapType. */
    virtual void GetTransferMatrix(const FiniteElement &fe,
                                   ElementTransformation &Trans,
                                   DenseMatrix &I) const;
 
-   /** Given a coefficient and a transformation, compute its projection
+   /** @brief Given a coefficient and a transformation, compute its projection
        (approximation) in the local finite dimensional space in terms
        of the degrees of freedom. */
    virtual void Project (Coefficient &coeff,
                          ElementTransformation &Trans, Vector &dofs) const;
 
-   /** Given a vector coefficient and a transformation, compute its
+   /** @brief Given a vector coefficient and a transformation, compute its
        projection (approximation) in the local finite dimensional space
        in terms of the degrees of freedom. (VectorFiniteElements) */
    virtual void Project (VectorCoefficient &vc,
                          ElementTransformation &Trans, Vector &dofs) const;
 
-   /** Given a matrix coefficient and a transformation, compute an approximation
-       ("projection") in the local finite dimensional space in terms of the
-       degrees of freedom. For VectorFiniteElements, the rows of the coefficient
-       are projected in the vector space. */
+   /** @brief Given a matrix coefficient and a transformation, compute an
+       approximation ("projection") in the local finite dimensional space in
+       terms of the degrees of freedom. For VectorFiniteElements, the rows of
+       the coefficient are projected in the vector space. */
    virtual void ProjectMatrixCoefficient(
       MatrixCoefficient &mc, ElementTransformation &T, Vector &dofs) const;
 
-   /** Compute a representation (up to multiplicative constant) for
-       the delta function at the vertex with the given index. */
+   /** @brief Project a delta function centered on the given @a vertex in
+       the local finite dimensional space represented by the @a dofs. */
    virtual void ProjectDelta(int vertex, Vector &dofs) const;
 
-   /** Compute the embedding/projection matrix from the given FiniteElement
-       onto 'this' FiniteElement. The ElementTransformation is included to
-       support cases when the projection depends on it. */
+   /** @brief Compute the embedding/projection matrix from the given
+       FiniteElement onto 'this' FiniteElement. The ElementTransformation is
+       included to support cases when the projection depends on it. */
    virtual void Project(const FiniteElement &fe, ElementTransformation &Trans,
                         DenseMatrix &I) const;
 
-   /** Compute the discrete gradient matrix from the given FiniteElement onto
-       'this' FiniteElement. The ElementTransformation is included to support
-       cases when the matrix depends on it. */
+   /** @brief Compute the discrete gradient matrix from the given FiniteElement
+       onto 'this' FiniteElement. The ElementTransformation is included to
+       support cases when the matrix depends on it. */
    virtual void ProjectGrad(const FiniteElement &fe,
                             ElementTransformation &Trans,
                             DenseMatrix &grad) const;
 
-   /** Compute the discrete curl matrix from the given FiniteElement onto
+   /** @brief Compute the discrete curl matrix from the given FiniteElement onto
        'this' FiniteElement. The ElementTransformation is included to support
        cases when the matrix depends on it. */
    virtual void ProjectCurl(const FiniteElement &fe,
                             ElementTransformation &Trans,
                             DenseMatrix &curl) const;
 
-   /** Compute the discrete divergence matrix from the given FiniteElement onto
-       'this' FiniteElement. The ElementTransformation is included to support
-       cases when the matrix depends on it. */
+   /** @brief Compute the discrete divergence matrix from the given
+       FiniteElement onto 'this' FiniteElement. The ElementTransformation is
+       included to support cases when the matrix depends on it. */
    virtual void ProjectDiv(const FiniteElement &fe,
                            ElementTransformation &Trans,
                            DenseMatrix &div) const;
 
-   /** Return a DofToQuad structure corresponding to the given IntegrationRule
-       using the given DofToQuad::Mode. */
+   /** @brief Return a DofToQuad structure corresponding to the given
+       IntegrationRule using the given DofToQuad::Mode. */
    /** See the documentation for DofToQuad for more details. */
    virtual const DofToQuad &GetDofToQuad(const IntegrationRule &ir,
                                          DofToQuad::Mode mode) const;
-
+   /// Destroy the FiniteElement
    virtual ~FiniteElement();
 
+   /** @brief Return true if the BasisType of @a b_type is closed
+       (has Quadrature1D points on the boundary). */
    static bool IsClosedType(int b_type)
    {
       const int q_type = BasisType::GetQuadrature1D(b_type);
@@ -547,6 +565,8 @@ class FiniteElement
               (Quadrature1D::CheckClosed(q_type) != Quadrature1D::Invalid));
    }
 
+   /** @brief Return true if the BasisType of @a b_type is open
+       (doesn't have Quadrature1D points on the boundary). */
    static bool IsOpenType(int b_type)
    {
       const int q_type = BasisType::GetQuadrature1D(b_type);
@@ -554,23 +574,34 @@ class FiniteElement
               (Quadrature1D::CheckOpen(q_type) != Quadrature1D::Invalid));
    }
 
+   /** @brief Ensure that the BasisType of @a b_type is closed
+       (has Quadrature1D points on the boundary). */
    static int VerifyClosed(int b_type)
    {
       MFEM_VERIFY(IsClosedType(b_type),
                   "invalid closed basis type: " << b_type);
       return b_type;
    }
+
+   /** @brief Ensure that the BasisType of @a b_type is open
+       (doesn't have Quadrature1D points on the boundary). */
    static int VerifyOpen(int b_type)
    {
       MFEM_VERIFY(IsOpenType(b_type), "invalid open basis type: " << b_type);
       return b_type;
    }
+
+   /** @brief Ensure that the BasisType of @a b_type is nodal
+       (satisfies the interpolation property). */
    static int VerifyNodal(int b_type)
    {
       return BasisType::CheckNodal(b_type);
    }
 };
 
+
+/** @brief Class for finite elements with basis functions
+    that return scalar values. */
 class ScalarFiniteElement : public FiniteElement
 {
 protected:
@@ -590,29 +621,42 @@ class ScalarFiniteElement : public FiniteElement
                                        DofToQuad::Mode mode) const;
 
 public:
+   /** @brief Construct ScalarFiniteElement with given
+       @param D    Reference space dimension
+       @param G    Geometry type (of type Geometry::Type)
+       @param Do   Number of degrees of freedom in the FiniteElement
+       @param O    Order/degree of the FiniteElement
+       @param F    FunctionSpace type of the FiniteElement
+    */
    ScalarFiniteElement(int D, Geometry::Type G, int Do, int O,
                        int F = FunctionSpace::Pk)
 #ifdef MFEM_THREAD_SAFE
       : FiniteElement(D, G, Do, O, F)
-   { DerivType = GRAD; DerivRangeType = VECTOR; DerivMapType = H_CURL; }
+   { deriv_type = GRAD; deriv_range_type = VECTOR; deriv_map_type = H_CURL; }
 #else
-      : FiniteElement(D, G, Do, O, F), c_shape(Dof)
-   { DerivType = GRAD; DerivRangeType = VECTOR; DerivMapType = H_CURL; }
+      : FiniteElement(D, G, Do, O, F), c_shape(dof)
+   { deriv_type = GRAD; deriv_range_type = VECTOR; deriv_map_type = H_CURL; }
 #endif
 
+   /** @brief Set the FiniteElement::MapType of the element to either VALUE or
+       INTEGRAL. Also sets the FiniteElement::DerivType to GRAD if the
+       FiniteElement::MapType is VALUE. */
    void SetMapType(int M)
    {
       MFEM_VERIFY(M == VALUE || M == INTEGRAL, "unknown MapType");
-      MapType = M;
-      DerivType = (M == VALUE) ? GRAD : NONE;
+      map_type = M;
+      deriv_type = (M == VALUE) ? GRAD : NONE;
    }
 
-   /// Nodal interpolation.
+
+   /** @brief Get the matrix @a I that defines nodal interpolation
+       between this element and the refined element @a fine_fe. */
    void NodalLocalInterpolation(ElementTransformation &Trans,
                                 DenseMatrix &I,
                                 const ScalarFiniteElement &fine_fe) const;
 
-   /// "Interpolation" defined through local L2-projection.
+   /** @brief Get matrix @a I "Interpolation" defined through local
+       L2-projection in the space defined by the @a fine_fe.  */
    /** If the "fine" elements cannot represent all basis functions of the
        "coarse" element, then boundary values from different sub-elements are
        generally different. */
@@ -624,6 +668,8 @@ class ScalarFiniteElement : public FiniteElement
                                          DofToQuad::Mode mode) const;
 };
 
+
+/// Class for standard nodal finite elements.
 class NodalFiniteElement : public ScalarFiniteElement
 {
 protected:
@@ -632,6 +678,13 @@ class NodalFiniteElement : public ScalarFiniteElement
                        DenseMatrix &curl) const;
 
 public:
+   /** @brief Construct NodalFiniteElement with given
+       @param D    Reference space dimension
+       @param G    Geometry type (of type Geometry::Type)
+       @param Do   Number of degrees of freedom in the FiniteElement
+       @param O    Order/degree of the FiniteElement
+       @param F    FunctionSpace type of the FiniteElement
+   */
    NodalFiniteElement(int D, Geometry::Type G, int Do, int O,
                       int F = FunctionSpace::Pk)
       : ScalarFiniteElement(D, G, Do, O, F) { }
@@ -670,10 +723,18 @@ class NodalFiniteElement : public ScalarFiniteElement
                            DenseMatrix &div) const;
 };
 
-
+/** @brief Class for finite elements utilizing the
+    always positive Bernstein basis. */
 class PositiveFiniteElement : public ScalarFiniteElement
 {
 public:
+   /** @brief Construct PositiveFiniteElement with given
+       @param D    Reference space dimension
+       @param G    Geometry type (of type Geometry::Type)
+       @param Do   Number of degrees of freedom in the FiniteElement
+       @param O    Order/degree of the FiniteElement
+       @param F    FunctionSpace type of the FiniteElement
+   */
    PositiveFiniteElement(int D, Geometry::Type G, int Do, int O,
                          int F = FunctionSpace::Pk) :
       ScalarFiniteElement(D, G, Do, O, F)
@@ -702,6 +763,8 @@ class PositiveFiniteElement : public ScalarFiniteElement
                         DenseMatrix &I) const;
 };
 
+/** @brief Intermediate class for finite elements whose basis functions return
+    vector values. */
 class VectorFiniteElement : public FiniteElement
 {
    // Hide the scalar functions CalcShape and CalcDShape.
@@ -759,7 +822,7 @@ class VectorFiniteElement : public FiniteElement
                    VectorCoefficient &vc, ElementTransformation &Trans,
                    Vector &dofs) const;
 
-   // project the rows of the matrix coefficient in an ND space
+   /// project the rows of the matrix coefficient in an ND space
    void ProjectMatrixCoefficient_ND(
       const double *tk, const Array<int> &d2t,
       MatrixCoefficient &mc, ElementTransformation &T, Vector &dofs) const;
@@ -802,16 +865,18 @@ class VectorFiniteElement : public FiniteElement
                         int F = FunctionSpace::Pk) :
 #ifdef MFEM_THREAD_SAFE
       FiniteElement(D, G, Do, O, F)
-   { RangeType = VECTOR; MapType = M; SetDerivMembers(); }
+   { range_type = VECTOR; map_type = M; SetDerivMembers(); }
 #else
       FiniteElement(D, G, Do, O, F), Jinv(D)
-   { RangeType = VECTOR; MapType = M; SetDerivMembers(); }
+   { range_type = VECTOR; map_type = M; SetDerivMembers(); }
 #endif
 };
 
+/// A 0D point finite element
 class PointFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the PointFiniteElement
    PointFiniteElement();
 
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -820,11 +885,11 @@ class PointFiniteElement : public NodalFiniteElement
                            DenseMatrix &dshape) const;
 };
 
-/// Class for linear FE on interval
+/// A 1D linear element with nodes on the endpoints
 class Linear1DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a linear FE on interval
+   /// Construct the Linear1DFiniteElement
    Linear1DFiniteElement();
 
    /** virtual function which evaluates the values of all
@@ -840,11 +905,11 @@ class Linear1DFiniteElement : public NodalFiniteElement
                            DenseMatrix &dshape) const;
 };
 
-/// Class for linear FE on triangle
+/// A 2D linear element on triangle with nodes at the vertices of the triangle
 class Linear2DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a linear FE on triangle
+   /// Construct the Linear2DFiniteElement
    Linear2DFiniteElement();
 
    /** virtual function which evaluates the values of all
@@ -862,11 +927,11 @@ class Linear2DFiniteElement : public NodalFiniteElement
    { dofs = 0.0; dofs(vertex) = 1.0; }
 };
 
-/// Class for bilinear FE on quadrilateral
+/// A 2D bi-linear element on a square with nodes at the vertices of the square
 class BiLinear2DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a bilinear FE on quadrilateral
+   /// Construct the BiLinear2DFiniteElement
    BiLinear2DFiniteElement();
 
    /** virtual function which evaluates the values of all
@@ -886,10 +951,11 @@ class BiLinear2DFiniteElement : public NodalFiniteElement
    { dofs = 0.0; dofs(vertex) = 1.0; } // { dofs = 1.0; }
 };
 
-/// Class for linear FE on triangle with nodes at the 3 "Gaussian" points
+/// A linear element on a triangle with nodes at the 3 "Gaussian" points
 class GaussLinear2DFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the GaussLinear2DFiniteElement
    GaussLinear2DFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -897,13 +963,14 @@ class GaussLinear2DFiniteElement : public NodalFiniteElement
    virtual void ProjectDelta(int vertex, Vector &dofs) const;
 };
 
-/// Class for bilinear FE on quad with nodes at the 4 Gaussian points
+/// A 2D bi-linear element on a square with nodes at the "Gaussian" points
 class GaussBiLinear2DFiniteElement : public NodalFiniteElement
 {
 private:
    static const double p[2];
 
 public:
+   /// Construct the GaussBiLinear2DFiniteElement
    GaussBiLinear2DFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -911,9 +978,12 @@ class GaussBiLinear2DFiniteElement : public NodalFiniteElement
    virtual void ProjectDelta(int vertex, Vector &dofs) const;
 };
 
+/** @brief A 2D linear element on a square with 3 nodes at the
+    vertices of the lower left triangle */
 class P1OnQuadFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the P1OnQuadFiniteElement
    P1OnQuadFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -922,11 +992,11 @@ class P1OnQuadFiniteElement : public NodalFiniteElement
    { dofs = 1.0; }
 };
 
-/// Class for quadratic FE on interval
+/// A 1D quadratic finite element with uniformly spaced nodes
 class Quad1DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a quadratic FE on interval
+   /// Construct the Quad1DFiniteElement
    Quad1DFiniteElement();
 
    /** virtual function which evaluates the values of all
@@ -942,20 +1012,23 @@ class Quad1DFiniteElement : public NodalFiniteElement
                            DenseMatrix &dshape) const;
 };
 
+/// A 1D quadratic positive element utilizing the 2nd order Bernstein basis
 class QuadPos1DFiniteElement : public PositiveFiniteElement
 {
 public:
+   /// Construct the QuadPos1DFiniteElement
    QuadPos1DFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
                            DenseMatrix &dshape) const;
 };
 
-/// Class for quadratic FE on triangle
+/** @brief A 2D quadratic element on triangle with nodes at the
+    vertices and midpoints of the triangle. */
 class Quad2DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a quadratic FE on triangle
+   /// Construct the Quad2DFiniteElement
    Quad2DFiniteElement();
 
    /** virtual function which evaluates the values of all
@@ -975,7 +1048,7 @@ class Quad2DFiniteElement : public NodalFiniteElement
    virtual void ProjectDelta(int vertex, Vector &dofs) const;
 };
 
-/// Class for quadratic FE on triangle with nodes at the "Gaussian" points
+/// A quadratic element on triangle with nodes at the "Gaussian" points
 class GaussQuad2DFiniteElement : public NodalFiniteElement
 {
 private:
@@ -984,6 +1057,7 @@ class GaussQuad2DFiniteElement : public NodalFiniteElement
    mutable DenseMatrix D;
    mutable Vector pol;
 public:
+   /// Construct the GaussQuad2DFiniteElement
    GaussQuad2DFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -991,11 +1065,11 @@ class GaussQuad2DFiniteElement : public NodalFiniteElement
    // virtual void ProjectDelta(int vertex, Vector &dofs) const;
 };
 
-/// Class for bi-quadratic FE on quadrilateral
+/// A 2D bi-quadratic element on a square with uniformly spaced nodes
 class BiQuad2DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a biquadratic FE on quadrilateral
+   /// Construct the BiQuad2DFiniteElement
    BiQuad2DFiniteElement();
 
    /** virtual function which evaluates the values of all
@@ -1012,9 +1086,13 @@ class BiQuad2DFiniteElement : public NodalFiniteElement
    virtual void ProjectDelta(int vertex, Vector &dofs) const;
 };
 
+
+/// A 2D positive bi-quadratic element on a square utilizing the 2nd order
+/// Bernstein basis
 class BiQuadPos2DFiniteElement : public PositiveFiniteElement
 {
 public:
+   /// Construct the BiQuadPos2DFiniteElement
    BiQuadPos2DFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -1030,10 +1108,11 @@ class BiQuadPos2DFiniteElement : public PositiveFiniteElement
    { dofs = 0.; dofs(vertex) = 1.; }
 };
 
-/// Bi-quadratic element on quad with nodes at the 9 Gaussian points
+/// A 2D bi-quadratic element on a square with nodes at the 9 "Gaussian" points
 class GaussBiQuad2DFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the GaussBiQuad2DFiniteElement
    GaussBiQuad2DFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -1041,20 +1120,27 @@ class GaussBiQuad2DFiniteElement : public NodalFiniteElement
    // virtual void ProjectDelta(int vertex, Vector &dofs) const { dofs = 1.; }
 };
 
+
+/// A 2D bi-cubic element on a square with uniformly spaced nodes
 class BiCubic2DFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the BiCubic2DFiniteElement
    BiCubic2DFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
                            DenseMatrix &dshape) const;
+
+   /// Compute the Hessian of second order partial derivatives at @a ip.
    virtual void CalcHessian (const IntegrationPoint &ip,
                              DenseMatrix &h) const;
 };
 
+/// A 1D cubic element with uniformly spaced nodes
 class Cubic1DFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the Cubic1DFiniteElement
    Cubic1DFiniteElement();
 
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -1063,9 +1149,11 @@ class Cubic1DFiniteElement : public NodalFiniteElement
                            DenseMatrix &dshape) const;
 };
 
+/// A 2D cubic element on a triangle with uniformly spaced nodes
 class Cubic2DFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the Cubic2DFiniteElement
    Cubic2DFiniteElement();
 
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -1077,11 +1165,12 @@ class Cubic2DFiniteElement : public NodalFiniteElement
                              DenseMatrix &h) const;
 };
 
-/// Class for cubic FE on tetrahedron
+/// A 3D cubic element on a tetrahedron with 20 nodes at the thirds of the
+/// tetrahedron
 class Cubic3DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a cubic FE on tetrahedron
+   /// Construct the Cubic3DFiniteElement
    Cubic3DFiniteElement();
 
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -1090,11 +1179,11 @@ class Cubic3DFiniteElement : public NodalFiniteElement
                            DenseMatrix &dshape) const;
 };
 
-/// Class for constant FE on triangle
+/// A 2D constant element on a triangle
 class P0TriangleFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct P0 triangle finite element
+   /// Construct the P0TriangleFiniteElement
    P0TriangleFiniteElement();
 
    /// evaluate shape function - constant 1
@@ -1108,9 +1197,11 @@ class P0TriangleFiniteElement : public NodalFiniteElement
 };
 
 
+/// A 2D constant element on a square
 class P0QuadFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the P0QuadFiniteElement
    P0QuadFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -1120,19 +1211,20 @@ class P0QuadFiniteElement : public NodalFiniteElement
 };
 
 
-/// Class for linear FE on tetrahedron
+/** @brief A 3D linear element on a tetrahedron with nodes at the
+    vertices of the tetrahedron */
 class Linear3DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a linear FE on tetrahedron
+   /// Construct the Linear3DFiniteElement
    Linear3DFiniteElement();
 
-   /** virtual function which evaluates the values of all
+   /** @brief virtual function which evaluates the values of all
        shape functions at a given point ip and stores
        them in the vector shape of dimension Dof (4) */
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
 
-   /** virtual function which evaluates the values of all
+   /** @brief virtual function which evaluates the values of all
        partial derivatives of all shape functions at a given
        point ip and stores them in the matrix dshape (Dof x Dim) (4 x 3)
        so that each row contains the derivatives of one shape function */
@@ -1142,14 +1234,18 @@ class Linear3DFiniteElement : public NodalFiniteElement
    virtual void ProjectDelta(int vertex, Vector &dofs) const
    { dofs = 0.0; dofs(vertex) = 1.0; }
 
+   /** @brief Get the dofs associated with the given @a face.
+       @a *dofs is set to an internal array of the local dofs on the
+       face, while *ndofs is set to the number of dofs on that face.
+   */
    virtual void GetFaceDofs(int face, int **dofs, int *ndofs) const;
 };
 
-/// Class for quadratic FE on tetrahedron
+/// A 3D quadratic element on a tetrahedron with uniformly spaced nodes
 class Quadratic3DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a quadratic FE on tetrahedron
+   /// Construct the Quadratic3DFiniteElement
    Quadratic3DFiniteElement();
 
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -1158,11 +1254,11 @@ class Quadratic3DFiniteElement : public NodalFiniteElement
                            DenseMatrix &dshape) const;
 };
 
-/// Class for tri-linear FE on cube
+/// A 3D tri-linear element on a cube with nodes at the vertices of the cube
 class TriLinear3DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a tri-linear FE on cube
+   /// Construct the TriLinear3DFiniteElement
    TriLinear3DFiniteElement();
 
    /** virtual function which evaluates the values of all
@@ -1182,10 +1278,11 @@ class TriLinear3DFiniteElement : public NodalFiniteElement
 };
 
 
-/// Crouzeix-Raviart finite element on triangle
+/// A 2D Crouzeix-Raviart element on triangle
 class CrouzeixRaviartFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the CrouzeixRaviartFiniteElement
    CrouzeixRaviartFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -1194,31 +1291,37 @@ class CrouzeixRaviartFiniteElement : public NodalFiniteElement
    { dofs = 1.0; }
 };
 
-/// Crouzeix-Raviart finite element on quadrilateral
+/// A 2D Crouzeix-Raviart finite element on square
 class CrouzeixRaviartQuadFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the CrouzeixRaviartQuadFiniteElement
    CrouzeixRaviartQuadFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
                            DenseMatrix &dshape) const;
 };
 
+
+/// A 1D constant element on a segment
 class P0SegmentFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the P0SegmentFiniteElement with dummy order @a Ord
    P0SegmentFiniteElement(int Ord = 0);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
                            DenseMatrix &dshape) const;
 };
 
+/** @brief A 2D 1st order Raviart-Thomas vector element on a triangle */
 class RT0TriangleFiniteElement : public VectorFiniteElement
 {
 private:
    static const double nk[3][2];
 
 public:
+   /// Construct the RT0TriangleFiniteElement
    RT0TriangleFiniteElement();
 
    virtual void CalcVShape(const IntegrationPoint &ip,
@@ -1240,12 +1343,14 @@ class RT0TriangleFiniteElement : public VectorFiniteElement
                          ElementTransformation &Trans, Vector &dofs) const;
 };
 
+/** @brief A 2D 1st order Raviart-Thomas vector element on a square */
 class RT0QuadFiniteElement : public VectorFiniteElement
 {
 private:
    static const double nk[4][2];
 
 public:
+   /// Construct the RT0QuadFiniteElement
    RT0QuadFiniteElement();
 
    virtual void CalcVShape(const IntegrationPoint &ip,
@@ -1267,12 +1372,14 @@ class RT0QuadFiniteElement : public VectorFiniteElement
                          ElementTransformation &Trans, Vector &dofs) const;
 };
 
+/** @brief A 2D 2nd order Raviart-Thomas vector element on a triangle */
 class RT1TriangleFiniteElement : public VectorFiniteElement
 {
 private:
    static const double nk[8][2];
 
 public:
+   /// Construct the RT1TriangleFiniteElement
    RT1TriangleFiniteElement();
 
    virtual void CalcVShape(const IntegrationPoint &ip,
@@ -1294,12 +1401,14 @@ class RT1TriangleFiniteElement : public VectorFiniteElement
                          ElementTransformation &Trans, Vector &dofs) const;
 };
 
+/** @brief A 2D 2nd order Raviart-Thomas vector element on a square */
 class RT1QuadFiniteElement : public VectorFiniteElement
 {
 private:
    static const double nk[12][2];
 
 public:
+   /// Construct the RT1QuadFiniteElement
    RT1QuadFiniteElement();
 
    virtual void CalcVShape(const IntegrationPoint &ip,
@@ -1321,11 +1430,13 @@ class RT1QuadFiniteElement : public VectorFiniteElement
                          ElementTransformation &Trans, Vector &dofs) const;
 };
 
+/** @brief A 2D 3rd order Raviart-Thomas vector element on a triangle */
 class RT2TriangleFiniteElement : public VectorFiniteElement
 {
 private:
    static const double M[15][15];
 public:
+   /// Construct the RT2TriangleFiniteElement
    RT2TriangleFiniteElement();
 
    virtual void CalcVShape(const IntegrationPoint &ip,
@@ -1339,6 +1450,7 @@ class RT2TriangleFiniteElement : public VectorFiniteElement
                              Vector &divshape) const;
 };
 
+/** @brief A 2D 3rd order Raviart-Thomas vector element on a square */
 class RT2QuadFiniteElement : public VectorFiniteElement
 {
 private:
@@ -1347,6 +1459,7 @@ class RT2QuadFiniteElement : public VectorFiniteElement
    static const double dpt[3];
 
 public:
+   /// Construct the RT2QuadFiniteElement
    RT2QuadFiniteElement();
 
    virtual void CalcVShape(const IntegrationPoint &ip,
@@ -1368,26 +1481,29 @@ class RT2QuadFiniteElement : public VectorFiniteElement
                          ElementTransformation &Trans, Vector &dofs) const;
 };
 
-/// Linear 1D element with nodes 1/3 and 2/3 (trace of RT1)
+/// A 1D linear element with nodes at 1/3 and 2/3 (trace of RT1)
 class P1SegmentFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the P1SegmentFiniteElement
    P1SegmentFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
                            DenseMatrix &dshape) const;
 };
 
-/// Quadratic 1D element with nodes the Gaussian points in [0,1] (trace of RT2)
+/// A 1D quadratic element with nodes at the Gaussian points (trace of RT2)
 class P2SegmentFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the P2SegmentFiniteElement
    P2SegmentFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
                            DenseMatrix &dshape) const;
 };
 
+/// A 1D element with uniform nodes
 class Lagrange1DFiniteElement : public NodalFiniteElement
 {
 private:
@@ -1396,24 +1512,29 @@ class Lagrange1DFiniteElement : public NodalFiniteElement
    mutable Vector rxxk;
 #endif
 public:
+   /// Construct the Lagrange1DFiniteElement with the provided @a degree
    Lagrange1DFiniteElement (int degree);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
                            DenseMatrix &dshape) const;
 };
 
+/// A 3D Crouzeix-Raviart element on the tetrahedron.
 class P1TetNonConfFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the P1TetNonConfFiniteElement
    P1TetNonConfFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
                            DenseMatrix &dshape) const;
 };
 
+/// A 3D constant element on a tetrahedron
 class P0TetFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the P0TetFiniteElement
    P0TetFiniteElement ();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -1422,9 +1543,11 @@ class P0TetFiniteElement : public NodalFiniteElement
    { dofs(0) = 1.0; }
 };
 
+/// A 3D constant element on a cube
 class P0HexFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the P0HexFiniteElement
    P0HexFiniteElement ();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -1433,7 +1556,8 @@ class P0HexFiniteElement : public NodalFiniteElement
    { dofs(0) = 1.0; }
 };
 
-/// Tensor products of 1D FEs (only degree 2 is functional)
+/** @brief Tensor products of 1D Lagrange1DFiniteElement
+    (only degree 2 is functional) */
 class LagrangeHexFiniteElement : public NodalFiniteElement
 {
 private:
@@ -1446,6 +1570,7 @@ class LagrangeHexFiniteElement : public NodalFiniteElement
 #endif
 
 public:
+   /// Construct the LagrangeHexFiniteElement with the provided @a degree
    LagrangeHexFiniteElement (int degree);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -1454,11 +1579,11 @@ class LagrangeHexFiniteElement : public NodalFiniteElement
 };
 
 
-/// Class for refined linear FE on interval
+/// A 1D refined linear element
 class RefinedLinear1DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a quadratic FE on interval
+   /// Construct the RefinedLinear1DFiniteElement
    RefinedLinear1DFiniteElement();
 
    /** virtual function which evaluates the values of all
@@ -1474,11 +1599,11 @@ class RefinedLinear1DFiniteElement : public NodalFiniteElement
                            DenseMatrix &dshape) const;
 };
 
-/// Class for refined linear FE on triangle
+/// A 2D refined linear element on a triangle
 class RefinedLinear2DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a quadratic FE on triangle
+   /// Construct the RefinedLinear2DFiniteElement
    RefinedLinear2DFiniteElement();
 
    /** virtual function which evaluates the values of all
@@ -1494,11 +1619,11 @@ class RefinedLinear2DFiniteElement : public NodalFiniteElement
                            DenseMatrix &dshape) const;
 };
 
-/// Class for refined linear FE on tetrahedron
+/// A 3D refined linear element on a tetrahedron
 class RefinedLinear3DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a quadratic FE on tetrahedron
+   /// Construct the RefinedLinear3DFiniteElement
    RefinedLinear3DFiniteElement();
 
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -1507,11 +1632,11 @@ class RefinedLinear3DFiniteElement : public NodalFiniteElement
                            DenseMatrix &dshape) const;
 };
 
-/// Class for refined bi-linear FE on quadrilateral
+/// A 2D refined bi-linear FE on a square
 class RefinedBiLinear2DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a biquadratic FE on quadrilateral
+   /// Construct the RefinedBiLinear2DFiniteElement
    RefinedBiLinear2DFiniteElement();
 
    /** virtual function which evaluates the values of all
@@ -1527,11 +1652,11 @@ class RefinedBiLinear2DFiniteElement : public NodalFiniteElement
                            DenseMatrix &dshape) const;
 };
 
-/// Class for refined trilinear FE on a hexahedron
+/// A 3D refined tri-linear element on a cube
 class RefinedTriLinear3DFiniteElement : public NodalFiniteElement
 {
 public:
-   /// Construct a biquadratic FE on quadrilateral
+   /// Construct the RefinedTriLinear3DFiniteElement
    RefinedTriLinear3DFiniteElement();
 
    /** virtual function which evaluates the values of all
@@ -1548,12 +1673,14 @@ class RefinedTriLinear3DFiniteElement : public NodalFiniteElement
 };
 
 
+/// A 3D 1st order Nedelec element on a cube
 class Nedelec1HexFiniteElement : public VectorFiniteElement
 {
 private:
    static const double tk[12][3];
 
 public:
+   /// Construct the Nedelec1HexFiniteElement
    Nedelec1HexFiniteElement();
    virtual void CalcVShape(const IntegrationPoint &ip,
                            DenseMatrix &shape) const;
@@ -1570,12 +1697,14 @@ class Nedelec1HexFiniteElement : public VectorFiniteElement
 };
 
 
+/// A 3D 1st order Nedelec element on a tetrahedron
 class Nedelec1TetFiniteElement : public VectorFiniteElement
 {
 private:
    static const double tk[6][3];
 
 public:
+   /// Construct the Nedelec1TetFiniteElement
    Nedelec1TetFiniteElement();
    virtual void CalcVShape(const IntegrationPoint &ip,
                            DenseMatrix &shape) const;
@@ -1592,12 +1721,14 @@ class Nedelec1TetFiniteElement : public VectorFiniteElement
 };
 
 
+/// A 3D 0th order Raviart-Thomas element on a cube
 class RT0HexFiniteElement : public VectorFiniteElement
 {
 private:
    static const double nk[6][3];
 
 public:
+   /// Construct the RT0HexFiniteElement
    RT0HexFiniteElement();
 
    virtual void CalcVShape(const IntegrationPoint &ip,
@@ -1620,12 +1751,14 @@ class RT0HexFiniteElement : public VectorFiniteElement
 };
 
 
+/// A 3D 1st order Raviart-Thomas element on a cube
 class RT1HexFiniteElement : public VectorFiniteElement
 {
 private:
    static const double nk[36][3];
 
 public:
+   /// Construct the RT1HexFiniteElement
    RT1HexFiniteElement();
 
    virtual void CalcVShape(const IntegrationPoint &ip,
@@ -1648,12 +1781,14 @@ class RT1HexFiniteElement : public VectorFiniteElement
 };
 
 
+/// A 3D 0th order Raviart-Thomas element on a tetrahedron
 class RT0TetFiniteElement : public VectorFiniteElement
 {
 private:
    static const double nk[4][3];
 
 public:
+   /// Construct the RT0TetFiniteElement
    RT0TetFiniteElement();
 
    virtual void CalcVShape(const IntegrationPoint &ip,
@@ -1679,6 +1814,7 @@ class RT0TetFiniteElement : public VectorFiniteElement
 class RotTriLinearHexFiniteElement : public NodalFiniteElement
 {
 public:
+   /// Construct the RotTriLinearHexFiniteElement
    RotTriLinearHexFiniteElement();
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -1686,6 +1822,8 @@ class RotTriLinearHexFiniteElement : public NodalFiniteElement
 };
 
 
+/// Class for computing 1D special polynomials and their associated basis
+/// functions
 class Poly_1D
 {
 public:
@@ -1748,9 +1886,13 @@ class Poly_1D
                points. Returns NULL if the BasisType has no associated set of
                points. */
    const double *GetPoints(const int p, const int btype);
+
+   /// Get coordinates of an open (GaussLegendre) set of points of degree @a p
    const double *OpenPoints(const int p,
                             const int btype = BasisType::GaussLegendre)
    { return GetPoints(p, btype); }
+
+   /// Get coordinates of a closed (GaussLobatto) set of points of degree @a p
    const double *ClosedPoints(const int p,
                               const int btype = BasisType::GaussLobatto)
    { return GetPoints(p, btype); }
@@ -1765,8 +1907,8 @@ class Poly_1D
                the requested basis type. */
    Basis &GetBasis(const int p, const int btype);
 
-   // Evaluate the values of a hierarchical 1D basis at point x
-   // hierarchical = k-th basis function is degree k polynomial
+   /** @brief Evaluate the values of a hierarchical 1D basis at point @a x.
+       Hierarchical means the k-th basis function is a degree k polynomial. */
    static void CalcBasis(const int p, const double x, double *u)
    // { CalcMono(p, x, u); }
    // Bernstein basis is not hierarchical --> does not work for triangles
@@ -1775,14 +1917,14 @@ class Poly_1D
    // { CalcLegendre(p, x, u); }
    { CalcChebyshev(p, x, u); }
 
-   // Evaluate the values and derivatives of a hierarchical 1D basis at point x
+   /// Evaluate the values and derivatives of a hierarchical 1D basis at point @a x
    static void CalcBasis(const int p, const double x, double *u, double *d)
    // { CalcMono(p, x, u, d); }
    // { CalcBernstein(p, x, u, d); }
    // { CalcLegendre(p, x, u, d); }
    { CalcChebyshev(p, x, u, d); }
 
-   // Evaluate the values, derivatives and second derivatives of a hierarchical 1D basis at point x
+   /// Evaluate the values, derivatives and second derivatives of a hierarchical 1D basis at point x
    static void CalcBasis(const int p, const double x, double *u, double *d,
                          double *dd)
    // { CalcMono(p, x, u, d); }
@@ -1790,25 +1932,38 @@ class Poly_1D
    // { CalcLegendre(p, x, u, d); }
    { CalcChebyshev(p, x, u, d, dd); }
 
-   // Evaluate a representation of a Delta function at point x
+   /// Evaluate a representation of a Delta function at point x
    static double CalcDelta(const int p, const double x)
    { return pow(x, (double) p); }
 
+   /** @brief Compute the points for the Chebyshev polynomials of order @a p
+       and place them in the already allocated @a x array. */
    static void ChebyshevPoints(const int p, double *x);
 
-   /// Compute the terms in the expansion of the binomial (x + y)^p
+   /** @brief Compute the p + 1 terms in the expansion of the binomial
+       (x + y)^p and store them in the already allocated @a u array. */
    static void CalcBinomTerms(const int p, const double x, const double y,
                               double *u);
-   /** Compute the terms in the expansion of the binomial (x + y)^p and their
-       derivatives with respect to x assuming that dy/dx = -1. */
+   /** @brief Compute the terms in the expansion of the binomial (x + y)^p and
+       their derivatives with respect to x assuming that dy/dx = -1.  Store the
+       results in the already allocated @a u and @a d arrays.*/
    static void CalcBinomTerms(const int p, const double x, const double y,
                               double *u, double *d);
-   /** Compute the derivatives (w.r.t. x) of the terms in the expansion of the
-       binomial (x + y)^p assuming that dy/dx = -1. */
+   /** @brief Compute the derivatives (w.r.t. x) of the terms in the expansion
+       of the binomial (x + y)^p assuming that dy/dx = -1.  Store the results
+       in the already allocated @a d array.*/
    static void CalcDBinomTerms(const int p, const double x, const double y,
                                double *d);
+
+   /** @brief Compute the values of the Bernstein basis functions of order
+       @a p at coordinate @a x and store the results in the already allocated
+       @a u array. */
    static void CalcBernstein(const int p, const double x, double *u)
    { CalcBinomTerms(p, x, 1. - x, u); }
+
+   /** @brief Compute the values and derivatives of the Bernstein basis functions
+       of order @a p at coordinate @a x and store the results in the already allocated
+       @a u and @a d arrays. */
    static void CalcBernstein(const int p, const double x, double *u, double *d)
    { CalcBinomTerms(p, x, 1. - x, u, d); }
 
@@ -1820,6 +1975,9 @@ class Poly_1D
 
 extern Poly_1D poly1d;
 
+
+/// An element defined as an ND tensor product of 1D elements on a segment,
+/// square, or cube
 class TensorBasisElement
 {
 protected:
@@ -1930,8 +2088,11 @@ class VectorTensorFiniteElement : public VectorFiniteElement,
    const DofToQuad &GetTensorDofToQuad(const IntegrationRule &ir,
                                        DofToQuad::Mode mode,
                                        const bool closed) const;
+
+   ~VectorTensorFiniteElement();
 };
 
+/// Arbitrary order H1 elements in 1D
 class H1_SegmentElement : public NodalTensorFiniteElement
 {
 private:
@@ -1940,6 +2101,7 @@ class H1_SegmentElement : public NodalTensorFiniteElement
 #endif
 
 public:
+   /// Construct the H1_SegmentElement of order @a p and BasisType @a btype
    H1_SegmentElement(const int p, const int btype = BasisType::GaussLobatto);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -1948,6 +2110,7 @@ class H1_SegmentElement : public NodalTensorFiniteElement
 };
 
 
+/// Arbitrary order H1 elements in 2D on a square
 class H1_QuadrilateralElement : public NodalTensorFiniteElement
 {
 private:
@@ -1956,6 +2119,7 @@ class H1_QuadrilateralElement : public NodalTensorFiniteElement
 #endif
 
 public:
+   /// Construct the H1_QuadrilateralElement of order @a p and BasisType @a btype
    H1_QuadrilateralElement(const int p,
                            const int btype = BasisType::GaussLobatto);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -1965,6 +2129,7 @@ class H1_QuadrilateralElement : public NodalTensorFiniteElement
 };
 
 
+/// Arbitrary order H1 elements in 3D on a cube
 class H1_HexahedronElement : public NodalTensorFiniteElement
 {
 private:
@@ -1973,6 +2138,7 @@ class H1_HexahedronElement : public NodalTensorFiniteElement
 #endif
 
 public:
+   /// Construct the H1_HexahedronElement of order @a p and BasisType @a btype
    H1_HexahedronElement(const int p, const int btype = BasisType::GaussLobatto);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -1980,19 +2146,21 @@ class H1_HexahedronElement : public NodalTensorFiniteElement
    virtual void ProjectDelta(int vertex, Vector &dofs) const;
 };
 
+/// Arbitrary order H1 elements in 1D utilizing the Bernstein basis
 class H1Pos_SegmentElement : public PositiveTensorFiniteElement
 {
 private:
 #ifndef MFEM_THREAD_SAFE
-   // This is to share scratch space between invocations, which helps
-   // speed things up, but with OpenMP, we need one copy per thread.
-   // Right now, we solve this by allocating this space within each function
-   // call every time we call it.  Alternatively, we should do some sort
-   // thread private thing.  Brunner, Jan 2014
+   // This is to share scratch space between invocations, which helps speed
+   // things up, but with OpenMP, we need one copy per thread. Right now, we
+   // solve this by allocating this space within each function call every time
+   // we call it. Alternatively, we should do some sort of thread-private thing.
+   // Brunner, Jan 2014
    mutable Vector shape_x, dshape_x;
 #endif
 
 public:
+   /// Construct the H1Pos_SegmentElement of order @a p
    H1Pos_SegmentElement(const int p);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -2001,6 +2169,7 @@ class H1Pos_SegmentElement : public PositiveTensorFiniteElement
 };
 
 
+/// Arbitrary order H1 elements in 2D utilizing the Bernstein basis on a square
 class H1Pos_QuadrilateralElement : public PositiveTensorFiniteElement
 {
 private:
@@ -2010,6 +2179,7 @@ class H1Pos_QuadrilateralElement : public PositiveTensorFiniteElement
 #endif
 
 public:
+   /// Construct the H1Pos_QuadrilateralElement of order @a p
    H1Pos_QuadrilateralElement(const int p);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -2018,9 +2188,11 @@ class H1Pos_QuadrilateralElement : public PositiveTensorFiniteElement
 };
 
 
+/// Arbitrary order H1 serendipity elements in 2D on a quad
 class H1Ser_QuadrilateralElement : public ScalarFiniteElement
 {
 public:
+   /// Construct the H1Ser_QuadrilateralElement of order @a p
    H1Ser_QuadrilateralElement(const int p);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -2030,6 +2202,7 @@ class H1Ser_QuadrilateralElement : public ScalarFiniteElement
    using FiniteElement::Project;
 };
 
+/// Arbitrary order H1 elements in 3D utilizing the Bernstein basis on a cube
 class H1Pos_HexahedronElement : public PositiveTensorFiniteElement
 {
 private:
@@ -2039,6 +2212,7 @@ class H1Pos_HexahedronElement : public PositiveTensorFiniteElement
 #endif
 
 public:
+   /// Construct the H1Pos_HexahedronElement of order @a p
    H1Pos_HexahedronElement(const int p);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -2047,6 +2221,7 @@ class H1Pos_HexahedronElement : public PositiveTensorFiniteElement
 };
 
 
+/// Arbitrary order H1 elements in 2D on a triangle
 class H1_TriangleElement : public NodalFiniteElement
 {
 private:
@@ -2058,6 +2233,7 @@ class H1_TriangleElement : public NodalFiniteElement
    DenseMatrixInverse Ti;
 
 public:
+   /// Construct the H1_TriangleElement of order @a p and BasisType @a btype
    H1_TriangleElement(const int p, const int btype = BasisType::GaussLobatto);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -2067,6 +2243,7 @@ class H1_TriangleElement : public NodalFiniteElement
 };
 
 
+/// Arbitrary order H1 elements in 3D on a tetrahedron
 class H1_TetrahedronElement : public NodalFiniteElement
 {
 private:
@@ -2079,6 +2256,7 @@ class H1_TetrahedronElement : public NodalFiniteElement
    DenseMatrixInverse Ti;
 
 public:
+   /// Construct the H1_TetrahedronElement of order @a p and BasisType @a btype
    H1_TetrahedronElement(const int p,
                          const int btype = BasisType::GaussLobatto);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -2089,6 +2267,7 @@ class H1_TetrahedronElement : public NodalFiniteElement
 };
 
 
+/// Arbitrary order H1 elements in 2D utilizing the Bernstein basis on a triangle
 class H1Pos_TriangleElement : public PositiveFiniteElement
 {
 protected:
@@ -2099,6 +2278,7 @@ class H1Pos_TriangleElement : public PositiveFiniteElement
    Array<int> dof_map;
 
 public:
+   /// Construct the H1Pos_TriangleElement of order @a p
    H1Pos_TriangleElement(const int p);
 
    // The size of shape is (p+1)(p+2)/2 (dof).
@@ -2115,6 +2295,8 @@ class H1Pos_TriangleElement : public PositiveFiniteElement
 };
 
 
+/// Arbitrary order H1 elements in 3D utilizing the Bernstein basis on a
+/// tetrahedron
 class H1Pos_TetrahedronElement : public PositiveFiniteElement
 {
 protected:
@@ -2125,6 +2307,7 @@ class H1Pos_TetrahedronElement : public PositiveFiniteElement
    Array<int> dof_map;
 
 public:
+   /// Construct the H1Pos_TetrahedronElement of order @a p
    H1Pos_TetrahedronElement(const int p);
 
    // The size of shape is (p+1)(p+2)(p+3)/6 (dof).
@@ -2141,6 +2324,7 @@ class H1Pos_TetrahedronElement : public PositiveFiniteElement
 };
 
 
+/// Arbitrary order H1 elements in 3D on a wedge
 class H1_WedgeElement : public NodalFiniteElement
 {
 private:
@@ -2154,6 +2338,7 @@ class H1_WedgeElement : public NodalFiniteElement
    H1_SegmentElement  SegmentFE;
 
 public:
+   /// Construct the H1_WedgeElement of order @a p and BasisType @a btype
    H1_WedgeElement(const int p,
                    const int btype = BasisType::GaussLobatto);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -2185,6 +2370,7 @@ class BiCubic3DFiniteElement : public H1_WedgeElement
    BiCubic3DFiniteElement() : H1_WedgeElement(3) {}
 };
 
+/// Arbitrary order H1 elements in 3D utilizing the Bernstein basis on a wedge
 class H1Pos_WedgeElement : public PositiveFiniteElement
 {
 protected:
@@ -2198,6 +2384,7 @@ class H1Pos_WedgeElement : public PositiveFiniteElement
    H1Pos_SegmentElement  SegmentFE;
 
 public:
+   /// Construct the H1Pos_WedgeElement of order @a p
    H1Pos_WedgeElement(const int p);
 
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -2206,6 +2393,7 @@ class H1Pos_WedgeElement : public PositiveFiniteElement
 };
 
 
+/// Arbitrary order L2 elements in 1D on a segment
 class L2_SegmentElement : public NodalTensorFiniteElement
 {
 private:
@@ -2214,6 +2402,7 @@ class L2_SegmentElement : public NodalTensorFiniteElement
 #endif
 
 public:
+   /// Construct the L2_SegmentElement of order @a p and BasisType @a btype
    L2_SegmentElement(const int p, const int btype = BasisType::GaussLegendre);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -2221,7 +2410,7 @@ class L2_SegmentElement : public NodalTensorFiniteElement
    virtual void ProjectDelta(int vertex, Vector &dofs) const;
 };
 
-
+/// Arbitrary order L2 elements in 1D utilizing the Bernstein basis on a segment
 class L2Pos_SegmentElement : public PositiveTensorFiniteElement
 {
 private:
@@ -2230,6 +2419,7 @@ class L2Pos_SegmentElement : public PositiveTensorFiniteElement
 #endif
 
 public:
+   /// Construct the L2Pos_SegmentElement of order @a p
    L2Pos_SegmentElement(const int p);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -2238,6 +2428,7 @@ class L2Pos_SegmentElement : public PositiveTensorFiniteElement
 };
 
 
+/// Arbitrary order L2 elements in 2D on a square
 class L2_QuadrilateralElement : public NodalTensorFiniteElement
 {
 private:
@@ -2246,6 +2437,7 @@ class L2_QuadrilateralElement : public NodalTensorFiniteElement
 #endif
 
 public:
+   /// Construct the L2_QuadrilateralElement of order @a p and BasisType @a btype
    L2_QuadrilateralElement(const int p,
                            const int btype = BasisType::GaussLegendre);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -2258,7 +2450,7 @@ class L2_QuadrilateralElement : public NodalTensorFiniteElement
    { ProjectCurl_2D(fe, Trans, curl); }
 };
 
-
+/// Arbitrary order L2 elements in 2D utilizing the Bernstein basis on a square
 class L2Pos_QuadrilateralElement : public PositiveTensorFiniteElement
 {
 private:
@@ -2267,6 +2459,7 @@ class L2Pos_QuadrilateralElement : public PositiveTensorFiniteElement
 #endif
 
 public:
+   /// Construct the L2Pos_QuadrilateralElement of order @a p
    L2Pos_QuadrilateralElement(const int p);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -2274,7 +2467,7 @@ class L2Pos_QuadrilateralElement : public PositiveTensorFiniteElement
    virtual void ProjectDelta(int vertex, Vector &dofs) const;
 };
 
-
+/// Arbitrary order L2 elements in 3D on a cube
 class L2_HexahedronElement : public NodalTensorFiniteElement
 {
 private:
@@ -2283,6 +2476,7 @@ class L2_HexahedronElement : public NodalTensorFiniteElement
 #endif
 
 public:
+   /// Construct the L2_HexahedronElement of order @a p and BasisType @a btype
    L2_HexahedronElement(const int p,
                         const int btype = BasisType::GaussLegendre);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -2292,6 +2486,7 @@ class L2_HexahedronElement : public NodalTensorFiniteElement
 };
 
 
+/// Arbitrary order L2 elements in 3D utilizing the Bernstein basis on a cube
 class L2Pos_HexahedronElement : public PositiveTensorFiniteElement
 {
 private:
@@ -2300,6 +2495,7 @@ class L2Pos_HexahedronElement : public PositiveTensorFiniteElement
 #endif
 
 public:
+   /// Construct the L2Pos_HexahedronElement of order @a p
    L2Pos_HexahedronElement(const int p);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -2308,6 +2504,7 @@ class L2Pos_HexahedronElement : public PositiveTensorFiniteElement
 };
 
 
+/// Arbitrary order L2 elements in 2D on a triangle
 class L2_TriangleElement : public NodalFiniteElement
 {
 private:
@@ -2318,6 +2515,7 @@ class L2_TriangleElement : public NodalFiniteElement
    DenseMatrixInverse Ti;
 
 public:
+   /// Construct the L2_TriangleElement of order @a p and BasisType @a btype
    L2_TriangleElement(const int p,
                       const int btype = BasisType::GaussLegendre);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -2330,7 +2528,7 @@ class L2_TriangleElement : public NodalFiniteElement
    { ProjectCurl_2D(fe, Trans, curl); }
 };
 
-
+/// Arbitrary order L2 elements in 2D utilizing the Bernstein basis on a triangle
 class L2Pos_TriangleElement : public PositiveFiniteElement
 {
 private:
@@ -2339,6 +2537,7 @@ class L2Pos_TriangleElement : public PositiveFiniteElement
 #endif
 
 public:
+   /// Construct the L2Pos_TriangleElement of order @a p
    L2Pos_TriangleElement(const int p);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -2347,6 +2546,7 @@ class L2Pos_TriangleElement : public PositiveFiniteElement
 };
 
 
+/// Arbitrary order L2 elements in 3D on a tetrahedron
 class L2_TetrahedronElement : public NodalFiniteElement
 {
 private:
@@ -2358,6 +2558,7 @@ class L2_TetrahedronElement : public NodalFiniteElement
    DenseMatrixInverse Ti;
 
 public:
+   /// Construct the L2_TetrahedronElement of order @a p and BasisType @a btype
    L2_TetrahedronElement(const int p,
                          const int btype = BasisType::GaussLegendre);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -2367,6 +2568,8 @@ class L2_TetrahedronElement : public NodalFiniteElement
 };
 
 
+/// Arbitrary order L2 elements in 3D utilizing the Bernstein basis on a
+/// tetrahedron
 class L2Pos_TetrahedronElement : public PositiveFiniteElement
 {
 private:
@@ -2375,6 +2578,7 @@ class L2Pos_TetrahedronElement : public PositiveFiniteElement
 #endif
 
 public:
+   /// Construct the L2Pos_TetrahedronElement of order @a p
    L2Pos_TetrahedronElement(const int p);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
    virtual void CalcDShape(const IntegrationPoint &ip,
@@ -2383,6 +2587,7 @@ class L2Pos_TetrahedronElement : public PositiveFiniteElement
 };
 
 
+/// Arbitrary order L2 elements in 3D on a wedge
 class L2_WedgeElement : public NodalFiniteElement
 {
 private:
@@ -2396,6 +2601,7 @@ class L2_WedgeElement : public NodalFiniteElement
    L2_SegmentElement  SegmentFE;
 
 public:
+   /// Construct the L2_WedgeElement of order @a p and BasisType @a btype
    L2_WedgeElement(const int p,
                    const int btype = BasisType::GaussLegendre);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -2403,12 +2609,15 @@ class L2_WedgeElement : public NodalFiniteElement
                            DenseMatrix &dshape) const;
 };
 
+/// A 0th order L2 element on a wedge
 class P0WedgeFiniteElement : public L2_WedgeElement
 {
 public:
+   /// Construct the P0WedgeFiniteElement
    P0WedgeFiniteElement () : L2_WedgeElement(0) {}
 };
 
+/// Arbitrary order L2 elements in 3D utilizing the Bernstein basis on a wedge
 class L2Pos_WedgeElement : public PositiveFiniteElement
 {
 protected:
@@ -2422,6 +2631,7 @@ class L2Pos_WedgeElement : public PositiveFiniteElement
    L2Pos_SegmentElement  SegmentFE;
 
 public:
+   /// Construct the L2Pos_WedgeElement of order @a p
    L2Pos_WedgeElement(const int p);
 
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -2429,20 +2639,21 @@ class L2Pos_WedgeElement : public PositiveFiniteElement
                            DenseMatrix &dshape) const;
 };
 
-
-class RT_QuadrilateralElement : public VectorFiniteElement
+/// Arbitrary order Raviart-Thomas elements in 2D on a square
+class RT_QuadrilateralElement : public VectorTensorFiniteElement
 {
 private:
    static const double nk[8];
 
-   Poly_1D::Basis &cbasis1d, &obasis1d;
 #ifndef MFEM_THREAD_SAFE
    mutable Vector shape_cx, shape_ox, shape_cy, shape_oy;
    mutable Vector dshape_cx, dshape_cy;
 #endif
-   Array<int> dof_map, dof2nk;
+   Array<int> dof2nk;
 
 public:
+   /** @brief Construct the RT_QuadrilateralElement of order @a p and closed and
+       open BasisType @a cb_type and @a ob_type */
    RT_QuadrilateralElement(const int p,
                            const int cb_type = BasisType::GaussLobatto,
                            const int ob_type = BasisType::GaussLegendre);
@@ -2486,18 +2697,20 @@ class RT_QuadrilateralElement : public VectorFiniteElement
 };
 
 
-class RT_HexahedronElement : public VectorFiniteElement
+/// Arbitrary order Raviart-Thomas elements in 3D on a cube
+class RT_HexahedronElement : public VectorTensorFiniteElement
 {
    static const double nk[18];
 
-   Poly_1D::Basis &cbasis1d, &obasis1d;
 #ifndef MFEM_THREAD_SAFE
    mutable Vector shape_cx, shape_ox, shape_cy, shape_oy, shape_cz, shape_oz;
    mutable Vector dshape_cx, dshape_cy, dshape_cz;
 #endif
-   Array<int> dof_map, dof2nk;
+   Array<int> dof2nk;
 
 public:
+   /** @brief Construct the RT_HexahedronElement of order @a p and closed and
+       open BasisType @a cb_type and @a ob_type */
    RT_HexahedronElement(const int p,
                         const int cb_type = BasisType::GaussLobatto,
                         const int ob_type = BasisType::GaussLegendre);
@@ -2536,6 +2749,7 @@ class RT_HexahedronElement : public VectorFiniteElement
 };
 
 
+/// Arbitrary order Raviart-Thomas elements in 2D on a triangle
 class RT_TriangleElement : public VectorFiniteElement
 {
    static const double nk[6], c;
@@ -2550,6 +2764,7 @@ class RT_TriangleElement : public VectorFiniteElement
    DenseMatrixInverse Ti;
 
 public:
+   /// Construct the RT_TriangleElement of order @a p
    RT_TriangleElement(const int p);
    virtual void CalcVShape(const IntegrationPoint &ip,
                            DenseMatrix &shape) const;
@@ -2591,6 +2806,7 @@ class RT_TriangleElement : public VectorFiniteElement
 };
 
 
+/// Arbitrary order Raviart-Thomas elements in 3D on a tetrahedron
 class RT_TetrahedronElement : public VectorFiniteElement
 {
    static const double nk[12], c;
@@ -2605,6 +2821,7 @@ class RT_TetrahedronElement : public VectorFiniteElement
    DenseMatrixInverse Ti;
 
 public:
+   /// Construct the RT_TetrahedronElement of order @a p
    RT_TetrahedronElement(const int p);
    virtual void CalcVShape(const IntegrationPoint &ip,
                            DenseMatrix &shape) const;
@@ -2640,6 +2857,7 @@ class RT_TetrahedronElement : public VectorFiniteElement
 };
 
 
+/// Arbitrary order Nedelec elements in 3D on a cube
 class ND_HexahedronElement : public VectorTensorFiniteElement
 {
    static const double tk[18];
@@ -2650,6 +2868,8 @@ class ND_HexahedronElement : public VectorTensorFiniteElement
    Array<int> dof2tk;
 
 public:
+   /** @brief Construct the ND_HexahedronElement of order @a p and closed and
+       open BasisType @a cb_type and @a ob_type */
    ND_HexahedronElement(const int p,
                         const int cb_type = BasisType::GaussLobatto,
                         const int ob_type = BasisType::GaussLegendre);
@@ -2704,6 +2924,7 @@ class ND_HexahedronElement : public VectorTensorFiniteElement
 };
 
 
+/// Arbitrary order Nedelec elements in 2D on a square
 class ND_QuadrilateralElement : public VectorTensorFiniteElement
 {
    static const double tk[8];
@@ -2715,6 +2936,8 @@ class ND_QuadrilateralElement : public VectorTensorFiniteElement
    Array<int> dof2tk;
 
 public:
+   /** @brief Construct the ND_QuadrilateralElement of order @a p and closed and
+       open BasisType @a cb_type and @a ob_type */
    ND_QuadrilateralElement(const int p,
                            const int cb_type = BasisType::GaussLobatto,
                            const int ob_type = BasisType::GaussLegendre);
@@ -2753,6 +2976,7 @@ class ND_QuadrilateralElement : public VectorTensorFiniteElement
 };
 
 
+/// Arbitrary order Nedelec elements in 3D on a tetrahedron
 class ND_TetrahedronElement : public VectorFiniteElement
 {
    static const double tk[18], c;
@@ -2766,6 +2990,7 @@ class ND_TetrahedronElement : public VectorFiniteElement
    DenseMatrixInverse Ti;
 
 public:
+   /// Construct the ND_TetrahedronElement of order @a p
    ND_TetrahedronElement(const int p);
    virtual void CalcVShape(const IntegrationPoint &ip,
                            DenseMatrix &shape) const;
@@ -2806,6 +3031,7 @@ class ND_TetrahedronElement : public VectorFiniteElement
    { ProjectCurl_ND(tk, dof2tk, fe, Trans, curl); }
 };
 
+/// Arbitrary order Nedelec elements in 2D on a triangle
 class ND_TriangleElement : public VectorFiniteElement
 {
    static const double tk[8], c;
@@ -2820,6 +3046,7 @@ class ND_TriangleElement : public VectorFiniteElement
    DenseMatrixInverse Ti;
 
 public:
+   /// Construct the ND_TriangleElement of order @a p
    ND_TriangleElement(const int p);
    virtual void CalcVShape(const IntegrationPoint &ip,
                            DenseMatrix &shape) const;
@@ -2856,6 +3083,7 @@ class ND_TriangleElement : public VectorFiniteElement
 };
 
 
+/// Arbitrary order Nedelec elements in 1D on a segment
 class ND_SegmentElement : public VectorFiniteElement
 {
    static const double tk[1];
@@ -2864,6 +3092,8 @@ class ND_SegmentElement : public VectorFiniteElement
    Array<int> dof2tk;
 
 public:
+   /** @brief Construct the ND_SegmentElement of order @a p and open
+       BasisType @a ob_type */
    ND_SegmentElement(const int p, const int ob_type = BasisType::GaussLegendre);
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const
    { obasis1d.Eval(ip.x, shape); }
@@ -2902,6 +3132,7 @@ class ND_SegmentElement : public VectorFiniteElement
 };
 
 
+/// An arbitrary order and dimension NURBS element
 class NURBSFiniteElement : public ScalarFiniteElement
 {
 protected:
@@ -2911,13 +3142,20 @@ class NURBSFiniteElement : public ScalarFiniteElement
    mutable Vector weights;
 
 public:
+   /** @brief Construct NURBSFiniteElement with given
+       @param D    Reference space dimension
+       @param G    Geometry type (of type Geometry::Type)
+       @param Do   Number of degrees of freedom in the FiniteElement
+       @param O    Order/degree of the FiniteElement
+       @param F    FunctionSpace type of the FiniteElement
+    */
    NURBSFiniteElement(int D, Geometry::Type G, int Do, int O, int F)
       : ScalarFiniteElement(D, G, Do, O, F)
    {
       ijk = NULL;
       patch = elem = -1;
-      kv.SetSize(Dim);
-      weights.SetSize(Dof);
+      kv.SetSize(dim);
+      weights.SetSize(dof);
       weights = 1.0;
    }
 
@@ -2933,12 +3171,15 @@ class NURBSFiniteElement : public ScalarFiniteElement
    virtual void         SetOrder   ()         const { }
 };
 
+
+/// An arbitrary order 1D NURBS element on a segment
 class NURBS1DFiniteElement : public NURBSFiniteElement
 {
 protected:
    mutable Vector shape_x;
 
 public:
+   /// Construct the NURBS1DFiniteElement of order @a p
    NURBS1DFiniteElement(int p)
       : NURBSFiniteElement(1, Geometry::SEGMENT, p + 1, p, FunctionSpace::Qk),
         shape_x(p + 1) { }
@@ -2951,6 +3192,7 @@ class NURBS1DFiniteElement : public NURBSFiniteElement
                              DenseMatrix &hessian) const;
 };
 
+/// An arbitrary order 2D NURBS element on a square
 class NURBS2DFiniteElement : public NURBSFiniteElement
 {
 protected:
@@ -2958,19 +3200,21 @@ class NURBS2DFiniteElement : public NURBSFiniteElement
    mutable DenseMatrix du;
 
 public:
+   /// Construct the NURBS2DFiniteElement of order @a p
    NURBS2DFiniteElement(int p)
       : NURBSFiniteElement(2, Geometry::SQUARE, (p + 1)*(p + 1), p,
                            FunctionSpace::Qk),
-        u(Dof), shape_x(p + 1), shape_y(p + 1), dshape_x(p + 1),
-        dshape_y(p + 1), d2shape_x(p + 1), d2shape_y(p + 1), du(Dof,2)
-   { Orders[0] = Orders[1] = p; }
+        u(dof), shape_x(p + 1), shape_y(p + 1), dshape_x(p + 1),
+        dshape_y(p + 1), d2shape_x(p + 1), d2shape_y(p + 1), du(dof,2)
+   { orders[0] = orders[1] = p; }
 
+   /// Construct the NURBS2DFiniteElement with x-order @a px and y-order @a py
    NURBS2DFiniteElement(int px, int py)
       : NURBSFiniteElement(2, Geometry::SQUARE, (px + 1)*(py + 1),
                            std::max(px, py), FunctionSpace::Qk),
-        u(Dof), shape_x(px + 1), shape_y(py + 1), dshape_x(px + 1),
-        dshape_y(py + 1), d2shape_x(px + 1), d2shape_y(py + 1), du(Dof,2)
-   { Orders[0] = px; Orders[1] = py; }
+        u(dof), shape_x(px + 1), shape_y(py + 1), dshape_x(px + 1),
+        dshape_y(py + 1), d2shape_x(px + 1), d2shape_y(py + 1), du(dof,2)
+   { orders[0] = px; orders[1] = py; }
 
    virtual void SetOrder() const;
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
@@ -2980,6 +3224,7 @@ class NURBS2DFiniteElement : public NURBSFiniteElement
                              DenseMatrix &hessian) const;
 };
 
+/// An arbitrary order 3D NURBS element on a cube
 class NURBS3DFiniteElement : public NURBSFiniteElement
 {
 protected:
@@ -2989,21 +3234,24 @@ class NURBS3DFiniteElement : public NURBSFiniteElement
    mutable DenseMatrix du;
 
 public:
+   /// Construct the NURBS3DFiniteElement of order @a p
    NURBS3DFiniteElement(int p)
       : NURBSFiniteElement(3, Geometry::CUBE, (p + 1)*(p + 1)*(p + 1), p,
                            FunctionSpace::Qk),
-        u(Dof), shape_x(p + 1), shape_y(p + 1), shape_z(p + 1),
+        u(dof), shape_x(p + 1), shape_y(p + 1), shape_z(p + 1),
         dshape_x(p + 1), dshape_y(p + 1), dshape_z(p + 1),
-        d2shape_x(p + 1), d2shape_y(p + 1), d2shape_z(p + 1), du(Dof,3)
-   { Orders[0] = Orders[1] = Orders[2] = p; }
+        d2shape_x(p + 1), d2shape_y(p + 1), d2shape_z(p + 1), du(dof,3)
+   { orders[0] = orders[1] = orders[2] = p; }
 
+   /// Construct the NURBS3DFiniteElement with x-order @a px, y-order @a py,
+   /// and z-order @a pz
    NURBS3DFiniteElement(int px, int py, int pz)
       : NURBSFiniteElement(3, Geometry::CUBE, (px + 1)*(py + 1)*(pz + 1),
                            std::max(std::max(px,py),pz), FunctionSpace::Qk),
-        u(Dof), shape_x(px + 1), shape_y(py + 1), shape_z(pz + 1),
+        u(dof), shape_x(px + 1), shape_y(py + 1), shape_z(pz + 1),
         dshape_x(px + 1), dshape_y(py + 1), dshape_z(pz + 1),
-        d2shape_x(px + 1), d2shape_y(py + 1), d2shape_z(pz + 1), du(Dof,3)
-   { Orders[0] = px; Orders[1] = py; Orders[2] = pz; }
+        d2shape_x(px + 1), d2shape_y(py + 1), d2shape_z(pz + 1), du(dof,3)
+   { orders[0] = px; orders[1] = py; orders[2] = pz; }
 
    virtual void SetOrder() const;
    virtual void CalcShape(const IntegrationPoint &ip, Vector &shape) const;
diff --git a/fem/fe_coll.cpp b/fem/fe_coll.cpp
index 50c541941b9..017775138d7 100644
--- a/fem/fe_coll.cpp
+++ b/fem/fe_coll.cpp
@@ -311,10 +311,10 @@ GetEdge(int &nv, v_t &v, int &ne, int &e, int &eo, const int edge_info)
    eo = edge_info%64;
    MFEM_ASSERT(0 <= e && e < g_consts::NumEdges, "");
    MFEM_ASSERT(0 <= eo && eo < e_consts::NumOrient, "");
-   v[0] = g_consts::Edges[e][0];
-   v[1] = g_consts::Edges[e][1];
-   v[0] = e_consts::Orient[eo][v[0]];
-   v[1] = e_consts::Orient[eo][v[1]];
+   v[0] = e_consts::Orient[eo][0];
+   v[1] = e_consts::Orient[eo][1];
+   v[0] = g_consts::Edges[e][v[0]];
+   v[1] = g_consts::Edges[e][v[1]];
 }
 
 template <Geometry::Type geom, Geometry::Type f_geom,
diff --git a/fem/fe_coll.hpp b/fem/fe_coll.hpp
index 7640b8e1db5..13de7cd5621 100644
--- a/fem/fe_coll.hpp
+++ b/fem/fe_coll.hpp
@@ -19,10 +19,10 @@
 namespace mfem
 {
 
-/** Collection of finite elements from the same family in multiple dimensions.
-    This class is used to match the degrees of freedom of a FiniteElementSpace
-    between elements, and to provide the finite element restriction from an
-    element to its boundary. */
+/** @brief Collection of finite elements from the same family in multiple
+    dimensions. This class is used to match the degrees of freedom of a
+    FiniteElementSpace between elements, and to provide the finite element
+    restriction from an element to its boundary. */
 class FiniteElementCollection
 {
 protected:
@@ -40,6 +40,14 @@ class FiniteElementCollection
                               const int face_info);
 
 public:
+   /** @brief Enumeration for ContType: defines the continuity of the field
+       across element interfaces. */
+   enum { CONTINUOUS,   ///< Field is continuous across element interfaces
+          TANGENTIAL,   ///< Tangential vector components are continuous
+          NORMAL,       ///< Normal vector component is continuous
+          DISCONTINUOUS ///< Field is discontinuous across element interfaces
+        };
+
    virtual const FiniteElement *
    FiniteElementForGeometry(Geometry::Type GeomType) const = 0;
 
@@ -52,6 +60,8 @@ class FiniteElementCollection
 
    virtual const char * Name() const { return "Undefined"; }
 
+   virtual int GetContType() const = 0;
+
    int HasFaceDofs(Geometry::Type GeomType) const;
 
    virtual const FiniteElement *TraceFiniteElementForGeometry(
@@ -66,15 +76,81 @@ class FiniteElementCollection
 
    /** @brief Factory method: return a newly allocated FiniteElementCollection
        according to the given name. */
+   /**
+   | FEC Name | Space | Order | BasisType | FiniteElement::MapT | Notes |
+   | :------: | :---: | :---: | :-------: | :-----: | :---: |
+   | H1_[DIM]_[ORDER] | H1 | * | 1 | VALUE | H1 nodal elements |
+   | H1@[BTYPE]_[DIM]_[ORDER] | H1 | * | * | VALUE | H1 nodal elements |
+   | H1Pos_[DIM]_[ORDER] | H1 | * | 2 | VALUE | H1 elements with positive (Bernstein) basis |
+   | H1Pos_Trace_[DIM]_[ORDER] | H^{1/2} | * | 2 | VALUE | H^{1/2}-conforming trace elements for H1 defined on the interface between mesh elements (faces,edges,vertices) |
+   | H1_Trace_[DIM]_[ORDER] | H^{1/2} | * | 1 | VALUE | H^{1/2}-conforming trace elements for H1 defined on the interface between mesh elements (faces,edges,vertices) |
+   | H1_Trace@[BTYPE]_[DIM]_[ORDER] | H^{1/2} | * | 1 | VALUE | H^{1/2}-conforming trace elements for H1 defined on the interface between mesh elements (faces,edges,vertices) |
+   | ND_[DIM]_[ORDER] | H(curl) | * | 1 / 0 | H_CURL | Nedelec vector elements |
+   | ND@[CBTYPE][OBTYPE]_[DIM]_[ORDER] | H(curl) | * | * / * | H_CURL | Nedelec vector elements |
+   | ND_Trace_[DIM]_[ORDER] | H^{1/2} | * | 1 / 0  | H_CURL | H^{1/2}-conforming trace elements for H(curl) defined on the interface between mesh elements (faces) |
+   | ND_Trace@[CBTYPE][OBTYPE]_[DIM]_[ORDER] | H^{1/2} | * | 1 / 0 | H_CURL | H^{1/2}-conforming trace elements for H(curl) defined on the interface between mesh elements (faces) |
+   | RT_[DIM]_[ORDER] | H(div) | * | 1 / 0 | H_DIV | Raviart-Thomas vector elements |
+   | RT@[CBTYPE][OBTYPE]_[DIM]_[ORDER] | H(div) | * | * / * | H_DIV | Raviart-Thomas vector elements |
+   | RT_Trace_[DIM]_[ORDER] | H^{1/2} | * | 1 / 0 | INTEGRAL | H^{1/2}-conforming trace elements for H(div) defined on the interface between mesh elements (faces) |
+   | RT_ValTrace_[DIM]_[ORDER] | H^{1/2} | * | 1 / 0 | VALUE | H^{1/2}-conforming trace elements for H(div) defined on the interface between mesh elements (faces) |
+   | RT_Trace@[BTYPE]_[DIM]_[ORDER] | H^{1/2} | * | 1 / 0 | INTEGRAL | H^{1/2}-conforming trace elements for H(div) defined on the interface between mesh elements (faces) |
+   | RT_ValTrace@[BTYPE]_[DIM]_[ORDER] |  H^{1/2} | * | 1 / 0 | VALUE | H^{1/2}-conforming trace elements for H(div) defined on the interface between mesh elements (faces) |
+   | L2_[DIM]_[ORDER] | L2 | * | 0 | VALUE | Discontinuous L2 elements |
+   | L2_T[BTYPE]_[DIM]_[ORDER] | L2 | * | 0 | VALUE | Discontinuous L2 elements |
+   | L2Int_[DIM]_[ORDER] | L2 | * | 0 | INTEGRAL | Discontinuous L2 elements |
+   | L2Int_T[BTYPE]_[DIM]_[ORDER] | L2 | * | 0 | INTEGRAL | Discontinuous L2 elements |
+   | DG_Iface_[DIM]_[ORDER] | - | * | 0 | VALUE | Discontinuous elements on the interface between mesh elements (faces) |
+   | DG_Iface@[BTYPE]_[DIM]_[ORDER] | - | * | 0 | VALUE | Discontinuous elements on the interface between mesh elements (faces) |
+   | DG_IntIface_[DIM]_[ORDER] | - | * | 0 | INTEGRAL | Discontinuous elements on the interface between mesh elements (faces) |
+   | DG_IntIface@[BTYPE]_[DIM]_[ORDER] | - | * | 0 | INTEGRAL | Discontinuous elements on the interface between mesh elements (faces) |
+   | NURBS[ORDER] | - | * | - | VALUE | Non-Uniform Rational B-Splines (NURBS) elements |
+   | LinearNonConf3D | - | 1 | 1 | VALUE | Piecewise-linear nonconforming finite elements in 3D |
+   | CrouzeixRaviart | - | - | - | - | Crouzeix-Raviart nonconforming elements in 2D |
+   | Local_[FENAME] | - | - | - | - | Special collection that builds a local version out of the FENAME collection |
+   |-|-|-|-|-|-|
+   | Linear | H1 | 1 | 1 | VALUE | Left in for backward compatibility, consider using H1_ |
+   | Quadratic | H1 | 2 | 1 | VALUE | Left in for backward compatibility, consider using H1_ |
+   | QuadraticPos | H1 | 2 | 2 | VALUE | Left in for backward compatibility, consider using H1_ |
+   | Cubic | H1 | 3 | 1 | VALUE | Left in for backward compatibility, consider using H1_ |
+   | Const2D | L2 | 0 | 1 | VALUE | Left in for backward compatibility, consider using L2_ |
+   | Const3D | L2 | 0 | 1 | VALUE | Left in for backward compatibility, consider using L2_ |
+   | LinearDiscont2D | L2 | 1 | 1 | VALUE | Left in for backward compatibility, consider using L2_ |
+   | GaussLinearDiscont2D | L2 | 1 | 0 | VALUE | Left in for backward compatibility, consider using L2_ |
+   | P1OnQuad | H1 | 1 | 1 | VALUE | Linear P1 element with 3 nodes on a square |
+   | QuadraticDiscont2D | L2 | 2 | 1 | VALUE | Left in for backward compatibility, consider using L2_ |
+   | QuadraticPosDiscont2D | L2 | 2 | 2 | VALUE | Left in for backward compatibility, consider using L2_ |
+   | GaussQuadraticDiscont2D | L2 | 2 | 0 | VALUE | Left in for backward compatibility, consider using L2_ |
+   | CubicDiscont2D | L2 | 3 | 1 | VALUE | Left in for backward compatibility, consider using L2_ |
+   | LinearDiscont3D | L2 | 1 | 1 | VALUE | Left in for backward compatibility, consider using L2_ |
+   | QuadraticDiscont3D | L2 | 2 | 1 | VALUE | Left in for backward compatibility, consider using L2_ |
+   | ND1_3D | H(Curl) | 1 | 1 / 0 | H_CURL | Left in for backward compatibility, consider using ND_ |
+   | RT0_2D | H(Div) | 1 | 1 / 0 | H_DIV | Left in for backward compatibility, consider using RT_ |
+   | RT1_2D | H(Div) | 2 | 1 / 0 | H_DIV | Left in for backward compatibility, consider using RT_ |
+   | RT2_2D | H(Div) | 3 | 1 / 0 | H_DIV | Left in for backward compatibility, consider using RT_ |
+   | RT0_3D | H(Div) | 1 | 1 / 0 | H_DIV | Left in for backward compatibility, consider using RT_ |
+   | RT1_3D | H(Div) | 2 | 1 / 0 | H_DIV | Left in for backward compatibility, consider using RT_ |
+
+   | Tag | Description |
+   | :------: | :--------: |
+   | [DIM]    | Dimension of the elements (1D, 2D, 3D) |
+   | [ORDER]  | Approximation order of the elements (P0, P1, P2, ...) |
+   | [BTYPE]  | BasisType of the element (0-GaussLegendre, 1-GaussLobatto, 2-Bernstein, 3-OpenUniform, 4-ClosedUniform, 5-OpenHalfUniform) |
+   | [OBTYPE] | Open BasisType of the element for elements which have both types |
+   | [CBTYPE] | Closed BasisType of the element for elements which have both types |
+
+   [FENAME] is a special case for the Local FEC, which generates a local version of a given
+   FEC. It must be one of (BiCubic2DFiniteElement, Quad_Q3, Nedelec1HexFiniteElement,
+   Hex_ND1, H1_[DIM]_[ORDER], H1Pos_[DIM]_[ORDER], L2_[DIM]_[ORDER]).
+   */
    static FiniteElementCollection *New(const char *name);
 
    /** @brief Get the local dofs for a given sub-manifold.
 
-      Return the local dofs for a SDim-dimensional sub-manifold (0D - vertex,
-      1D - edge, 2D - face) including those on its boundary. The local index of
-      the sub-manifold (inside Geom) and its orientation are given by the
-      parameter Info = 64 * SubIndex + SubOrientation. Naturally, it is assumed
-      that 0 <= SDim <= Dim(Geom). */
+      Return the local dofs for a SDim-dimensional sub-manifold (0D - vertex, 1D
+      - edge, 2D - face) including those on its boundary. The local index of the
+      sub-manifold (inside Geom) and its orientation are given by the parameter
+      Info = 64 * SubIndex + SubOrientation. Naturally, it is assumed that 0 <=
+      SDim <= Dim(Geom). */
    void SubDofOrder(Geometry::Type Geom, int SDim, int Info,
                     Array<int> &dofs) const;
 };
@@ -102,6 +178,7 @@ class H1_FECollection : public FiniteElementCollection
    virtual const int *DofOrderForOrientation(Geometry::Type GeomType,
                                              int Or) const;
    virtual const char *Name() const { return h1_name; }
+   virtual int GetContType() const { return CONTINUOUS; }
    FiniteElementCollection *GetTraceCollection() const;
 
    int GetBasisType() const { return b_type; }
@@ -111,8 +188,8 @@ class H1_FECollection : public FiniteElementCollection
    virtual ~H1_FECollection();
 };
 
-/** Arbitrary order H1-conforming (continuous) finite elements with positive
-    basis functions. */
+/** @brief Arbitrary order H1-conforming (continuous) finite elements with
+    positive basis functions. */
 class H1Pos_FECollection : public H1_FECollection
 {
 public:
@@ -120,6 +197,7 @@ class H1Pos_FECollection : public H1_FECollection
       : H1_FECollection(p, dim, BasisType::Positive) { }
 };
 
+
 /** Arbitrary order H1-conforming (continuous) serendipity finite elements;
     Current implementation works in 2D only; 3D version is in development. */
 class H1Ser_FECollection : public H1_FECollection
@@ -129,9 +207,9 @@ class H1Ser_FECollection : public H1_FECollection
       : H1_FECollection(p, dim, BasisType::Serendipity) { };
 };
 
-/** Arbitrary order "H^{1/2}-conforming" trace finite elements defined on the
-    interface between mesh elements (faces,edges,vertices); these are the trace
-    FEs of the H1-conforming FEs. */
+/** @brief Arbitrary order "H^{1/2}-conforming" trace finite elements defined on
+    the interface between mesh elements (faces,edges,vertices); these are the
+    trace FEs of the H1-conforming FEs. */
 class H1_Trace_FECollection : public H1_FECollection
 {
 public:
@@ -174,6 +252,8 @@ class L2_FECollection : public FiniteElementCollection
                                              int Or) const;
    virtual const char *Name() const { return d_name; }
 
+   virtual int GetContType() const { return DISCONTINUOUS; }
+
    virtual const FiniteElement *TraceFiniteElementForGeometry(
       Geometry::Type GeomType) const
    {
@@ -221,14 +301,15 @@ class RT_FECollection : public FiniteElementCollection
    virtual const int *DofOrderForOrientation(Geometry::Type GeomType,
                                              int Or) const;
    virtual const char *Name() const { return rt_name; }
+   virtual int GetContType() const { return NORMAL; }
    FiniteElementCollection *GetTraceCollection() const;
 
    virtual ~RT_FECollection();
 };
 
-/** Arbitrary order "H^{-1/2}-conforming" face finite elements defined on the
-    interface between mesh elements (faces); these are the normal trace FEs of
-    the H(div)-conforming FEs. */
+/** @brief Arbitrary order "H^{-1/2}-conforming" face finite elements defined on
+    the interface between mesh elements (faces); these are the normal trace FEs
+    of the H(div)-conforming FEs. */
 class RT_Trace_FECollection : public RT_FECollection
 {
 public:
@@ -270,14 +351,15 @@ class ND_FECollection : public FiniteElementCollection
    virtual const int *DofOrderForOrientation(Geometry::Type GeomType,
                                              int Or) const;
    virtual const char *Name() const { return nd_name; }
+   virtual int GetContType() const { return TANGENTIAL; }
    FiniteElementCollection *GetTraceCollection() const;
 
    virtual ~ND_FECollection();
 };
 
-/** Arbitrary order H(curl)-trace finite elements defined on the interface
-    between mesh elements (faces,edges); these are the tangential trace FEs of
-    the H(curl)-conforming FEs. */
+/** @brief Arbitrary order H(curl)-trace finite elements defined on the
+    interface between mesh elements (faces,edges); these are the tangential
+    trace FEs of the H(curl)-conforming FEs. */
 class ND_Trace_FECollection : public ND_FECollection
 {
 public:
@@ -334,13 +416,15 @@ class NURBSFECollection : public FiniteElementCollection
 
    virtual const char *Name() const { return name; }
 
+   virtual int GetContType() const { return CONTINUOUS; }
+
    FiniteElementCollection *GetTraceCollection() const;
 
    virtual ~NURBSFECollection();
 };
 
 
-/// Piecewise-(bi)linear continuous finite elements.
+/// Piecewise-(bi/tri)linear continuous finite elements.
 class LinearFECollection : public FiniteElementCollection
 {
 private:
@@ -363,6 +447,8 @@ class LinearFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "Linear"; }
+
+   virtual int GetContType() const { return CONTINUOUS; }
 };
 
 /// Piecewise-(bi)quadratic continuous finite elements.
@@ -389,6 +475,8 @@ class QuadraticFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "Quadratic"; }
+
+   virtual int GetContType() const { return CONTINUOUS; }
 };
 
 /// Version of QuadraticFECollection with positive basis functions.
@@ -410,6 +498,8 @@ class QuadraticPosFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "QuadraticPos"; }
+
+   virtual int GetContType() const { return CONTINUOUS; }
 };
 
 /// Piecewise-(bi)cubic continuous finite elements.
@@ -437,6 +527,8 @@ class CubicFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "Cubic"; }
+
+   virtual int GetContType() const { return CONTINUOUS; }
 };
 
 /// Crouzeix-Raviart nonconforming elements in 2D.
@@ -458,6 +550,8 @@ class CrouzeixRaviartFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "CrouzeixRaviart"; }
+
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
 /// Piecewise-linear nonconforming finite elements in 3D.
@@ -481,11 +575,13 @@ class LinearNonConf3DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "LinearNonConf3D"; }
+
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
 
-/** First order Raviart-Thomas finite elements in 2D. This class is kept only
-    for backward compatibility, consider using RT_FECollection instead. */
+/** @brief First order Raviart-Thomas finite elements in 2D. This class is kept
+    only for backward compatibility, consider using RT_FECollection instead. */
 class RT0_2DFECollection : public FiniteElementCollection
 {
 private:
@@ -504,10 +600,12 @@ class RT0_2DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "RT0_2D"; }
+
+   virtual int GetContType() const { return NORMAL; }
 };
 
-/** Second order Raviart-Thomas finite elements in 2D. This class is kept only
-    for backward compatibility, consider using RT_FECollection instead. */
+/** @brief Second order Raviart-Thomas finite elements in 2D. This class is kept
+    only for backward compatibility, consider using RT_FECollection instead. */
 class RT1_2DFECollection : public FiniteElementCollection
 {
 private:
@@ -526,10 +624,12 @@ class RT1_2DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "RT1_2D"; }
+
+   virtual int GetContType() const { return NORMAL; }
 };
 
-/** Third order Raviart-Thomas finite elements in 2D. This class is kept only
-    for backward compatibility, consider using RT_FECollection instead. */
+/** @brief Third order Raviart-Thomas finite elements in 2D. This class is kept
+    only for backward compatibility, consider using RT_FECollection instead. */
 class RT2_2DFECollection : public FiniteElementCollection
 {
 private:
@@ -548,10 +648,13 @@ class RT2_2DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "RT2_2D"; }
+
+   virtual int GetContType() const { return NORMAL; }
 };
 
-/** Piecewise-constant discontinuous finite elements in 2D. This class is kept
-    only for backward compatibility, consider using L2_FECollection instead. */
+/** @brief Piecewise-constant discontinuous finite elements in 2D. This class is
+    kept only for backward compatibility, consider using L2_FECollection
+    instead. */
 class Const2DFECollection : public FiniteElementCollection
 {
 private:
@@ -569,10 +672,13 @@ class Const2DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "Const2D"; }
+
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
-/** Piecewise-linear discontinuous finite elements in 2D. This class is kept
-    only for backward compatibility, consider using L2_FECollection instead. */
+/** @brief Piecewise-linear discontinuous finite elements in 2D. This class is
+    kept only for backward compatibility, consider using L2_FECollection
+    instead. */
 class LinearDiscont2DFECollection : public FiniteElementCollection
 {
 private:
@@ -591,6 +697,8 @@ class LinearDiscont2DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "LinearDiscont2D"; }
+
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
 /// Version of LinearDiscont2DFECollection with dofs in the Gaussian points.
@@ -613,6 +721,8 @@ class GaussLinearDiscont2DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "GaussLinearDiscont2D"; }
+
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
 /// Linear (P1) finite elements on quadrilaterals.
@@ -628,10 +738,12 @@ class P1OnQuadFECollection : public FiniteElementCollection
    virtual const int *DofOrderForOrientation(Geometry::Type GeomType,
                                              int Or) const;
    virtual const char * Name() const { return "P1OnQuad"; }
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
-/** Piecewise-quadratic discontinuous finite elements in 2D. This class is kept
-    only for backward compatibility, consider using L2_FECollection instead. */
+/** @brief Piecewise-quadratic discontinuous finite elements in 2D. This class
+    is kept only for backward compatibility, consider using L2_FECollection
+    instead. */
 class QuadraticDiscont2DFECollection : public FiniteElementCollection
 {
 private:
@@ -650,6 +762,7 @@ class QuadraticDiscont2DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "QuadraticDiscont2D"; }
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
 /// Version of QuadraticDiscont2DFECollection with positive basis functions.
@@ -667,6 +780,7 @@ class QuadraticPosDiscont2DFECollection : public FiniteElementCollection
                                              int Or) const
    { return NULL; }
    virtual const char * Name() const { return "QuadraticPosDiscont2D"; }
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
 /// Version of QuadraticDiscont2DFECollection with dofs in the Gaussian points.
@@ -689,10 +803,12 @@ class GaussQuadraticDiscont2DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "GaussQuadraticDiscont2D"; }
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
-/** Piecewise-cubic discontinuous finite elements in 2D. This class is kept
-    only for backward compatibility, consider using L2_FECollection instead. */
+/** @brief Piecewise-cubic discontinuous finite elements in 2D. This class is
+    kept only for backward compatibility, consider using L2_FECollection
+    instead. */
 class CubicDiscont2DFECollection : public FiniteElementCollection
 {
 private:
@@ -711,10 +827,12 @@ class CubicDiscont2DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "CubicDiscont2D"; }
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
-/** Piecewise-constant discontinuous finite elements in 3D. This class is kept
-    only for backward compatibility, consider using L2_FECollection instead. */
+/** @brief Piecewise-constant discontinuous finite elements in 3D. This class is
+    kept only for backward compatibility, consider using L2_FECollection
+    instead. */
 class Const3DFECollection : public FiniteElementCollection
 {
 private:
@@ -734,10 +852,12 @@ class Const3DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "Const3D"; }
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
-/** Piecewise-linear discontinuous finite elements in 3D. This class is kept
-    only for backward compatibility, consider using L2_FECollection instead. */
+/** @brief Piecewise-linear discontinuous finite elements in 3D. This class is
+    kept only for backward compatibility, consider using L2_FECollection
+    instead. */
 class LinearDiscont3DFECollection : public FiniteElementCollection
 {
 private:
@@ -756,10 +876,12 @@ class LinearDiscont3DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "LinearDiscont3D"; }
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
-/** Piecewise-quadratic discontinuous finite elements in 3D. This class is kept
-    only for backward compatibility, consider using L2_FECollection instead. */
+/** @brief Piecewise-quadratic discontinuous finite elements in 3D. This class
+    is kept only for backward compatibility, consider using L2_FECollection
+    instead. */
 class QuadraticDiscont3DFECollection : public FiniteElementCollection
 {
 private:
@@ -778,6 +900,7 @@ class QuadraticDiscont3DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "QuadraticDiscont3D"; }
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
 /// Finite element collection on a macro-element.
@@ -803,10 +926,12 @@ class RefinedLinearFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "RefinedLinear"; }
+   virtual int GetContType() const { return CONTINUOUS; }
 };
 
-/** Lowest order Nedelec finite elements in 3D. This class is kept only for
-    backward compatibility, consider using the new ND_FECollection instead. */
+/** @brief Lowest order Nedelec finite elements in 3D. This class is kept only
+    for backward compatibility, consider using the new ND_FECollection
+    instead. */
 class ND1_3DFECollection : public FiniteElementCollection
 {
 private:
@@ -825,10 +950,11 @@ class ND1_3DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "ND1_3D"; }
+   virtual int GetContType() const { return TANGENTIAL; }
 };
 
-/** First order Raviart-Thomas finite elements in 3D. This class is kept only
-    for backward compatibility, consider using RT_FECollection instead. */
+/** @brief First order Raviart-Thomas finite elements in 3D. This class is kept
+    only for backward compatibility, consider using RT_FECollection instead. */
 class RT0_3DFECollection : public FiniteElementCollection
 {
 private:
@@ -848,10 +974,11 @@ class RT0_3DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "RT0_3D"; }
+   virtual int GetContType() const { return NORMAL; }
 };
 
-/** Second order Raviart-Thomas finite elements in 3D. This class is kept only
-    for backward compatibility, consider using RT_FECollection instead. */
+/** @brief Second order Raviart-Thomas finite elements in 3D. This class is kept
+    only for backward compatibility, consider using RT_FECollection instead. */
 class RT1_3DFECollection : public FiniteElementCollection
 {
 private:
@@ -870,6 +997,7 @@ class RT1_3DFECollection : public FiniteElementCollection
                                              int Or) const;
 
    virtual const char * Name() const { return "RT1_3D"; }
+   virtual int GetContType() const { return NORMAL; }
 };
 
 /// Discontinuous collection defined locally by a given finite element.
@@ -894,6 +1022,7 @@ class Local_FECollection : public FiniteElementCollection
    virtual const char *Name() const { return d_name; }
 
    virtual ~Local_FECollection() { delete Local_Element; }
+   virtual int GetContType() const { return DISCONTINUOUS; }
 };
 
 }
diff --git a/fem/fespace.cpp b/fem/fespace.cpp
index 0291f779c3d..30e00f43429 100644
--- a/fem/fespace.cpp
+++ b/fem/fespace.cpp
@@ -1977,6 +1977,7 @@ const FiniteElement *FiniteElementSpace::GetFaceElement(int i) const
 
 const FiniteElement *FiniteElementSpace::GetEdgeElement(int i) const
 {
+   MFEM_ASSERT(mesh->Dimension() > 1, "No edges with a mesh dimension < 2");
    return fec->FiniteElementForGeometry(Geometry::SEGMENT);
 }
 
@@ -2615,7 +2616,9 @@ const Operator &InterpolationGridTransfer::BackwardOperator()
 
 L2ProjectionGridTransfer::L2Projection::L2Projection(
    const FiniteElementSpace &fes_ho_, const FiniteElementSpace &fes_lor_)
-   : fes_ho(fes_ho_), fes_lor(fes_lor_)
+   : Operator(fes_lor_.GetVSize(), fes_ho_.GetVSize()),
+     fes_ho(fes_ho_),
+     fes_lor(fes_lor_)
 {
    Mesh *mesh_ho = fes_ho.GetMesh();
    MFEM_VERIFY(mesh_ho->GetNumGeometries(mesh_ho->Dimension()) <= 1,
diff --git a/fem/fespace.hpp b/fem/fespace.hpp
index 12a8b108016..c2417f393ac 100644
--- a/fem/fespace.hpp
+++ b/fem/fespace.hpp
@@ -206,7 +206,7 @@ class FiniteElementSpace
       virtual ~RefinementOperator();
    };
 
-   // Derefinement operator, used by the friend class InterpolationGridTransfer.
+   /// Derefinement operator, used by the friend class InterpolationGridTransfer.
    class DerefinementOperator : public Operator
    {
       const FiniteElementSpace *fine_fes; // Not owned.
@@ -225,12 +225,12 @@ class FiniteElementSpace
       virtual ~DerefinementOperator();
    };
 
-   // This method makes the same assumptions as the method:
-   //    void GetLocalRefinementMatrices(
-   //       const FiniteElementSpace &coarse_fes, Geometry::Type geom,
-   //       DenseTensor &localP) const
-   // which is defined below. It also assumes that the coarse fes and this have
-   // the same vector dimension, vdim.
+   /** This method makes the same assumptions as the method:
+       void GetLocalRefinementMatrices(
+           const FiniteElementSpace &coarse_fes, Geometry::Type geom,
+           DenseTensor &localP) const
+       which is defined below. It also assumes that the coarse fes and this have
+       the same vector dimension, vdim. */
    SparseMatrix *RefinementMatrix_main(const int coarse_ndofs,
                                        const Table &coarse_elem_dof,
                                        const DenseTensor localP[]) const;
@@ -248,11 +248,13 @@ class FiniteElementSpace
    /// Calculate GridFunction restriction matrix after mesh derefinement.
    SparseMatrix* DerefinementMatrix(int old_ndofs, const Table* old_elem_dof);
 
-   // This method assumes that this->mesh is a refinement of coarse_fes->mesh
-   // and that the CoarseFineTransformations of this->mesh are set accordingly.
-   // Another assumption is that the FEs of this use the same MapType as the FEs
-   // of coarse_fes. Finally, it assumes that the spaces this and coarse_fes are
-   // NOT variable-order spaces.
+   /** @brief Return in @a localP the local refinement matrices that map
+       between fespaces after mesh refinement. */
+   /** This method assumes that this->mesh is a refinement of coarse_fes->mesh
+       and that the CoarseFineTransformations of this->mesh are set accordingly.
+       Another assumption is that the FEs of this use the same MapType as the FEs
+       of coarse_fes. Finally, it assumes that the spaces this and coarse_fes are
+       NOT variable-order spaces. */
    void GetLocalRefinementMatrices(const FiniteElementSpace &coarse_fes,
                                    Geometry::Type geom,
                                    DenseTensor &localP) const;
@@ -467,11 +469,11 @@ class FiniteElementSpace
    /// Returns indexes of degrees of freedom for i'th boundary element.
    virtual void GetBdrElementDofs(int i, Array<int> &dofs) const;
 
-   /** Returns the indexes of the degrees of freedom for i'th face
+   /** @brief Returns the indexes of the degrees of freedom for i'th face
        including the dofs for the edges and the vertices of the face. */
    virtual void GetFaceDofs(int i, Array<int> &dofs) const;
 
-   /** Returns the indexes of the degrees of freedom for i'th edge
+   /** @brief Returns the indexes of the degrees of freedom for i'th edge
        including the dofs for the vertices of the edge. */
    void GetEdgeDofs(int i, Array<int> &dofs) const;
 
@@ -534,20 +536,28 @@ class FiniteElementSpace
    int GetElementForDof(int i) const { return dof_elem_array[i]; }
    int GetLocalDofForDof(int i) const { return dof_ldof_array[i]; }
 
-   /// Returns pointer to the FiniteElement associated with i'th element.
+   /** @brief Returns pointer to the FiniteElement in the FiniteElementCollection
+        associated with i'th element in the mesh object. */
    const FiniteElement *GetFE(int i) const;
 
-   /// Returns pointer to the FiniteElement for the i'th boundary element.
+   /** @brief Returns pointer to the FiniteElement in the FiniteElementCollection
+        associated with i'th boundary face in the mesh object. */
    const FiniteElement *GetBE(int i) const;
 
+   /** @brief Returns pointer to the FiniteElement in the FiniteElementCollection
+        associated with i'th face in the mesh object.  Faces in this case refer
+        to the MESHDIM-1 primitive so in 2D they are segments and in 1D they are
+        points. */
    const FiniteElement *GetFaceElement(int i) const;
 
+   /** @brief Returns pointer to the FiniteElement in the FiniteElementCollection
+        associated with i'th edge in the mesh object. */
    const FiniteElement *GetEdgeElement(int i) const;
 
    /// Return the trace element from element 'i' to the given 'geom_type'
    const FiniteElement *GetTraceElement(int i, Geometry::Type geom_type) const;
 
-   /** Mark degrees of freedom associated with boundary elements with
+   /** @brief Mark degrees of freedom associated with boundary elements with
        the specified boundary attributes (marked in 'bdr_attr_is_ess').
        For spaces with 'vdim' > 1, the 'component' parameter can be used
        to restricts the marked vDOFs to the specified component. */
@@ -555,7 +565,7 @@ class FiniteElementSpace
                                   Array<int> &ess_vdofs,
                                   int component = -1) const;
 
-   /** Get a list of essential true dofs, ess_tdof_list, corresponding to the
+   /** @brief Get a list of essential true dofs, ess_tdof_list, corresponding to the
        boundary attributes marked in the array bdr_attr_is_ess.
        For spaces with 'vdim' > 1, the 'component' parameter can be used
        to restricts the marked tDOFs to the specified component. */
@@ -566,19 +576,19 @@ class FiniteElementSpace
    /// Convert a Boolean marker array to a list containing all marked indices.
    static void MarkerToList(const Array<int> &marker, Array<int> &list);
 
-   /** Convert an array of indices (list) to a Boolean marker array where all
+   /** @brief Convert an array of indices (list) to a Boolean marker array where all
        indices in the list are marked with the given value and the rest are set
        to zero. */
    static void ListToMarker(const Array<int> &list, int marker_size,
                             Array<int> &marker, int mark_val = -1);
 
-   /** For a partially conforming FE space, convert a marker array (nonzero
+   /** @brief For a partially conforming FE space, convert a marker array (nonzero
        entries are true) on the partially conforming dofs to a marker array on
        the conforming dofs. A conforming dofs is marked iff at least one of its
        dependent dofs is marked. */
    void ConvertToConformingVDofs(const Array<int> &dofs, Array<int> &cdofs);
 
-   /** For a partially conforming FE space, convert a marker array (nonzero
+   /** @brief For a partially conforming FE space, convert a marker array (nonzero
        entries are true) on the conforming dofs to a marker array on the
        (partially conforming) dofs. A dof is marked iff it depends on a marked
        conforming dofs, where dependency is defined by the ConformingRestriction
@@ -586,15 +596,15 @@ class FiniteElementSpace
        conforming dof. */
    void ConvertFromConformingVDofs(const Array<int> &cdofs, Array<int> &dofs);
 
-   /** Generate the global restriction matrix from a discontinuous
+   /** @brief Generate the global restriction matrix from a discontinuous
        FE space to the continuous FE space of the same polynomial degree. */
    SparseMatrix *D2C_GlobalRestrictionMatrix(FiniteElementSpace *cfes);
 
-   /** Generate the global restriction matrix from a discontinuous
+   /** @brief Generate the global restriction matrix from a discontinuous
        FE space to the piecewise constant FE space. */
    SparseMatrix *D2Const_GlobalRestrictionMatrix(FiniteElementSpace *cfes);
 
-   /** Construct the restriction matrix from the FE space given by
+   /** @brief Construct the restriction matrix from the FE space given by
        (*this) to the lower degree FE space given by (*lfes) which
        is defined on the same mesh. */
    SparseMatrix *H2L_GlobalRestrictionMatrix(FiniteElementSpace *lfes);
@@ -631,7 +641,7 @@ class FiniteElementSpace
    virtual void GetTrueTransferOperator(const FiniteElementSpace &coarse_fes,
                                         OperatorHandle &T) const;
 
-   /** Reflect changes in the mesh: update number of DOFs, etc. Also, calculate
+   /** @brief Reflect changes in the mesh: update number of DOFs, etc. Also, calculate
        GridFunction transformation operator (unless want_transform is false).
        Safe to call multiple times, does nothing if space already up to date. */
    virtual void Update(bool want_transform = true);
@@ -669,6 +679,7 @@ class FiniteElementSpace
       return dynamic_cast<const L2_FECollection*>(fec) != NULL;
    }
 
+   /// Save finite element space to output stream @a out.
    void Save(std::ostream &out) const;
 
    /** @brief Read a FiniteElementSpace from a stream. The returned
@@ -906,7 +917,8 @@ class L2ProjectionGridTransfer : public GridTransfer
       const L2Projection &l2proj;
 
    public:
-      L2Prolongation(const L2Projection &l2proj_) : l2proj(l2proj_) { }
+      L2Prolongation(const L2Projection &l2proj_)
+         : Operator(l2proj_.Width(), l2proj_.Height()), l2proj(l2proj_) { }
       void Mult(const Vector &x, Vector &y) const
       {
          l2proj.Prolongate(x, y);
diff --git a/fem/gridfunc.cpp b/fem/gridfunc.cpp
index aca967d8396..1264b3a217b 100644
--- a/fem/gridfunc.cpp
+++ b/fem/gridfunc.cpp
@@ -236,7 +236,6 @@ void GridFunction::MakeTRef(FiniteElementSpace *f, Vector &tv, int tv_offset)
    }
 }
 
-
 void GridFunction::SumFluxAndCount(BilinearFormIntegrator &blfi,
                                    GridFunction &flux,
                                    Array<int>& count,
@@ -617,17 +616,354 @@ int GridFunction::GetFaceValues(int i, int side, const IntegrationRule &ir,
    return dir;
 }
 
+void GridFunction::GetVectorValues(int i, const IntegrationRule &ir,
+                                   DenseMatrix &vals, DenseMatrix &tr) const
+{
+   // Forward to the ElementTransformation overload: given a non-NULL matrix
+   // pointer it first transforms the points of 'ir' into that matrix.
+   ElementTransformation *el_trans = fes->GetElementTransformation(i);
+   GetVectorValues(*el_trans, ir, vals, &tr);
+}
+
+void be_to_bfe(Geometry::Type geom, int o, const IntegrationPoint &ip,
+               IntegrationPoint &fip)
+{
+   // Re-express the reference point 'ip' in 'fip' for orientation 'o'; any
+   // orientation without an explicit case keeps the x and y coordinates.
+   if (geom == Geometry::TRIANGLE)
+   {
+      switch (o)
+      {
+         case 2:
+            fip.x = 1.0 - ip.x - ip.y;
+            fip.y = ip.x;
+            break;
+         case 4:
+            fip.x = ip.y;
+            fip.y = 1.0 - ip.x - ip.y;
+            break;
+         default:
+            fip.x = ip.x;
+            fip.y = ip.y;
+            break;
+      }
+   }
+   else
+   {
+      switch (o)
+      {
+         case 2:
+            fip.x = ip.y;
+            fip.y = 1.0 - ip.x;
+            break;
+         case 4:
+            fip.x = 1.0 - ip.x;
+            fip.y = 1.0 - ip.y;
+            break;
+         case 6:
+            fip.x = 1.0 - ip.y;
+            fip.y = ip.x;
+            break;
+         default:
+            fip.x = ip.x;
+            fip.y = ip.y;
+            break;
+      }
+   }
+   fip.z      = ip.z;
+   fip.weight = ip.weight;
+   fip.index  = ip.index;
+}
+
+// Evaluate component 'comp' of the field at the point 'ip' of the mesh entity
+// described by T; if 'tr' is given it also receives the transformed point.
+// Fields that are not continuous are evaluated in the adjacent element for
+// boundary elements, and are rejected on mesh edges and faces.
+double GridFunction::GetValue(ElementTransformation &T,
+                              const IntegrationPoint &ip,
+                              int comp, Vector *tr) const
+{
+   if (tr) { T.SetIntPoint(&ip); T.Transform(ip, *tr); }
+
+   const FiniteElement * fe = NULL;
+   Array<int> dofs;
+
+   switch (T.ElementType)
+   {
+      case ElementTransformation::ELEMENT:
+         fe = fes->GetFE(T.ElementNo);
+         fes->GetElementDofs(T.ElementNo, dofs);
+         break;
+      case ElementTransformation::EDGE:
+         if (fes->FEColl()->GetContType() ==
+             FiniteElementCollection::CONTINUOUS)
+         {
+            fe = fes->GetEdgeElement(T.ElementNo);
+            fes->GetEdgeDofs(T.ElementNo, dofs);
+         }
+         else
+         {
+            MFEM_ABORT("GridFunction::GetValue: Field continuity type \""
+                       << fes->FEColl()->GetContType() << "\" not supported "
+                       << "on mesh edges.");
+            return NAN;
+         }
+         break;
+      case ElementTransformation::FACE:
+         if (fes->FEColl()->GetContType() ==
+             FiniteElementCollection::CONTINUOUS)
+         {
+            fe = fes->GetFaceElement(T.ElementNo);
+            fes->GetFaceDofs(T.ElementNo, dofs);
+         }
+         else
+         {
+            MFEM_ABORT("GridFunction::GetValue: Field continuity type \""
+                       << fes->FEColl()->GetContType() << "\" not supported "
+                       << "on mesh faces.");
+            return NAN;
+         }
+         break;
+      case ElementTransformation::BDR_ELEMENT:
+      {
+         if (fes->FEColl()->GetContType() ==
+             FiniteElementCollection::CONTINUOUS)
+         {
+            // This is a continuous field so we can evaluate it on the boundary.
+            fe = fes->GetBE(T.ElementNo);
+            fes->GetBdrElementDofs(T.ElementNo, dofs);
+         }
+         else
+         {
+            // This is a discontinuous field which cannot be evaluated on the
+            // boundary so we'll evaluate it in the neighboring element.
+            FaceElementTransformations * FET =
+               fes->GetMesh()->GetBdrFaceTransformations(T.ElementNo);
+            MFEM_VERIFY(FET, "no face transformation for this bdr element");
+            // Boundary elements and Boundary Faces may have different
+            // orientations so adjust the integration point if necessary.
+            int o = 0;
+            if (fes->GetMesh()->Dimension() == 3)
+            {
+               int f;
+               fes->GetMesh()->GetBdrElementFace(T.ElementNo, &f, &o);
+            }
+
+            IntegrationPoint fip;
+            be_to_bfe(FET->GetGeometryType(), o, ip, fip);
+
+            FET->SetIntPoint(&fip);
+            ElementTransformation & T1 = FET->GetElement1Transformation();
+            return GetValue(T1, T1.GetIntPoint(), comp);
+         }
+         break;
+      }
+      case ElementTransformation::BDR_FACE:
+      {
+         FaceElementTransformations * FET =
+            dynamic_cast<FaceElementTransformations *>(&T);
+         MFEM_VERIFY(FET, "T is not a FaceElementTransformations object");
+         // Evaluate in neighboring element for both continuous and
+         // discontinuous fields.
+         ElementTransformation & T1 = FET->GetElement1Transformation();
+         return GetValue(T1, T1.GetIntPoint(), comp);
+      }
+      default:
+      {
+         MFEM_ABORT("GridFunction::GetValue: Unsupported element type \""
+                    << T.ElementType << "\"");
+         return NAN;
+      }
+   }
+
+   fes->DofsToVDofs(comp-1, dofs); // NOTE(review): comp looks one-based here
+   Vector DofVal(dofs.Size()), LocVec;
+   if (fe->GetMapType() == FiniteElement::VALUE)
+   {
+      fe->CalcShape(ip, DofVal);
+   }
+   else
+   {
+      fe->CalcPhysShape(T, DofVal);
+   }
+   GetSubVector(dofs, LocVec);
+
+   return (DofVal * LocVec);
+}
+
+void GridFunction::GetValues(ElementTransformation &T,
+                             const IntegrationRule &ir,
+                             Vector &vals, int comp,
+                             DenseMatrix *tr) const
+{
+   // When requested, first map all points of 'ir' through T into '*tr'.
+   if (tr) { T.Transform(ir, *tr); }
+
+   const int num_pts = ir.GetNPoints();
+   vals.SetSize(num_pts);
+   // Evaluate point by point; T's integration point is set before each call
+   // because GetValue() only sets it itself when handed a non-NULL 'tr'.
+   for (int pt = 0; pt < num_pts; pt++)
+   {
+      const IntegrationPoint &ip_pt = ir.IntPoint(pt);
+      T.SetIntPoint(&ip_pt);
+      vals[pt] = GetValue(T, ip_pt, comp);
+   }
+}
+
+// Evaluate the vector field at the point 'ip' of the mesh entity described by
+// T and store it in 'val'; if 'tr' is given it receives the transformed point.
+// Fields that are not continuous are evaluated in the adjacent element for
+// boundary elements, and are rejected on mesh edges and faces.
+void GridFunction::GetVectorValue(ElementTransformation &T,
+                                  const IntegrationPoint &ip,
+                                  Vector &val, Vector *tr) const
+{
+   if (tr) { T.SetIntPoint(&ip); T.Transform(ip, *tr); }
+
+   Array<int> vdofs;
+   const FiniteElement *fe = NULL;
+
+   switch (T.ElementType)
+   {
+      case ElementTransformation::ELEMENT:
+         fes->GetElementVDofs(T.ElementNo, vdofs);
+         fe = fes->GetFE(T.ElementNo);
+         break;
+      case ElementTransformation::EDGE:
+         if (fes->FEColl()->GetContType() ==
+             FiniteElementCollection::CONTINUOUS)
+         {
+            fe = fes->GetEdgeElement(T.ElementNo);
+            fes->GetEdgeVDofs(T.ElementNo, vdofs);
+         }
+         else
+         {
+            MFEM_ABORT("GridFunction::GetVectorValue: Field continuity type \""
+                       << fes->FEColl()->GetContType() << "\" not supported "
+                       << "on mesh edges.");
+            return;
+         }
+         break;
+      case ElementTransformation::FACE:
+         if (fes->FEColl()->GetContType() ==
+             FiniteElementCollection::CONTINUOUS)
+         {
+            fe = fes->GetFaceElement(T.ElementNo);
+            fes->GetFaceVDofs(T.ElementNo, vdofs);
+         }
+         else
+         {
+            MFEM_ABORT("GridFunction::GetVectorValue: Field continuity type \""
+                       << fes->FEColl()->GetContType() << "\" not supported "
+                       << "on mesh faces.");
+            return;
+         }
+         break;
+      case ElementTransformation::BDR_ELEMENT:
+      {
+         if (fes->FEColl()->GetContType() ==
+             FiniteElementCollection::CONTINUOUS)
+         {
+            // This is a continuous field so we can evaluate it on the boundary.
+            fes->GetBdrElementVDofs(T.ElementNo, vdofs);
+            fe = fes->GetBE(T.ElementNo);
+         }
+         else
+         {
+            // This is a discontinuous vector field which cannot be evaluated on
+            // the boundary so we'll evaluate it in the neighboring element.
+            FaceElementTransformations * FET =
+               fes->GetMesh()->GetBdrFaceTransformations(T.ElementNo);
+            MFEM_VERIFY(FET, "no face transformation for this bdr element");
+            // Boundary elements and Boundary Faces may have different
+            // orientations so adjust the integration point if necessary.
+            int o = 0;
+            if (fes->GetMesh()->Dimension() == 3)
+            {
+               int f;
+               fes->GetMesh()->GetBdrElementFace(T.ElementNo, &f, &o);
+            }
+
+            IntegrationPoint fip;
+            be_to_bfe(FET->GetGeometryType(), o, ip, fip);
+
+            FET->SetIntPoint(&fip);
+            ElementTransformation & T1 = FET->GetElement1Transformation();
+            return GetVectorValue(T1, T1.GetIntPoint(), val);
+         }
+         break;
+      }
+      case ElementTransformation::BDR_FACE:
+      {
+         FaceElementTransformations * FET =
+            dynamic_cast<FaceElementTransformations *>(&T);
+         MFEM_VERIFY(FET, "T is not a FaceElementTransformations object");
+         // Evaluate in neighboring element for both continuous and
+         // discontinuous fields.
+         ElementTransformation & T1 = FET->GetElement1Transformation();
+         return GetVectorValue(T1, T1.GetIntPoint(), val);
+      }
+      default:
+      {
+         MFEM_ABORT("GridFunction::GetVectorValue: Unsupported element type \""
+                    << T.ElementType << "\"");
+         if (val.Size() > 0) { val = NAN; }
+         return;
+      }
+   }
+
+   int dof = fe->GetDof();
+   Vector loc_data;
+   GetSubVector(vdofs, loc_data);
+   if (fe->GetRangeType() == FiniteElement::SCALAR)
+   {
+      Vector shape(dof);
+      if (fe->GetMapType() == FiniteElement::VALUE)
+      {
+         fe->CalcShape(ip, shape);
+      }
+      else
+      {
+         fe->CalcPhysShape(T, shape);
+      }
+      int vdim = fes->GetVDim();
+      val.SetSize(vdim);
+      for (int k = 0; k < vdim; k++)
+      {
+         val(k) = shape * ((const double *)loc_data + dof * k);
+      }
+   }
+   else
+   {
+      int spaceDim = fes->GetMesh()->SpaceDimension();
+      DenseMatrix vshape(dof, spaceDim);
+      fe->CalcVShape(T, vshape);
+      val.SetSize(spaceDim);
+      vshape.MultTranspose(loc_data, val);
+   }
+}
+
 void GridFunction::GetVectorValues(ElementTransformation &T,
                                    const IntegrationRule &ir,
-                                   DenseMatrix &vals) const
+                                   DenseMatrix &vals,
+                                   DenseMatrix *tr) const
 {
+   if (tr)
+   {
+      T.Transform(ir, *tr);
+   }
+
    const FiniteElement *FElem = fes->GetFE(T.ElementNo);
    int dof = FElem->GetDof();
+
    Array<int> vdofs;
    fes->GetElementVDofs(T.ElementNo, vdofs);
+
    Vector loc_data;
    GetSubVector(vdofs, loc_data);
    int nip = ir.GetNPoints();
+
    if (FElem->GetRangeType() == FiniteElement::SCALAR)
    {
       MFEM_ASSERT(FElem->GetMapType() == FiniteElement::VALUE,
@@ -639,6 +975,7 @@ void GridFunction::GetVectorValues(ElementTransformation &T,
       {
          const IntegrationPoint &ip = ir.IntPoint(j);
          FElem->CalcShape(ip, shape);
+
          for (int k = 0; k < vdim; k++)
          {
             vals(k,j) = shape * ((const double *)loc_data + dof * k);
@@ -649,28 +986,22 @@ void GridFunction::GetVectorValues(ElementTransformation &T,
    {
       int spaceDim = fes->GetMesh()->SpaceDimension();
       DenseMatrix vshape(dof, spaceDim);
+
       vals.SetSize(spaceDim, nip);
       Vector val_j;
+
       for (int j = 0; j < nip; j++)
       {
          const IntegrationPoint &ip = ir.IntPoint(j);
          T.SetIntPoint(&ip);
          FElem->CalcVShape(T, vshape);
+
          vals.GetColumnReference(j, val_j);
          vshape.MultTranspose(loc_data, val_j);
       }
    }
 }
 
-void GridFunction::GetVectorValues(int i, const IntegrationRule &ir,
-                                   DenseMatrix &vals, DenseMatrix &tr) const
-{
-   ElementTransformation *Tr = fes->GetElementTransformation(i);
-   Tr->Transform(ir, tr);
-
-   GetVectorValues(*Tr, ir, vals);
-}
-
 int GridFunction::GetFaceVectorValues(
    int i, int side, const IntegrationRule &ir,
    DenseMatrix &vals, DenseMatrix &tr) const
@@ -702,13 +1033,13 @@ int GridFunction::GetFaceVectorValues(
    {
       Transf = fes->GetMesh()->GetFaceElementTransformations(i, 4);
       Transf->Loc1.Transform(ir, eir);
-      GetVectorValues(Transf->Elem1No, eir, vals, tr);
+      GetVectorValues(*Transf->Elem1, eir, vals, &tr);
    }
    else
    {
       Transf = fes->GetMesh()->GetFaceElementTransformations(i, 8);
       Transf->Loc2.Transform(ir, eir);
-      GetVectorValues(Transf->Elem2No, eir, vals, tr);
+      GetVectorValues(*Transf->Elem2, eir, vals, &tr);
    }
 
    return di;
@@ -1716,6 +2047,8 @@ void GridFunction::ProjectCoefficient(
    ElementTransformation *T = NULL;
    const FiniteElement *fe = NULL;
 
+   fes->BuildDofToArrays(); // ensures GetElementForDof(), GetLocalDofForDof() initialized.
+
    for (int i = 0; i < dofs.Size(); i++)
    {
       int dof = dofs[i], j = fes->GetElementForDof(dof);
@@ -1757,6 +2090,8 @@ void GridFunction::ProjectCoefficient(
 
    Vector val;
 
+   fes->BuildDofToArrays(); // ensures GetElementForDof(), GetLocalDofForDof() initialized.
+
    for (int i = 0; i < dofs.Size(); i++)
    {
       int dof = dofs[i], j = fes->GetElementForDof(dof);
@@ -2009,7 +2344,7 @@ double GridFunction::ComputeL2Error(
       fdof = fe->GetDof();
       transf = fes->GetElementTransformation(i);
       shape.SetSize(fdof);
-      intorder = 2*fe->GetOrder() + 1; // <----------
+      intorder = 2*fe->GetOrder() + 3; // <----------
       const IntegrationRule *ir;
       if (irs)
       {
@@ -2064,7 +2399,7 @@ double GridFunction::ComputeL2Error(
    {
       if (elems != NULL && (*elems)[i] == 0) { continue; }
       fe = fes->GetFE(i);
-      int intorder = 2*fe->GetOrder() + 1; // <----------
+      int intorder = 2*fe->GetOrder() + 3; // <----------
       const IntegrationRule *ir;
       if (irs)
       {
@@ -2168,7 +2503,7 @@ double GridFunction::ComputeH1Error(
             }
          intorder = 2 * intorder;  // <-------------
          const IntegrationRule &ir =
-            IntRules.Get(face_elem_transf->FaceGeom, intorder);
+            IntRules.Get(face_elem_transf->GetGeometryType(), intorder);
          err_val.SetSize(ir.GetNPoints());
          ell_coeff_val.SetSize(ir.GetNPoints());
          // side 1
@@ -2225,7 +2560,7 @@ double GridFunction::ComputeH1Error(
             }
          }
          face_elem_transf = mesh->GetFaceElementTransformations(i, 16);
-         transf = face_elem_transf->Face;
+         transf = face_elem_transf;
          for (j = 0; j < ir.GetNPoints(); j++)
          {
             const IntegrationPoint &ip = ir.IntPoint(j);
@@ -2259,7 +2594,7 @@ double GridFunction::ComputeMaxError(
       fdof = fe->GetDof();
       transf = fes->GetElementTransformation(i);
       shape.SetSize(fdof);
-      intorder = 2*fe->GetOrder() + 1; // <----------
+      intorder = 2*fe->GetOrder() + 3; // <----------
       const IntegrationRule *ir;
       if (irs)
       {
@@ -2425,7 +2760,7 @@ double GridFunction::ComputeLpError(const double p, Coefficient &exsol,
       }
       else
       {
-         int intorder = 2*fe->GetOrder() + 1; // <----------
+         int intorder = 2*fe->GetOrder() + 3; // <----------
          ir = &(IntRules.Get(fe->GetGeomType(), intorder));
       }
       GetValues(i, *ir, vals);
@@ -2472,10 +2807,13 @@ double GridFunction::ComputeLpError(const double p, Coefficient &exsol,
 }
 
 void GridFunction::ComputeElementLpErrors(const double p, Coefficient &exsol,
-                                          GridFunction &error,
+                                          Vector &error,
                                           Coefficient *weight,
                                           const IntegrationRule *irs[]) const
 {
+   MFEM_ASSERT(error.Size() == fes->GetNE(),
+               "Incorrect size for result vector");
+
    error = 0.0;
    const FiniteElement *fe;
    ElementTransformation *T;
@@ -2491,7 +2829,7 @@ void GridFunction::ComputeElementLpErrors(const double p, Coefficient &exsol,
       }
       else
       {
-         int intorder = 2*fe->GetOrder() + 1; // <----------
+         int intorder = 2*fe->GetOrder() + 3; // <----------
          ir = &(IntRules.Get(fe->GetGeomType(), intorder));
       }
       GetValues(i, *ir, vals);
@@ -2555,7 +2893,7 @@ double GridFunction::ComputeLpError(const double p, VectorCoefficient &exsol,
       }
       else
       {
-         int intorder = 2*fe->GetOrder() + 1; // <----------
+         int intorder = 2*fe->GetOrder() + 3; // <----------
          ir = &(IntRules.Get(fe->GetGeomType(), intorder));
       }
       T = fes->GetElementTransformation(i);
@@ -2627,11 +2965,14 @@ double GridFunction::ComputeLpError(const double p, VectorCoefficient &exsol,
 
 void GridFunction::ComputeElementLpErrors(const double p,
                                           VectorCoefficient &exsol,
-                                          GridFunction &error,
+                                          Vector &error,
                                           Coefficient *weight,
                                           VectorCoefficient *v_weight,
                                           const IntegrationRule *irs[]) const
 {
+   MFEM_ASSERT(error.Size() == fes->GetNE(),
+               "Incorrect size for result vector");
+
    error = 0.0;
    const FiniteElement *fe;
    ElementTransformation *T;
@@ -2648,7 +2989,7 @@ void GridFunction::ComputeElementLpErrors(const double p,
       }
       else
       {
-         int intorder = 2*fe->GetOrder() + 1; // <----------
+         int intorder = 2*fe->GetOrder() + 3; // <----------
          ir = &(IntRules.Get(fe->GetGeomType(), intorder));
       }
       T = fes->GetElementTransformation(i);
@@ -2658,15 +2999,15 @@ void GridFunction::ComputeElementLpErrors(const double p,
       loc_errs.SetSize(vals.Width());
       if (!v_weight)
       {
-         // compute the lengths of the errors at the integration points
-         // thus the vector norm is rotationally invariant
+         // compute the lengths of the errors at the integration points thus the
+         // vector norm is rotationally invariant
          vals.Norm2(loc_errs);
       }
       else
       {
          v_weight->Eval(exact_vals, *T, *ir);
-         // column-wise dot product of the vector error (in vals) and the
-         // vector weight (in exact_vals)
+         // column-wise dot product of the vector error (in vals) and the vector
+         // weight (in exact_vals)
          for (int j = 0; j < vals.Width(); j++)
          {
             double err = 0.0;
@@ -2798,7 +3139,9 @@ void GridFunction::SaveVTK(std::ostream &out, const std::string &field_name,
          RefG = GlobGeometryRefiner.Refine(
                    mesh->GetElementBaseGeometry(i), ref, 1);
 
-         GetVectorValues(i, RefG->RefPts, vval, pmat);
+         // GetVectorValues(i, RefG->RefPts, vval, pmat);
+         ElementTransformation * T = mesh->GetElementTransformation(i);
+         GetVectorValues(*T, RefG->RefPts, vval, &pmat);
 
          for (int j = 0; j < vval.Width(); j++)
          {
diff --git a/fem/gridfunc.hpp b/fem/gridfunc.hpp
index 41599da5318..9687514b598 100644
--- a/fem/gridfunc.hpp
+++ b/fem/gridfunc.hpp
@@ -144,17 +144,133 @@ class GridFunction : public Vector
    /// Returns the values in the vertices of i'th element for dimension vdim.
    void GetNodalValues(int i, Array<double> &nval, int vdim = 1) const;
 
+   /** @name Element Index Get Value Methods
+
+       These methods take an element index and return the interpolated value of
+       the field at a given reference point within the element.
+
+       @warning These methods retrieve and use the ElementTransformation object
+       from the mfem::Mesh. This can alter the state of the element
+       transformation object and can also lead to unexpected results when the
+       ElementTransformation object is already in use such as when these methods
+       are called from within an integration loop. Consider using
+       GetValue(ElementTransformation &T, ...) instead.
+   */
+   ///@{
+   /** Return a scalar value from within the given element. */
    virtual double GetValue(int i, const IntegrationPoint &ip,
                            int vdim = 1) const;
 
+   /** Return a vector value from within the given element. */
    void GetVectorValue(int i, const IntegrationPoint &ip, Vector &val) const;
+   ///@}
 
+   /** @name Element Index Get Values Methods
+
+       These are convenience methods for evaluating the field at multiple
+       points within a given element. The GetValues methods are optimized and
+       should perform better than repeatedly calling GetValue. GetVectorValues
+       forwards to the ElementTransformation version of GetVectorValues.
+
+       @warning These methods retrieve and use the ElementTransformation object
+       from the mfem::Mesh. This can alter the state of the element
+       transformation object and can also lead to unexpected results when the
+       ElementTransformation object is already in use such as when these methods
+       are called from within an integration loop. Consider using
+       GetValues(ElementTransformation &T, ...) instead.
+   */
+   ///@{
+   /** Compute a collection of scalar values from within the element indicated
+       by the index i. */
    void GetValues(int i, const IntegrationRule &ir, Vector &vals,
                   int vdim = 1) const;
 
+   /** Compute a collection of scalar values from within the element indicated
+       by the index i. */
    void GetValues(int i, const IntegrationRule &ir, Vector &vals,
                   DenseMatrix &tr, int vdim = 1) const;
 
+   void GetVectorValues(int i, const IntegrationRule &ir,
+                        DenseMatrix &vals, DenseMatrix &tr) const;
+   ///@}
+
+   /** @name ElementTransformation Get Value Methods
+
+       These member functions are designed for use within
+       GridFunctionCoefficient objects. These can be used with
+       ElementTransformation objects coming from either
+       Mesh::GetElementTransformation() or Mesh::GetBdrElementTransformation().
+
+       @note These methods do not reset the ElementTransformation object so they
+       should be safe to use within integration loops or other contexts where
+       the ElementTransformation is already in use.
+   */
+   ///@{
+   /** Return a scalar value from within the element indicated by the
+       ElementTransformation Object. */
+   double GetValue(ElementTransformation &T, const IntegrationPoint &ip,
+                   int comp = 0, Vector *tr = NULL) const;
+
+   /** Return a vector value from within the element indicated by the
+       ElementTransformation Object. */
+   void GetVectorValue(ElementTransformation &T, const IntegrationPoint &ip,
+                       Vector &val, Vector *tr = NULL) const;
+   ///@}
+
+   /** @name ElementTransformation Get Values Methods
+
+       These are convenience methods for repeatedly calling GetValue for
+       multiple points within a given element. They work by calling either the
+       ElementTransformation or FaceElementTransformations versions described
+       above. Consequently, these methods should not be expected to run faster
+       than calling the above methods in an external loop.
+
+       @note These methods do not reset the ElementTransformation object so they
+       should be safe to use within integration loops or other contexts where
+       the ElementTransformation is already in use.
+
+       @note These methods can also be used with FaceElementTransformations
+       objects.
+    */
+   ///@{
+   /** Compute a collection of scalar values from within the element indicated
+       by the ElementTransformation object. */
+   void GetValues(ElementTransformation &T, const IntegrationRule &ir,
+                  Vector &vals, int comp = 0, DenseMatrix *tr = NULL) const;
+
+   /** Compute a collection of vector values from within the element indicated
+       by the ElementTransformation object. */
+   void GetVectorValues(ElementTransformation &T, const IntegrationRule &ir,
+                        DenseMatrix &vals, DenseMatrix *tr = NULL) const;
+   ///@}
+
+   /** @name Face Index Get Values Methods
+
+       These methods are designed to work with Discontinuous Galerkin basis
+       functions. They compute field values on the interface between elements,
+       or on boundary elements, by interpolating the field in a neighboring
+       element. The \a side argument indicates which neighboring element should
+       be used: 0, 1, or 2 (automatically chosen).
+
+       @warning These methods retrieve and use the FaceElementTransformations
+       object from the mfem::Mesh. This can alter the state of the face element
+       transformations object and can also lead to unexpected results when the
+       FaceElementTransformations object is already in use such as when these
+       methods are called from within an integration loop. Consider using
+       GetValues(ElementTransformation &T, ...) instead.
+    */
+   ///@{
+   /** Compute a collection of scalar values from within the face
+       indicated by the index i. */
+   int GetFaceValues(int i, int side, const IntegrationRule &ir, Vector &vals,
+                     DenseMatrix &tr, int vdim = 1) const;
+
+   /** Compute a collection of vector values from within the face
+       indicated by the index i. */
+   int GetFaceVectorValues(int i, int side, const IntegrationRule &ir,
+                           DenseMatrix &vals, DenseMatrix &tr) const;
+   ///@}
+
    void GetLaplacians(int i, const IntegrationRule &ir, Vector &laps,
                       int vdim = 1) const;
 
@@ -167,18 +283,6 @@ class GridFunction : public Vector
    void GetHessians(int i, const IntegrationRule &ir, DenseMatrix &hess,
                     DenseMatrix &tr, int vdim = 1) const;
 
-   int GetFaceValues(int i, int side, const IntegrationRule &ir, Vector &vals,
-                     DenseMatrix &tr, int vdim = 1) const;
-
-   void GetVectorValues(ElementTransformation &T, const IntegrationRule &ir,
-                        DenseMatrix &vals) const;
-
-   void GetVectorValues(int i, const IntegrationRule &ir,
-                        DenseMatrix &vals, DenseMatrix &tr) const;
-
-   int GetFaceVectorValues(int i, int side, const IntegrationRule &ir,
-                           DenseMatrix &vals, DenseMatrix &tr) const;
-
    void GetValuesFrom(const GridFunction &orig_func);
 
    void GetBdrValuesFrom(const GridFunction &orig_func);
@@ -236,12 +340,10 @@ class GridFunction : public Vector
 
    virtual void ProjectCoefficient(Coefficient &coeff);
 
-   // call fes -> BuildDofToArrays() before using this projection
    void ProjectCoefficient(Coefficient &coeff, Array<int> &dofs, int vd = 0);
 
    void ProjectCoefficient(VectorCoefficient &vcoeff);
 
-   // call fes -> BuildDofToArrays() before using this projection
    void ProjectCoefficient(VectorCoefficient &vcoeff, Array<int> &dofs);
 
    void ProjectCoefficient(Coefficient *coeff[]);
@@ -365,28 +467,28 @@ class GridFunction : public Vector
                                  const IntegrationRule *irs[] = NULL) const;
 
    /** Compute the Lp error in each element of the mesh and store the results in
-       the GridFunction @a error. The result should be an L2 GridFunction of
-       order zero using map type VALUE. */
+       the Vector @a error. The result should be of length number of elements,
+       for example an L2 GridFunction of order zero using map type VALUE. */
    virtual void ComputeElementLpErrors(const double p, Coefficient &exsol,
-                                       GridFunction &error,
+                                       Vector &error,
                                        Coefficient *weight = NULL,
                                        const IntegrationRule *irs[] = NULL
                                       ) const;
 
    virtual void ComputeElementL1Errors(Coefficient &exsol,
-                                       GridFunction &error,
+                                       Vector &error,
                                        const IntegrationRule *irs[] = NULL
                                       ) const
    { ComputeElementLpErrors(1.0, exsol, error, NULL, irs); }
 
    virtual void ComputeElementL2Errors(Coefficient &exsol,
-                                       GridFunction &error,
+                                       Vector &error,
                                        const IntegrationRule *irs[] = NULL
                                       ) const
    { ComputeElementLpErrors(2.0, exsol, error, NULL, irs); }
 
    virtual void ComputeElementMaxErrors(Coefficient &exsol,
-                                        GridFunction &error,
+                                        Vector &error,
                                         const IntegrationRule *irs[] = NULL
                                        ) const
    { ComputeElementLpErrors(infinity(), exsol, error, NULL, irs); }
@@ -400,29 +502,29 @@ class GridFunction : public Vector
                                  const IntegrationRule *irs[] = NULL) const;
 
    /** Compute the Lp error in each element of the mesh and store the results in
-       the GridFunction @ error. The result should be an L2 GridFunction of
-       order zero using map type VALUE. */
+       the Vector @a error. The result should be of length number of elements,
+       for example an L2 GridFunction of order zero using map type VALUE. */
    virtual void ComputeElementLpErrors(const double p, VectorCoefficient &exsol,
-                                       GridFunction &error,
+                                       Vector &error,
                                        Coefficient *weight = NULL,
                                        VectorCoefficient *v_weight = NULL,
                                        const IntegrationRule *irs[] = NULL
                                       ) const;
 
    virtual void ComputeElementL1Errors(VectorCoefficient &exsol,
-                                       GridFunction &error,
+                                       Vector &error,
                                        const IntegrationRule *irs[] = NULL
                                       ) const
    { ComputeElementLpErrors(1.0, exsol, error, NULL, NULL, irs); }
 
    virtual void ComputeElementL2Errors(VectorCoefficient &exsol,
-                                       GridFunction &error,
+                                       Vector &error,
                                        const IntegrationRule *irs[] = NULL
                                       ) const
    { ComputeElementLpErrors(2.0, exsol, error, NULL, NULL, irs); }
 
    virtual void ComputeElementMaxErrors(VectorCoefficient &exsol,
-                                        GridFunction &error,
+                                        Vector &error,
                                         const IntegrationRule *irs[] = NULL
                                        ) const
    { ComputeElementLpErrors(infinity(), exsol, error, NULL, NULL, irs); }
@@ -496,11 +598,13 @@ class GridFunction : public Vector
                      type = adios2stream::data_type::point_data) const;
 #endif
 
-   /** Write the GridFunction in VTK format. Note that Mesh::PrintVTK must be
-       called first. The parameter ref > 0 must match the one used in
+   /** @brief Write the GridFunction in VTK format. Note that Mesh::PrintVTK
+       must be called first. The parameter ref > 0 must match the one used in
        Mesh::PrintVTK. */
    void SaveVTK(std::ostream &out, const std::string &field_name, int ref);
 
+   /** @brief Write the GridFunction in STL format. Note that the mesh dimension
+       must be 2 and that quad elements will be broken into two triangles.*/
    void SaveSTL(std::ostream &out, int TimesToRefine = 1);
 
    /// Destroys grid function.
@@ -633,6 +737,16 @@ class QuadratureFunction : public Vector
     */
    inline void GetElementValues(int idx, Vector &values) const;
 
+   /// Return the quadrature function values at an integration point.
+   /** The result is stored in the Vector @a values as a reference to the
+       global values. */
+   inline void GetElementValues(int idx, const int ip_num, Vector &values);
+
+   /// Return the quadrature function values at an integration point.
+   /** The result is stored in the Vector @a values as a copy to the
+       global values. */
+   inline void GetElementValues(int idx, const int ip_num, Vector &values) const;
+
    /// Return all values associated with mesh element @a idx in a DenseMatrix.
    /** The result is stored in the DenseMatrix @a values as a reference to the
        global values.
@@ -737,6 +851,25 @@ inline void QuadratureFunction::GetElementValues(int idx, Vector &values) const
    }
 }
 
+inline void QuadratureFunction::GetElementValues(int idx, const int ip_num,
+                                                 Vector &values)
+{
+   const int first = (qspace->element_offsets[idx] + ip_num) * vdim;
+   values.NewDataAndSize(data + first, vdim); // reference to the global values
+}
+
+inline void QuadratureFunction::GetElementValues(int idx, const int ip_num,
+                                                 Vector &values) const
+{
+   const int first = (qspace->element_offsets[idx] + ip_num) * vdim;
+   values.SetSize(vdim); // the vdim entries are copied, not aliased
+   const double *src = data + first;
+   for (int c = 0; c < vdim; c++)
+   {
+      values(c) = src[c];
+   }
+}
+
 inline void QuadratureFunction::GetElementValues(int idx, DenseMatrix &values)
 {
    const int s_offset = qspace->element_offsets[idx];
diff --git a/fem/gslib.cpp b/fem/gslib.cpp
index 6909c806c19..340ea71c681 100644
--- a/fem/gslib.cpp
+++ b/fem/gslib.cpp
@@ -29,12 +29,14 @@ namespace mfem
 {
 
 FindPointsGSLIB::FindPointsGSLIB()
-   : mesh(NULL), ir_simplex(NULL), gsl_mesh(), fdata2D(NULL), fdata3D(NULL),
-     dim(-1)
+   : mesh(NULL), ir_simplex(NULL), fdata2D(NULL), fdata3D(NULL),
+     dim(-1), gsl_mesh(), gsl_ref(), gsl_dist(), setupflag(false)
 {
    gsl_comm = new comm;
 #ifdef MFEM_USE_MPI
-   MPI_Init(NULL, NULL);
+   int initialized;
+   MPI_Initialized(&initialized);
+   if (!initialized) { MPI_Init(NULL, NULL); }
    MPI_Comm comm = MPI_COMM_WORLD;;
    comm_init(gsl_comm, comm);
 #else
@@ -50,28 +52,29 @@ FindPointsGSLIB::~FindPointsGSLIB()
 
 #ifdef MFEM_USE_MPI
 FindPointsGSLIB::FindPointsGSLIB(MPI_Comm _comm)
-   : mesh(NULL), ir_simplex(NULL), gsl_mesh(), fdata2D(NULL), fdata3D(NULL),
-     dim(-1)
+   : mesh(NULL), ir_simplex(NULL), fdata2D(NULL), fdata3D(NULL),
+     dim(-1), gsl_mesh(), gsl_ref(), gsl_dist(), setupflag(false)
 {
    gsl_comm = new comm;
    comm_init(gsl_comm, _comm);
 }
 #endif
 
-void FindPointsGSLIB::Setup(Mesh &m, double bb_t, double newt_tol, int npt_max)
+void FindPointsGSLIB::Setup(Mesh &m, const double bb_t, const double newt_tol,
+                            const int npt_max)
 {
    MFEM_VERIFY(m.GetNodes() != NULL, "Mesh nodes are required.");
    MFEM_VERIFY(m.GetNumGeometries(m.Dimension()) == 1,
                "Mixed meshes are not currently supported in FindPointsGSLIB.");
 
+   // call FreeData if FindPointsGSLIB::Setup has been called already
+   if (setupflag) { FreeData(); }
+
    mesh = &m;
    dim  = mesh->Dimension();
    const FiniteElement *fe = mesh->GetNodalFESpace()->GetFE(0);
    unsigned dof1D = fe->GetOrder() + 1;
-   int NE      = mesh->GetNE(),
-       dof_cnt = fe->GetDof(),
-       pts_cnt = NE * dof_cnt,
-       gt      = fe->GetGeomType();
+   const int gt   = fe->GetGeomType();
 
    if (gt == Geometry::TRIANGLE || gt == Geometry::TETRAHEDRON ||
        gt == Geometry::PRISM)
@@ -87,8 +90,8 @@ void FindPointsGSLIB::Setup(Mesh &m, double bb_t, double newt_tol, int npt_max)
       MFEM_ABORT("Element type not currently supported in FindPointsGSLIB.");
    }
 
-   pts_cnt = gsl_mesh.Size()/dim;
-   int NEtot = pts_cnt/(int)pow(dof1D, dim);
+   const int pts_cnt = gsl_mesh.Size()/dim,
+             NEtot = pts_cnt/(int)pow(dof1D, dim);
 
    if (dim == 2)
    {
@@ -107,6 +110,7 @@ void FindPointsGSLIB::Setup(Mesh &m, double bb_t, double newt_tol, int npt_max)
       fdata3D = findpts_setup_3(gsl_comm, elx, nr, NEtot, mr, bb_t,
                                 pts_cnt, pts_cnt, npt_max, newt_tol);
    }
+   setupflag = true;
 }
 
 void FindPointsGSLIB::FindPoints(const Vector &point_pos,
@@ -115,6 +119,7 @@ void FindPointsGSLIB::FindPoints(const Vector &point_pos,
                                  Array<unsigned int> &elem_ids,
                                  Vector &ref_pos, Vector &dist)
 {
+   MFEM_VERIFY(setupflag, "Use FindPointsGSLIB::Setup before finding points.");
    const int points_cnt = point_pos.Size() / dim;
    if (dim == 2)
    {
@@ -150,36 +155,92 @@ void FindPointsGSLIB::FindPoints(const Vector &point_pos,
    }
 }
 
+void FindPointsGSLIB::FindPoints(const Vector &point_pos)
+{
+   MFEM_VERIFY(setupflag, "Use FindPointsGSLIB::Setup before finding points.");
+   const int points_cnt = point_pos.Size() / dim; // dim is assigned by Setup
+   gsl_code.SetSize(points_cnt);
+   gsl_proc.SetSize(points_cnt);
+   gsl_elem.SetSize(points_cnt);
+   gsl_ref.SetSize(points_cnt * dim);
+   gsl_dist.SetSize(points_cnt);
+   FindPoints(point_pos, gsl_code, gsl_proc, gsl_elem, gsl_ref, gsl_dist);
+}
+
+void FindPointsGSLIB::FindPoints(Mesh &m, const Vector &point_pos,
+                                 const double bb_t, const double newt_tol,
+                                 const int npt_max)
+{
+   // Setup is skipped only when it has already been run for this same Mesh
+   // object; otherwise it is (re)run with the given search parameters.
+   const bool ready = setupflag && (mesh == &m);
+   if (!ready) { Setup(m, bb_t, newt_tol, npt_max); }
+   FindPoints(point_pos);
+}
+
 void FindPointsGSLIB::Interpolate(Array<unsigned int> &codes,
                                   Array<unsigned int> &proc_ids,
                                   Array<unsigned int> &elem_ids,
                                   Vector &ref_pos, const GridFunction &field_in,
                                   Vector &field_out)
 {
+   MFEM_VERIFY(setupflag, "Use FindPointsGSLIB::Setup before interpolating.");
+   FiniteElementSpace ind_fes(mesh, field_in.FESpace()->FEColl());
+   GridFunction field_in_scalar(&ind_fes);
    Vector node_vals;
-   GetNodeValues(field_in, node_vals);
 
-   const int points_cnt = ref_pos.Size() / dim;
-   if (dim==2)
-   {
-      findpts_eval_2(field_out.GetData(), sizeof(double),
-                     codes.GetData(), sizeof(unsigned int),
-                     proc_ids.GetData(), sizeof(unsigned int),
-                     elem_ids.GetData(), sizeof(unsigned int),
-                     ref_pos.GetData(), sizeof(double) * dim,
-                     points_cnt, node_vals.GetData(), fdata2D);
-   }
-   else
+   const int ncomp      = field_in.FESpace()->GetVDim(),
+             points_fld = field_in.Size() / ncomp,
+             points_cnt = codes.Size();
+   // Component i is the i-th contiguous block of field_in and of field_out.
+   for (int i = 0; i < ncomp; i++)
    {
-      findpts_eval_3(field_out.GetData(), sizeof(double),
-                     codes.GetData(), sizeof(unsigned int),
-                     proc_ids.GetData(), sizeof(unsigned int),
-                     elem_ids.GetData(), sizeof(unsigned int),
-                     ref_pos.GetData(), sizeof(double) * dim,
-                     points_cnt, node_vals.GetData(), fdata3D);
+      const int dataptrin  = i*points_fld,
+                dataptrout = i*points_cnt;
+      field_in_scalar.NewDataAndSize(field_in.GetData()+dataptrin, points_fld);
+      GetNodeValues(field_in_scalar, node_vals);
+
+      if (dim==2)
+      {
+         findpts_eval_2(field_out.GetData()+dataptrout, sizeof(double),
+                        codes.GetData(),       sizeof(unsigned int),
+                        proc_ids.GetData(),    sizeof(unsigned int),
+                        elem_ids.GetData(),    sizeof(unsigned int),
+                        ref_pos.GetData(),     sizeof(double) * dim,
+                        points_cnt, node_vals.GetData(), fdata2D);
+      }
+      else
+      {
+         findpts_eval_3(field_out.GetData()+dataptrout, sizeof(double),
+                        codes.GetData(),       sizeof(unsigned int),
+                        proc_ids.GetData(),    sizeof(unsigned int),
+                        elem_ids.GetData(),    sizeof(unsigned int),
+                        ref_pos.GetData(),     sizeof(double) * dim,
+                        points_cnt, node_vals.GetData(), fdata3D);
+      }
    }
 }
 
+void FindPointsGSLIB::Interpolate(const GridFunction &field_in,
+                                  Vector &field_out)
+{
+   Interpolate(gsl_code, gsl_proc, gsl_elem, gsl_ref, field_in, field_out);
+}
+
+void FindPointsGSLIB::Interpolate(const Vector &point_pos,
+                                  const GridFunction &field_in, Vector &field_out)
+{
+   FindPoints(point_pos); // search results are kept in the gsl_* members
+   Interpolate(field_in, field_out);
+}
+
+void FindPointsGSLIB::Interpolate(Mesh &m, const Vector &point_pos,
+                                  const GridFunction &field_in, Vector &field_out)
+{
+   FindPoints(m, point_pos); // search results are kept in the gsl_* members
+   Interpolate(field_in, field_out);
+}
+
 void FindPointsGSLIB::FreeData()
 {
    if (dim == 2)
@@ -190,7 +251,13 @@ void FindPointsGSLIB::FreeData()
    {
       findpts_free_3(fdata3D);
    }
+   setupflag = false;
+   gsl_code.DeleteAll();
+   gsl_proc.DeleteAll();
+   gsl_elem.DeleteAll();
    gsl_mesh.Destroy();
+   gsl_ref.Destroy();
+   gsl_dist.Destroy();
 }
 
 void FindPointsGSLIB::GetNodeValues(const GridFunction &gf_in,
@@ -292,7 +359,7 @@ void FindPointsGSLIB::GetSimplexNodalCoordinates()
    const GridFunction *nodes = mesh->GetNodes();
    Mesh *meshsplit           = NULL;
    const int NE              = mesh->GetNE();
-   int NEsplit;
+   int NEsplit = -1;
 
    // Split the reference element into a reference submesh of quads or hexes.
    if (gt == Geometry::TRIANGLE)
@@ -386,6 +453,7 @@ void FindPointsGSLIB::GetSimplexNodalCoordinates()
       }
       meshsplit->FinalizeHexMesh(1, 1, true);
    }
+   else { MFEM_ABORT("Unsupported geometry type."); }
 
    // Curve the reference submesh.
    H1_FECollection fec(fe->GetOrder(), dim);
diff --git a/fem/gslib.hpp b/fem/gslib.hpp
index 2f043e3b86d..806f59bda44 100644
--- a/fem/gslib.hpp
+++ b/fem/gslib.hpp
@@ -29,10 +29,12 @@ class FindPointsGSLIB
 protected:
    Mesh *mesh;
    IntegrationRule *ir_simplex;
-   Vector gsl_mesh;
    struct findpts_data_2 *fdata2D;
    struct findpts_data_3 *fdata3D;
    int dim;
+   Array<unsigned int> gsl_code, gsl_proc, gsl_elem;
+   Vector gsl_mesh, gsl_ref, gsl_dist;
+   bool setupflag;
 
    struct comm *gsl_comm;
 
@@ -59,7 +61,8 @@ class FindPointsGSLIB
        @param[in] newt_tol  Newton tolerance for the gslib search methods.
        @param[in] npt_max   Number of points for simultaneous iteration. This
                             alters performance and memory footprint. */
-   void Setup(Mesh &m, double bb_t, double newt_tol, int npt_max);
+   void Setup(Mesh &m, const double bb_t = 0.1, const double newt_tol = 1.0e-12,
+              const int npt_max = 256);
 
    /** Searches positions given in physical space by @a point_pos. All output
        Arrays and Vectors are expected to have the correct size.
@@ -73,11 +76,15 @@ class FindPointsGSLIB
        @param[out] ref_pos    Reference coordinates of the found point. Ordered
                               by vdim (XYZ,XYZ,XYZ...).
                               Note: the gslib reference frame is [-1,1].
-       @param[out] dist       Distance between the seeked and the found point
+       @param[out] dist       Distance between the sought and the found point
                               in physical space. */
    void FindPoints(const Vector &point_pos, Array<unsigned int> &codes,
                    Array<unsigned int> &proc_ids, Array<unsigned int> &elem_ids,
                    Vector &ref_pos, Vector &dist);
+   void FindPoints(const Vector &point_pos);
+   /// Setup FindPoints and search positions
+   void FindPoints(Mesh &m, const Vector &point_pos, const double bb_t = 0.1,
+                   const double newt_tol = 1.0e-12,  const int npt_max = 256);
 
    /** Interpolation of field values at prescribed reference space positions.
 
@@ -96,11 +103,31 @@ class FindPointsGSLIB
    void Interpolate(Array<unsigned int> &codes, Array<unsigned int> &proc_ids,
                     Array<unsigned int> &elem_ids, Vector &ref_pos,
                     const GridFunction &field_in, Vector &field_out);
+   void Interpolate(const GridFunction &field_in, Vector &field_out);
+   /** Search positions and interpolate */
+   void Interpolate(const Vector &point_pos, const GridFunction &field_in,
+                    Vector &field_out);
+   /** Setup FindPoints, search positions and interpolate */
+   void Interpolate(Mesh &m, const Vector &point_pos,
+                    const GridFunction &field_in, Vector &field_out);
 
    /** Cleans up memory allocated internally by gslib.
        Note that in parallel, this must be called before MPI_Finalize(), as
        it calls MPI_Comm_free() for internal gslib communicators. */
    void FreeData();
+
+   /// Return code for each point searched by FindPoints: inside element (0), on
+   /// element boundary (1), or not found (2).
+   const Array<unsigned int> &GetCode() const { return gsl_code; }
+   /// Return element number for each point found by FindPoints.
+   const Array<unsigned int> &GetElem() const { return gsl_elem; }
+   /// Return MPI rank on which each point was found by FindPoints.
+   const Array<unsigned int> &GetProc() const { return gsl_proc; }
+   /// Return reference coordinates for each point found by FindPoints.
+   const Vector &GetReferencePosition() const { return gsl_ref;  }
+   /// Return the distance between the sought and the found point in physical
+   /// space, for each point found by FindPoints.
+   const Vector &GetDist()              const { return gsl_dist; }
 };
 
 } // namespace mfem
diff --git a/fem/libceed/ceed.hpp b/fem/libceed/ceed.hpp
index 53b59bd727d..9fd97a114ab 100644
--- a/fem/libceed/ceed.hpp
+++ b/fem/libceed/ceed.hpp
@@ -40,7 +40,7 @@ struct CeedConstCoeff
 
 struct CeedGridCoeff
 {
-   GridFunction* coeff;
+   const GridFunction* coeff;
    CeedBasis basis;
    CeedElemRestriction restr;
    CeedVector coeffVector;
diff --git a/fem/linearform.hpp b/fem/linearform.hpp
index f2d050ff29c..1ab80eb3fdf 100644
--- a/fem/linearform.hpp
+++ b/fem/linearform.hpp
@@ -19,7 +19,7 @@
 namespace mfem
 {
 
-/// Class for linear form - Vector with associated FE space and LFIntegrators.
+/// Vector with associated FE space and LinearFormIntegrators.
 class LinearForm : public Vector
 {
 protected:
diff --git a/fem/lininteg.cpp b/fem/lininteg.cpp
index adc9355d662..98dbd6213cd 100644
--- a/fem/lininteg.cpp
+++ b/fem/lininteg.cpp
@@ -307,7 +307,7 @@ void VectorBoundaryLFIntegrator::AssembleRHSElementVect(
    if (ir == NULL)
    {
       int intorder = 2*el.GetOrder();
-      ir = &IntRules.Get(Tr.FaceGeom, intorder);
+      ir = &IntRules.Get(Tr.GetGeometryType(), intorder);
    }
 
    for (int i = 0; i < ir->GetNPoints(); i++)
@@ -316,9 +316,11 @@ void VectorBoundaryLFIntegrator::AssembleRHSElementVect(
       IntegrationPoint eip;
       Tr.Loc1.Transform(ip, eip);
 
-      Tr.Face->SetIntPoint(&ip);
-      Q.Eval(vec, *Tr.Face, ip);
-      vec *= Tr.Face->Weight() * ip.weight;
+      Tr.SetIntPoint(&ip);
+
+      // Use Tr transformation in case Q depends on boundary attribute
+      Q.Eval(vec, Tr, ip);
+      vec *= Tr.Weight() * ip.weight;
       el.CalcShape(eip, shape);
       for (int k = 0; k < vdim; k++)
       {
@@ -510,7 +512,7 @@ void BoundaryFlowIntegrator::AssembleRHSElementVect(
       {
          order++;
       }
-      ir = &IntRules.Get(Tr.FaceGeom, order);
+      ir = &IntRules.Get(Tr.GetGeometryType(), order);
    }
 
    shape.SetSize(ndof);
@@ -524,8 +526,10 @@ void BoundaryFlowIntegrator::AssembleRHSElementVect(
       Tr.Loc1.Transform(ip, eip);
       el.CalcShape(eip, shape);
 
-      Tr.Face->SetIntPoint(&ip);
+      Tr.SetIntPoint(&ip);
 
+      // Use Tr.Elem1 transformation for u so that it matches the coefficient
+      // used with the ConvectionIntegrator and/or the DGTraceIntegrator.
       u->Eval(vu, *Tr.Elem1, eip);
 
       if (dim == 1)
@@ -534,12 +538,12 @@ void BoundaryFlowIntegrator::AssembleRHSElementVect(
       }
       else
       {
-         CalcOrtho(Tr.Face->Jacobian(), nor);
+         CalcOrtho(Tr.Jacobian(), nor);
       }
 
       un = vu * nor;
       w = 0.5*alpha*un - beta*fabs(un);
-      w *= ip.weight*f->Eval(*Tr.Elem1, eip);
+      w *= ip.weight*f->Eval(Tr, ip);
       elvect.Add(w, shape);
    }
 }
@@ -582,7 +586,7 @@ void DGDirichletLFIntegrator::AssembleRHSElementVect(
    {
       // a simple choice for the integration order; is this OK?
       int order = 2*el.GetOrder();
-      ir = &IntRules.Get(Tr.FaceGeom, order);
+      ir = &IntRules.Get(Tr.GetGeometryType(), order);
    }
 
    for (int p = 0; p < ir->GetNPoints(); p++)
@@ -591,33 +595,33 @@ void DGDirichletLFIntegrator::AssembleRHSElementVect(
       IntegrationPoint eip;
 
       Tr.Loc1.Transform(ip, eip);
-      Tr.Face->SetIntPoint(&ip);
+      Tr.SetIntPoint(&ip);
       if (dim == 1)
       {
          nor(0) = 2*eip.x - 1.0;
       }
       else
       {
-         CalcOrtho(Tr.Face->Jacobian(), nor);
+         CalcOrtho(Tr.Jacobian(), nor);
       }
 
       el.CalcShape(eip, shape);
       el.CalcDShape(eip, dshape);
-      Tr.Elem1->SetIntPoint(&eip);
+
       // compute uD through the face transformation
-      w = ip.weight * uD->Eval(*Tr.Face, ip) / Tr.Elem1->Weight();
+      w = ip.weight * uD->Eval(Tr, ip) / Tr.Elem1->Weight();
       if (!MQ)
       {
          if (Q)
          {
-            w *= Q->Eval(*Tr.Elem1, eip);
+            w *= Q->Eval(Tr, ip);
          }
          ni.Set(w, nor);
       }
       else
       {
          nh.Set(w, nor);
-         MQ->Eval(mq, *Tr.Elem1, eip);
+         MQ->Eval(mq, Tr, ip);
          mq.MultTranspose(nh, ni);
       }
       CalcAdjugate(Tr.Elem1->Jacobian(), adjJ);
@@ -676,7 +680,7 @@ void DGElasticityDirichletLFIntegrator::AssembleRHSElementVect(
    if (ir == NULL)
    {
       const int order = 2*el.GetOrder(); // <-----
-      ir = &IntRules.Get(Tr.FaceGeom, order);
+      ir = &IntRules.Get(Tr.GetGeometryType(), order);
    }
 
    for (int pi = 0; pi < ir->GetNPoints(); ++pi)
@@ -684,11 +688,10 @@ void DGElasticityDirichletLFIntegrator::AssembleRHSElementVect(
       const IntegrationPoint &ip = ir->IntPoint(pi);
       IntegrationPoint eip;
       Tr.Loc1.Transform(ip, eip);
-      Tr.Face->SetIntPoint(&ip);
-      Tr.Elem1->SetIntPoint(&eip);
+      Tr.SetIntPoint(&ip);
 
       // Evaluate the Dirichlet b.c. using the face transformation.
-      uD.Eval(u_dir, *Tr.Face, ip);
+      uD.Eval(u_dir, Tr, ip);
 
       el.CalcShape(eip, shape);
       el.CalcDShape(eip, dshape);
@@ -702,7 +705,7 @@ void DGElasticityDirichletLFIntegrator::AssembleRHSElementVect(
       }
       else
       {
-         CalcOrtho(Tr.Face->Jacobian(), nor);
+         CalcOrtho(Tr.Jacobian(), nor);
       }
 
       double wL, wM, jcoef;
@@ -768,4 +771,58 @@ void DGElasticityDirichletLFIntegrator::AssembleRHSElementVect(
    }
 }
 
+void VectorQuadratureLFIntegrator::AssembleRHSElementVect(
+   const FiniteElement &fe, ElementTransformation &Tr, Vector &elvect)
+{
+   // The quadrature rule is taken from the coefficient's QuadratureFunction.
+   const IntegrationRule &rule =
+      vqfc.GetQuadFunction().GetSpace()->GetElementIntRule(Tr.ElementNo);
+   const int num_comp = vqfc.GetVDim();
+   const int num_dofs = fe.GetDof();
+   Vector shape(num_dofs);
+   Vector fval(num_comp);
+   elvect.SetSize(num_comp * num_dofs);
+   elvect = 0.0;
+   for (int q = 0; q < rule.GetNPoints(); q++)
+   {
+      const IntegrationPoint &ip = rule.IntPoint(q);
+      Tr.SetIntPoint(&ip);
+      const double w = Tr.Weight() * ip.weight;
+      vqfc.Eval(fval, Tr, ip);
+      fe.CalcShape(ip, shape);
+      // Entry for (dof d, component c) is stored at index d + c * num_dofs.
+      for (int c = 0; c < num_comp; c++)
+      {
+         for (int d = 0; d < num_dofs; d++)
+         {
+            elvect(d + c * num_dofs) += w * shape(d) * fval(c);
+         }
+      }
+   }
+}
+
+void QuadratureLFIntegrator::AssembleRHSElementVect(const FiniteElement &fe,
+                                                    ElementTransformation &Tr,
+                                                    Vector &elvect)
+{
+   // The quadrature rule is taken from the coefficient's QuadratureFunction.
+   const IntegrationRule &rule =
+      qfc.GetQuadFunction().GetSpace()->GetElementIntRule(Tr.ElementNo);
+
+   const int num_dofs = fe.GetDof();
+   Vector shape(num_dofs);
+   elvect.SetSize(num_dofs);
+   elvect = 0.0;
+   for (int q = 0; q < rule.GetNPoints(); q++)
+   {
+      const IntegrationPoint &ip = rule.IntPoint(q);
+      Tr.SetIntPoint(&ip);
+      const double w = Tr.Weight() * ip.weight;
+      const double fval = qfc.Eval(Tr, ip);
+      fe.CalcShape(ip, shape);
+      shape *= (w * fval);
+      elvect += shape;
+   }
+}
+
 }
diff --git a/fem/lininteg.hpp b/fem/lininteg.hpp
index 1b8c2e19cc2..1a203f0e836 100644
--- a/fem/lininteg.hpp
+++ b/fem/lininteg.hpp
@@ -36,7 +36,7 @@ class LinearFormIntegrator
                                        FaceElementTransformations &Tr,
                                        Vector &elvect);
 
-   void SetIntRule(const IntegrationRule *ir) { IntRule = ir; }
+   virtual void SetIntRule(const IntegrationRule *ir) { IntRule = ir; }
    const IntegrationRule* GetIntRule() { return IntRule; }
 
    virtual ~LinearFormIntegrator() { }
@@ -426,6 +426,69 @@ class DGElasticityDirichletLFIntegrator : public LinearFormIntegrator
                                        Vector &elvect);
 };
 
+/** Class for domain integration of L(v) := (f, v), where f=(f1,...,fn) and
+    v=(v1,...,vn), that makes use of a VectorQuadratureFunctionCoefficient.
+    The optional @a ir is ignored: the QuadratureFunction's rules are used. */
+class VectorQuadratureLFIntegrator : public LinearFormIntegrator
+{
+private:
+   VectorQuadratureFunctionCoefficient &vqfc;
+
+public:
+   VectorQuadratureLFIntegrator(VectorQuadratureFunctionCoefficient &vqfc,
+                                const IntegrationRule *ir = NULL)
+      : LinearFormIntegrator(ir), vqfc(vqfc)
+   {
+      if (ir)
+      {
+         MFEM_WARNING("Integration rule not used in this class. "
+                      "The QuadratureFunction integration rules are used instead");
+      }
+   }
+
+   using LinearFormIntegrator::AssembleRHSElementVect;
+   virtual void AssembleRHSElementVect(const FiniteElement &fe,
+                                       ElementTransformation &Tr,
+                                       Vector &elvect);
+
+   virtual void SetIntRule(const IntegrationRule *ir)
+   {
+      MFEM_WARNING("Integration rule not used in this class. "
+                   "The QuadratureFunction integration rules are used instead");
+   }
+};
+
+/** Class for domain integration L(v) := (f, v) that makes use of a
+    QuadratureFunctionCoefficient; the optional @a ir argument is ignored. */
+class QuadratureLFIntegrator : public LinearFormIntegrator
+{
+private:
+   QuadratureFunctionCoefficient &qfc;
+
+public:
+   QuadratureLFIntegrator(QuadratureFunctionCoefficient &qfc,
+                          const IntegrationRule *ir = NULL)
+      : LinearFormIntegrator(ir), qfc(qfc)
+   {
+      if (ir)
+      {
+         MFEM_WARNING("Integration rule not used in this class. "
+                      "The QuadratureFunction integration rules are used instead");
+      }
+   }
+
+   using LinearFormIntegrator::AssembleRHSElementVect;
+   virtual void AssembleRHSElementVect(const FiniteElement &fe,
+                                       ElementTransformation &Tr,
+                                       Vector &elvect);
+
+   virtual void SetIntRule(const IntegrationRule *ir)
+   {
+      MFEM_WARNING("Integration rule not used in this class. "
+                   "The QuadratureFunction integration rules are used instead");
+   }
+};
+
 }
 
 #endif
diff --git a/fem/nonlininteg.hpp b/fem/nonlininteg.hpp
index b6973ade7a3..5c9975455d4 100644
--- a/fem/nonlininteg.hpp
+++ b/fem/nonlininteg.hpp
@@ -20,10 +20,9 @@
 namespace mfem
 {
 
-/** The abstract base class NonlinearFormIntegrator is used to express the
-    local action of a general nonlinear finite element operator. In addition
-    it may provide the capability to assemble the local gradient operator
-    and to compute the local energy. */
+/** @brief This class is used to express the local action of a general nonlinear
+    finite element operator. In addition it may provide the capability to
+    assemble the local gradient operator and to compute the local energy. */
 class NonlinearFormIntegrator
 {
 protected:
diff --git a/fem/pbilinearform.cpp b/fem/pbilinearform.cpp
index 88bf107fa36..386c1bb9308 100644
--- a/fem/pbilinearform.cpp
+++ b/fem/pbilinearform.cpp
@@ -283,7 +283,14 @@ const
    }
 
    X.Distribute(&x);
-   mat->Mult(X, Y);
+   if (ext)
+   {
+      ext->Mult(X, Y);
+   }
+   else
+   {
+      mat->Mult(X, Y);
+   }
    pfes->Dof_TrueDof_Matrix()->MultTranspose(a, Y, 1.0, y);
 }
 
diff --git a/fem/pfespace.hpp b/fem/pfespace.hpp
index 7b030d622a9..1afe0ab8183 100644
--- a/fem/pfespace.hpp
+++ b/fem/pfespace.hpp
@@ -376,7 +376,7 @@ class ParFiniteElementSpace : public FiniteElementSpace
 
    void PrintPartitionStats();
 
-   // Obsolete, kept for backward compatibility
+   /// Obsolete, kept for backward compatibility
    int TrueVSize() const { return ltdof_size; }
 };
 
diff --git a/fem/pgridfunc.cpp b/fem/pgridfunc.cpp
index 079a04ad4eb..dac6728bf26 100644
--- a/fem/pgridfunc.cpp
+++ b/fem/pgridfunc.cpp
@@ -232,7 +232,8 @@ void ParGridFunction::ExchangeFaceNbrData()
    auto d_send_data = send_data.Write();
    MFEM_FORALL(i, send_data.Size(),
    {
-      d_send_data[i] = d_data[d_send_ldof[i]];
+      const int ldof = d_send_ldof[i];
+      d_send_data[i] = d_data[ldof >= 0 ? ldof : -1-ldof];
    });
 
    bool mpi_gpu_aware = Device::GetGPUAwareMPI();
diff --git a/fem/restriction.cpp b/fem/restriction.cpp
index b0d42c1755c..f618c44c7ab 100644
--- a/fem/restriction.cpp
+++ b/fem/restriction.cpp
@@ -168,6 +168,27 @@ void ElementRestriction::Mult(const Vector& x, Vector& y) const
    });
 }
 
+void ElementRestriction::MultUnsigned(const Vector& x, Vector& y) const
+{
+   // Assumes all elements have the same number of dofs (nd per element).
+   const int nd = dof;
+   const int vd = vdim;
+   const bool t = byvdim;
+   auto d_x = Reshape(x.Read(), t?vd:ndofs, t?ndofs:vd);
+   auto d_y = Reshape(y.Write(), nd, vd, ne);
+   auto d_gatherMap = gatherMap.Read();
+   // Gather x into y; a negative map entry gid is read as index -1-gid.
+   MFEM_FORALL(i, dof*ne,
+   {
+      const int gid = d_gatherMap[i];
+      const int j = gid >= 0 ? gid : -1-gid;
+      for (int c = 0; c < vd; ++c)
+      {
+         d_y(i % nd, c, i / nd) = d_x(t?c:j, t?j:c);
+      }
+   });
+}
+
 void ElementRestriction::MultTranspose(const Vector& x, Vector& y) const
 {
    // Assumes all elements have the same number of dofs
@@ -966,27 +987,51 @@ void L2FaceRestriction::MultTranspose(const Vector& x, Vector& y) const
    const int dofs = nfdofs;
    auto d_offsets = offsets.Read();
    auto d_indices = gather_indices.Read();
-   auto d_x = Reshape(x.Read(), nd, vd, 2, nf);
-   auto d_y = Reshape(y.Write(), t?vd:ndofs, t?ndofs:vd);
-   MFEM_FORALL(i, ndofs,
+
+   if (m == L2FaceValues::DoubleValued)
    {
-      const int offset = d_offsets[i];
-      const int nextOffset = d_offsets[i + 1];
-      for (int c = 0; c < vd; ++c)
+      auto d_x = Reshape(x.Read(), nd, vd, 2, nf);
+      auto d_y = Reshape(y.Write(), t?vd:ndofs, t?ndofs:vd);
+      MFEM_FORALL(i, ndofs,
       {
-         double dofValue = 0;
-         for (int j = offset; j < nextOffset; ++j)
+         const int offset = d_offsets[i];
+         const int nextOffset = d_offsets[i + 1];
+         for (int c = 0; c < vd; ++c)
          {
-            int idx_j = d_indices[j];
-            bool isE1 = idx_j < dofs;
-            idx_j = isE1 ? idx_j : idx_j - dofs;
-            dofValue +=  isE1 ?
-            d_x(idx_j % nd, c, 0, idx_j / nd)
-            :d_x(idx_j % nd, c, 1, idx_j / nd);
+            double dofValue = 0;
+            for (int j = offset; j < nextOffset; ++j)
+            {
+               int idx_j = d_indices[j];
+               bool isE1 = idx_j < dofs;
+               idx_j = isE1 ? idx_j : idx_j - dofs;
+               dofValue +=  isE1 ?
+               d_x(idx_j % nd, c, 0, idx_j / nd)
+               :d_x(idx_j % nd, c, 1, idx_j / nd);
+            }
+            d_y(t?c:i,t?i:c) += dofValue;
          }
-         d_y(t?c:i,t?i:c) += dofValue;
-      }
-   });
+      });
+   }
+   else
+   {
+      auto d_x = Reshape(x.Read(), nd, vd, nf);
+      auto d_y = Reshape(y.Write(), t?vd:ndofs, t?ndofs:vd);
+      MFEM_FORALL(i, ndofs,
+      {
+         const int offset = d_offsets[i];
+         const int nextOffset = d_offsets[i + 1];
+         for (int c = 0; c < vd; ++c)
+         {
+            double dofValue = 0;
+            for (int j = offset; j < nextOffset; ++j)
+            {
+               int idx_j = d_indices[j];
+               dofValue +=  d_x(idx_j % nd, c, idx_j / nd);
+            }
+            d_y(t?c:i,t?i:c) += dofValue;
+         }
+      });
+   }
 }
 
 int ToLexOrdering(const int dim, const int face_id, const int size1d,
diff --git a/fem/restriction.hpp b/fem/restriction.hpp
index acc4a541ba0..14f568a8ab2 100644
--- a/fem/restriction.hpp
+++ b/fem/restriction.hpp
@@ -47,6 +47,8 @@ class ElementRestriction : public Operator
    void Mult(const Vector &x, Vector &y) const;
    void MultTranspose(const Vector &x, Vector &y) const;
 
+   /// Compute Mult without applying signs based on DOF orientations.
+   void MultUnsigned(const Vector &x, Vector &y) const;
    /// Compute MultTranspose without applying signs based on DOF orientations.
    void MultTransposeUnsigned(const Vector &x, Vector &y) const;
 
diff --git a/fem/tbilinearform.hpp b/fem/tbilinearform.hpp
index 7f31a57fbdd..1fa2fd896e4 100644
--- a/fem/tbilinearform.hpp
+++ b/fem/tbilinearform.hpp
@@ -13,6 +13,7 @@
 #define MFEM_TEMPLATE_BILINEAR_FORM
 
 #include "../config/tconfig.hpp"
+#include "../linalg/simd.hpp"
 #include "../linalg/ttensor.hpp"
 #include "bilinearform.hpp"
 #include "tevaluator.hpp"
@@ -23,16 +24,32 @@
 namespace mfem
 {
 
-// Templated bilinear form class, cf. bilinearform.?pp
+/** @brief Templated bilinear form class, cf. bilinearform.?pp
 
 // complex_t - sol dof data type
+    @tparam meshType typically TMesh, which is templated on FE type
 // real_t - mesh nodes, sol basis, mesh basis data type
+    @tparam solFESpace e.g. H1_FiniteElementSpace
+    @tparam IR integration rule, typically TIntegrationRule, which is further
+               templated on element geometry
+    @tparam IntegratorType typically a TIntegrator, which is templated on a
+                           kernel, e.g. TDiffusionKernel or TMassKernel. This
+                           describes what actual problem you solve.
+    @tparam solVecLayout_t describes how degrees of freedom are laid out,
+                           scalar or vector, column/row major, etc.
+    @tparam complex_t data type for solution dofs
+    @tparam real_t data type for mesh nodes, solution basis, and mesh basis
+*/
 template <typename meshType, typename solFESpace,
           typename IR, typename IntegratorType,
           typename solVecLayout_t = ScalarLayout,
-          typename complex_t = double, typename real_t = double>
+          typename complex_t = double, typename real_t = double,
+          typename impl_traits_t = AutoSIMDTraits<complex_t,real_t> >
 class TBilinearForm : public Operator
 {
+public:
+   typedef impl_traits_t impl_traits_type;
+
 protected:
    typedef complex_t complex_type;
    typedef real_t    real_type;
@@ -48,26 +65,47 @@ class TBilinearForm : public Operator
    static const int dofs = solFE_type::dofs;
    static const int vdim = solVecLayout_t::vec_dim;
    static const int qpts = IR::qpts;
+   static const int AB   = impl_traits_t::align_bytes;
+   static const int SS   = impl_traits_t::simd_size;
+   static const int BE   = impl_traits_t::batch_size;
+   static const int TE   = SS*BE;
+
+   typedef typename impl_traits_t::vcomplex_t vcomplex_t;
+   typedef typename impl_traits_t::vreal_t    vreal_t;
 
+   /// @name IntegratorType defines several internal types
+   ///@{
    typedef IntegratorType integ_t;
+   /// coeff_t might be TConstantCoefficient or TFunctionCoefficient, for example
    typedef typename integ_t::coefficient_type coeff_t;
-   typedef typename integ_t::template kernel<sdim,dim,complex_t>::type kernel_t;
+   /// kernel_t may be TDiffusionKernel or TMassKernel
+   typedef typename integ_t::template kernel<sdim,dim,vcomplex_t>::type kernel_t;
+   /// p_assembled_t is something like a TTensor or TMatrix for partial assembly
    typedef typename kernel_t::template p_asm_data<qpts>::type p_assembled_t;
+   /// f_assembled_t is something like a TTensor or TMatrix for full assembly
    typedef typename kernel_t::template f_asm_data<qpts>::type f_assembled_t;
+   ///@}
+
+   typedef typename kernel_t::template
+   CoefficientEval<IR,coeff_t,impl_traits_t>::Type coeff_eval_t;
+
 
    typedef TElementTransformation<meshType,IR,real_t> Trans_t;
-   template <int NE> struct T_result
+   struct T_result
    {
       static const int EvalOps =
          Trans_t::template Get<coeff_t,kernel_t>::EvalOps;
-      typedef typename Trans_t::template Result<EvalOps,NE> Type;
+      typedef typename Trans_t::template Result<EvalOps,impl_traits_t> Type;
    };
 
    typedef FieldEvaluator<solFESpace,solVecLayout_t,IR,
            complex_t,real_t> solFieldEval;
-   template <int BE> struct S_spec
+
+   /** @brief Contains matrix sizes, type of kernel (ElementMatrix is templated
+       on a kernel, e.g. ElementMatrix::Compute may be AssembleGradGrad()). */
+   struct S_spec
    {
-      typedef typename solFieldEval::template Spec<kernel_t,BE> Spec;
+      typedef typename solFieldEval::template Spec<kernel_t,impl_traits_t> Spec;
       typedef typename Spec::DataType DataType;
       typedef typename Spec::ElementMatrix ElementMatrix;
    };
@@ -86,7 +124,7 @@ class TBilinearForm : public Operator
 
    coeff_t coeff;
 
-   p_assembled_t *assembled_data;
+   Memory<p_assembled_t> assembled_data;
 
    const FiniteElementSpace &in_fes;
 
@@ -101,13 +139,17 @@ class TBilinearForm : public Operator
         solVecLayout(sol_fes),
         int_rule(),
         coeff(integ.coeff),
-        assembled_data(NULL),
+        assembled_data(),
         in_fes(sol_fes)
-   { }
+   {
+      assembled_data.Reset(AB == 64 ? MemoryType::HOST_64 :
+                           AB == 32 ? MemoryType::HOST_32 :
+                           MemoryType::HOST);
+   }
 
    virtual ~TBilinearForm()
    {
-      delete [] assembled_data;
+      assembled_data.Delete();
    }
 
    /// Get the input finite element space prolongation matrix
@@ -119,10 +161,9 @@ class TBilinearForm : public Operator
 
    virtual void Mult(const Vector &x, Vector &y) const
    {
-      if (assembled_data)
+      if (!assembled_data.Empty())
       {
-         const int num_elem = 1;
-         MultAssembled<num_elem>(x, y);
+         MultAssembled(x, y);
       }
       else
       {
@@ -135,10 +176,6 @@ class TBilinearForm : public Operator
    {
       y = 0.0;
 
-      const int BE = 1; // batch-size of elements
-      typedef typename kernel_t::template
-      CoefficientEval<IR,coeff_t,BE>::Type coeff_eval_t;
-
       // For better performance, create stack copies of solFES, and solEval
       // inside 'solFEval'. The element-transformation 'T' also copies the
       // meshFES, meshEval, etc internally.
@@ -149,49 +186,49 @@ class TBilinearForm : public Operator
       coeff_eval_t wQ(int_rule, coeff);
 
       const int NE = mesh.GetNE();
-      for (int el = 0; el < NE; el++)
+      for (int el = 0; el < NE; el += TE)
       {
 #if 0
-         typename S_spec<BE>::DataType R;
+         typename S_spec::DataType R;
          solFEval.Eval(el, R);
 
-         typename T_result<BE>::Type F;
+         typename T_result::Type F;
          T.Eval(el, F);
 #else
-         typename T_result<BE>::Type F;
+         typename T_result::Type F;
          T.Eval(el, F);
 
-         typename S_spec<BE>::DataType R;
+         typename S_spec::DataType R;
          solFEval.Eval(el, R);
 #endif
 
          typename coeff_eval_t::result_t res;
          wQ.Eval(F, res);
 
-         kernel_t::Action(0, F, wQ, res, R);
+         for (int k = 0; k < BE; k++)
+         {
+            kernel_t::Action(k, F, wQ, res, R);
+         }
 
          solFEval.template Assemble<true>(R);
       }
    }
 
-   // Partial assembly of quadrature point data
+   /// Partial assembly of quadrature point data
    void Assemble()
    {
-      const int BE = 1; // batch-size of elements
-      typedef typename kernel_t::template
-      CoefficientEval<IR,coeff_t,BE>::Type coeff_eval_t;
-
       Trans_t T(mesh, meshEval);
       coeff_eval_t wQ(int_rule, coeff);
 
       const int NE = mesh.GetNE();
-      if (!assembled_data)
+      if (assembled_data.Empty())
       {
-         assembled_data = new p_assembled_t[NE];
+         const int size = ((NE+TE-1)/TE)*BE;
+         assembled_data.New(size, assembled_data.GetMemoryType());
       }
-      for (int el = 0; el < NE; el++) // BE == 1
+      for (int el = 0; el < NE; el += TE)
       {
-         typename T_result<BE>::Type F;
+         typename T_result::Type F;
          T.Eval(el, F);
 
          typename coeff_eval_t::result_t res;
@@ -199,28 +236,26 @@ class TBilinearForm : public Operator
 
          for (int k = 0; k < BE; k++)
          {
-            kernel_t::Assemble(k, F, wQ, res, assembled_data[el+k]);
+            kernel_t::Assemble(k, F, wQ, res, assembled_data[el/SS+k]);
          }
       }
    }
 
-   template <int num_elem>
    inline MFEM_ALWAYS_INLINE
    void ElementAddMultAssembled(int el, solFieldEval &solFEval) const
    {
-      typename S_spec<num_elem>::DataType R;
+      typename S_spec::DataType R;
       solFEval.Eval(el, R);
 
-      for (int k = 0; k < num_elem; k++)
+      for (int k = 0; k < BE; k++)
       {
-         kernel_t::MultAssembled(k, assembled_data[el+k], R);
+         kernel_t::MultAssembled(k, assembled_data[el/SS+k], R);
       }
 
       solFEval.template Assemble<true>(R);
    }
 
    // complex_t = double
-   template <int num_elem>
    void MultAssembled(const Vector &x, Vector &y) const
    {
       y = 0.0;
@@ -229,14 +264,9 @@ class TBilinearForm : public Operator
                             x.GetData(), y.GetData());
 
       const int NE = mesh.GetNE();
-      const int bNE = NE-NE%num_elem;
-      for (int el = 0; el < bNE; el += num_elem)
+      for (int el = 0; el < NE; el += TE)
       {
-         ElementAddMultAssembled<num_elem>(el, solFEval);
-      }
-      for (int el = bNE; el < NE; el++)
-      {
-         ElementAddMultAssembled<1>(el, solFEval);
+         ElementAddMultAssembled(el, solFEval);
       }
    }
 
@@ -249,10 +279,10 @@ class TBilinearForm : public Operator
       solVecLayout_type solVecLayout(this->solVecLayout);
       solFESpace solFES(this->solFES);
 
-      TTensor3<dofs,vdim,1,complex_t> xy_dof;
+      TTensor3<dofs,vdim,BE,vcomplex_t> xy_dof;
 
       const int NE = mesh.GetNE();
-      for (int el = 0; el < NE; el++)
+      for (int el = 0; el < NE; el += TE)
       {
          solFES.SetElement(el);
 
@@ -266,98 +296,108 @@ class TBilinearForm : public Operator
    {
       typedef typename meshType::FESpace_type meshFESpace;
       meshFESpace meshFES(mesh.t_fes);
-      typedef TTensor3<meshFE_type::dofs,sdim,1,real_t> lnodes_t;
+      typedef TTensor3<meshFE_type::dofs,sdim,BE,vreal_t> lnodes_t;
 
       const int NE = mesh.GetNE();
-      sNodes.SetSize(lnodes_t::size*NE);
-      real_t *lNodes = sNodes.GetData();
-      for (int el = 0; el < NE; el++)
+      // TODO: How do we make sure that this array is aligned properly, AND the
+      //       compiler knows that it is aligned? => ALIGN_32|ALIGN_64 when ready
+      const int NVE = (NE+TE-1)/TE;
+      vreal_t *vsNodes = new vreal_t[lnodes_t::size*NVE];
+      sNodes.NewDataAndSize(vsNodes[0].vec, (lnodes_t::size*SS)*NVE);
+      sNodes.MakeDataOwner();
+      for (int el = 0; el < NE; el += TE)
       {
          meshFES.SetElement(el);
          meshFES.VectorExtract(mesh.node_layout, mesh.Nodes,
-                               lnodes_t::layout, lNodes);
-         lNodes += lnodes_t::size;
+                               lnodes_t::layout, vsNodes);
+         vsNodes += lnodes_t::size;
       }
    }
 
-   // partial assembly from "serialized" nodes
+   /// Partial assembly from "serialized" nodes
    // real_t = double
    void AssembleFromSerializedNodes(const Vector &sNodes)
    {
-      const int  BE = 1; // batch-size of elements
-      typedef typename kernel_t::template
-      CoefficientEval<IR,coeff_t,BE>::Type coeff_eval_t;
-
-      Trans_t T(this->mesh, this->meshEval);
+      Trans_t T(mesh, meshEval);
       coeff_eval_t wQ(int_rule, coeff);
 
       const int NE = mesh.GetNE();
-      if (!assembled_data)
+      if (assembled_data.Empty())
       {
-         assembled_data = new p_assembled_t[NE];
+         const int size = ((NE+TE-1)/TE)*BE;
+         assembled_data.New(size, assembled_data.GetMemoryType());
       }
-      for (int el = 0; el < NE; el++)
+      const vreal_t *vsNodes = (const vreal_t*)(sNodes.GetData());
+      for (int el = 0; el < NE; el += TE)
       {
-         typename T_result<BE>::Type F;
-         T.EvalSerialized(el, sNodes.GetData(), F);
+         typename T_result::Type F;
+         T.EvalSerialized(el, vsNodes, F);
 
          typename coeff_eval_t::result_t res;
          wQ.Eval(F, res);
 
-         kernel_t::Assemble(0, F, wQ, res, assembled_data[el]);
+         for (int k = 0; k < BE; k++)
+         {
+            kernel_t::Assemble(k, F, wQ, res, assembled_data[el/SS+k]);
+         }
       }
    }
 
    // complex_t = double
    void Serialize(const Vector &x, Vector &sx) const
    {
+      typedef TTensor3<dofs,vdim,BE,vcomplex_t> vdof_data_t;
+
       solVecLayout_t solVecLayout(this->solVecLayout);
-      typedef TTensor3<dofs,vdim,1,complex_t> vdof_data_t;
       solFESpace solFES(this->solFES);
 
       const int NE = mesh.GetNE();
-      sx.SetSize(vdim*dofs*NE);
-      complex_t *loc_sx = sx.GetData();
-      for (int el = 0; el < NE; el++)
+      // TODO: How do we make sure that this array is aligned properly, AND
+      //       the compiler knows that it is aligned? => ALIGN_32|ALIGN_64 when ready
+      const int NVE = (NE+TE-1)/TE;
+      vreal_t *vsx = new vreal_t[vdof_data_t::size*NVE];
+      sx.NewDataAndSize(vsx[0].vec, (vdof_data_t::size*SS)*NVE);
+      sx.MakeDataOwner();
+      for (int el = 0; el < NE; el += TE)
       {
          solFES.SetElement(el);
-         solFES.VectorExtract(solVecLayout, x, vdof_data_t::layout, loc_sx);
-         loc_sx += vdim*dofs;
+         solFES.VectorExtract(solVecLayout, x, vdof_data_t::layout, vsx);
+         vsx += vdof_data_t::size;
       }
    }
 
-   // serialized vector sx --> serialized vector 'sy'
+   /// Serialized vector 'sx' --> serialized vector 'sy'
    // complex_t = double
    void MultAssembledSerialized(const Vector &sx, Vector &sy) const
    {
       solFieldEval solFEval(solFES, solEval, solVecLayout, NULL, NULL);
 
       const int NE = mesh.GetNE();
-      const complex_t *loc_sx = sx.GetData();
-      complex_t *loc_sy = sy.GetData();
-      for (int el = 0; el < NE; el++)
+      const vreal_t *vsx = (const vreal_t*)(sx.GetData());
+      vreal_t *vsy = (vreal_t*)(sy.GetData());
+
+      for (int el = 0; el < NE; el += TE)
       {
-         typename S_spec<1>::DataType R;
-         solFEval.EvalSerialized(loc_sx, R);
+         typename S_spec::DataType R;
+         solFEval.EvalSerialized(vsx, R);
 
-         kernel_t::MultAssembled(0, assembled_data[el], R);
+         for (int k = 0; k < BE; k++)
+         {
+            kernel_t::MultAssembled(k, assembled_data[el/SS+k], R);
+         }
 
-         solFEval.template AssembleSerialized<false>(R, loc_sy);
+         solFEval.template AssembleSerialized<false>(R, vsy);
 
-         loc_sx += vdim*dofs;
-         loc_sy += vdim*dofs;
+         vsx += vdim*dofs*BE;
+         vsy += vdim*dofs*BE;
       }
    }
 #endif // MFEM_TEMPLATE_ENABLE_SERIALIZE
 
-   // Assemble the operator in a SparseMatrix.
+   /// Assemble the operator in a SparseMatrix.
    // complex_t = double
    void AssembleMatrix(SparseMatrix &M) const
    {
-      const int BE = 1; // batch-size of elements
-      typedef typename kernel_t::template
-      CoefficientEval<IR,coeff_t,BE>::Type coeff_eval_t;
-
       Trans_t T(mesh, meshEval);
       solFESpace solFES(this->solFES);
       solShapeEval solEval(this->solEval);
@@ -365,79 +405,100 @@ class TBilinearForm : public Operator
       coeff_eval_t wQ(int_rule, coeff);
 
       const int NE = mesh.GetNE();
-      for (int el = 0; el < NE; el++)
+      for (int el = 0; el < NE; el += TE)
       {
-         f_assembled_t asm_qpt_data;
+         f_assembled_t asm_qpt_data[BE];
          {
-            typename T_result<BE>::Type F;
+            typename T_result::Type F;
             T.Eval(el, F);
 
             typename coeff_eval_t::result_t res;
             wQ.Eval(F, res);
 
-            kernel_t::Assemble(0, F, wQ, res, asm_qpt_data);
+            for (int k = 0; k < BE; k++)
+            {
+               kernel_t::Assemble(k, F, wQ, res, asm_qpt_data[k]);
+            }
          }
 
          // For now, when vdim > 1, assume block-diagonal matrix with the same
          // diagonal block for all components.
-         TMatrix<dofs,dofs> M_loc;
-         S_spec<BE>::ElementMatrix::Compute(
-            asm_qpt_data.layout, asm_qpt_data, M_loc.layout, M_loc, solEval);
-
-         solFES.SetElement(el);
-         for (int bi = 0; bi < vdim; bi++)
+         for (int k = 0; k < BE; k++)
          {
-            solFES.AssembleBlock(bi, bi, solVecLayout, M_loc, M);
+            const int el_k = el+SS*k;
+            if (el_k >= NE) { break; }
+
+            TMatrix<dofs,dofs,vcomplex_t> M_loc;
+            S_spec::ElementMatrix::Compute(
+               asm_qpt_data[k].layout, asm_qpt_data[k], M_loc.layout, M_loc,
+               solEval);
+
+            solFES.SetElement(el_k);
+            for (int bi = 0; bi < vdim; bi++)
+            {
+               solFES.AssembleBlock(bi, bi, solVecLayout, M_loc, M);
+            }
          }
       }
    }
 
-   // Assemble element matrices and store them as a DenseTensor object.
+   /// Assemble element matrices and store them as a DenseTensor object.
    // complex_t = double
    void AssembleMatrix(DenseTensor &M) const
    {
-      const int BE = 1; // batch-size of elements
-      typedef typename kernel_t::template
-      CoefficientEval<IR,coeff_t,BE>::Type coeff_eval_t;
-
       Trans_t T(mesh, meshEval);
       solShapeEval solEval(this->solEval);
       coeff_eval_t wQ(int_rule, coeff);
 
       const int NE = mesh.GetNE();
-      for (int el = 0; el < NE; el++)
+      for (int el = 0; el < NE; el += TE)
       {
-         f_assembled_t asm_qpt_data;
+         f_assembled_t asm_qpt_data[BE];
          {
-            typename T_result<BE>::Type F;
+            typename T_result::Type F;
             T.Eval(el, F);
 
             typename coeff_eval_t::result_t res;
             wQ.Eval(F, res);
 
-            kernel_t::Assemble(0, F, wQ, res, asm_qpt_data);
+            for (int k = 0; k < BE; k++)
+            {
+               kernel_t::Assemble(k, F, wQ, res, asm_qpt_data[k]);
+            }
          }
 
          // For now, when vdim > 1, assume block-diagonal matrix with the same
          // diagonal block for all components.
          // M is assumed to be (dof x dof x NE).
-         TMatrix<dofs,dofs> M_loc;
-         S_spec<BE>::ElementMatrix::Compute(
-            asm_qpt_data.layout, asm_qpt_data, M_loc.layout, M_loc, solEval);
+         for (int k = 0; k < BE; k++)
+         {
+            const int el_k = el+SS*k;
+            if (el_k >= NE) { break; }
 
-         complex_t *M_data = M.GetData(el);
-         M_loc.template AssignTo<AssignOp::Set>(M_data);
+            TMatrix<dofs,dofs,vcomplex_t> M_loc;
+            S_spec::ElementMatrix::Compute(
+               asm_qpt_data[k].layout, asm_qpt_data[k], M_loc.layout, M_loc,
+               solEval);
+
+            for (int s = 0; s < SS && el_k+s < NE; s++)
+            {
+               complex_t *M_data = M.GetData(el_k+s);
+               for (int j = 0; j < dofs; j++)
+               {
+                  for (int i = 0; i < dofs; i++)
+                  {
+                     M_data[j+dofs*i] = M_loc(i,j)[s];
+                  }
+               }
+            }
+         }
       }
    }
 
-   // Assemble element matrices and add them to the bilinear form
+   /// Assemble element matrices and add them to the bilinear form
    // complex_t = double
    void AssembleBilinearForm(BilinearForm &a) const
    {
-      const int BE = 1; // batch-size of elements
-      typedef typename kernel_t::template
-      CoefficientEval<IR,coeff_t,BE>::Type coeff_eval_t;
-
       Trans_t T(mesh, meshEval);
       solShapeEval solEval(this->solEval);
       coeff_eval_t wQ(int_rule, coeff);
@@ -448,61 +509,93 @@ class TBilinearForm : public Operator
       DenseMatrix M_loc_perm(dofs*vdim,dofs*vdim); // initialized with zeros
 
       const int NE = mesh.GetNE();
-      for (int el = 0; el < NE; el++)
+      for (int el = 0; el < NE; el += TE)
       {
-         f_assembled_t asm_qpt_data;
+         f_assembled_t asm_qpt_data[BE];
          {
-            typename T_result<BE>::Type F;
+            typename T_result::Type F;
             T.Eval(el, F);
 
             typename coeff_eval_t::result_t res;
             wQ.Eval(F, res);
 
-            kernel_t::Assemble(0, F, wQ, res, asm_qpt_data);
+            for (int k = 0; k < BE; k++)
+            {
+               kernel_t::Assemble(k, F, wQ, res, asm_qpt_data[k]);
+            }
          }
 
          // For now, when vdim > 1, assume block-diagonal matrix with the same
          // diagonal block for all components.
-         TMatrix<dofs,dofs> M_loc;
-         S_spec<BE>::ElementMatrix::Compute(
-            asm_qpt_data.layout, asm_qpt_data, M_loc.layout, M_loc, solEval);
-
-         if (dof_map) // switch from tensor-product ordering
+         for (int k = 0; k < BE; k++)
          {
-            for (int i = 0; i < dofs; i++)
+            const int el_k = el+SS*k;
+            if (el_k >= NE) { break; }
+
+            TMatrix<dofs,dofs,vcomplex_t> M_loc;
+            S_spec::ElementMatrix::Compute(
+               asm_qpt_data[k].layout, asm_qpt_data[k], M_loc.layout, M_loc,
+               solEval);
+
+            if (dof_map) // switch from tensor-product ordering
             {
-               for (int j = 0; j < dofs; j++)
+               for (int s = 0; s < SS && el_k+s < NE; s++)
                {
-                  M_loc_perm(dof_map_[i],dof_map_[j]) = M_loc(i,j);
+                  for (int i = 0; i < dofs; i++)
+                  {
+                     for (int j = 0; j < dofs; j++)
+                     {
+                        M_loc_perm(dof_map_[i],dof_map_[j]) = M_loc(i,j)[s];
+                     }
+                  }
+                  for (int bi = 1; bi < vdim; bi++)
+                  {
+                     M_loc_perm.CopyMN(M_loc_perm, dofs, dofs, 0, 0,
+                                       bi*dofs, bi*dofs);
+                  }
+                  a.AssembleElementMatrix(el_k+s, M_loc_perm, vdofs);
                }
             }
-            for (int bi = 1; bi < vdim; bi++)
-            {
-               M_loc_perm.CopyMN(M_loc_perm, dofs, dofs, 0, 0,
-                                 bi*dofs, bi*dofs);
-            }
-            a.AssembleElementMatrix(el, M_loc_perm, vdofs);
-         }
-         else
-         {
-            DenseMatrix DM(M_loc.data, dofs, dofs);
-            if (vdim == 1)
+            else if (SS == 1)
             {
-               a.AssembleElementMatrix(el, DM, vdofs);
+               DenseMatrix DM(M_loc.data[0].vec, dofs, dofs);
+               if (vdim == 1)
+               {
+                  a.AssembleElementMatrix(el_k, DM, vdofs);
+               }
+               else
+               {
+                  for (int bi = 0; bi < vdim; bi++)
+                  {
+                     M_loc_perm.CopyMN(DM, dofs, dofs, 0, 0, bi*dofs, bi*dofs);
+                  }
+                  a.AssembleElementMatrix(el_k, M_loc_perm, vdofs);
+               }
             }
             else
             {
-               for (int bi = 0; bi < vdim; bi++)
+               for (int s = 0; s < SS && el_k+s < NE; s++)
                {
-                  M_loc_perm.CopyMN(DM, dofs, dofs, 0, 0, bi*dofs, bi*dofs);
+                  for (int i = 0; i < dofs; i++)
+                  {
+                     for (int j = 0; j < dofs; j++)
+                     {
+                        M_loc_perm(i,j) = M_loc(i,j)[s];
+                     }
+                  }
+                  for (int bi = 1; bi < vdim; bi++)
+                  {
+                     M_loc_perm.CopyMN(M_loc_perm, dofs, dofs, 0, 0,
+                                       bi*dofs, bi*dofs);
+                  }
+                  a.AssembleElementMatrix(el_k+s, M_loc_perm, vdofs);
                }
-               a.AssembleElementMatrix(el, M_loc_perm, vdofs);
             }
          }
       }
    }
 
-   // Multiplication using assembled element matrices stored as a DenseTensor.
+   /// Multiplication using assembled element matrices stored as a DenseTensor.
    // complex_t = double
    void AddMult(DenseTensor &M, const Vector &x, Vector &y) const
    {
@@ -513,7 +606,7 @@ class TBilinearForm : public Operator
       const int NE = mesh.GetNE();
       for (int el = 0; el < NE; el++)
       {
-         TTensor3<dofs,vdim,1,complex_t> x_dof, y_dof;
+         TTensor3<dofs,vdim,1,AutoSIMD<complex_t,1,1> > x_dof, y_dof;
 
          solFES.SetElement(el);
          solFES.VectorExtract(solVecLayout, x, x_dof.layout, x_dof);
diff --git a/fem/tbilininteg.hpp b/fem/tbilininteg.hpp
index 4901ac25cf4..177bb10e72c 100644
--- a/fem/tbilininteg.hpp
+++ b/fem/tbilininteg.hpp
@@ -21,8 +21,7 @@ namespace mfem
 
 // Templated local bilinear form integrator kernels, cf. bilininteg.?pp
 
-// The Integrator class combines a kernel and a coefficient
-
+/// The Integrator class combines a kernel and a coefficient
 template <typename coeff_t, template<int,int,typename> class kernel_t>
 class TIntegrator
 {
@@ -38,46 +37,48 @@ class TIntegrator
 };
 
 
-// Mass kernel
-
+/// Mass kernel
 template <int SDim, int Dim, typename complex_t>
 struct TMassKernel
 {
    typedef complex_t complex_type;
 
-   // needed for the TElementTransformation::Result class
+   /// Needed for the TElementTransformation::Result class
    static const bool uses_Jacobians = true;
 
-   // needed for the FieldEvaluator::Data class
+   /// @name Needed for the FieldEvaluator::Data class
+   ///@{
    static const bool in_values     = true;
    static const bool in_gradients  = false;
    static const bool out_values    = true;
    static const bool out_gradients = false;
+   ///@}
 
-   // Partially assembled data type for one element with the given number of
-   // quadrature points. This type is used in partial assembly, and partially
-   // assembled action.
+   /** @brief Partially assembled data type for one element with the given number of
+       quadrature points. This type is used in partial assembly, and partially
+       assembled action. */
    template <int qpts>
    struct p_asm_data { typedef TVector<qpts,complex_t> type; };
 
-   // Partially assembled data type for one element with the given number of
-   // quadrature points. This type is used in full element matrix assembly.
+   /** @brief Partially assembled data type for one element with the given
+       number of quadrature points. This type is used in full element matrix
+       assembly. */
    template <int qpts>
    struct f_asm_data { typedef TVector<qpts,complex_t> type; };
 
-   template <typename IR, typename coeff_t, int NE>
+   template <typename IR, typename coeff_t, typename impl_traits_t>
    struct CoefficientEval
    {
-      typedef typename IntRuleCoefficient<IR,coeff_t,NE>::Type Type;
+      typedef typename IntRuleCoefficient<IR,coeff_t,impl_traits_t>::Type Type;
    };
 
-   // Method used for un-assembled (matrix free) action.
-   // Jt       [M x Dim x SDim x NE] - Jacobian transposed, data member in F
-   // Q                              - CoefficientEval<>::Type
-   // q                              - CoefficientEval<>::Type::result_t
-   // val_qpts [M x NC x NE]         - in/out data member in R
-   //
-   // val_qpts *= w det(J)
+   /** @brief Method used for un-assembled (matrix free) action.
+       @param k the index within the current element batch (0 <= k < batch size)
+       @param F  Jt [M x Dim x SDim x NE] - Jacobian transposed, data member in F
+       @param Q  CoefficientEval<>::Type
+       @param q  CoefficientEval<>::Type::result_t
+       @param R  val_qpts [M x NC x NE] - in/out data member in R
+       val_qpts *= w det(J) */
    template <typename T_result_t, typename Q_t, typename q_t,
              typename S_data_t>
    static inline MFEM_ALWAYS_INLINE
@@ -101,13 +102,16 @@ struct TMassKernel
       }
    }
 
-   // Method defining partial assembly.
-   // Jt   [M x Dim x SDim x NE] - Jacobian transposed, data member in F
-   // Q                          - CoefficientEval<>::Type
-   // q                          - CoefficientEval<>::Type::result_t
-   // A    [M]                   - partially assembled scalars
-   //
-   // A = w det(J)
+   /** @brief Method defining partial assembly.
+       Result in A is the quadrature-point dependent part of element matrix
+       assembly (as opposed to the part that is the same for all elements),
+       A = w det(J)
+       @param k the index within the current element batch (0 <= k < batch size)
+       @param F Jt [M x Dim x SDim x NE] - Jacobian transposed, data member in F
+       @param Q CoefficientEval<>::Type
+       @param q CoefficientEval<>::Type::result_t
+       @param A [M] - partially assembled scalars
+   */
    template <typename T_result_t, typename Q_t, typename q_t, int qpts>
    static inline MFEM_ALWAYS_INLINE
    void Assemble(const int k, const T_result_t &F,
@@ -124,11 +128,12 @@ struct TMassKernel
       }
    }
 
-   // Method for partially assembled action.
-   // A        [M]           - partially assembled scalars
-   // val_qpts [M x NC x NE] - in/out data member in R
-   //
-   // val_qpts *= A
+   /** @brief Method for partially assembled action.
+       @param k the index within the current element batch (0 <= k < batch size)
+       @param A  [M] - partially assembled scalars
+       @param R  val_qpts [M x NC x NE] - in/out data member in R
+       val_qpts *= A
+   */
    template <int qpts, typename S_data_t>
    static inline MFEM_ALWAYS_INLINE
    void MultAssembled(const int k, const TVector<qpts,complex_t> &A, S_data_t &R)
@@ -148,51 +153,54 @@ struct TMassKernel
 };
 
 
-// Diffusion kernel
-
-// complex_t - type for the assembled data
+/** @brief Diffusion kernel
+    @tparam complex_t - type for the assembled data
+*/
 template <int SDim, int Dim, typename complex_t>
 struct TDiffusionKernel;
 
-// Diffusion kernel in 1D
+/// Diffusion kernel in 1D
 template <typename complex_t>
 struct TDiffusionKernel<1,1,complex_t>
 {
    typedef complex_t complex_type;
 
-   // needed for the TElementTransformation::Result class
+   /// Needed for the TElementTransformation::Result class
    static const bool uses_Jacobians = true;
 
-   // needed for the FieldEvaluator::Data class
+   /// Needed for the FieldEvaluator::Data class
+   ///@{
    static const bool in_values     = false;
    static const bool in_gradients  = true;
    static const bool out_values    = false;
    static const bool out_gradients = true;
+   ///@}
 
-   // Partially assembled data type for one element with the given number of
-   // quadrature points. This type is used in partial assembly, and partially
-   // assembled action.
+   /** @brief Partially assembled data type for one element with the given number of
+       quadrature points. This type is used in partial assembly, and partially
+       assembled action. */
    template <int qpts>
    struct p_asm_data { typedef TMatrix<qpts,1,complex_t> type; };
 
-   // Partially assembled data type for one element with the given number of
-   // quadrature points. This type is used in full element matrix assembly.
+
+   /** @brief Partially assembled data type for one element with the given number of
+       quadrature points. This type is used in full element matrix assembly. */
    template <int qpts>
    struct f_asm_data { typedef TTensor3<qpts,1,1,complex_t> type; };
 
-   template <typename IR, typename coeff_t, int NE>
+   template <typename IR, typename coeff_t, typename impl_traits_t>
    struct CoefficientEval
    {
-      typedef typename IntRuleCoefficient<IR,coeff_t,NE>::Type Type;
+      typedef typename IntRuleCoefficient<IR,coeff_t,impl_traits_t>::Type Type;
    };
 
-   // Method used for un-assembled (matrix free) action.
-   // Jt        [M x Dim x SDim x NE] - Jacobian transposed, data member in F
-   // Q                               - CoefficientEval<>::Type
-   // q                               - CoefficientEval<>::Type::result_t
-   // grad_qpts [M x SDim x NC x NE]  - in/out data member in R
-   //
-   // grad_qpts = (w/det(J)) adj(J) adj(J)^t grad_qpts
+   /** @brief Method used for un-assembled (matrix free) action.
+       @param k the element number
+       @param F Jt [M x Dim x SDim x NE] - Jacobian transposed, data member in F
+       @param Q CoefficientEval<>::Type
+       @param q CoefficientEval<>::Type::result_t
+       @param R grad_qpts [M x SDim x NC x NE]  - in/out data member in R
+       grad_qpts = (w/det(J)) adj(J) adj(J)^t grad_qpts */
    template <typename T_result_t, typename Q_t, typename q_t,
              typename S_data_t>
    static inline MFEM_ALWAYS_INLINE
@@ -214,17 +222,20 @@ struct TDiffusionKernel<1,1,complex_t>
       }
    }
 
-   // Method defining partial assembly. The pointwise Dim x Dim matrices are
-   // stored as symmetric (when asm_type == p_asm_data, i.e. A.layout.rank == 2)
-   // or non-symmetric (when asm_type == f_asm_data, i.e. A.layout.rank == 3)
-   // matrices.
-   // Jt   [M x Dim x SDim x NE] - Jacobian transposed, data member in F
-   // Q                          - CoefficientEval<>::Type
-   // q                          - CoefficientEval<>::Type::result_t
-   // A    [M x Dim*(Dim+1)/2]   - partially assembled Dim x Dim symm. matrices
-   // A    [M x Dim x Dim]       - partially assembled Dim x Dim matrices
-   //
-   // A = (w/det(J)) adj(J) adj(J)^t
+
+   /** @brief Method defining partial assembly.
+       The pointwise Dim x Dim matrices are stored as symmetric (when
+       asm_type == p_asm_data, i.e. A.layout.rank == 2) or
+       non-symmetric (when asm_type == f_asm_data, i.e. A.layout.rank
+       == 3) matrices.
+       @param k the element number
+       @param F Jt [M x Dim x SDim x NE] - Jacobian transposed, data member in F
+       @param Q CoefficientEval<>::Type
+       @param q CoefficientEval<>::Type::result_t
+       @param A [M x Dim*(Dim+1)/2] - partially assembled Dim x Dim symm. matrices
+              A [M x Dim x Dim]     - partially assembled Dim x Dim matrices
+       A = (w/det(J)) adj(J) adj(J)^t
+   */
    template <typename T_result_t, typename Q_t, typename q_t, typename asm_type>
    static inline MFEM_ALWAYS_INLINE
    void Assemble(const int k, const T_result_t &F,
@@ -240,13 +251,13 @@ struct TDiffusionKernel<1,1,complex_t>
          A[i] = Q.get(q,i,k) / F.Jt(i,0,0,k);
       }
    }
-
-   // Method for partially assembled action.
-   // A         [M x Dim*(Dim+1)/2]  - partially assembled Dim x Dim symmetric
-   //                                  matrices
-   // grad_qpts [M x SDim x NC x NE] - in/out data member in R
-   //
-   // grad_qpts = A grad_qpts
+   /** @brief Method for partially assembled action.
+       @param k the element number
+       @param A  [M x Dim*(Dim+1)/2] partially assembled Dim x Dim symmetric
+                                     matrices
+       @param R  grad_qpts [M x SDim x NC x NE] - in/out data member in R
+       grad_qpts = A grad_qpts
+   */
    template <int qpts, typename S_data_t>
    static inline MFEM_ALWAYS_INLINE
    void MultAssembled(const int k, const TMatrix<qpts,1,complex_t> &A,
@@ -266,46 +277,49 @@ struct TDiffusionKernel<1,1,complex_t>
    }
 };
 
-// Diffusion kernel in 2D
+/// Diffusion kernel in 2D
 template <typename complex_t>
 struct TDiffusionKernel<2,2,complex_t>
 {
    typedef complex_t complex_type;
 
-   // needed for the TElementTransformation::Result class
+   /// Needed for the TElementTransformation::Result class
    static const bool uses_Jacobians = true;
 
-   // needed for the FieldEvaluator::Data class
+   /// Needed for the FieldEvaluator::Data class
+   ///@{
    static const bool in_values     = false;
    static const bool in_gradients  = true;
    static const bool out_values    = false;
    static const bool out_gradients = true;
+   ///@}
 
-   // Partially assembled data type for one element with the given number of
-   // quadrature points. This type is used in partial assembly, and partially
-   // assembled action. Stores one symmetric 2 x 2 matrix per point.
+   /** @brief Partially assembled data type for one element with the given number of
+       quadrature points. This type is used in partial assembly, and partially
+       assembled action. Stores one symmetric 2 x 2 matrix per point. */
    template <int qpts>
    struct p_asm_data { typedef TMatrix<qpts,3,complex_t> type; };
 
-   // Partially assembled data type for one element with the given number of
-   // quadrature points. This type is used in full element matrix assembly.
-   // Stores one general (non-symmetric) 2 x 2 matrix per point.
+   /** @brief Partially assembled data type for one element with the given number of
+       quadrature points. This type is used in full element matrix assembly.
+       Stores one general (non-symmetric) 2 x 2 matrix per point. */
    template <int qpts>
    struct f_asm_data { typedef TTensor3<qpts,2,2,complex_t> type; };
 
-   template <typename IR, typename coeff_t, int NE>
+   template <typename IR, typename coeff_t, typename impl_traits_t>
    struct CoefficientEval
    {
-      typedef typename IntRuleCoefficient<IR,coeff_t,NE>::Type Type;
+      typedef typename IntRuleCoefficient<IR,coeff_t,impl_traits_t>::Type Type;
    };
 
-   // Method used for un-assembled (matrix free) action.
-   // Jt        [M x Dim x SDim x NE] - Jacobian transposed, data member in F
-   // Q                               - CoefficientEval<>::Type
-   // q                               - CoefficientEval<>::Type::result_t
-   // grad_qpts [M x SDim x NC x NE]  - in/out data member in R
-   //
-   // grad_qpts = (w/det(J)) adj(J) adj(J)^t grad_qpts
+   /** @brief Method used for un-assembled (matrix free) action.
+       @param k the element number
+       @param F Jt [M x Dim x SDim x NE] - Jacobian transposed, data member in F
+       @param Q CoefficientEval<>::Type
+       @param q CoefficientEval<>::Type::result_t
+       @param R grad_qpts [M x SDim x NC x NE]  - in/out data member in R
+       grad_qpts = (w/det(J)) adj(J) adj(J)^t grad_qpts
+   */
    template <typename T_result_t, typename Q_t, typename q_t,
              typename S_data_t>
    static inline MFEM_ALWAYS_INLINE
@@ -338,17 +352,18 @@ struct TDiffusionKernel<2,2,complex_t>
       }
    }
 
-   // Method defining partial assembly. The pointwise Dim x Dim matrices are
-   // stored as symmetric (when asm_type == p_asm_data, i.e. A.layout.rank == 2)
-   // or non-symmetric (when asm_type == f_asm_data, i.e. A.layout.rank == 3)
-   // matrices.
-   // Jt   [M x Dim x SDim x NE] - Jacobian transposed, data member in F
-   // Q                          - CoefficientEval<>::Type
-   // q                          - CoefficientEval<>::Type::result_t
-   // A    [M x Dim*(Dim+1)/2]   - partially assembled Dim x Dim symm. matrices
-   // A    [M x Dim x Dim]       - partially assembled Dim x Dim matrices
-   //
-   // A = (w/det(J)) adj(J) adj(J)^t
+   /** @brief Method defining partial assembly.
+       The pointwise Dim x Dim matrices are stored as symmetric (when
+       asm_type == p_asm_data, i.e. A.layout.rank == 2) or non-symmetric
+       (when asm_type == f_asm_data, i.e. A.layout.rank == 3) matrices.
+       A = (w/det(J)) adj(J) adj(J)^t
+       @param k the element number
+       @param F Jt [M x Dim x SDim x NE] - Jacobian transposed, data member in F
+       @param Q CoefficientEval<>::Type
+       @param q CoefficientEval<>::Type::result_t
+       @param A [M x Dim*(Dim+1)/2] partially assembled Dim x Dim symm. matrices
+                or [M x Dim x Dim]    partially assembled Dim x Dim matrices
+   */
    template <typename T_result_t, typename Q_t, typename q_t, typename asm_type>
    static inline MFEM_ALWAYS_INLINE
    void Assemble(const int k, const T_result_t &F,
@@ -376,12 +391,13 @@ struct TDiffusionKernel<2,2,complex_t>
       }
    }
 
-   // Method for partially assembled action.
-   // A         [M x Dim*(Dim+1)/2]  - partially assembled Dim x Dim symmetric
-   //                                  matrices
-   // grad_qpts [M x SDim x NC x NE] - in/out data member in R
-   //
-   // grad_qpts = A grad_qpts
+   /** @brief  Method for partially assembled action.
+       @param k the element number
+       @param  A  [M x Dim*(Dim+1)/2]  - partially assembled Dim x Dim symmetric
+                                         matrices
+       @param R grad_qpts [M x SDim x NC x NE] - in/out data member in R
+       grad_qpts = A grad_qpts
+   */
    template <int qpts, typename S_data_t>
    static inline MFEM_ALWAYS_INLINE
    void MultAssembled(const int k, const TMatrix<qpts,3,complex_t> &A,
@@ -407,46 +423,48 @@ struct TDiffusionKernel<2,2,complex_t>
    }
 };
 
-// Diffusion kernel in 3D
+/// Diffusion kernel in 3D
 template <typename complex_t>
 struct TDiffusionKernel<3,3,complex_t>
 {
    typedef complex_t complex_type;
 
-   // needed for the TElementTransformation::Result class
+   /// Needed for the TElementTransformation::Result class
    static const bool uses_Jacobians = true;
 
-   // needed for the FieldEvaluator::Data class
+   /// Needed for the FieldEvaluator::Data class
+   ///@{
    static const bool in_values     = false;
    static const bool in_gradients  = true;
    static const bool out_values    = false;
    static const bool out_gradients = true;
+   ///@}
 
-   // Partially assembled data type for one element with the given number of
-   // quadrature points. This type is used in partial assembly, and partially
-   // assembled action. Stores one symmetric 3 x 3 matrix per point.
+   /** @brief Partially assembled data type for one element with the given number of
+       quadrature points. This type is used in partial assembly, and partially
+       assembled action. Stores one symmetric 3 x 3 matrix per point. */
    template <int qpts>
    struct p_asm_data { typedef TMatrix<qpts,6,complex_t> type; };
 
-   // Partially assembled data type for one element with the given number of
-   // quadrature points. This type is used in full element matrix assembly.
-   // Stores one general (non-symmetric) 3 x 3 matrix per point.
+   /** @brief Partially assembled data type for one element with the given number of
+       quadrature points. This type is used in full element matrix assembly.
+       Stores one general (non-symmetric) 3 x 3 matrix per point. */
    template <int qpts>
    struct f_asm_data { typedef TTensor3<qpts,3,3,complex_t> type; };
 
-   template <typename IR, typename coeff_t, int NE>
+   template <typename IR, typename coeff_t, typename impl_traits_t>
    struct CoefficientEval
    {
-      typedef typename IntRuleCoefficient<IR,coeff_t,NE>::Type Type;
+      typedef typename IntRuleCoefficient<IR,coeff_t,impl_traits_t>::Type Type;
    };
 
-   // Method used for un-assembled (matrix free) action.
-   // Jt        [M x Dim x SDim x NE] - Jacobian transposed, data member in F
-   // Q                               - CoefficientEval<>::Type
-   // q                               - CoefficientEval<>::Type::result_t
-   // grad_qpts [M x SDim x NC x NE]  - in/out data member in R
-   //
-   // grad_qpts = (w/det(J)) adj(J) adj(J)^t grad_qpts
+   /** @brief Method used for un-assembled (matrix free) action.
+       grad_qpts = (w/det(J)) adj(J) adj(J)^t grad_qpts
+       Jt        [M x Dim x SDim x NE] - Jacobian transposed, data member in F
+       Q                               - CoefficientEval<>::Type
+       q                               - CoefficientEval<>::Type::result_t
+       grad_qpts [M x SDim x NC x NE]  - in/out data member in R
+   */
    template <typename T_result_t, typename Q_t, typename q_t,
              typename S_data_t>
    static inline MFEM_ALWAYS_INLINE
@@ -477,17 +495,18 @@ struct TDiffusionKernel<3,3,complex_t>
       }
    }
 
-   // Method defining partial assembly. The pointwise Dim x Dim matrices are
-   // stored as symmetric (when asm_type == p_asm_data, i.e. A.layout.rank == 2)
-   // or non-symmetric (when asm_type == f_asm_data, i.e. A.layout.rank == 3)
-   // matrices.
-   // Jt   [M x Dim x SDim x NE] - Jacobian transposed, data member in F
-   // Q                          - CoefficientEval<>::Type
-   // q                          - CoefficientEval<>::Type::result_t
-   // A    [M x Dim*(Dim+1)/2]   - partially assembled Dim x Dim symm. matrices
-   // A    [M x Dim x Dim]       - partially assembled Dim x Dim matrices
-   //
-   // A = (w/det(J)) adj(J) adj(J)^t
+   /** @brief Method defining partial assembly.
+      The pointwise Dim x Dim matrices are stored as symmetric (when
+      asm_type == p_asm_data, i.e. A.layout.rank == 2) or
+      non-symmetric (when asm_type == f_asm_data, i.e. A.layout.rank
+      == 3) matrices.
+      A = (w/det(J)) adj(J) adj(J)^t
+      Jt   [M x Dim x SDim x NE] - Jacobian transposed, data member in F
+      Q                          - CoefficientEval<>::Type
+      q                          - CoefficientEval<>::Type::result_t
+      A    [M x Dim*(Dim+1)/2]   - partially assembled Dim x Dim symm. matrices
+      A    [M x Dim x Dim]       - partially assembled Dim x Dim matrices
+   */
    template <typename T_result_t, typename Q_t, typename q_t, typename asm_type>
    static inline MFEM_ALWAYS_INLINE
    void Assemble(const int k, const T_result_t &F,
@@ -518,12 +537,12 @@ struct TDiffusionKernel<3,3,complex_t>
       }
    }
 
-   // Method for partially assembled action.
-   // A         [M x Dim*(Dim+1)/2]  - partially assembled Dim x Dim symmetric
-   //                                  matrices
-   // grad_qpts [M x SDim x NC x NE] - in/out data member in R
-   //
-   // grad_qpts = A grad_qpts
+   /** @brief Method for partially assembled action.
+       A         [M x Dim*(Dim+1)/2]  - partially assembled Dim x Dim symmetric
+                                        matrices
+       grad_qpts [M x SDim x NC x NE] - in/out data member in R
+       grad_qpts = A grad_qpts
+   */
    template <int qpts, typename S_data_t>
    static inline MFEM_ALWAYS_INLINE
    void MultAssembled(const int k, const TMatrix<qpts,6,complex_t> &A,
diff --git a/fem/tcoefficient.hpp b/fem/tcoefficient.hpp
index 19c14cb054a..06e6ff00242 100644
--- a/fem/tcoefficient.hpp
+++ b/fem/tcoefficient.hpp
@@ -21,7 +21,7 @@
 namespace mfem
 {
 
-// Templated coefficient classes, cf. coefficient.?pp
+/// Base class for templated coefficient classes, cf. coefficient.?pp
 
 class TCoefficient
 {
@@ -56,12 +56,13 @@ class TConstantCoefficient : public TCoefficient
 };
 
 
-// Function coefficient. The template class 'Func' has to implement at least one
-// of the following methods, depending on the dimension that will be used:
-// complex_t Eval1D(real_t);
-// complex_t Eval2D(real_t,real_t);
-// complex_t Eval3D(real_t,real_t,real_t);
-// Use MFEM_FLOPS_ADD() to count flops inside Eval*D.
+/** @brief Function coefficient.
+    @tparam Func has to implement at least one of the following methods,
+    depending on the dimension that will be used:
+    complex_t Eval1D(real_t);
+    complex_t Eval2D(real_t,real_t);
+    complex_t Eval3D(real_t,real_t,real_t);
+    Use MFEM_FLOPS_ADD() to count flops inside Eval*D. */
 template <typename Func, typename complex_t = double>
 class TFunctionCoefficient : public TCoefficient
 {
@@ -81,11 +82,15 @@ class TFunctionCoefficient : public TCoefficient
       {
          const int qpts = T_result_t::x_type::layout_type::dim_1;
          const int ne   = T_result_t::x_type::layout_type::dim_3;
+         const int vs   = sizeof(T.x[0])/sizeof(T.x[0][0]);
          for (int k = 0; k < ne; k++)
          {
             for (int i = 0; i < qpts; i++)
             {
-               c[l.ind(i,k)] = F.Eval1D(T.x(i,0,k));
+               for (int s = 0; s < vs; s++)
+               {
+                  c[l.ind(i,k)][s] = F.Eval1D(T.x(i,0,k)[s]);
+               }
             }
          }
       }
@@ -98,11 +103,15 @@ class TFunctionCoefficient : public TCoefficient
       {
          const int qpts = T_result_t::x_type::layout_type::dim_1;
          const int ne   = T_result_t::x_type::layout_type::dim_3;
+         const int vs   = sizeof(T.x[0])/sizeof(T.x[0][0]);
          for (int k = 0; k < ne; k++)
          {
             for (int i = 0; i < qpts; i++)
             {
-               c[l.ind(i,k)] = F.Eval2D(T.x(i,0,k), T.x(i,1,k));
+               for (int s = 0; s < vs; s++)
+               {
+                  c[l.ind(i,k)][s] = F.Eval2D(T.x(i,0,k)[s], T.x(i,1,k)[s]);
+               }
             }
          }
       }
@@ -115,20 +124,25 @@ class TFunctionCoefficient : public TCoefficient
       {
          const int qpts = T_result_t::x_type::layout_type::dim_1;
          const int ne   = T_result_t::x_type::layout_type::dim_3;
+         const int vs   = sizeof(T.x[0])/sizeof(T.x[0][0]);
          for (int k = 0; k < ne; k++)
          {
             for (int i = 0; i < qpts; i++)
             {
-               c[l.ind(i,k)] = F.Eval3D(T.x(i,0,k), T.x(i,1,k), T.x(i,2,k));
+               for (int s = 0; s < vs; s++)
+               {
+                  c[l.ind(i,k)][s] =
+                     F.Eval3D(T.x(i,0,k)[s], T.x(i,1,k)[s], T.x(i,2,k)[s]);
+               }
             }
          }
       }
    };
 
 public:
-   // Constructor for the case when Func has no data members.
+   /// Constructor for the case when Func has no data members.
    TFunctionCoefficient() : F() { }
-   // Constructor for the case when Func has data members.
+   /// Constructor for the case when Func has data members.
    TFunctionCoefficient(Func &F_) : F(F_) { }
    // Default copy constructor, Func has to have copy constructor.
 
@@ -170,14 +184,21 @@ class TPiecewiseConstCoefficient : public TCoefficient
    void Eval(const T_result_t &T, const c_layout_t &l, c_data_t &c)
    {
       const int ne = T_result_t::ne;
+      const int vs = sizeof(T.attrib[0])/sizeof(T.attrib[0][0]);
+      MFEM_STATIC_ASSERT(vs == sizeof(c[0])/sizeof(c[0][0]), "");
       for (int i = 0; i < ne; i++)
       {
-         TAssign<AssignOp::Set>(l.ind2(i), c, constants(T.attrib[i]-1));
+         typename c_data_t::data_type ci;
+         for (int s = 0; s < vs; s++)
+         {
+            ci[s] = constants(T.attrib[i][s]-1);
+         }
+         TAssign<AssignOp::Set>(l.ind2(i), c, ci);
       }
    }
 };
 
-
+/// GridFunction coefficient class.
 template <typename FieldEval>
 class TGridFunctionCoefficient : public TCoefficient
 {
@@ -243,12 +264,13 @@ class TGridFunctionCoefficient : public TCoefficient
 
 /// Auxiliary class that is used to simplify the evaluation of a coefficient and
 /// scaling it by the weights of a quadrature rule.
-template <typename IR, typename coeff_t, int NE>
+template <typename IR, typename coeff_t, typename impl_traits_t>
 struct IntRuleCoefficient
 {
    static const int qpts = IR::qpts;
-   static const int ne   = NE;
+   static const int ne   = impl_traits_t::batch_size;
    typedef typename coeff_t::complex_type complex_type;
+   typedef typename impl_traits_t::vcomplex_t vcomplex_t;
 
    template <bool is_const, bool dummy> struct Aux;
 
@@ -277,7 +299,7 @@ struct IntRuleCoefficient
    // non-constant coefficient
    template <bool dummy> struct Aux<false,dummy>
    {
-      typedef TMatrix<qpts,ne,complex_type> result_t;
+      typedef TMatrix<qpts,ne,vcomplex_t> result_t;
 #ifdef MFEM_TEMPLATE_INTRULE_COEFF_PRECOMP
       TMatrix<qpts,1,typename IR::real_type> w;
 #else
@@ -312,7 +334,7 @@ struct IntRuleCoefficient
       }
 
       inline MFEM_ALWAYS_INLINE
-      const complex_type &get(const result_t &res, int i, int k) const
+      const vcomplex_t &get(const result_t &res, int i, int k) const
       {
          return res(i,k);
       }
diff --git a/fem/teltrans.hpp b/fem/teltrans.hpp
index 5cdfb9e86fc..c0f3cb41ec6 100644
--- a/fem/teltrans.hpp
+++ b/fem/teltrans.hpp
@@ -21,12 +21,14 @@ namespace mfem
 
 // Templated element transformation classes, cf. eltrans.?pp
 
-// Element transformation class, templated on a mesh type and an integration
-// rule. It is constructed from a mesh (e.g. class TMesh) and shape evaluator
-// (e.g. class ShapeEvaluator) objects. Allows computation of physical
-// coordinates and Jacobian matrices corresponding to the reference integration
-// points. The desired result is specified through the template subclass Result
-// and stored in an object of the same type.
+/** @brief Element transformation class, templated on a mesh type and an
+    integration rule.
+    It is constructed from a mesh (e.g. class TMesh) and shape evaluator
+    (e.g. class ShapeEvaluator) objects. Allows computation of physical
+    coordinates and Jacobian matrices corresponding to the reference integration
+    points. The desired result is specified through the template subclass Result
+    and stored in an object of the same type.
+*/
 template <typename Mesh_t, typename IR, typename real_t = double>
 class TElementTransformation
 {
@@ -39,9 +41,9 @@ class TElementTransformation
 
    typedef TElementTransformation<Mesh_t,IR,real_t> T_type;
 
-   // Enumeration for the result type of the TElementTransformation::Eval()
-   // method. The types can obtained by summing constants from this enumeration
-   // and used as a template parameter in struct Result.
+   /// Enumeration for the result type of the TElementTransformation::Eval()
+   /// method. The types can be obtained by summing constants from this
+   /// enumeration and used as a template parameter in struct Result.
    enum EvalOperations
    {
       EvalNone        = 0,
@@ -51,6 +53,8 @@ class TElementTransformation
       LoadElementIdxs = 8
    };
 
+   /// Determines at compile-time the operations needed for a given coefficient
+   /// and kernel
    template <typename coeff_t, typename kernel_t> struct Get
    {
       static const int EvalOps =
@@ -61,12 +65,14 @@ class TElementTransformation
          (EvalJacobians   * kernel_t::uses_Jacobians);
    };
 
-   // Templated struct Result, used to specify the type result that is computed
-   // by the TElementTransformation::Eval() method and stored in this structure.
-   // The template parameter EvalOps is a sum (bitwise or) of constants from
-   // the enum EvalOperations. The parameter NE is the number of elements to be
-   // processed in the Eval() method.
-   template<int EvalOps, int NE> struct Result;
+   /** @brief Templated struct Result, used to specify the type of result that
+       is computed by the TElementTransformation::Eval() method and stored in
+       this structure.
+       @tparam EvalOps is a sum (bitwise or) of constants from the enum EvalOperations
+       @tparam impl_traits_t specifies additional parameters and types used by the
+       Eval() method, including the number of elements to process (batch_size)
+   */
+   template<int EvalOps, typename impl_traits_t> struct Result;
 
    static const int dim  = Mesh_t::dim;
    static const int sdim = Mesh_t::space_dim;
@@ -85,13 +91,17 @@ class TElementTransformation
 
    const Element* const *elements;
 
-   template <int NE>
+   template <typename vint_t, int NE>
    inline MFEM_ALWAYS_INLINE
-   void SetAttributes(int el, int (&attrib)[NE]) const
+   void SetAttributes(int el, vint_t (&attrib)[NE]) const
    {
+      const int vsize = sizeof(vint_t)/sizeof(attrib[0][0]);
       for (int i = 0; i < NE; i++)
       {
-         attrib[i] = elements[el+i]->GetAttribute();
+         for (int j = 0; j < vsize; j++)
+         {
+            attrib[i][j] = elements[el+j+i*vsize]->GetAttribute();
+         }
       }
    }
 
@@ -105,26 +115,31 @@ class TElementTransformation
         elements(mesh.m_mesh.GetElementsArray())
    { }
 
-   // Evaluate coordinates and/or Jacobian matrices at quadrature points.
-   template<int EvalOps, int NE>
+   /// Evaluate coordinates and/or Jacobian matrices at quadrature points.
+   template<int EvalOps, typename impl_traits_t>
    inline MFEM_ALWAYS_INLINE
-   void Eval(int el, Result<EvalOps,NE> &F)
+   void Eval(int el, Result<EvalOps,impl_traits_t> &F)
    {
       F.Eval(el, *this);
    }
 
 #ifdef MFEM_TEMPLATE_ENABLE_SERIALIZE
-   template<int EvalOps, int NE>
+   template<int EvalOps, typename impl_traits_t>
    inline MFEM_ALWAYS_INLINE
-   void EvalSerialized(int el, const real_t *nodeData, Result<EvalOps,NE> &F)
+   void EvalSerialized(int el, const typename impl_traits_t::vreal_t *nodeData,
+                       Result<EvalOps,impl_traits_t> &F)
    {
       F.EvalSerialized(el, *this, nodeData);
    }
 #endif
 
-   template <int NE> struct Result<0,NE> // 0 = EvalNone
+   // Specializations of the Result<> class
+
+   // Case EvalOps = 0 = EvalNone
+   template <typename it_t> struct Result<0,it_t>
    {
-      static const int ne = NE;
+      static const int ne = it_t::batch_size;
+      typedef typename it_t::vreal_t vreal_t;
       // x_type x;
       // Jt_type Jt;
       // int attrib[NE];
@@ -137,20 +152,23 @@ class TElementTransformation
       }
 #ifdef MFEM_TEMPLATE_ENABLE_SERIALIZE
       inline MFEM_ALWAYS_INLINE
-      void EvalSerialized(int el, T_type &T, const real_t *nodeData) { }
+      void EvalSerialized(int el, T_type &T, const vreal_t *nodeData) { }
 #endif
    };
-   template <int NE> struct Result<1,NE> // 1 = EvalCoordinates
+
+   // Case EvalOps = 1 = EvalCoordinates
+   template <typename it_t> struct Result<1,it_t>
    {
-      static const int ne = NE;
+      static const int ne = it_t::batch_size;
+      typedef typename it_t::vreal_t vreal_t;
 #ifdef MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES
-      typedef TTensor3<qpts,sdim,NE,real_t,true> x_type;
+      typedef TTensor3<qpts,sdim,ne,vreal_t,true> x_type;
 #else
-      typedef TTensor3<qpts,sdim,NE,real_t/*,true*/> x_type;
+      typedef TTensor3<qpts,sdim,ne,vreal_t/*,true*/> x_type;
 #endif
       x_type x;
 
-      typedef TTensor3<dofs,sdim,NE,real_t> nodes_dof_t;
+      typedef TTensor3<dofs,sdim,ne,vreal_t> nodes_dof_t;
 #ifdef MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES
       nodes_dof_t nodes_dof;
 #endif
@@ -159,8 +177,8 @@ class TElementTransformation
       void Eval(int el, T_type &T)
       {
 #ifdef MFEM_TEMPLATE_ELTRANS_HAS_NODE_DOFS
-         MFEM_STATIC_ASSERT(NE == 1, "only NE == 1 is supported");
-         TTensor3<dofs,sdim,1,real_t> &nodes_dof = T.nodes_dof;
+         MFEM_STATIC_ASSERT(ne == 1, "only ne == 1 is supported");
+         TTensor3<dofs,sdim,1,vreal_t> &nodes_dof = T.nodes_dof;
 #elif !defined(MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES)
          nodes_dof_t nodes_dof;
 #endif
@@ -173,25 +191,30 @@ class TElementTransformation
 
 #ifdef MFEM_TEMPLATE_ENABLE_SERIALIZE
       inline MFEM_ALWAYS_INLINE
-      void EvalSerialized(int el, T_type &T, const real_t *nodeData)
+      void EvalSerialized(int el, T_type &T, const vreal_t *nodeData)
       {
+         const int SS = sizeof(nodeData[0])/sizeof(nodeData[0][0]);
+         MFEM_ASSERT(el % (SS*ne) == 0, "invalid element index: " << el);
          T.evaluator.Calc(nodes_dof_t::layout.merge_23(),
-                          &nodeData[el*nodes_dof_t::size],
+                          &nodeData[el/SS*nodes_dof_t::size],
                           x.layout.merge_23(), x);
       }
 #endif
    };
-   template <int NE> struct Result<2,NE> // 2 = EvalJacobians
+
+   // Case EvalOps = 2 = EvalJacobians
+   template <typename it_t> struct Result<2,it_t>
    {
-      static const int ne = NE;
+      static const int ne = it_t::batch_size;
+      typedef typename it_t::vreal_t vreal_t;
 #ifdef MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES
-      typedef TTensor4<qpts,dim,sdim,NE,real_t,true> Jt_type;
+      typedef TTensor4<qpts,dim,sdim,ne,vreal_t,true> Jt_type;
 #else
-      typedef TTensor4<qpts,dim,sdim,NE,real_t/*,true*/> Jt_type;
+      typedef TTensor4<qpts,dim,sdim,ne,vreal_t/*,true*/> Jt_type;
 #endif
       Jt_type Jt;
 
-      typedef TTensor3<dofs,sdim,NE,real_t> nodes_dof_t;
+      typedef TTensor3<dofs,sdim,ne,vreal_t> nodes_dof_t;
 #ifdef MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES
       nodes_dof_t nodes_dof;
 #endif
@@ -200,8 +223,8 @@ class TElementTransformation
       void Eval(int el, T_type &T)
       {
 #ifdef MFEM_TEMPLATE_ELTRANS_HAS_NODE_DOFS
-         MFEM_STATIC_ASSERT(NE == 1, "only NE == 1 is supported");
-         TTensor3<dofs,sdim,1,real_t> &nodes_dof = T.nodes_dof;
+         MFEM_STATIC_ASSERT(ne == 1, "only ne == 1 is supported");
+         TTensor3<dofs,sdim,1,vreal_t> &nodes_dof = T.nodes_dof;
 #elif !defined(MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES)
          nodes_dof_t nodes_dof;
 #endif
@@ -214,27 +237,32 @@ class TElementTransformation
 
 #ifdef MFEM_TEMPLATE_ENABLE_SERIALIZE
       inline MFEM_ALWAYS_INLINE
-      void EvalSerialized(int el, T_type &T, const real_t *nodeData)
+      void EvalSerialized(int el, T_type &T, const vreal_t *nodeData)
       {
+         const int SS = sizeof(nodeData[0])/sizeof(nodeData[0][0]);
+         MFEM_ASSERT(el % (SS*ne) == 0, "invalid element index: " << el);
          T.evaluator.CalcGrad(nodes_dof_t::layout.merge_23(),
-                              &nodeData[el*nodes_dof_t::size],
+                              &nodeData[el/SS*nodes_dof_t::size],
                               Jt.layout.merge_34(), Jt);
       }
 #endif
    };
-   template <int NE> struct Result<3,NE> // 3 = EvalCoordinates|EvalJacobians
+
+   // Case EvalOps = 3 = EvalCoordinates|EvalJacobians
+   template <typename it_t> struct Result<3,it_t>
    {
-      static const int ne = NE;
-      typedef TTensor3<qpts,sdim,NE,real_t,true> x_type;
+      static const int ne = it_t::batch_size;
+      typedef typename it_t::vreal_t vreal_t;
+      typedef TTensor3<qpts,sdim,ne,vreal_t,true> x_type;
       x_type x;
 #ifdef MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES
-      typedef TTensor4<qpts,dim,sdim,NE,real_t,true> Jt_type;
+      typedef TTensor4<qpts,dim,sdim,ne,vreal_t,true> Jt_type;
 #else
-      typedef TTensor4<qpts,dim,sdim,NE,real_t/*,true*/> Jt_type;
+      typedef TTensor4<qpts,dim,sdim,ne,vreal_t/*,true*/> Jt_type;
 #endif
       Jt_type Jt;
 
-      typedef TTensor3<dofs,sdim,NE,real_t> nodes_dof_t;
+      typedef TTensor3<dofs,sdim,ne,vreal_t> nodes_dof_t;
 #ifdef MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES
       nodes_dof_t nodes_dof;
 #endif
@@ -243,8 +271,8 @@ class TElementTransformation
       void Eval(int el, T_type &T)
       {
 #ifdef MFEM_TEMPLATE_ELTRANS_HAS_NODE_DOFS
-         MFEM_STATIC_ASSERT(NE == 1, "only NE == 1 is supported");
-         TTensor3<dofs,sdim,1,real_t> &nodes_dof = T.nodes_dof;
+         MFEM_STATIC_ASSERT(ne == 1, "only ne == 1 is supported");
+         TTensor3<dofs,sdim,1,vreal_t> &nodes_dof = T.nodes_dof;
 #elif !defined(MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES)
          nodes_dof_t nodes_dof;
 #endif
@@ -259,39 +287,45 @@ class TElementTransformation
 
 #ifdef MFEM_TEMPLATE_ENABLE_SERIALIZE
       inline MFEM_ALWAYS_INLINE
-      void EvalSerialized(int el, T_type &T, const real_t *nodeData)
+      void EvalSerialized(int el, T_type &T, const vreal_t *nodeData)
       {
+         const int SS = sizeof(nodeData[0])/sizeof(nodeData[0][0]);
+         MFEM_ASSERT(el % (SS*ne) == 0, "invalid element index: " << el);
          T.evaluator.Calc(nodes_dof_t::layout.merge_23(),
-                          &nodeData[el*nodes_dof_t::size],
+                          &nodeData[el/SS*nodes_dof_t::size],
                           x.layout.merge_23(), x);
          T.evaluator.CalcGrad(nodes_dof_t::layout.merge_23(),
-                              &nodeData[el*nodes_dof_t::size],
+                              &nodeData[el/SS*nodes_dof_t::size],
                               Jt.layout.merge_34(), Jt);
       }
 #endif
    };
-   template <int NE> struct Result<6,NE> // 6 = EvalJacobians|LoadAttributes
+
+   // Case EvalOps = 6 = EvalJacobians|LoadAttributes
+   template <typename it_t> struct Result<6,it_t>
    {
-      static const int ne = NE;
+      static const int ne = it_t::batch_size;
+      typedef typename it_t::vreal_t vreal_t;
+      typedef typename it_t::vint_t  vint_t;
 #ifdef MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES
-      typedef TTensor4<qpts,dim,sdim,NE,real_t,true> Jt_type;
+      typedef TTensor4<qpts,dim,sdim,ne,vreal_t,true> Jt_type;
 #else
-      typedef TTensor4<qpts,dim,sdim,NE,real_t/*,true*/> Jt_type;
+      typedef TTensor4<qpts,dim,sdim,ne,vreal_t/*,true*/> Jt_type;
 #endif
       Jt_type Jt;
 
-      typedef TTensor3<dofs,sdim,NE,real_t> nodes_dof_t;
+      typedef TTensor3<dofs,sdim,ne,vreal_t> nodes_dof_t;
 #ifdef MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES
       nodes_dof_t nodes_dof;
 #endif
-      int attrib[NE];
+      vint_t attrib[ne];
 
       inline MFEM_ALWAYS_INLINE
       void Eval(int el, T_type &T)
       {
 #ifdef MFEM_TEMPLATE_ELTRANS_HAS_NODE_DOFS
-         MFEM_STATIC_ASSERT(NE == 1, "only NE == 1 is supported");
-         TTensor3<dofs,sdim,1,real_t> &nodes_dof = T.nodes_dof;
+         MFEM_STATIC_ASSERT(ne == 1, "only ne == 1 is supported");
+         TTensor3<dofs,sdim,1,vreal_t> &nodes_dof = T.nodes_dof;
 #elif !defined(MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES)
          nodes_dof_t nodes_dof;
 #endif
@@ -305,26 +339,31 @@ class TElementTransformation
 
 #ifdef MFEM_TEMPLATE_ENABLE_SERIALIZE
       inline MFEM_ALWAYS_INLINE
-      void EvalSerialized(int el, T_type &T, const real_t *nodeData)
+      void EvalSerialized(int el, T_type &T, const vreal_t *nodeData)
       {
+         const int SS = sizeof(nodeData[0])/sizeof(nodeData[0][0]);
+         MFEM_ASSERT(el % (SS*ne) == 0, "invalid element index: " << el);
          T.evaluator.CalcGrad(nodes_dof_t::layout.merge_23(),
-                              &nodeData[el*nodes_dof_t::size],
+                              &nodeData[el/SS*nodes_dof_t::size],
                               Jt.layout.merge_34(), Jt);
          T.SetAttributes(el, attrib);
       }
 #endif
    };
-   template <int NE> struct Result<10,NE> // 10 = EvalJacobians|LoadElementIdxs
+
+   // Case EvalOps = 10 = EvalJacobians|LoadElementIdxs
+   template <typename it_t> struct Result<10,it_t>
    {
-      static const int ne = NE;
+      static const int ne = it_t::batch_size;
+      typedef typename it_t::vreal_t vreal_t;
 #ifdef MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES
-      typedef TTensor4<qpts,dim,sdim,NE,real_t,true> Jt_type;
+      typedef TTensor4<qpts,dim,sdim,ne,vreal_t,true> Jt_type;
 #else
-      typedef TTensor4<qpts,dim,sdim,NE,real_t/*,true*/> Jt_type;
+      typedef TTensor4<qpts,dim,sdim,ne,vreal_t/*,true*/> Jt_type;
 #endif
       Jt_type Jt;
 
-      typedef TTensor3<dofs,sdim,NE,real_t> nodes_dof_t;
+      typedef TTensor3<dofs,sdim,ne,vreal_t> nodes_dof_t;
 #ifdef MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES
       nodes_dof_t nodes_dof;
 #endif
@@ -334,8 +373,8 @@ class TElementTransformation
       void Eval(int el, T_type &T)
       {
 #ifdef MFEM_TEMPLATE_ELTRANS_HAS_NODE_DOFS
-         MFEM_STATIC_ASSERT(NE == 1, "only NE == 1 is supported");
-         TTensor3<dofs,sdim,1,real_t> &nodes_dof = T.nodes_dof;
+         MFEM_STATIC_ASSERT(ne == 1, "only ne == 1 is supported");
+         TTensor3<dofs,sdim,1,vreal_t> &nodes_dof = T.nodes_dof;
 #elif !defined(MFEM_TEMPLATE_ELTRANS_RESULT_HAS_NODES)
          nodes_dof_t nodes_dof;
 #endif
@@ -349,10 +388,12 @@ class TElementTransformation
 
 #ifdef MFEM_TEMPLATE_ENABLE_SERIALIZE
       inline MFEM_ALWAYS_INLINE
-      void EvalSerialized(int el, T_type &T, const real_t *nodeData)
+      void EvalSerialized(int el, T_type &T, const vreal_t *nodeData)
       {
+         const int SS = sizeof(nodeData[0])/sizeof(nodeData[0][0]);
+         MFEM_ASSERT(el % (SS*ne) == 0, "invalid element index: " << el);
          T.evaluator.CalcGrad(nodes_dof_t::layout.merge_23(),
-                              &nodeData[el*nodes_dof_t::size],
+                              &nodeData[el/SS*nodes_dof_t::size],
                               Jt.layout.merge_34(), Jt);
          first_elem_idx = el;
       }
diff --git a/fem/tevaluator.hpp b/fem/tevaluator.hpp
index 50e00782c22..dd9d85a8510 100644
--- a/fem/tevaluator.hpp
+++ b/fem/tevaluator.hpp
@@ -23,12 +23,16 @@ namespace mfem
 // Templated classes for transitioning between degrees of freedom and quadrature
 // points values.
 
-// Shape evaluators -- values of basis functions on the reference element
-
+/** @brief Shape evaluators -- values of basis functions on the reference element
+    @tparam FE some form of TFiniteElement, probably obtained from TMesh::FE_type
+    @tparam IR some form of TIntegrationRule
+    @tparam TP tensor product or not
+    @tparam real_t data type for mesh nodes, solution basis, mesh basis
+*/
 template <class FE, class IR, bool TP, typename real_t>
 class ShapeEvaluator_base;
 
-// ShapeEvaluator without tensor-product structure
+/// ShapeEvaluator without tensor-product structure
 template <class FE, class IR, typename real_t>
 class ShapeEvaluator_base<FE, IR, false, real_t>
 {
@@ -54,11 +58,11 @@ class ShapeEvaluator_base<FE, IR, false, real_t>
 
    // default copy constructor
 
-   // Multi-component shape evaluation from DOFs to quadrature points.
-   // dof_layout is (DOF x NumComp) and qpt_layout is (NIP x NumComp).
+   /** @brief Multi-component shape evaluation from DOFs to quadrature points.
+       dof_layout is (DOF x NumComp) and qpt_layout is (NIP x NumComp). */
    template <typename dof_layout_t, typename dof_data_t,
              typename qpt_layout_t, typename qpt_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Calc(const dof_layout_t &dof_layout, const dof_data_t &dof_data,
              const qpt_layout_t &qpt_layout, qpt_data_t &qpt_data) const
    {
@@ -76,12 +80,12 @@ class ShapeEvaluator_base<FE, IR, false, real_t>
                      qpt_layout, qpt_data);
    }
 
-   // Multi-component shape evaluation transpose from quadrature points to DOFs.
-   // qpt_layout is (NIP x NumComp) and dof_layout is (DOF x NumComp).
+   /** @brief Multi-component shape evaluation transpose from quadrature points to
+       DOFs.  qpt_layout is (NIP x NumComp) and dof_layout is (DOF x NumComp). */
    template <bool Add,
              typename qpt_layout_t, typename qpt_data_t,
              typename dof_layout_t, typename dof_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcT(const qpt_layout_t &qpt_layout, const qpt_data_t &qpt_data,
               const dof_layout_t &dof_layout, dof_data_t &dof_data) const
    {
@@ -99,11 +103,11 @@ class ShapeEvaluator_base<FE, IR, false, real_t>
                    dof_layout, dof_data);
    }
 
-   // Multi-component gradient evaluation from DOFs to quadrature points.
-   // dof_layout is (DOF x NumComp) and grad_layout is (NIP x DIM x NumComp).
+   /** @brief Multi-component gradient evaluation from DOFs to quadrature points.
+      dof_layout is (DOF x NumComp) and grad_layout is (NIP x DIM x NumComp). */
    template <typename dof_layout_t, typename dof_data_t,
              typename grad_layout_t, typename grad_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcGrad(const dof_layout_t  &dof_layout,
                  const dof_data_t    &dof_data,
                  const grad_layout_t &grad_layout,
@@ -124,12 +128,12 @@ class ShapeEvaluator_base<FE, IR, false, real_t>
                      grad_layout.merge_12(), grad_data);
    }
 
-   // Multi-component gradient evaluation transpose from quadrature points to
-   // DOFs. grad_layout is (NIP x DIM x NumComp), dof_layout is (DOF x NumComp).
+   /** @brief Multi-component gradient evaluation transpose from quadrature points to
+      DOFs. grad_layout is (NIP x DIM x NumComp), dof_layout is (DOF x NumComp). */
    template <bool Add,
              typename grad_layout_t, typename grad_data_t,
              typename dof_layout_t, typename dof_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcGradT(const grad_layout_t &grad_layout,
                   const grad_data_t   &grad_data,
                   const dof_layout_t  &dof_layout,
@@ -150,11 +154,12 @@ class ShapeEvaluator_base<FE, IR, false, real_t>
                    dof_layout, dof_data);
    }
 
-   // Multi-component assemble.
-   // qpt_layout is (NIP x NumComp), M_layout is (DOF x DOF x NumComp)
+   /** @brief Multi-component assemble.
+       qpt_layout is (NIP x NumComp),
+       M_layout is (DOF x DOF x NumComp) */
    template <typename qpt_layout_t, typename qpt_data_t,
              typename M_layout_t, typename M_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Assemble(const qpt_layout_t &qpt_layout, const qpt_data_t &qpt_data,
                  const M_layout_t &M_layout, M_data_t &M_data) const
    {
@@ -173,19 +178,20 @@ class ShapeEvaluator_base<FE, IR, false, real_t>
 #endif
    }
 
-   // Multi-component assemble of grad-grad element matrices.
-   // qpt_layout is (NIP x DIM x DIM x NumComp), and
-   // D_layout is (DOF x DOF x NumComp).
+   /** @brief Multi-component assemble of grad-grad element matrices.
+       qpt_layout is (NIP x DIM x DIM x NumComp), and
+       D_layout is (DOF x DOF x NumComp). */
    template <typename qpt_layout_t, typename qpt_data_t,
              typename D_layout_t, typename D_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void AssembleGradGrad(const qpt_layout_t &qpt_layout,
                          const qpt_data_t   &qpt_data,
                          const D_layout_t   &D_layout,
                          D_data_t           &D_data) const
    {
       const int NC = qpt_layout_t::dim_4;
-      TTensor4<NIP,DIM,DOF,NC> F;
+      typedef typename qpt_data_t::data_type entry_type;
+      TTensor4<NIP,DIM,DOF,NC,entry_type> F;
       for (int k = 0; k < NC; k++)
       {
          // Next loop performs a batch of matrix-matrix products of size
@@ -207,7 +213,7 @@ class ShapeEvaluator_base<FE, IR, false, real_t>
 template <int Dim, int DOF, int NIP, typename real_t>
 class TProductShapeEvaluator;
 
-// ShapeEvaluator with 1D tensor-product structure
+/// ShapeEvaluator with 1D tensor-product structure
 template <int DOF, int NIP, typename real_t>
 class TProductShapeEvaluator<1, DOF, NIP, real_t>
 {
@@ -220,11 +226,11 @@ class TProductShapeEvaluator<1, DOF, NIP, real_t>
 public:
    TProductShapeEvaluator() { }
 
-   // Multi-component shape evaluation from DOFs to quadrature points.
-   // dof_layout is (DOF x NumComp) and qpt_layout is (NIP x NumComp).
+   /** @brief Multi-component shape evaluation from DOFs to quadrature points.
+       dof_layout is (DOF x NumComp) and qpt_layout is (NIP x NumComp). */
    template <typename dof_layout_t, typename dof_data_t,
              typename qpt_layout_t, typename qpt_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Calc(const dof_layout_t &dof_layout, const dof_data_t &dof_data,
              const qpt_layout_t &qpt_layout, qpt_data_t &qpt_data) const
    {
@@ -233,12 +239,12 @@ class TProductShapeEvaluator<1, DOF, NIP, real_t>
                      qpt_layout, qpt_data);
    }
 
-   // Multi-component shape evaluation transpose from quadrature points to DOFs.
-   // qpt_layout is (NIP x NumComp) and dof_layout is (DOF x NumComp).
+   /** @brief Multi-component shape evaluation transpose from quadrature points
+       to DOFs.  qpt_layout is (NIP x NumComp) and dof_layout is (DOF x NumComp). */
    template <bool Add,
              typename qpt_layout_t, typename qpt_data_t,
              typename dof_layout_t, typename dof_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcT(const qpt_layout_t &qpt_layout, const qpt_data_t &qpt_data,
               const dof_layout_t &dof_layout, dof_data_t &dof_data) const
    {
@@ -247,11 +253,11 @@ class TProductShapeEvaluator<1, DOF, NIP, real_t>
                    dof_layout, dof_data);
    }
 
-   // Multi-component gradient evaluation from DOFs to quadrature points.
-   // dof_layout is (DOF x NumComp) and grad_layout is (NIP x DIM x NumComp).
+   /** @brief Multi-component gradient evaluation from DOFs to quadrature points.
+       dof_layout is (DOF x NumComp) and grad_layout is (NIP x DIM x NumComp). */
    template <typename dof_layout_t, typename dof_data_t,
              typename grad_layout_t, typename grad_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcGrad(const dof_layout_t  &dof_layout,
                  const dof_data_t    &dof_data,
                  const grad_layout_t &grad_layout,
@@ -263,12 +269,12 @@ class TProductShapeEvaluator<1, DOF, NIP, real_t>
                      grad_layout.merge_12(), grad_data);
    }
 
-   // Multi-component gradient evaluation transpose from quadrature points to
-   // DOFs. grad_layout is (NIP x DIM x NumComp), dof_layout is (DOF x NumComp).
+   /** @brief Multi-component gradient evaluation transpose from quadrature points to
+       DOFs. grad_layout is (NIP x DIM x NumComp), dof_layout is (DOF x NumComp). */
    template <bool Add,
              typename grad_layout_t, typename grad_data_t,
              typename dof_layout_t, typename dof_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcGradT(const grad_layout_t &grad_layout,
                   const grad_data_t   &grad_data,
                   const dof_layout_t  &dof_layout,
@@ -281,11 +287,11 @@ class TProductShapeEvaluator<1, DOF, NIP, real_t>
                    dof_layout, dof_data);
    }
 
-   // Multi-component assemble.
-   // qpt_layout is (NIP x NumComp), M_layout is (DOF x DOF x NumComp)
+   /** @brief Multi-component assemble.
+       qpt_layout is (NIP x NumComp), M_layout is (DOF x DOF x NumComp) */
    template <typename qpt_layout_t, typename qpt_data_t,
              typename M_layout_t, typename M_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Assemble(const qpt_layout_t &qpt_layout, const qpt_data_t &qpt_data,
                  const M_layout_t &M_layout, M_data_t &M_data) const
    {
@@ -304,12 +310,12 @@ class TProductShapeEvaluator<1, DOF, NIP, real_t>
 #endif
    }
 
-   // Multi-component assemble of grad-grad element matrices.
-   // qpt_layout is (NIP x DIM x DIM x NumComp), and
-   // D_layout is (DOF x DOF x NumComp).
+   /** @brief Multi-component assemble of grad-grad element matrices.
+       qpt_layout is (NIP x DIM x DIM x NumComp), and
+       D_layout is (DOF x DOF x NumComp). */
    template <typename qpt_layout_t, typename qpt_data_t,
              typename D_layout_t, typename D_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void AssembleGradGrad(const qpt_layout_t &qpt_layout,
                          const qpt_data_t   &qpt_data,
                          const D_layout_t   &D_layout,
@@ -331,7 +337,7 @@ class TProductShapeEvaluator<1, DOF, NIP, real_t>
    }
 };
 
-// ShapeEvaluator with 2D tensor-product structure
+/// ShapeEvaluator with 2D tensor-product structure
 template <int DOF, int NIP, typename real_t>
 class TProductShapeEvaluator<2, DOF, NIP, real_t>
 {
@@ -348,13 +354,14 @@ class TProductShapeEvaluator<2, DOF, NIP, real_t>
    template <bool Dx, bool Dy,
              typename dof_layout_t, typename dof_data_t,
              typename qpt_layout_t, typename qpt_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Calc(const dof_layout_t &dof_layout, const dof_data_t &dof_data,
              const qpt_layout_t &qpt_layout, qpt_data_t &qpt_data) const
    {
       const int NC = dof_layout_t::dim_2;
+      typedef typename qpt_data_t::data_type entry_type;
       // DOF x DOF x NC --> NIP x DOF x NC --> NIP x NIP x NC
-      TTensor3<NIP,DOF,NC> A;
+      TTensor3<NIP,DOF,NC,entry_type> A;
 
       // (1) A_{i,j,k} = \sum_s B_1d_{i,s} dof_data_{s,j,k}
       Mult_2_1<false>(B_1d.layout, Dx ? G_1d : B_1d,
@@ -366,11 +373,11 @@ class TProductShapeEvaluator<2, DOF, NIP, real_t>
                       qpt_layout.template split_1<NIP,NIP>(), qpt_data);
    }
 
-   // Multi-component shape evaluation from DOFs to quadrature points.
-   // dof_layout is (TDOF x NumComp) and qpt_layout is (TNIP x NumComp).
+   /** @brief Multi-component shape evaluation from DOFs to quadrature points.
+       dof_layout is (TDOF x NumComp) and qpt_layout is (TNIP x NumComp). */
    template <typename dof_layout_t, typename dof_data_t,
              typename qpt_layout_t, typename qpt_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Calc(const dof_layout_t &dof_layout, const dof_data_t &dof_data,
              const qpt_layout_t &qpt_layout, qpt_data_t &qpt_data) const
    {
@@ -380,13 +387,14 @@ class TProductShapeEvaluator<2, DOF, NIP, real_t>
    template <bool Dx, bool Dy, bool Add,
              typename qpt_layout_t, typename qpt_data_t,
              typename dof_layout_t, typename dof_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcT(const qpt_layout_t &qpt_layout, const qpt_data_t &qpt_data,
               const dof_layout_t &dof_layout, dof_data_t &dof_data) const
    {
       const int NC = dof_layout_t::dim_2;
+      typedef typename qpt_data_t::data_type entry_type;
       // NIP x NIP X NC --> NIP x DOF x NC --> DOF x DOF x NC
-      TTensor3<NIP,DOF,NC> A;
+      TTensor3<NIP,DOF,NC,entry_type> A;
 
       // (1) A_{i,j,k} = \sum_s B_1d_{s,j} qpt_data_{i,s,k}
       Mult_1_2<false>(B_1d.layout, Dy ? G_1d : B_1d,
@@ -398,23 +406,23 @@ class TProductShapeEvaluator<2, DOF, NIP, real_t>
                     dof_layout.template split_1<DOF,DOF>(), dof_data);
    }
 
-   // Multi-component shape evaluation transpose from quadrature points to DOFs.
-   // qpt_layout is (TNIP x NumComp) and dof_layout is (TDOF x NumComp).
+   /** @brief Multi-component shape evaluation transpose from quadrature points to DOFs.
+       qpt_layout is (TNIP x NumComp) and dof_layout is (TDOF x NumComp). */
    template <bool Add,
              typename qpt_layout_t, typename qpt_data_t,
              typename dof_layout_t, typename dof_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcT(const qpt_layout_t &qpt_layout, const qpt_data_t &qpt_data,
               const dof_layout_t &dof_layout, dof_data_t &dof_data) const
    {
       CalcT<false,false,Add>(qpt_layout, qpt_data, dof_layout, dof_data);
    }
 
-   // Multi-component gradient evaluation from DOFs to quadrature points.
-   // dof_layout is (TDOF x NumComp) and grad_layout is (TNIP x DIM x NumComp).
+   /** @brief Multi-component gradient evaluation from DOFs to quadrature points.
+       dof_layout is (TDOF x NumComp) and grad_layout is (TNIP x DIM x NumComp). */
    template <typename dof_layout_t, typename dof_data_t,
              typename grad_layout_t, typename grad_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcGrad(const dof_layout_t  &dof_layout,
                  const dof_data_t    &dof_data,
                  const grad_layout_t &grad_layout,
@@ -426,13 +434,13 @@ class TProductShapeEvaluator<2, DOF, NIP, real_t>
                        grad_layout.ind2(1), grad_data);
    }
 
-   // Multi-component gradient evaluation transpose from quadrature points to
-   // DOFs. grad_layout is (TNIP x DIM x NumComp), dof_layout is
-   // (TDOF x NumComp).
+   /** @brief Multi-component gradient evaluation transpose from quadrature points to
+       DOFs. grad_layout is (TNIP x DIM x NumComp), dof_layout is
+       (TDOF x NumComp). */
    template <bool Add,
              typename grad_layout_t, typename grad_data_t,
              typename dof_layout_t, typename dof_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcGradT(const grad_layout_t &grad_layout,
                   const grad_data_t   &grad_data,
                   const dof_layout_t  &dof_layout,
@@ -444,15 +452,16 @@ class TProductShapeEvaluator<2, DOF, NIP, real_t>
                              dof_layout, dof_data);
    }
 
-   // Multi-component assemble.
-   // qpt_layout is (TNIP x NumComp), M_layout is (TDOF x TDOF x NumComp)
+   /** @brief Multi-component assemble.
+       qpt_layout is (TNIP x NumComp), M_layout is (TDOF x TDOF x NumComp) */
    template <typename qpt_layout_t, typename qpt_data_t,
              typename M_layout_t, typename M_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Assemble(const qpt_layout_t &qpt_layout, const qpt_data_t &qpt_data,
                  const M_layout_t &M_layout, M_data_t &M_data) const
    {
       const int NC = qpt_layout_t::dim_2;
+      typedef typename qpt_data_t::data_type entry_type;
 
       // Using TensorAssemble: <I,NIP,J> --> <DOF,I,DOF,J>
 
@@ -469,7 +478,7 @@ class TProductShapeEvaluator<2, DOF, NIP, real_t>
          TTensor3<DOF,NIP,DOF*NC>::layout, A,
          M_layout.merge_23().template split_12<DOF,DOF,DOF,DOF*NC>(), M_data);
 #elif 1
-      TTensor4<DOF,NIP,DOF,NC> A;
+      TTensor4<DOF,NIP,DOF,NC,entry_type> A;
       // qpt_data<NIP1,NIP2,NC> --> A<DOF2,NIP1,DOF2,NC>
       TensorAssemble<false>(
          Bt_1d.layout, Bt_1d, B_1d.layout, B_1d,
@@ -510,14 +519,15 @@ class TProductShapeEvaluator<2, DOF, NIP, real_t>
    template <int D1, int D2, bool Add,
              typename qpt_layout_t, typename qpt_data_t,
              typename D_layout_t, typename D_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Assemble(const qpt_layout_t &qpt_layout,
                  const qpt_data_t   &qpt_data,
                  const D_layout_t   &D_layout,
                  D_data_t           &D_data) const
    {
       const int NC = qpt_layout_t::dim_2;
-      TTensor4<DOF,NIP,DOF,NC> A;
+      typedef typename qpt_data_t::data_type entry_type;
+      TTensor4<DOF,NIP,DOF,NC,entry_type> A;
 
       // Using TensorAssemble: <I,NIP,J> --> <DOF,I,DOF,J>
 
@@ -531,16 +541,16 @@ class TProductShapeEvaluator<2, DOF, NIP, real_t>
       TensorAssemble<Add>(
          Bt_1d.layout, D1 == 1 ? Bt_1d : Gt_1d,
          B_1d.layout, D2 == 1 ? B_1d : G_1d,
-         TTensor3<DOF,NIP,DOF*NC>::layout, A,
+         A.layout.merge_34(), A,
          D_layout.merge_23().template split_12<DOF,DOF,DOF,DOF*NC>(), D_data);
    }
 
-   // Multi-component assemble of grad-grad element matrices.
-   // qpt_layout is (TNIP x DIM x DIM x NumComp), and
-   // D_layout is (TDOF x TDOF x NumComp).
+   /** @brief Multi-component assemble of grad-grad element matrices.
+      qpt_layout is (TNIP x DIM x DIM x NumComp), and
+      D_layout is (TDOF x TDOF x NumComp). */
    template <typename qpt_layout_t, typename qpt_data_t,
              typename D_layout_t, typename D_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void AssembleGradGrad(const qpt_layout_t &qpt_layout,
                          const qpt_data_t   &qpt_data,
                          const D_layout_t   &D_layout,
@@ -607,7 +617,7 @@ class TProductShapeEvaluator<2, DOF, NIP, real_t>
    }
 };
 
-// ShapeEvaluator with 3D tensor-product structure
+/// ShapeEvaluator with 3D tensor-product structure
 template <int DOF, int NIP, typename real_t>
 class TProductShapeEvaluator<3, DOF, NIP, real_t>
 {
@@ -624,13 +634,14 @@ class TProductShapeEvaluator<3, DOF, NIP, real_t>
    template <bool Dx, bool Dy, bool Dz,
              typename dof_layout_t, typename dof_data_t,
              typename qpt_layout_t, typename qpt_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Calc(const dof_layout_t &dof_layout, const dof_data_t &dof_data,
              const qpt_layout_t &qpt_layout, qpt_data_t &qpt_data) const
    {
       const int NC = dof_layout_t::dim_2;
-      TVector<NIP*DOF*DOF*NC> QDD;
-      TVector<NIP*NIP*DOF*NC> QQD;
+      typedef typename qpt_data_t::data_type entry_type;
+      TVector<NIP*DOF*DOF*NC,entry_type> QDD;
+      TVector<NIP*NIP*DOF*NC,entry_type> QQD;
 
       // QDD_{i,jj,k} = \sum_s B_1d_{i,s} dof_data_{s,jj,k}
       Mult_2_1<false>(B_1d.layout, Dx ? G_1d : B_1d,
@@ -646,11 +657,11 @@ class TProductShapeEvaluator<3, DOF, NIP, real_t>
                       qpt_layout.template split_1<NIP*NIP,NIP>(), qpt_data);
    }
 
-   // Multi-component shape evaluation from DOFs to quadrature points.
-   // dof_layout is (TDOF x NumComp) and qpt_layout is (TNIP x NumComp).
+   /** @brief Multi-component shape evaluation from DOFs to quadrature points.
+       dof_layout is (TDOF x NumComp) and qpt_layout is (TNIP x NumComp). */
    template <typename dof_layout_t, typename dof_data_t,
              typename qpt_layout_t, typename qpt_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Calc(const dof_layout_t &dof_layout, const dof_data_t &dof_data,
              const qpt_layout_t &qpt_layout, qpt_data_t &qpt_data) const
    {
@@ -660,13 +671,14 @@ class TProductShapeEvaluator<3, DOF, NIP, real_t>
    template <bool Dx, bool Dy, bool Dz, bool Add,
              typename qpt_layout_t, typename qpt_data_t,
              typename dof_layout_t, typename dof_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcT(const qpt_layout_t &qpt_layout, const qpt_data_t &qpt_data,
               const dof_layout_t &dof_layout, dof_data_t &dof_data) const
    {
       const int NC = dof_layout_t::dim_2;
-      TVector<NIP*DOF*DOF*NC> QDD;
-      TVector<NIP*NIP*DOF*NC> QQD;
+      typedef typename qpt_data_t::data_type entry_type;
+      TVector<NIP*DOF*DOF*NC,entry_type> QDD;
+      TVector<NIP*NIP*DOF*NC,entry_type> QQD;
 
       // QQD_{ii,j,k} = \sum_s B_1d_{s,j} qpt_data_{ii,s,k}
       Mult_1_2<false>(B_1d.layout, Dz ? G_1d : B_1d,
@@ -682,23 +694,23 @@ class TProductShapeEvaluator<3, DOF, NIP, real_t>
                     dof_layout.template split_1<DOF,DOF*DOF>(), dof_data);
    }
 
-   // Multi-component shape evaluation transpose from quadrature points to DOFs.
-   // qpt_layout is (TNIP x NumComp) and dof_layout is (TDOF x NumComp).
+   /** @brief Multi-component shape evaluation transpose from quadrature points to DOFs.
+       qpt_layout is (TNIP x NumComp) and dof_layout is (TDOF x NumComp). */
    template <bool Add,
              typename qpt_layout_t, typename qpt_data_t,
              typename dof_layout_t, typename dof_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcT(const qpt_layout_t &qpt_layout, const qpt_data_t &qpt_data,
               const dof_layout_t &dof_layout, dof_data_t &dof_data) const
    {
       CalcT<false,false,false,Add>(qpt_layout, qpt_data, dof_layout, dof_data);
    }
 
-   // Multi-component gradient evaluation from DOFs to quadrature points.
-   // dof_layout is (TDOF x NumComp) and grad_layout is (TNIP x DIM x NumComp).
+   /** @brief Multi-component gradient evaluation from DOFs to quadrature points.
+       dof_layout is (TDOF x NumComp) and grad_layout is (TNIP x DIM x NumComp). */
    template <typename dof_layout_t, typename dof_data_t,
              typename grad_layout_t, typename grad_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcGrad(const dof_layout_t  &dof_layout,
                  const dof_data_t    &dof_data,
                  const grad_layout_t &grad_layout,
@@ -714,13 +726,13 @@ class TProductShapeEvaluator<3, DOF, NIP, real_t>
       // y-derivatives and second time for the z-derivatives.
    }
 
-   // Multi-component gradient evaluation transpose from quadrature points to
-   // DOFs. grad_layout is (TNIP x DIM x NumComp), dof_layout is
-   // (TDOF x NumComp).
+   /** @brief Multi-component gradient evaluation transpose from quadrature points to
+       DOFs. grad_layout is (TNIP x DIM x NumComp), dof_layout is
+       (TDOF x NumComp). */
    template <bool Add,
              typename grad_layout_t, typename grad_data_t,
              typename dof_layout_t, typename dof_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void CalcGradT(const grad_layout_t &grad_layout,
                   const grad_data_t   &grad_data,
                   const dof_layout_t  &dof_layout,
@@ -734,17 +746,18 @@ class TProductShapeEvaluator<3, DOF, NIP, real_t>
                                    dof_layout, dof_data);
    }
 
-   // Multi-component assemble.
-   // qpt_layout is (TNIP x NumComp), M_layout is (TDOF x TDOF x NumComp)
+   /** @brief Multi-component assemble.
+       qpt_layout is (TNIP x NumComp), M_layout is (TDOF x TDOF x NumComp) */
    template <typename qpt_layout_t, typename qpt_data_t,
              typename M_layout_t, typename M_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Assemble(const qpt_layout_t &qpt_layout, const qpt_data_t &qpt_data,
                  const M_layout_t &M_layout, M_data_t &M_data) const
    {
       const int NC = qpt_layout_t::dim_2;
-      TTensor4<DOF,NIP*NIP,DOF,NC> A1;
-      TTensor4<DOF,DOF*NIP,DOF,DOF*NC> A2;
+      typedef typename qpt_data_t::data_type entry_type;
+      TTensor4<DOF,NIP*NIP,DOF,NC,entry_type> A1;
+      TTensor4<DOF,DOF*NIP,DOF,DOF*NC,entry_type> A2;
 
       // Using TensorAssemble: <I,NIP,J> --> <DOF,I,DOF,J>
 
@@ -788,15 +801,16 @@ class TProductShapeEvaluator<3, DOF, NIP, real_t>
    template <int D1, int D2, bool Add,
              typename qpt_layout_t, typename qpt_data_t,
              typename D_layout_t, typename D_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Assemble(const qpt_layout_t &qpt_layout,
                  const qpt_data_t   &qpt_data,
                  const D_layout_t   &D_layout,
                  D_data_t           &D_data) const
    {
       const int NC = qpt_layout_t::dim_2;
-      TTensor4<DOF,NIP*NIP,DOF,NC> A1;
-      TTensor4<DOF,DOF*NIP,DOF,DOF*NC> A2;
+      typedef typename qpt_data_t::data_type entry_type;
+      TTensor4<DOF,NIP*NIP,DOF,NC,entry_type> A1;
+      TTensor4<DOF,DOF*NIP,DOF,DOF*NC,entry_type> A2;
 
       // Using TensorAssemble: <I,NIP,J> --> <DOF,I,DOF,J>
 
@@ -824,7 +838,7 @@ class TProductShapeEvaluator<3, DOF, NIP, real_t>
 #if 0
    template <typename qpt_layout_t, typename qpt_data_t,
              typename D_layout_t, typename D_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void Assemble(int D1, int D2,
                  const qpt_layout_t &qpt_layout,
                  const qpt_data_t   &qpt_data,
@@ -859,12 +873,12 @@ class TProductShapeEvaluator<3, DOF, NIP, real_t>
    }
 #endif
 
-   // Multi-component assemble of grad-grad element matrices.
-   // qpt_layout is (TNIP x DIM x DIM x NumComp), and
-   // D_layout is (TDOF x TDOF x NumComp).
+   /** @brief Multi-component assemble of grad-grad element matrices.
+       qpt_layout is (TNIP x DIM x DIM x NumComp), and
+       D_layout is (TDOF x TDOF x NumComp). */
    template <typename qpt_layout_t, typename qpt_data_t,
              typename D_layout_t, typename D_data_t>
-   MFEM_ALWAYS_INLINE
+   inline MFEM_ALWAYS_INLINE
    void AssembleGradGrad(const qpt_layout_t &qpt_layout,
                          const qpt_data_t   &qpt_data,
                          const D_layout_t   &D_layout,
@@ -895,7 +909,7 @@ class TProductShapeEvaluator<3, DOF, NIP, real_t>
    }
 };
 
-// ShapeEvaluator with tensor-product structure in any dimension
+/// ShapeEvaluator with tensor-product structure in any dimension
 template <class FE, class IR, typename real_t>
 class ShapeEvaluator_base<FE, IR, true, real_t>
    : public TProductShapeEvaluator<FE::dim, FE::dofs_1d, IR::qpts_1d, real_t>
@@ -921,7 +935,7 @@ class ShapeEvaluator_base<FE, IR, true, real_t>
    // default copy constructor
 };
 
-// General ShapeEvaluator for any scalar FE type (L2 or H1)
+/// General ShapeEvaluator for any scalar FE type (L2 or H1)
 template <class FE, class IR, typename real_t>
 class ShapeEvaluator
    : public ShapeEvaluator_base<FE,IR,FE::tensor_prod && IR::tensor_prod,real_t>
@@ -946,8 +960,9 @@ class ShapeEvaluator
 };
 
 
-// Field evaluators -- values of a given global FE grid function
-
+/** @brief Field evaluators -- values of a given global FE grid function.
+    This is, roughly speaking, a templated version of GridFunction.
+*/
 template <typename FESpace_t, typename VecLayout_t, typename IR,
           typename complex_t, typename real_t>
 class FieldEvaluator_base
@@ -960,7 +975,7 @@ class FieldEvaluator_base
    ShapeEval_type  shapeEval;
    VecLayout_t     vec_layout;
 
-   // With this constructor, fespace is a shallow copy.
+   /// With this constructor, fespace is a shallow copy.
    inline MFEM_ALWAYS_INLINE
    FieldEvaluator_base(const FESpace_t &tfes, const ShapeEval_type &shape_eval,
                        const VecLayout_t &vec_layout)
@@ -969,14 +984,14 @@ class FieldEvaluator_base
         vec_layout(vec_layout)
    { }
 
-   // This constructor creates new fespace, not a shallow copy.
+   /// This constructor creates new fespace, not a shallow copy.
    inline MFEM_ALWAYS_INLINE
    FieldEvaluator_base(const FE_type &fe, const FiniteElementSpace &fes)
       : fespace(fe, fes), shapeEval(fe), vec_layout(fes)
    { }
 };
 
-// complex_t - dof/qpt data type, real_t - ShapeEvaluator (FE basis) data type
+/// complex_t - dof/qpt data type, real_t - ShapeEvaluator (FE basis) data type
 template <typename FESpace_t, typename VecLayout_t, typename IR,
           typename complex_t = double, typename real_t = double>
 class FieldEvaluator
@@ -1009,7 +1024,7 @@ class FieldEvaluator
    complex_t       *data_out;
 
 public:
-   // With this constructor, fespace is a shallow copy of tfes.
+   /// With this constructor, fespace is a shallow copy of tfes.
    inline MFEM_ALWAYS_INLINE
    FieldEvaluator(const FESpace_t &tfes, const ShapeEval_type &shape_eval,
                   const VecLayout_type &vec_layout,
@@ -1019,7 +1034,7 @@ class FieldEvaluator
         data_out(global_data_out)
    { }
 
-   // With this constructor, fespace is a shallow copy of f.fespace.
+   /// With this constructor, fespace is a shallow copy of f.fespace.
    inline MFEM_ALWAYS_INLINE
    FieldEvaluator(const FieldEvaluator &f,
                   const complex_t *global_data_in, complex_t *global_data_out)
@@ -1028,7 +1043,7 @@ class FieldEvaluator
         data_out(global_data_out)
    { }
 
-   // This constructor creates a new fespace, not a shallow copy.
+   /// This constructor creates a new fespace, not a shallow copy.
    inline MFEM_ALWAYS_INLINE
    FieldEvaluator(const FiniteElementSpace &fes,
                   const complex_t *global_data_in, complex_t *global_data_out)
@@ -1049,25 +1064,25 @@ class FieldEvaluator
       fespace.SetElement(el);
    }
 
-   // val_layout_t is (qpts x vdim x NE)
+   /// val_layout_t is (qpts x vdim x NE)
    template <typename val_layout_t, typename val_data_t>
    inline MFEM_ALWAYS_INLINE
    void GetValues(int el, const val_layout_t &l, val_data_t &vals)
    {
       const int ne = val_layout_t::dim_3;
-      TTensor3<dofs,vdim,ne,complex_type> val_dofs;
+      TTensor3<dofs,vdim,ne,typename val_data_t::data_type> val_dofs;
       SetElement(el);
       fespace.VectorExtract(vec_layout, data_in, val_dofs.layout, val_dofs);
       shapeEval.Calc(val_dofs.layout.merge_23(), val_dofs, l.merge_23(), vals);
    }
 
-   // grad_layout_t is (qpts x dim x vdim x NE)
+   /// grad_layout_t is (qpts x dim x vdim x NE)
    template <typename grad_layout_t, typename grad_data_t>
    inline MFEM_ALWAYS_INLINE
    void GetGradients(int el, const grad_layout_t &l, grad_data_t &grad)
    {
       const int ne = grad_layout_t::dim_4;
-      TTensor3<dofs,vdim,ne,complex_type> val_dofs;
+      TTensor3<dofs,vdim,ne,typename grad_data_t::data_type> val_dofs;
       SetElement(el);
       fespace.VectorExtract(vec_layout, data_in, val_dofs.layout, val_dofs);
       shapeEval.CalcGrad(val_dofs.layout.merge_23(), val_dofs,
@@ -1112,23 +1127,25 @@ class FieldEvaluator
 #ifdef MFEM_TEMPLATE_ENABLE_SERIALIZE
    template <typename DataType>
    inline MFEM_ALWAYS_INLINE
-   void EvalSerialized(const complex_t *loc_dofs, DataType &F)
+   void EvalSerialized(const typename DataType::vcomplex_t *loc_dofs,
+                       DataType &F)
    {
       Action<DataType::InData,true>::EvalSerialized(*this, loc_dofs, F);
    }
 
    template <bool Add, typename DataType>
    inline MFEM_ALWAYS_INLINE
-   void AssembleSerialized(const DataType &F, complex_t *loc_dofs)
+   void AssembleSerialized(const DataType &F,
+                           typename DataType::vcomplex_t *loc_dofs)
    {
       Action<DataType::OutData,true>::
       template AssembleSerialized<Add>(*this, F, loc_dofs);
    }
 #endif
 
-   // Enumeration for the data type used by the Eval() and Assemble() methods.
-   // The types can obtained by summing constants from this enumeration and used
-   // as a template parameter in struct Data.
+   /** @brief Enumeration for the data type used by the Eval() and Assemble() methods.
+       The types can be obtained by summing constants from this enumeration and used
+       as a template parameter in struct Data. */
    enum InOutData
    {
       None      = 0,
@@ -1136,65 +1153,72 @@ class FieldEvaluator
       Gradients = 2
    };
 
-   // Auxiliary templated struct AData, used by the Eval() and Assemble()
-   // methods. The template parameter IOData is "bitwise or" of constants from
-   // the enum InOutData. The parameter NE is the number of elements to be
-   // processed in the Eval() and Assemble() methods.
-   template<int IOData, int NE> struct AData;
+   /** @brief Auxiliary templated struct AData, used by the Eval() and Assemble()
+       methods.
+
+       The template parameter IOData is "bitwise or" of constants from
+       the enum InOutData. The parameter impl_traits_t supplies batch_size
+       (used as the element dimension) and vcomplex_t (the entry type). */
+   template<int IOData, typename impl_traits_t> struct AData;
 
-   template <int NE> struct AData<0,NE> // 0 = None
+   template <typename it_t> struct AData<0,it_t> // 0 = None
    {
       // Do we need this?
    };
 
-   template <int NE> struct AData<1,NE> // 1 = Values
+   template <typename it_t> struct AData<1,it_t> // 1 = Values
    {
+      static const int ne = it_t::batch_size;
+      typedef typename it_t::vcomplex_t vcomplex_t;
 #ifdef MFEM_TEMPLATE_FIELD_EVAL_DATA_HAS_DOFS
-      typedef TTensor3<dofs,vdim,NE,complex_t,true> val_dofs_t;
+      typedef TTensor3<dofs,vdim,ne,vcomplex_t,true> val_dofs_t;
       val_dofs_t val_dofs;
 #else
-      typedef TTensor3<dofs,vdim,NE,complex_t> val_dofs_t;
+      typedef TTensor3<dofs,vdim,ne,vcomplex_t> val_dofs_t;
 #endif
-      TTensor3<qpts,vdim,NE,complex_t>      val_qpts;
+      TTensor3<qpts,vdim,ne,vcomplex_t>      val_qpts;
    };
 
-   template <int NE> struct AData<2,NE> // 2 = Gradients
+   template <typename it_t> struct AData<2,it_t> // 2 = Gradients
    {
+      static const int ne = it_t::batch_size;
+      typedef typename it_t::vcomplex_t vcomplex_t;
 #ifdef MFEM_TEMPLATE_FIELD_EVAL_DATA_HAS_DOFS
-      typedef TTensor3<dofs,vdim,NE,complex_t,true> val_dofs_t;
+      typedef TTensor3<dofs,vdim,ne,vcomplex_t,true> val_dofs_t;
       val_dofs_t val_dofs;
 #else
-      typedef TTensor3<dofs,vdim,NE,complex_t> val_dofs_t;
+      typedef TTensor3<dofs,vdim,ne,vcomplex_t> val_dofs_t;
 #endif
-      TTensor4<qpts,dim,vdim,NE,complex_t>      grad_qpts;
+      TTensor4<qpts,dim,vdim,ne,vcomplex_t>      grad_qpts;
    };
 
-   template <int NE> struct AData<3,NE> // 3 = Values+Gradients
+   template <typename it_t> struct AData<3,it_t> // 3 = Values+Gradients
    {
+      static const int ne = it_t::batch_size;
+      typedef typename it_t::vcomplex_t vcomplex_t;
 #ifdef MFEM_TEMPLATE_FIELD_EVAL_DATA_HAS_DOFS
-      typedef TTensor3<dofs,vdim,NE,complex_t,true> val_dofs_t;
+      typedef TTensor3<dofs,vdim,ne,vcomplex_t,true> val_dofs_t;
       val_dofs_t val_dofs;
 #else
-      typedef TTensor3<dofs,vdim,NE,complex_t> val_dofs_t;
+      typedef TTensor3<dofs,vdim,ne,vcomplex_t> val_dofs_t;
 #endif
-      TTensor3<qpts,    vdim,NE,complex_t,true>  val_qpts;
-      TTensor4<qpts,dim,vdim,NE,complex_t>      grad_qpts;
+      TTensor3<qpts,    vdim,ne,vcomplex_t,true>  val_qpts;
+      TTensor4<qpts,dim,vdim,ne,vcomplex_t>      grad_qpts;
    };
 
-   // This struct is similar to struct AData, adding separate static data
-   // members for the input (InData) and output (OutData) data types.
-   template <int IData, int OData, int NE>
-   struct BData : public AData<IData|OData,NE>
+   /** @brief This struct is similar to struct AData, adding separate static data
+       members for the input (InData) and output (OutData) data types. */
+   template <int IData, int OData, typename it_t>
+   struct BData : public AData<IData|OData,it_t>
    {
       typedef T_type eval_type;
-      static const int ne = NE;
       static const int InData = IData;
       static const int OutData = OData;
    };
 
-   // This struct implements the input (Eval, EvalSerialized) and output
-   // (Assemble, AssembleSerialized) operations for the given Ops.
-   // Ops is "bitwise or" of constants from the enum InOutData.
+   /** @brief This struct implements the input (Eval, EvalSerialized) and output
+       (Assemble, AssembleSerialized) operations for the given Ops.
+       Ops is "bitwise or" of constants from the enum InOutData. */
    template <int Ops, bool dummy> struct Action;
 
    template <bool dummy> struct Action<0,dummy> // 0 = None
@@ -1238,7 +1262,9 @@ class FieldEvaluator
 #ifdef MFEM_TEMPLATE_ENABLE_SERIALIZE
       template <typename AData_t>
       static inline MFEM_ALWAYS_INLINE
-      void EvalSerialized(T_type &T, const complex_t *loc_dofs, AData_t &D)
+      void EvalSerialized(T_type &T,
+                          const typename AData_t::vcomplex_t *loc_dofs,
+                          AData_t &D)
       {
          T.shapeEval.Calc(AData_t::val_dofs_t::layout.merge_23(), loc_dofs,
                           D.val_qpts.layout.merge_23(), D.val_qpts);
@@ -1246,7 +1272,8 @@ class FieldEvaluator
 
       template <bool Add, typename AData_t>
       static inline MFEM_ALWAYS_INLINE
-      void AssembleSerialized(T_type &T, const AData_t &D, complex_t *loc_dofs)
+      void AssembleSerialized(T_type &T, const AData_t &D,
+                              typename AData_t::vcomplex_t *loc_dofs)
       {
          T.shapeEval.template CalcT<Add>(
             D.val_qpts.layout.merge_23(), D.val_qpts,
@@ -1291,7 +1318,9 @@ class FieldEvaluator
 #ifdef MFEM_TEMPLATE_ENABLE_SERIALIZE
       template <typename AData_t>
       static inline MFEM_ALWAYS_INLINE
-      void EvalSerialized(T_type &T, const complex_t *loc_dofs, AData_t &D)
+      void EvalSerialized(T_type &T,
+                          const typename AData_t::vcomplex_t *loc_dofs,
+                          AData_t &D)
       {
          T.shapeEval.CalcGrad(AData_t::val_dofs_t::layout.merge_23(), loc_dofs,
                               D.grad_qpts.layout.merge_34(), D.grad_qpts);
@@ -1299,7 +1328,8 @@ class FieldEvaluator
 
       template <bool Add, typename AData_t>
       static inline MFEM_ALWAYS_INLINE
-      void AssembleSerialized(T_type &T, const AData_t &D, complex_t *loc_dofs)
+      void AssembleSerialized(T_type &T, const AData_t &D,
+                              typename AData_t::vcomplex_t *loc_dofs)
       {
          T.shapeEval.template CalcGradT<Add>(
             D.grad_qpts.layout.merge_34(), D.grad_qpts,
@@ -1349,7 +1379,9 @@ class FieldEvaluator
 #ifdef MFEM_TEMPLATE_ENABLE_SERIALIZE
       template <typename AData_t>
       static inline MFEM_ALWAYS_INLINE
-      void EvalSerialized(T_type &T, const complex_t *loc_dofs, AData_t &D)
+      void EvalSerialized(T_type &T,
+                          const typename AData_t::vcomplex_t *loc_dofs,
+                          AData_t &D)
       {
          T.shapeEval.Calc(AData_t::val_dofs_t::layout.merge_23(), loc_dofs,
                           D.val_qpts.layout.merge_23(), D.val_qpts);
@@ -1359,7 +1391,8 @@ class FieldEvaluator
 
       template <bool Add, typename AData_t>
       static inline MFEM_ALWAYS_INLINE
-      void AssembleSerialized(T_type &T, const AData_t &D, complex_t *loc_dofs)
+      void AssembleSerialized(T_type &T, const AData_t &D,
+                              typename AData_t::vcomplex_t *loc_dofs)
       {
          T.shapeEval.template CalcT<Add>(
             D.val_qpts.layout.merge_23(), D.val_qpts,
@@ -1371,14 +1404,15 @@ class FieldEvaluator
 #endif
    };
 
-   // This struct implements element matrix computation for some combinations
-   // of input (InOps) and output (OutOps) operations.
-   template <int InOps, int OutOps, int NE> struct TElementMatrix;
+   /** @brief This struct implements element matrix computation for some combinations
+       of input (InOps) and output (OutOps) operations. */
+   template <int InOps, int OutOps, typename it_t> struct TElementMatrix;
 
-   template <int NE> struct TElementMatrix<1,1,NE> // 1,1 = Values,Values
+   // Case 1,1 = Values,Values
+   template <typename it_t> struct TElementMatrix<1,1,it_t>
    {
       // qpt_layout_t is (nip), M_layout_t is (dof x dof)
-      // NE = 1 is assumed
+      // it_t::batch_size = 1 is assumed
       template <typename qpt_layout_t, typename qpt_data_t,
                 typename M_layout_t, typename M_data_t>
       static inline MFEM_ALWAYS_INLINE
@@ -1390,10 +1424,18 @@ class FieldEvaluator
       }
    };
 
-   template <int NE> struct TElementMatrix<2,2,NE> // 2,2 = Gradients,Gradients
+   // Case 2,2 = Gradients,Gradients
+   template <typename it_t> struct TElementMatrix<2,2,it_t>
    {
-      // qpt_layout_t is (nip x dim x dim), M_layout_t is (dof x dof)
-      // NE = 1 is assumed
+      /** @brief Assemble element grad-grad (stiffness-like) matrix
+          @param a the layout for the quadrature point data
+          @param A given quadrature point data for element (incl. coefficient,
+                 geometry)
+          @param m the layout for the resulting element matrix
+          @param M the resulting element matrix
+          @param ev the shape evaluator
+          qpt_layout_t is (nip x dim x dim), M_layout_t is (dof x dof)
+          it_t::batch_size = 1 is assumed */
       template <typename qpt_layout_t, typename qpt_data_t,
                 typename M_layout_t, typename M_data_t>
       static inline MFEM_ALWAYS_INLINE
@@ -1405,15 +1447,15 @@ class FieldEvaluator
       }
    };
 
-   template <typename kernel_t, int NE> struct Spec
+   template <typename kernel_t, typename impl_traits_t> struct Spec
    {
       static const int InData =
          Values*kernel_t::in_values + Gradients*kernel_t::in_gradients;
       static const int OutData =
          Values*kernel_t::out_values + Gradients*kernel_t::out_gradients;
 
-      typedef BData<InData,OutData,NE>          DataType;
-      typedef TElementMatrix<InData,OutData,NE> ElementMatrix;
+      typedef BData<InData,OutData,impl_traits_t>          DataType;
+      typedef TElementMatrix<InData,OutData,impl_traits_t> ElementMatrix;
    };
 };
 
diff --git a/fem/tfe.hpp b/fem/tfe.hpp
index 8a71866062c..f3d27fd876f 100644
--- a/fem/tfe.hpp
+++ b/fem/tfe.hpp
@@ -20,6 +20,18 @@ namespace mfem
 
 // Templated finite element classes, cf. fe.?pp
 
+/** @brief Store mass-like matrix B for each integration point on the reference
+    element.
+    For tensor product evaluation, this is only called on the 1D reference
+    element, and higher dimensions are put together from that.
+    The element mass matrix can be written \f$ M_E = B^T D_E B \f$, where B is
+    built here and is the same for all elements in the mesh. The diagonal matrix
+    \f$ D_E \f$ then contains all the element-specific geometry and physics data.
+    @param fe the element we are calculating on
+    @param ir the integration rule to calculate the shape matrix on
+    @param[out] B must be (nip x dof) with column major storage
+    @param[in] dof_map the inverse of dof_map is applied to reorder local dofs.
+*/
 template <typename real_t>
 void CalcShapeMatrix(const FiniteElement &fe, const IntegrationRule &ir,
                      real_t *B, const Array<int> *dof_map = NULL)
@@ -41,6 +53,23 @@ void CalcShapeMatrix(const FiniteElement &fe, const IntegrationRule &ir,
    }
 }
 
+/** @brief Store gradient matrix G for each integration point on the reference
+    element.
+    For tensor product evaluation, this is only called on the 1D reference
+    element, and higher dimensions are put together from that.
+    The element stiffness matrix can be written
+    \f[
+       S_E = \sum_{k=1}^{nq} G_{k,i}^T (D_E^G)_{k,k} G_{k,j}
+    \f]
+    where \f$ nq \f$ is the number of quadrature points, \f$ D_E^G \f$ contains
+    all the information about the element geometry and coefficients (Jacobians
+    etc.), and \f$ G \f$ is the matrix built in this routine, which is the same
+    for all elements in a mesh.
+    @param fe the element we are calculating on
+    @param ir the integration rule to calculate the gradients on
+    @param[out] G must be (nip x dim x dof) with column major storage
+    @param[in] dof_map the inverse of dof_map is applied to reorder local dofs.
+*/
 template <typename real_t>
 void CalcGradTensor(const FiniteElement &fe, const IntegrationRule &ir,
                     real_t *G, const Array<int> *dof_map = NULL)
diff --git a/fem/tfespace.hpp b/fem/tfespace.hpp
index 0cbb53b4e02..2f27eda3d1b 100644
--- a/fem/tfespace.hpp
+++ b/fem/tfespace.hpp
@@ -114,18 +114,23 @@ class TFiniteElementSpace_simple
 public:
    typedef FE        FE_type;
    typedef IndexType index_type;
+   static const int dofs = FE::dofs;
 
 protected:
    index_type ind;
+   int num_elems, remain_elems;
 
 public:
    TFiniteElementSpace_simple(const FE &fe, const FiniteElementSpace &fes)
-      : ind(fe, fes) { }
+      : ind(fe, fes), num_elems(fes.GetNE()), remain_elems(num_elems) { }
 
    // default copy constructor
 
-   void SetElement(int el) { ind.SetElement(el); }
+   int GetNE() const { return num_elems; }
 
+   void SetElement(int el) { ind.SetElement(el); remain_elems = num_elems-el; }
+
+#if 0
    // Multi-element Extract:
    // Extract dofs for multiple elements starting with the current element.
    // The number of elements to extract is given by the second dimension of
@@ -137,6 +142,7 @@ class TFiniteElementSpace_simple
                 const dof_layout_t    &dof_layout,
                 dof_data_t            &dof_data) const
    {
+      const int SS = sizeof(dof_data[0])/sizeof(dof_data[0][0]);
       const int NE = dof_layout_t::dim_2;
       MFEM_STATIC_ASSERT(FE::dofs == dof_layout_t::dim_1,
                          "invalid number of dofs");
@@ -144,8 +150,11 @@ class TFiniteElementSpace_simple
       {
          for (int i = 0; i < FE::dofs; i++)
          {
-            Assign<Op>(dof_data[dof_layout.ind(i,j)],
-                       glob_dof_data[ind.map(i,j)]);
+            for (int s = 0; s < SS; s++)
+            {
+               Assign<Op>(dof_data[dof_layout.ind(i,j)][s],
+                          glob_dof_data[ind.map(i,s+SS*j)]);
+            }
          }
       }
    }
@@ -169,6 +178,7 @@ class TFiniteElementSpace_simple
                  const dof_data_t   &dof_data,
                  glob_dof_data_t    &glob_dof_data) const
    {
+      const int SS = sizeof(dof_data[0])/sizeof(dof_data[0][0]);
       const int NE = dof_layout_t::dim_2;
       MFEM_STATIC_ASSERT(FE::dofs == dof_layout_t::dim_1,
                          "invalid number of dofs");
@@ -176,8 +186,11 @@ class TFiniteElementSpace_simple
       {
          for (int i = 0; i < FE::dofs; i++)
          {
-            Assign<Op>(glob_dof_data[ind.map(i,j)],
-                       dof_data[dof_layout.ind(i,j)]);
+            for (int s = 0; s < SS; s++)
+            {
+               Assign<Op>(glob_dof_data[ind.map(i,s+SS*j)],
+                          dof_data[dof_layout.ind(i,j)][s]);
+            }
          }
       }
    }
@@ -191,6 +204,7 @@ class TFiniteElementSpace_simple
    {
       Assemble<AssignOp::Add>(dof_layout, dof_data, glob_dof_data);
    }
+#endif
 
    // Multi-element VectorExtract: vdof_layout is (DOFS x NumComp x NumElems).
    template <AssignOp::Type Op,
@@ -202,21 +216,39 @@ class TFiniteElementSpace_simple
                       const vdof_layout_t    &vdof_layout,
                       vdof_data_t            &vdof_data) const
    {
+      const int SS = sizeof(vdof_data[0])/sizeof(vdof_data[0][0]);
       const int NC = vdof_layout_t::dim_2;
       const int NE = vdof_layout_t::dim_3;
       MFEM_STATIC_ASSERT(FE::dofs == vdof_layout_t::dim_1,
                          "invalid number of dofs");
       MFEM_ASSERT(NC == vl.NumComponents(), "invalid number of components");
+      const int TE = std::min(SS*NE, remain_elems);
+      // const int TE = SS*NE;
       for (int k = 0; k < NC; k++)
       {
+#if 0
          for (int j = 0; j < NE; j++)
          {
             for (int i = 0; i < FE::dofs; i++)
             {
-               Assign<Op>(vdof_data[vdof_layout.ind(i,k,j)],
-                          glob_vdof_data[vl.ind(ind.map(i,j), k)]);
+               for (int s = 0; s < SS; s++)
+               {
+                  Assign<Op>(vdof_data[vdof_layout.ind(i,k,j)][s],
+                             glob_vdof_data[vl.ind(ind.map(i,s+SS*j), k)]);
+               }
             }
          }
+#else
+         for (int js = 0; js < TE; js++)
+         {
+            for (int i = 0; i < FE::dofs; i++)
+            {
+               const int s = js % SS, j = js / SS;
+               Assign<Op>(vdof_data[vdof_layout.ind(i,k,j)][s],
+                          glob_vdof_data[vl.ind(ind.map(i,js), k)]);
+            }
+         }
+#endif
       }
    }
 
@@ -241,21 +273,39 @@ class TFiniteElementSpace_simple
                        const vec_layout_t  &vl,
                        glob_vdof_data_t    &glob_vdof_data) const
    {
+      const int SS = sizeof(vdof_data[0])/sizeof(vdof_data[0][0]);
       const int NC = vdof_layout_t::dim_2;
       const int NE = vdof_layout_t::dim_3;
       MFEM_STATIC_ASSERT(FE::dofs == vdof_layout_t::dim_1,
                          "invalid number of dofs");
       MFEM_ASSERT(NC == vl.NumComponents(), "invalid number of components");
+      const int TE = std::min(SS*NE, remain_elems);
+      // const int TE = SS*NE;
       for (int k = 0; k < NC; k++)
       {
+#if 0
          for (int j = 0; j < NE; j++)
          {
             for (int i = 0; i < FE::dofs; i++)
             {
-               Assign<Op>(glob_vdof_data[vl.ind(ind.map(i,j), k)],
-                          vdof_data[vdof_layout.ind(i,k,j)]);
+               for (int s = 0; s < SS; s++)
+               {
+                  Assign<Op>(glob_vdof_data[vl.ind(ind.map(i,s+SS*j), k)],
+                             vdof_data[vdof_layout.ind(i,k,j)][s]);
+               }
+            }
+         }
+#else
+         for (int js = 0; js < TE; js++)
+         {
+            for (int i = 0; i < FE::dofs; i++)
+            {
+               const int s = js % SS, j = js / SS;
+               Assign<Op>(glob_vdof_data[vl.ind(ind.map(i,js), k)],
+                          vdof_data[vdof_layout.ind(i,k,j)][s]);
             }
          }
+#endif
       }
    }
 
@@ -282,21 +332,24 @@ class TFiniteElementSpace_simple
                           const vdof_layout_t    &vdof_layout,
                           vdof_data_t            &vdof_data) const
    {
+      const int SS = sizeof(vdof_data[0])/sizeof(vdof_data[0][0]);
       const int NC = vdof_layout_t::dim_2;
       const int NE = vdof_layout_t::dim_3;
+      const int TE = std::min(SS*NE, remain_elems);
       MFEM_STATIC_ASSERT(FE::dofs == vdof_layout_t::dim_1,
                          "invalid number of dofs");
       MFEM_ASSERT(first_comp + NC <= vl.NumComponents(),
                   "invalid number of components");
       for (int k = 0; k < NC; k++)
       {
-         for (int j = 0; j < NE; j++)
+         for (int js = 0; js < TE; js++)
          {
             for (int i = 0; i < FE::dofs; i++)
             {
+               const int s = js % SS, j = js / SS;
                Assign<AssignOp::Set>(
-                  vdof_data[vdof_layout.ind(i,k,j)],
-                  glob_vdof_data[vl.ind(ind.map(i,j), first_comp+k)]);
+                  vdof_data[vdof_layout.ind(i,k,j)][s],
+                  glob_vdof_data[vl.ind(ind.map(i,js), first_comp+k)]);
             }
          }
       }
@@ -314,55 +367,69 @@ class TFiniteElementSpace_simple
                            const vec_layout_t  &vl,
                            glob_vdof_data_t    &glob_vdof_data) const
    {
+      const int SS = sizeof(vdof_data[0])/sizeof(vdof_data[0][0]);
       const int NC = vdof_layout_t::dim_2;
       const int NE = vdof_layout_t::dim_3;
+      const int TE = std::min(SS*NE, remain_elems);
       MFEM_STATIC_ASSERT(FE::dofs == vdof_layout_t::dim_1,
                          "invalid number of dofs");
       MFEM_ASSERT(first_comp + NC <= vl.NumComponents(),
                   "invalid number of components");
       for (int k = 0; k < NC; k++)
       {
-         for (int j = 0; j < NE; j++)
+         for (int js = 0; js < TE; js++)
          {
             for (int i = 0; i < FE::dofs; i++)
             {
+               const int s = js % SS, j = js / SS;
                Assign<AssignOp::Add>(
-                  glob_vdof_data[vl.ind(ind.map(i,j), first_comp+k)],
-                  vdof_data[vdof_layout.ind(i,k,j)]);
+                  glob_vdof_data[vl.ind(ind.map(i,js), first_comp+k)],
+                  vdof_data[vdof_layout.ind(i,k,j)][s]);
             }
          }
       }
    }
 
-   void Assemble(const TMatrix<FE::dofs,FE::dofs,double> &m,
+   template <typename vcomplex_t>
+   void Assemble(const TMatrix<FE::dofs,FE::dofs,vcomplex_t> &m,
                  SparseMatrix &M) const
    {
+      const int SS = sizeof(m[0])/sizeof(m[0][0]);
+      const int TE = std::min(SS, remain_elems);
       MFEM_FLOPS_ADD(FE::dofs*FE::dofs);
-      for (int i = 0; i < FE::dofs; i++)
+      for (int s = 0; s < TE; s++)
       {
-         M.SetColPtr(ind.map(i,0));
-         for (int j = 0; j < FE::dofs; j++)
+         for (int i = 0; i < FE::dofs; i++)
          {
-            M._Add_(ind.map(j,0), m(i,j));
+            M.SetColPtr(ind.map(i,s));
+            for (int j = 0; j < FE::dofs; j++)
+            {
+               M._Add_(ind.map(j,s), m(i,j)[s]);
+            }
+            M.ClearColPtr();
          }
-         M.ClearColPtr();
       }
    }
 
-   template <typename vec_layout_t>
+   template <typename vec_layout_t, typename vcomplex_t>
    void AssembleBlock(int block_i, int block_j, const vec_layout_t &vl,
-                      const TMatrix<FE::dofs,FE::dofs,double> &m,
+                      const TMatrix<FE::dofs,FE::dofs,vcomplex_t> &m,
                       SparseMatrix &M) const
    {
+      const int SS = sizeof(m[0])/sizeof(m[0][0]);
+      const int TE = std::min(SS, remain_elems);
       MFEM_FLOPS_ADD(FE::dofs*FE::dofs);
-      for (int i = 0; i < FE::dofs; i++)
+      for (int s = 0; s < TE; s++)
       {
-         M.SetColPtr(vl.ind(ind.map(i,0), block_i));
-         for (int j = 0; j < FE::dofs; j++)
+         for (int i = 0; i < FE::dofs; i++)
          {
-            M._Add_(vl.ind(ind.map(j,0), block_j), m(i,j));
+            M.SetColPtr(vl.ind(ind.map(i,s), block_i));
+            for (int j = 0; j < FE::dofs; j++)
+            {
+               M._Add_(vl.ind(ind.map(j,s), block_j), m(i,j)[s]);
+            }
+            M.ClearColPtr();
          }
-         M.ClearColPtr();
       }
    }
 };
diff --git a/fem/tmop.cpp b/fem/tmop.cpp
index 91d714c92ed..a8957ad99cd 100644
--- a/fem/tmop.cpp
+++ b/fem/tmop.cpp
@@ -843,7 +843,20 @@ void TargetConstructor::ComputeAvgVolume() const
 #endif
 }
 
-// virtual method
+bool TargetConstructor::ContainsVolumeInfo() const
+{
+   switch (target_type)
+   {
+      case IDEAL_SHAPE_UNIT_SIZE: return false;
+      case IDEAL_SHAPE_EQUAL_SIZE:
+      case IDEAL_SHAPE_GIVEN_SIZE:
+      case GIVEN_SHAPE_AND_SIZE:
+      case GIVEN_FULL: return true;
+      default: MFEM_ABORT("TargetType not added to ContainsVolumeInfo.");
+   }
+   return false;
+}
+
 void TargetConstructor::ComputeElementTargets(int e_id, const FiniteElement &fe,
                                               const IntegrationRule &ir,
                                               const Vector &elfun,
@@ -941,6 +954,7 @@ void AnalyticAdaptTC::ComputeElementTargets(int e_id, const FiniteElement &fe,
          IsoparametricTransformation Tpr;
          Tpr.SetFE(&fe);
          Tpr.ElementNo = e_id;
+         Tpr.ElementType = ElementTransformation::ELEMENT;
          Tpr.GetPointMat().Transpose(point_mat);
 
          for (int i = 0; i < ir.GetNPoints(); i++)
@@ -957,42 +971,176 @@ void AnalyticAdaptTC::ComputeElementTargets(int e_id, const FiniteElement &fe,
 }
 
 #ifdef MFEM_USE_MPI
-void DiscreteAdaptTC::SetParDiscreteTargetSpec(ParGridFunction &tspec_)
+void DiscreteAdaptTC::FinalizeParDiscreteTargetSpec(const ParGridFunction
+                                                    &tspec_)
 {
-   tspec.SetSize(tspec_.Size());
-   tspec = tspec_;
-   tspec_fes   = tspec_.FESpace();
+   MFEM_VERIFY(adapt_eval, "SetAdaptivityEvaluator() has not been called!");
+   MFEM_VERIFY(ncomp > 0, "No target specifications have been set!");
 
-   if (!adapt_eval) { MFEM_ABORT("Set adaptivity evaluator\n"); }
+   ParFiniteElementSpace *ptspec_fes = tspec_.ParFESpace();
 
-   adapt_eval->SetParMetaInfo(*tspec_.ParFESpace()->GetParMesh(),
-                              *tspec_.FESpace()->FEColl(),
-                              tspec_.FESpace()->GetVDim());
-
-   adapt_eval->SetInitialField
-   (*tspec_.FESpace()->GetMesh()->GetNodes(), tspec);
+   adapt_eval->SetParMetaInfo(*ptspec_fes->GetParMesh(),
+                              *ptspec_fes->FEColl(), ncomp);
+   adapt_eval->SetInitialField(*tspec_fes->GetMesh()->GetNodes(), tspec);
 
    tspec_sav = tspec;
+   // Rebuild the ncomp-component space used to gather all fields per element.
+   delete tspec_fesv;
+   tspec_fesv = new FiniteElementSpace(tspec_fes->GetMesh(),
+                                       tspec_fes->FEColl(), ncomp);
+}
+
+void DiscreteAdaptTC::SetTspecAtIndex(int idx, const ParGridFunction &tspec_)
+{
+   const int vdim     = tspec_.FESpace()->GetVDim(),
+             dof_cnt  = tspec_.Size()/vdim;
+   for (int i = 0; i < dof_cnt*vdim; i++)
+   {
+      tspec(i+idx*dof_cnt) = tspec_(i);
+   }
+
+   FinalizeParDiscreteTargetSpec(tspec_);
+}
+
+void DiscreteAdaptTC::SetParDiscreteTargetSize(const ParGridFunction &tspec_)
+{
+   if (sizeidx > -1) { SetTspecAtIndex(sizeidx, tspec_); return; }
+   sizeidx = ncomp;
+   SetDiscreteTargetBase(tspec_);
+   FinalizeParDiscreteTargetSpec(tspec_);
+}
+
+void DiscreteAdaptTC::SetParDiscreteTargetSkew(const ParGridFunction &tspec_)
+{
+   if (skewidx > -1) { SetTspecAtIndex(skewidx, tspec_); return; }
+   skewidx = ncomp;
+   SetDiscreteTargetBase(tspec_);
+   FinalizeParDiscreteTargetSpec(tspec_);
+}
+
+void DiscreteAdaptTC::SetParDiscreteTargetAspectRatio(const ParGridFunction
+                                                      &tspec_)
+{
+   if (aspectratioidx > -1) { SetTspecAtIndex(aspectratioidx, tspec_); return; }
+   aspectratioidx = ncomp;
+   SetDiscreteTargetBase(tspec_);
+   FinalizeParDiscreteTargetSpec(tspec_);
+}
+
+void DiscreteAdaptTC::SetParDiscreteTargetOrientation(const ParGridFunction
+                                                      &tspec_)
+{
+   if (orientationidx > -1) { SetTspecAtIndex(orientationidx, tspec_); return; }
+   orientationidx = ncomp;
+   SetDiscreteTargetBase(tspec_);
+   FinalizeParDiscreteTargetSpec(tspec_);
+}
+
+void DiscreteAdaptTC::SetParDiscreteTargetSpec(const ParGridFunction &tspec_)
+{
+   // Size-only alias; SetParDiscreteTargetSize() finalizes on every path.
+   SetParDiscreteTargetSize(tspec_);
 }
 #endif
 
-void DiscreteAdaptTC::SetSerialDiscreteTargetSpec(GridFunction &tspec_)
+void DiscreteAdaptTC::SetDiscreteTargetBase(const GridFunction &tspec_)
 {
-   tspec.SetSize(tspec_.Size());
-   tspec = tspec_;
-   tspec_fes   = tspec_.FESpace();
+   const int vdim     = tspec_.FESpace()->GetVDim(),
+             dof_cnt  = tspec_.Size()/vdim;
 
-   if (!adapt_eval) { MFEM_ABORT("Set adaptivity evaluator\n"); }
+   ncomp += vdim;
 
-   adapt_eval->SetSerialMetaInfo(*tspec_.FESpace()->GetMesh(),
-                                 *tspec_.FESpace()->FEColl(),
-                                 tspec_.FESpace()->GetVDim());
-   adapt_eval->SetInitialField
-   (*tspec_.FESpace()->GetMesh()->GetNodes(), tspec);
+   delete tspec_fes;
+   tspec_fes = new FiniteElementSpace(tspec_.FESpace()->GetMesh(),
+                                      tspec_.FESpace()->FEColl(), 1);
+
+   // need to append data to tspec
+   // make a copy of tspec->tspec_temp, increase its size, and
+   // copy data from tspec_temp -> tspec, then add new entries
+   Vector tspec_temp = tspec;
+   tspec.SetSize(ncomp*dof_cnt);
+
+   for (int i = 0; i < tspec_temp.Size(); i++)
+   {
+      tspec(i) = tspec_temp(i);
+   }
+
+   for (int i = 0; i < dof_cnt*vdim; i++)
+   {
+      tspec(i+(ncomp-vdim)*dof_cnt) = tspec_(i);
+   }
+}
+
+void DiscreteAdaptTC::SetTspecAtIndex(int idx, const GridFunction &tspec_)
+{
+   const int vdim     = tspec_.FESpace()->GetVDim(),
+             dof_cnt  = tspec_.Size()/vdim;
+   for (int i = 0; i < dof_cnt*vdim; i++)
+   {
+      tspec(i+idx*dof_cnt) = tspec_(i);
+   }
+
+   FinalizeSerialDiscreteTargetSpec();
+}
+
+void DiscreteAdaptTC::SetSerialDiscreteTargetSize(const GridFunction &tspec_)
+{
+   // Overwrite the stored size field if one exists; otherwise append it.
+   if (sizeidx > -1) { SetTspecAtIndex(sizeidx, tspec_); return; }
+   sizeidx = ncomp;
+   SetDiscreteTargetBase(tspec_);
+   FinalizeSerialDiscreteTargetSpec();
+}
+
+void DiscreteAdaptTC::SetSerialDiscreteTargetSkew(const GridFunction &tspec_)
+{
+   if (skewidx > -1) { SetTspecAtIndex(skewidx, tspec_); return; }
+   skewidx = ncomp;
+   SetDiscreteTargetBase(tspec_);
+   FinalizeSerialDiscreteTargetSpec();
+}
+
+void DiscreteAdaptTC::SetSerialDiscreteTargetAspectRatio(
+   const GridFunction &tspec_)
+{
+   if (aspectratioidx > -1) { SetTspecAtIndex(aspectratioidx, tspec_); return; }
+   aspectratioidx = ncomp;
+   SetDiscreteTargetBase(tspec_);
+   FinalizeSerialDiscreteTargetSpec();
+}
+
+void DiscreteAdaptTC::SetSerialDiscreteTargetOrientation(
+   const GridFunction &tspec_)
+{
+   if (orientationidx > -1) { SetTspecAtIndex(orientationidx, tspec_); return; }
+   orientationidx = ncomp;
+   SetDiscreteTargetBase(tspec_);
+   FinalizeSerialDiscreteTargetSpec();
+}
+
+void DiscreteAdaptTC::FinalizeSerialDiscreteTargetSpec()
+{
+   MFEM_VERIFY(adapt_eval, "SetAdaptivityEvaluator() has not been called!");
+   MFEM_VERIFY(ncomp > 0, "No target specifications have been set!");
+   // Hand the mesh, FE collection and all ncomp fields to the evaluator.
+   adapt_eval->SetSerialMetaInfo(*tspec_fes->GetMesh(),
+                                 *tspec_fes->FEColl(), ncomp);
+   adapt_eval->SetInitialField(*tspec_fes->GetMesh()->GetNodes(), tspec);
 
    tspec_sav = tspec;
+   // Rebuild the ncomp-component space used to gather all fields per element.
+   delete tspec_fesv;
+   tspec_fesv = new FiniteElementSpace(tspec_fes->GetMesh(),
+                                       tspec_fes->FEColl(), ncomp);
+}
+
+void DiscreteAdaptTC::SetSerialDiscreteTargetSpec(const GridFunction &tspec_)
+{
+   // Size-only alias; SetSerialDiscreteTargetSize() finalizes on every path.
+   SetSerialDiscreteTargetSize(tspec_);
 }
 
+
 void DiscreteAdaptTC::UpdateTargetSpecification(const Vector &new_x,
                                                 bool use_flag)
 {
@@ -1020,8 +1168,12 @@ void DiscreteAdaptTC::UpdateTargetSpecificationAtNode(const FiniteElement &el,
 
    Array<int> dofs;
    tspec_fes->GetElementDofs(T.ElementNo, dofs);
-   int cnt = tspec.Size();
-   tspec(dofs[dofidx]) = IntData(dofs[dofidx]+dir*cnt);
+   const int cnt = tspec.Size()/ncomp; //dofs per scalar-field
+
+   for (int i = 0; i < ncomp; i++)
+   {
+      tspec(dofs[dofidx]+i*cnt) = IntData(dofs[dofidx] + i*cnt + dir*cnt*ncomp);
+   }
 }
 
 void DiscreteAdaptTC::RestoreTargetSpecificationAtNode(ElementTransformation &T,
@@ -1031,7 +1183,11 @@ void DiscreteAdaptTC::RestoreTargetSpecificationAtNode(ElementTransformation &T,
 
    Array<int> dofs;
    tspec_fes->GetElementDofs(T.ElementNo, dofs);
-   tspec(dofs[dofidx]) = tspec_sav(dofs[dofidx]);
+   const int cnt = tspec.Size()/ncomp;
+   for (int i = 0; i < ncomp; i++)
+   {
+      tspec(dofs[dofidx] + i*cnt) = tspec_sav(dofs[dofidx] + i*cnt);
+   }
 }
 
 void DiscreteAdaptTC::ComputeElementTargets(int e_id, const FiniteElement &fe,
@@ -1039,37 +1195,176 @@ void DiscreteAdaptTC::ComputeElementTargets(int e_id, const FiniteElement &fe,
                                             const Vector &elfun,
                                             DenseTensor &Jtr) const
 {
-   MFEM_VERIFY(tspec_fes, "A call to SetDiscreteTargerSpec() is needed.");
+   MFEM_VERIFY(tspec_fesv, "No target specifications have been set.");
 
    switch (target_type)
    {
       case IDEAL_SHAPE_GIVEN_SIZE:
+      case GIVEN_SHAPE_AND_SIZE:
       {
          const DenseMatrix &Wideal =
             Geometries.GetGeomToPerfGeomJac(fe.GetGeomType());
          const int dim = Wideal.Height(),
-                   ntspec_dofs = tspec_fes->GetFE(0)->GetDof();
+                   ndofs = tspec_fes->GetFE(0)->GetDof(),
+                   ntspec_dofs = ndofs*ncomp;
+
+         Vector shape(ndofs), tspec_vals(ntspec_dofs), par_vals,
+                par_vals_c1, par_vals_c2, par_vals_c3;
 
-         Vector shape(ntspec_dofs), tspec_vals(ntspec_dofs);
          Array<int> dofs;
-         tspec_fes->GetElementDofs(e_id, dofs);
+         DenseMatrix D_rho(dim), Q_phi(dim), R_theta(dim);
+         tspec_fesv->GetElementVDofs(e_id, dofs);
          tspec.GetSubVector(dofs, tspec_vals);
 
-         const double min_size = tspec_vals.Min();
-         MFEM_ASSERT(min_size > 0.0,
-                     "Non-positive size propagated in the target definition.");
-
          for (int i = 0; i < ir.GetNPoints(); i++)
          {
             const IntegrationPoint &ip = ir.IntPoint(i);
             tspec_fes->GetFE(e_id)->CalcShape(ip, shape);
-            const double size = std::max(shape * tspec_vals, min_size);
-            Jtr(i).Set(std::pow(size / Wideal.Det(), 1.0/dim), Wideal);
+            Jtr(i) = Wideal; //Initialize to identity
+
+            if (sizeidx != -1) //Set size
+            {
+               par_vals.SetDataAndSize(tspec_vals.GetData()+sizeidx*ndofs, ndofs);
+               const double min_size = par_vals.Min();
+               MFEM_VERIFY(min_size > 0.0,
+                           "Non-positive size propagated in the target definition.");
+               const double size = std::max(shape * par_vals, min_size);
+               Jtr(i).Set(std::pow(size, 1.0/dim), Jtr(i));
+            } //Done size
+
+            if (target_type == IDEAL_SHAPE_GIVEN_SIZE) { continue; }
+
+            if (aspectratioidx != -1) //Set aspect ratio
+            {
+               if (dim == 2)
+               {
+                  par_vals.SetDataAndSize(tspec_vals.GetData()+
+                                          aspectratioidx*ndofs, ndofs);
+
+                  const double aspectratio = shape * par_vals;
+                  D_rho = 0.;
+                  D_rho(0,0) = 1./pow(aspectratio,0.5);
+                  D_rho(1,1) = pow(aspectratio,0.5);
+               }
+               else
+               {
+                  par_vals.SetDataAndSize(tspec_vals.GetData()+
+                                          aspectratioidx*ndofs, ndofs*3);
+                  par_vals_c1.SetDataAndSize(par_vals.GetData(), ndofs);
+                  par_vals_c2.SetDataAndSize(par_vals.GetData()+ndofs, ndofs);
+                  par_vals_c3.SetDataAndSize(par_vals.GetData()+2*ndofs, ndofs);
+
+                  const double rho1 = shape * par_vals_c1;
+                  const double rho2 = shape * par_vals_c2;
+                  const double rho3 = shape * par_vals_c3;
+                  D_rho = 0.;
+                  D_rho(0,0) = pow(rho1,2./3.);
+                  D_rho(1,1) = pow(rho2,2./3.);
+                  D_rho(2,2) = pow(rho3,2./3.);
+               }
+
+               DenseMatrix Temp = Jtr(i);
+               Mult(D_rho, Temp, Jtr(i));
+            } //Done aspect ratio
+
+            if (skewidx != -1) //Set skew
+            {
+               if (dim == 2)
+               {
+                  par_vals.SetDataAndSize(tspec_vals.GetData()+
+                                          skewidx*ndofs, ndofs);
+
+                  const double skew = shape * par_vals;
+
+                  Q_phi = 0.;
+                  Q_phi(0,0) = 1.;
+                  Q_phi(0,1) = cos(skew);
+                  Q_phi(1,1) = sin(skew);
+               }
+               else
+               {
+                  par_vals.SetDataAndSize(tspec_vals.GetData()+
+                                          skewidx*ndofs, ndofs*3);
+                  par_vals_c1.SetDataAndSize(par_vals.GetData(), ndofs);
+                  par_vals_c2.SetDataAndSize(par_vals.GetData()+ndofs, ndofs);
+                  par_vals_c3.SetDataAndSize(par_vals.GetData()+2*ndofs, ndofs);
+
+                  const double phi12  = shape * par_vals_c1;
+                  const double phi13  = shape * par_vals_c2;
+                  const double chi = shape * par_vals_c3;
+
+                  Q_phi = 0.;
+                  Q_phi(0,0) = 1.;
+                  Q_phi(0,1) = cos(phi12);
+                  Q_phi(0,2) = cos(phi13);
+
+                  Q_phi(1,1) = sin(phi12);
+                  Q_phi(1,2) = sin(phi13)*cos(chi);
+
+                  Q_phi(2,2) = sin(phi13)*sin(chi);
+               }
+
+               DenseMatrix Temp = Jtr(i);
+               Mult(Q_phi, Temp, Jtr(i));
+            } // done skew
+
+            if (orientationidx != -1) //Set orientation
+            {
+               if (dim == 2)
+               {
+                  par_vals.SetDataAndSize(tspec_vals.GetData()+
+                                          orientationidx*ndofs, ndofs);
+
+                  const double theta = shape * par_vals;
+                  R_theta(0,0) =  cos(theta);
+                  R_theta(0,1) = -sin(theta);
+                  R_theta(1,0) =  sin(theta);
+                  R_theta(1,1) =  cos(theta);
+               }
+               else
+               {
+                  par_vals.SetDataAndSize(tspec_vals.GetData()+
+                                          orientationidx*ndofs, ndofs*3);
+                  par_vals_c1.SetDataAndSize(par_vals.GetData(), ndofs);
+                  par_vals_c2.SetDataAndSize(par_vals.GetData()+ndofs, ndofs);
+                  par_vals_c3.SetDataAndSize(par_vals.GetData()+2*ndofs, ndofs);
+
+                  const double theta = shape * par_vals_c1;
+                  const double psi   = shape * par_vals_c2;
+                  const double beta  = shape * par_vals_c3;
+                  // Build the outer R_theta here; a local one would shadow it.
+                  DenseMatrix R_tp(dim), R_beta(dim);
+                  double ct = cos(theta), st = sin(theta),
+                         cp = cos(psi),   sp = sin(psi);
+                  R_tp(0,0) = ct*sp;
+                  R_tp(1,0) = st*sp;
+                  R_tp(2,0) = cp;
+
+                  R_tp(0,1) = -(ct*st*sp*sp)/(1+cp);
+                  R_tp(1,1) = cp+(pow(ct,2.)*pow(sp,2.))/(1+cp);
+                  R_tp(2,1) = -st*sp;
+
+                  R_tp(0,2) = -cp-(pow(st,2.)*pow(sp,2.))/(1+cp);
+                  R_tp(1,2) = -R_tp(0,1);
+                  R_tp(2,2) =  ct*sp;
+
+                  R_beta = 0.;
+                  R_beta(0,0) = 1.;
+                  R_beta(1,1) =  cos(beta);
+                  R_beta(1,2) = -sin(beta);
+                  R_beta(2,1) =  sin(beta);
+                  R_beta(2,2) =  cos(beta);
+                  // The product lands in the R_theta that is applied below.
+                  Mult(R_tp, R_beta, R_theta);
+               }
+               DenseMatrix Temp = Jtr(i);
+               Mult(R_theta, Temp, Jtr(i));
+            } // done orientation
          }
          break;
       }
       default:
-         MFEM_ABORT("Incompatible target type for analytic adaptation!");
+         MFEM_ABORT("Incompatible target type for discrete adaptation!");
    }
 }
 
@@ -1079,13 +1374,10 @@ void DiscreteAdaptTC::UpdateGradientTargetSpecification(const Vector &x,
 {
    if (use_flag && good_tspec_grad) { return; }
 
-   const int dim = tspec_fes->GetFE(0)->GetDim();
-   const int cnt = x.Size()/dim;
+   const int dim = tspec_fes->GetFE(0)->GetDim(),
+             cnt = x.Size()/dim;
 
-   if (tspec_pert1h.Size() != x.Size())
-   {
-      tspec_pert1h.SetSize(x.Size());
-   }
+   tspec_pert1h.SetSize(x.Size()*ncomp);
 
    Vector TSpecTemp;
    Vector xtemp = x;
@@ -1093,7 +1385,7 @@ void DiscreteAdaptTC::UpdateGradientTargetSpecification(const Vector &x,
    {
       for (int i = 0; i < cnt; i++) { xtemp(j*cnt+i) += dx; }
 
-      TSpecTemp.SetDataAndSize(tspec_pert1h.GetData() + j*cnt, cnt);
+      TSpecTemp.NewDataAndSize(tspec_pert1h.GetData() + j*cnt*ncomp, cnt*ncomp);
       UpdateTargetSpecification(xtemp, TSpecTemp);
 
       for (int i = 0; i < cnt; i++) { xtemp(j*cnt+i) -= dx; }
@@ -1105,16 +1397,15 @@ void DiscreteAdaptTC::UpdateGradientTargetSpecification(const Vector &x,
 void DiscreteAdaptTC::UpdateHessianTargetSpecification(const Vector &x,
                                                        double dx, bool use_flag)
 {
+
    if (use_flag && good_tspec_hess) { return; }
 
-   const int dim = tspec_fes->GetFE(0)->GetDim();
-   const int cnt = x.Size()/dim;
+   const int dim    = tspec_fes->GetFE(0)->GetDim(),
+             cnt    = x.Size()/dim,
+             totmix = 1+2*(dim-2);
 
-   if (tspec_pert2h.Size() != x.Size())
-   {
-      tspec_pert2h.SetSize(x.Size());
-      tspec_pertmix.SetSize(cnt*(1+2*(dim-2)));
-   }
+   tspec_pert2h.SetSize(cnt*dim*ncomp);
+   tspec_pertmix.SetSize(cnt*totmix*ncomp);
 
    Vector TSpecTemp;
    Vector xtemp = x;
@@ -1124,14 +1415,14 @@ void DiscreteAdaptTC::UpdateHessianTargetSpecification(const Vector &x,
    {
       for (int i = 0; i < cnt; i++) { xtemp(j*cnt+i) += 2*dx; }
 
-      TSpecTemp.SetDataAndSize(tspec_pert2h.GetData() + j*cnt, cnt);
+      TSpecTemp.NewDataAndSize(tspec_pert2h.GetData() + j*cnt*ncomp, cnt*ncomp);
       UpdateTargetSpecification(xtemp, TSpecTemp);
 
       for (int i = 0; i < cnt; i++) { xtemp(j*cnt+i) -= 2*dx; }
    }
 
    // T(x+h,y+h)
-   int idx = 0;
+   int j = 0;
    for (int k1 = 0; k1 < dim; k1++)
    {
       for (int k2 = 0; (k1 != k2) && (k2 < dim); k2++)
@@ -1142,7 +1433,7 @@ void DiscreteAdaptTC::UpdateHessianTargetSpecification(const Vector &x,
             xtemp(k2*cnt+i) += dx;
          }
 
-         TSpecTemp.SetDataAndSize(tspec_pertmix.GetData() + idx*cnt, cnt);
+         TSpecTemp.NewDataAndSize(tspec_pertmix.GetData() + j*cnt*ncomp, cnt*ncomp);
          UpdateTargetSpecification(xtemp, TSpecTemp);
 
          for (int i = 0; i < cnt; i++)
@@ -1150,7 +1441,7 @@ void DiscreteAdaptTC::UpdateHessianTargetSpecification(const Vector &x,
             xtemp(k1*cnt+i) -= dx;
             xtemp(k2*cnt+i) -= dx;
          }
-         idx++;
+         j++;
       }
    }
 
@@ -1165,6 +1456,8 @@ void AdaptivityEvaluator::SetSerialMetaInfo(const Mesh &m,
    delete mesh;
    mesh = new Mesh(m, true);
    fes = new FiniteElementSpace(mesh, &fec, num_comp);
+   dim = fes->GetFE(0)->GetDim();
+   ncomp = num_comp;
 }
 
 #ifdef MFEM_USE_MPI
@@ -1176,6 +1469,8 @@ void AdaptivityEvaluator::SetParMetaInfo(const ParMesh &m,
    delete pmesh;
    pmesh = new ParMesh(m, true);
    pfes  = new ParFiniteElementSpace(pmesh, &fec, num_comp);
+   dim = pfes->GetFE(0)->GetDim();
+   ncomp = num_comp;
 }
 #endif
 
@@ -1189,6 +1484,17 @@ AdaptivityEvaluator::~AdaptivityEvaluator()
 #endif
 }
 
+TMOP_Integrator::~TMOP_Integrator()
+{
+   delete lim_func;
+   delete zeta;
+   for (int i = 0; i < ElemDer.Size(); i++)
+   {
+      delete ElemDer[i];
+      delete ElemPertEnergy[i];
+   }
+}
+
 void TMOP_Integrator::EnableLimiting(const GridFunction &n0,
                                      const GridFunction &dist, Coefficient &w0,
                                      TMOP_LimiterFunction *lfunc)
@@ -1214,24 +1520,57 @@ void TMOP_Integrator::EnableLimiting(const GridFunction &n0, Coefficient &w0,
    }
 }
 
+void TMOP_Integrator::EnableAdaptiveLimiting(const GridFunction &z0,
+                                             Coefficient &coeff,
+                                             AdaptivityEvaluator &ae)
+{
+   zeta_0 = &z0;
+   delete zeta;
+   zeta   = new GridFunction(z0);
+   coeff_zeta = &coeff;
+   adapt_eval = &ae;
+
+   adapt_eval->SetSerialMetaInfo(*zeta->FESpace()->GetMesh(),
+                                 *zeta->FESpace()->FEColl(), 1);
+   adapt_eval->SetInitialField
+   (*zeta->FESpace()->GetMesh()->GetNodes(), *zeta);
+}
+
+#ifdef MFEM_USE_MPI
+void TMOP_Integrator::EnableAdaptiveLimiting(const ParGridFunction &z0,
+                                             Coefficient &coeff,
+                                             AdaptivityEvaluator &ae)
+{
+   zeta_0 = &z0;
+   delete zeta;
+   zeta   = new GridFunction(z0);
+   coeff_zeta = &coeff;
+   adapt_eval = &ae;
+
+   adapt_eval->SetParMetaInfo(*z0.ParFESpace()->GetParMesh(),
+                              *z0.ParFESpace()->FEColl(), 1);
+   adapt_eval->SetInitialField
+   (*zeta->FESpace()->GetMesh()->GetNodes(), *zeta);
+}
+#endif
+
 double TMOP_Integrator::GetElementEnergy(const FiniteElement &el,
                                          ElementTransformation &T,
                                          const Vector &elfun)
 {
-   int dof = el.GetDof(), dim = el.GetDim();
+   const int dof = el.GetDof(), dim = el.GetDim();
    double energy;
 
+   // No adaptive limiting terms if this is a FD computation.
+   const bool adaptive_limiting = (zeta && fd_call_flag == false);
+
    DSh.SetSize(dof, dim);
    Jrt.SetSize(dim);
    Jpr.SetSize(dim);
    Jpt.SetSize(dim);
    PMatI.UseExternalData(elfun.GetData(), dof, dim);
 
-   const IntegrationRule *ir = IntRule;
-   if (!ir)
-   {
-      ir = &(IntRules.Get(el.GetGeomType(), 2*el.GetOrder() + 3)); // <---
-   }
+   const IntegrationRule *ir = EnergyIntegrationRule(el);
 
    energy = 0.0;
    DenseTensor Jtr(dim, dim, ir->GetNPoints());
@@ -1262,11 +1601,12 @@ double TMOP_Integrator::GetElementEnergy(const FiniteElement &el,
 
    // Define ref->physical transformation, when a Coefficient is specified.
    IsoparametricTransformation *Tpr = NULL;
-   if (coeff1 || coeff0)
+   if (coeff1 || coeff0 || adaptive_limiting)
    {
       Tpr = new IsoparametricTransformation;
       Tpr->SetFE(&el);
       Tpr->ElementNo = T.ElementNo;
+      Tpr->ElementType = ElementTransformation::ELEMENT;
       Tpr->Attribute = T.Attribute;
       Tpr->GetPointMat().Transpose(PMatI); // PointMat = PMatI^T
    }
@@ -1278,6 +1618,13 @@ double TMOP_Integrator::GetElementEnergy(const FiniteElement &el,
    //       the physical coordinates (i.e. changes in 'elfun'), e.g. when the
    //       coefficient is a ConstantCoefficient or a GridFunctionCoefficient.
 
+   Vector zeta_q, zeta0_q;
+   if (adaptive_limiting)
+   {
+      zeta->GetValues(T.ElementNo, *ir, zeta_q);
+      zeta_0->GetValues(T.ElementNo, *ir, zeta0_q);
+   }
+
    for (int i = 0; i < ir->GetNPoints(); i++)
    {
       const IntegrationPoint &ip = ir->IntPoint(i);
@@ -1301,6 +1648,13 @@ double TMOP_Integrator::GetElementEnergy(const FiniteElement &el,
          val += lim_normal *
                 lim_func->Eval(p, p0, d_vals(i)) * coeff0->Eval(*Tpr, ip);
       }
+
+      if (adaptive_limiting)
+      {
+         const double diff = zeta_q(i) - zeta0_q(i);
+         val += coeff_zeta->Eval(*Tpr, ip) * lim_normal * diff * diff;
+      }
+
       energy += weight * val;
    }
    delete Tpr;
@@ -1338,9 +1692,10 @@ void TMOP_Integrator::AssembleElementGrad(const FiniteElement &el,
 
 void TMOP_Integrator::AssembleElementVectorExact(const FiniteElement &el,
                                                  ElementTransformation &T,
-                                                 const Vector &elfun, Vector &elvect)
+                                                 const Vector &elfun,
+                                                 Vector &elvect)
 {
-   int dof = el.GetDof(), dim = el.GetDim();
+   const int dof = el.GetDof(), dim = el.GetDim();
 
    DSh.SetSize(dof, dim);
    DS.SetSize(dof, dim);
@@ -1351,14 +1706,12 @@ void TMOP_Integrator::AssembleElementVectorExact(const FiniteElement &el,
    elvect.SetSize(dof*dim);
    PMatO.UseExternalData(elvect.GetData(), dof, dim);
 
-   const IntegrationRule *ir = IntRule;
-   if (!ir)
-   {
-      ir = &(IntRules.Get(el.GetGeomType(), 2*el.GetOrder() + 3)); // <---
-   }
+   const IntegrationRule *ir = ActionIntegrationRule(el);
+   const int nqp = ir->GetNPoints();
 
    elvect = 0.0;
-   DenseTensor Jtr(dim, dim, ir->GetNPoints());
+   Vector weights(nqp);
+   DenseTensor Jtr(dim, dim, nqp);
    targetC->ComputeElementTargets(T.ElementNo, el, *ir, elfun, Jtr);
 
    // Limited case.
@@ -1380,29 +1733,30 @@ void TMOP_Integrator::AssembleElementVectorExact(const FiniteElement &el,
       }
       else
       {
-         d_vals.SetSize(ir->GetNPoints()); d_vals = 1.0;
+         d_vals.SetSize(nqp); d_vals = 1.0;
       }
    }
 
    // Define ref->physical transformation, when a Coefficient is specified.
    IsoparametricTransformation *Tpr = NULL;
-   if (coeff1 || coeff0)
+   if (coeff1 || coeff0 || zeta)
    {
       Tpr = new IsoparametricTransformation;
       Tpr->SetFE(&el);
       Tpr->ElementNo = T.ElementNo;
+      Tpr->ElementType = ElementTransformation::ELEMENT;
       Tpr->Attribute = T.Attribute;
       Tpr->GetPointMat().Transpose(PMatI); // PointMat = PMatI^T
    }
 
-   for (int i = 0; i < ir->GetNPoints(); i++)
+   for (int q = 0; q < nqp; q++)
    {
-      const IntegrationPoint &ip = ir->IntPoint(i);
-      const DenseMatrix &Jtr_i = Jtr(i);
-      metric->SetTargetJacobian(Jtr_i);
-      CalcInverse(Jtr_i, Jrt);
-      const double weight = ip.weight * Jtr_i.Det();
-      double weight_m = weight * metric_normal;
+      const IntegrationPoint &ip = ir->IntPoint(q);
+      const DenseMatrix &Jtr_q = Jtr(q);
+      metric->SetTargetJacobian(Jtr_q);
+      CalcInverse(Jtr_q, Jrt);
+      weights(q) = ip.weight * Jtr_q.Det();
+      double weight_m = weights(q) * metric_normal;
 
       el.CalcDShape(ip, DSh);
       Mult(DSh, Jrt, DS);
@@ -1422,11 +1776,14 @@ void TMOP_Integrator::AssembleElementVectorExact(const FiniteElement &el,
          el.CalcShape(ip, shape);
          PMatI.MultTranspose(shape, p);
          pos0.MultTranspose(shape, p0);
-         lim_func->Eval_d1(p, p0, d_vals(i), grad);
-         grad *= weight * lim_normal * coeff0->Eval(*Tpr, ip);
+         lim_func->Eval_d1(p, p0, d_vals(q), grad);
+         grad *= weights(q) * lim_normal * coeff0->Eval(*Tpr, ip);
          AddMultVWt(shape, grad, PMatO);
       }
    }
+
+   if (zeta) { AssembleElemVecAdaptLim(el, weights, *Tpr, *ir, PMatO); }
+
    delete Tpr;
 }
 
@@ -1435,7 +1792,7 @@ void TMOP_Integrator::AssembleElementGradExact(const FiniteElement &el,
                                                const Vector &elfun,
                                                DenseMatrix &elmat)
 {
-   int dof = el.GetDof(), dim = el.GetDim();
+   const int dof = el.GetDof(), dim = el.GetDim();
 
    DSh.SetSize(dof, dim);
    DS.SetSize(dof, dim);
@@ -1444,14 +1801,12 @@ void TMOP_Integrator::AssembleElementGradExact(const FiniteElement &el,
    PMatI.UseExternalData(elfun.GetData(), dof, dim);
    elmat.SetSize(dof*dim);
 
-   const IntegrationRule *ir = IntRule;
-   if (!ir)
-   {
-      ir = &(IntRules.Get(el.GetGeomType(), 2*el.GetOrder() + 3)); // <---
-   }
+   const IntegrationRule *ir = GradientIntegrationRule(el);
+   const int nqp = ir->GetNPoints();
 
    elmat = 0.0;
-   DenseTensor Jtr(dim, dim, ir->GetNPoints());
+   Vector weights(nqp);
+   DenseTensor Jtr(dim, dim, nqp);
    targetC->ComputeElementTargets(T.ElementNo, el, *ir, elfun, Jtr);
 
    // Limited case.
@@ -1473,29 +1828,30 @@ void TMOP_Integrator::AssembleElementGradExact(const FiniteElement &el,
       }
       else
       {
-         d_vals.SetSize(ir->GetNPoints()); d_vals = 1.0;
+         d_vals.SetSize(nqp); d_vals = 1.0;
       }
    }
 
    // Define ref->physical transformation, when a Coefficient is specified.
    IsoparametricTransformation *Tpr = NULL;
-   if (coeff1 || coeff0)
+   if (coeff1 || coeff0 || zeta)
    {
       Tpr = new IsoparametricTransformation;
       Tpr->SetFE(&el);
       Tpr->ElementNo = T.ElementNo;
+      Tpr->ElementType = ElementTransformation::ELEMENT;
       Tpr->Attribute = T.Attribute;
       Tpr->GetPointMat().Transpose(PMatI);
    }
 
-   for (int i = 0; i < ir->GetNPoints(); i++)
+   for (int q = 0; q < nqp; q++)
    {
-      const IntegrationPoint &ip = ir->IntPoint(i);
-      const DenseMatrix &Jtr_i = Jtr(i);
-      metric->SetTargetJacobian(Jtr_i);
-      CalcInverse(Jtr_i, Jrt);
-      const double weight = ip.weight * Jtr_i.Det();
-      double weight_m = weight * metric_normal;
+      const IntegrationPoint &ip = ir->IntPoint(q);
+      const DenseMatrix &Jtr_q = Jtr(q);
+      metric->SetTargetJacobian(Jtr_q);
+      CalcInverse(Jtr_q, Jrt);
+      weights(q) = ip.weight * Jtr_q.Det();
+      double weight_m = weights(q) * metric_normal;
 
       el.CalcDShape(ip, DSh);
       Mult(DSh, Jrt, DS);
@@ -1507,13 +1863,14 @@ void TMOP_Integrator::AssembleElementGradExact(const FiniteElement &el,
 
       // TODO: derivatives of adaptivity-based targets.
 
+      // TODO optimize by symmetry.
       if (coeff0)
       {
          el.CalcShape(ip, shape);
          PMatI.MultTranspose(shape, p);
          pos0.MultTranspose(shape, p0);
-         weight_m = weight * lim_normal * coeff0->Eval(*Tpr, ip);
-         lim_func->Eval_d2(p, p0, d_vals(i), grad_grad);
+         weight_m = weights(q) * lim_normal * coeff0->Eval(*Tpr, ip);
+         lim_func->Eval_d2(p, p0, d_vals(q), grad_grad);
          for (int i = 0; i < dof; i++)
          {
             const double w_shape_i = weight_m * shape(i);
@@ -1531,9 +1888,115 @@ void TMOP_Integrator::AssembleElementGradExact(const FiniteElement &el,
          }
       }
    }
+
+   if (zeta) { AssembleElemGradAdaptLim(el, weights, *Tpr, *ir, elmat); }
+
    delete Tpr;
 }
 
+void TMOP_Integrator::AssembleElemVecAdaptLim(const FiniteElement &el,
+                                              const Vector &weights,
+                                              IsoparametricTransformation &Tpr,
+                                              const IntegrationRule &ir,
+                                              DenseMatrix &mat)
+{
+   if (zeta == NULL) { return; }
+
+   const int dof = el.GetDof(), dim = el.GetDim();
+   Vector shape(dof), zeta_e, zeta_q, zeta0_q;
+
+   Array<int> dofs;
+   zeta->FESpace()->GetElementDofs(Tpr.ElementNo, dofs);
+   zeta->GetSubVector(dofs, zeta_e);
+   zeta->GetValues(Tpr.ElementNo, ir, zeta_q);
+   zeta_0->GetValues(Tpr.ElementNo, ir, zeta0_q);
+
+   // Project the gradient of zeta in the same space.
+   // The FE coefficients of the gradient go in zeta_grad_e.
+   DenseMatrix zeta_grad_e(dof, dim);
+   DenseMatrix grad_phys; // This will be (dof x dim, dof).
+   el.ProjectGrad(el, Tpr, grad_phys);
+   Vector grad_ptr(zeta_grad_e.GetData(), dof*dim);
+   grad_phys.Mult(zeta_e, grad_ptr);
+
+   Vector zeta_grad_q(dim);
+
+   const int nqp = weights.Size();
+   for (int q = 0; q < nqp; q++)
+   {
+      const IntegrationPoint &ip = ir.IntPoint(q);
+      el.CalcShape(ip, shape);
+      zeta_grad_e.MultTranspose(shape, zeta_grad_q);
+      zeta_grad_q *= 2.0 * (zeta_q(q) - zeta0_q(q));
+      zeta_grad_q *= weights(q) * lim_normal * coeff_zeta->Eval(Tpr, ip);
+      AddMultVWt(shape, zeta_grad_q, mat);
+   }
+}
+
+void TMOP_Integrator::AssembleElemGradAdaptLim(const FiniteElement &el,
+                                               const Vector &weights,
+                                               IsoparametricTransformation &Tpr,
+                                               const IntegrationRule &ir,
+                                               DenseMatrix &mat)
+{
+   if (zeta == NULL) { return; }
+
+   const int dof = el.GetDof(), dim = el.GetDim();
+   Vector shape(dof), zeta_e, zeta_q, zeta0_q;
+
+   Array<int> dofs;
+   zeta->FESpace()->GetElementDofs(Tpr.ElementNo, dofs);
+   zeta->GetSubVector(dofs, zeta_e);
+   zeta->GetValues(Tpr.ElementNo, ir, zeta_q);
+   zeta_0->GetValues(Tpr.ElementNo, ir, zeta0_q);
+
+   // Project the gradient of zeta in the same space.
+   // The FE coefficients of the gradient go in zeta_grad_e.
+   DenseMatrix zeta_grad_e(dof, dim);
+   DenseMatrix grad_phys; // This will be (dof x dim, dof).
+   el.ProjectGrad(el, Tpr, grad_phys);
+   Vector grad_ptr(zeta_grad_e.GetData(), dof*dim);
+   grad_phys.Mult(zeta_e, grad_ptr);
+
+   // Project the gradient of each gradient of zeta in the same space.
+   // The FE coefficients of the second derivatives go in zeta_grad_grad_e.
+   DenseMatrix zeta_grad_grad_e(dof*dim, dim);
+   Mult(grad_phys, zeta_grad_e, zeta_grad_grad_e);
+   // Reshape to be more convenient later (no change in the data).
+   zeta_grad_grad_e.SetSize(dof, dim*dim);
+
+   Vector zeta_grad_q(dim);
+   DenseMatrix zeta_grad_grad_q(dim, dim);
+
+   const int nqp = weights.Size();
+   for (int q = 0; q < nqp; q++)
+   {
+      const IntegrationPoint &ip = ir.IntPoint(q);
+      el.CalcShape(ip, shape);
+
+      zeta_grad_e.MultTranspose(shape, zeta_grad_q);
+      Vector gg_ptr(zeta_grad_grad_q.GetData(), dim*dim);
+      zeta_grad_grad_e.MultTranspose(shape, gg_ptr);
+
+      const double w = weights(q) * lim_normal * coeff_zeta->Eval(Tpr, ip);
+      for (int i = 0; i < dof * dim; i++)
+      {
+         const int idof = i % dof, idim = i / dof;
+         for (int j = 0; j <= i; j++)
+         {
+            const int jdof = j % dof, jdim = j / dof;
+            const double entry =
+               w * ( 2.0 * zeta_grad_q(idim) * shape(idof) *
+                     /* */ zeta_grad_q(jdim) * shape(jdof) +
+                     2.0 * (zeta_q(q) - zeta0_q(q)) *
+                     zeta_grad_grad_q(idim, jdim) * shape(idof) * shape(jdof));
+            mat(i, j) += entry;
+            if (i != j) { mat(j, i) += entry; }
+         }
+      }
+   }
+}
+
 double TMOP_Integrator::GetFDDerivative(const FiniteElement &el,
                                         ElementTransformation &T,
                                         Vector &elfun, const int dofidx,
@@ -1573,8 +2036,11 @@ void TMOP_Integrator::AssembleElementVectorFD(const FiniteElement &el,
    elvect.SetSize(dof*dim);
    Vector elfunmod(elfun);
 
-   // Energy for unperturbed configuration
-   double e_fx = GetElementEnergy(el, T, elfun);
+   // In GetElementEnergy(), skip terms that have exact derivative calculations.
+   fd_call_flag = true;
+
+   // Energy for unperturbed configuration.
+   const double e_fx = GetElementEnergy(el, T, elfun);
 
    for (int j = 0; j < dim; j++)
    {
@@ -1589,6 +2055,32 @@ void TMOP_Integrator::AssembleElementVectorFD(const FiniteElement &el,
          if (discr_tc) { discr_tc->RestoreTargetSpecificationAtNode(T, i); }
       }
    }
+   fd_call_flag = false;
+
+   // Contributions from adaptive limiting (exact derivatives).
+   if (zeta)
+   {
+      const IntegrationRule *ir = ActionIntegrationRule(el);
+      const int nqp = ir->GetNPoints();
+      DenseTensor Jtr(dim, dim, nqp);
+      targetC->ComputeElementTargets(T.ElementNo, el, *ir, elfun, Jtr);
+
+      IsoparametricTransformation Tpr;
+      Tpr.SetFE(&el);
+      Tpr.ElementNo = T.ElementNo;
+      Tpr.Attribute = T.Attribute;
+      PMatI.UseExternalData(elfun.GetData(), dof, dim);
+      Tpr.GetPointMat().Transpose(PMatI); // PointMat = PMatI^T
+
+      Vector weights(nqp);
+      for (int q = 0; q < nqp; q++)
+      {
+         weights(q) = ir->IntPoint(q).weight * Jtr(q).Det();
+      }
+
+      PMatO.UseExternalData(elvect.GetData(), dof, dim);
+      AssembleElemVecAdaptLim(el, weights, Tpr, *ir, PMatO);
+   }
 }
 
 void TMOP_Integrator::AssembleElementGradFD(const FiniteElement &el,
@@ -1604,6 +2096,8 @@ void TMOP_Integrator::AssembleElementGradFD(const FiniteElement &el,
    const Vector &ElemDerLoc = *(ElemDer[T.ElementNo]);
    const Vector &ElemPertLoc = *(ElemPertEnergy[T.ElementNo]);
 
+   // In GetElementEnergy(), skip terms that have exact derivative calculations.
+   fd_call_flag = true;
    for (int i = 0; i < dof; i++)
    {
       for (int j = 0; j < i+1; j++)
@@ -1657,6 +2151,31 @@ void TMOP_Integrator::AssembleElementGradFD(const FiniteElement &el,
          }
       }
    }
+   fd_call_flag = false;
+
+   // Contributions from adaptive limiting.
+   if (zeta)
+   {
+      const IntegrationRule *ir = GradientIntegrationRule(el);
+      const int nqp = ir->GetNPoints();
+      DenseTensor Jtr(dim, dim, nqp);
+      targetC->ComputeElementTargets(T.ElementNo, el, *ir, elfun, Jtr);
+
+      IsoparametricTransformation Tpr;
+      Tpr.SetFE(&el);
+      Tpr.ElementNo = T.ElementNo;
+      Tpr.Attribute = T.Attribute;
+      PMatI.UseExternalData(elfun.GetData(), dof, dim);
+      Tpr.GetPointMat().Transpose(PMatI); // PointMat = PMatI^T
+
+      Vector weights(nqp);
+      for (int q = 0; q < nqp; q++)
+      {
+         weights(q) = ir->IntPoint(q).weight * Jtr(q).Det();
+      }
+
+      AssembleElemGradAdaptLim(el, weights, Tpr, *ir, elmat);
+   }
 }
 
 void TMOP_Integrator::EnableNormalization(const GridFunction &x)
@@ -1673,7 +2192,8 @@ void TMOP_Integrator::ParEnableNormalization(const ParGridFunction &x)
    ComputeNormalizationEnergies(x, loc[0], loc[1]);
    double rdc[2];
    MPI_Allreduce(loc, rdc, 2, MPI_DOUBLE, MPI_SUM, x.ParFESpace()->GetComm());
-   metric_normal = 1.0 / rdc[0]; lim_normal = 1.0 / rdc[1];
+   metric_normal = 1.0 / rdc[0];
+   lim_normal    = 1.0 / rdc[1];
 }
 #endif
 
@@ -1693,12 +2213,7 @@ void TMOP_Integrator::ComputeNormalizationEnergies(const GridFunction &x,
    Jpr.SetSize(dim);
    Jpt.SetSize(dim);
 
-   const IntegrationRule *ir = IntRule;
-   if (!ir)
-   {
-      ir = &(IntRules.Get(fe->GetGeomType(), 2*fe->GetOrder() + 3)); // <---
-   }
-
+   const IntegrationRule *ir = EnergyIntegrationRule(*fe);
    DenseTensor Jtr(dim, dim, ir->GetNPoints());
 
    metric_energy = 0.0;
@@ -1727,19 +2242,20 @@ void TMOP_Integrator::ComputeNormalizationEnergies(const GridFunction &x,
          lim_energy += weight;
       }
    }
+   if (targetC->ContainsVolumeInfo() == false)
+   {
+      // Special case when the targets don't contain volumetric information.
+      lim_energy = fes->GetNE();
+   }
 }
 
 void TMOP_Integrator::ComputeMinJac(const Vector &x,
                                     const FiniteElementSpace &fes)
 {
-   const IntegrationRule *ir = IntRule;
-   if (!ir)
-   {
-      ir = &(IntRules.Get(fes.GetFE(0)->GetGeomType(),
-                          2*fes.GetFE(0)->GetOrder() + 3)); // <---
-   }
-   const int NE = fes.GetMesh()->GetNE(), dim = fes.GetFE(0)->GetDim(),
-             dof = fes.GetFE(0)->GetDof(), nsp = ir->GetNPoints();
+   const FiniteElement *fe = fes.GetFE(0);
+   const IntegrationRule *ir = EnergyIntegrationRule(*fe);
+   const int NE = fes.GetMesh()->GetNE(), dim = fe->GetDim(),
+             dof = fe->GetDof(), nsp = ir->GetNPoints();
 
    Array<int> xdofs(dof * dim);
    DenseMatrix Jpr(dim), dshape(dof, dim), pos(dof, dim);
@@ -1766,6 +2282,12 @@ void TMOP_Integrator::ComputeMinJac(const Vector &x,
    dx = detv_avg_min / dxscale;
 }
 
+void TMOP_Integrator::UpdateAfterMeshChange(const Vector &new_x)
+{
+   // If adaptive limiting is enabled, re-evaluate zeta at positions new_x.
+   if (zeta) { adapt_eval->ComputeAtNewPosition(new_x, *zeta); }
+}
+
 void TMOP_Integrator::ComputeFDh(const Vector &x, const FiniteElementSpace &fes)
 {
    if (!fdflag) { return; }
diff --git a/fem/tmop.hpp b/fem/tmop.hpp
index 05c0aaab4cf..e5d0e58d001 100644
--- a/fem/tmop.hpp
+++ b/fem/tmop.hpp
@@ -560,6 +560,8 @@ class AdaptivityEvaluator
    ParFiniteElementSpace *pfes;
 #endif
 
+   int dim, ncomp;
+
 public:
    AdaptivityEvaluator() : mesh(NULL), fes(NULL)
    {
@@ -596,7 +598,8 @@ class AdaptivityEvaluator
     supports a set of algorithms chosen by the #TargetType enumeration.
 
     New target-matrix construction algorithms can be defined by deriving new
-    classes and overriding the method ComputeElementTargets(). */
+    classes and overriding the methods ComputeElementTargets() and
+    ContainsVolumeInfo(). */
 class TargetConstructor
 {
 public:
@@ -664,6 +667,9 @@ class TargetConstructor
    /// Used by target type IDEAL_SHAPE_EQUAL_SIZE. The default volume scale is 1.
    void SetVolumeScale(double vol_scale) { volume_scale = vol_scale; }
 
+   /// Checks if the target matrices contain non-trivial size specification.
+   virtual bool ContainsVolumeInfo() const;
+
    /** @brief Given an element and quadrature rule, computes ref->target
        transformation Jacobians for each quadrature point in the element.
        The physical positions of the element's nodes are given by @a elfun. */
@@ -708,15 +714,20 @@ class DiscreteAdaptTC : public TargetConstructor
 protected:
    // Discrete target specification.
    // Data is owned, updated by UpdateTargetSpecification.
+   int ncomp, sizeidx, skewidx, aspectratioidx, orientationidx;
    Vector tspec;             //eta(x)
    Vector tspec_sav;
    Vector tspec_pert1h;      //eta(x+h)
    Vector tspec_pert2h;      //eta(x+2*h)
    Vector tspec_pertmix;     //eta(x+h,y+h)
+   // The order inside these perturbation vectors (e.g. in 2D) is
+   // eta1(x+h,y), eta2(x+h,y) ... etan(x+h,y), eta1(x,y+h), eta2(x,y+h) ...
+   // same for tspec_pert2h and tspec_pertmix.
 
    // Note: do not use the Nodes of this space as they may not be on the
    // positions corresponding to the values of tspec.
    const FiniteElementSpace *tspec_fes;
+   const FiniteElementSpace *tspec_fesv;
 
    // These flags can be used by outside functions to avoid recomputing
    // the tspec and tspec_perth fields again on the same mesh.
@@ -726,20 +737,55 @@ class DiscreteAdaptTC : public TargetConstructor
    // Owned.
    AdaptivityEvaluator *adapt_eval;
 
+   void SetDiscreteTargetBase(const GridFunction &tspec_);
+   void SetTspecAtIndex(int idx, const GridFunction &tspec_);
+   void FinalizeSerialDiscreteTargetSpec();
+#ifdef MFEM_USE_MPI
+   void SetTspecAtIndex(int idx, const ParGridFunction &tspec_);
+   void FinalizeParDiscreteTargetSpec(const ParGridFunction &tspec_);
+#endif
+
 public:
    DiscreteAdaptTC(TargetType ttype)
       : TargetConstructor(ttype),
+        ncomp(0),
+        sizeidx(-1), skewidx(-1), aspectratioidx(-1), orientationidx(-1),
         tspec(), tspec_sav(), tspec_pert1h(), tspec_pert2h(), tspec_pertmix(),
-        tspec_fes(NULL),
+        tspec_fes(NULL), tspec_fesv(NULL),
         good_tspec(false), good_tspec_grad(false), good_tspec_hess(false),
         adapt_eval(NULL) { }
 
-   virtual ~DiscreteAdaptTC() { delete adapt_eval; }
+   virtual ~DiscreteAdaptTC()
+   {
+      delete adapt_eval;
+      delete tspec_fes;
+      delete tspec_fesv;
+   }
 
-   virtual void SetSerialDiscreteTargetSpec(GridFunction &tspec_);
+   /** @name Target specification methods.
+       The following methods are used to specify geometric parameters of the
+       targets when these parameters are given by discrete FE functions.
+       Note that every GridFunction given to the Set methods must use an
+       H1_FECollection of the same order. The number of components must
+       correspond to the type of geometric parameter and dimension.
+
+       @param[in] tspec_  Input values of a geometric parameter. Note that
+                          the methods in this class support only functions that
+                          use an H1_FECollection of the same order. */
+   ///@{
+   virtual void SetSerialDiscreteTargetSpec(const GridFunction &tspec_);
+   virtual void SetSerialDiscreteTargetSize(const GridFunction &tspec_);
+   virtual void SetSerialDiscreteTargetSkew(const GridFunction &tspec_);
+   virtual void SetSerialDiscreteTargetAspectRatio(const GridFunction &tspec_);
+   virtual void SetSerialDiscreteTargetOrientation(const GridFunction &tspec_);
 #ifdef MFEM_USE_MPI
-   virtual void SetParDiscreteTargetSpec(ParGridFunction &tspec_);
+   virtual void SetParDiscreteTargetSpec(const ParGridFunction &tspec_);
+   virtual void SetParDiscreteTargetSize(const ParGridFunction &tspec_);
+   virtual void SetParDiscreteTargetSkew(const ParGridFunction &tspec_);
+   virtual void SetParDiscreteTargetAspectRatio(const ParGridFunction &tspec_);
+   virtual void SetParDiscreteTargetOrientation(const ParGridFunction &tspec_);
 #endif
+   ///@}
 
    /// Used in combination with the Update methods to avoid extra computations.
    void ResetUpdateFlags()
@@ -756,6 +802,7 @@ class DiscreteAdaptTC : public TargetConstructor
                                         ElementTransformation &T,
                                         int nodenum, int idir,
                                         const Vector &IntData);
+
    void RestoreTargetSpecificationAtNode(ElementTransformation &T, int nodenum);
 
    /** Used for finite-difference based computations. Computes the target
@@ -827,12 +874,21 @@ class TMOP_Integrator : public NonlinearFormIntegrator
    // Normalization factor for the limiting term.
    double lim_normal;
 
+   // Adaptive limiting.
+   const GridFunction *zeta_0;       // Not owned.
+   GridFunction *zeta;               // Owned. Updated by adapt_eval.
+   Coefficient *coeff_zeta;          // Not owned.
+   AdaptivityEvaluator *adapt_eval;  // Not owned.
+
    DiscreteAdaptTC *discr_tc;
 
    // Parameters for FD-based Gradient & Hessian calculation.
-   bool   fdflag;
+   bool fdflag;
    double dx;
    double dxscale;
+   // Set to true while the FD functions call GetElementEnergy(), which then
+   // skips the terms that have exact derivative calculations.
+   bool fd_call_flag;
 
    Array <Vector *> ElemDer;        //f'(x)
    Array <Vector *> ElemPertEnergy; //f(x+h)
@@ -865,11 +921,18 @@ class TMOP_Integrator : public NonlinearFormIntegrator
                                 ElementTransformation &T,
                                 const Vector &elfun, Vector &elvect);
 
-   /** Assumes that AssembleElementVectorFD has been called. */
+   // Assumes that AssembleElementVectorFD has been called.
    void AssembleElementGradFD(const FiniteElement &el,
                               ElementTransformation &T,
                               const Vector &elfun, DenseMatrix &elmat);
 
+   void AssembleElemVecAdaptLim(const FiniteElement &el, const Vector &weights,
+                                IsoparametricTransformation &Tpr,
+                                const IntegrationRule &ir, DenseMatrix &m);
+   void AssembleElemGradAdaptLim(const FiniteElement &el, const Vector &weights,
+                                 IsoparametricTransformation &Tpr,
+                                 const IntegrationRule &ir, DenseMatrix &m);
+
    double GetFDDerivative(const FiniteElement &el,
                           ElementTransformation &T,
                           Vector &elfun, const int nodenum,const int idir,
@@ -882,11 +945,29 @@ class TMOP_Integrator : public NonlinearFormIntegrator
 #endif
    void ComputeMinJac(const Vector &x, const FiniteElementSpace &fes);
 
+   void UpdateAfterMeshChange(const Vector &new_x);
+
    void DisableLimiting()
    {
       nodes0 = NULL; coeff0 = NULL; lim_dist = NULL; lim_func = NULL;
    }
 
+   const IntegrationRule *EnergyIntegrationRule(const FiniteElement &el) const
+   {
+      return (IntRule) ? IntRule
+             /*     */ : &(IntRules.Get(el.GetGeomType(), 2*el.GetOrder() + 3));
+   }
+   const IntegrationRule *ActionIntegrationRule(const FiniteElement &el) const
+   {
+      // TODO the energy most likely needs fewer integration points.
+      return EnergyIntegrationRule(el);
+   }
+   const IntegrationRule *GradientIntegrationRule(const FiniteElement &el) const
+   {
+      // TODO the action and energy most likely need fewer integration points.
+      return EnergyIntegrationRule(el);
+   }
+
 public:
    /** @param[in] m  TMOP_QualityMetric that will be integrated (not owned).
        @param[in] tc Target-matrix construction algorithm to use (not owned). */
@@ -895,19 +976,12 @@ class TMOP_Integrator : public NonlinearFormIntegrator
         coeff1(NULL), metric_normal(1.0),
         nodes0(NULL), coeff0(NULL),
         lim_dist(NULL), lim_func(NULL), lim_normal(1.0),
+        zeta_0(NULL), zeta(NULL), coeff_zeta(NULL), adapt_eval(NULL),
         discr_tc(dynamic_cast<DiscreteAdaptTC *>(tc)),
-        fdflag(false), dxscale(1.0e3)
+        fdflag(false), dxscale(1.0e3), fd_call_flag(false)
    { }
 
-   ~TMOP_Integrator()
-   {
-      delete lim_func;
-      for (int i = 0; i < ElemDer.Size(); i++)
-      {
-         delete ElemDer[i];
-         delete ElemPertEnergy[i];
-      }
-   }
+   ~TMOP_Integrator();
 
    /// Sets a scaling Coefficient for the quality metric term of the integrator.
    /** With this addition, the integrator becomes
@@ -917,15 +991,15 @@ class TMOP_Integrator : public NonlinearFormIntegrator
        not in the target configuration which may be undefined. */
    void SetCoefficient(Coefficient &w1) { coeff1 = &w1; }
 
-   /// Adds a limiting term to the integrator (general version).
-   /** With this addition, the integrator becomes
-          @f$ \int w1 W(Jpt) + w0 f(x, x_0, d) dx @f$,
-       where the second term measures the change with respect to the original
-       physical positions, @a n0.
-       @param[in] n0     Original mesh node coordinates.
-       @param[in] dist   Limiting physical distances.
-       @param[in] w0     Coefficient scaling the limiting term.
-       @param[in] lfunc  TMOP_LimiterFunction defining the limiting term f. If
+   /** @brief Limiting of the mesh displacements (general version).
+
+       Adds the term @f$ \int w_0 f(x, x_0, d) dx @f$, where f is a measure of
+       the displacement between x and x_0, given the max allowed displacement d.
+
+       @param[in] n0     Original mesh node coordinates (x0 above).
+       @param[in] dist   Allowed displacement in physical space (d above).
+       @param[in] w0     Coefficient scaling the limiting integral.
+       @param[in] lfunc  TMOP_LimiterFunction defining the function f. If
                          NULL, a TMOP_QuadraticLimiter will be used. The
                          TMOP_Integrator assumes ownership of this pointer. */
    void EnableLimiting(const GridFunction &n0, const GridFunction &dist,
@@ -936,6 +1010,25 @@ class TMOP_Integrator : public NonlinearFormIntegrator
    void EnableLimiting(const GridFunction &n0, Coefficient &w0,
                        TMOP_LimiterFunction *lfunc = NULL);
 
+   /** @brief Restriction of the node positions to certain regions.
+
+       Adds the term @f$ \int c (z(x) - z_0(x_0))^2 @f$, where z0(x0) is a given
+       function on the starting mesh, and z(x) is its image on the new mesh.
+       Minimizing this means that a node at x0 is allowed to move to a
+       position x(x0) only if z(x) ~ z0(x0).
+       Such a term can be used for tangential mesh relaxation.
+
+       @param[in] z0     Function z0 that controls the adaptive limiting.
+       @param[in] coeff  Coefficient c for the above integral.
+       @param[in] ae     AdaptivityEvaluator to compute z(x) from z0(x0). */
+   void EnableAdaptiveLimiting(const GridFunction &z0, Coefficient &coeff,
+                               AdaptivityEvaluator &ae);
+#ifdef MFEM_USE_MPI
+   /// Parallel support for adaptive limiting.
+   void EnableAdaptiveLimiting(const ParGridFunction &z0, Coefficient &coeff,
+                               AdaptivityEvaluator &ae);
+#endif
+
    /// Update the original/reference nodes used for limiting.
    void SetLimitingNodes(const GridFunction &n0) { nodes0 = &n0; }
 
@@ -992,7 +1085,7 @@ class TMOPComboIntegrator : public NonlinearFormIntegrator
    /// Adds a new TMOP_Integrator to the combination.
    void AddTMOPIntegrator(TMOP_Integrator *ti) { tmopi.Append(ti); }
 
-   Array<TMOP_Integrator *> GetTMOPIntegrators() const { return tmopi; }
+   const Array<TMOP_Integrator *> &GetTMOPIntegrators() const { return tmopi; }
 
    /// Adds the limiting term to the first integrator. Disables it for the rest.
    void EnableLimiting(const GridFunction &n0, const GridFunction &dist,
diff --git a/fem/tmop_tools.cpp b/fem/tmop_tools.cpp
index d41eb0c638e..993cadc13ec 100644
--- a/fem/tmop_tools.cpp
+++ b/fem/tmop_tools.cpp
@@ -29,13 +29,26 @@ void AdvectorCG::SetInitialField(const Vector &init_nodes,
 void AdvectorCG::ComputeAtNewPosition(const Vector &new_nodes,
                                       Vector &new_field)
 {
-#if defined(MFEM_DEBUG) || defined(MFEM_USE_MPI)
-   int myid = 0;
-#endif
-   Mesh *m = mesh;
+   // TODO: Implement for AMR meshes.
+   const int pnt_cnt = new_field.Size()/ncomp;
+
+   new_field = field0;
+
+   for (int i = 0; i < ncomp; i++)
+   {
+      Vector new_field_temp(new_field.GetData()+i*pnt_cnt, pnt_cnt);
+      ComputeAtNewPositionScalar(new_nodes, new_field_temp);
+   }
+
+   field0 = new_field;
+   nodes0 = new_nodes;
+}
 
+void AdvectorCG::ComputeAtNewPositionScalar(const Vector &new_nodes,
+                                            Vector &new_field)
+{
+   Mesh *m = mesh;
 #ifdef MFEM_USE_MPI
-   if (pfes) { MPI_Comm_rank(pfes->GetComm(), &myid); }
    if (pmesh) { m = pmesh; }
 #endif
 
@@ -44,17 +57,29 @@ void AdvectorCG::ComputeAtNewPosition(const Vector &new_nodes,
    // This will be used to move the positions.
    GridFunction *mesh_nodes = m->GetNodes();
    *mesh_nodes = nodes0;
-   new_field = field0;
+   double minv = new_field.Min(), maxv = new_field.Max();
 
    // Velocity of the positions.
    GridFunction u(mesh_nodes->FESpace());
    subtract(new_nodes, nodes0, u);
 
+   // Define a scalar FE space for the solution, and the advection operator.
    TimeDependentOperator *oper = NULL;
-   // This must be the fes of the ind, associated with the object's mesh.
-   if (fes)  { oper = new SerialAdvectorCGOper(nodes0, u, *fes); }
+   FiniteElementSpace *fess = NULL;
 #ifdef MFEM_USE_MPI
-   else if (pfes) { oper = new ParAdvectorCGOper(nodes0, u, *pfes); }
+   ParFiniteElementSpace *pfess = NULL;
+#endif
+   if (fes)
+   {
+      fess = new FiniteElementSpace(fes->GetMesh(), fes->FEColl(), 1);
+      oper = new SerialAdvectorCGOper(nodes0, u, *fess);
+   }
+#ifdef MFEM_USE_MPI
+   else if (pfes)
+   {
+      pfess = new ParFiniteElementSpace(pfes->GetParMesh(), pfes->FEColl(), 1);
+      oper  = new ParAdvectorCGOper(nodes0, u, *pfess);
+   }
 #endif
    MFEM_VERIFY(oper != NULL,
                "No FE space has been given to the AdaptivityEvaluator.");
@@ -67,12 +92,18 @@ void AdvectorCG::ComputeAtNewPosition(const Vector &new_nodes,
       h_min = std::min(h_min, m->GetElementSize(i));
    }
    double v_max = 0.0;
-   const int s = u.FESpace()->GetVSize() / 2;
+   const int s = new_field.Size();
+
    for (int i = 0; i < s; i++)
    {
-      const double vel = u(i) * u(i) + u(i+s) * u(i+s);
+      double vel = 0.;
+      for (int j = 0; j < dim; j++)
+      {
+         vel += u(i+j*s)*u(i+j*s);
+      }
       v_max = std::max(v_max, vel);
    }
+
 #ifdef MFEM_USE_MPI
    if (pfes)
    {
@@ -81,12 +112,17 @@ void AdvectorCG::ComputeAtNewPosition(const Vector &new_nodes,
       MPI_Allreduce(&h_loc, &h_min, 1, MPI_DOUBLE, MPI_MIN, pfes->GetComm());
    }
 #endif
-   if (v_max == 0.0)
+
+   if (v_max == 0.0) // No need to change the field.
    {
-      // No mesh motion --> no need to change the field.
       delete oper;
+      delete fess;
+#ifdef MFEM_USE_MPI
+      delete pfess;
+#endif
       return;
    }
+
    v_max = std::sqrt(v_max);
    double dt = dt_scale * h_min / v_max;
 
@@ -96,30 +132,34 @@ void AdvectorCG::ComputeAtNewPosition(const Vector &new_nodes,
    {
       if (t + dt >= 1.0)
       {
-#ifdef MFEM_DEBUG
-         if (myid == 0)
-         {
-            mfem::out << "Remap took " << ti << " steps." << std::endl;
-         }
-#endif
          dt = 1.0 - t;
          last_step = true;
       }
       ode_solver.Step(new_field, t, dt);
    }
 
-   // Trim the overshoots and undershoots.
-   const double minv = field0.Min(), maxv = field0.Max();
-   for (int i = 0; i < new_field.Size(); i++)
+   double glob_minv = minv,
+          glob_maxv = maxv;
+#ifdef MFEM_USE_MPI
+   if (pfes)
    {
-      if (new_field(i) < minv) { new_field(i) = minv; }
-      if (new_field(i) > maxv) { new_field(i) = maxv; }
+      MPI_Allreduce(&minv, &glob_minv, 1, MPI_DOUBLE, MPI_MIN, pfes->GetComm());
+      MPI_Allreduce(&maxv, &glob_maxv, 1, MPI_DOUBLE, MPI_MAX, pfes->GetComm());
    }
+#endif
 
-   nodes0 = new_nodes;
-   field0 = new_field;
+   // Trim the overshoots and undershoots.
+   for (int i = 0; i < s; i++)
+   {
+      if (new_field(i) < glob_minv) { new_field(i) = glob_minv; }
+      if (new_field(i) > glob_maxv) { new_field(i) = glob_maxv; }
+   }
 
    delete oper;
+   delete fess;
+#ifdef MFEM_USE_MPI
+   delete pfess;
+#endif
 }
 
 SerialAdvectorCGOper::SerialAdvectorCGOper(const Vector &x_start,
@@ -235,6 +275,12 @@ void InterpolatorFP::SetInitialField(const Vector &init_nodes,
    const double newton_tol  = 1.0e-12;
    const int npts_at_once   = 256;
 
+   if (finder)
+   {
+      finder->FreeData();
+      delete finder;
+   }
+
    FiniteElementSpace *f = fes;
 #ifdef MFEM_USE_MPI
    if (pfes)
@@ -453,6 +499,7 @@ void TMOPNewtonSolver::ProcessNewState(const Vector &x) const
          ti = dynamic_cast<TMOP_Integrator *>(integs[i]);
          if (ti)
          {
+            ti->UpdateAfterMeshChange(x_loc);
             ti->ComputeFDh(x_loc, *pfesc);
             UpdateDiscreteTC(*ti, x_loc);
          }
@@ -488,6 +535,7 @@ void TMOPNewtonSolver::ProcessNewState(const Vector &x) const
          ti = dynamic_cast<TMOP_Integrator *>(integs[i]);
          if (ti)
          {
+            ti->UpdateAfterMeshChange(x_loc);
             ti->ComputeFDh(x_loc, *fesc);
             UpdateDiscreteTC(*ti, x_loc);
          }
diff --git a/fem/tmop_tools.hpp b/fem/tmop_tools.hpp
index 8469b0064d0..e77e23c7e9d 100644
--- a/fem/tmop_tools.hpp
+++ b/fem/tmop_tools.hpp
@@ -27,9 +27,9 @@ class AdvectorCG : public AdaptivityEvaluator
    RK4Solver ode_solver;
    Vector nodes0;
    Vector field0;
-
    const double dt_scale;
 
+   void ComputeAtNewPositionScalar(const Vector &new_nodes, Vector &new_field);
 public:
    AdvectorCG(double timestep_scale = 0.5)
       : AdaptivityEvaluator(),
@@ -53,6 +53,8 @@ class InterpolatorFP : public AdaptivityEvaluator
    Vector pos_r_out, dist_p_out;
    int dim;
 public:
+   InterpolatorFP() : finder(NULL) { }
+
    virtual void SetInitialField(const Vector &init_nodes,
                                 const Vector &init_field);
 
diff --git a/fem/transfer.cpp b/fem/transfer.cpp
index 868760b6f25..edbd9866dbe 100644
--- a/fem/transfer.cpp
+++ b/fem/transfer.cpp
@@ -520,7 +520,9 @@ void TensorProductPRefinementTransferOperator::MultTranspose(const Vector& x,
 TrueTransferOperator::TrueTransferOperator(const
                                            ParFiniteElementSpace& lFESpace_,
                                            const ParFiniteElementSpace& hFESpace_)
-   : lFESpace(lFESpace_), hFESpace(hFESpace_)
+   : Operator(hFESpace_.GetTrueVSize(), lFESpace_.GetTrueVSize()),
+     lFESpace(lFESpace_),
+     hFESpace(hFESpace_)
 {
    localTransferOperator = new TransferOperator(lFESpace_, hFESpace_);
 
diff --git a/general/adios2stream.cpp b/general/adios2stream.cpp
index 0fa79a78b22..59d88cb06b1 100644
--- a/general/adios2stream.cpp
+++ b/general/adios2stream.cpp
@@ -15,6 +15,8 @@
 
 #include "adios2stream.hpp"
 
+#ifdef MFEM_USE_ADIOS2
+
 #include "../fem/geom.hpp"
 #include "../general/array.hpp"
 #include "../mesh/element.hpp"
@@ -750,3 +752,5 @@ noexcept
 }
 
 }  // end namespace mfem
+
+#endif // MFEM_USE_ADIOS2
diff --git a/general/adios2stream.hpp b/general/adios2stream.hpp
index 78ad7d793a9..2c483b0b7a6 100644
--- a/general/adios2stream.hpp
+++ b/general/adios2stream.hpp
@@ -18,6 +18,8 @@
 
 #include "../config/config.hpp"
 
+#ifdef MFEM_USE_ADIOS2
+
 #include <map>
 #include <memory>  // std::unique_ptr
 #include <string>
@@ -230,4 +232,6 @@ class adios2stream
 
 }  // end namespace mfem
 
+#endif // MFEM_USE_ADIOS2
+
 #endif /* MFEM_ADIOS2STREAM */
diff --git a/general/array.hpp b/general/array.hpp
index 6765b1c832d..c6dc6b0a28c 100644
--- a/general/array.hpp
+++ b/general/array.hpp
@@ -60,31 +60,31 @@ class Array
    /// Creates an empty array with a given MemoryType
    inline Array(MemoryType mt) : size(0) { data.Reset(mt); }
 
-   /// Creates array of asize elements
+   /// Creates array of @a asize elements
    explicit inline Array(int asize)
       : size(asize) { asize > 0 ? data.New(asize) : data.Reset(); }
 
-   /** Creates array using an existing c-array of asize elements;
+   /** @brief Creates array using an existing c-array of asize elements;
        allocsize is set to -asize to indicate that the data will not
        be deleted. */
    inline Array(T *_data, int asize)
    { data.Wrap(_data, asize, false); size = asize; }
 
-   /// Copy constructor: deep copy
+   /// Copy constructor: deep copy from @a src
    /** This method supports source arrays using any MemoryType. */
    inline Array(const Array &src);
 
-   /// Copy constructor (deep copy) from an Array of convertable type
+   /// Copy constructor (deep copy) from 'src', an Array of convertible type.
    template <typename CT>
    inline Array(const Array<CT> &src);
 
    /// Destructor
    inline ~Array() { data.Delete(); }
 
-   /// Assignment operator: deep copy
+   /// Assignment operator: deep copy from 'src'.
    Array<T> &operator=(const Array<T> &src) { src.Copy(*this); return *this; }
 
-   /// Assignment operator (deep copy) from an Array of convertable type
+   /// Assignment operator (deep copy) from @a src, an Array of convertible type.
    template <typename CT>
    inline Array &operator=(const Array<CT> &src);
 
@@ -120,13 +120,13 @@ class Array
    /// Make the Array own the data
    void MakeDataOwner() const { data.SetHostPtrOwner(true); }
 
-   /// Logical size of the array
+   /// Return the logical size of the array.
    inline int Size() const { return size; }
 
-   /// Change logical size of the array, keep existing entries
+   /// Change the logical size of the array, keep existing entries.
    inline void SetSize(int nsize);
 
-   /// Same as SetSize(int) plus initialize new entries with 'initval'
+   /// Same as SetSize(int) plus initialize new entries with 'initval'.
    inline void SetSize(int nsize, const T &initval);
 
    /** @brief Resize the array to size @a nsize using MemoryType @a mt. Note
@@ -142,58 +142,63 @@ class Array
    inline void Reserve(int capacity)
    { if (capacity > Capacity()) { GrowSize(capacity); } }
 
-   /// Access element
+   /// Reference access to the ith element.
    inline T & operator[](int i);
 
-   /// Access const element
+   /// Const reference access to the ith element.
    inline const T &operator[](int i) const;
 
-   /// Append element to array, resize if necessary
+   /// Append element 'el' to array, resize if necessary.
    inline int Append(const T & el);
 
-   /// Append another array to this array, resize if necessary
+   /// Append another array to this array, resize if necessary.
    inline int Append(const T *els, int nels);
 
-   /// Append another array to this array, resize if necessary
+   /// Append another array to this array, resize if necessary.
    inline int Append(const Array<T> &els) { return Append(els, els.Size()); }
 
-   /// Prepend an element to the array, resize if necessary
+   /// Prepend element 'el' to the array, resize if necessary.
    inline int Prepend(const T &el);
 
-   /// Return the last element in the array
+   /// Return the last element in the array.
    inline T &Last();
+
+   /// Return the last element in the array.
    inline const T &Last() const;
 
-   /// Append element when it is not yet in the array, return index
+   /// Append element when it is not yet in the array, return index.
    inline int Union(const T & el);
 
-   /// Return the first index where 'el' is found; return -1 if not found
+   /// Return the first index where 'el' is found; return -1 if not found.
    inline int Find(const T &el) const;
 
    /// Do bisection search for 'el' in a sorted array; return -1 if not found.
    inline int FindSorted(const T &el) const;
 
-   /// Delete the last entry
+   /// Delete the last entry of the array.
    inline void DeleteLast() { if (size > 0) { size--; } }
 
-   /// Delete the first 'el' entry
+   /// Delete the first entry with value == 'el'.
    inline void DeleteFirst(const T &el);
 
-   /// Delete whole array
+   /// Delete the whole array.
    inline void DeleteAll();
 
-   /// Create a copy of the current array
+
+   /// Create a copy of the internal array in the provided @a copy.
    inline void Copy(Array &copy) const;
 
-   /// Make this Array a reference to a pointer
+   /// Make this Array a reference to a pointer.
    inline void MakeRef(T *, int);
 
-   /// Make this Array a reference to 'master'
+   /// Make this Array a reference to 'master'.
    inline void MakeRef(const Array &master);
 
+
+   /// Copy sub array starting from @a offset out to the provided @a sa.
    inline void GetSubArray(int offset, int sa_size, Array<T> &sa) const;
 
-   /// Prints array to stream with width elements per row
+   /// Prints array to stream with width elements per row.
    void Print(std::ostream &out = mfem::out, int width = 4) const;
 
    /** @brief Save the Array to the stream @a out using the format @a fmt.
@@ -225,48 +230,60 @@ class Array
        operator `<` for class T. */
    T Min() const;
 
-   /// Sorts the array. This requires operator< to be defined for T.
+   /// Sorts the array in ascending order. This requires operator< to be defined for T.
    void Sort() { std::sort((T*)data, data + size); }
 
-   /// Sorts the array using the supplied comparison function object.
+   /// Sorts the array in ascending order using the supplied comparison function object.
    template<class Compare>
    void Sort(Compare cmp) { std::sort((T*)data, data + size, cmp); }
 
-   /** Removes duplicities from a sorted array. This requires operator== to be
-       defined for T. */
+   /** @brief Removes duplicates from a sorted array. This requires
+       operator== to be defined for T. */
    void Unique()
    {
       T* end = std::unique((T*)data, data + size);
       SetSize(end - data);
    }
 
-   /// return true if the array is sorted.
+   /// Return 1 if the array is sorted from lowest to highest.  Otherwise return 0.
    int IsSorted();
 
-   /// Partial Sum
+   /// Fill the entries of the array with the cumulative sum of the entries.
    void PartialSum();
 
-   /// Sum all entries
+   /// Return the sum of all the array entries using the '+' operator for class 'T'.
    T Sum();
 
+   /// Set all entries of the array to the provided constant.
    inline void operator=(const T &a);
 
-   /// Copy data from a pointer. Size() elements are copied.
+   /// Copy data from a pointer. 'Size()' elements are copied.
    inline void Assign(const T *);
 
+   /// STL-like copyTo @a dest from begin to end.
    template <typename U>
    inline void CopyTo(U *dest) { std::copy(begin(), end(), dest); }
 
+   /** @brief Copy from @a src into this array.  Copies enough entries to
+       fill the Capacity of this array.  Note that this does not update
+       the Size to match the Capacity afterwards. */
    template <typename U>
    inline void CopyFrom(const U *src)
    { std::memcpy(begin(), src, MemoryUsage()); }
 
-   // STL-like begin/end
+   /// STL-like begin.  Returns pointer to the first element of the array.
    inline T* begin() { return data; }
+
+   /// STL-like end.  Returns pointer after the last element of the array.
    inline T* end() { return data + size; }
+
+   /// STL-like begin.  Returns const pointer to the first element of the array.
    inline const T* begin() const { return data; }
+
+   /// STL-like end.  Returns const pointer after the last element of the array.
    inline const T* end() const { return data + size; }
 
+   /// Returns the number of bytes allocated for the array including any reserve.
    long MemoryUsage() const { return Capacity() * sizeof(T); }
 
    /// Shortcut for mfem::Read(a.GetMemory(), a.Size(), on_dev).
diff --git a/general/binaryio.hpp b/general/binaryio.hpp
index 06de1fc532a..75a23002024 100644
--- a/general/binaryio.hpp
+++ b/general/binaryio.hpp
@@ -25,12 +25,14 @@ namespace mfem
 namespace bin_io
 {
 
+/// Write 'value' to stream.
 template<typename T>
 inline void write(std::ostream& os, T value)
 {
    os.write((char*) &value, sizeof(T));
 }
 
+/// Read a value from the stream and return it.
 template<typename T>
 inline T read(std::istream& is)
 {
diff --git a/general/communication.hpp b/general/communication.hpp
index c92e26fad4c..8542420822f 100644
--- a/general/communication.hpp
+++ b/general/communication.hpp
@@ -47,69 +47,91 @@ class MPI_Session
    bool Root() const { return world_rank == 0; }
 };
 
+
+/** The shared entities (e.g. vertices, faces and edges) are split into groups,
+    each group determined by the set of participating processors. They are
+    numbered locally in lproc. Assumptions:
+    - group 0 is the 'local' group
+    - groupmaster_lproc[0] = 0
+    - lproc_proc[0] = MyRank */
 class GroupTopology
 {
 private:
    MPI_Comm   MyComm;
 
-   /* The shared entities (e.g. vertices, faces and edges) are split into
-      groups, each group determined by the set of participating processors.
-      They are numbered locally in lproc. Assumptions:
-      - group 0 is the 'local' group
-      - groupmaster_lproc[0] = 0
-      - lproc_proc[0] = MyRank */
-
-   // Neighbor ids (lproc) in each group.
+   /// Neighbor ids (lproc) in each group.
    Table      group_lproc;
-   // Master neighbor id for each group.
+   /// Master neighbor id for each group.
    Array<int> groupmaster_lproc;
-   // MPI rank of each neighbor.
+   /// MPI rank of each neighbor.
    Array<int> lproc_proc;
-   // Group --> Group number in the master.
+   /// Group --> Group number in the master.
    Array<int> group_mgroup;
 
    void ProcToLProc();
 
 public:
+   /// Constructor with the MPI communicator = 0.
    GroupTopology() : MyComm(0) {}
+
+   /// Constructor given the MPI communicator 'comm'.
    GroupTopology(MPI_Comm comm) { MyComm = comm; }
 
    /// Copy constructor
    GroupTopology(const GroupTopology &gt);
+
+   /// Set the MPI communicator to 'comm'.
    void SetComm(MPI_Comm comm) { MyComm = comm; }
 
+   /// Return the MPI communicator.
    MPI_Comm GetComm() const { return MyComm; }
+
+   /// Return the MPI rank within this object's communicator.
    int MyRank() const { int r; MPI_Comm_rank(MyComm, &r); return r; }
+
+   /// Return the number of MPI ranks within this object's communicator.
    int NRanks() const { int s; MPI_Comm_size(MyComm, &s); return s; }
 
+   /// Set up the group topology given the list of sets of shared entities.
    void Create(ListOfIntegerSets &groups, int mpitag);
 
+   /// Return the number of groups.
    int NGroups() const { return group_lproc.Size(); }
-   // return the number of neighbors including the local processor
+
+   /// Return the number of neighbors including the local processor.
    int GetNumNeighbors() const { return lproc_proc.Size(); }
+
+   /// Return the MPI rank of neighbor 'i'.
    int GetNeighborRank(int i) const { return lproc_proc[i]; }
-   // am I master for group 'g'?
+
+   /// Return true if I am master for group 'g'.
    bool IAmMaster(int g) const { return (groupmaster_lproc[g] == 0); }
-   // return the neighbor index of the group master for a given group.
-   // neighbor 0 is the local processor
+
+   /** @brief Return the neighbor index of the group master for a given group.
+       Neighbor 0 is the local processor. */
    int GetGroupMaster(int g) const { return groupmaster_lproc[g]; }
-   // return the rank of the group master for a given group
+
+   /// Return the rank of the group master for group 'g'.
    int GetGroupMasterRank(int g) const
    { return lproc_proc[groupmaster_lproc[g]]; }
-   // for a given group return the group number in the master
+
+   /// Return the group number in the master for group 'g'.
    int GetGroupMasterGroup(int g) const { return group_mgroup[g]; }
-   // get the number of processors in a group
+
+   /// Get the number of processors in a group.
    int GetGroupSize(int g) const { return group_lproc.RowSize(g); }
-   // return a pointer to a list of neighbors for a given group.
-   // neighbor 0 is the local processor
+
+   /** @brief Return a pointer to a list of neighbors for a given group.
+       Neighbor 0 is the local processor. */
    const int *GetGroup(int g) const { return group_lproc.GetRow(g); }
 
    /// Save the data in a stream.
    void Save(std::ostream &out) const;
+
    /// Load the data from a stream.
    void Load(std::istream &in);
 
-   /// Copy
+   /// Copy the internal data to the external 'copy'.
    void Copy(GroupTopology & copy) const;
 
    virtual ~GroupTopology() {}
@@ -186,9 +208,8 @@ class GroupCommunicator
    void GetNeighborLDofTable(Table &nbr_ldof) const;
 
    /** @brief Data structure on which we define reduce operations.
-
-     The data is associated with (and the operation is performed on) one group
-     at a time. */
+       The data is associated with (and the operation is performed on) one
+       group at a time. */
    template <class T> struct OpData
    {
       int nldofs, nb;
@@ -322,9 +343,10 @@ struct VarMessage
    std::string data;
    MPI_Request send_request;
 
-   /** Non-blocking send to processor 'rank'. Returns immediately. Completion
-       (as tested by MPI_Wait/Test) does not mean the message was received --
-       it may be on its way or just buffered locally. */
+   /** @brief Non-blocking send to processor 'rank'.
+       Returns immediately. Completion (as tested by MPI_Wait/Test) does not
+       mean the message was received -- it may be on its way or just buffered
+       locally. */
    void Isend(int rank, MPI_Comm comm)
    {
       Encode(rank);
@@ -332,8 +354,9 @@ struct VarMessage
                 &send_request);
    }
 
-   /** Non-blocking synchronous send to processor 'rank'. Returns immediately.
-       Completion (MPI_Wait/Test) means that the message was received. */
+   /** @brief Non-blocking synchronous send to processor 'rank'.
+       Returns immediately. Completion (MPI_Wait/Test) means that the message
+       was received. */
    void Issend(int rank, MPI_Comm comm)
    {
       Encode(rank);
@@ -362,8 +385,8 @@ struct VarMessage
       }
    }
 
-   /** Return true if all messages in the map container were sent, otherwise
-       return false, without waiting. */
+   /** @brief Return true if all messages in the map container were sent,
+       otherwise return false, without waiting. */
    template<typename MapT>
    static bool TestAllSent(MapT& rank_msg)
    {
@@ -381,7 +404,7 @@ struct VarMessage
       return true;
    }
 
-   /** Blocking probe for incoming message of this type from any rank.
+   /** @brief Blocking probe for incoming message of this type from any rank.
        Returns the rank and message size. */
    static void Probe(int &rank, int &size, MPI_Comm comm)
    {
@@ -391,9 +414,9 @@ struct VarMessage
       MPI_Get_count(&status, MPI_BYTE, &size);
    }
 
-   /** Non-blocking probe for incoming message of this type from any rank.
-       If there is an incoming message, returns true and sets 'rank' and 'size'.
-       Otherwise returns false. */
+   /** @brief Non-blocking probe for incoming message of this type from any
+       rank. If there is an incoming message, returns true and sets 'rank' and
+       'size'. Otherwise returns false. */
    static bool IProbe(int &rank, int &size, MPI_Comm comm)
    {
       int flag;
@@ -421,7 +444,7 @@ struct VarMessage
       Decode(rank);
    }
 
-   /// Like Recv(), but throw away the messsage.
+   /// Like Recv(), but throw away the message.
    void RecvDrop(int rank, int size, MPI_Comm comm)
    {
       data.resize(size);
@@ -448,6 +471,8 @@ struct VarMessage
    }
 
    VarMessage() : send_request(MPI_REQUEST_NULL) {}
+
+   /// Clear the message and associated request.
    void Clear() { data.clear(); send_request = MPI_REQUEST_NULL; }
 
    virtual ~VarMessage()
diff --git a/general/cuda.hpp b/general/cuda.hpp
index f749e69e28b..005e16c927c 100644
--- a/general/cuda.hpp
+++ b/general/cuda.hpp
@@ -77,19 +77,19 @@ void mfem_cuda_error(cudaError_t err, const char *expr, const char *func,
                      const char *file, int line);
 #endif
 
-/// Allocates device memory
+/// Allocates device memory and returns destination ptr.
 void* CuMemAlloc(void **d_ptr, size_t bytes);
 
 /// Allocates managed device memory
 void* CuMallocManaged(void **d_ptr, size_t bytes);
 
-/// Frees device memory
+/// Frees device memory and returns destination ptr.
 void* CuMemFree(void *d_ptr);
 
-/// Copies memory from Host to Device
+/// Copies memory from Host to Device and returns destination ptr.
 void* CuMemcpyHtoD(void *d_dst, const void *h_src, size_t bytes);
 
-/// Copies memory from Host to Device
+/// Copies memory from Host to Device and returns destination ptr.
 void* CuMemcpyHtoDAsync(void *d_dst, const void *h_src, size_t bytes);
 
 /// Copies memory from Device to Device
diff --git a/general/hash.hpp b/general/hash.hpp
index a8a43f4b925..3360cc26701 100644
--- a/general/hash.hpp
+++ b/general/hash.hpp
@@ -77,7 +77,7 @@ class HashTable : public BlockArray<T>
    HashTable(const HashTable& other); // deep copy
    ~HashTable();
 
-   /// Get item whose parents are p1, p2... Create it if it doesn't exist.
+   /// Get item whose parents are 'p1', 'p2'... Create it if it doesn't exist.
    T* Get(int p1, int p2);
    T* Get(int p1, int p2, int p3, int p4 = -1 /* p4 optional */);
 
@@ -123,6 +123,7 @@ class HashTable : public BlockArray<T>
    /// Return total size of allocated memory (tables plus items), in bytes.
    long MemoryUsage() const;
 
+   /// Write details of the memory usage to the mfem output stream.
    void PrintMemoryDetail() const;
 
    class iterator : public Base::iterator
diff --git a/general/mem_alloc.hpp b/general/mem_alloc.hpp
index e4fe7a6300b..19c66b509a5 100644
--- a/general/mem_alloc.hpp
+++ b/general/mem_alloc.hpp
@@ -33,12 +33,21 @@ class Stack
    StackPart <Elem, Num> *TopPart, *TopFreePart;
    int UsedInTop, SSize;
 public:
+   /// Construct an empty stack.
    Stack() { TopPart = TopFreePart = NULL; UsedInTop = Num; SSize = 0; }
+   /// Return the number of elements on the stack.
    int Size() const { return SSize; }
+   /// Push element 'E' on the stack.
    void Push (Elem E);
+   /// Pop an element off the stack and return it.
    Elem Pop();
+   /// Clear the elements off the stack.
    void Clear();
+
+   /// Swap the data in this stack with the data in @a other.
    void Swap(Stack<Elem, Num> &other);
+
+   /// Return the number of bytes used by the stack.
    size_t MemoryUsage() const;
    ~Stack() { Clear(); }
 };
diff --git a/general/mem_manager.cpp b/general/mem_manager.cpp
index 54f09418c87..ac0664db8cc 100644
--- a/general/mem_manager.cpp
+++ b/general/mem_manager.cpp
@@ -26,8 +26,10 @@
 #include <signal.h>
 #include <sys/mman.h>
 #define mfem_memalign(p,a,s) posix_memalign(p,a,s)
+#define mfem_aligned_free free
 #else
 #define mfem_memalign(p,a,s) (((*(p))=_aligned_malloc((s),(a))),*(p)?0:errno)
+#define mfem_aligned_free _aligned_free
 #endif
 
 #ifdef MFEM_USE_UMPIRE
@@ -212,7 +214,7 @@ class Aligned32HostMemorySpace : public HostMemorySpace
    Aligned32HostMemorySpace(): HostMemorySpace() { }
    void Alloc(void **ptr, size_t bytes)
    { if (mfem_memalign(ptr, 32, bytes) != 0) { throw ::std::bad_alloc(); } }
-   void Dealloc(void *ptr) { std::free(ptr); }
+   void Dealloc(void *ptr) { mfem_aligned_free(ptr); }
 };
 
 /// The aligned 64 host memory space
@@ -222,6 +224,7 @@ class Aligned64HostMemorySpace : public HostMemorySpace
    Aligned64HostMemorySpace(): HostMemorySpace() { }
    void Alloc(void **ptr, size_t bytes)
    { if (mfem_memalign(ptr, 64, bytes) != 0) { throw ::std::bad_alloc(); } }
+   void Dealloc(void *ptr) { mfem_aligned_free(ptr); }
 };
 
 #ifndef _WIN32
@@ -666,12 +669,11 @@ void *MemoryManager::Register_(void *ptr, void *h_tmp, size_t bytes,
 {
    MFEM_CONTRACT_VAR(alias);
    MFEM_ASSERT(exists, "Internal error!");
-   MFEM_ASSERT(IsHostMemory(mt), "Internal error!");
    MFEM_ASSERT(!alias, "Cannot register an alias!");
    const bool is_host_mem = IsHostMemory(mt);
    const MemType dual_mt = GetDualMemoryType_(mt);
-   const MemType h_mt = mt;
-   const MemType d_mt = dual_mt;
+   const MemType h_mt = is_host_mem ? mt : dual_mt;
+   const MemType d_mt = is_host_mem ? dual_mt : mt;
    MFEM_VERIFY_TYPES(h_mt, d_mt);
 
    if (ptr == nullptr && h_tmp == nullptr)
@@ -693,10 +695,11 @@ void *MemoryManager::Register_(void *ptr, void *h_tmp, size_t bytes,
    else // DEVICE TYPES
    {
       h_ptr = h_tmp;
-      if (h_tmp == nullptr) { ctrl->Host(h_mt)->Alloc(&h_ptr, bytes); }
+      if (own && h_tmp == nullptr) { ctrl->Host(h_mt)->Alloc(&h_ptr, bytes); }
       mm.InsertDevice(ptr, h_ptr, bytes, h_mt, d_mt);
-      flags = (own ? flags | Mem::OWNS_DEVICE : flags & ~Mem::OWNS_DEVICE) |
-              Mem::OWNS_HOST | Mem::VALID_DEVICE;
+      flags = own ? flags | Mem::OWNS_DEVICE : flags & ~Mem::OWNS_DEVICE;
+      flags = own ? flags | Mem::OWNS_HOST   : flags & ~Mem::OWNS_HOST;
+      flags |= Mem::VALID_DEVICE;
    }
    CheckHostMemoryType_(h_mt, h_ptr);
    return h_ptr;
diff --git a/general/mem_manager.hpp b/general/mem_manager.hpp
index ae735c2e57c..3022cb65996 100644
--- a/general/mem_manager.hpp
+++ b/general/mem_manager.hpp
@@ -16,6 +16,7 @@
 #include "error.hpp"
 #include <cstring> // std::memcpy
 #include <type_traits> // std::is_const
+#include <cstddef> // std::max_align_t
 
 namespace mfem
 {
@@ -283,6 +284,28 @@ class Memory
        @note The current memory is NOT deleted by this method. */
    inline void Wrap(T *ptr, int size, MemoryType mt, bool own);
 
+   /** Wrap an externally allocated pair of pointers, @a h_ptr and @a d_ptr,
+       of the given host MemoryType @a h_mt. */
+   /** The new memory object will have the device MemoryType set as valid.
+
+       The given @a h_ptr and @a d_ptr must be allocated appropriately for the
+       given host MemoryType and its associated device MemoryType:
+          - MANAGED => MANAGED,
+          - HOST_DEBUG => DEVICE_DEBUG,
+          - HOST_UMPIRE => DEVICE_UMPIRE,
+          - HOST, HOST_32, HOST_64 => DEVICE.
+
+       The parameter @a own determines whether both @a h_ptr and @a d_ptr will
+       be deleted when the method Delete() is called.
+
+       @note Ownership can also be controlled by using the following methods:
+         - ClearOwnerFlags,
+         - SetHostPtrOwner,
+         - SetDevicePtrOwner.
+
+       @note The current memory is NOT deleted by this method. */
+   inline void Wrap(T *h_ptr, T *d_ptr, int size, MemoryType h_mt, bool own);
+
    /// Create a memory object that points inside the memory object @a base.
    /** The new Memory object uses the same MemoryType(s) as @a base.
 
@@ -413,10 +436,44 @@ class Memory
    /** This method can be useful for debugging. It is explicitly instantiated
        for Memory<T> with T = int and T = double. */
    inline int CompareHostAndDevice(int size) const;
+
+private:
+   // GCC 4.8 workaround: max_align_t is not in std.
+   static constexpr std::size_t def_align_bytes_()
+   {
+      using namespace std;
+      return alignof(max_align_t);
+   }
+   static constexpr std::size_t def_align_bytes = def_align_bytes_();
+   static constexpr std::size_t new_align_bytes =
+      alignof(T) > def_align_bytes ? alignof(T) : def_align_bytes;
+
+   // Allocate T[size]. Pre-C++17, the generic (overaligned) case asserts.
+   template <std::size_t align_bytes, bool dummy = true> struct Alloc
+   {
+#if __cplusplus < 201703L
+      static inline T *New(std::size_t)
+      {
+         MFEM_ASSERT(false, "overaligned type cannot use MemoryType::HOST");
+         return nullptr;
+      }
+#else
+      static inline T *New(std::size_t size) { return new T[size]; }
+#endif
+   };
+
+#if __cplusplus < 201703L
+   template<bool dummy> struct Alloc<def_align_bytes,dummy>
+   {
+      static inline T *New(std::size_t size) { return new T[size]; }
+   };
+#endif
 };
 
 
-/// The memory manager class
+/** The MFEM memory manager class. Host-side pointers are inserted into this
+    manager which keeps track of the associated device pointer, and where the
+    data currently resides. */
 class MemoryManager
 {
 private:
@@ -523,7 +580,8 @@ class MemoryManager
 
 private:
 
-   /// Insert a host address in the memory map
+   /// Insert a host address @a h_ptr and size @a bytes in the memory map to be
+   /// managed.
    void Insert(void *h_ptr, size_t bytes, MemoryType h_mt,  MemoryType d_mt);
 
    /// Insert a device and the host addresses in the memory map
@@ -625,7 +683,7 @@ inline void Memory<T>::New(int size)
    capacity = size;
    flags = OWNS_HOST | VALID_HOST;
    h_mt = MemoryManager::host_mem_type;
-   h_ptr = (h_mt == MemoryType::HOST) ? new T[size] :
+   h_ptr = (h_mt == MemoryType::HOST) ? Alloc<new_align_bytes>::New(size) :
            (T*)MemoryManager::New_(nullptr, size*sizeof(T), h_mt, flags);
 }
 
@@ -637,8 +695,9 @@ inline void Memory<T>::New(int size, MemoryType mt)
    const bool mt_host = mt == MemoryType::HOST;
    if (mt_host) { flags = OWNS_HOST | VALID_HOST; }
    h_mt = IsHostMemory(mt) ? mt : MemoryManager::GetDualMemoryType_(mt);
-   T *h_tmp = (h_mt == MemoryType::HOST) ? new T[size] : nullptr;
-   h_ptr = (mt_host) ? h_tmp: (T*)MemoryManager::New_(h_tmp, bytes, mt, flags);
+   T *h_tmp = (h_mt == MemoryType::HOST) ?
+              Alloc<new_align_bytes>::New(size) : nullptr;
+   h_ptr = (mt_host) ? h_tmp : (T*)MemoryManager::New_(h_tmp, bytes, mt, flags);
 }
 
 template <typename T>
@@ -682,6 +741,19 @@ inline void Memory<T>::Wrap(T *ptr, int size, MemoryType mt, bool own)
                                         own, false, flags);
 }
 
+template <typename T>
+inline void Memory<T>::Wrap(T *ptr, T *d_ptr, int size, MemoryType mt, bool own)
+{
+   h_mt = mt;
+   flags = 0;
+   h_ptr = ptr;
+   capacity = size;
+   MFEM_ASSERT(IsHostMemory(h_mt),"");
+   const size_t bytes = size*sizeof(T);
+   const MemoryType d_mt = MemoryManager::GetDualMemoryType_(h_mt);
+   MemoryManager::Register_(d_ptr, h_ptr, bytes, d_mt, own, false, flags);
+}
+
 template <typename T>
 inline void Memory<T>::MakeAlias(const Memory &base, int offset, int size)
 {
diff --git a/general/optparser.hpp b/general/optparser.hpp
index ddab615b960..00c68da1361 100644
--- a/general/optparser.hpp
+++ b/general/optparser.hpp
@@ -68,11 +68,17 @@ class OptionsParser
    static void WriteValue(const Option &opt, std::ostream &out);
 
 public:
+
+   /// Construct a command line option parser with '_argc' and '_argv'.
    OptionsParser(int _argc, char *_argv[])
       : argc(_argc), argv(_argv)
    {
       error_type = error_idx = 0;
    }
+
+   /** @brief Add a boolean option and set 'var' to receive the value.
+       Enable/disable tags are used to set the bool to true/false
+       respectively. */
    void AddOption(bool *var, const char *enable_short_name,
                   const char *enable_long_name, const char *disable_short_name,
                   const char *disable_long_name, const char *description,
@@ -83,18 +89,24 @@ class OptionsParser
       options.Append(Option(DISABLE, var, disable_short_name, disable_long_name,
                             description, required));
    }
+
+   /// Add an integer option and set 'var' to receive the value.
    void AddOption(int *var, const char *short_name, const char *long_name,
                   const char *description, bool required = false)
    {
       options.Append(Option(INT, var, short_name, long_name, description,
                             required));
    }
+
+   /// Add a double option and set 'var' to receive the value.
    void AddOption(double *var, const char *short_name, const char *long_name,
                   const char *description, bool required = false)
    {
       options.Append(Option(DOUBLE, var, short_name, long_name, description,
                             required));
    }
+
+   /// Add a string (char*) option and set 'var' to receive the value.
    void AddOption(const char **var, const char *short_name,
                   const char *long_name, const char *description,
                   bool required = false)
@@ -102,6 +114,9 @@ class OptionsParser
       options.Append(Option(STRING, var, short_name, long_name, description,
                             required));
    }
+
+   /** Add an integer array (separated by spaces) option and set 'var' to
+       receive the values. */
    void AddOption(Array<int> * var, const char *short_name,
                   const char *long_name, const char *description,
                   bool required = false)
@@ -109,6 +124,9 @@ class OptionsParser
       options.Append(Option(ARRAY, var, short_name, long_name, description,
                             required));
    }
+
+   /** Add a vector (doubles separated by spaces) option and set 'var' to
+       receive the values. */
    void AddOption(Vector * var, const char *short_name,
                   const char *long_name, const char *description,
                   bool required = false)
@@ -117,16 +135,28 @@ class OptionsParser
                             required));
    }
 
-   /** Parse the command-line options. Note that this function expects all the
-       options provided through the command line to have a corresponding
-       AddOption. In particular, this function cannot be used for partial
-       parsing. */
+   /** @brief Parse the command-line options.
+       Note that this function expects all the options provided through the
+       command line to have a corresponding AddOption. In particular, this
+       function cannot be used for partial parsing. */
    void Parse();
+
+   /// Return true if the command line options were parsed successfully.
    bool Good() const { return (error_type == 0); }
+
+   /// Return true if we are flagged to print the help message.
    bool Help() const { return (error_type == 1); }
+
+   /// Print the options
    void PrintOptions(std::ostream &out) const;
+
+   /// Print the error message
    void PrintError(std::ostream &out) const;
+
+   /// Print the help message
    void PrintHelp(std::ostream &out) const;
+
+   /// Print the usage message
    void PrintUsage(std::ostream &out) const;
 };
 
diff --git a/general/sets.hpp b/general/sets.hpp
index 871a599a29e..b3e92b85212 100644
--- a/general/sets.hpp
+++ b/general/sets.hpp
@@ -26,27 +26,32 @@ class IntegerSet
    Array<int> me;
 
 public:
+   /// Create an empty set.
    IntegerSet() { }
 
+   /// Create a copy of set 's'.
    IntegerSet(IntegerSet &s);
 
-   /// Create an integer set from a block of memory containing integer values
-   /// ( like an array ).
-   ///
-   /// n - length ( number of integers )
-   /// p - pointer to block of memory containing the integer values
+   /// Create an integer set from C-array 'p' of 'n' integers.
    IntegerSet(const int n, const int *p) { Recreate(n, p); }
 
+   /// Return the size of the set.
    int Size() { return me.Size(); }
 
+   /// Return a reference to the sorted array of all the set entries.
    operator Array<int>& () { return me; }
 
+   /// Return the value of the lowest element of the set.
    int PickElement() { return me[0]; }
 
+   /// Return the value of a random element of the set.
    int PickRandomElement();
 
+   /// Return 1 if the sets are equal and 0 otherwise.
    int operator==(IntegerSet &s);
 
+   /** @brief Create an integer set from C-array 'p' of 'n' integers.
+       Overwrites any existing set data. */
    void Recreate(const int n, const int *p);
 };
 
@@ -58,16 +63,25 @@ class ListOfIntegerSets
 
 public:
 
+   /// Return the number of integer sets in the list.
    int Size() { return TheList.Size(); }
 
+   /// Return the value of the first element of the ith set.
    int PickElementInSet(int i) { return TheList[i]->PickElement(); }
 
+   /// Return a random value from the ith set in the list.
    int PickRandomElementInSet(int i) { return TheList[i]->PickRandomElement(); }
 
+   /** @brief Check to see if set 's' is in the list. If not append it to the
+       end of the list. Returns the index of the list where set 's' can be
+       found. */
    int Insert(IntegerSet &s);
 
+   /** Return the index of the list where set 's' can be found. Returns -1 if
+       not found. */
    int Lookup(IntegerSet &s);
 
+   /// Write the list of sets into table 't'.
    void AsTable(Table &t);
 
    ~ListOfIntegerSets();
diff --git a/general/socketstream.hpp b/general/socketstream.hpp
index 64748cdae18..83979487302 100644
--- a/general/socketstream.hpp
+++ b/general/socketstream.hpp
@@ -53,18 +53,25 @@ class socketbuf : public std::streambuf
       open(hostname, port);
    }
 
-   /** Attach a new socket descriptor to the socketbuf.
-       Returns the old socket descriptor which is NOT closed. */
+   /** @brief Attach a new socket descriptor to the socketbuf. Returns the old
+       socket descriptor which is NOT closed. */
    virtual int attach(int sd);
 
+   /// Detach the current socket descriptor from the socketbuf.
    int detach() { return attach(-1); }
 
+   /** @brief Open a socket on the 'port' at 'hostname' and store the socket
+       descriptor. Returns 0 if there is no error, otherwise returns -1. */
    virtual int open(const char hostname[], int port);
 
+   /// Close the current socket descriptor.
    virtual int close();
 
+   /// Returns the attached socket descriptor.
    int getsocketdescriptor() { return socket_descriptor; }
 
+   /** @brief Returns true if the socket is open and has a valid socket
+       descriptor. Otherwise returns false. */
    bool is_open() { return (socket_descriptor >= 0); }
 
    virtual ~socketbuf() { close(); }
@@ -177,8 +184,8 @@ class GnuTLS_socketbuf : public socketbuf
 
    bool gnutls_good() const { return status.good(); }
 
-   /** Attach a new socket descriptor to the socketbuf.
-       Returns the old socket descriptor which is NOT closed. */
+   /** Attach a new socket descriptor to the socketbuf. Returns the old socket
+       descriptor which is NOT closed. */
    virtual int attach(int sd);
 
    virtual int open(const char hostname[], int port);
@@ -255,10 +262,13 @@ class socketstream : public std::iostream
 
    socketbuf *rdbuf() { return buf__; }
 
+   /// Open the socket stream on 'port' at 'hostname'.
    int open(const char hostname[], int port);
 
+   /// Close the socketstream.
    int close() { return buf__->close(); }
 
+   /// True if the socketstream is open, false otherwise.
    bool is_open() { return buf__->is_open(); }
 
    virtual ~socketstream();
diff --git a/general/sort_pairs.hpp b/general/sort_pairs.hpp
index 0b7bb0d79d4..4a272699d3e 100644
--- a/general/sort_pairs.hpp
+++ b/general/sort_pairs.hpp
@@ -52,7 +52,7 @@ void SortPairs (Pair<A, B> *pairs, int size)
    std::sort(pairs, pairs + size);
 }
 
-
+/// A triple of objects
 template <class A, class B, class C>
 class Triple
 {
diff --git a/general/stable3d.hpp b/general/stable3d.hpp
index 1e7c774dfc4..c1380003a23 100644
--- a/general/stable3d.hpp
+++ b/general/stable3d.hpp
@@ -25,7 +25,12 @@ class STable3DNode
    int Column, Floor, Number;
 };
 
-/// Symmetric 3D Table
+/** @brief Symmetric 3D Table stored as an array of rows each of which has a
+    stack of column, floor, number nodes. The number of the node is assigned by
+    counting the nodes from zero as they are pushed into the table. Diagonals of
+    any kind are not allowed so the row, column and floor must all be different
+    for each node. Only one node is stored for all 6 symmetric entries that are
+    indexable by unique triplets of row, column, and floor. */
 class STable3D
 {
 private:
@@ -37,20 +42,34 @@ class STable3D
 #endif
 
 public:
+   /// Construct the table with a total of 'nr' rows.
    explicit STable3D (int nr);
 
+   /** @brief Check to see if this entry is in the table and add it to the table
+       if it is not there. Returns the number assigned to the table entry. */
    int Push (int r, int c, int f);
 
+   /// Return the number assigned to the table entry. Abort if it's not there.
    int operator() (int r, int c, int f) const;
 
+   /** Return the number assigned to the table entry. Return -1 if it's not
+       there. */
    int Index (int r, int c, int f) const;
 
+   /** @brief Check to see if this entry is in the table and add it to the table
+       if it is not there. The entry is addressed by the three smallest values
+       of (r,c,f,t). Returns the number assigned to the table entry. */
    int Push4 (int r, int c, int f, int t);
 
+   /** @brief Return the number assigned to the table entry. The entry is
+       addressed by the three smallest values of (r,c,f,t). Return -1 if it is
+       not there. */
    int operator() (int r, int c, int f, int t) const;
 
+   /// Return the number of elements added to the table.
    int NumberOfElements() { return NElem; }
 
+   /// Print out all of the table elements.
    void Print(std::ostream &out = mfem::out) const;
 
    ~STable3D ();
diff --git a/general/table.hpp b/general/table.hpp
index 91ce53c11c0..9702139e749 100644
--- a/general/table.hpp
+++ b/general/table.hpp
@@ -192,7 +192,6 @@ Table * Mult (const Table &A, const Table &B);
 /** Data type STable. STable is similar to Table, but it's for symmetric
     connectivity, i.e. TYPE I is equivalent to TYPE II. In the first
     dimension we put the elements with smaller index. */
-
 class STable : public Table
 {
 public:
diff --git a/general/tassign.hpp b/general/tassign.hpp
index c72482b21c7..7018a203b24 100644
--- a/general/tassign.hpp
+++ b/general/tassign.hpp
@@ -43,70 +43,110 @@ template <>
 struct AssignOp_Impl<AssignOp::Set>
 {
    template <typename lvalue_t, typename rvalue_t>
-   MFEM_HOST_DEVICE
    static inline lvalue_t &Assign(lvalue_t &a, const rvalue_t &b)
    {
       return (a = b);
    }
+
+   template <typename lvalue_t, typename rvalue_t>
+   MFEM_HOST_DEVICE
+   static inline lvalue_t &AssignHD(lvalue_t &a, const rvalue_t &b)
+   {
+      return (a = b);
+   }
 };
 
 template <>
 struct AssignOp_Impl<AssignOp::Add>
 {
    template <typename lvalue_t, typename rvalue_t>
-   MFEM_HOST_DEVICE
    static inline lvalue_t &Assign(lvalue_t &a, const rvalue_t &b)
    {
       MFEM_FLOPS_ADD(1);
       return (a += b);
    }
+
+   template <typename lvalue_t, typename rvalue_t>
+   MFEM_HOST_DEVICE
+   static inline lvalue_t &AssignHD(lvalue_t &a, const rvalue_t &b)
+   {
+      MFEM_FLOPS_ADD(1);
+      return (a += b);
+   }
 };
 
 template <>
 struct AssignOp_Impl<AssignOp::Mult>
 {
    template <typename lvalue_t, typename rvalue_t>
-   MFEM_HOST_DEVICE
    static inline lvalue_t &Assign(lvalue_t &a, const rvalue_t &b)
    {
       MFEM_FLOPS_ADD(1);
       return (a *= b);
    }
+
+   template <typename lvalue_t, typename rvalue_t>
+   MFEM_HOST_DEVICE
+   static inline lvalue_t &AssignHD(lvalue_t &a, const rvalue_t &b)
+   {
+      MFEM_FLOPS_ADD(1);
+      return (a *= b);
+   }
 };
 
 template <>
 struct AssignOp_Impl<AssignOp::Div>
 {
    template <typename lvalue_t, typename rvalue_t>
-   MFEM_HOST_DEVICE
    static inline lvalue_t &Assign(lvalue_t &a, const rvalue_t &b)
    {
       MFEM_FLOPS_ADD(1);
       return (a /= b);
    }
+
+   template <typename lvalue_t, typename rvalue_t>
+   MFEM_HOST_DEVICE
+   static inline lvalue_t &AssignHD(lvalue_t &a, const rvalue_t &b)
+   {
+      MFEM_FLOPS_ADD(1);
+      return (a /= b);
+   }
 };
 
 template <>
 struct AssignOp_Impl<AssignOp::rDiv>
 {
    template <typename lvalue_t, typename rvalue_t>
-   MFEM_HOST_DEVICE
    static inline lvalue_t &Assign(lvalue_t &a, const rvalue_t &b)
    {
       MFEM_FLOPS_ADD(1);
       return (a = b/a);
    }
+
+   template <typename lvalue_t, typename rvalue_t>
+   MFEM_HOST_DEVICE
+   static inline lvalue_t &AssignHD(lvalue_t &a, const rvalue_t &b)
+   {
+      MFEM_FLOPS_ADD(1);
+      return (a = b/a);
+   }
 };
 
 } // namespace mfem::internal
 
 template <AssignOp::Type Op, typename lvalue_t, typename rvalue_t>
-MFEM_HOST_DEVICE
 inline lvalue_t &Assign(lvalue_t &a, const rvalue_t &b)
 {
    return internal::AssignOp_Impl<Op>::Assign(a, b);
 }
 
+template <AssignOp::Type Op, typename lvalue_t, typename rvalue_t>
+MFEM_HOST_DEVICE
+inline lvalue_t &AssignHD(lvalue_t &a, const rvalue_t &b)
+{
+   return internal::AssignOp_Impl<Op>::AssignHD(a, b);
+}
+
 } // namespace mfem
 
 #endif // MFEM_TEMPLATE_ASSIGN
diff --git a/general/text.hpp b/general/text.hpp
index ffa03ebf2ef..32684ee38d3 100644
--- a/general/text.hpp
+++ b/general/text.hpp
@@ -24,6 +24,7 @@ namespace mfem
 
 // Utilities for text parsing
 
+/// Check if the stream starts with @a comment_char. If so skip it.
 inline void skip_comment_lines(std::istream &is, const char comment_char)
 {
    while (1)
@@ -37,7 +38,7 @@ inline void skip_comment_lines(std::istream &is, const char comment_char)
    }
 }
 
-// Check for, and remove, a trailing '\r'.
+/// Check for, and remove, a trailing '\\r' from an std::string.
 inline void filter_dos(std::string &line)
 {
    if (!line.empty() && *line.rbegin() == '\r')
@@ -46,7 +47,7 @@ inline void filter_dos(std::string &line)
    }
 }
 
-// Convert an integer to a string
+/// Convert an integer to an std::string.
 inline std::string to_string(int i)
 {
    std::stringstream ss;
@@ -58,7 +59,7 @@ inline std::string to_string(int i)
    return out_str;
 }
 
-// Convert an integer to a 0-padded string with the given number of 'digits'
+/// Convert an integer to a 0-padded string with the given number of @a digits
 inline std::string to_padded_string(int i, int digits)
 {
    std::ostringstream oss;
@@ -66,7 +67,7 @@ inline std::string to_padded_string(int i, int digits)
    return oss.str();
 }
 
-// Convert a string to an int
+/// Convert a string to an int
 inline int to_int(const std::string& str)
 {
    int i;
diff --git a/general/tic_toc.hpp b/general/tic_toc.hpp
index 016fdfe151e..36a06697ad1 100644
--- a/general/tic_toc.hpp
+++ b/general/tic_toc.hpp
@@ -38,12 +38,29 @@ class StopWatch
 
 public:
    StopWatch();
+
+   /// Clear the elapsed time on the stopwatch and restart it if it's running.
    void Clear();
+
+   /// Clear the elapsed time and start the stopwatch.
    void Start();
+
+   /// Stop the stopwatch.
    void Stop();
+
+   /// Return the time resolution available to the stopwatch.
    double Resolution();
+
+   /** Return the number of real seconds elapsed since the stopwatch was
+       started. */
    double RealTime();
+
+   /** Return the number of user seconds elapsed since the stopwatch was
+       started. */
    double UserTime();
+
+   /** Return the number of system seconds elapsed since the stopwatch was
+       started. */
    double SystTime();
    ~StopWatch();
 };
@@ -51,10 +68,10 @@ class StopWatch
 
 extern StopWatch tic_toc;
 
-/// Start timing
+/// Start the tic_toc timer
 extern void tic();
 
-/// End timing
+/// End timing and return the time from tic() to toc() in seconds.
 extern double toc();
 
 }
diff --git a/general/version.cpp b/general/version.cpp
index 19c5fecca5f..704d9beeadd 100644
--- a/general/version.cpp
+++ b/general/version.cpp
@@ -148,6 +148,9 @@ const char *GetConfigStr()
 #ifdef MFEM_USE_OCCA
       "MFEM_USE_OCCA\n"
 #endif
+#ifdef MFEM_USE_SIMD
+      "MFEM_USE_SIMD\n"
+#endif
 #ifdef MFEM_USE_ADIOS2
       "MFEM_USE_ADIOS2\n"
 #endif
diff --git a/general/version.hpp b/general/version.hpp
index 3f81037de26..85905e4531a 100644
--- a/general/version.hpp
+++ b/general/version.hpp
@@ -15,13 +15,25 @@
 namespace mfem
 {
 
+/// Return the MFEM version number as a single integer.
 int GetVersion();
+
+/// Return the MFEM major version number as an integer.
 int GetVersionMajor();
+
+/// Return the MFEM minor version number as an integer.
 int GetVersionMinor();
+
+/// Return the MFEM version patch number as an integer.
 int GetVersionPatch();
 
+/// Return the MFEM version number as a string.
 const char *GetVersionStr();
+
+/// Return the MFEM Git hash as a string.
 const char *GetGitStr();
+
+/// Return the MFEM configuration as a string.
 const char *GetConfigStr();
 
 } // namespace mfem
diff --git a/general/zstr.hpp b/general/zstr.hpp
index dc61bcdcce4..47fd98f9fdf 100644
--- a/general/zstr.hpp
+++ b/general/zstr.hpp
@@ -175,7 +175,7 @@ struct static_method_holder
          is_p->peek();
          peek_failed = is_p->fail();
       }
-      catch (std::ios_base::failure &e) {}
+      catch (std::ios_base::failure&) {}
       if (peek_failed)
       {
          throw Exception(std::string("strict_fstream: open('")
@@ -203,10 +203,10 @@ class ifstream
    {
       mode |= std::ios_base::in;
       exceptions(std::ios_base::badbit);
-      detail::static_method_holder::check_mode(filename, mode);
+      // detail::static_method_holder::check_mode(filename, mode);
       std::ifstream::open(filename, mode);
-      detail::static_method_holder::check_open(this, filename, mode);
-      detail::static_method_holder::check_peek(this, filename, mode);
+      // detail::static_method_holder::check_open(this, filename, mode);
+      // detail::static_method_holder::check_peek(this, filename, mode);
    }
 }; // class ifstream
 
@@ -225,9 +225,9 @@ class ofstream
    {
       mode |= std::ios_base::out;
       exceptions(std::ios_base::badbit);
-      detail::static_method_holder::check_mode(filename, mode);
+      // detail::static_method_holder::check_mode(filename, mode);
       std::ofstream::open(filename, mode);
-      detail::static_method_holder::check_open(this, filename, mode);
+      // detail::static_method_holder::check_open(this, filename, mode);
    }
 }; // class ofstream
 
@@ -246,10 +246,10 @@ class fstream
    {
       if (! (mode & std::ios_base::out)) { mode |= std::ios_base::in; }
       exceptions(std::ios_base::badbit);
-      detail::static_method_holder::check_mode(filename, mode);
+      // detail::static_method_holder::check_mode(filename, mode);
       std::fstream::open(filename, mode);
-      detail::static_method_holder::check_open(this, filename, mode);
-      detail::static_method_holder::check_peek(this, filename, mode);
+      // detail::static_method_holder::check_open(this, filename, mode);
+      // detail::static_method_holder::check_peek(this, filename, mode);
    }
 }; // class fstream
 
@@ -754,6 +754,7 @@ class ofgzstream
       {
          rdbuf(_fs.rdbuf());
       }
+      setstate(_fs.rdstate());
       exceptions(std::ios_base::badbit);
    }
 
@@ -781,6 +782,7 @@ class ifgzstream
 #else
       rdbuf(_fs.rdbuf());
 #endif
+      setstate(_fs.rdstate());
       exceptions(std::ios_base::badbit);
    }
 
diff --git a/linalg/densemat.cpp b/linalg/densemat.cpp
index 710d47727cf..27f0555ad98 100644
--- a/linalg/densemat.cpp
+++ b/linalg/densemat.cpp
@@ -3043,9 +3043,6 @@ void LUFactors::RightSolve(int m, int n, double *X) const
    }
 #else
    // compiling without LAPACK
-   const double *data = this->data;
-   const int *ipiv = this->ipiv;
-
    // X <- X U^{-1}
    x = X;
    for (int k = 0; k < n; k++)
@@ -3080,7 +3077,7 @@ void LUFactors::RightSolve(int m, int n, double *X) const
    x = X;
    for (int k = 0; k < n; k++)
    {
-      for (int i = 0; i < m; i++)
+      for (int i = m-1; i >= 0; --i)
       {
          Swap<double>(x[i*n], x[(ipiv[i]-ipiv_base)*n]);
       }
diff --git a/linalg/hypre.cpp b/linalg/hypre.cpp
index b5b5a447e1f..02ff6b6c56d 100644
--- a/linalg/hypre.cpp
+++ b/linalg/hypre.cpp
@@ -21,55 +21,6 @@
 #include <cmath>
 #include <cstdlib>
 
-// Define macro wrappers for hypre_TAlloc, hypre_CTAlloc and hypre_TFree:
-// mfem_hypre_TAlloc, mfem_hypre_CTAlloc, and mfem_hypre_TFree, respectively.
-// Note: the same macros are defined in hypre_parcsr.cpp.
-#if MFEM_HYPRE_VERSION < 21400
-
-#define mfem_hypre_TAlloc(type, size) hypre_TAlloc(type, size)
-#define mfem_hypre_CTAlloc(type, size) hypre_CTAlloc(type, size)
-#define mfem_hypre_TFree(ptr) hypre_TFree(ptr)
-
-#else // MFEM_HYPRE_VERSION >= 21400
-
-#define mfem_hypre_TAlloc(type, size) \
-   hypre_TAlloc(type, size, HYPRE_MEMORY_HOST)
-#define mfem_hypre_CTAlloc(type, size) \
-   hypre_CTAlloc(type, size, HYPRE_MEMORY_HOST)
-#define mfem_hypre_TFree(ptr) hypre_TFree(ptr, HYPRE_MEMORY_HOST)
-
-// Notes regarding allocation and deallocation of hypre objects in 2.14.0
-//-----------------------------------------------------------------------
-//
-// 1. hypre_CSRMatrix: i, j, data, and rownnz use HYPRE_MEMORY_SHARED while the
-//    hypre_CSRMatrix structure uses HYPRE_MEMORY_HOST.
-//
-//    Note: the function HYPRE_CSRMatrixCreate creates the i array using
-//          HYPRE_MEMORY_HOST!
-//    Note: the functions hypre_CSRMatrixAdd and hypre_CSRMatrixMultiply create
-//          C_i using HYPRE_MEMORY_HOST!
-//
-// 2. hypre_Vector: data uses HYPRE_MEMORY_SHARED while the hypre_Vector
-//    structure uses HYPRE_MEMORY_HOST.
-//
-// 3. hypre_ParVector: the structure hypre_ParVector uses HYPRE_MEMORY_HOST;
-//    partitioning uses HYPRE_MEMORY_HOST.
-//
-// 4. hypre_ParCSRMatrix: the structure hypre_ParCSRMatrix uses
-//    HYPRE_MEMORY_HOST; col_map_offd, row_starts, col_starts, rowindices,
-//    rowvalues also use HYPRE_MEMORY_HOST.
-//
-//    Note: the function hypre_ParCSRMatrixToCSRMatrixAll allocates matrix_i
-//          using HYPRE_MEMORY_HOST!
-//
-// 5. The goal for the MFEM wrappers of hypre objects is to support only the
-//    standard hypre build case, i.e. when hypre is build without device support
-//    and all memory types correspond to host memory. In this case memory
-//    allocated with operator new can be used by hypre but (as usual) it must
-//    not be owned by hypre.
-
-#endif // #if MFEM_HYPRE_VERSION < 21400
-
 using namespace std;
 
 namespace mfem
@@ -1713,6 +1664,293 @@ HypreParMatrix * RAP(const HypreParMatrix * Rt, const HypreParMatrix *A,
    return new HypreParMatrix(rap);
 }
 
+// Helper function for HypreParMatrixFromBlocks. Note that scalability to
+// extremely large processor counts is limited by the use of MPI_Allgather.
+void GatherBlockOffsetData(MPI_Comm comm, const int rank, const int nprocs,
+                           const int num_loc, Array<int> &offsets,
+                           std::vector<int> &all_num_loc, const int numBlocks,
+                           std::vector<std::vector<int>> &blockProcOffsets,
+                           std::vector<int> &procOffsets,
+                           std::vector<std::vector<int>> &procBlockOffsets,
+                           int &firstLocal, int &globalNum)
+{
+   std::vector<std::vector<int>> all_block_num_loc(numBlocks);
+   // const_cast: MPI-2 headers declare the (read-only) send buffer as void*.
+   MPI_Allgather(const_cast<int*>(&num_loc), 1, MPI_INT, all_num_loc.data(), 1,
+                 MPI_INT, comm);
+   for (int j = 0; j < numBlocks; ++j)
+   {
+      all_block_num_loc[j].resize(nprocs);
+      blockProcOffsets[j].resize(nprocs);
+      // Gather the local size of block j from every rank.
+      const int blockNumRows = offsets[j + 1] - offsets[j];
+      MPI_Allgather(const_cast<int*>(&blockNumRows), 1, MPI_INT,
+                    all_block_num_loc[j].data(), 1, MPI_INT, comm);
+      blockProcOffsets[j][0] = 0; // exclusive prefix sum over ranks
+      for (int i = 0; i < nprocs - 1; ++i)
+      {
+         blockProcOffsets[j][i + 1] = blockProcOffsets[j][i]
+                                      + all_block_num_loc[j][i];
+      }
+   }
+   // firstLocal: sum of sizes on lower ranks; globalNum: sum over all ranks.
+   firstLocal = 0;
+   globalNum = 0;
+   procOffsets[0] = 0;
+   for (int i = 0; i < nprocs; ++i)
+   {
+      globalNum += all_num_loc[i];
+      if (i < rank)
+      {
+         firstLocal += all_num_loc[i];
+      }
+      // procOffsets: exclusive prefix sum of all_num_loc over ranks.
+      if (i < nprocs - 1)
+      {
+         procOffsets[i + 1] = procOffsets[i] + all_num_loc[i];
+      }
+      // procBlockOffsets[i]: exclusive prefix sum of rank i's block sizes.
+      procBlockOffsets[i].resize(numBlocks);
+      procBlockOffsets[i][0] = 0;
+      for (int j = 1; j < numBlocks; ++j)
+      {
+         procBlockOffsets[i][j] = procBlockOffsets[i][j - 1]
+                                  + all_block_num_loc[j - 1][i];
+      }
+   }
+}
+
+HypreParMatrix * HypreParMatrixFromBlocks(Array2D<HypreParMatrix*> &blocks,
+                                          Array2D<double> *blockCoeff)
+{
+   const int numBlockRows = blocks.NumRows();
+   const int numBlockCols = blocks.NumCols();
+
+   MFEM_VERIFY(numBlockRows > 0 &&
+               numBlockCols > 0, "Invalid input to HypreParMatrixFromBlocks");
+   // When given, blockCoeff must hold one scaling factor per block.
+   if (blockCoeff != NULL)
+   {
+      MFEM_VERIFY(numBlockRows == blockCoeff->NumRows() &&
+                  numBlockCols == blockCoeff->NumCols(),
+                  "Invalid input to HypreParMatrixFromBlocks");
+   }
+
+   Array<int> rowOffsets(numBlockRows+1);
+   Array<int> colOffsets(numBlockCols+1);
+   // Take the communicator from the first non-null block in block row 0.
+   int nonNullBlockRow0 = -1;
+   for (int j=0; j<numBlockCols; ++j)
+   {
+      if (blocks(0,j) != NULL)
+      {
+         nonNullBlockRow0 = j;
+         break;
+      }
+   }
+
+   MFEM_VERIFY(nonNullBlockRow0 >= 0, "Null row of blocks");
+   MPI_Comm comm = blocks(0,nonNullBlockRow0)->GetComm();
+
+   // Set offsets based on the number of rows or columns in each block.
+   rowOffsets = 0;
+   colOffsets = 0;
+   for (int i=0; i<numBlockRows; ++i)
+   {
+      for (int j=0; j<numBlockCols; ++j)
+      {
+         if (blocks(i,j) != NULL)
+         {
+            const int nrows = blocks(i,j)->NumRows();
+            const int ncols = blocks(i,j)->NumCols();
+
+            MFEM_VERIFY(nrows > 0 &&
+                        ncols > 0, "Invalid block in HypreParMatrixFromBlocks");
+
+            if (rowOffsets[i+1] == 0)
+            {
+               rowOffsets[i+1] = nrows;
+            }
+            else
+            {
+               MFEM_VERIFY(rowOffsets[i+1] == nrows,
+                           "Inconsistent blocks in HypreParMatrixFromBlocks");
+            }
+
+            if (colOffsets[j+1] == 0)
+            {
+               colOffsets[j+1] = ncols;
+            }
+            else
+            {
+               MFEM_VERIFY(colOffsets[j+1] == ncols,
+                           "Inconsistent blocks in HypreParMatrixFromBlocks");
+            }
+         }
+      }
+      // Each block row needs a non-null block; accumulate sizes into offsets.
+      MFEM_VERIFY(rowOffsets[i+1] > 0, "Invalid input blocks");
+      rowOffsets[i+1] += rowOffsets[i];
+   }
+   // Likewise, each block column needs a non-null block.
+   for (int j=0; j<numBlockCols; ++j)
+   {
+      MFEM_VERIFY(colOffsets[j+1] > 0, "Invalid input blocks");
+      colOffsets[j+1] += colOffsets[j];
+   }
+
+   const int num_loc_rows = rowOffsets[numBlockRows];
+   const int num_loc_cols = colOffsets[numBlockCols];
+
+   int nprocs, rank;
+   MPI_Comm_rank(comm, &rank);
+   MPI_Comm_size(comm, &nprocs);
+   // Per-rank and per-block offset tables, filled by GatherBlockOffsetData.
+   std::vector<int> all_num_loc_rows(nprocs);
+   std::vector<int> all_num_loc_cols(nprocs);
+   std::vector<int> procRowOffsets(nprocs);
+   std::vector<int> procColOffsets(nprocs);
+   std::vector<std::vector<int>> blockRowProcOffsets(numBlockRows);
+   std::vector<std::vector<int>> blockColProcOffsets(numBlockCols);
+   std::vector<std::vector<int>> procBlockRowOffsets(nprocs);
+   std::vector<std::vector<int>> procBlockColOffsets(nprocs);
+
+   int first_loc_row, glob_nrows, first_loc_col, glob_ncols;
+   GatherBlockOffsetData(comm, rank, nprocs, num_loc_rows, rowOffsets,
+                         all_num_loc_rows, numBlockRows, blockRowProcOffsets,
+                         procRowOffsets, procBlockRowOffsets, first_loc_row,
+                         glob_nrows);
+
+   GatherBlockOffsetData(comm, rank, nprocs, num_loc_cols, colOffsets,
+                         all_num_loc_cols, numBlockCols, blockColProcOffsets,
+                         procColOffsets, procBlockColOffsets, first_loc_col,
+                         glob_ncols);
+   // opI: row offsets into opJ/data; cnt[r]: entries written so far to row r.
+   std::vector<int> opI(num_loc_rows + 1);
+   std::vector<int> cnt(num_loc_rows);
+
+   for (int i = 0; i < num_loc_rows; ++i)
+   {
+      opI[i] = 0;
+      cnt[i] = 0;
+   }
+
+   opI[num_loc_rows] = 0;
+
+   Array2D<hypre_CSRMatrix *> csr_blocks(numBlockRows, numBlockCols);
+
+   // Loop over all blocks, to determine nnz for each row.
+   for (int i = 0; i < numBlockRows; ++i)
+   {
+      for (int j = 0; j < numBlockCols; ++j)
+      {
+         if (blocks(i, j) == NULL)
+         {
+            csr_blocks(i, j) = NULL;
+         }
+         else
+         {
+            {
+               hypre_ParCSRMatrix *parcsr_op = (hypre_ParCSRMatrix*)
+                                               const_cast<HypreParMatrix&>
+                                               (*(blocks(i, j)));
+               MFEM_ASSERT(parcsr_op != NULL, "const_cast failed");
+               csr_blocks(i, j) = hypre_MergeDiagAndOffd(parcsr_op);
+            }
+            // Add this block's per-row nonzero counts to the merged rows.
+            for (int k = 0; k < csr_blocks(i, j)->num_rows; ++k)
+            {
+               opI[rowOffsets[i] + k + 1] +=
+                  csr_blocks(i, j)->i[k + 1] - csr_blocks(i, j)->i[k];
+            }
+         }
+      }
+   }
+
+   // Now opI[i] is nnz for row i-1. Do a partial sum to get offsets.
+   for (int i = 0; i < num_loc_rows; ++i)
+   {
+      opI[i + 1] += opI[i];
+   }
+
+   const int nnz = opI[num_loc_rows];
+
+   std::vector<HYPRE_Int> opJ(nnz);
+   std::vector<double> data(nnz);
+
+   // Loop over all blocks, to set matrix data.
+   for (int i = 0; i < numBlockRows; ++i)
+   {
+      for (int j = 0; j < numBlockCols; ++j)
+      {
+         if (csr_blocks(i, j) != NULL)
+         {
+            const int nrows = csr_blocks(i, j)->num_rows;
+            const double cij = blockCoeff ? (*blockCoeff)(i, j) : 1.0;
+
+            for (int k = 0; k < nrows; ++k)
+            {
+               const int rowg = rowOffsets[i] + k; // process-local row
+               const int nnz_k = csr_blocks(i,j)->i[k+1]-csr_blocks(i,j)->i[k];
+               const int osk = csr_blocks(i, j)->i[k];
+
+               for (int l = 0; l < nnz_k; ++l)
+               {
+                  // Find the column process offset for the block.
+                  const int bcol = csr_blocks(i, j)->j[osk + l];
+                  int bcolproc = 0;
+
+                  for (int p = 1; p < nprocs; ++p)
+                  {
+                     if (blockColProcOffsets[j][p] > bcol)
+                     {
+                        bcolproc = p - 1;
+                        break;
+                     }
+                  }
+                  if (blockColProcOffsets[j][nprocs - 1] <= bcol)
+                  {
+                     bcolproc = nprocs - 1;
+                  }
+                  // Shift the block column index into the merged numbering.
+                  opJ[opI[rowg] + cnt[rowg]] = procColOffsets[bcolproc] +
+                                               procBlockColOffsets[bcolproc][j]
+                                               + bcol
+                                               - blockColProcOffsets[j][bcolproc];
+                  data[opI[rowg] + cnt[rowg]] = cij * csr_blocks(i, j)->data[osk + l];
+                  cnt[rowg]++;
+               }
+            }
+         }
+      }
+   }
+   // The merged per-block CSR matrices are no longer needed.
+   for (int i = 0; i < numBlockRows; ++i)
+   {
+      for (int j = 0; j < numBlockCols; ++j)
+      {
+         if (csr_blocks(i, j) != NULL)
+         {
+            hypre_CSRMatrixDestroy(csr_blocks(i, j));
+         }
+      }
+   }
+   // [first, first + local size) row and column ranges for this rank.
+   std::vector<HYPRE_Int> rowStarts2(2);
+   rowStarts2[0] = first_loc_row;
+   rowStarts2[1] = first_loc_row + all_num_loc_rows[rank];
+
+   std::vector<HYPRE_Int> colStarts2(2);
+   colStarts2[0] = first_loc_col;
+   colStarts2[1] = first_loc_col + all_num_loc_cols[rank];
+
+   return new HypreParMatrix(comm, num_loc_rows, glob_nrows, glob_ncols,
+                             (int *)opI.data(), (HYPRE_Int *)opJ.data(),
+                             (double *)data.data(),
+                             (HYPRE_Int *)rowStarts2.data(),
+                             (HYPRE_Int *)colStarts2.data());
+}
+
 void EliminateBC(HypreParMatrix &A, HypreParMatrix &Ae,
                  const Array<int> &ess_dof_list,
                  const Vector &X, Vector &B)
@@ -1892,6 +2130,7 @@ HypreSmoother::HypreSmoother() : Solver()
 
    l1_norms = NULL;
    pos_l1_norms = false;
+   eig_est_cg_iter = 10;
    B = X = V = Z = NULL;
    X0 = X1 = NULL;
    fir_coeffs = NULL;
@@ -1899,7 +2138,7 @@ HypreSmoother::HypreSmoother() : Solver()
 
 HypreSmoother::HypreSmoother(HypreParMatrix &_A, int _type,
                              int _relax_times, double _relax_weight, double _omega,
-                             int _poly_order, double _poly_fraction)
+                             int _poly_order, double _poly_fraction, int _eig_est_cg_iter)
 {
    type = _type;
    relax_times = _relax_times;
@@ -1907,6 +2146,7 @@ HypreSmoother::HypreSmoother(HypreParMatrix &_A, int _type,
    omega = _omega;
    poly_order = _poly_order;
    poly_fraction = _poly_fraction;
+   eig_est_cg_iter = _eig_est_cg_iter;
 
    l1_norms = NULL;
    pos_l1_norms = false;
@@ -1929,10 +2169,12 @@ void HypreSmoother::SetSOROptions(double _relax_weight, double _omega)
    omega = _omega;
 }
 
-void HypreSmoother::SetPolyOptions(int _poly_order, double _poly_fraction)
+void HypreSmoother::SetPolyOptions(int _poly_order, double _poly_fraction,
+                                   int _eig_est_cg_iter)
 {
    poly_order = _poly_order;
    poly_fraction = _poly_fraction;
+   eig_est_cg_iter = _eig_est_cg_iter;
 }
 
 void HypreSmoother::SetTaubinOptions(double _lambda, double _mu,
@@ -2016,15 +2258,31 @@ void HypreSmoother::SetOperator(const Operator &op)
    if (type == 16)
    {
       poly_scale = 1;
-      hypre_ParCSRMaxEigEstimateCG(*A, poly_scale, 10,
-                                   &max_eig_est, &min_eig_est);
+      if (eig_est_cg_iter > 0)
+      {
+         hypre_ParCSRMaxEigEstimateCG(*A, poly_scale, eig_est_cg_iter,
+                                      &max_eig_est, &min_eig_est);
+      }
+      else
+      {
+         min_eig_est = 0;
+         hypre_ParCSRMaxEigEstimate(*A, poly_scale, &max_eig_est);
+      }
       Z = new HypreParVector(*A);
    }
    else if (type == 1001 || type == 1002)
    {
       poly_scale = 0;
-      hypre_ParCSRMaxEigEstimateCG(*A, poly_scale, 10,
-                                   &max_eig_est, &min_eig_est);
+      if (eig_est_cg_iter > 0)
+      {
+         hypre_ParCSRMaxEigEstimateCG(*A, poly_scale, eig_est_cg_iter,
+                                      &max_eig_est, &min_eig_est);
+      }
+      else
+      {
+         min_eig_est = 0;
+         hypre_ParCSRMaxEigEstimate(*A, poly_scale, &max_eig_est);
+      }
 
       // The Taubin and FIR polynomials are defined on [0, 2]
       max_eig_est /= 2;
diff --git a/linalg/hypre.hpp b/linalg/hypre.hpp
index d8041232c48..b742825bcda 100644
--- a/linalg/hypre.hpp
+++ b/linalg/hypre.hpp
@@ -570,6 +570,17 @@ HypreParMatrix * RAP(const HypreParMatrix *A, const HypreParMatrix *P);
 HypreParMatrix * RAP(const HypreParMatrix * Rt, const HypreParMatrix *A,
                      const HypreParMatrix *P);
 
+/// Returns a merged hypre matrix constructed from hypre matrix blocks.
+/** It is assumed that all block matrices use the same communicator, and the
+    block sizes are consistent in rows and columns. Rows and columns are
+    renumbered but not redistributed in parallel, e.g. the block rows owned by
+    each process remain on that process in the resulting matrix. Some blocks can
+    be NULL. Each block and the entire system can be rectangular. Scalability to
+    extremely large processor counts is limited by global MPI communication, see
+    GatherBlockOffsetData in hypre.cpp. */
+HypreParMatrix * HypreParMatrixFromBlocks(Array2D<HypreParMatrix*> &blocks,
+                                          Array2D<double> *blockCoeff=NULL);
+
 /** Eliminate essential BC specified by 'ess_dof_list' from the solution X to
     the r.h.s. B. Here A is a matrix with eliminated BC, while Ae is such that
     (A+Ae) is the original (Neumann) matrix before elimination. */
@@ -615,6 +626,8 @@ class HypreSmoother : public Solver
    double *l1_norms;
    /// If set, take absolute values of the computed l1_norms
    bool pos_l1_norms;
+   /// Number of CG iterations to determine eigenvalue estimates
+   int eig_est_cg_iter;
    /// Maximal eigenvalue estimate for polynomial smoothing
    double max_eig_est;
    /// Minimal eigenvalue estimate for polynomial smoothing
@@ -645,14 +658,17 @@ class HypreSmoother : public Solver
    HypreSmoother(HypreParMatrix &_A, int type = l1GS,
                  int relax_times = 1, double relax_weight = 1.0,
                  double omega = 1.0, int poly_order = 2,
-                 double poly_fraction = .3);
+                 double poly_fraction = .3, int eig_est_cg_iter = 10);
 
    /// Set the relaxation type and number of sweeps
    void SetType(HypreSmoother::Type type, int relax_times = 1);
    /// Set SOR-related parameters
    void SetSOROptions(double relax_weight, double omega);
    /// Set parameters for polynomial smoothing
-   void SetPolyOptions(int poly_order, double poly_fraction);
+   /** By default, 10 iterations of CG are used to estimate the eigenvalues.
+       Setting eig_est_cg_iter = 0 uses hypre's hypre_ParCSRMaxEigEstimate() instead. */
+   void SetPolyOptions(int poly_order, double poly_fraction,
+                       int eig_est_cg_iter = 10);
    /// Set parameters for Taubin's lambda-mu method
    void SetTaubinOptions(double lambda, double mu, int iter);
 
diff --git a/linalg/hypre_parcsr.cpp b/linalg/hypre_parcsr.cpp
index 126389b7df9..d6f0f1f39aa 100644
--- a/linalg/hypre_parcsr.cpp
+++ b/linalg/hypre_parcsr.cpp
@@ -17,26 +17,6 @@
 #include "hypre_parcsr.hpp"
 #include <limits>
 
-// Define macro wrappers for hypre_TAlloc, hypre_CTAlloc and hypre_TFree:
-// mfem_hypre_TAlloc, mfem_hypre_CTAlloc, and mfem_hypre_TFree, respectively.
-// Note: the same macros are defined in hypre.cpp.
-#if MFEM_HYPRE_VERSION < 21400
-
-#define mfem_hypre_TAlloc(type, size) hypre_TAlloc(type, size)
-#define mfem_hypre_CTAlloc(type, size) hypre_CTAlloc(type, size)
-#define mfem_hypre_TFree(ptr) hypre_TFree(ptr)
-
-#else // MFEM_HYPRE_VERSION >= 21400
-
-// See the notes about hypre 2.14.0 in hypre.cpp
-#define mfem_hypre_TAlloc(type, size) \
-   hypre_TAlloc(type, size, HYPRE_MEMORY_HOST)
-#define mfem_hypre_CTAlloc(type, size) \
-   hypre_CTAlloc(type, size, HYPRE_MEMORY_HOST)
-#define mfem_hypre_TFree(ptr) hypre_TFree(ptr, HYPRE_MEMORY_HOST)
-
-#endif // #if MFEM_HYPRE_VERSION < 21400
-
 namespace mfem
 {
 namespace internal
diff --git a/linalg/hypre_parcsr.hpp b/linalg/hypre_parcsr.hpp
index fe5795c7ded..458d874d69f 100644
--- a/linalg/hypre_parcsr.hpp
+++ b/linalg/hypre_parcsr.hpp
@@ -21,6 +21,56 @@
 
 #include "_hypre_parcsr_mv.h"
 
+// Define macro wrappers for hypre_TAlloc, hypre_CTAlloc and hypre_TFree:
+// mfem_hypre_TAlloc, mfem_hypre_CTAlloc, and mfem_hypre_TFree, respectively.
+// Note: these macros are used in hypre.cpp, hypre_parcsr.cpp, and perhaps
+// other locations in the future.
+#if MFEM_HYPRE_VERSION < 21400
+
+#define mfem_hypre_TAlloc(type, size) hypre_TAlloc(type, size)
+#define mfem_hypre_CTAlloc(type, size) hypre_CTAlloc(type, size)
+#define mfem_hypre_TFree(ptr) hypre_TFree(ptr)
+
+#else // MFEM_HYPRE_VERSION >= 21400
+
+#define mfem_hypre_TAlloc(type, size) \
+   hypre_TAlloc(type, size, HYPRE_MEMORY_HOST)
+#define mfem_hypre_CTAlloc(type, size) \
+   hypre_CTAlloc(type, size, HYPRE_MEMORY_HOST)
+#define mfem_hypre_TFree(ptr) hypre_TFree(ptr, HYPRE_MEMORY_HOST)
+
+// Notes regarding allocation and deallocation of hypre objects in 2.14.0
+//-----------------------------------------------------------------------
+//
+// 1. hypre_CSRMatrix: i, j, data, and rownnz use HYPRE_MEMORY_SHARED while the
+//    hypre_CSRMatrix structure uses HYPRE_MEMORY_HOST.
+//
+//    Note: the function HYPRE_CSRMatrixCreate creates the i array using
+//          HYPRE_MEMORY_HOST!
+//    Note: the functions hypre_CSRMatrixAdd and hypre_CSRMatrixMultiply create
+//          C_i using HYPRE_MEMORY_HOST!
+//
+// 2. hypre_Vector: data uses HYPRE_MEMORY_SHARED while the hypre_Vector
+//    structure uses HYPRE_MEMORY_HOST.
+//
+// 3. hypre_ParVector: the structure hypre_ParVector uses HYPRE_MEMORY_HOST;
+//    partitioning uses HYPRE_MEMORY_HOST.
+//
+// 4. hypre_ParCSRMatrix: the structure hypre_ParCSRMatrix uses
+//    HYPRE_MEMORY_HOST; col_map_offd, row_starts, col_starts, rowindices,
+//    rowvalues also use HYPRE_MEMORY_HOST.
+//
+//    Note: the function hypre_ParCSRMatrixToCSRMatrixAll allocates matrix_i
+//          using HYPRE_MEMORY_HOST!
+//
+// 5. The goal for the MFEM wrappers of hypre objects is to support only the
+//    standard hypre build case, i.e. when hypre is built without device support
+//    and all memory types correspond to host memory. In this case memory
+//    allocated with operator new can be used by hypre but (as usual) it must
+//    not be owned by hypre.
+
+#endif // #if MFEM_HYPRE_VERSION < 21400
+
 namespace mfem
 {
 
diff --git a/linalg/invariants.hpp b/linalg/invariants.hpp
index d491d2ba4b0..4280ceda139 100644
--- a/linalg/invariants.hpp
+++ b/linalg/invariants.hpp
@@ -593,7 +593,8 @@ class InvariantsEvaluator3D
       if (dont(HAVE_I3b_p))
       {
          eval_state |= HAVE_I3b_p;
-         I3b_p = sign_detJ*scalar_ops::pow(Get_I3b(), -2, 3);
+         const scalar_t i3b = Get_I3b();
+         I3b_p = sign_detJ*scalar_ops::pow(i3b, -2, 3);
       }
       return I3b_p;
    }
diff --git a/linalg/kernels.hpp b/linalg/kernels.hpp
index bebecdfbade..e2e18a46a0f 100644
--- a/linalg/kernels.hpp
+++ b/linalg/kernels.hpp
@@ -120,7 +120,7 @@ void Symmetrize(const int size, T *data)
 template<int dim, typename T>
 MFEM_HOST_DEVICE inline T Det(const T *data)
 {
-   return TDet<T>(ColumnMajorLayout2D<dim,dim>(), data);
+   return TDetHD<T>(ColumnMajorLayout2D<dim,dim>(), data);
 }
 
 /** @brief Return the inverse a matrix with given @a size and @a data into the
@@ -130,8 +130,8 @@ MFEM_HOST_DEVICE inline
 void CalcInverse(const T *data, T *inv_data)
 {
    typedef ColumnMajorLayout2D<dim,dim> layout_t;
-   const T det = TAdjDet<T>(layout_t(), data, layout_t(), inv_data);
-   TAssign<AssignOp::Mult>(layout_t(), inv_data, static_cast<T>(1.0)/det);
+   const T det = TAdjDetHD<T>(layout_t(), data, layout_t(), inv_data);
+   TAssignHD<AssignOp::Mult>(layout_t(), inv_data, static_cast<T>(1.0)/det);
 }
 
 /** @brief Compute C = A + alpha*B, where the matrices A, B and C are of size @a
diff --git a/linalg/matrix.hpp b/linalg/matrix.hpp
index 06416111bf1..efdf640306f 100644
--- a/linalg/matrix.hpp
+++ b/linalg/matrix.hpp
@@ -28,7 +28,7 @@ class Matrix : public Operator
 {
    friend class MatrixInverse;
 public:
-   //// Defines matrix diagonal policy upon elimination of rows and/or columns.
+   /// Defines matrix diagonal policy upon elimination of rows and/or columns.
    enum DiagonalPolicy
    {
       DIAG_ZERO, ///< Set the diagonal value to zero
diff --git a/linalg/ode.hpp b/linalg/ode.hpp
index bd988a1b350..ae6368d21a0 100644
--- a/linalg/ode.hpp
+++ b/linalg/ode.hpp
@@ -492,7 +492,7 @@ class SIASolver
    mutable Vector dq_;
 };
 
-// First Order Symplectic Integration Algorithm
+/// First Order Symplectic Integration Algorithm
 class SIA1Solver : public SIASolver
 {
 public:
@@ -500,7 +500,7 @@ class SIA1Solver : public SIASolver
    void Step(Vector &q, Vector &p, double &t, double &dt);
 };
 
-// Second Order Symplectic Integration Algorithm
+/// Second Order Symplectic Integration Algorithm
 class SIA2Solver : public SIASolver
 {
 public:
@@ -508,7 +508,7 @@ class SIA2Solver : public SIASolver
    void Step(Vector &q, Vector &p, double &t, double &dt);
 };
 
-// Variable order Symplectic Integration Algorithm (orders 1-4)
+/// Variable order Symplectic Integration Algorithm (orders 1-4)
 class SIAVSolver : public SIASolver
 {
 public:
diff --git a/linalg/operator.cpp b/linalg/operator.cpp
index b2a7f71b39f..57d942698e4 100644
--- a/linalg/operator.cpp
+++ b/linalg/operator.cpp
@@ -543,6 +543,35 @@ void RectangularConstrainedOperator::Mult(const Vector &x, Vector &y) const
    }
 }
 
+void RectangularConstrainedOperator::MultTranspose(const Vector &x,
+                                                   Vector &y) const
+{
+   const int trial_csz = trial_constraints.Size();
+   const int test_csz = test_constraints.Size();
+   if (test_csz == 0)
+   {
+      A->MultTranspose(x, y);
+   }
+   else
+   {
+      z = x;
+
+      auto idx = test_constraints.Read();
+      // Use read+write access - we are modifying sub-vector of z
+      auto d_z = z.ReadWrite();
+      MFEM_FORALL(i, test_csz, d_z[idx[i]] = 0.0;);
+
+      A->MultTranspose(z, y);
+   }
+
+   if (trial_csz != 0)
+   {
+      auto idx = trial_constraints.Read();
+      auto d_y = y.ReadWrite();
+      MFEM_FORALL(i, trial_csz, d_y[idx[i]] = 0.0;);
+   }
+}
+
 double PowerMethod::EstimateLargestEigenvalue(Operator& opr, Vector& v0,
                                               int numSteps, double tolerance, int seed)
 {
diff --git a/linalg/operator.hpp b/linalg/operator.hpp
index 0bb93ebf51e..d9c45f602fe 100644
--- a/linalg/operator.hpp
+++ b/linalg/operator.hpp
@@ -759,6 +759,7 @@ class RectangularConstrainedOperator : public Operator
        where the "_i" subscripts denote all the nonessential (boundary) trial
        indices and the "_j" subscript denotes the essential test indices */
    virtual void Mult(const Vector &x, Vector &y) const;
+   virtual void MultTranspose(const Vector &x, Vector &y) const;
    virtual ~RectangularConstrainedOperator() { if (own_A) { delete A; } }
 };
 
diff --git a/linalg/simd.hpp b/linalg/simd.hpp
new file mode 100644
index 00000000000..26df56f0da6
--- /dev/null
+++ b/linalg/simd.hpp
@@ -0,0 +1,100 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#ifndef MFEM_SIMD_HPP
+#define MFEM_SIMD_HPP
+
+#include "../config/tconfig.hpp"
+
+// --- AutoSIMD + specializations with intrinsics
+#include "simd/auto.hpp"
+#ifdef MFEM_USE_SIMD
+#if defined(__VSX__)
+#include "simd/vsx.hpp"
+#elif defined (__bgq__)
+#include "simd/qpx.hpp"
+#elif defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
+#include "simd/x86.hpp"
+#elif !defined(_MSC_VER)
+#warning Unknown SIMD architecture
+#else
+#pragma message("warning: Unknown SIMD architecture")
+#endif
+#endif
+
+// MFEM_SIMD_BYTES is the default SIMD size used by MFEM, see e.g. class
+// TBilinearForm and the default traits class AutoSIMDTraits.
+// MFEM_ALIGN_BYTES determines the padding used in TVector when its 'align'
+// template parameter is set to true -- it ensures that the size of such TVector
+// types is a multiple of MFEM_ALIGN_BYTES. MFEM_ALIGN_BYTES must be a multiple
+// of MFEM_SIMD_BYTES.
+#if !defined(MFEM_USE_SIMD)
+#define MFEM_SIMD_BYTES 8
+#define MFEM_ALIGN_BYTES 32
+#elif defined(__AVX512F__)
+#define MFEM_SIMD_BYTES 64
+#define MFEM_ALIGN_BYTES 64
+#elif defined(__AVX__) || defined(__VECTOR4DOUBLE__)
+#define MFEM_SIMD_BYTES 32
+#define MFEM_ALIGN_BYTES 32
+#elif defined(__SSE2__) || defined(__VSX__)
+#define MFEM_SIMD_BYTES 16
+#define MFEM_ALIGN_BYTES 32
+#else
+#define MFEM_SIMD_BYTES 8
+#define MFEM_ALIGN_BYTES 32
+#endif
+
+// derived macros
+#define MFEM_ROUNDUP(val,base) ((((val)+(base)-1)/(base))*(base))
+#define MFEM_ALIGN_SIZE(size,type) \
+   MFEM_ROUNDUP(size,(MFEM_ALIGN_BYTES)/sizeof(type))
+
+namespace mfem
+{
+
+template<typename complex_t, typename real_t>
+struct AutoSIMDTraits
+{
+   static const int block_size = MFEM_TEMPLATE_BLOCK_SIZE;
+
+   // Alignment for arrays of vcomplex_t and vreal_t
+   static const int align_bytes = MFEM_SIMD_BYTES;
+
+   static const int batch_size = 1;
+
+   static const int simd_size = MFEM_SIMD_BYTES/sizeof(real_t);
+
+   typedef AutoSIMD<complex_t, simd_size, MFEM_SIMD_BYTES> vcomplex_t;
+   typedef AutoSIMD<real_t, simd_size, MFEM_SIMD_BYTES> vreal_t;
+   typedef AutoSIMD<int, simd_size, simd_size*sizeof(int)> vint_t;
+};
+
+template<typename complex_t, typename real_t>
+struct NoSIMDTraits
+{
+   static const int block_size = MFEM_TEMPLATE_BLOCK_SIZE;
+
+   // Alignment for arrays of vcomplex_t and vreal_t
+   static const int align_bytes = sizeof(real_t);
+
+   static const int batch_size = 1;
+
+   static const int simd_size = 1;
+
+   typedef AutoSIMD<complex_t, simd_size, align_bytes> vcomplex_t;
+   typedef AutoSIMD<real_t, simd_size, align_bytes> vreal_t;
+   typedef AutoSIMD<int, simd_size, simd_size*sizeof(int)> vint_t;
+};
+
+} // mfem namespace
+
+#endif // MFEM_SIMD_HPP
diff --git a/linalg/simd/auto.hpp b/linalg/simd/auto.hpp
new file mode 100644
index 00000000000..7e8f3a3e02d
--- /dev/null
+++ b/linalg/simd/auto.hpp
@@ -0,0 +1,273 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#ifndef MFEM_SIMD_AUTO_HPP
+#define MFEM_SIMD_AUTO_HPP
+
+#include "../../config/tconfig.hpp"
+
+namespace mfem
+{
+
+// Use this macro as a workaround for astyle formatting issue with 'alignas'
+#define MFEM_AUTOSIMD_ALIGN__ alignas(align_bytes_)
+
+template <typename scalar_t, int S, int align_bytes_>
+struct MFEM_AUTOSIMD_ALIGN__ AutoSIMD
+{
+   typedef scalar_t scalar_type;
+   static const int size = S;
+   static const int align_bytes = align_bytes_;
+
+   scalar_t vec[size];
+
+   inline MFEM_ALWAYS_INLINE scalar_t &operator[](int i)
+   {
+      return vec[i];
+   }
+
+   inline MFEM_ALWAYS_INLINE const scalar_t &operator[](int i) const
+   {
+      return vec[i];
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator=(const AutoSIMD &v)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] = v[i]; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator=(const scalar_t &e)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] = e; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator+=(const AutoSIMD &v)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] += v[i]; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator+=(const scalar_t &e)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] += e; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator-=(const AutoSIMD &v)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] -= v[i]; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator-=(const scalar_t &e)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] -= e; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator*=(const AutoSIMD &v)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] *= v[i]; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator*=(const scalar_t &e)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] *= e; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator/=(const AutoSIMD &v)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] /= v[i]; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator/=(const scalar_t &e)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] /= e; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-() const
+   {
+      AutoSIMD r;
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { r[i] = -vec[i]; }
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator+(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { r[i] = vec[i] + v[i]; }
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator+(const scalar_t &e) const
+   {
+      AutoSIMD r;
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { r[i] = vec[i] + e; }
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { r[i] = vec[i] - v[i]; }
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-(const scalar_t &e) const
+   {
+      AutoSIMD r;
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { r[i] = vec[i] - e; }
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator*(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { r[i] = vec[i] * v[i]; }
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator*(const scalar_t &e) const
+   {
+      AutoSIMD r;
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { r[i] = vec[i] * e; }
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator/(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { r[i] = vec[i] / v[i]; }
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator/(const scalar_t &e) const
+   {
+      AutoSIMD r;
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { r[i] = vec[i] / e; }
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const AutoSIMD &v, const AutoSIMD &w)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] += v[i] * w[i]; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const AutoSIMD &v, const scalar_t &e)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] += v[i] * e; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const scalar_t &e, const AutoSIMD &v)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] += e * v[i]; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const AutoSIMD &v, const AutoSIMD &w)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] = v[i] * w[i]; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const AutoSIMD &v, const scalar_t &e)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] = v[i] * e; }
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const scalar_t &e, const AutoSIMD &v)
+   {
+      MFEM_VECTORIZE_LOOP
+      for (int i = 0; i < size; i++) { vec[i] = e * v[i]; }
+      return *this;
+   }
+};
+
+template <typename scalar_t, int S, int A>
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<scalar_t,S,A> operator+(const scalar_t &e,
+                                 const AutoSIMD<scalar_t,S,A> &v)
+{
+   AutoSIMD<scalar_t,S,A> r;
+   MFEM_VECTORIZE_LOOP
+   for (int i = 0; i < S; i++) { r[i] = e + v[i]; }
+   return r;
+}
+
+template <typename scalar_t, int S, int A>
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<scalar_t,S,A> operator-(const scalar_t &e,
+                                 const AutoSIMD<scalar_t,S,A> &v)
+{
+   AutoSIMD<scalar_t,S,A> r;
+   MFEM_VECTORIZE_LOOP
+   for (int i = 0; i < S; i++) { r[i] = e - v[i]; }
+   return r;
+}
+
+template <typename scalar_t, int S, int A>
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<scalar_t,S,A> operator*(const scalar_t &e,
+                                 const AutoSIMD<scalar_t,S,A> &v)
+{
+   AutoSIMD<scalar_t,S,A> r;
+   MFEM_VECTORIZE_LOOP
+   for (int i = 0; i < S; i++) { r[i] = e * v[i]; }
+   return r;
+}
+
+template <typename scalar_t, int S, int A>
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<scalar_t,S,A> operator/(const scalar_t &e,
+                                 const AutoSIMD<scalar_t,S,A> &v)
+{
+   AutoSIMD<scalar_t,S,A> r;
+   MFEM_VECTORIZE_LOOP
+   for (int i = 0; i < S; i++) { r[i] = e / v[i]; }
+   return r;
+}
+
+} // namespace mfem
+
+#endif // MFEM_SIMD_AUTO_HPP
diff --git a/linalg/simd/m128.hpp b/linalg/simd/m128.hpp
new file mode 100644
index 00000000000..eb3dbc0692d
--- /dev/null
+++ b/linalg/simd/m128.hpp
@@ -0,0 +1,254 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#ifndef MFEM_SIMD_M128_HPP
+#define MFEM_SIMD_M128_HPP
+
+#ifdef __SSE2__
+
+#include "../../config/tconfig.hpp"
+#if defined(__x86_64__)
+#include <x86intrin.h>
+#else // assuming MSVC with _M_X64 or _M_IX86
+#include <intrin.h>
+#endif
+
+namespace mfem
+{
+
+template <typename, int, int> struct AutoSIMD;
+
+template <> struct AutoSIMD<double,2,16>
+{
+   typedef double scalar_type;
+   static constexpr int size = 2;
+   static constexpr int align_bytes = 16;
+
+   union
+   {
+      __m128d m128d;
+      double vec[size];
+   };
+
+   inline MFEM_ALWAYS_INLINE double &operator[](int i)
+   {
+      return vec[i];
+   }
+
+   inline MFEM_ALWAYS_INLINE const double &operator[](int i) const
+   {
+      return vec[i];
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator=(const AutoSIMD &v)
+   {
+      m128d = v.m128d;
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator=(const double &e)
+   {
+      m128d = _mm_set1_pd(e);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator+=(const AutoSIMD &v)
+   {
+      m128d = _mm_add_pd(m128d,v.m128d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator+=(const double &e)
+   {
+      m128d = _mm_add_pd(m128d,_mm_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator-=(const AutoSIMD &v)
+   {
+      m128d = _mm_sub_pd(m128d,v.m128d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator-=(const double &e)
+   {
+      m128d = _mm_sub_pd(m128d,_mm_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator*=(const AutoSIMD &v)
+   {
+      m128d = _mm_mul_pd(m128d,v.m128d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator*=(const double &e)
+   {
+      m128d = _mm_mul_pd(m128d,_mm_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator/=(const AutoSIMD &v)
+   {
+      m128d = _mm_div_pd(m128d,v.m128d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator/=(const double &e)
+   {
+      m128d = _mm_div_pd(m128d,_mm_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-() const
+   {
+      AutoSIMD r;
+      r.m128d = _mm_xor_pd(_mm_set1_pd(-0.0), m128d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator+(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.m128d = _mm_add_pd(m128d,v.m128d);
+      return r;
+   }
+
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator+(const double &e) const
+   {
+      AutoSIMD r;
+      r.m128d = _mm_add_pd(m128d, _mm_set1_pd(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.m128d = _mm_sub_pd(m128d,v.m128d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-(const double &e) const
+   {
+      AutoSIMD r;
+      r.m128d = _mm_sub_pd(m128d, _mm_set1_pd(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator*(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.m128d = _mm_mul_pd(m128d,v.m128d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator*(const double &e) const
+   {
+      AutoSIMD r;
+      r.m128d = _mm_mul_pd(m128d, _mm_set1_pd(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator/(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.m128d = _mm_div_pd(m128d,v.m128d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator/(const double &e) const
+   {
+      AutoSIMD r;
+      r.m128d = _mm_div_pd(m128d, _mm_set1_pd(e));
+      return r;
+   }
+
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const AutoSIMD &v, const AutoSIMD &w)
+   {
+      m128d = _mm_add_pd(_mm_mul_pd(w.m128d,v.m128d),m128d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const AutoSIMD &v, const double &e)
+   {
+      m128d = _mm_add_pd(_mm_mul_pd(_mm_set1_pd(e),v.m128d),m128d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const double &e, const AutoSIMD &v)
+   {
+      m128d = _mm_add_pd(_mm_mul_pd(v.m128d,_mm_set1_pd(e)),m128d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const AutoSIMD &v, const AutoSIMD &w)
+   {
+      m128d = _mm_mul_pd(v.m128d,w.m128d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const AutoSIMD &v, const double &e)
+   {
+      m128d = _mm_mul_pd(v.m128d,_mm_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const double &e, const AutoSIMD &v)
+   {
+      m128d = _mm_mul_pd(_mm_set1_pd(e),v.m128d);
+      return *this;
+   }
+};
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,2,16> operator+(const double &e,
+                                const AutoSIMD<double,2,16> &v)
+{
+   AutoSIMD<double,2,16> r;
+   r.m128d = _mm_add_pd(_mm_set1_pd(e),v.m128d);
+   return r;
+}
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,2,16> operator-(const double &e,
+                                const AutoSIMD<double,2,16> &v)
+{
+   AutoSIMD<double,2,16> r;
+   r.m128d = _mm_sub_pd(_mm_set1_pd(e),v.m128d);
+   return r;
+}
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,2,16> operator*(const double &e,
+                                const AutoSIMD<double,2,16> &v)
+{
+   AutoSIMD<double,2,16> r;
+   r.m128d = _mm_mul_pd(_mm_set1_pd(e),v.m128d);
+   return r;
+}
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,2,16> operator/(const double &e,
+                                const AutoSIMD<double,2,16> &v)
+{
+   AutoSIMD<double,2,16> r;
+   r.m128d = _mm_div_pd(_mm_set1_pd(e),v.m128d);
+   return r;
+}
+
+} // namespace mfem
+
+#endif // __SSE2__
+
+#endif // MFEM_SIMD_M128_HPP
+
diff --git a/linalg/simd/m256.hpp b/linalg/simd/m256.hpp
new file mode 100644
index 00000000000..6bc8c42ef03
--- /dev/null
+++ b/linalg/simd/m256.hpp
@@ -0,0 +1,263 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#ifndef MFEM_SIMD_M256_HPP
+#define MFEM_SIMD_M256_HPP
+
+#ifdef __AVX__
+
+#include "../../config/tconfig.hpp"
+#if defined(__x86_64__)
+#include <x86intrin.h>
+#else // assuming MSVC with _M_X64 or _M_IX86
+#include <intrin.h>
+#endif
+
+namespace mfem
+{
+
+template <typename, int, int> struct AutoSIMD;
+
+template <> struct AutoSIMD<double,4,32>
+{
+   typedef double scalar_type;
+   static constexpr int size = 4;
+   static constexpr int align_bytes = 32;
+
+   union
+   {
+      __m256d m256d;
+      double vec[size];
+   };
+
+   inline MFEM_ALWAYS_INLINE double &operator[](int i)
+   {
+      return vec[i];
+   }
+
+   inline MFEM_ALWAYS_INLINE const double &operator[](int i) const
+   {
+      return vec[i];
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator=(const AutoSIMD &v)
+   {
+      m256d = v.m256d;
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator=(const double &e)
+   {
+      m256d = _mm256_set1_pd(e);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator+=(const AutoSIMD &v)
+   {
+      m256d = _mm256_add_pd(m256d,v.m256d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator+=(const double &e)
+   {
+      m256d = _mm256_add_pd(m256d,_mm256_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator-=(const AutoSIMD &v)
+   {
+      m256d = _mm256_sub_pd(m256d,v.m256d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator-=(const double &e)
+   {
+      m256d = _mm256_sub_pd(m256d,_mm256_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator*=(const AutoSIMD &v)
+   {
+      m256d = _mm256_mul_pd(m256d,v.m256d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator*=(const double &e)
+   {
+      m256d = _mm256_mul_pd(m256d,_mm256_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator/=(const AutoSIMD &v)
+   {
+      m256d = _mm256_div_pd(m256d,v.m256d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator/=(const double &e)
+   {
+      m256d = _mm256_div_pd(m256d,_mm256_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-() const
+   {
+      AutoSIMD r;
+      r.m256d = _mm256_xor_pd(_mm256_set1_pd(-0.0), m256d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator+(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.m256d = _mm256_add_pd(m256d,v.m256d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator+(const double &e) const
+   {
+      AutoSIMD r;
+      r.m256d = _mm256_add_pd(m256d, _mm256_set1_pd(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.m256d = _mm256_sub_pd(m256d,v.m256d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-(const double &e) const
+   {
+      AutoSIMD r;
+      r.m256d = _mm256_sub_pd(m256d, _mm256_set1_pd(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator*(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.m256d = _mm256_mul_pd(m256d,v.m256d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator*(const double &e) const
+   {
+      AutoSIMD r;
+      r.m256d = _mm256_mul_pd(m256d, _mm256_set1_pd(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator/(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.m256d = _mm256_div_pd(m256d,v.m256d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator/(const double &e) const
+   {
+      AutoSIMD r;
+      r.m256d = _mm256_div_pd(m256d, _mm256_set1_pd(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const AutoSIMD &v, const AutoSIMD &w)
+   {  // In-place multiply-add: *this += v * w (element-wise); returns *this.
+#if !defined(__FMA__) && !(defined(_MSC_VER) && defined(__AVX2__))
+      m256d = _mm256_add_pd(_mm256_mul_pd(w.m256d,v.m256d),m256d);
+#else // fused path: needs FMA3, which AVX2 alone does not imply on GCC/Clang
+      m256d = _mm256_fmadd_pd(w.m256d,v.m256d,m256d);
+#endif
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const AutoSIMD &v, const double &e)
+   {  // In-place multiply-add: *this += v * e, e broadcast; returns *this.
+#if !defined(__FMA__) && !(defined(_MSC_VER) && defined(__AVX2__))
+      m256d = _mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(e),v.m256d),m256d);
+#else // fused path: needs FMA3, which AVX2 alone does not imply on GCC/Clang
+      m256d = _mm256_fmadd_pd(_mm256_set1_pd(e),v.m256d,m256d);
+#endif
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const double &e, const AutoSIMD &v)
+   {  // In-place multiply-add: *this += e * v, e broadcast; returns *this.
+#if !defined(__FMA__) && !(defined(_MSC_VER) && defined(__AVX2__))
+      m256d = _mm256_add_pd(_mm256_mul_pd(v.m256d,_mm256_set1_pd(e)),m256d);
+#else // fused path: needs FMA3, which AVX2 alone does not imply on GCC/Clang
+      m256d = _mm256_fmadd_pd(v.m256d,_mm256_set1_pd(e),m256d);
+#endif
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const AutoSIMD &v, const AutoSIMD &w)
+   {
+      m256d = _mm256_mul_pd(v.m256d,w.m256d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const AutoSIMD &v, const double &e)
+   {
+      m256d = _mm256_mul_pd(v.m256d,_mm256_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const double &e, const AutoSIMD &v)
+   {
+      m256d = _mm256_mul_pd(_mm256_set1_pd(e),v.m256d);
+      return *this;
+   }
+};
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,4,32> operator+(const double &e,
+                                const AutoSIMD<double,4,32> &v)
+{
+   AutoSIMD<double,4,32> r;
+   r.m256d = _mm256_add_pd(_mm256_set1_pd(e),v.m256d);
+   return r;
+}
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,4,32> operator-(const double &e,
+                                const AutoSIMD<double,4,32> &v)
+{
+   AutoSIMD<double,4,32> r;
+   r.m256d = _mm256_sub_pd(_mm256_set1_pd(e),v.m256d);
+   return r;
+}
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,4,32> operator*(const double &e,
+                                const AutoSIMD<double,4,32> &v)
+{
+   AutoSIMD<double,4,32> r;
+   r.m256d = _mm256_mul_pd(_mm256_set1_pd(e),v.m256d);
+   return r;
+}
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,4,32> operator/(const double &e,
+                                const AutoSIMD<double,4,32> &v)
+{
+   AutoSIMD<double,4,32> r;
+   r.m256d = _mm256_div_pd(_mm256_set1_pd(e),v.m256d);
+   return r;
+}
+
+} // namespace mfem
+
+#endif // __AVX__
+
+#endif // MFEM_SIMD_M256_HPP
diff --git a/linalg/simd/m512.hpp b/linalg/simd/m512.hpp
new file mode 100644
index 00000000000..dec27db24ed
--- /dev/null
+++ b/linalg/simd/m512.hpp
@@ -0,0 +1,256 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#ifndef MFEM_SIMD_M512_HPP
+#define MFEM_SIMD_M512_HPP
+
+#ifdef __AVX512F__
+
+#include "../../config/tconfig.hpp"
+#if defined(__x86_64__)
+#include <x86intrin.h>
+#else // assuming MSVC with _M_X64 or _M_IX86
+#include <intrin.h>
+#endif
+
+
+namespace mfem
+{
+
+template <typename, int, int> struct AutoSIMD;
+
+template <> struct AutoSIMD<double,8,64>
+{
+   typedef double scalar_type;
+   static constexpr int size = 8;
+   static constexpr int align_bytes = 64;
+
+   union
+   {
+      __m512d m512d;
+      double vec[size];
+   };
+
+   inline MFEM_ALWAYS_INLINE double &operator[](int i)
+   {
+      return vec[i];
+   }
+
+   inline MFEM_ALWAYS_INLINE const double &operator[](int i) const
+   {
+      return vec[i];
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator=(const AutoSIMD &v)
+   {
+      m512d = v.m512d;
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator=(const double &e)
+   {
+      m512d = _mm512_set1_pd(e);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator+=(const AutoSIMD &v)
+   {
+      m512d = _mm512_add_pd(m512d,v.m512d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator+=(const double &e)
+   {
+      m512d = _mm512_add_pd(m512d,_mm512_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator-=(const AutoSIMD &v)
+   {
+      m512d = _mm512_sub_pd(m512d,v.m512d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator-=(const double &e)
+   {
+      m512d = _mm512_sub_pd(m512d,_mm512_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator*=(const AutoSIMD &v)
+   {
+      m512d = _mm512_mul_pd(m512d,v.m512d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator*=(const double &e)
+   {
+      m512d = _mm512_mul_pd(m512d,_mm512_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator/=(const AutoSIMD &v)
+   {
+      m512d = _mm512_div_pd(m512d,v.m512d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator/=(const double &e)
+   {
+      m512d = _mm512_div_pd(m512d,_mm512_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-() const
+   {
+      AutoSIMD r;
+#ifdef __AVX512DQ__
+      r.m512d = _mm512_xor_pd(_mm512_set1_pd(-0.0), m512d);
+#else
+      r.m512d = _mm512_sub_pd(_mm512_set1_pd(0.0), m512d);
+#endif
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator+(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.m512d = _mm512_add_pd(m512d,v.m512d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator+(const double &e) const
+   {
+      AutoSIMD r;
+      r.m512d = _mm512_add_pd(m512d, _mm512_set1_pd(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.m512d = _mm512_sub_pd(m512d,v.m512d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-(const double &e) const
+   {
+      AutoSIMD r;
+      r.m512d = _mm512_sub_pd(m512d, _mm512_set1_pd(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator*(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.m512d = _mm512_mul_pd(m512d,v.m512d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator*(const double &e) const
+   {
+      AutoSIMD r;
+      r.m512d = _mm512_mul_pd(m512d, _mm512_set1_pd(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator/(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.m512d = _mm512_div_pd(m512d,v.m512d);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator/(const double &e) const
+   {
+      AutoSIMD r;
+      r.m512d = _mm512_div_pd(m512d, _mm512_set1_pd(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const AutoSIMD &v, const AutoSIMD &w)
+   {
+      m512d = _mm512_fmadd_pd(w.m512d,v.m512d,m512d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const AutoSIMD &v, const double &e)
+   {
+      m512d = _mm512_fmadd_pd(_mm512_set1_pd(e),v.m512d,m512d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const double &e, const AutoSIMD &v)
+   {
+      m512d = _mm512_fmadd_pd(v.m512d,_mm512_set1_pd(e),m512d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const AutoSIMD &v, const AutoSIMD &w)
+   {
+      m512d = _mm512_mul_pd(v.m512d,w.m512d);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const AutoSIMD &v, const double &e)
+   {
+      m512d = _mm512_mul_pd(v.m512d,_mm512_set1_pd(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const double &e, const AutoSIMD &v)
+   {
+      m512d = _mm512_mul_pd(_mm512_set1_pd(e),v.m512d);
+      return *this;
+   }
+};
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,8,64> operator+(const double &e,
+                                const AutoSIMD<double,8,64> &v)
+{
+   AutoSIMD<double,8,64> r;
+   r.m512d = _mm512_add_pd(_mm512_set1_pd(e),v.m512d);
+   return r;
+}
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,8,64> operator-(const double &e,
+                                const AutoSIMD<double,8,64> &v)
+{
+   AutoSIMD<double,8,64> r;
+   r.m512d = _mm512_sub_pd(_mm512_set1_pd(e),v.m512d);
+   return r;
+}
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,8,64> operator*(const double &e,
+                                const AutoSIMD<double,8,64> &v)
+{
+   AutoSIMD<double,8,64> r;
+   r.m512d = _mm512_mul_pd(_mm512_set1_pd(e),v.m512d);
+   return r;
+}
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,8,64> operator/(const double &e,
+                                const AutoSIMD<double,8,64> &v)
+{
+   AutoSIMD<double,8,64> r;
+   r.m512d = _mm512_div_pd(_mm512_set1_pd(e),v.m512d);
+   return r;
+}
+
+} // namespace mfem
+
+#endif // __AVX512F__
+
+#endif // MFEM_SIMD_M512_HPP
diff --git a/linalg/simd/qpx.hpp b/linalg/simd/qpx.hpp
new file mode 100644
index 00000000000..b0c37f9d408
--- /dev/null
+++ b/linalg/simd/qpx.hpp
@@ -0,0 +1,17 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#ifndef MFEM_SIMD_QPX_HPP
+#define MFEM_SIMD_QPX_HPP
+
+#include "qpx256.hpp"
+
+#endif // MFEM_SIMD_QPX_HPP
diff --git a/linalg/simd/qpx256.hpp b/linalg/simd/qpx256.hpp
new file mode 100644
index 00000000000..7705fa7f83a
--- /dev/null
+++ b/linalg/simd/qpx256.hpp
@@ -0,0 +1,241 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#ifndef MFEM_SIMD_QPX_256_HPP
+#define MFEM_SIMD_QPX_256_HPP
+
+#ifdef __bgq__
+
+#include "../../config/tconfig.hpp"
+#include <builtins.h>
+
+namespace mfem
+{
+
+template <typename,int,int> struct AutoSIMD;
+
+template <> struct AutoSIMD<double,4,32>
+{
+   typedef double scalar_type;
+   static constexpr int size = 4;
+   static constexpr int align_bytes = 32;
+
+   union
+   {
+      vector4double vd;
+      double vec[size];
+   };
+
+   inline __ATTRS_ai double &operator[](int i) { return vec[i]; }
+
+   inline __ATTRS_ai const double &operator[](int i) const { return vec[i]; }
+
+   inline __ATTRS_ai AutoSIMD &operator=(const AutoSIMD &v)
+   {
+      vd = v.vd;
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &operator=(const double &e)
+   {
+      vd = vec_splats(e);
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &operator+=(const AutoSIMD &v)
+   {
+      vd = vec_add(vd,v.vd);
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &operator+=(const double &e)
+   {
+      vd = vec_add(vd,vec_splats(e));
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &operator-=(const AutoSIMD &v)
+   {
+      vd = vec_sub(vd,v.vd);
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &operator-=(const double &e)
+   {
+      vd = vec_sub(vd,vec_splats(e));
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &operator*=(const AutoSIMD &v)
+   {
+      vd = vec_mul(vd,v.vd);
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &operator*=(const double &e)
+   {
+      vd = vec_mul(vd,vec_splats(e));
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &operator/=(const AutoSIMD &v)
+   {
+      vd = vec_swdiv(vd,v.vd);
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &operator/=(const double &e)
+   {
+      vd = vec_swdiv(vd,vec_splats(e));
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD operator-() const
+   {
+      AutoSIMD r;
+      r.vd = vec_neg(vd);
+      return r;
+   }
+
+   inline __ATTRS_ai AutoSIMD operator+(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.vd = vec_add(vd,v.vd);
+      return r;
+   }
+
+   inline __ATTRS_ai AutoSIMD operator+(const double &e) const
+   {
+      AutoSIMD r;
+      r.vd = vec_add(vd, vec_splats(e));
+      return r;
+   }
+
+   inline __ATTRS_ai AutoSIMD operator-(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.vd = vec_sub(vd,v.vd);
+      return r;
+   }
+
+   inline __ATTRS_ai AutoSIMD operator-(const double &e) const
+   {
+      AutoSIMD r;
+      r.vd = vec_sub(vd, vec_splats(e));
+      return r;
+   }
+
+   inline __ATTRS_ai AutoSIMD operator*(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.vd = vec_mul(vd,v.vd);
+      return r;
+   }
+
+   inline __ATTRS_ai AutoSIMD operator*(const double &e) const
+   {
+      AutoSIMD r;
+      r.vd = vec_mul(vd, vec_splats(e));
+      return r;
+   }
+
+   inline __ATTRS_ai AutoSIMD operator/(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.vd = vec_swdiv(vd,v.vd);
+      return r;
+   }
+
+   inline __ATTRS_ai AutoSIMD operator/(const double &e) const
+   {
+      AutoSIMD r;
+      r.vd = vec_swdiv(vd, vec_splats(e));
+      return r;
+   }
+
+   inline __ATTRS_ai AutoSIMD &fma(const AutoSIMD &v, const AutoSIMD &w)
+   {
+      vd = vec_madd(v.vd,w.vd,vd); // vec_madd(a,b,c) = a*b + c: *this += v*w
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &fma(const AutoSIMD &v, const double &e)
+   {
+      vd = vec_madd(v.vd,vec_splats(e),vd);
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &fma(const double &e, const AutoSIMD &v)
+   {
+      vd = vec_madd(vec_splats(e),v.vd,vd);
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &mul(const AutoSIMD &v, const AutoSIMD &w)
+   {
+      vd = vec_mul(v.vd,w.vd);
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &mul(const AutoSIMD &v, const double &e)
+   {
+      vd = vec_mul(v.vd,vec_splats(e));
+      return *this;
+   }
+
+   inline __ATTRS_ai AutoSIMD &mul(const double &e, const AutoSIMD &v)
+   {
+      vd = vec_mul(vec_splats(e),v.vd);
+      return *this;
+   }
+};
+
+inline __ATTRS_ai
+AutoSIMD<double,4,32> operator+(const double &e,
+                                const AutoSIMD<double,4,32> &v)
+{
+   AutoSIMD<double,4,32> r;
+   r.vd = vec_add(vec_splats(e),v.vd);
+   return r;
+}
+
+inline __ATTRS_ai
+AutoSIMD<double,4,32> operator-(const double &e,
+                                const AutoSIMD<double,4,32> &v)
+{
+   AutoSIMD<double,4,32> r;
+   r.vd = vec_sub(vec_splats(e),v.vd);
+   return r;
+}
+
+inline __ATTRS_ai
+AutoSIMD<double,4,32> operator*(const double &e,
+                                const AutoSIMD<double,4,32> &v)
+{
+   AutoSIMD<double,4,32> r;
+   r.vd = vec_mul(vec_splats(e),v.vd);
+   return r;
+}
+
+inline __ATTRS_ai
+AutoSIMD<double,4,32> operator/(const double &e,
+                                const AutoSIMD<double,4,32> &v)
+{
+   AutoSIMD<double,4,32> r;
+   r.vd = vec_swdiv(vec_splats(e),v.vd);
+   return r;
+}
+
+} // namespace mfem
+
+#endif // __bgq__
+
+#endif // MFEM_SIMD_QPX_256_HPP
diff --git a/linalg/simd/vsx.hpp b/linalg/simd/vsx.hpp
new file mode 100644
index 00000000000..638da44dcff
--- /dev/null
+++ b/linalg/simd/vsx.hpp
@@ -0,0 +1,17 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#ifndef MFEM_SIMD_VSX_HPP
+#define MFEM_SIMD_VSX_HPP
+
+#include "vsx128.hpp"
+
+#endif // MFEM_SIMD_VSX_HPP
diff --git a/linalg/simd/vsx128.hpp b/linalg/simd/vsx128.hpp
new file mode 100644
index 00000000000..175884edfce
--- /dev/null
+++ b/linalg/simd/vsx128.hpp
@@ -0,0 +1,247 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#ifndef MFEM_SIMD_VSX128_HPP
+#define MFEM_SIMD_VSX128_HPP
+
+#ifdef __VSX__
+
+#include "../../config/tconfig.hpp"
+#include <altivec.h>
+
+namespace mfem
+{
+
+template <typename,int,int> struct AutoSIMD;
+
+template <> struct AutoSIMD<double,2,16>
+{
+   typedef double scalar_type;
+   static constexpr int size = 2;
+   static constexpr int align_bytes = 16;
+
+   union
+   {
+      vector double vd;
+      double vec[size];
+   };
+
+   inline MFEM_ALWAYS_INLINE double &operator[](int i)
+   {
+      return vec[i];
+   }
+
+   inline MFEM_ALWAYS_INLINE const double &operator[](int i) const
+   {
+      return vec[i];
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator=(const AutoSIMD &v)
+   {
+      vd = v.vd;
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator=(const double &e)
+   {
+      vd = vec_splats(e);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator+=(const AutoSIMD &v)
+   {
+      vd = vec_add(vd,v.vd);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator+=(const double &e)
+   {
+      vd = vec_add(vd,vec_splats(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator-=(const AutoSIMD &v)
+   {
+      vd = vec_sub(vd,v.vd);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator-=(const double &e)
+   {
+      vd = vec_sub(vd,vec_splats(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator*=(const AutoSIMD &v)
+   {
+      vd = vec_mul(vd,v.vd);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator*=(const double &e)
+   {
+      vd = vec_mul(vd,vec_splats(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator/=(const AutoSIMD &v)
+   {
+      vd = vec_div(vd,v.vd);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &operator/=(const double &e)
+   {
+      vd = vec_div(vd,vec_splats(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-() const
+   {
+      AutoSIMD r;
+      r.vd = vec_neg(vd);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator+(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.vd = vec_add(vd,v.vd);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator+(const double &e) const
+   {
+      AutoSIMD r;
+      r.vd = vec_add(vd, vec_splats(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.vd = vec_sub(vd,v.vd);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator-(const double &e) const
+   {
+      AutoSIMD r;
+      r.vd = vec_sub(vd, vec_splats(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator*(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.vd = vec_mul(vd,v.vd);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator*(const double &e) const
+   {
+      AutoSIMD r;
+      r.vd = vec_mul(vd, vec_splats(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator/(const AutoSIMD &v) const
+   {
+      AutoSIMD r;
+      r.vd = vec_div(vd,v.vd);
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD operator/(const double &e) const
+   {
+      AutoSIMD r;
+      r.vd = vec_div(vd, vec_splats(e));
+      return r;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const AutoSIMD &v, const AutoSIMD &w)
+   {
+      vd = vec_madd(v.vd,w.vd,vd); // vec_madd(a,b,c) = a*b + c: *this += v*w
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const AutoSIMD &v, const double &e)
+   {
+      vd = vec_madd(v.vd,vec_splats(e),vd);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &fma(const double &e, const AutoSIMD &v)
+   {
+      vd = vec_madd(vec_splats(e),v.vd,vd);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const AutoSIMD &v, const AutoSIMD &w)
+   {
+      vd = vec_mul(v.vd,w.vd);
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const AutoSIMD &v, const double &e)
+   {
+      vd = vec_mul(v.vd,vec_splats(e));
+      return *this;
+   }
+
+   inline MFEM_ALWAYS_INLINE AutoSIMD &mul(const double &e, const AutoSIMD &v)
+   {
+      vd = vec_mul(vec_splats(e),v.vd);
+      return *this;
+   }
+};
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,2,16> operator+(const double &e,
+                                const AutoSIMD<double,2,16> &v)
+{
+   AutoSIMD<double,2,16> r;
+   r.vd = vec_add(vec_splats(e),v.vd);
+   return r;
+}
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,2,16> operator-(const double &e,
+                                const AutoSIMD<double,2,16> &v)
+{
+   AutoSIMD<double,2,16> r;
+   r.vd = vec_sub(vec_splats(e),v.vd);
+   return r;
+}
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,2,16> operator*(const double &e,
+                                const AutoSIMD<double,2,16> &v)
+{
+   AutoSIMD<double,2,16> r;
+   r.vd = vec_mul(vec_splats(e),v.vd);
+   return r;
+}
+
+inline MFEM_ALWAYS_INLINE
+AutoSIMD<double,2,16> operator/(const double &e,
+                                const AutoSIMD<double,2,16> &v)
+{
+   AutoSIMD<double,2,16> r;
+   r.vd = vec_div(vec_splats(e),v.vd);
+   return r;
+}
+
+} // namespace mfem
+
+#endif // __VSX__
+
+#endif // MFEM_SIMD_VSX128_HPP
diff --git a/linalg/simd/x86.hpp b/linalg/simd/x86.hpp
new file mode 100644
index 00000000000..c13e67db59b
--- /dev/null
+++ b/linalg/simd/x86.hpp
@@ -0,0 +1,21 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#ifndef MFEM_SIMD_X86_HPP
+#define MFEM_SIMD_X86_HPP
+
+#include "m128.hpp"
+
+#include "m256.hpp"
+
+#include "m512.hpp"
+
+#endif // MFEM_SIMD_X86_HPP
diff --git a/linalg/solvers.cpp b/linalg/solvers.cpp
index fbc5f581114..c889fa35590 100644
--- a/linalg/solvers.cpp
+++ b/linalg/solvers.cpp
@@ -105,6 +105,15 @@ void IterativeSolver::SetOperator(const Operator &op)
    }
 }
 
+void IterativeSolver::Monitor(int it, double norm, const Vector& r,
+                              const Vector& x, bool final) const
+{
+   if (monitor != nullptr) // nothing to report to when no monitor is attached
+   {
+      monitor->MonitorResidual(it, norm, r, final);
+      monitor->MonitorSolution(it, norm, x, final);
+   }
+}
 
 OperatorJacobiSmoother::OperatorJacobiSmoother(const BilinearForm &a,
                                                const Array<int> &ess_tdofs,
@@ -555,6 +564,7 @@ void CGSolver::Mult(const Vector &b, Vector &x) const
       mfem::out << "   Iteration : " << setw(3) << 0 << "  (B r, r) = "
                 << nom << (print_level == 3 ? " ...\n" : "\n");
    }
+   Monitor(0, nom, r, x);
 
    if (nom < 0.0)
    {
@@ -633,6 +643,8 @@ void CGSolver::Mult(const Vector &b, Vector &x) const
                    << betanom << '\n';
       }
 
+      Monitor(i, betanom, r, x);
+
       if (betanom < r0)
       {
          if (print_level == 2)
@@ -701,6 +713,8 @@ void CGSolver::Mult(const Vector &b, Vector &x) const
                 << pow (betanom/nom0, 0.5/final_iter) << '\n';
    }
    final_norm = sqrt(betanom);
+
+   Monitor(final_iter, final_norm, r, x, true);
 }
 
 void CG(const Operator &A, const Vector &b, Vector &x,
@@ -848,6 +862,8 @@ void GMRESSolver::Mult(const Vector &b, Vector &x) const
                 << "  ||B r|| = " << beta << (print_level == 3 ? " ...\n" : "\n");
    }
 
+   Monitor(0, beta, r, x);
+
    v.SetSize(m+1, NULL);
 
    for (j = 1; j <= max_iter; )
@@ -906,6 +922,8 @@ void GMRESSolver::Mult(const Vector &b, Vector &x) const
                       << "   Iteration : " << setw(3) << j
                       << "  ||B r|| = " << resid << '\n';
          }
+
+         Monitor(j, resid, r, x);
       }
 
       if (print_level == 1 && j <= max_iter)
@@ -955,6 +973,9 @@ void GMRESSolver::Mult(const Vector &b, Vector &x) const
    {
       mfem::out << "GMRES: No convergence!\n";
    }
+
+   Monitor(final_iter, final_norm, r, x, true);
+
    for (i = 0; i < v.Size(); i++)
    {
       delete v[i];
@@ -1000,6 +1021,8 @@ void FGMRESSolver::Mult(const Vector &b, Vector &x) const
                 << "  || r || = " << beta << endl;
    }
 
+   Monitor(0, beta, r, x);
+
    Array<Vector*> v(m+1);
    Array<Vector*> z(m+1);
    for (i= 0; i<=m; i++)
@@ -1060,6 +1083,7 @@ void FGMRESSolver::Mult(const Vector &b, Vector &x) const
                       << "   Iteration : " << setw(3) << j
                       << "  || r || = " << resid << endl;
          }
+         Monitor(j, resid, r, x, resid <= final_norm);
 
          if (resid <= final_norm)
          {
@@ -1192,6 +1216,8 @@ void BiCGSTABSolver::Mult(const Vector &b, Vector &x) const
       mfem::out << "   Iteration : " << setw(3) << 0
                 << "   ||r|| = " << resid << '\n';
 
+   Monitor(0, resid, r, x);
+
    tol_goal = std::max(resid*rel_tol, abs_tol);
 
    if (resid <= tol_goal)
@@ -1210,6 +1236,9 @@ void BiCGSTABSolver::Mult(const Vector &b, Vector &x) const
          if (print_level >= 0)
             mfem::out << "   Iteration : " << setw(3) << i
                       << "   ||r|| = " << resid << '\n';
+
+         Monitor(i, resid, r, x);
+
          final_norm = resid;
          final_iter = i;
          converged = 0;
@@ -1252,6 +1281,7 @@ void BiCGSTABSolver::Mult(const Vector &b, Vector &x) const
       if (print_level >= 0)
          mfem::out << "   Iteration : " << setw(3) << i
                    << "   ||s|| = " << resid;
+      Monitor(i, resid, r, x);
       if (prec)
       {
          prec->Mult(s, shat);  //  shat = M^{-1} * s
@@ -1273,6 +1303,7 @@ void BiCGSTABSolver::Mult(const Vector &b, Vector &x) const
       {
          mfem::out << "   ||r|| = " << resid << '\n';
       }
+      Monitor(i, resid, r, x);
       if (resid < tol_goal)
       {
          final_norm = resid;
@@ -1378,6 +1409,7 @@ void MINRESSolver::Mult(const Vector &b, Vector &x) const
       mfem::out << "MINRES: iteration " << setw(3) << 0 << ": ||r||_B = "
                 << eta << (print_level == 3 ? " ...\n" : "\n");
    }
+   Monitor(0, eta, *z, x);
 
    for (it = 1; it <= max_iter; it++)
    {
@@ -1445,6 +1477,7 @@ void MINRESSolver::Mult(const Vector &b, Vector &x) const
          mfem::out << "MINRES: iteration " << setw(3) << it << ": ||r||_B = "
                    << fabs(eta) << '\n';
       }
+      Monitor(it, fabs(eta), *z, x);
 
       if (prec)
       {
@@ -1469,6 +1502,7 @@ void MINRESSolver::Mult(const Vector &b, Vector &x) const
    {
       mfem::out << "MINRES: number of iterations: " << final_iter << '\n';
    }
+   Monitor(final_iter, final_norm, *z, x, true);
 #if 0
    if (print_level >= 1)
    {
@@ -1567,6 +1601,7 @@ void NewtonSolver::Mult(const Vector &b, Vector &x) const
          }
          mfem::out << '\n';
       }
+      Monitor(it, norm, r, x);
 
       if (norm <= norm_goal)
       {
@@ -2110,17 +2145,37 @@ void MinimumDiscardedFillOrdering(SparseMatrix &C, Array<int> &p)
    }
 
    std::vector<double> w(n, 0.0);
-   // Compute the discarded-fill weights
    for (int k=0; k<n; ++k)
    {
+      // Find all neighbors i of k
       for (int ii=I[k]; ii<I[k+1]; ++ii)
       {
-         double C_ki = V[ii];
+         int i = J[ii];
+         // Find value of (i,k); stays 0 if the pattern has no (i,k) entry
+         double C_ik = 0.0;
+         for (int kk=I[i]; kk<I[i+1]; ++kk)
+         {
+            if (J[kk] == k)
+            {
+               C_ik = V[kk];
+               break;
+            }
+         }
          for (int jj=I[k]; jj<I[k+1]; ++jj)
          {
-            if (jj == ii) { continue; }
-            double C_jk = V[jj];
-            w[k] += pow(C_jk*C_ki, 2);
+            int j = J[jj];
+            if (j == k) { continue; }
+            double C_kj = V[jj];
+            bool ij_exists = false;
+            for (int jj2=I[i]; jj2<I[i+1]; ++jj2)
+            {
+               if (J[jj2] == j)
+               {
+                  ij_exists = true;
+                  break;
+               }
+            }
+            if (!ij_exists) { w[k] += pow(C_ik*C_kj,2); }
          }
       }
       w[k] = sqrt(w[k]);
@@ -2130,10 +2185,10 @@ void MinimumDiscardedFillOrdering(SparseMatrix &C, Array<int> &p)
 
    // Compute ordering
    p.SetSize(n);
-   for (int i=0; i<n; ++i)
+   for (int ii=0; ii<n; ++ii)
    {
       int pi = w_heap.pop();
-      p[n-1-i] = pi;
+      p[ii] = pi;
       w[pi] = -1;
       for (int kk=I[pi]; kk<I[pi+1]; ++kk)
       {
@@ -2141,15 +2196,36 @@ void MinimumDiscardedFillOrdering(SparseMatrix &C, Array<int> &p)
          if (w_heap.picked(k)) { continue; }
          // Recompute weight
          w[k] = 0.0;
-         for (int ii=I[k]; ii<I[k+1]; ++ii)
+         // Find all neighbors i of k
+         for (int ii2=I[k]; ii2<I[k+1]; ++ii2)
          {
-            if (w_heap.picked(J[ii])) { continue; }
-            double C_ki = V[ii];
+            int i = J[ii2];
+            if (w_heap.picked(i)) { continue; }
+            // Find value of (i,k); stays 0 if the pattern has no (i,k) entry
+            double C_ik = 0.0;
+            for (int kk2=I[i]; kk2<I[i+1]; ++kk2)
+            {
+               if (J[kk2] == k)
+               {
+                  C_ik = V[kk2];
+                  break;
+               }
+            }
             for (int jj=I[k]; jj<I[k+1]; ++jj)
             {
-               if (jj == ii || w_heap.picked(J[jj])) { continue; }
-               double C_jk = V[jj];
-               w[k] += pow(C_jk*C_ki, 2);
+               int j = J[jj];
+               if (j == k || w_heap.picked(j)) { continue; }
+               double C_kj = V[jj];
+               bool ij_exists = false;
+               for (int jj2=I[i]; jj2<I[i+1]; ++jj2)
+               {
+                  if (J[jj2] == j)
+                  {
+                     ij_exists = true;
+                     break;
+                  }
+               }
+               if (!ij_exists) { w[k] += pow(C_ik*C_kj,2); }
             }
          }
          w[k] = sqrt(w[k]);
diff --git a/linalg/solvers.hpp b/linalg/solvers.hpp
index c0a87440fb6..904a55f6a96 100644
--- a/linalg/solvers.hpp
+++ b/linalg/solvers.hpp
@@ -30,6 +30,27 @@ namespace mfem
 
 class BilinearForm;
 
+/// Base class for iterative solver monitors; both hooks are no-ops by default
+class IterativeSolverMonitor
+{
+public:
+   IterativeSolverMonitor() {}
+
+   virtual ~IterativeSolverMonitor() {}
+
+   /// Monitor the residual vector r at iteration it; does nothing by default
+   virtual void MonitorResidual(int it, double norm, const Vector &r,
+                                bool final)
+   {
+   }
+
+   /// Monitor the solution vector x at iteration it; does nothing by default
+   virtual void MonitorSolution(int it, double norm, const Vector &x,
+                                bool final)
+   {
+   }
+};
+
 /// Abstract base class for iterative solver
 class IterativeSolver : public Solver
 {
@@ -42,6 +63,7 @@ class IterativeSolver : public Solver
 protected:
    const Operator *oper;
    Solver *prec;
+   IterativeSolverMonitor *monitor = nullptr;
 
    int max_iter, print_level;
    double rel_tol, abs_tol;
@@ -52,6 +74,8 @@ class IterativeSolver : public Solver
 
    double Dot(const Vector &x, const Vector &y) const;
    double Norm(const Vector &x) const { return sqrt(Dot(x, x)); }
+   void Monitor(int it, double norm, const Vector& r, const Vector& x,
+                bool final=false) const;
 
 public:
    IterativeSolver();
@@ -74,6 +98,9 @@ class IterativeSolver : public Solver
 
    /// Also calls SetOperator for the preconditioner if there is one
    virtual void SetOperator(const Operator &op);
+
+   /// Set the solver monitor; only the address of m is kept (no copy is made)
+   void SetMonitor(IterativeSolverMonitor &m) { monitor = &m; }
 };
 
 
diff --git a/linalg/superlu.cpp b/linalg/superlu.cpp
index a22cbe64e1f..88de9000592 100644
--- a/linalg/superlu.cpp
+++ b/linalg/superlu.cpp
@@ -24,6 +24,18 @@
 #error "SuperLUDist has been built with 64bit integers. This is not supported"
 #endif
 
+#if SUPERLU_DIST_MAJOR_VERSION > 6 ||                                   \
+  (SUPERLU_DIST_MAJOR_VERSION == 6 && SUPERLU_DIST_MINOR_VERSION > 2)
+#define ScalePermstruct_t dScalePermstruct_t
+#define LUstruct_t dLUstruct_t
+#define SOLVEstruct_t dSOLVEstruct_t
+#define ScalePermstructFree dScalePermstructFree
+#define Destroy_LU dDestroy_LU
+#define LUstructFree dLUstructFree
+#define LUstructInit dLUstructInit
+#endif
+
+
 using namespace std;
 
 namespace mfem
diff --git a/linalg/tmatrix.hpp b/linalg/tmatrix.hpp
index 33eeaa34d23..ca34453d1fc 100644
--- a/linalg/tmatrix.hpp
+++ b/linalg/tmatrix.hpp
@@ -22,6 +22,16 @@ namespace mfem
 
 // Matrix-matrix products
 
+namespace internal
+{
+
+template <typename T> struct entry_type { typedef typename T::data_type type; };
+
+template <typename T> struct entry_type<T*> { typedef T type; };
+
+} // namespace mfem::internal
+
+
 // C  {=|+=}  A.B -- simple version (no blocks)
 template <bool Add,
           typename A_layout_t, typename A_data_t,
@@ -46,23 +56,26 @@ void sMult_AB(const A_layout_t &A_layout, const A_data_t &A_data,
    MFEM_FLOPS_ADD(Add ? 2*A1*A2*B2 : 2*A1*A2*B2-A1*B2);
    for (int b2 = 0; b2 < B2; b2++)
    {
-      for (int s = 0; s < A2; s++)
+      for (int a1 = 0; a1 < A1; a1++)
       {
-         for (int a1 = 0; a1 < A1; a1++)
+         typename internal::entry_type<C_data_t>::type c_a1_b2;
+         if (Add)
          {
-            if (!Add && s == 0)
-            {
-               // C(a1,b2) = A(a1,s) * B(s,b2);
-               C_data[C_layout.ind(a1,b2)] =
-                  A_data[A_layout.ind(a1,s)] * B_data[B_layout.ind(s,b2)];
-            }
-            else
-            {
-               // C(a1,b2) += A(a1,s) * B(s,b2);
-               C_data[C_layout.ind(a1,b2)] +=
-                  A_data[A_layout.ind(a1,s)] * B_data[B_layout.ind(s,b2)];
-            }
+            // C(a1,b2) += A(a1,0) * B(0,b2);
+            c_a1_b2 = C_data[C_layout.ind(a1,b2)];
+            c_a1_b2.fma(A_data[A_layout.ind(a1,0)], B_data[B_layout.ind(0,b2)]);
          }
+         else
+         {
+            // C(a1,b2) = A(a1,0) * B(0,b2);
+            c_a1_b2.mul(A_data[A_layout.ind(a1,0)], B_data[B_layout.ind(0,b2)]);
+         }
+         for (int s = 1; s < A2; s++)
+         {
+            // C(a1,b2) += A(a1,s) * B(s,b2);
+            c_a1_b2.fma(A_data[A_layout.ind(a1,s)], B_data[B_layout.ind(s,b2)]);
+         }
+         C_data[C_layout.ind(a1,b2)] = c_a1_b2;
       }
    }
 }
@@ -268,7 +281,6 @@ struct MatrixOps<1,1>
    template <typename scalar_t,
              typename A_layout_t, typename A_data_t,
              typename B_layout_t, typename B_data_t>
-   MFEM_HOST_DEVICE
    static inline scalar_t AdjDet(const A_layout_t &a, const A_data_t &A,
                                  const B_layout_t &b, B_data_t &B)
    {
@@ -282,7 +294,6 @@ struct MatrixOps<2,2>
 {
    // Compute det(A).
    template <typename scalar_t, typename layout_t, typename data_t>
-   MFEM_HOST_DEVICE
    static inline scalar_t Det(const layout_t &a, const data_t &A)
    {
       MFEM_FLOPS_ADD(3);
@@ -290,6 +301,16 @@ struct MatrixOps<2,2>
               A[a.ind(1,0)]*A[a.ind(0,1)]);
    }
 
+   // Compute det(A), host+device version.
+   template <typename scalar_t, typename layout_t, typename data_t>
+   MFEM_HOST_DEVICE
+   static inline scalar_t DetHD(const layout_t &a, const data_t &A)
+   {
+      MFEM_FLOPS_ADD(3);
+      return (A[a.ind(0,0)]*A[a.ind(1,1)] -
+              A[a.ind(1,0)]*A[a.ind(0,1)]);
+   }
+
    // Compute det(A). Batched version: D[i] {=,+=,*=} det(A[i,*,*])
    template <AssignOp::Type Op, typename A_layout_t, typename A_data_t,
              typename D_data_t>
@@ -308,7 +329,6 @@ struct MatrixOps<2,2>
    template <typename scalar_t,
              typename A_layout_t, typename A_data_t,
              typename B_layout_t, typename B_data_t>
-   MFEM_HOST_DEVICE
    static inline void Adjugate(const A_layout_t &a, const A_data_t &A,
                                const B_layout_t &b, B_data_t &B)
    {
@@ -318,11 +338,24 @@ struct MatrixOps<2,2>
       B[b.ind(1,1)] =  A[a.ind(0,0)];
    }
 
-   // Compute adj(A) and det(A).
+   // Compute B = adj(A), host+device version.
    template <typename scalar_t,
              typename A_layout_t, typename A_data_t,
              typename B_layout_t, typename B_data_t>
    MFEM_HOST_DEVICE
+   static inline void AdjugateHD(const A_layout_t &a, const A_data_t &A,
+                                 const B_layout_t &b, B_data_t &B)
+   {
+      B[b.ind(0,0)] =  A[a.ind(1,1)];
+      B[b.ind(0,1)] = -A[a.ind(0,1)];
+      B[b.ind(1,0)] = -A[a.ind(1,0)];
+      B[b.ind(1,1)] =  A[a.ind(0,0)];
+   }
+
+   // Compute adj(A) and det(A).
+   template <typename scalar_t,
+             typename A_layout_t, typename A_data_t,
+             typename B_layout_t, typename B_data_t>
    static inline scalar_t AdjDet(const A_layout_t &a, const A_data_t &A,
                                  const B_layout_t &b, B_data_t &B)
    {
@@ -330,6 +363,18 @@ struct MatrixOps<2,2>
       return Det<scalar_t>(a, A);
    }
 
+   // Compute adj(A) and det(A), host+device version.
+   template <typename scalar_t,
+             typename A_layout_t, typename A_data_t,
+             typename B_layout_t, typename B_data_t>
+   MFEM_HOST_DEVICE
+   static inline scalar_t AdjDetHD(const A_layout_t &a, const A_data_t &A,
+                                   const B_layout_t &b, B_data_t &B)
+   {
+      AdjugateHD<scalar_t>(a, A, b, B);
+      return DetHD<scalar_t>(a, A);
+   }
+
    template <bool symm> struct Symm;
 };
 
@@ -367,7 +412,6 @@ struct MatrixOps<3,3>
 {
    // Compute det(A).
    template <typename scalar_t, typename layout_t, typename data_t>
-   MFEM_HOST_DEVICE
    static inline scalar_t Det(const layout_t &a, const data_t &A)
    {
       MFEM_FLOPS_ADD(14);
@@ -379,6 +423,20 @@ struct MatrixOps<3,3>
                              A[a.ind(1,1)]*A[a.ind(0,2)]));
    }
 
+   // Compute det(A), host+device version.
+   template <typename scalar_t, typename layout_t, typename data_t>
+   MFEM_HOST_DEVICE
+   static inline scalar_t DetHD(const layout_t &a, const data_t &A)
+   {
+      MFEM_FLOPS_ADD(14);
+      return (A[a.ind(0,0)]*(A[a.ind(1,1)]*A[a.ind(2,2)] -
+                             A[a.ind(2,1)]*A[a.ind(1,2)]) -
+              A[a.ind(1,0)]*(A[a.ind(0,1)]*A[a.ind(2,2)] -
+                             A[a.ind(2,1)]*A[a.ind(0,2)]) +
+              A[a.ind(2,0)]*(A[a.ind(0,1)]*A[a.ind(1,2)] -
+                             A[a.ind(1,1)]*A[a.ind(0,2)]));
+   }
+
    // Compute det(A). Batched version: D[i] {=,+=,*=} det(A[i,*,*])
    template <AssignOp::Type Op, typename A_layout_t, typename A_data_t,
              typename D_data_t>
@@ -403,7 +461,6 @@ struct MatrixOps<3,3>
    template <typename scalar_t,
              typename A_layout_t, typename A_data_t,
              typename B_layout_t, typename B_data_t>
-   MFEM_HOST_DEVICE
    static inline void Adjugate(const A_layout_t &a, const A_data_t &A,
                                const B_layout_t &b, B_data_t &B)
    {
@@ -419,11 +476,30 @@ struct MatrixOps<3,3>
       B[b.ind(2,2)] = A[a.ind(0,0)]*A[a.ind(1,1)] - A[a.ind(0,1)]*A[a.ind(1,0)];
    }
 
-   // Compute adj(A) and det(A).
+   // Compute B = adj(A), host+device version.
    template <typename scalar_t,
              typename A_layout_t, typename A_data_t,
              typename B_layout_t, typename B_data_t>
    MFEM_HOST_DEVICE
+   static inline void AdjugateHD(const A_layout_t &a, const A_data_t &A,
+                                 const B_layout_t &b, B_data_t &B)
+   {
+      MFEM_FLOPS_ADD(27);
+      B[b.ind(0,0)] = A[a.ind(1,1)]*A[a.ind(2,2)] - A[a.ind(1,2)]*A[a.ind(2,1)];
+      B[b.ind(0,1)] = A[a.ind(0,2)]*A[a.ind(2,1)] - A[a.ind(0,1)]*A[a.ind(2,2)];
+      B[b.ind(0,2)] = A[a.ind(0,1)]*A[a.ind(1,2)] - A[a.ind(0,2)]*A[a.ind(1,1)];
+      B[b.ind(1,0)] = A[a.ind(1,2)]*A[a.ind(2,0)] - A[a.ind(1,0)]*A[a.ind(2,2)];
+      B[b.ind(1,1)] = A[a.ind(0,0)]*A[a.ind(2,2)] - A[a.ind(0,2)]*A[a.ind(2,0)];
+      B[b.ind(1,2)] = A[a.ind(0,2)]*A[a.ind(1,0)] - A[a.ind(0,0)]*A[a.ind(1,2)];
+      B[b.ind(2,0)] = A[a.ind(1,0)]*A[a.ind(2,1)] - A[a.ind(1,1)]*A[a.ind(2,0)];
+      B[b.ind(2,1)] = A[a.ind(0,1)]*A[a.ind(2,0)] - A[a.ind(0,0)]*A[a.ind(2,1)];
+      B[b.ind(2,2)] = A[a.ind(0,0)]*A[a.ind(1,1)] - A[a.ind(0,1)]*A[a.ind(1,0)];
+   }
+
+   // Compute adj(A) and det(A).
+   template <typename scalar_t,
+             typename A_layout_t, typename A_data_t,
+             typename B_layout_t, typename B_data_t>
    static inline scalar_t AdjDet(const A_layout_t &a, const A_data_t &A,
                                  const B_layout_t &b, B_data_t &B)
    {
@@ -434,6 +510,21 @@ struct MatrixOps<3,3>
               A[a.ind(2,0)]*B[b.ind(0,2)]);
    }
 
+   // Compute adj(A) and det(A), host+device version.
+   template <typename scalar_t,
+             typename A_layout_t, typename A_data_t,
+             typename B_layout_t, typename B_data_t>
+   MFEM_HOST_DEVICE
+   static inline scalar_t AdjDetHD(const A_layout_t &a, const A_data_t &A,
+                                   const B_layout_t &b, B_data_t &B)
+   {
+      MFEM_FLOPS_ADD(5);
+      AdjugateHD<scalar_t>(a, A, b, B);
+      return (A[a.ind(0,0)]*B[b.ind(0,0)] +
+              A[a.ind(1,0)]*B[b.ind(0,1)] +
+              A[a.ind(2,0)]*B[b.ind(0,2)]);
+   }
+
    template <bool symm> struct Symm;
 };
 
@@ -480,7 +571,6 @@ struct MatrixOps<3,3>::Symm<false>
 
 // Compute the determinant of a (small) matrix: det(A).
 template <typename scalar_t, typename layout_t, typename data_t>
-MFEM_HOST_DEVICE
 inline scalar_t TDet(const layout_t &a, const data_t &A)
 {
    MFEM_STATIC_ASSERT(layout_t::rank == 2, "invalid rank");
@@ -493,11 +583,25 @@ inline scalar_t TDet(const layout_t &a, const data_t &A)
 #endif
 }
 
+// Compute the determinant of a (small) matrix: det(A). Host+device version.
+template <typename scalar_t, typename layout_t, typename data_t>
+MFEM_HOST_DEVICE
+inline scalar_t TDetHD(const layout_t &a, const data_t &A)
+{
+   MFEM_STATIC_ASSERT(layout_t::rank == 2, "invalid rank");
+#if !defined(__xlC__) || (__xlC__ >= 0x0d00)
+   return internal::MatrixOps<layout_t::dim_1,layout_t::dim_2>::
+          template DetHD<scalar_t>(a, A);
+#else
+   return internal::MatrixOps<layout_t::dim_1,layout_t::dim_2>::
+          DetHD<scalar_t>(a, A);
+#endif
+}
+
 // Compute the determinants of a set of (small) matrices: D[i] = det(A[i,*,*]).
 // The layout of A is (M x N1 x N2) and the size of D is M.
 template <AssignOp::Type Op, typename A_layout_t, typename A_data_t,
           typename D_data_t>
-MFEM_HOST_DEVICE
 inline void TDet(const A_layout_t &a, const A_data_t &A, D_data_t &D)
 {
    MFEM_STATIC_ASSERT(A_layout_t::rank == 3, "invalid rank");
@@ -528,7 +632,6 @@ inline void TAdjugate(const A_layout_t &a, const A_data_t &A,
 template <typename scalar_t,
           typename A_layout_t, typename A_data_t,
           typename B_layout_t, typename B_data_t>
-MFEM_HOST_DEVICE
 inline scalar_t TAdjDet(const A_layout_t &a, const A_data_t &A,
                         const B_layout_t &b, B_data_t &B)
 {
@@ -538,6 +641,21 @@ inline scalar_t TAdjDet(const A_layout_t &a, const A_data_t &A,
           template AdjDet<scalar_t>(a, A, b, B);
 }
 
+// Compute the adjugate and the determinant of a (small) matrix: B = adj(A),
+// return det(A). Host+device version.
+template <typename scalar_t,
+          typename A_layout_t, typename A_data_t,
+          typename B_layout_t, typename B_data_t>
+MFEM_HOST_DEVICE
+inline scalar_t TAdjDetHD(const A_layout_t &a, const A_data_t &A,
+                          const B_layout_t &b, B_data_t &B)
+{
+   MFEM_STATIC_ASSERT(A_layout_t::rank == 2 && B_layout_t::rank == 2,
+                      "invalid ranks");
+   return internal::MatrixOps<A_layout_t::dim_1,A_layout_t::dim_2>::
+          template AdjDetHD<scalar_t>(a, A, b, B);
+}
+
 } // namespace mfem
 
 #endif // MFEM_TEMPLATE_MATRIX
diff --git a/linalg/ttensor.hpp b/linalg/ttensor.hpp
index 05f2ff49c46..a991503ae12 100644
--- a/linalg/ttensor.hpp
+++ b/linalg/ttensor.hpp
@@ -13,6 +13,7 @@
 #define MFEM_TEMPLATE_TENSOR
 
 #include "../config/tconfig.hpp"
+#include "../linalg/simd.hpp"
 #include "../general/tassign.hpp"
 #include "tlayout.hpp"
 #include "tmatrix.hpp"
@@ -38,9 +39,8 @@ struct TensorOps<1> // rank = 1
    // Assign: A {=,+=,*=} scalar_value
    template <AssignOp::Type Op, typename A_layout_t, typename A_data_t,
              typename scalar_t>
-   MFEM_HOST_DEVICE
    static void Assign(const A_layout_t &A_layout, A_data_t &A_data,
-                      scalar_t value)
+                      const scalar_t value)
    {
       MFEM_STATIC_ASSERT(A_layout_t::rank == 1, "invalid rank");
       for (int i1 = 0; i1 < A_layout_t::dim_1; i1++)
@@ -49,6 +49,20 @@ struct TensorOps<1> // rank = 1
       }
    }
 
+   // Assign: A {=,+=,*=} scalar_value, host+device version
+   template <AssignOp::Type Op, typename A_layout_t, typename A_data_t,
+             typename scalar_t>
+   MFEM_HOST_DEVICE
+   static void AssignHD(const A_layout_t &A_layout, A_data_t &A_data,
+                        const scalar_t value)
+   {
+      MFEM_STATIC_ASSERT(A_layout_t::rank == 1, "invalid rank");
+      for (int i1 = 0; i1 < A_layout_t::dim_1; i1++)
+      {
+         mfem::AssignHD<Op>(A_data[A_layout.ind(i1)], value);
+      }
+   }
+
    // Assign: A {=,+=,*=} B
    template <AssignOp::Type Op,
              typename A_layout_t, typename A_data_t,
@@ -73,7 +87,6 @@ struct TensorOps<2> // rank = 2
    // Assign: A {=,+=,*=} scalar_value
    template <AssignOp::Type Op, typename A_layout_t, typename A_data_t,
              typename scalar_t>
-   MFEM_HOST_DEVICE
    static void Assign(const A_layout_t &A_layout, A_data_t &A_data,
                       scalar_t value)
    {
@@ -87,6 +100,23 @@ struct TensorOps<2> // rank = 2
       }
    }
 
+   // Assign: A {=,+=,*=} scalar_value, host+device version
+   template <AssignOp::Type Op, typename A_layout_t, typename A_data_t,
+             typename scalar_t>
+   MFEM_HOST_DEVICE
+   static void AssignHD(const A_layout_t &A_layout, A_data_t &A_data,
+                        scalar_t value)
+   {
+      MFEM_STATIC_ASSERT(A_layout_t::rank == 2, "invalid rank");
+      for (int i2 = 0; i2 < A_layout_t::dim_2; i2++)
+      {
+         for (int i1 = 0; i1 < A_layout_t::dim_1; i1++)
+         {
+            mfem::AssignHD<Op>(A_data[A_layout.ind(i1,i2)], value);
+         }
+      }
+   }
+
    // Assign: A {=,+=,*=} B
    template <AssignOp::Type Op,
              typename A_layout_t, typename A_data_t,
@@ -220,14 +250,25 @@ struct TensorOps<4> // rank = 4
 // Tensor or sub-tensor assign function: A {=,+=,*=} scalar_value.
 template <AssignOp::Type Op, typename A_layout_t, typename A_data_t,
           typename scalar_t>
-MFEM_HOST_DEVICE
 inline void TAssign(const A_layout_t &A_layout, A_data_t &A_data,
-                    scalar_t value)
+                    const scalar_t value)
 {
    internal::TensorOps<A_layout_t::rank>::
    template Assign<Op>(A_layout, A_data, value);
 }
 
+// Tensor or sub-tensor assign function: A {=,+=,*=} scalar_value.
+// Host+device version.
+template <AssignOp::Type Op, typename A_layout_t, typename A_data_t,
+          typename scalar_t>
+MFEM_HOST_DEVICE
+inline void TAssignHD(const A_layout_t &A_layout, A_data_t &A_data,
+                      const scalar_t value)
+{
+   internal::TensorOps<A_layout_t::rank>::
+   template AssignHD<Op>(A_layout, A_data, value);
+}
+
 // Tensor assign function: A {=,+=,*=} B that allows different input and output
 // layouts. With suitable layouts this function can be used to permute
 // (transpose) tensors, extract sub-tensors, etc.
@@ -256,8 +297,8 @@ struct TVector
    typedef StridedLayout1D<S,1> layout_type;
    static const layout_type layout;
 
-   MFEM_HOST_DEVICE data_t &operator[](int i) { return data[i]; }
-   MFEM_HOST_DEVICE const data_t &operator[](int i) const { return data[i]; }
+   data_t &operator[](int i) { return data[i]; }
+   const data_t &operator[](int i) const { return data[i]; }
 
    template <AssignOp::Type Op>
    void Assign(const data_t d)
diff --git a/linalg/vector.hpp b/linalg/vector.hpp
index efa485d8bbc..eabe335b6a4 100644
--- a/linalg/vector.hpp
+++ b/linalg/vector.hpp
@@ -281,21 +281,48 @@ class Vector
    /// v = median(v,lo,hi) entrywise.  Implementation assumes lo <= hi.
    void median(const Vector &lo, const Vector &hi);
 
+   /// Extract entries listed in @a dofs to the output Vector @a elemvect.
+   /** Negative dof values cause the -dof-1 position in @a elemvect to receive
+       the -val from this Vector. */
    void GetSubVector(const Array<int> &dofs, Vector &elemvect) const;
+
+   /// Extract entries listed in @a dofs to the output array @a elem_data.
+   /** Negative dof values cause the -dof-1 position in @a elem_data to receive
+       the -val from this Vector. */
    void GetSubVector(const Array<int> &dofs, double *elem_data) const;
 
-   /// Set the entries listed in `dofs` to the given `value`.
+   /// Set the entries listed in @a dofs to the given @a value.
+   /** Negative dof values cause the -dof-1 position in this Vector to receive
+       the -value. */
    void SetSubVector(const Array<int> &dofs, const double value);
+
+   /** @brief Set the entries listed in @a dofs to the values given in the @a
+       elemvect Vector. Negative dof values cause the -dof-1 position in this
+       Vector to receive the -val from @a elemvect. */
    void SetSubVector(const Array<int> &dofs, const Vector &elemvect);
+
+   /** @brief Set the entries listed in @a dofs to the values given in the @a
+       elem_data array. Negative dof values cause the -dof-1 position in this
+       Vector to receive the -val from @a elem_data. */
    void SetSubVector(const Array<int> &dofs, double *elem_data);
 
-   /// Add (element) subvector to the vector.
+   /** @brief Add elements of the @a elemvect Vector to the entries listed in @a
+       dofs. Negative dof values cause the -dof-1 position in this Vector to add
+       the -val from @a elemvect. */
    void AddElementVector(const Array<int> & dofs, const Vector & elemvect);
+
+   /** @brief Add elements of the @a elem_data array to the entries listed in @a
+       dofs. Negative dof values cause the -dof-1 position in this Vector to add
+       the -val from @a elem_data. */
    void AddElementVector(const Array<int> & dofs, double *elem_data);
+
+   /** @brief Add @a a times the elements of the @a elemvect Vector to the
+       entries listed in @a dofs. Negative dof values cause the -dof-1 position
+       in this Vector to add the -a*val from @a elemvect. */
    void AddElementVector(const Array<int> & dofs, const double a,
                          const Vector & elemvect);
 
-   /// Set all vector entries NOT in the 'dofs' array to the given 'val'.
+   /// Set all vector entries NOT in the @a dofs Array to the given @a val.
    void SetSubVectorComplement(const Array<int> &dofs, const double val);
 
    /// Prints vector to stream out.
diff --git a/makefile b/makefile
index 15684d1b7e1..8cabd8b2551 100644
--- a/makefile
+++ b/makefile
@@ -120,7 +120,7 @@ EXAMPLE_TEST_DIRS := examples
 MINIAPP_SUBDIRS = common electromagnetics meshing navier performance tools toys nurbs gslib
 MINIAPP_DIRS := $(addprefix miniapps/,$(MINIAPP_SUBDIRS))
 MINIAPP_TEST_DIRS := $(filter-out %/common,$(MINIAPP_DIRS))
-MINIAPP_USE_COMMON := $(addprefix miniapps/,electromagnetics tools toys)
+MINIAPP_USE_COMMON := $(addprefix miniapps/,electromagnetics meshing tools toys)
 
 EM_DIRS = $(EXAMPLE_DIRS) $(MINIAPP_DIRS)
 
@@ -203,28 +203,28 @@ CXXFLAGS ?= $(OPTIM_FLAGS)
 
 # MPI configuration
 ifneq ($(MFEM_USE_MPI),YES)
-   CXX_OR_MPICXX = $(CXX)
+   MFEM_HOST_CXX = $(CXX)
    PKGS_NEED_MPI = SUPERLU STRUMPACK PETSC PUMI
    $(foreach mpidep,$(PKGS_NEED_MPI),$(if $(MFEM_USE_$(mpidep):NO=),\
      $(warning *** [MPI is OFF] setting MFEM_USE_$(mpidep) = NO)\
      $(eval override MFEM_USE_$(mpidep)=NO),))
 else
-   CXX_OR_MPICXX = $(MPICXX)
+   MFEM_HOST_CXX = $(MPICXX)
    INCFLAGS += $(HYPRE_OPT)
    ALL_LIBS += $(HYPRE_LIB)
 endif
-ALL_LIBS += $(GSLIB_FPT_LIB)
 
 # Default configuration
 ifeq ($(MFEM_USE_CUDA)$(MFEM_USE_HIP),NONO)
-   MFEM_CXX ?= $(CXX_OR_MPICXX)
+   MFEM_CXX ?= $(MFEM_HOST_CXX)
+   MFEM_HOST_CXX := $(MFEM_CXX)
    XCOMPILER = $(CXX_XCOMPILER)
    XLINKER   = $(CXX_XLINKER)
 endif
 
 ifeq ($(MFEM_USE_CUDA),YES)
    MFEM_CXX ?= $(CUDA_CXX)
-   CXXFLAGS += $(CUDA_FLAGS) -ccbin $(CXX_OR_MPICXX)
+   CXXFLAGS += $(CUDA_FLAGS) -ccbin $(MFEM_HOST_CXX)
    XCOMPILER = $(CUDA_XCOMPILER)
    XLINKER   = $(CUDA_XLINKER)
    # CUDA_OPT and CUDA_LIB are added below
@@ -238,6 +238,7 @@ endif
 ifeq ($(MFEM_USE_HIP),YES)
    MFEM_CXX ?= $(HIP_CXX)
    ALL_LIBS += $(HIP_FLAGS)
+   # TODO: set XCOMPILER and XLINKER
    # HIP_OPT and HIP_LIB are added below
    # Compatibility test against MFEM_USE_CUDA
    ifeq ($(MFEM_USE_CUDA),YES)
@@ -322,14 +323,15 @@ MFEM_DEFINES = MFEM_VERSION MFEM_VERSION_STRING MFEM_GIT_STRING MFEM_USE_MPI\
  MFEM_USE_SUPERLU MFEM_USE_STRUMPACK MFEM_USE_GNUTLS\
  MFEM_USE_NETCDF MFEM_USE_PETSC MFEM_USE_MPFR MFEM_USE_SIDRE MFEM_USE_CONDUIT\
  MFEM_USE_PUMI MFEM_USE_HIOP MFEM_USE_GSLIB MFEM_USE_CUDA MFEM_USE_HIP\
- MFEM_USE_OCCA MFEM_USE_CEED MFEM_USE_RAJA MFEM_USE_UMPIRE MFEM_SOURCE_DIR\
- MFEM_INSTALL_DIR
+ MFEM_USE_OCCA MFEM_USE_CEED MFEM_USE_RAJA MFEM_USE_UMPIRE MFEM_USE_SIMD\
+ MFEM_USE_ADIOS2 MFEM_SOURCE_DIR MFEM_INSTALL_DIR
 
 # List of makefile variables that will be written to config.mk:
-MFEM_CONFIG_VARS = MFEM_CXX MFEM_CPPFLAGS MFEM_CXXFLAGS MFEM_INC_DIR\
- MFEM_TPLFLAGS MFEM_INCFLAGS MFEM_PICFLAG MFEM_FLAGS MFEM_LIB_DIR MFEM_EXT_LIBS\
- MFEM_LIBS MFEM_LIB_FILE MFEM_STATIC MFEM_SHARED MFEM_BUILD_TAG MFEM_PREFIX\
- MFEM_CONFIG_EXTRA MFEM_MPIEXEC MFEM_MPIEXEC_NP MFEM_MPI_NP MFEM_TEST_MK
+MFEM_CONFIG_VARS = MFEM_CXX MFEM_HOST_CXX MFEM_CPPFLAGS MFEM_CXXFLAGS\
+ MFEM_INC_DIR MFEM_TPLFLAGS MFEM_INCFLAGS MFEM_PICFLAG MFEM_FLAGS MFEM_LIB_DIR\
+ MFEM_EXT_LIBS MFEM_LIBS MFEM_LIB_FILE MFEM_STATIC MFEM_SHARED MFEM_BUILD_TAG\
+ MFEM_PREFIX MFEM_CONFIG_EXTRA MFEM_MPIEXEC MFEM_MPIEXEC_NP MFEM_MPI_NP\
+ MFEM_TEST_MK
 
 # Config vars: values of the form @VAL@ are replaced by $(VAL) in config.mk
 MFEM_CPPFLAGS  ?= $(CPPFLAGS)
@@ -390,11 +392,8 @@ ifneq (,$(filter install,$(MAKECMDGOALS)))
 endif
 
 # Source dirs in logical order
-DIRS = general linalg mesh fem fem/libceed
+DIRS = general linalg linalg/simd mesh fem fem/libceed
 SOURCE_FILES = $(foreach dir,$(DIRS),$(wildcard $(SRC)$(dir)/*.cpp))
-ADIOS2_FILES = $(SRC)general/adios2stream.h $(SRC)general/adios2stream.cpp \
- $(SRC)fem/adios2datacollection.hpp $(SRC)fem/adios2datacollection.cpp
-SOURCE_FILES := $(filter-out $(ADIOS2_FILES),$(SOURCE_FILES))
 RELSRC_FILES = $(patsubst $(SRC)%,%,$(SOURCE_FILES))
 OBJECT_FILES = $(patsubst $(SRC)%,$(BLD)%,$(SOURCE_FILES:.cpp=.o))
 OKL_DIRS = fem
@@ -644,7 +643,10 @@ status info:
 	$(info MFEM_USE_OCCA          = $(MFEM_USE_OCCA))
 	$(info MFEM_USE_CEED          = $(MFEM_USE_CEED))
 	$(info MFEM_USE_UMPIRE        = $(MFEM_USE_UMPIRE))
+	$(info MFEM_USE_SIMD          = $(MFEM_USE_SIMD))
+	$(info MFEM_USE_ADIOS2        = $(MFEM_USE_ADIOS2))
 	$(info MFEM_CXX               = $(value MFEM_CXX))
+	$(info MFEM_HOST_CXX          = $(value MFEM_HOST_CXX))
 	$(info MFEM_CPPFLAGS          = $(value MFEM_CPPFLAGS))
 	$(info MFEM_CXXFLAGS          = $(value MFEM_CXXFLAGS))
 	$(info MFEM_TPLFLAGS          = $(value MFEM_TPLFLAGS))
diff --git a/mesh/hexahedron.hpp b/mesh/hexahedron.hpp
index 2bd984ab982..a85d0345ddc 100644
--- a/mesh/hexahedron.hpp
+++ b/mesh/hexahedron.hpp
@@ -68,7 +68,7 @@ class Hexahedron : public Element
    virtual ~Hexahedron() { }
 };
 
-extern TriLinear3DFiniteElement HexahedronFE;
+extern class TriLinear3DFiniteElement HexahedronFE;
 
 }
 
diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp
index a49b33e947b..7de13c750e0 100644
--- a/mesh/mesh.cpp
+++ b/mesh/mesh.cpp
@@ -337,6 +337,7 @@ void Mesh::GetElementTransformation(int i, IsoparametricTransformation *ElTr)
 {
    ElTr->Attribute = GetAttribute(i);
    ElTr->ElementNo = i;
+   ElTr->ElementType = ElementTransformation::ELEMENT;
    if (Nodes == NULL)
    {
       GetPointMatrix(i, ElTr->GetPointMat());
@@ -367,6 +368,7 @@ void Mesh::GetElementTransformation(int i, const Vector &nodes,
 {
    ElTr->Attribute = GetAttribute(i);
    ElTr->ElementNo = i;
+   ElTr->ElementType = ElementTransformation::ELEMENT;
    DenseMatrix &pm = ElTr->GetPointMat();
    nodes.HostRead();
    if (Nodes == NULL)
@@ -420,6 +422,7 @@ void Mesh::GetBdrElementTransformation(int i, IsoparametricTransformation* ElTr)
 {
    ElTr->Attribute = GetBdrAttribute(i);
    ElTr->ElementNo = i; // boundary element number
+   ElTr->ElementType = ElementTransformation::BDR_ELEMENT;
    DenseMatrix &pm = ElTr->GetPointMat();
    if (Nodes == NULL)
    {
@@ -462,6 +465,7 @@ void Mesh::GetBdrElementTransformation(int i, IsoparametricTransformation* ElTr)
 
          IntegrationRule eir(face_el->GetDof());
          FaceElemTr.Loc1.Transf.ElementNo = elem_id;
+         FaceElemTr.Loc1.Transf.ElementType = ElementTransformation::ELEMENT;
          FaceElemTr.Loc1.Transform(face_el->GetNodes(), eir);
          Nodes->GetVectorValues(FaceElemTr.Loc1.Transf, eir, pm);
 
@@ -474,6 +478,7 @@ void Mesh::GetFaceTransformation(int FaceNo, IsoparametricTransformation *FTr)
 {
    FTr->Attribute = (Dim == 1) ? 1 : faces[FaceNo]->GetAttribute();
    FTr->ElementNo = FaceNo;
+   FTr->ElementType = ElementTransformation::FACE;
    DenseMatrix &pm = FTr->GetPointMat();
    if (Nodes == NULL)
    {
@@ -526,6 +531,7 @@ void Mesh::GetFaceTransformation(int FaceNo, IsoparametricTransformation *FTr)
 
          IntegrationRule eir(face_el->GetDof());
          FaceElemTr.Loc1.Transf.ElementNo = face_info.Elem1No;
+         FaceElemTr.Loc1.Transf.ElementType = ElementTransformation::ELEMENT;
          FaceElemTr.Loc1.Transform(face_el->GetNodes(), eir);
          Nodes->GetVectorValues(FaceElemTr.Loc1.Transf, eir, pm);
 
@@ -554,6 +560,7 @@ void Mesh::GetEdgeTransformation(int EdgeNo, IsoparametricTransformation *EdTr)
 
    EdTr->Attribute = 1;
    EdTr->ElementNo = EdgeNo;
+   EdTr->ElementType = ElementTransformation::EDGE;
    DenseMatrix &pm = EdTr->GetPointMat();
    if (Nodes == NULL)
    {
@@ -852,6 +859,7 @@ FaceElementTransformations *Mesh::GetFaceElementTransformations(int FaceNo,
 {
    FaceInfo &face_info = faces_info[FaceNo];
 
+   FaceElemTr.SetConfigurationMask(0);
    FaceElemTr.Elem1 = NULL;
    FaceElemTr.Elem2 = NULL;
 
@@ -877,8 +885,14 @@ FaceElementTransformations *Mesh::GetFaceElementTransformations(int FaceNo,
    }
 
    // setup the face transformation
-   FaceElemTr.FaceGeom = GetFaceGeometryType(FaceNo);
-   FaceElemTr.Face = (mask & 16) ? GetFaceTransformation(FaceNo) : NULL;
+   if (mask & 16)
+   {
+      GetFaceTransformation(FaceNo, &FaceElemTr);
+   }
+   else
+   {
+      FaceElemTr.SetGeometryType(GetFaceGeometryType(FaceNo));
+   }
 
    // setup Loc1 & Loc2
    int face_type = GetFaceElementType(FaceNo);
@@ -909,6 +923,8 @@ FaceElementTransformations *Mesh::GetFaceElementTransformations(int FaceNo,
       }
    }
 
+   FaceElemTr.SetConfigurationMask(mask);
+
    return &FaceElemTr;
 }
 
@@ -952,7 +968,9 @@ FaceElementTransformations *Mesh::GetBdrFaceTransformations(int BdrElemNo)
       return NULL;
    }
    tr = GetFaceElementTransformations(fn);
-   tr->Face->Attribute = boundary[BdrElemNo]->GetAttribute();
+   tr->Attribute = boundary[BdrElemNo]->GetAttribute();
+   tr->ElementNo = BdrElemNo;
+   tr->ElementType = ElementTransformation::BDR_FACE;
    return tr;
 }
 
@@ -3268,7 +3286,7 @@ void Mesh::Loader(std::istream &input, int generate_edges,
    }
    else if (mesh_type == "$MeshFormat") // Gmsh
    {
-      ReadGmshMesh(input);
+      ReadGmshMesh(input, curved, read_gf);
    }
    else if
    ((mesh_type.size() > 2 &&
diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp
index c67c9982fef..4303776d644 100644
--- a/mesh/mesh.hpp
+++ b/mesh/mesh.hpp
@@ -235,7 +235,7 @@ class Mesh
                     bool &finalize_topo);
    void ReadNURBSMesh(std::istream &input, int &curved, int &read_gf);
    void ReadInlineMesh(std::istream &input, bool generate_edges = false);
-   void ReadGmshMesh(std::istream &input);
+   void ReadGmshMesh(std::istream &input, int &curved, int &read_gf);
    /* Note NetCDF (optional library) is used for reading cubit files */
 #ifdef MFEM_USE_NETCDF
    void ReadCubit(const char *filename, int &curved, int &read_gf);
diff --git a/mesh/mesh_readers.cpp b/mesh/mesh_readers.cpp
index 703d410ae56..5d3b472e2ef 100644
--- a/mesh/mesh_readers.cpp
+++ b/mesh/mesh_readers.cpp
@@ -887,7 +887,7 @@ void Mesh::ReadInlineMesh(std::istream &input, bool generate_edges)
    }
 }
 
-void Mesh::ReadGmshMesh(std::istream &input)
+void Mesh::ReadGmshMesh(std::istream &input, int &curved, int &read_gf)
 {
    string buff;
    double version;
@@ -1110,8 +1110,14 @@ void Mesh::ReadGmshMesh(std::istream &input)
                      }
                      case 4: // 4-node tetrahedron
                      {
+#ifdef MFEM_USE_MEMALLOC
+                        elements_3D.push_back(TetMemory.Alloc());
+                        elements_3D.back()->SetVertices(&vert_indices[0]);
+                        elements_3D.back()->SetAttribute(phys_domain);
+#else
                         elements_3D.push_back(
                            new Tetrahedron(&vert_indices[0], phys_domain));
+#endif
                         break;
                      }
                      case 5: // 8-node hexahedron
@@ -1195,8 +1201,14 @@ void Mesh::ReadGmshMesh(std::istream &input)
                   }
                   case 4: // 4-node tetrahedron
                   {
+#ifdef MFEM_USE_MEMALLOC
+                     elements_3D.push_back(TetMemory.Alloc());
+                     elements_3D.back()->SetVertices(&vert_indices[0]);
+                     elements_3D.back()->SetAttribute(phys_domain);
+#else
                      elements_3D.push_back(
                         new Tetrahedron(&vert_indices[0], phys_domain));
+#endif
                      break;
                   }
                   case 5: // 8-node hexahedron
@@ -1291,6 +1303,66 @@ void Mesh::ReadGmshMesh(std::istream &input)
          MFEM_CONTRACT_VAR(elem_domain);
 
       } // section '$Elements'
+      else if (buff == "$Periodic") // Reading master/slave node pairs
+      {
+         curved = 1;
+         read_gf = 0;
+         spaceDim = 3;
+
+         Array<int> v2v(NumOfVertices);
+         for (int i = 0; i < v2v.Size(); i++)
+         {
+            v2v[i] = i;
+         }
+         int num_per_ent;
+         int num_nodes;
+         int slave, master;
+         input >> num_per_ent;
+         getline(input, buff); // Read end-of-line
+         for (int i = 0; i < num_per_ent; i++)
+         {
+            getline(input, buff); // Read and ignore entity dimension and tags
+            getline(input, buff); // Read and ignore affine mapping
+            // Read master/slave vertex pairs
+            input >> num_nodes;
+            for (int j=0; j<num_nodes; j++)
+            {
+               input >> slave >> master;
+               v2v[slave - 1] = master - 1;
+            }
+            getline(input, buff); // Read end-of-line
+         }
+
+         // Convert nodes to discontinuous GridFunction
+         this->SetCurvature(1, true, Dim, Ordering::byVDIM);
+
+         // Replace "slave" vertex indices in the element connectivity
+         // with their corresponding "master" vertex indices.
+         for (int i = 0; i < this->GetNE(); i++)
+         {
+            Element *el = this->GetElement(i);
+            int *v = el->GetVertices();
+            int nv = el->GetNVertices();
+            for (int j = 0; j < nv; j++)
+            {
+               v[j] = v2v[v[j]];
+            }
+         }
+         // Replace "slave" vertex indices in the boundary element connectivity
+         // with their corresponding "master" vertex indices.
+         for (int i = 0; i < this->GetNBE(); i++)
+         {
+            Element *el = this->GetBdrElement(i);
+            int *v = el->GetVertices();
+            int nv = el->GetNVertices();
+            for (int j = 0; j < nv; j++)
+            {
+               v[j] = v2v[v[j]];
+            }
+         }
+         this->RemoveUnusedVertices();
+         this->RemoveInternalBoundaries();
+      }
    } // we reach the end of the file
 }
 
@@ -1878,7 +1950,14 @@ void Mesh::ReadCubit(const char *filename, int &curved, int &read_gf)
             case (ELEMENT_TET4):
             case (ELEMENT_TET10):
             {
-               elements[elcount] = new Tetrahedron(renumberedVertID,ebprop[iblk]);
+#ifdef MFEM_USE_MEMALLOC
+               elements[elcount] = TetMemory.Alloc();
+               elements[elcount]->SetVertices(renumberedVertID);
+               elements[elcount]->SetAttribute(ebprop[iblk]);
+#else
+               elements[elcount] = new Tetrahedron(renumberedVertID,
+                                                   ebprop[iblk]);
+#endif
                break;
             }
             case (ELEMENT_HEX8):
diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp
index 23778801128..bcb22a621bd 100644
--- a/mesh/pmesh.cpp
+++ b/mesh/pmesh.cpp
@@ -34,6 +34,8 @@ ParMesh::ParMesh(const ParMesh &pmesh, bool copy_nodes)
      group_sedge(pmesh.group_sedge),
      group_stria(pmesh.group_stria),
      group_squad(pmesh.group_squad),
+     glob_elem_offset(-1),
+     glob_offset_sequence(-1),
      gtopo(pmesh.gtopo)
 {
    MyComm = pmesh.MyComm;
@@ -92,7 +94,9 @@ ParMesh::ParMesh(const ParMesh &pmesh, bool copy_nodes)
 
 ParMesh::ParMesh(MPI_Comm comm, Mesh &mesh, int *partitioning_,
                  int part_method)
-   : gtopo(comm)
+   : glob_elem_offset(-1)
+   , glob_offset_sequence(-1)
+   , gtopo(comm)
 {
    int *partitioning = NULL;
    Array<bool> activeBdrElem;
@@ -833,6 +837,8 @@ ParMesh::ParMesh(const ParNCMesh &pncmesh)
    : MyComm(pncmesh.MyComm)
    , NRanks(pncmesh.NRanks)
    , MyRank(pncmesh.MyRank)
+   , glob_elem_offset(-1)
+   , glob_offset_sequence(-1)
    , gtopo(MyComm)
    , pncmesh(NULL)
 {
@@ -841,6 +847,18 @@ ParMesh::ParMesh(const ParNCMesh &pncmesh)
    have_face_nbr_data = false;
 }
 
+void ParMesh::ComputeGlobalElementOffset() const
+{
+   if (glob_offset_sequence != sequence) // mesh has changed
+   {
+      long local_elems = NumOfElements;
+      MPI_Scan(&local_elems, &glob_elem_offset, 1, MPI_LONG, MPI_SUM, MyComm);
+      glob_elem_offset -= local_elems;
+
+      glob_offset_sequence = sequence; // don't recalculate until refinement etc.
+   }
+}
+
 void ParMesh::ReduceMeshGen()
 {
    int loc_meshgen = meshgen;
@@ -885,7 +903,9 @@ void ParMesh::FinalizeParTopo()
 }
 
 ParMesh::ParMesh(MPI_Comm comm, istream &input, bool refine)
-   : gtopo(comm)
+   : glob_elem_offset(-1)
+   , glob_offset_sequence(-1)
+   , gtopo(comm)
 {
    MyComm = comm;
    MPI_Comm_size(MyComm, &NRanks);
@@ -1062,6 +1082,8 @@ ParMesh::ParMesh(ParMesh *orig_mesh, int ref_factor, int ref_type)
      MyComm(orig_mesh->GetComm()),
      NRanks(orig_mesh->GetNRanks()),
      MyRank(orig_mesh->GetMyRank()),
+     glob_elem_offset(-1),
+     glob_offset_sequence(-1),
      gtopo(orig_mesh->gtopo),
      have_face_nbr_data(false),
      pncmesh(NULL)
@@ -1299,6 +1321,20 @@ void ParMesh::Finalize(bool refine, bool fix_orientation)
    FinalizeParTopo();
 }
 
+int ParMesh::GetLocalElementNum(long global_element_num) const
+{
+   ComputeGlobalElementOffset();
+   long local = global_element_num - glob_elem_offset;
+   if (local < 0 || local >= NumOfElements) { return -1; }
+   return local;
+}
+
+long ParMesh::GetGlobalElementNum(int local_element_num) const
+{
+   ComputeGlobalElementOffset();
+   return glob_elem_offset + local_element_num;
+}
+
 void ParMesh::DistributeAttributes(Array<int> &attr)
 {
    // Determine the largest attribute number across all processors
@@ -2392,12 +2428,15 @@ GetSharedFaceTransformations(int sf, bool fill2)
    }
 
    // setup the face transformation if the face is not a ghost
-   FaceElemTr.FaceGeom = face_geom;
    if (!is_ghost)
    {
-      FaceElemTr.Face = GetFaceTransformation(FaceNo);
+      GetFaceTransformation(FaceNo, &FaceElemTr);
       // NOTE: The above call overwrites FaceElemTr.Loc1
    }
+   else
+   {
+      FaceElemTr.SetGeometryType(face_geom);
+   }
 
    // setup Loc1 & Loc2
    int elem_type = GetElementType(face_info.Elem1No);
@@ -2436,8 +2475,7 @@ GetSharedFaceTransformations(int sf, bool fill2)
    // for ghost faces we need a special version of GetFaceTransformation
    if (is_ghost)
    {
-      FaceElemTr.Face =
-         GetGhostFaceTransformation(&FaceElemTr, face_type, face_geom);
+      GetGhostFaceTransformation(&FaceElemTr, face_type, face_geom);
    }
 
    return &FaceElemTr;
diff --git a/mesh/pmesh.hpp b/mesh/pmesh.hpp
index a5a183600f3..220bc4d554d 100644
--- a/mesh/pmesh.hpp
+++ b/mesh/pmesh.hpp
@@ -78,6 +78,10 @@ class ParMesh : public Mesh
    // sface ids: all triangles first, then all quads
    Array<int> sface_lface;
 
+   // glob_elem_offset + local element number defines a global element numbering
+   mutable long glob_elem_offset, glob_offset_sequence;
+   void ComputeGlobalElementOffset() const;
+
    /// Create from a nonconforming mesh.
    ParMesh(const ParNCMesh &pncmesh);
 
@@ -231,6 +235,13 @@ class ParMesh : public Mesh
    int GetNRanks() const { return NRanks; }
    int GetMyRank() const { return MyRank; }
 
+   /** Map a global element number to a local element number. If the global
+       element is not on this processor, return -1. */
+   int GetLocalElementNum(long global_element_num) const;
+
+   /// Map a local element number to a global element number.
+   long GetGlobalElementNum(int local_element_num) const;
+
    GroupTopology gtopo;
 
    // Face-neighbor elements and vertices
diff --git a/mesh/pumi.cpp b/mesh/pumi.cpp
index 38c009bb9f8..106699e4991 100644
--- a/mesh/pumi.cpp
+++ b/mesh/pumi.cpp
@@ -33,21 +33,15 @@ using namespace std;
 namespace mfem
 {
 
-PumiMesh::PumiMesh(apf::Mesh2* apf_mesh, int generate_edges, int refine,
-                   bool fix_orientation)
+static void ReadPumiElement(apf::MeshEntity* Ent, /* ptr to pumi entity */
+                            apf::Downward Verts,
+                            const int Attr, apf::Numbering* vert_num,
+                            Element* el /* ptr to mfem entity being created */
+                           )
 {
-   Load(apf_mesh, generate_edges, refine, fix_orientation);
-}
-
-Element *PumiMesh::ReadElement(apf::MeshEntity* Ent, const int geom,
-                               apf::Downward Verts,
-                               const int Attr, apf::Numbering* vert_num)
-{
-   Element *el;
    int nv, *v;
 
    // Create element in MFEM
-   el = NewElement(geom);
    nv = el->GetNVertices();
    v  = el->GetVertices();
 
@@ -59,10 +53,16 @@ Element *PumiMesh::ReadElement(apf::MeshEntity* Ent, const int geom,
 
    // Assign attribute
    el->SetAttribute(Attr);
+}
 
-   return el;
+PumiMesh::PumiMesh(apf::Mesh2* apf_mesh, int generate_edges, int refine,
+                   bool fix_orientation)
+{
+   Load(apf_mesh, generate_edges, refine, fix_orientation);
 }
 
+
+
 void PumiMesh::CountBoundaryEntity(apf::Mesh2* apf_mesh, const int BcDim,
                                    int &NumBc)
 {
@@ -185,7 +185,8 @@ void PumiMesh::ReadSCORECMesh(apf::Mesh2* apf_mesh, apf::Numbering* v_num_loc,
       int attr = 1;
 
       int geom_type = apf_mesh->getType(ent);
-      elements[j] = ReadElement(ent, geom_type, verts, attr, v_num_loc);
+      elements[j] = NewElement(geom_type);
+      ReadPumiElement(ent, verts, attr, v_num_loc, elements[j]);
       j++;
    }
    // End iterator
@@ -211,7 +212,8 @@ void PumiMesh::ReadSCORECMesh(apf::Mesh2* apf_mesh, apf::Numbering* v_num_loc,
          apf_mesh->getDownward(ent, 0, verts);
          int attr = 1;
          int geom_type = apf_mesh->getType(ent);
-         boundary[j] = ReadElement( ent, geom_type, verts, attr, v_num_loc);
+         boundary[j] = NewElement(geom_type);
+         ReadPumiElement(ent, verts, attr, v_num_loc, boundary[j]);
          j++;
       }
    }
@@ -241,33 +243,10 @@ void PumiMesh::ReadSCORECMesh(apf::Mesh2* apf_mesh, apf::Numbering* v_num_loc,
 }
 
 // ParPumiMesh implementation
-Element *ParPumiMesh::ReadElement(apf::MeshEntity* Ent, const int geom,
-                                  apf::Downward Verts,
-                                  const int Attr, apf::Numbering* vert_num)
-{
-   Element *el;
-   int nv, *v;
-
-   // Create element in MFEM
-   el = NewElement(geom);
-   nv = el->GetNVertices();
-   v  = el->GetVertices();
-
-   // Fill the connectivity
-   for (int i = 0; i < nv; ++i)
-   {
-      v[i] = apf::getNumber(vert_num, Verts[i], 0, 0);
-   }
-
-   // Assign attribute
-   el->SetAttribute(Attr);
-
-   return el;
-}
-
 // This function loads a parallel PUMI mesh and returns the parallel MFEM mesh
 // corresponding to it.
-ParPumiMesh::ParPumiMesh(MPI_Comm comm, apf::Mesh2* apf_mesh)
+ParPumiMesh::ParPumiMesh(MPI_Comm comm, apf::Mesh2* apf_mesh,
+                         int refine, bool fix_orientation)
 {
    // Set the communicator for gtopo
    gtopo.SetComm(comm);
@@ -312,11 +291,15 @@ ParPumiMesh::ParPumiMesh(MPI_Comm comm, apf::Mesh2* apf_mesh)
    // Create local numbering that respects the global ordering
    apf::Field* apf_field_crd = apf_mesh->getCoordinateField();
    apf::FieldShape* crd_shape = apf::getShape(apf_field_crd);
-   apf::Numbering* v_num_loc = apf::createNumbering(apf_mesh,
-                                                    "LocalVertexNumbering",
-                                                    crd_shape, 1);
+   // v_num_loc might already be associated with the mesh. In that case
+   // there is no need to create it again.
+   v_num_loc = apf_mesh->findNumbering("LocalVertexNumbering");
+   if (!v_num_loc)
+      v_num_loc = apf::createNumbering(apf_mesh,
+                                       "LocalVertexNumbering",
+                                       crd_shape, 1);
 
-   // Construct the numbering v_loc_num and set the coordinates of the vertices.
+   // Construct the numbering v_num_loc and set the coordinates of the vertices.
    NumOfVertices = thisVertIds.Size();
    vertices.SetSize(NumOfVertices);
    itr = apf_mesh->begin(0);
@@ -352,7 +335,8 @@ ParPumiMesh::ParPumiMesh(MPI_Comm comm, apf::Mesh2* apf_mesh)
       // Get attribute Tag vs Geometry
       int attr = 1;
       int geom_type = apf_mesh->getType(ent);
-      elements[j] = ReadElement(ent, geom_type, verts, attr, v_num_loc);
+      elements[j] = NewElement(geom_type);
+      ReadPumiElement(ent, verts, attr, v_num_loc, elements[j]);
    }
    // End iterator
    apf_mesh->end(itr);
@@ -384,8 +368,9 @@ ParPumiMesh::ParPumiMesh(MPI_Comm comm, apf::Mesh2* apf_mesh)
          apf_mesh->getDownward(ent, 0, verts);
          int attr = 1 ;
          int geom_type = apf_mesh->getType(ent);
-         boundary[bdr_ctr++] = ReadElement(ent, geom_type, verts, attr,
-                                           v_num_loc);
+         boundary[bdr_ctr] = NewElement(geom_type);
+         ReadPumiElement(ent, verts, attr, v_num_loc, boundary[bdr_ctr]);
+         bdr_ctr++;
       }
    }
    apf_mesh->end(itr);
@@ -690,8 +675,9 @@ ParPumiMesh::ParPumiMesh(MPI_Comm comm, apf::Mesh2* apf_mesh)
       this->edge_vertex = NULL;
       own_nodes = 1;
    }
-}
 
+   Finalize(refine, fix_orientation);
+}
 
 // GridFunctionPumi Implementation needed for high order meshes
 GridFunctionPumi::GridFunctionPumi(Mesh* m, apf::Mesh2* PumiM,
@@ -923,223 +909,145 @@ void ParPumiMesh::UpdateMesh(const ParMesh* AdaptedpMesh)
    }
 }
 
-// Transfer a mixed vector-scalar field (i.e. velocity,pressure) and the
-// magnitude of the vector field to use for mesh adaptation.
-void ParPumiMesh::FieldMFEMtoPUMI(apf::Mesh2* apf_mesh,
-                                  ParGridFunction* grid_vel,
-                                  ParGridFunction* grid_pr,
-                                  apf::Field* VelField,
-                                  apf::Field* PrField,
-                                  apf::Field* VelMagField)
+int ParPumiMesh::RotationPUMItoMFEM(apf::Mesh2* apf_mesh,
+                                    apf::MeshEntity* tet,
+                                    int elemId)
 {
-   apf::FieldShape* VelFieldShape = getShape(VelField);
-   int num_nodes = 4 * VelFieldShape->countNodesOn(0) + // Vertex
-                   6 * VelFieldShape->countNodesOn(1) + // Edge
-                   4 * VelFieldShape->countNodesOn(2) + // Triangle
-                   VelFieldShape->countNodesOn(4); // Tetrahedron
-
-   // Define integration points
-   IntegrationRule pumi_nodes(num_nodes);
-   int ip_cnt = 0;
-   apf::Vector3 xi_crd(0.,0.,0.);
-
-   // Create a template of dof holders coordinates in parametric coordinates.
-   // The ordering is taken care of when the field is transferred to PUMI.
-
-   // Dofs on Vertices
-   IntegrationPoint& ip = pumi_nodes.IntPoint(ip_cnt++);
-   double pt_crd[3] = {0., 0., 0.};
-   ip.Set(pt_crd, 3);
-   for (int kk = 0; kk < 3; kk++)
+   MFEM_ASSERT(apf_mesh->getType(tet) == apf::Mesh::TET, "");
+   // get downward vertices of PUMI element
+   apf::Downward vs;
+   int nv = apf_mesh->getDownward(tet,0,vs);
+   int pumi_vid[12];
+   for (int i = 0; i < nv; i++)
    {
-      IntegrationPoint& ip = pumi_nodes.IntPoint(ip_cnt++);
-      double pt_crd[3] = {0.,0.,0.};
-      pt_crd[kk] = 1.0;
-      ip.Set(pt_crd, 3);
+      pumi_vid[i] = apf::getNumber(v_num_loc, vs[i], 0, 0);
    }
-   // Dofs on Edges
-   if (VelFieldShape->hasNodesIn(apf::Mesh::EDGE))
+
+   // get downward vertices of MFEM element
+   mfem::Array<int> mfem_vid;
+   this->GetElementVertices(elemId, mfem_vid);
+
+   // get rotated indices of PUMI element
+   int pumi_vid_rot[12];
+   for (int i = 0; i < nv; i++)
    {
-      const int nn = VelFieldShape->countNodesOn(apf::Mesh::EDGE);
-      for (int ii = 0; ii < 6; ii++)
-      {
-         for (int jj = 0; jj < nn; jj++)
-         {
-            VelFieldShape->getNodeXi(apf::Mesh::EDGE, jj, xi_crd);
-            xi_crd[0] = 0.5 * (xi_crd[0] + 1.);// from (-1,1) to (0,1)
-            double pt_crd[3] = {0., 0., 0.};
-            switch (ii)
-            {
-               case 0:
-                  pt_crd[0] = xi_crd[0];
-                  break;
-               case 1:
-                  pt_crd[0] = 1. - xi_crd[0];
-                  pt_crd[1] = xi_crd[0];
-                  break;
-               case 2:
-                  pt_crd[1] = xi_crd[0];
-                  break;
-               case 3:
-                  pt_crd[2] = xi_crd[0];
-                  break;
-               case 4:
-                  pt_crd[0] = 1. - xi_crd[0];
-                  pt_crd[2] = xi_crd[0];
-                  break;
-               case 5:
-                  pt_crd[1] = 1. - xi_crd[0];
-                  pt_crd[2] = xi_crd[0];
-                  break;
-            }
-            IntegrationPoint& ip = pumi_nodes.IntPoint(ip_cnt++);
-            ip.Set(pt_crd, 3);
-         }
-      }
+      pumi_vid_rot[i] = mfem_vid.Find(pumi_vid[i]);
    }
-   // Dofs on Faces
-   if (VelFieldShape->hasNodesIn(apf::Mesh::TRIANGLE))
+   apf::Downward vs_rot;
+   for (int i = 0; i < nv; i++)
    {
-      const int nn = VelFieldShape->countNodesOn(apf::Mesh::TRIANGLE);
-      for (int ii = 0; ii < 4; ii++)
+      vs_rot[i] = vs[pumi_vid_rot[i]];
+   }
+
+   return ma::findTetRotation(apf_mesh, tet, vs_rot);
+}
+
+// Convert parent coordinates from a PUMI tet to an MFEM tet
+IntegrationRule ParPumiMesh::ParentXisPUMItoMFEM(apf::Mesh2* apf_mesh,
+                                                 apf::MeshEntity* tet,
+                                                 int elemId,
+                                                 apf::NewArray<apf::Vector3>& pumi_xi,
+                                                 bool checkOrientation)
+{
+   int num_nodes = pumi_xi.size();
+   IntegrationRule mfem_xi(num_nodes);
+   int rotation = checkOrientation ? RotationPUMItoMFEM(apf_mesh, tet, elemId):0;
+   for (int i = 0; i < num_nodes; i++)
+   {
+      // for non-zero "rotation", un-rotate the xi
+      if (rotation)
       {
-         for (int jj = 0; jj < nn; jj++)
-         {
-            VelFieldShape->getNodeXi(apf::Mesh::TRIANGLE, jj, xi_crd);
-            double pt_crd[3] = {0., 0., 0.};
-            switch (ii)
-            {
-               case 0:
-                  pt_crd[0] = xi_crd[0];
-                  pt_crd[1] = xi_crd[1];
-                  break;
-               case 1:
-                  pt_crd[0] = xi_crd[0];
-                  pt_crd[2] = xi_crd[2];
-                  break;
-               case 2:
-                  pt_crd[0] = xi_crd[0];
-                  pt_crd[1] = xi_crd[1];
-                  pt_crd[2] = xi_crd[2];
-                  break;
-               case 3:
-                  pt_crd[1] = xi_crd[0];
-                  pt_crd[2] = xi_crd[1];
-                  break;
-            }
-            IntegrationPoint& ip = pumi_nodes.IntPoint(ip_cnt++);
-            ip.Set(pt_crd, 3);
-         }
+         ma::unrotateTetXi(pumi_xi[i], rotation);
       }
+      IntegrationPoint& ip = mfem_xi.IntPoint(i);
+      double tmp_xi[3];
+      pumi_xi[i].toArray(tmp_xi);
+      ip.Set(tmp_xi,3);
    }
-   MFEM_ASSERT(ip_cnt == num_nodes, "");
+   return mfem_xi;
+}
 
-   // Other dofs
-   apf::MeshEntity* ent;
-   apf::MeshIterator* itr = apf_mesh->begin(3);
-   int iel = 0;
-   while ((ent = apf_mesh->iterate(itr)))
+// Convert parent coordinates from an MFEM tet to a PUMI tet
+void ParPumiMesh::ParentXisMFEMtoPUMI(apf::Mesh2* apf_mesh,
+                                      int elemId,
+                                      apf::MeshEntity* tet,
+                                      const IntegrationRule& mfem_xi,
+                                      apf::NewArray<apf::Vector3>& pumi_xi,
+                                      bool checkOrientation)
+{
+   int num_nodes = mfem_xi.Size();
+   if (!pumi_xi.allocated())
    {
-      // Get the solution
-      Vector u_vel, v_vel, w_vel;
-      grid_vel->GetValues(iel, pumi_nodes, u_vel, 1);
-      grid_vel->GetValues(iel, pumi_nodes, v_vel, 2);
-      grid_vel->GetValues(iel, pumi_nodes, w_vel, 3);
+      pumi_xi.allocate(num_nodes);
+   }
+   else
+   {
+      pumi_xi.resize(num_nodes);
+   }
 
-      Vector pr;
-      grid_pr->GetValues(iel, pumi_nodes, pr, 1);
+   int rotation = checkOrientation ? RotationPUMItoMFEM(apf_mesh, tet, elemId):0;
+   for (int i = 0; i < num_nodes; i++)
+   {
+      IntegrationPoint ip = mfem_xi.IntPoint(i);
+      pumi_xi[i] = apf::Vector3(ip.x, ip.y, ip.z);
 
-      // Transfer
-      apf::Downward vtxs;
-      int num_vts = apf_mesh->getDownward(ent, 0, vtxs);
-      for (int kk = 0; kk < num_vts; kk++)
+      // for non-zero "rotation", rotate the xi
+      if (rotation)
       {
-         double mag = u_vel[kk] * u_vel[kk] + v_vel[kk] * v_vel[kk] +
-                      w_vel[kk] * w_vel[kk];
-         mag = sqrt(mag);
-         apf::setScalar(VelMagField, vtxs[kk], 0, mag);
-         // Set vel
-         double vels[3] = {u_vel[kk], v_vel[kk], w_vel[kk]};
-         apf::setComponents(VelField, vtxs[kk], 0, vels);
-
-         // Set Pr
-         apf::setScalar(PrField, vtxs[kk], 0, pr[kk]);
+         ma::rotateTetXi(pumi_xi[i], rotation);
       }
+   }
+}
 
-      int dofId = num_vts;
-
-      apf::EntityShape* es = VelFieldShape->getEntityShape(apf::Mesh::TET);
-      // Edge Dofs
-      if (VelFieldShape->hasNodesIn(apf::Mesh::EDGE))
-      {
-         int ndOnEdge = VelFieldShape->countNodesOn(apf::Mesh::EDGE);
-         Array<int> order(ndOnEdge);
-
-         apf::Downward edges;
-         int num_edge =  apf_mesh->getDownward(ent, apf::Mesh::EDGE, edges);
-         for (int ii = 0 ; ii < num_edge; ++ii)
-         {
-            es->alignSharedNodes(apf_mesh, ent, edges[ii], order);
-            for (int jj = 0; jj < ndOnEdge; jj++)
-            {
-               int cnt = dofId + order[jj];
-               double mag = u_vel[cnt] * u_vel[cnt] +
-                            v_vel[cnt] * v_vel[cnt] +
-                            w_vel[cnt] * w_vel[cnt];
-               mag = sqrt(mag);
-               apf::setScalar(VelMagField, edges[ii], jj, mag);
 
-               // Set vel
-               double vels[3] = {u_vel[cnt], v_vel[cnt], w_vel[cnt]};
-               apf::setComponents(VelField, edges[ii], jj, vels);
+// Transfer a mixed vector-scalar field (i.e. velocity,pressure) and the
+// magnitude of the vector field to use for mesh adaptation.
+void ParPumiMesh::FieldMFEMtoPUMI(apf::Mesh2* apf_mesh,
+                                  ParGridFunction* grid_vel,
+                                  ParGridFunction* grid_pr,
+                                  apf::Field* vel_field,
+                                  apf::Field* pr_field,
+                                  apf::Field* vel_mag_field)
+{
+   apf::FieldShape* field_shape = getShape(vel_field);
+   int dim = apf_mesh->getDimension();
 
-               // Set Pr
-               apf::setScalar(PrField, edges[ii], jj, pr[cnt]);
+   apf::MeshEntity* ent;
+   apf::MeshIterator* itr = apf_mesh->begin(dim);
+   int iel = 0;
+   while ((ent = apf_mesh->iterate(itr)))
+   {
+      apf::NewArray<apf::Vector3> pumi_nodes;
+      apf::getElementNodeXis(field_shape, apf_mesh, ent, pumi_nodes);
+      IntegrationRule mfem_nodes = ParentXisPUMItoMFEM(
+                                      apf_mesh, ent, iel, pumi_nodes, true);
+      // Get the solution
+      ElementTransformation* eltr = this->GetElementTransformation(iel);
+      DenseMatrix vel;
+      grid_vel->GetVectorValues(*eltr, mfem_nodes, vel);
+      Vector pr;
+      grid_pr->GetValues(iel, mfem_nodes, pr, 1);
 
-            }
-            // Counter
-            dofId += ndOnEdge;
-         }
-      }
-      // Face Dofs
-      if (VelFieldShape->hasNodesIn(apf::Mesh::TRIANGLE))
+      int non = 0;
+      for (int d = 0; d <= dim; d++)
       {
-         int ndOnFace = VelFieldShape->countNodesOn(apf::Mesh::TRIANGLE);
-         Array<int> order(ndOnFace);
-
-         apf::Downward faces;
-         int num_face = apf_mesh->getDownward(ent, apf::Mesh::TRIANGLE, faces);
-         for (int ii = 0; ii < num_face; ii++)
+         if (!field_shape->hasNodesIn(d)) { continue; }
+         apf::Downward a;
+         int na = apf_mesh->getDownward(ent,d,a);
+         for (int i = 0; i < na; i++)
          {
-            if ( ndOnFace > 1)
-            {
-               es->alignSharedNodes(apf_mesh, ent, faces[ii], order);
-            }
-            else
-            {
-               order[0] = 0;
-            }
-            for (int jj = 0; jj < ndOnFace; jj++)
+            int type = apf_mesh->getType(a[i]);
+            int nan = field_shape->countNodesOn(type);
+            for (int n = 0; n < nan; n++)
             {
-               int cnt = dofId + order[jj];
-               double mag = u_vel[cnt] * u_vel[cnt] +
-                            v_vel[cnt] * v_vel[cnt] +
-                            w_vel[cnt] * w_vel[cnt];
-               mag = sqrt(mag);
-               apf::setScalar(VelMagField, faces[ii], jj, mag);
-
-               // Set vel
-               double vels[3] = {u_vel[cnt], v_vel[cnt], w_vel[cnt]};
-               apf::setComponents(VelField, faces[ii], jj, vels);
-
-               // Set Pr
-               apf::setScalar(PrField, faces[ii], jj, pr[cnt]);
+               apf::Vector3 v(vel.GetColumn(non));
+               apf::setVector(vel_field, a[i], n, v);
+               apf::setScalar(pr_field, a[i], n, pr[non]);
+               apf::setScalar(vel_mag_field, a[i], n, v.getLength());
+               non++;
             }
-            // Counter
-            dofId += ndOnFace;
          }
       }
-
       iel++;
    }
    apf_mesh->end(itr);
@@ -1148,197 +1056,45 @@ void ParPumiMesh::FieldMFEMtoPUMI(apf::Mesh2* apf_mesh,
 // Transfer a scalar field its magnitude to use for mesh adaptation.
 void ParPumiMesh::FieldMFEMtoPUMI(apf::Mesh2* apf_mesh,
                                   ParGridFunction* grid_pr,
-                                  apf::Field* PrField,
-                                  apf::Field* PrMagField)
+                                  apf::Field* pr_field,
+                                  apf::Field* pr_mag_field)
 {
-   apf::FieldShape* PrFieldShape = getShape(PrField);
-   int num_nodes = 4 * PrFieldShape->countNodesOn(0) + // Vertex
-                   6 * PrFieldShape->countNodesOn(1) + // Edge
-                   4 * PrFieldShape->countNodesOn(2) + // Triangle
-                   PrFieldShape->countNodesOn(4); // Tetrahedron
-
-   // Define integration points
-   IntegrationRule pumi_nodes(num_nodes);
-   int ip_cnt = 0;
-   apf::Vector3 xi_crd(0.,0.,0.);
-
-   // Create a template of dof holders coordinates in parametric coordinates.
-   // The ordering is taken care of when the field is transferred to PUMI.
-
-   // Dofs on Vertices
-   IntegrationPoint& ip = pumi_nodes.IntPoint(ip_cnt++);
-   double pt_crd[3] = {0., 0., 0.};
-   ip.Set(pt_crd, 3);
-   for (int kk = 0; kk < 3; kk++)
-   {
-      IntegrationPoint& ip = pumi_nodes.IntPoint(ip_cnt++);
-      double pt_crd[3] = {0.,0.,0.};
-      pt_crd[kk] = 1.0;
-      ip.Set(pt_crd, 3);
-   }
-   // Dofs on Edges
-   if (PrFieldShape->hasNodesIn(apf::Mesh::EDGE))
-   {
-      const int nn = PrFieldShape->countNodesOn(apf::Mesh::EDGE);
-      for (int ii = 0; ii < 6; ii++)
-      {
-         for (int jj = 0; jj < nn; jj++)
-         {
-            PrFieldShape->getNodeXi(apf::Mesh::EDGE, jj, xi_crd);
-            xi_crd[0] = 0.5 * (xi_crd[0] + 1.); // from (-1,1) to (0,1)
-            double pt_crd[3] = {0., 0., 0.};
-            switch (ii)
-            {
-               case 0:
-                  pt_crd[0] = xi_crd[0];
-                  break;
-               case 1:
-                  pt_crd[0] = 1. - xi_crd[0];
-                  pt_crd[1] = xi_crd[0];
-                  break;
-               case 2:
-                  pt_crd[1] = xi_crd[0];
-                  break;
-               case 3:
-                  pt_crd[2] = xi_crd[0];
-                  break;
-               case 4:
-                  pt_crd[0] = 1. - xi_crd[0];
-                  pt_crd[2] = xi_crd[0];
-                  break;
-               case 5:
-                  pt_crd[1] = 1. - xi_crd[0];
-                  pt_crd[2] = xi_crd[0];
-                  break;
-            }
-            IntegrationPoint& ip = pumi_nodes.IntPoint(ip_cnt++);
-            ip.Set(pt_crd, 3);
-         }
-      }
-   }
-   // Dofs on Faces
-   if (PrFieldShape->hasNodesIn(apf::Mesh::TRIANGLE))
-   {
-      const int nn = PrFieldShape->countNodesOn(apf::Mesh::TRIANGLE);
-      for (int ii = 0; ii < 4; ii++)
-      {
-         for (int jj = 0; jj < nn; jj++)
-         {
-            PrFieldShape->getNodeXi(apf::Mesh::TRIANGLE, jj, xi_crd);
-            double pt_crd[3] = {0., 0., 0.};
-            switch (ii)
-            {
-               case 0:
-                  pt_crd[0] = xi_crd[0];
-                  pt_crd[1] = xi_crd[1];
-                  break;
-               case 1:
-                  pt_crd[0] = xi_crd[0];
-                  pt_crd[2] = xi_crd[2];
-                  break;
-               case 2:
-                  pt_crd[0] = xi_crd[0];
-                  pt_crd[1] = xi_crd[1];
-                  pt_crd[2] = xi_crd[2];
-                  break;
-               case 3:
-                  pt_crd[1] = xi_crd[0];
-                  pt_crd[2] = xi_crd[1];
-                  break;
-            }
-            IntegrationPoint& ip = pumi_nodes.IntPoint(ip_cnt++);
-            ip.Set(pt_crd, 3);
-         }
-      }
-   }
-   MFEM_ASSERT(ip_cnt == num_nodes, "");
+   apf::FieldShape* field_shape = getShape(pr_field);
+   int dim = apf_mesh->getDimension();
 
-   // Other dofs
    apf::MeshEntity* ent;
-   apf::MeshIterator* itr = apf_mesh->begin(3);
+   apf::MeshIterator* itr = apf_mesh->begin(dim);
    int iel = 0;
    while ((ent = apf_mesh->iterate(itr)))
    {
+      apf::NewArray<apf::Vector3> pumi_nodes;
+      apf::getElementNodeXis(field_shape, apf_mesh, ent, pumi_nodes);
+      IntegrationRule mfem_nodes = ParentXisPUMItoMFEM(
+                                      apf_mesh, ent, iel, pumi_nodes, true);
       // Get the solution
-      Vector pr;
-      grid_pr->GetValues(iel, pumi_nodes, pr, 1);
-
-      // Transfer
-      apf::Downward vtxs;
-      int num_vts = apf_mesh->getDownward(ent, 0, vtxs);
-      for (int kk = 0; kk < num_vts; kk++)
-      {
-         double mag;
-         (pr[kk] >= 0. ? mag = pr[kk] : mag = -pr[kk]);
-         apf::setScalar(PrMagField, vtxs[kk], 0, mag);
-
-         // Set Pr
-         apf::setScalar(PrField, vtxs[kk], 0, pr[kk]);
-      }
+      Vector vals;
+      grid_pr->GetValues(iel, mfem_nodes, vals, 1);
 
-      int dofId = num_vts;
-
-      apf::EntityShape* es = PrFieldShape->getEntityShape(apf::Mesh::TET);
-      // Edge Dofs
-      if (PrFieldShape->hasNodesIn(apf::Mesh::EDGE))
-      {
-         int ndOnEdge = PrFieldShape->countNodesOn(apf::Mesh::EDGE);
-         Array<int> order(ndOnEdge);
-
-         apf::Downward edges;
-         int num_edge =  apf_mesh->getDownward(ent, apf::Mesh::EDGE, edges);
-         for (int ii = 0 ; ii < num_edge; ++ii)
-         {
-            es->alignSharedNodes(apf_mesh, ent, edges[ii], order);
-            for (int jj = 0; jj < ndOnEdge; jj++)
-            {
-               int cnt = dofId + order[jj];
-               double mag;
-               (pr[cnt] >= 0. ? mag = pr[cnt] : mag = -pr[cnt]);
-               apf::setScalar(PrMagField, edges[ii], jj, mag);
-
-               // Set Pr
-               apf::setScalar(PrField, edges[ii], jj, pr[cnt]);
-
-            }
-            // Counter
-            dofId += ndOnEdge;
-         }
-      }
-
-      // Face Dofs
-      if (PrFieldShape->hasNodesIn(apf::Mesh::TRIANGLE))
+      int non = 0;
+      for (int d = 0; d <= dim; d++)
       {
-         int ndOnFace = PrFieldShape->countNodesOn(apf::Mesh::TRIANGLE);
-         Array<int> order(ndOnFace);
-
-         apf::Downward faces;
-         int num_face = apf_mesh->getDownward(ent, apf::Mesh::TRIANGLE, faces);
-         for (int ii = 0; ii < num_face; ii++)
+         if (!field_shape->hasNodesIn(d)) { continue; }
+         apf::Downward a;
+         int na = apf_mesh->getDownward(ent,d,a);
+         for (int i = 0; i < na; i++)
          {
-            if ( ndOnFace > 1)
+            int type = apf_mesh->getType(a[i]);
+            int nan = field_shape->countNodesOn(type);
+            for (int n = 0; n < nan; n++)
             {
-               es->alignSharedNodes(apf_mesh, ent, faces[ii], order);
+               double pr = vals[non];
+               double pr_mag = pr >= 0 ? pr : -pr;
+               apf::setScalar(pr_field, a[i], n, pr);
+               apf::setScalar(pr_mag_field, a[i], n, pr_mag);
+               non++;
             }
-            else
-            {
-               order[0] = 0;
-            }
-            for (int jj = 0; jj < ndOnFace; jj++)
-            {
-               int cnt = dofId + order[jj];
-               double mag;
-               (pr[cnt] >= 0. ? mag = pr[cnt] : mag = -pr[cnt]);
-               apf::setScalar(PrMagField, faces[ii], jj, mag);
-
-               // Set Pr
-               apf::setScalar(PrField, faces[ii], jj, pr[cnt]);
-            }
-            // Counter
-            dofId += ndOnFace;
          }
       }
-
       iel++;
    }
    apf_mesh->end(itr);
@@ -1348,279 +1104,148 @@ void ParPumiMesh::FieldMFEMtoPUMI(apf::Mesh2* apf_mesh,
 // adaptation
 void ParPumiMesh::VectorFieldMFEMtoPUMI(apf::Mesh2* apf_mesh,
                                         ParGridFunction* grid_vel,
-                                        apf::Field* VelField,
-                                        apf::Field* VelMagField)
+                                        apf::Field* vel_field,
+                                        apf::Field* vel_mag_field)
 {
-   apf::FieldShape* VelFieldShape = getShape(VelField);
-   int num_nodes = 4 * VelFieldShape->countNodesOn(0) + // Vertex
-                   6 * VelFieldShape->countNodesOn(1) + // Edge
-                   4 * VelFieldShape->countNodesOn(2) + // Triangle
-                   VelFieldShape->countNodesOn(4);// Tetrahedron
-
-   // Define integration points
-   IntegrationRule pumi_nodes(num_nodes);
-   int ip_cnt = 0;
-   apf::Vector3 xi_crd(0.,0.,0.);
-
-   // Create a template of dof holders coordinates in parametric coordinates.
-   // The ordering is taken care of when the field is transferred to PUMI.
-
-   // Dofs on Vertices
-   IntegrationPoint& ip = pumi_nodes.IntPoint(ip_cnt++);
-   double pt_crd[3] = {0., 0., 0.};
-   ip.Set(pt_crd, 3);
-   for (int kk = 0; kk < 3; kk++)
-   {
-      IntegrationPoint& ip = pumi_nodes.IntPoint(ip_cnt++);
-      double pt_crd[3] = {0.,0.,0.};
-      pt_crd[kk] = 1.0;
-      ip.Set(pt_crd, 3);
-   }
-   // Dofs on Edges
-   if (VelFieldShape->hasNodesIn(apf::Mesh::EDGE))
-   {
-      const int nn = VelFieldShape->countNodesOn(apf::Mesh::EDGE);
-      for (int ii = 0; ii < 6; ii++)
-      {
-         for (int jj = 0; jj < nn; jj++)
-         {
-            VelFieldShape->getNodeXi(apf::Mesh::EDGE, jj, xi_crd);
-            xi_crd[0] = 0.5 * (xi_crd[0] + 1.); // from (-1,1) to (0,1)
-            double pt_crd[3] = {0., 0., 0.};
-            switch (ii)
-            {
-               case 0:
-                  pt_crd[0] = xi_crd[0];
-                  break;
-               case 1:
-                  pt_crd[0] = 1. - xi_crd[0];
-                  pt_crd[1] = xi_crd[0];
-                  break;
-               case 2:
-                  pt_crd[1] = xi_crd[0];
-                  break;
-               case 3:
-                  pt_crd[2] = xi_crd[0];
-                  break;
-               case 4:
-                  pt_crd[0] = 1. - xi_crd[0];
-                  pt_crd[2] = xi_crd[0];
-                  break;
-               case 5:
-                  pt_crd[1] = 1. - xi_crd[0];
-                  pt_crd[2] = xi_crd[0];
-                  break;
-            }
-            IntegrationPoint& ip = pumi_nodes.IntPoint(ip_cnt++);
-            ip.Set(pt_crd, 3);
-         }
-      }
-   }
-   // Dofs on Faces
-   if (VelFieldShape->hasNodesIn(apf::Mesh::TRIANGLE))
-   {
-      const int nn = VelFieldShape->countNodesOn(apf::Mesh::TRIANGLE);
-      for (int ii = 0; ii < 4; ii++)
-      {
-         for (int jj = 0; jj < nn; jj++)
-         {
-            VelFieldShape->getNodeXi(apf::Mesh::TRIANGLE, jj, xi_crd);
-            double pt_crd[3] = {0., 0., 0.};
-            switch (ii)
-            {
-               case 0:
-                  pt_crd[0] = xi_crd[0];
-                  pt_crd[1] = xi_crd[1];
-                  break;
-               case 1:
-                  pt_crd[0] = xi_crd[0];
-                  pt_crd[2] = xi_crd[2];
-                  break;
-               case 2:
-                  pt_crd[0] = xi_crd[0];
-                  pt_crd[1] = xi_crd[1];
-                  pt_crd[2] = xi_crd[2];
-                  break;
-               case 3:
-                  pt_crd[1] = xi_crd[0];
-                  pt_crd[2] = xi_crd[1];
-                  break;
-            }
-            IntegrationPoint& ip = pumi_nodes.IntPoint(ip_cnt++);
-            ip.Set(pt_crd, 3);
-         }
-      }
-   }
-   MFEM_ASSERT(ip_cnt == num_nodes, "");
+   apf::FieldShape* field_shape = getShape(vel_field);
+   int dim = apf_mesh->getDimension();
 
-   // Other dofs
    apf::MeshEntity* ent;
-   apf::MeshIterator* itr = apf_mesh->begin(3);
+   apf::MeshIterator* itr = apf_mesh->begin(dim);
    int iel = 0;
    while ((ent = apf_mesh->iterate(itr)))
    {
+      apf::NewArray<apf::Vector3> pumi_nodes;
+      apf::getElementNodeXis(field_shape, apf_mesh, ent, pumi_nodes);
+      IntegrationRule mfem_nodes = ParentXisPUMItoMFEM(
+                                      apf_mesh, ent, iel, pumi_nodes, true);
       // Get the solution
-      Vector u_vel, v_vel, w_vel;
-      grid_vel->GetValues(iel, pumi_nodes, u_vel, 1);
-      grid_vel->GetValues(iel, pumi_nodes, v_vel, 2);
-      grid_vel->GetValues(iel, pumi_nodes, w_vel, 3);
-
-      // Transfer
-      apf::Downward vtxs;
-      int num_vts = apf_mesh->getDownward(ent, 0, vtxs);
-      for (int kk = 0; kk < num_vts; kk++)
-      {
-         double mag = u_vel[kk] * u_vel[kk] + v_vel[kk] * v_vel[kk] +
-                      w_vel[kk] * w_vel[kk];
-         mag = sqrt(mag);
-         apf::setScalar(VelMagField, vtxs[kk], 0, mag);
-         // Set vel
-         double vels[3] = {u_vel[kk], v_vel[kk], w_vel[kk]};
-         apf::setComponents(VelField, vtxs[kk], 0, vels);
-      }
-
-      int dofId = num_vts;
+      ElementTransformation* eltr = this->GetElementTransformation(iel);
+      DenseMatrix vel;
+      grid_vel->GetVectorValues(*eltr, mfem_nodes, vel);
 
-      apf::EntityShape* es = VelFieldShape->getEntityShape(apf::Mesh::TET);
-      // Edge Dofs
-      if (VelFieldShape->hasNodesIn(apf::Mesh::EDGE))
+      int non = 0;
+      for (int d = 0; d <= dim; d++)
       {
-         int ndOnEdge = VelFieldShape->countNodesOn(apf::Mesh::EDGE);
-         Array<int> order(ndOnEdge);
-
-         apf::Downward edges;
-         int num_edge =  apf_mesh->getDownward(ent, apf::Mesh::EDGE, edges);
-         for (int ii = 0 ; ii < num_edge; ++ii)
+         if (!field_shape->hasNodesIn(d)) { continue; }
+         apf::Downward a;
+         int na = apf_mesh->getDownward(ent,d,a);
+         for (int i = 0; i < na; i++)
          {
-            es->alignSharedNodes(apf_mesh, ent, edges[ii], order);
-            for (int jj = 0; jj < ndOnEdge; jj++)
+            int type = apf_mesh->getType(a[i]);
+            int nan = field_shape->countNodesOn(type);
+            for (int n = 0; n < nan; n++)
             {
-               int cnt = dofId + order[jj];
-               double mag = u_vel[cnt] * u_vel[cnt] +
-                            v_vel[cnt] * v_vel[cnt] +
-                            w_vel[cnt] * w_vel[cnt];
-               mag = sqrt(mag);
-               apf::setScalar(VelMagField, edges[ii], jj, mag);
-
-               // Set vel
-               double vels[3] = {u_vel[cnt], v_vel[cnt], w_vel[cnt]};
-               apf::setComponents(VelField, edges[ii], jj, vels);
+               apf::Vector3 v(vel.GetColumn(non));
+               apf::setScalar(vel_mag_field, a[i], n, v.getLength());
+               apf::setVector(vel_field, a[i], n, v);
+               non++;
             }
-            // Counter
-            dofId += ndOnEdge;
          }
       }
+      iel++;
+   }
+   apf_mesh->end(itr);
+}
 
-      // Face Dofs
-      if (VelFieldShape->hasNodesIn(apf::Mesh::TRIANGLE))
-      {
-         int ndOnFace = VelFieldShape->countNodesOn(apf::Mesh::TRIANGLE);
-         Array<int> order(ndOnFace);
+void ParPumiMesh::NedelecFieldMFEMtoPUMI(apf::Mesh2* apf_mesh,
+                                         ParGridFunction* gf,
+                                         apf::Field* nedelec_field)
+{
+   apf::FieldShape* nedelecFieldShape = nedelec_field->getShape();
+   int dim = apf_mesh->getDimension();
 
-         apf::Downward faces;
-         int num_face = apf_mesh->getDownward(ent, apf::Mesh::TRIANGLE, faces);
-         for (int ii = 0; ii < num_face; ii++)
+   // loop over all elements
+   size_t elemNo = 0;
+   apf::MeshEntity* ent;
+   apf::MeshIterator* it = apf_mesh->begin(dim);
+   while ( (ent = apf_mesh->iterate(it)) )
+   {
+      // get all the pumi nodes and rotate them
+      apf::NewArray<apf::Vector3> pumi_nodes;
+      apf::getElementNodeXis(nedelecFieldShape, apf_mesh, ent, pumi_nodes);
+      IntegrationRule mfem_nodes = ParentXisPUMItoMFEM(
+                                      apf_mesh, ent, elemNo, pumi_nodes, true);
+      // evaluate the vector field on the mfem nodes
+      ElementTransformation* eltr = this->GetElementTransformation(elemNo);
+      DenseMatrix mfem_field_vals;
+      gf->GetVectorValues(*eltr, mfem_nodes, mfem_field_vals);
+
+      // compute and store dofs on ND field
+      int non = 0;
+      for (int d = 0; d <= dim; d++)
+      {
+         if (!nedelecFieldShape->hasNodesIn(d)) { continue; }
+         apf::Downward a;
+         int na = apf_mesh->getDownward(ent,d,a);
+         for (int i = 0; i < na; i++)
          {
-            if ( ndOnFace > 1)
-            {
-               es->alignSharedNodes(apf_mesh, ent, faces[ii], order);
-            }
-            else
+            int type = apf_mesh->getType(a[i]);
+            int nan = nedelecFieldShape->countNodesOn(type);
+            apf::MeshElement* me = apf::createMeshElement(apf_mesh, a[i]);
+            for (int n = 0; n < nan; n++)
             {
-               order[0] = 0;
+               apf::Vector3 xi, tangent;
+               nedelecFieldShape->getNodeXi(type, n, xi);
+               nedelecFieldShape->getNodeTangent(type, n, tangent);
+               apf::Vector3 pumi_field_vector(mfem_field_vals.GetColumn(non));
+               apf::Matrix3x3 J;
+               apf::getJacobian(me, xi, J);
+               double dof = (J * pumi_field_vector) * tangent;
+               apf::setScalar(nedelec_field, a[i], n, dof);
+               non++;
             }
-            for (int jj = 0; jj < ndOnFace; jj++)
-            {
-               int cnt = dofId + order[jj];
-               double mag = u_vel[cnt] * u_vel[cnt] +
-                            v_vel[cnt] * v_vel[cnt] +
-                            w_vel[cnt] * w_vel[cnt];
-               mag = sqrt(mag);
-               apf::setScalar(VelMagField, faces[ii], jj, mag);
-
-               // Set vel
-               double vels[3] = {u_vel[cnt], v_vel[cnt], w_vel[cnt]};
-               apf::setComponents(VelField, faces[ii], jj, vels);
-            }
-            // Counter
-            dofId += ndOnFace;
+            apf::destroyMeshElement(me);
          }
       }
-
-      iel++;
+      elemNo++;
    }
-   apf_mesh->end(itr);
+   apf_mesh->end(it); // end loop over all elements
 }
 
 void ParPumiMesh::FieldPUMItoMFEM(apf::Mesh2* apf_mesh,
-                                  apf::Field* ScalarField,
-                                  ParGridFunction* Pr)
+                                  apf::Field* field,
+                                  ParGridFunction* grid)
 {
-   // Pr->Update();
-   // Find local numbering
-   v_num_loc = apf_mesh->findNumbering("LocalVertexNumbering");
-
-   // Loop over field to copy
-   getShape(ScalarField);
-   apf::MeshEntity* ent;
-   apf::MeshIterator* itr = apf_mesh->begin(0);
-   while ((ent = apf_mesh->iterate(itr)))
-   {
-      unsigned int id = apf::getNumber(v_num_loc, ent, 0, 0);
-      double fieldVal = apf::getScalar(ScalarField, ent, 0);
+   int nc = apf::countComponents(field);
+   ParFiniteElementSpace* fes = grid->ParFESpace();
+   ParMesh* pmesh = fes->GetParMesh();
 
-      (Pr->GetData())[id] = fieldVal;
-   }
-   apf_mesh->end(itr);
+   int dim = apf_mesh->getDimension();
 
-   // Check for higher order
-   getShape(ScalarField);
-   if ( Pr->FESpace()->GetOrder(1) > 1 )
+   apf::MeshIterator* it = apf_mesh->begin(dim);
+   for (int i = 0; i < pmesh->GetNE(); i++)
    {
-      // Assume all element type are the same i.e. tetrahedral
-      const FiniteElement* H1_elem = Pr->FESpace()->GetFE(1);
-      const IntegrationRule &All_nodes = H1_elem->GetNodes();
-      int nnodes = All_nodes.Size();
-
-      // Loop over elements
-      int nc = apf::countComponents(ScalarField);
-      int iel = 0;
-      itr = apf_mesh->begin(3);
-      while ((ent = apf_mesh->iterate(itr)))
+      const FiniteElement* mfem_elem = fes->GetFE(i);
+      const IntegrationRule &mfem_xi = mfem_elem->GetNodes();
+      int non = mfem_xi.Size();
+      apf::MeshEntity* ent = apf_mesh->iterate(it);
+      apf::NewArray<apf::Vector3> pumi_xi(non);
+      ParentXisMFEMtoPUMI(apf_mesh,
+                          i,
+                          ent,
+                          mfem_xi,
+                          pumi_xi,
+                          true);
+      Array<int> vdofs;
+      fes->GetElementVDofs(i, vdofs);
+      apf::MeshElement* me = apf::createMeshElement(apf_mesh, ent);
+      apf::Element* el = apf::createElement(field, me);
+      for (int j = 0; j < non; j++)
       {
-         Array<int> vdofs;
-         Pr->FESpace()->GetElementVDofs(iel, vdofs);
-
-         // Create PUMI element to interpolate
-         apf::MeshElement* mE = apf::createMeshElement(apf_mesh, ent);
-         apf::Element* elem = apf::createElement(ScalarField, mE);
-
-         // Vertices are already interpolated
-         for (int ip = 0; ip < nnodes; ip++) //num_vert
+         apf::DynamicVector values(nc);
+         apf::getComponents(el, pumi_xi[j], &values[0]);
+         // Fill the nodes list
+         for (int c = 0; c < nc; c++)
          {
-            // Take parametric coordinates of the node
-            apf::Vector3 param;
-            param[0] = All_nodes.IntPoint(ip).x;
-            param[1] = All_nodes.IntPoint(ip).y;
-            param[2] = All_nodes.IntPoint(ip).z;
-
-            // Compute the interpolating coordinates
-            apf::DynamicVector phCrd(nc);
-            apf::getComponents(elem, param, &phCrd[0]);
-
-            // Fill the nodes list
-            for (int kk = 0; kk < nc; ++kk)
-            {
-               int dof_ctr = ip + kk * nnodes;
-               (Pr->GetData())[vdofs[dof_ctr]] = phCrd[kk];
-            }
+            int dof_loc = j + c * non;
+            (grid->GetData())[vdofs[dof_loc]] = values[c];
          }
-         iel++;
-         apf::destroyElement(elem);
-         apf::destroyMeshElement(mE);
       }
-      apf_mesh->end(itr);
+      apf::destroyElement(el);
+      apf::destroyMeshElement(me);
    }
+   apf_mesh->end(it);
 }
 
 }
diff --git a/mesh/pumi.hpp b/mesh/pumi.hpp
index 360015b2059..2cd831457dc 100644
--- a/mesh/pumi.hpp
+++ b/mesh/pumi.hpp
@@ -30,12 +30,13 @@
 #include "mesh.hpp"
 #include "pmesh.hpp"
 
-#include <pumi.h>
 #include <apf.h>
 #include <apfMesh2.h>
 #include <apfShape.h>
+#include <apfField.h>
 #include <apfNumbering.h>
 #include <apfDynamicVector.h>
+#include <maMesh.h>
 
 namespace mfem
 {
@@ -44,8 +45,6 @@ namespace mfem
 class PumiMesh : public Mesh
 {
 protected:
-   Element *ReadElement(apf::MeshEntity* Ent, const int geom, apf::Downward Verts,
-                        const int Attr, apf::Numbering* vert_num);
    void CountBoundaryEntity(apf::Mesh2* apf_mesh, const int BcDim, int &NumBC);
 
    // Readers for PUMI mesh formats, used in the Load() method.
@@ -58,7 +57,6 @@ class PumiMesh : public Mesh
             bool fix_orientation = true);
 
    using Mesh::Load;
-
    /// Load a PUMI mesh (following the steps in the MFEM Load function).
    void Load(apf::Mesh2* apf_mesh, int generate_edges = 0, int refine = 1,
              bool fix_orientation = true);
@@ -72,45 +70,82 @@ class PumiMesh : public Mesh
 class ParPumiMesh : public ParMesh
 {
 private:
+   // This has to persist during an adaptive simulation, and therefore
+   // needs to be updated each time the mesh changes.
    apf::Numbering* v_num_loc;
 
-protected:
-   Element *ReadElement(apf::MeshEntity* Ent, const int geom, apf::Downward Verts,
-                        const int Attr, apf::Numbering* vert_num);
-
 public:
    /// Build a parallel MFEM mesh from a parallel PUMI mesh.
-   ParPumiMesh(MPI_Comm comm, apf::Mesh2* apf_mesh);
-
+   ParPumiMesh(MPI_Comm comm, apf::Mesh2* apf_mesh,
+               int refine = 1, bool fix_orientation = true);
+
+
+   /// Returns the PUMI-to-MFEM permutation (aka rotation, aka orientation)
+   /** This represents the change in tet-to-vertex connectivity between
+       the PUMI and MFEM meshes. E.g.,
+       PUMI_tet{v0,v1,v2,v3}  --->  MFEM_tet{v1,v0,v3,v2}
+       * Note that a change in the orientation can be caused by
+         a) fixing wrong boundary element orientations
+         b) a call to ReorientTetMesh(), which is required for Nedelec */
+   int RotationPUMItoMFEM(apf::Mesh2* apf_mesh,
+                          apf::MeshEntity* tet,
+                          int elemId);
+   /// Convert the parent coordinate from PUMI to MFEM
+   /** By default this function assumes that there is always a
+       change in the orientation of some of the elements. In case it
+       is known for sure that there is NO change in the orientation,
+       call the function with the last argument = false */
+   IntegrationRule ParentXisPUMItoMFEM(apf::Mesh2* apf_mesh,
+                                       apf::MeshEntity* tet,
+                                       int elemId,
+                                       apf::NewArray<apf::Vector3>& pumi_xi,
+                                       bool checkOrientation = true);
+   /// Convert the parent coordinate from MFEM to PUMI
+   /** This is the inverse of ParentXisPUMItoMFEM.
+       By default this function assumes that there is always a
+       change in the orientation of some of the elements. In case it
+       is known for sure that there is NO change in the orientation,
+       call the function with the last argument = false */
+   void ParentXisMFEMtoPUMI(apf::Mesh2* apf_mesh,
+                            int elemId,
+                            apf::MeshEntity* tet,
+                            const IntegrationRule& mfem_xi,
+                            apf::NewArray<apf::Vector3>& pumi_xi,
+                            bool checkOrientation = true);
    /// Transfer field from MFEM mesh to PUMI mesh [Mixed].
    void FieldMFEMtoPUMI(apf::Mesh2* apf_mesh,
-                        ParGridFunction* Vel,
-                        ParGridFunction* Pr,
-                        apf::Field* VelField,
-                        apf::Field* PrField,
-                        apf::Field* VelMagField);
+                        ParGridFunction* grid_vel,
+                        ParGridFunction* grid_pr,
+                        apf::Field* vel_field,
+                        apf::Field* pr_field,
+                        apf::Field* vel_mag_field);
 
    /// Transfer field from MFEM mesh to PUMI mesh [Scalar].
    void FieldMFEMtoPUMI(apf::Mesh2* apf_mesh,
-                        ParGridFunction* Pr,
-                        apf::Field* PrField,
-                        apf::Field* PrMagField);
+                        ParGridFunction* grid_pr,
+                        apf::Field* pr_field,
+                        apf::Field* pr_mag_field);
 
    /// Transfer field from MFEM mesh to PUMI mesh [Vector].
    void VectorFieldMFEMtoPUMI(apf::Mesh2* apf_mesh,
-                              ParGridFunction* Vel,
-                              apf::Field* VelField,
-                              apf::Field* VelMagField);
+                              ParGridFunction* grid_vel,
+                              apf::Field* vel_field,
+                              apf::Field* vel_mag_field);
+
+   /// Transfer Nedelec field from MFEM mesh to PUMI mesh [Vector].
+   void NedelecFieldMFEMtoPUMI(apf::Mesh2* apf_mesh,
+                               ParGridFunction* gf,
+                               apf::Field* nedelec_field);
 
    /// Update the mesh after adaptation.
    void UpdateMesh(const ParMesh* AdaptedpMesh);
 
-   /// Transfer a field from PUMI to MFEM after mesh adapt [Scalar].
+   /// Transfer a field from PUMI to MFEM after mesh adapt [Scalar and Vector].
    void FieldPUMItoMFEM(apf::Mesh2* apf_mesh,
-                        apf::Field* ScalarField,
-                        ParGridFunction* Pr);
+                        apf::Field* field,
+                        ParGridFunction* grid);
 
-   virtual ~ParPumiMesh() { }
+   virtual ~ParPumiMesh() {}
 };
 
 
diff --git a/mesh/quadrilateral.hpp b/mesh/quadrilateral.hpp
index dcf312222f9..33dc6d70b70 100644
--- a/mesh/quadrilateral.hpp
+++ b/mesh/quadrilateral.hpp
@@ -69,7 +69,7 @@ class Quadrilateral : public Element
    virtual ~Quadrilateral() { }
 };
 
-extern BiLinear2DFiniteElement QuadrilateralFE;
+extern class BiLinear2DFiniteElement QuadrilateralFE;
 
 }
 
diff --git a/miniapps/gslib/CMakeLists.txt b/miniapps/gslib/CMakeLists.txt
index ba387d6ac1b..6a21042e1f6 100644
--- a/miniapps/gslib/CMakeLists.txt
+++ b/miniapps/gslib/CMakeLists.txt
@@ -22,7 +22,7 @@ if (MFEM_USE_GSLIB)
   # Parallel apps.
   if (MFEM_USE_MPI)
     add_mfem_miniapp(pfindpts
-      MAIN findpts.cpp
+      MAIN pfindpts.cpp
       LIBRARIES mfem)
   endif()
 
diff --git a/miniapps/gslib/field-diff.cpp b/miniapps/gslib/field-diff.cpp
index 0a5c2ce5644..c722359db1e 100644
--- a/miniapps/gslib/field-diff.cpp
+++ b/miniapps/gslib/field-diff.cpp
@@ -156,27 +156,14 @@ int main (int argc, char *argv[])
       }
    }
 
-   FindPointsGSLIB finder;
-   const double rel_bbox_el = 0.05;
-   const double newton_tol  = 1.0e-12;
-   const int npts_at_once   = 256;
-   Array<unsigned int> el_id_out(pts_cnt), code_out(pts_cnt), task_id_out(pts_cnt);
-   Vector pos_r_out(pts_cnt * dim), dist_p_out(pts_cnt);
+   FindPointsGSLIB finder1, finder2;
    Vector interp_vals_1(pts_cnt), interp_vals_2(pts_cnt);
 
    // First solution.
-   finder.Setup(mesh_1, rel_bbox_el, newton_tol, npts_at_once);
-   finder.FindPoints(vxyz, code_out, task_id_out,
-                     el_id_out, pos_r_out, dist_p_out);
-   finder.Interpolate(code_out, task_id_out, el_id_out,
-                      pos_r_out, func_1, interp_vals_1);
+   finder1.Interpolate(mesh_1, vxyz, func_1, interp_vals_1);
 
    // Second solution.
-   finder.Setup(mesh_2, rel_bbox_el, newton_tol, npts_at_once);
-   finder.FindPoints(vxyz, code_out, task_id_out,
-                     el_id_out, pos_r_out, dist_p_out);
-   finder.Interpolate(code_out, task_id_out, el_id_out,
-                      pos_r_out, func_2, interp_vals_2);
+   finder2.Interpolate(mesh_2, vxyz, func_2, interp_vals_2);
 
    // Compute differences between the two sets of values.
    double avg_diff = 0.0, max_diff = 0.0, diff_p;
@@ -220,15 +207,8 @@ int main (int argc, char *argv[])
    const int nodes_cnt = vxyz.Size() / dim;
 
    // Difference at the nodes of mesh 1.
-   el_id_out.SetSize(nodes_cnt); code_out.SetSize(nodes_cnt);
-   task_id_out.SetSize(nodes_cnt);
-   pos_r_out.SetSize(nodes_cnt * dim); dist_p_out.SetSize(nodes_cnt * dim);
    interp_vals_2.SetSize(nodes_cnt);
-   finder.Setup(mesh_2, rel_bbox_el, newton_tol, npts_at_once);
-   finder.FindPoints(vxyz, code_out, task_id_out,
-                     el_id_out, pos_r_out, dist_p_out);
-   finder.Interpolate(code_out, task_id_out, el_id_out,
-                      pos_r_out, func_2, interp_vals_2);
+   finder2.Interpolate(vxyz, func_2, interp_vals_2);
    for (int n = 0; n < nodes_cnt; n++)
    {
       diff(n) = fabs(func_1(n) - interp_vals_2(n));
@@ -258,7 +238,8 @@ int main (int argc, char *argv[])
    std::cout << "Vol diff: " << vol_diff << std::endl;
 
    // Free the internal gslib data.
-   finder.FreeData();
+   finder1.FreeData();
+   finder2.FreeData();
 
    return 0;
 }
diff --git a/miniapps/gslib/findpts.cpp b/miniapps/gslib/findpts.cpp
index 2aba55e5a33..40dc524e441 100644
--- a/miniapps/gslib/findpts.cpp
+++ b/miniapps/gslib/findpts.cpp
@@ -132,13 +132,6 @@ int main (int argc, char *argv[])
       }
    }
 
-   // Setup the gslib mesh.
-   FindPointsGSLIB finder;
-   const double rel_bbox_el = 0.05;
-   const double newton_tol  = 1.0e-12;
-   const int npts_at_once   = 256;
-   finder.Setup(mesh, rel_bbox_el, newton_tol, npts_at_once);
-
    // Generate equidistant points in physical coordinates over the whole mesh.
    // Note that some points might be outside, if the mesh is not a box. Note
    // also that all tasks search the same points (not mandatory).
@@ -169,18 +162,13 @@ int main (int argc, char *argv[])
       }
    }
 
-   Array<unsigned int> el_id_out(pts_cnt), code_out(pts_cnt),
-         task_id_out(pts_cnt);
-   Vector pos_r_out(pts_cnt * dim), dist_p_out(pts_cnt);
-
-   // Finds points stored in vxyz.
-   finder.FindPoints(vxyz, code_out, task_id_out,
-                     el_id_out, pos_r_out, dist_p_out);
-
-   // Interpolate FE function values on the found points.
+   // Find and interpolate FE function values on the desired points.
    Vector interp_vals(pts_cnt);
-   finder.Interpolate(code_out, task_id_out, el_id_out,
-                      pos_r_out, field_vals, interp_vals);
+   // FindPoints using GSLIB and interpolate
+   FindPointsGSLIB finder;
+   finder.Interpolate(mesh, vxyz, field_vals, interp_vals);
+   Array<unsigned int> code_out = finder.GetCode();
+   Vector dist_p_out = finder.GetDist();
 
    // Free the internal gslib data.
    finder.FreeData();
diff --git a/miniapps/gslib/pfindpts.cpp b/miniapps/gslib/pfindpts.cpp
index a380b5b6db5..6832bb8511a 100644
--- a/miniapps/gslib/pfindpts.cpp
+++ b/miniapps/gslib/pfindpts.cpp
@@ -160,13 +160,6 @@ int main (int argc, char *argv[])
       }
    }
 
-   // Setup the gslib mesh.
-   FindPointsGSLIB finder(MPI_COMM_WORLD);
-   const double rel_bbox_el = 0.05;
-   const double newton_tol  = 1.0e-12;
-   const int npts_at_once   = 256;
-   finder.Setup(pmesh, rel_bbox_el, newton_tol, npts_at_once);
-
    // Generate equidistant points in physical coordinates over the whole mesh.
    // Note that some points might be outside, if the mesh is not a box. Note
    // also that all tasks search the same points (not mandatory).
@@ -197,21 +190,14 @@ int main (int argc, char *argv[])
       }
    }
 
-   Array<unsigned int> el_id_out(pts_cnt), code_out(pts_cnt),
-         task_id_out(pts_cnt);
-   Vector pos_r_out(pts_cnt * dim), dist_p_out(pts_cnt);
-
-   // Finds points stored in vxyz.
-   finder.FindPoints(vxyz, code_out, task_id_out,
-                     el_id_out, pos_r_out, dist_p_out);
-
-   // Interpolate FE function values on the found points.
+   // Find and interpolate FE function values on the desired points.
    Vector interp_vals(pts_cnt);
-   finder.Interpolate(code_out, task_id_out, el_id_out,
-                      pos_r_out, field_vals, interp_vals);
-
-   // Free the internal gslib data.
-   finder.FreeData();
+   // FindPoints using GSLIB and interpolate
+   FindPointsGSLIB finder(MPI_COMM_WORLD);
+   finder.Interpolate(pmesh, vxyz, field_vals, interp_vals);
+   Array<unsigned int> code_out    = finder.GetCode();
+   Array<unsigned int> task_id_out = finder.GetProc();
+   Vector dist_p_out = finder.GetDist();
 
    int face_pts = 0, not_found = 0, found_loc = 0, found_away = 0;
    double max_err = 0.0, max_dist = 0.0;
@@ -246,6 +232,8 @@ int main (int argc, char *argv[])
            << "\nPoints on faces:      " << face_pts << endl;
    }
 
+   // Free the internal gslib data.
+   finder.FreeData();
    MPI_Finalize();
    return 0;
 }
diff --git a/miniapps/meshing/CMakeLists.txt b/miniapps/meshing/CMakeLists.txt
index 626bd2c999b..e5afee4fc4b 100644
--- a/miniapps/meshing/CMakeLists.txt
+++ b/miniapps/meshing/CMakeLists.txt
@@ -31,7 +31,8 @@ add_mfem_miniapp(extruder
 
 add_mfem_miniapp(mesh-optimizer
   MAIN mesh-optimizer.cpp
-  LIBRARIES mfem)
+  ${MFEM_MINIAPPS_COMMON_HEADERS}
+  LIBRARIES mfem mfem-common)
 
 add_mfem_miniapp(minimal-surface
   MAIN minimal-surface.cpp
@@ -49,13 +50,14 @@ add_mfem_miniapp(twist
 add_test(NAME mesh-optimizer
   COMMAND mesh-optimizer -no-vis -m ${CMAKE_CURRENT_SOURCE_DIR}/icf.mesh)
 
-add_test(NAME minimal-surface COMMAND minimal-surface)
+add_test(NAME minimal-surface COMMAND minimal-surface -no-vis)
 
 # Parallel apps.
 if (MFEM_USE_MPI)
   add_mfem_miniapp(pmesh-optimizer
     MAIN pmesh-optimizer.cpp
-    LIBRARIES mfem)
+    ${MFEM_MINIAPPS_COMMON_HEADERS}
+    LIBRARIES mfem mfem-common)
 
   add_mfem_miniapp(pminimal-surface
     MAIN pminimal-surface.cpp
@@ -70,6 +72,6 @@ if (MFEM_USE_MPI)
 
   add_test(NAME pminimal-surface_np=4
     COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${MFEM_MPI_NP}
-    ${MPIEXEC_PREFLAGS} $<TARGET_FILE:pminimal-surface>
+    ${MPIEXEC_PREFLAGS} $<TARGET_FILE:pminimal-surface> -no-vis
     ${MPIEXEC_POSTFLAGS})
 endif()
diff --git a/miniapps/meshing/cube.mesh b/miniapps/meshing/cube.mesh
new file mode 100644
index 00000000000..6b3a3e28ca3
--- /dev/null
+++ b/miniapps/meshing/cube.mesh
@@ -0,0 +1,439 @@
+MFEM mesh v1.0
+
+#
+# MFEM Geometry Types (see mesh/geom.hpp):
+#
+# POINT       = 0
+# SEGMENT     = 1
+# TRIANGLE    = 2
+# SQUARE      = 3
+# TETRAHEDRON = 4
+# CUBE        = 5
+# PRISM       = 6
+#
+
+dimension
+3
+
+elements
+8
+1 5 0 1 4 3 9 10 13 12
+1 5 3 4 7 6 12 13 16 15
+1 5 12 13 16 15 21 22 25 24
+1 5 9 10 13 12 18 19 22 21
+1 5 10 11 14 13 19 20 23 22
+1 5 13 14 17 16 22 23 26 25
+1 5 4 5 8 7 13 14 17 16
+1 5 1 2 5 4 10 11 14 13
+
+boundary
+24
+3 3 0 3 4 1
+3 3 1 4 5 2
+3 3 3 6 7 4
+3 3 4 7 8 5
+3 3 18 19 22 21
+3 3 19 20 23 22
+3 3 21 22 25 24
+3 3 22 23 26 25
+1 3 0 9 12 3
+1 3 3 12 15 6
+1 3 9 18 21 12
+1 3 12 21 24 15
+1 3 2 5 14 11
+1 3 5 8 17 14
+1 3 11 14 23 20
+1 3 14 17 26 23
+2 3 0 1 10 9
+2 3 9 10 19 18
+2 3 1 2 11 10
+2 3 10 11 20 19
+2 3 6 15 16 7
+2 3 15 24 25 16
+2 3 7 16 17 8
+2 3 16 25 26 17
+
+vertices
+27
+
+nodes
+FiniteElementSpace
+FiniteElementCollection: H1_3D_P2
+VDim: 3
+Ordering: 0
+
+0
+0.5
+1
+0
+0.5
+1
+0
+0.5
+1
+0
+0.5
+1
+0
+0.5
+1
+0
+0.5
+1
+0
+0.5
+1
+0
+0.5
+1
+0
+0.5
+1
+0.25
+0.5
+0.25
+0
+0.25
+0.5
+0.25
+0
+0
+0.5
+0.5
+0
+0.5
+0.25
+0
+0.5
+0.25
+0
+0.5
+0
+0.25
+0.5
+0.25
+0
+0
+0.5
+0.5
+0
+0.25
+0.5
+0
+0
+0.5
+0.75
+1
+0.75
+0.75
+1
+0.75
+1
+1
+1
+0.75
+1
+0.75
+1
+0.75
+1
+0.75
+1
+1
+0.75
+1
+1
+0.25
+0.25
+0.5
+0.25
+0
+0.25
+0.25
+0.5
+0.25
+0
+0.25
+0.25
+0.5
+0.25
+0
+0.25
+0.25
+0.5
+0
+0.25
+0.75
+0.75
+1
+0.75
+0.75
+0.75
+1
+0.75
+0.75
+0.75
+0.75
+1
+0.75
+0.75
+0.75
+1
+0.25
+0.25
+0.25
+0.25
+0.75
+0.75
+0.75
+0.75
+0
+0
+0
+0.5
+0.5
+0.5
+1
+1
+1
+0
+0
+0
+0.5
+0.5
+0.5
+1
+1
+1
+0
+0
+0
+0.5
+0.5
+0.5
+1
+1
+1
+0
+0.25
+0.5
+0.25
+0
+0.25
+0.5
+0.25
+0
+0
+0.5
+0.5
+0.75
+1
+0.75
+0.75
+1
+0.75
+1
+1
+0.5
+0.75
+1
+0.75
+0.5
+0.5
+1
+1
+0
+0.25
+0.25
+0
+0
+0
+0.25
+0.5
+0
+0.25
+0.5
+0
+0.5
+0.75
+1
+0.75
+1
+1
+0.5
+0.75
+1
+0.5
+1
+0
+0.25
+0
+0.25
+0
+0.25
+0.5
+0.25
+0.25
+0.75
+0.75
+1
+0.75
+0.75
+0.5
+0.75
+1
+0.75
+0.75
+0
+0.25
+0.25
+0.25
+0.25
+0
+0.25
+0.5
+0.25
+0.75
+0.75
+1
+0.75
+0.75
+0.5
+0.75
+1
+0.25
+0
+0.25
+0.25
+0.75
+0.75
+0.25
+0.25
+0.75
+0.75
+0.25
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0.5
+0.5
+0.5
+0.5
+0.5
+0.5
+0.5
+0.5
+0.5
+1
+1
+1
+1
+1
+1
+1
+1
+1
+0
+0
+0
+0
+0.5
+0.5
+0.5
+0.5
+0.25
+0.25
+0.25
+0.25
+0
+0
+0
+0.5
+0.5
+0.5
+0.25
+0.25
+1
+1
+1
+1
+0.75
+0.75
+0.75
+0.75
+1
+1
+1
+0.75
+0.75
+0.5
+0.5
+0.5
+1
+1
+1
+0.75
+0.75
+0.5
+0.5
+1
+1
+0.75
+0
+0
+0
+0.25
+0.25
+0
+0
+0.25
+0
+0.25
+0.25
+0.25
+0.25
+0.5
+0
+0.25
+0.25
+0.25
+0.5
+0.75
+0.75
+0.75
+0.75
+1
+0.75
+0.75
+0.75
+1
+0.5
+0.75
+0.75
+0.75
+1
+0.5
+0.75
+0.75
+1
+0
+0.25
+0.25
+0.25
+0
+0.25
+0.25
+0.25
+0.25
+0.75
+0.75
+0.75
+0.75
+0.25
+0.25
diff --git a/miniapps/meshing/makefile b/miniapps/meshing/makefile
index fc811ba9d62..dac14839dd4 100644
--- a/miniapps/meshing/makefile
+++ b/miniapps/meshing/makefile
@@ -18,6 +18,10 @@ CONFIG_MK = $(MFEM_BUILD_DIR)/config/config.mk
 # MFEM_INSTALL_DIR = ../../mfem
 # CONFIG_MK = $(MFEM_INSTALL_DIR)/share/mfem/config.mk
 
+# Include defaults.mk to get XLINKER
+DEFAULTS_MK = $(MFEM_DIR)/config/defaults.mk
+include $(DEFAULTS_MK)
+
 MFEM_LIB_FILE = mfem_is_not_built
 -include $(CONFIG_MK)
 
@@ -31,19 +35,30 @@ else
    MINIAPPS = $(PAR_MINIAPPS) $(SEQ_MINIAPPS)
 endif
 
+COMMON_LIB = -L$(MFEM_BUILD_DIR)/miniapps/common -lmfem-common
+
+# If MFEM_SHARED is set, add the ../common rpath
+COMMON_LIB += $(if $(MFEM_SHARED:YES=),,\
+   $(if $(MFEM_USE_CUDA:YES=),$(CXX_XLINKER),$(CUDA_XLINKER))-rpath,$(abspath\
+   $(MFEM_BUILD_DIR)/miniapps/common))
+
 .SUFFIXES:
 .SUFFIXES: .o .cpp .mk
-.PHONY: all clean clean-build clean-exec
+.PHONY: all lib-common clean clean-build clean-exec
 
 # Remove built-in rule
 %: %.cpp
 
 # Replace the default implicit rule for *.cpp files
-%: $(SRC)%.cpp $(MFEM_LIB_FILE) $(CONFIG_MK)
-	$(MFEM_CXX) $(MFEM_FLAGS) $< -o $@ $(MFEM_LIBS)
+%: $(SRC)%.cpp $(MFEM_LIB_FILE) $(CONFIG_MK) | lib-common
+	$(MFEM_CXX) $(MFEM_FLAGS) $< -o $@ $(COMMON_LIB) $(MFEM_LIBS)
 
 all: $(MINIAPPS)
 
+# Rule for building lib-common
+lib-common:
+	$(MAKE) -C $(MFEM_BUILD_DIR)/miniapps/common
+
 # Rules to copy the *.mesh files - needed for running the sample runs when
 # building out-of-source:
 ifneq ($(SRC),)
diff --git a/miniapps/meshing/mesh-optimizer.cpp b/miniapps/meshing/mesh-optimizer.cpp
index 380df0c20f9..b3642f12428 100644
--- a/miniapps/meshing/mesh-optimizer.cpp
+++ b/miniapps/meshing/mesh-optimizer.cpp
@@ -35,16 +35,31 @@
 //   Adapted analytic Hessian:
 //     mesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 2 -tid 4 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8
 //   Adapted analytic Hessian with size+orientation:
-//     mesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 14 -tid 4 -ni 100 -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd 1
+//     mesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 14 -tid 4 -ni 100 -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd
 //   Adapted analytic Hessian with shape+size+orientation
-//     mesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 87 -tid 4 -ni 100 -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd 1
+//     mesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 87 -tid 4 -ni 100 -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd
 //   Adapted discrete size:
 //     mesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 7 -tid 5 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8
 //     mesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 2 -tid 5 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8 -cmb 2 -nor
+//
+//   Adapted size+aspect ratio to discrete material indicator
+//     mesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 7 -tid 6 -ni 100  -ls 2 -li 100 -bnd -qt 1 -qo 8
+//   Adapted discrete size+orientation (requires GSLIB)
+//   * mesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 14 -tid 8 -ni 100  -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd -ae 1
+//   Adapted discrete aspect-ratio+orientation (requires GSLIB)
+//   * mesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 87 -tid 8 -ni 10  -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd -ae 1
+//   Adapted discrete aspect ratio (3D)
+//     mesh-optimizer -m cube.mesh -o 2 -rs 2 -mid 302 -tid 7 -ni 20  -ls 2 -li 100 -bnd -qt 1 -qo 8
+//
+//   Adaptive limiting:
+//     mesh-optimizer -m stretched2D.mesh -o 2 -mid 2 -tid 1 -ni 50 -qo 5 -nor -vl 1 -alc 0.5 -ae 0
+//   Adaptive limiting through FD (requires GSLIB):
+//   * mesh-optimizer -m stretched2D.mesh -o 2 -mid 2 -tid 1 -ni 50 -qo 5 -nor -vl 1 -alc 0.5 -fd -ae 1
+//
 //   Blade shape:
 //     mesh-optimizer -m blade.mesh -o 4 -rs 0 -mid 2 -tid 1 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8
 //   Blade shape with FD-based solver:
-//     mesh-optimizer -m blade.mesh -o 4 -rs 0 -mid 2 -tid 1 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd 1
+//     mesh-optimizer -m blade.mesh -o 4 -rs 0 -mid 2 -tid 1 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd
 //   Blade limited shape:
 //     mesh-optimizer -m blade.mesh -o 4 -rs 0 -mid 2 -tid 1 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8 -lc 5000
 //   ICF shape and equal size:
@@ -64,191 +79,14 @@
 
 
 #include "mfem.hpp"
+#include "../common/mfem-common.hpp"
 #include <fstream>
 #include <iostream>
+#include "mesh-optimizer.hpp"
 
 using namespace mfem;
 using namespace std;
 
-double weight_fun(const Vector &x);
-
-double ind_values(const Vector &x)
-{
-   const int opt = 6;
-   const double small = 0.001, big = 0.01;
-   double val = 0.;
-
-   // Sine wave.
-   if (opt == 1)
-   {
-      const double X = x(0), Y = x(1);
-      val = std::tanh((10*(Y-0.5) + std::sin(4.0*M_PI*X)) + 1) -
-            std::tanh((10*(Y-0.5) + std::sin(4.0*M_PI*X)) - 1);
-   }
-   else if (opt == 2)
-   {
-      // Circle in the middle.
-      const double xc = x(0) - 0.5, yc = x(1) - 0.5;
-      const double r = sqrt(xc*xc + yc*yc);
-      double r1 = 0.15; double r2 = 0.35; double sf=30.0;
-      val = 0.5*(std::tanh(sf*(r-r1)) - std::tanh(sf*(r-r2)));
-   }
-   else if (opt == 3)
-   {
-      // cross
-      const double X = x(0), Y = x(1);
-      const double r1 = 0.45, r2 = 0.55;
-      const double sf = 40.0;
-
-      val = 0.5 * (std::tanh(sf*(X-r1)) - std::tanh(sf*(X-r2)) +
-                   std::tanh(sf*(Y-r1)) - std::tanh(sf*(Y-r2)));
-   }
-   else if (opt == 4)
-   {
-      // Multiple circles
-      double r1,r2,val,rval;
-      double sf = 10;
-      val = 0.;
-      // circle 1
-      r1= 0.25; r2 = 0.25; rval = 0.1;
-      double xc = x(0) - r1, yc = x(1) - r2;
-      double r = sqrt(xc*xc+yc*yc);
-      val = 0.5*(1+std::tanh(sf*(r+rval))) -
-            0.5*(1+std::tanh(sf*(r-rval))); // std::exp(val1);
-      // circle 2
-      r1= 0.75; r2 = 0.75;
-      xc = x(0) - r1, yc = x(1) - r2;
-      r = sqrt(xc*xc+yc*yc);
-      val += (0.5*(1+std::tanh(sf*(r+rval))) -
-              0.5*(1+std::tanh(sf*(r-rval)))); // std::exp(val1);
-      // circle 3
-      r1= 0.75; r2 = 0.25;
-      xc = x(0) - r1, yc = x(1) - r2;
-      r = sqrt(xc*xc+yc*yc);
-      val += 0.5*(1+std::tanh(sf*(r+rval))) -
-             0.5*(1+std::tanh(sf*(r-rval))); // std::exp(val1);
-      // circle 4
-      r1= 0.25; r2 = 0.75;
-      xc = x(0) - r1, yc = x(1) - r2;
-      r = sqrt(xc*xc+yc*yc);
-      val += 0.5*(1+std::tanh(sf*(r+rval))) -
-             0.5*(1+std::tanh(sf*(r-rval)));
-   }
-   else if (opt == 5)
-   {
-      // cross
-      double X = x(0)-0.5, Y = x(1)-0.5;
-      double rval = std::sqrt(X*X + Y*Y);
-      double thval = 60.*M_PI/180.;
-      double Xmod,Ymod;
-      Xmod = X*std::cos(thval) + Y*std::sin(thval);
-      Ymod= -X*std::sin(thval) + Y*std::cos(thval);
-      X = Xmod+0.5; Y = Ymod+0.5;
-      double r1 = 0.45; double r2 = 0.55; double sf=30.0;
-      val = (0.5*(1+std::tanh(sf*(X-r1))) - 0.5*(1+std::tanh(sf*(X-r2))) +
-             0.5*(1+std::tanh(sf*(Y-r1))) - 0.5*(1+std::tanh(sf*(Y-r2))));
-      if (rval > 0.4) { val = 0.; }
-   }
-   else if (opt == 6)
-   {
-      const double xc = x(0) - 0.0, yc = x(1) - 0.5;
-      const double r = sqrt(xc*xc + yc*yc);
-      double r1 = 0.45; double r2 = 0.55; double sf=30.0;
-      val = 0.5*(1+std::tanh(sf*(r-r1))) - 0.5*(1+std::tanh(sf*(r-r2)));
-   }
-
-   val = std::max(0.,val);
-   val = std::min(1.,val);
-
-   return val * small + (1.0 - val) * big;
-}
-
-class HessianCoefficient : public MatrixCoefficient
-{
-private:
-   int metric;
-
-public:
-   HessianCoefficient(int dim, int metric_id)
-      : MatrixCoefficient(dim), metric(metric_id) { }
-
-   virtual void Eval(DenseMatrix &K, ElementTransformation &T,
-                     const IntegrationPoint &ip)
-   {
-      Vector pos(3);
-      T.Transform(ip, pos);
-      if (metric != 14 && metric != 87)
-      {
-         const double xc = pos(0) - 0.5, yc = pos(1) - 0.5;
-         const double r = sqrt(xc*xc + yc*yc);
-         double r1 = 0.15; double r2 = 0.35; double sf=30.0;
-         const double eps = 0.5;
-
-         const double tan1 = std::tanh(sf*(r-r1)),
-                      tan2 = std::tanh(sf*(r-r2));
-
-         K(0, 0) = eps + 1.0 * (tan1 - tan2);
-         K(0, 1) = 0.0;
-         K(1, 0) = 0.0;
-         K(1, 1) = 1.0;
-      }
-      else if (metric == 14) // Size + Alignment
-      {
-         const double xc = pos(0), yc = pos(1);
-         double theta = M_PI * yc * (1.0 - yc) * cos(2 * M_PI * xc);
-         double alpha_bar = 0.1;
-
-         K(0, 0) =  cos(theta);
-         K(1, 0) =  sin(theta);
-         K(0, 1) = -sin(theta);
-         K(1, 1) =  cos(theta);
-
-         K *= alpha_bar;
-      }
-      else if (metric == 87) // Shape + Size + Alignment
-      {
-         Vector x = pos;
-         double xc = x(0)-0.5, yc = x(1)-0.5;
-         double th = 22.5*M_PI/180.;
-         double xn =  cos(th)*xc + sin(th)*yc;
-         double yn = -sin(th)*xc + cos(th)*yc;
-         double th2 = (th > 45.*M_PI/180) ? M_PI/2 - th : th;
-         double stretch = 1/cos(th2);
-         xc = xn/stretch; yc = yn/stretch;
-         xc = xn; yc=yn;
-
-         double tfac = 20;
-         double s1 = 3;
-         double s2 = 2;
-         double wgt = std::tanh((tfac*(yc) + s2*std::sin(s1*M_PI*xc)) + 1)
-                      - std::tanh((tfac*(yc) + s2*std::sin(s1*M_PI*xc)) - 1);
-         if (wgt > 1) { wgt = 1; }
-         if (wgt < 0) { wgt = 0; }
-         double  val = wgt;
-
-         xc = pos(0), yc = pos(1);
-         double theta = M_PI * (yc) * (1.0 - yc) * cos(2 * M_PI * xc);
-
-         K(0, 0) =  cos(theta);
-         K(1, 0) =  sin(theta);
-         K(0, 1) = -sin(theta);
-         K(1, 1) =  cos(theta);
-
-         double asp_ratio_tar = 0.1 + 1*(1-val)*(1-val);
-
-         K(0, 0) *=  1/pow(asp_ratio_tar,0.5);
-         K(1, 0) *=  1/pow(asp_ratio_tar,0.5);
-         K(0, 1) *=  pow(asp_ratio_tar,0.5);
-         K(1, 1) *=  pow(asp_ratio_tar,0.5);
-      }
-   }
-};
-
-// Additional IntegrationRules that can be used with the --quad-type option.
-IntegrationRules IntRulesLo(0, Quadrature1D::GaussLobatto);
-IntegrationRules IntRulesCU(0, Quadrature1D::ClosedUniform);
-
-
 int main(int argc, char *argv[])
 {
    // 0. Set the method's default parameters.
@@ -259,6 +97,7 @@ int main(int argc, char *argv[])
    int metric_id         = 1;
    int target_id         = 1;
    double lim_const      = 0.0;
+   double adapt_lim_const = 0.0;
    int quad_type         = 1;
    int quad_order        = 8;
    int newton_iter       = 10;
@@ -270,7 +109,8 @@ int main(int argc, char *argv[])
    bool normalization    = false;
    bool visualization    = true;
    int verbosity_level   = 0;
-   int fdscheme          = 0;
+   bool fdscheme         = false;
+   int adapt_eval        = 0;
 
    // 1. Parse command-line options.
    OptionsParser args(argc, argv);
@@ -312,6 +152,8 @@ int main(int argc, char *argv[])
                   "4: Given full analytic Jacobian (in physical space)\n\t"
                   "5: Ideal shape, given size (in physical space)");
    args.AddOption(&lim_const, "-lc", "--limit-const", "Limiting constant.");
+   args.AddOption(&adapt_lim_const, "-alc", "--adapt-limit-const",
+                  "Adaptive limiting coefficient constant.");
    args.AddOption(&quad_type, "-qt", "--quad-type",
                   "Quadrature rule type:\n\t"
                   "1: Gauss-Lobatto\n\t"
@@ -339,12 +181,15 @@ int main(int argc, char *argv[])
                   "--no-normalization",
                   "Make all terms in the optimization functional unitless.");
    args.AddOption(&fdscheme, "-fd", "--fd_approximation",
+                  "-no-fd", "--no-fd-approx",
                   "Enable finite difference based derivative computations.");
    args.AddOption(&visualization, "-vis", "--visualization", "-no-vis",
                   "--no-visualization",
                   "Enable or disable GLVis visualization.");
    args.AddOption(&verbosity_level, "-vl", "--verbosity-level",
                   "Set the verbosity level - 0, 1, or 2.");
+   args.AddOption(&adapt_eval, "-ae", "--adaptivity-evaluator",
+                  "0 - Advection based (DEFAULT), 1 - GSLIB.");
    args.Parse();
    if (!args.Good())
    {
@@ -485,13 +330,15 @@ int main(int argc, char *argv[])
    HessianCoefficient *adapt_coeff = NULL;
    H1_FECollection ind_fec(mesh_poly_deg, dim);
    FiniteElementSpace ind_fes(mesh, &ind_fec);
-   GridFunction size;
+   FiniteElementSpace ind_fesv(mesh, &ind_fec, dim);
+   GridFunction size(&ind_fes), aspr(&ind_fes), disc(&ind_fes), ori(&ind_fes);
+   GridFunction aspr3d(&ind_fesv), size3d(&ind_fesv);
    switch (target_id)
    {
       case 1: target_t = TargetConstructor::IDEAL_SHAPE_UNIT_SIZE; break;
       case 2: target_t = TargetConstructor::IDEAL_SHAPE_EQUAL_SIZE; break;
       case 3: target_t = TargetConstructor::IDEAL_SHAPE_GIVEN_SIZE; break;
-      case 4:
+      case 4: // Analytic
       {
          target_t = TargetConstructor::GIVEN_FULL;
          AnalyticAdaptTC *tc = new AnalyticAdaptTC(target_t);
@@ -500,19 +347,181 @@ int main(int argc, char *argv[])
          target_c = tc;
          break;
       }
-      case 5:
+      case 5: // Discrete size 2D
       {
          target_t = TargetConstructor::IDEAL_SHAPE_GIVEN_SIZE;
          DiscreteAdaptTC *tc = new DiscreteAdaptTC(target_t);
-         size.SetSpace(&ind_fes);
-         FunctionCoefficient ind_coeff(ind_values);
+         if (adapt_eval == 0)
+         {
+            tc->SetAdaptivityEvaluator(new AdvectorCG);
+         }
+         else
+         {
+#ifdef MFEM_USE_GSLIB
+            tc->SetAdaptivityEvaluator(new InterpolatorFP);
+#else
+            MFEM_ABORT("MFEM is not built with GSLIB.");
+#endif
+         }
+         FunctionCoefficient ind_coeff(discrete_size_2d);
          size.ProjectCoefficient(ind_coeff);
+         tc->SetSerialDiscreteTargetSize(size);
+         target_c = tc;
+         break;
+      }
+      case 6: // Discrete size + aspect ratio - 2D
+      {
+         GridFunction d_x(&ind_fes), d_y(&ind_fes);
+
+         target_t = TargetConstructor::GIVEN_SHAPE_AND_SIZE;
+         DiscreteAdaptTC *tc = new DiscreteAdaptTC(target_t);
+         FunctionCoefficient ind_coeff(material_indicator_2d);
+         disc.ProjectCoefficient(ind_coeff);
+         if (adapt_eval == 0)
+         {
+            tc->SetAdaptivityEvaluator(new AdvectorCG);
+         }
+         else
+         {
+#ifdef MFEM_USE_GSLIB
+            tc->SetAdaptivityEvaluator(new InterpolatorFP);
+#else
+            MFEM_ABORT("MFEM is not built with GSLIB.");
+#endif
+         }
+
+         //Diffuse the interface
+         DiffuseField(disc,2);
+
+         //Get  partials with respect to x and y of the grid function
+         disc.GetDerivative(1,0,d_x);
+         disc.GetDerivative(1,1,d_y);
+
+         //Compute the squared magnitude of the gradient
+         for (int i = 0; i < size.Size(); i++)
+         {
+            size(i) = std::pow(d_x(i),2)+std::pow(d_y(i),2);
+         }
+         const double max = size.Max();
+
+         for (int i = 0; i < d_x.Size(); i++)
+         {
+            d_x(i) = std::abs(d_x(i));
+            d_y(i) = std::abs(d_y(i));
+         }
+         const double eps = 0.01;
+         const double aspr_ratio = 20.0;
+         const double size_ratio = 40.0;
+
+         for (int i = 0; i < size.Size(); i++)
+         {
+            size(i) = (size(i)/max);
+            aspr(i) = (d_x(i)+eps)/(d_y(i)+eps);
+            aspr(i) = 0.1 + 0.9*(1-size(i))*(1-size(i));
+            if (aspr(i) > aspr_ratio) {aspr(i) = aspr_ratio;}
+            if (aspr(i) < 1.0/aspr_ratio) {aspr(i) = 1.0/aspr_ratio;}
+         }
+         Vector vals;
+         const int NE = mesh->GetNE();
+         double volume = 0.0, volume_ind = 0.0;
+
+         for (int i = 0; i < NE; i++)
+         {
+            ElementTransformation *Tr = mesh->GetElementTransformation(i);
+            const IntegrationRule &ir =
+               IntRules.Get(mesh->GetElementBaseGeometry(i), Tr->OrderJ());
+            size.GetValues(i, ir, vals);
+            for (int j = 0; j < ir.GetNPoints(); j++)
+            {
+               const IntegrationPoint &ip = ir.IntPoint(j);
+               Tr->SetIntPoint(&ip);
+               volume     += ip.weight * Tr->Weight();
+               volume_ind += vals(j) * ip.weight * Tr->Weight();
+            }
+         }
+
+         const double avg_zone_size = volume / NE;
+
+         const double small_avg_ratio = (volume_ind + (volume - volume_ind) /
+                                         size_ratio) /
+                                        volume;
+
+         const double small_zone_size = small_avg_ratio * avg_zone_size;
+         const double big_zone_size   = size_ratio * small_zone_size;
+
+         for (int i = 0; i < size.Size(); i++)
+         {
+            const double val = size(i);
+            const double a = (big_zone_size - small_zone_size) / small_zone_size;
+            size(i) = big_zone_size / (1.0+a*val);
+         }
+
+         DiffuseField(size, 2);
+         DiffuseField(aspr, 2);
+
+         tc->SetSerialDiscreteTargetSize(size);
+         tc->SetSerialDiscreteTargetAspectRatio(aspr);
+         target_c = tc;
+         break;
+      }
+      case 7: // Discrete aspect ratio 3D
+      {
+         target_t = TargetConstructor::GIVEN_SHAPE_AND_SIZE;
+         DiscreteAdaptTC *tc = new DiscreteAdaptTC(target_t);
+         if (adapt_eval == 0)
+         {
+            tc->SetAdaptivityEvaluator(new AdvectorCG);
+         }
+         else
+         {
+#ifdef MFEM_USE_GSLIB
+            tc->SetAdaptivityEvaluator(new InterpolatorFP);
+#else
+            MFEM_ABORT("MFEM is not built with GSLIB.");
+#endif
+         }
+         VectorFunctionCoefficient fd_aspr3d(dim, discrete_aspr_3d);
+         aspr3d.ProjectCoefficient(fd_aspr3d);
+
+         tc->SetSerialDiscreteTargetAspectRatio(aspr3d);
+         target_c = tc;
+         break;
+      }
+      case 8: // shape/size + orientation 2D
+      {
+         target_t = TargetConstructor::GIVEN_SHAPE_AND_SIZE;
+         DiscreteAdaptTC *tc = new DiscreteAdaptTC(target_t);
+         if (adapt_eval == 0)
+         {
+            tc->SetAdaptivityEvaluator(new AdvectorCG);
+         }
+         else
+         {
 #ifdef MFEM_USE_GSLIB
-         tc->SetAdaptivityEvaluator(new InterpolatorFP);
+            tc->SetAdaptivityEvaluator(new InterpolatorFP);
 #else
-         tc->SetAdaptivityEvaluator(new AdvectorCG);
+            MFEM_ABORT("MFEM is not built with GSLIB.");
 #endif
-         tc->SetSerialDiscreteTargetSpec(size);
+         }
+
+         if (metric_id == 14)
+         {
+            ConstantCoefficient ind_coeff(0.1*0.1);
+            size.ProjectCoefficient(ind_coeff);
+            tc->SetSerialDiscreteTargetSize(size);
+         }
+
+         if (metric_id == 87)
+         {
+            FunctionCoefficient aspr_coeff(discrete_aspr_2d);
+            aspr.ProjectCoefficient(aspr_coeff);
+            DiffuseField(aspr,2);
+            tc->SetSerialDiscreteTargetAspectRatio(aspr);
+         }
+
+         FunctionCoefficient ori_coeff(discrete_ori_2d);
+         ori.ProjectCoefficient(ori_coeff);
+         tc->SetSerialDiscreteTargetOrientation(ori);
          target_c = tc;
          break;
       }
@@ -551,6 +560,35 @@ int main(int argc, char *argv[])
    ConstantCoefficient lim_coeff(lim_const);
    if (lim_const != 0.0) { he_nlf_integ->EnableLimiting(x0, dist, lim_coeff); }
 
+   // Adaptive limiting.
+   GridFunction zeta_0(&ind_fes);
+   ConstantCoefficient coef_zeta(adapt_lim_const);
+   AdaptivityEvaluator *adapt_evaluator = NULL;
+   if (adapt_lim_const > 0.0)
+   {
+      FunctionCoefficient alim_coeff(adapt_lim_fun);
+      zeta_0.ProjectCoefficient(alim_coeff);
+
+      if (adapt_eval == 0) { adapt_evaluator = new AdvectorCG; }
+      else if (adapt_eval == 1)
+      {
+#ifdef MFEM_USE_GSLIB
+         adapt_evaluator = new InterpolatorFP;
+#else
+         MFEM_ABORT("MFEM is not built with GSLIB support!");
+#endif
+      }
+      else { MFEM_ABORT("Bad interpolation option."); }
+
+      he_nlf_integ->EnableAdaptiveLimiting(zeta_0, coef_zeta, *adapt_evaluator);
+      if (visualization)
+      {
+         socketstream vis1;
+         common::VisualizeField(vis1, "localhost", 19916, zeta_0, "Zeta 0",
+                                300, 600, 300, 300);
+      }
+   }
+
    // 14. Setup the final NonlinearForm (which defines the integral of interest,
    //     its first and second derivatives). Here we can use a combination of
    //     metrics, i.e., optimize the sum of two integrals, where both are
@@ -748,11 +786,13 @@ int main(int argc, char *argv[])
    // 21. Compute the amount of energy decrease.
    const double fin_energy = a.GetGridFunctionEnergy(x);
    double metric_part = fin_energy;
-   if (lim_const != 0.0)
+   if (lim_const > 0.0 || adapt_lim_const > 0.0)
    {
       lim_coeff.constant = 0.0;
+      coef_zeta.constant = 0.0;
       metric_part = a.GetGridFunctionEnergy(x);
       lim_coeff.constant = lim_const;
+      coef_zeta.constant = adapt_lim_const;
    }
    cout << "Initial strain energy: " << init_energy
         << " = metrics: " << init_energy
@@ -770,6 +810,13 @@ int main(int argc, char *argv[])
       vis_tmop_metric_s(mesh_poly_deg, *metric, *target_c, *mesh, title, 600);
    }
 
+   if (adapt_lim_const > 0.0 && visualization)
+   {
+      socketstream vis0;
+      common::VisualizeField(vis0, "localhost", 19916, zeta_0, "Xi 0",
+                             600, 600, 300, 300);
+   }
+
    // 23. Visualize the mesh displacement.
    if (visualization)
    {
@@ -790,7 +837,9 @@ int main(int argc, char *argv[])
    delete target_c2;
    delete metric2;
    delete coeff1;
+   delete adapt_evaluator;
    delete target_c;
+   delete adapt_coeff;
    delete metric;
    delete fespace;
    delete fec;
@@ -798,13 +847,3 @@ int main(int argc, char *argv[])
 
    return 0;
 }
-
-// Defined with respect to the icf mesh.
-double weight_fun(const Vector &x)
-{
-   const double r = sqrt(x(0)*x(0) + x(1)*x(1) + 1e-12);
-   const double den = 0.002;
-   double l2 = 0.2 + 0.5*std::tanh((r-0.16)/den) - 0.5*std::tanh((r-0.17)/den)
-               + 0.5*std::tanh((r-0.23)/den) - 0.5*std::tanh((r-0.24)/den);
-   return l2;
-}
diff --git a/miniapps/meshing/mesh-optimizer.hpp b/miniapps/meshing/mesh-optimizer.hpp
new file mode 100644
index 00000000000..f710f0a2f6c
--- /dev/null
+++ b/miniapps/meshing/mesh-optimizer.hpp
@@ -0,0 +1,241 @@
+//           MFEM Mesh Optimizer Miniapp - Serial/Parallel Shared Code
+
+#include "mfem.hpp"
+#include <fstream>
+#include <iostream>
+
+using namespace mfem;
+using namespace std;
+
+// Target size at x: 0.001 where the indicator is 1, 0.01 where it is 0.
+double discrete_size_2d(const Vector &x)
+{
+   const int pattern = 2; // 1: sine-wave band, 2: ring around (0, 0.5)
+   const double size_in = 0.001, size_out = 0.01;
+   double w = 0.;
+
+   if (pattern == 1)
+   {
+      const double arg = 10*(x(1)-0.5) + std::sin(4.0*M_PI*x(0));
+      w = std::tanh(arg + 1) - std::tanh(arg - 1);
+   }
+   else if (pattern == 2)
+   {
+      const double dx = x(0) - 0.0, dy = x(1) - 0.5;
+      const double rad = sqrt(dx*dx + dy*dy);
+      const double r_in = 0.45, r_out = 0.55, steep = 30.0;
+      w = 0.5*(1+std::tanh(steep*(rad-r_in))) -
+          0.5*(1+std::tanh(steep*(rad-r_out)));
+   }
+   // Clamp the blending weight to [0,1] before interpolating the two sizes.
+   w = std::min(1., std::max(0., w));
+
+   return w * size_in + (1.0 - w) * size_out;
+}
+
+// Material indicator in [0,1]: a tanh step across a sine-shaped curve that is
+// drawn in a frame rotated by 22.5 degrees about the point (0.5, 0.5).
+double material_indicator_2d(const Vector &x)
+{
+   const double dx = x(0)-0.5, dy = x(1)-0.5;
+   const double angle = 22.5*M_PI/180.;
+   const double rot_x =  cos(angle)*dx + sin(angle)*dy;
+   const double rot_y = -sin(angle)*dx + cos(angle)*dy;
+   // Angles above 45 degrees are mirrored (a no-op for the 22.5 used here).
+   const double folded = (angle > 45.*M_PI/180) ? M_PI/2 - angle : angle;
+   const double stretch = 1/cos(folded);
+   const double u = rot_x/stretch, v = rot_y/stretch;
+   const double steep = 20, freq = 3, amp = 3;
+   const double ind = std::tanh((steep*(v) + amp*std::sin(freq*M_PI*u)) + 1);
+   if (ind > 1) { return 1; }
+   return (ind < 0) ? 0 : ind;
+}
+
+double discrete_ori_2d(const Vector &x) // pi*y*(1-y)*cos(2*pi*x), (x,y)=x(0),x(1)
+{
+   return M_PI * x(1) * (1.0 - x(1)) * cos(2 * M_PI * x(0));
+}
+
+// Target aspect ratio at x: 0.1 on a sine-shaped band (drawn in a frame
+// rotated by 22.5 degrees about (0.5, 0.5)), rising to 1.1 away from it.
+double discrete_aspr_2d(const Vector &x)
+{
+   const double dx = x(0)-0.5, dy = x(1)-0.5;
+   const double angle = 22.5*M_PI/180.;
+   const double u =  cos(angle)*dx + sin(angle)*dy;
+   const double v = -sin(angle)*dx + cos(angle)*dy;
+
+   const double steep = 20, freq = 3, amp = 2;
+   const double arg = steep*(v) + amp*std::sin(freq*M_PI*u);
+
+   // Band indicator: ~1 near the curve arg = 0, ~0 elsewhere; kept in [0,1].
+   double band = std::tanh(arg + 1) - std::tanh(arg - 1);
+   if (band > 1) { band = 1; }
+   if (band < 0) { band = 0; }
+
+   return 0.1 + 1*(1-band)*(1-band);
+}
+
+// Fills v with three aspect-ratio components built from the factors a = 1,
+// b = 1 + 5*x(1), c = 1 + 10*x(2): each factor over sqrt(product of the others).
+void discrete_aspr_3d(const Vector &x, Vector &v)
+{
+   v.SetSize(x.Size());
+   const double a = 1.;
+   const double b = 1. + 5*x(1);
+   const double c = 1. + 10*x(2);
+   v[0] = a/pow(b*c,0.5);
+   v[1] = b/pow(a*c,0.5);
+   v[2] = c/pow(b*a,0.5);
+}
+
+// Analytic target matrix K(x) for the 2D adaptivity examples. The metric id
+// picks the formula: 14 -> rotation scaled by 0.1, 87 -> rotation with
+// aspect-ratio scaling, any other id -> diagonal matrix varying on a ring.
+class HessianCoefficient : public MatrixCoefficient
+{
+private:
+   int metric;
+
+public:
+   HessianCoefficient(int dim, int metric_id)
+      : MatrixCoefficient(dim), metric(metric_id) { }
+
+   // Sets K(i,j) for i,j in {0,1} from the physical coordinates of ip.
+   virtual void Eval(DenseMatrix &K, ElementTransformation &T,
+                     const IntegrationPoint &ip)
+   {
+      Vector pos(3);
+      T.Transform(ip, pos);
+      const double px = pos(0), py = pos(1);
+      switch (metric)
+      {
+         case 14: // Size + Alignment
+         {
+            const double theta = M_PI * py * (1.0 - py) * cos(2 * M_PI * px);
+            const double alpha_bar = 0.1;
+
+            K(0, 0) =  cos(theta);
+            K(1, 0) =  sin(theta);
+            K(0, 1) = -sin(theta);
+            K(1, 1) =  cos(theta);
+
+            K *= alpha_bar;
+            break;
+         }
+         case 87: // Shape + Alignment
+         {
+            // Band indicator in a frame rotated by 22.5 degrees.
+            const double dx = px-0.5, dy = py-0.5;
+            const double th = 22.5*M_PI/180.;
+            const double u =  cos(th)*dx + sin(th)*dy;
+            const double v = -sin(th)*dx + cos(th)*dy;
+            const double tfac = 20, s1 = 3, s2 = 2;
+            const double arg = tfac*(v) + s2*std::sin(s1*M_PI*u);
+            double wgt = std::tanh(arg + 1) - std::tanh(arg - 1);
+            if (wgt > 1) { wgt = 1; }
+            if (wgt < 0) { wgt = 0; }
+
+            const double theta = M_PI * (py) * (1.0 - py) * cos(2 * M_PI * px);
+            const double asp_ratio_tar = 0.1 + 1*(1-wgt)*(1-wgt);
+            const double root = pow(asp_ratio_tar,0.5);
+
+            // First column is divided by root, second column is multiplied.
+            K(0, 0) =  cos(theta) * (1/root);
+            K(1, 0) =  sin(theta) * (1/root);
+            K(0, 1) = -sin(theta) * root;
+            K(1, 1) =  cos(theta) * root;
+            break;
+         }
+         default:
+         {
+            const double xc = px - 0.5, yc = py - 0.5;
+            const double r = sqrt(xc*xc + yc*yc);
+            const double r1 = 0.15, r2 = 0.35, sf = 30.0;
+            const double eps = 0.5;
+            const double tan1 = std::tanh(sf*(r-r1)),
+                         tan2 = std::tanh(sf*(r-r2));
+
+            K(0, 0) = eps + 1.0 * (tan1 - tan2);
+            K(0, 1) = 0.0;
+            K(1, 0) = 0.0;
+            K(1, 1) = 1.0;
+            break;
+         }
+      }
+   }
+};
+
+// Gauss-Lobatto / closed-uniform rule sets, usable with the --quad-type option.
+IntegrationRules IntRulesLo(0, Quadrature1D::GaussLobatto);
+IntegrationRules IntRulesCU(0, Quadrature1D::ClosedUniform);
+
+// Radial weight defined with respect to the icf mesh: 0.2 plus two thin
+// tanh-smoothed bumps of height 1 on 0.16 < r < 0.17 and 0.23 < r < 0.24.
+double weight_fun(const Vector &x)
+{
+   const double r = sqrt(x(0)*x(0) + x(1)*x(1) + 1e-12);
+   const double w = 0.002; // transition width of every tanh step
+   return 0.2 + 0.5*std::tanh((r-0.16)/w) - 0.5*std::tanh((r-0.17)/w)
+          + 0.5*std::tanh((r-0.23)/w) - 0.5*std::tanh((r-0.24)/w);
+}
+
+// Used for the adaptive limiting examples: an indicator in [0,1] that is
+// close to 1 on the ring 0.45 < r < 0.55 around the point (0.1, 0.2).
+double adapt_lim_fun(const Vector &x)
+{
+   const double dx = x(0) - 0.1, dy = x(1) - 0.2;
+   const double rad = sqrt(dx*dx + dy*dy);
+   const double r_in = 0.45, r_out = 0.55, steep = 30.0;
+   const double ring = 0.5*(1+std::tanh(steep*(rad-r_in))) -
+                       0.5*(1+std::tanh(steep*(rad-r_out)));
+
+   return std::min(1., std::max(0., ring));
+}
+
+// Smooths field in place: applies a DSmoother (constructed with smooth_steps)
+// on the assembled diffusion (Laplacian) matrix, with a zero right-hand side.
+void DiffuseField(GridFunction &field, int smooth_steps)
+{
+   // Assemble the Laplacian on the finite element space of the field.
+   BilinearForm laplacian(field.FESpace());
+   laplacian.AddDomainIntegrator(new DiffusionIntegrator());
+   laplacian.Assemble();
+   laplacian.Finalize();
+
+   // Smoother on the Laplacian matrix. NOTE(review): iterative_mode presumably
+   // makes Mult() start from the current contents of field - confirm in docs.
+   DSmoother smoother(0, 1.0, smooth_steps);
+   smoother.iterative_mode = true;
+   smoother.SetOperator(laplacian.SpMat());
+
+   Vector zero_rhs(field.Size());
+   zero_rhs = 0.0;
+   smoother.Mult(zero_rhs, field);
+}
+
+#ifdef MFEM_USE_MPI
+// Parallel version: smooths the true dofs of field in place.
+void DiffuseField(ParGridFunction &field, int smooth_steps)
+{
+   // Setup the Laplacian operator
+   ParBilinearForm *Lap = new ParBilinearForm(field.ParFESpace());
+   Lap->AddDomainIntegrator(new DiffusionIntegrator());
+   Lap->Assemble();
+   Lap->Finalize();
+   HypreParMatrix *A = Lap->ParallelAssemble(); // caller-owned, freed below
+   HypreSmoother *S = new HypreSmoother(*A,0,smooth_steps);
+   S->iterative_mode = true;
+
+   Vector tmp(A->Width());
+   field.SetTrueVector();
+   Vector fieldtrue = field.GetTrueVector();
+   tmp = 0.0;
+   S->Mult(tmp, fieldtrue);
+   field.SetFromTrueDofs(fieldtrue);
+
+   delete S;
+   delete A;
+   delete Lap;
+}
+#endif
diff --git a/miniapps/meshing/pmesh-optimizer.cpp b/miniapps/meshing/pmesh-optimizer.cpp
index cf5ebc50972..c71f8bc8e44 100644
--- a/miniapps/meshing/pmesh-optimizer.cpp
+++ b/miniapps/meshing/pmesh-optimizer.cpp
@@ -35,16 +35,31 @@
 //   Adapted analytic Hessian:
 //     mpirun -np 4 pmesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 2 -tid 4 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8
 //   Adapted analytic Hessian with size+orientation:
-//     mpirun -np 4 pmesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 14 -tid 4 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd 1
+//     mpirun -np 4 pmesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 14 -tid 4 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd
 //   Adapted analytic Hessian with Shape+size+orientation
-//     mpirun -np 4 pmesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 87 -tid 4 -ni 100 -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd 1
+//     mpirun -np 4 pmesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 87 -tid 4 -ni 100 -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd
 //   Adapted discrete size:
 //     mpirun -np 4 pmesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 7 -tid 5 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8
 //     mpirun -np 4 pmesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 2 -tid 5 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8 -cmb 2 -nor
+//
+//   Adapted size+aspect ratio to discrete material indicator
+//     mpirun -np 4 pmesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 7 -tid 6 -ni 100  -ls 2 -li 100 -bnd -qt 1 -qo 8
+//   Adapted discrete size+orientation (requires GSLIB)
+//   * mpirun -np 4 pmesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 14 -tid 8 -ni 100  -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd -ae 1
+//   Adapted discrete aspect-ratio+orientation (requires GSLIB)
+//   * mpirun -np 4 pmesh-optimizer -m square01.mesh -o 2 -rs 2 -mid 87 -tid 8 -ni 10  -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd -ae 1
+//   Adapted discrete aspect ratio (3D)
+//     mpirun -np 4 pmesh-optimizer -m cube.mesh -o 2 -rs 2 -mid 302 -tid 7 -ni 20  -ls 2 -li 100 -bnd -qt 1 -qo 8
+//
+//   Adaptive limiting:
+//     mpirun -np 4 pmesh-optimizer -m stretched2D.mesh -o 2 -mid 2 -tid 1 -ni 50 -qo 5 -nor -vl 1 -alc 0.5 -ae 0
+//   Adaptive limiting through FD (requires GSLIB):
+//   * mpirun -np 4 pmesh-optimizer -m stretched2D.mesh -o 2 -mid 2 -tid 1 -ni 50 -qo 5 -nor -vl 1 -alc 0.5 -fd -ae 1
+//
 //   Blade shape:
 //     mpirun -np 4 pmesh-optimizer -m blade.mesh -o 4 -rs 0 -mid 2 -tid 1 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8
 //   Blade shape with FD-based solver:
-//     mpirun -np 4 pmesh-optimizer -m blade.mesh -o 4 -rs 0 -mid 2 -tid 1 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd 1
+//     mpirun -np 4 pmesh-optimizer -m blade.mesh -o 4 -rs 0 -mid 2 -tid 1 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8 -fd
 //   Blade limited shape:
 //     mpirun -np 4 pmesh-optimizer -m blade.mesh -o 4 -rs 0 -mid 2 -tid 1 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8 -lc 5000
 //   ICF shape and equal size:
@@ -63,191 +78,14 @@
 //     mpirun -np 4 pmesh-optimizer -m ./amr-quad-q2.mesh -o 2 -rs 1 -mid 9 -tid 2 -ni 200 -ls 2 -li 100 -bnd -qt 1 -qo 8
 
 #include "mfem.hpp"
+#include "../common/mfem-common.hpp"
 #include <iostream>
 #include <fstream>
+#include "mesh-optimizer.hpp"
 
 using namespace mfem;
 using namespace std;
 
-double weight_fun(const Vector &x);
-
-double ind_values(const Vector &x)
-{
-   const int opt = 6;
-   const double small = 0.001, big = 0.01;
-   double val = 0.;
-
-   // Sine wave.
-   if (opt == 1)
-   {
-      const double X = x(0), Y = x(1);
-      val = std::tanh((10*(Y-0.5) + std::sin(4.0*M_PI*X)) + 1) -
-            std::tanh((10*(Y-0.5) + std::sin(4.0*M_PI*X)) - 1);
-   }
-   else if (opt == 2)
-   {
-      // Circle in the middle.
-      const double xc = x(0) - 0.5, yc = x(1) - 0.5;
-      const double r = sqrt(xc*xc + yc*yc);
-      double r1 = 0.15; double r2 = 0.35; double sf=30.0;
-      val = 0.5*(std::tanh(sf*(r-r1)) - std::tanh(sf*(r-r2)));
-   }
-   else if (opt == 3)
-   {
-      // cross
-      const double X = x(0), Y = x(1);
-      const double r1 = 0.45, r2 = 0.55;
-      const double sf = 40.0;
-
-      val = 0.5 * (std::tanh(sf*(X-r1)) - std::tanh(sf*(X-r2)) +
-                   std::tanh(sf*(Y-r1)) - std::tanh(sf*(Y-r2)));
-   }
-   else if (opt == 4)
-   {
-      // Multiple circles
-      double r1,r2,val,rval;
-      double sf = 10;
-      val = 0.;
-      // circle 1
-      r1= 0.25; r2 = 0.25; rval = 0.1;
-      double xc = x(0) - r1, yc = x(1) - r2;
-      double r = sqrt(xc*xc+yc*yc);
-      val = 0.5*(1+std::tanh(sf*(r+rval))) -
-            0.5*(1+std::tanh(sf*(r-rval))); // std::exp(val1);
-      // circle 2
-      r1= 0.75; r2 = 0.75;
-      xc = x(0) - r1, yc = x(1) - r2;
-      r = sqrt(xc*xc+yc*yc);
-      val += (0.5*(1+std::tanh(sf*(r+rval))) -
-              0.5*(1+std::tanh(sf*(r-rval)))); // std::exp(val1);
-      // circle 3
-      r1= 0.75; r2 = 0.25;
-      xc = x(0) - r1, yc = x(1) - r2;
-      r = sqrt(xc*xc+yc*yc);
-      val += 0.5*(1+std::tanh(sf*(r+rval))) -
-             0.5*(1+std::tanh(sf*(r-rval))); // std::exp(val1);
-      // circle 4
-      r1= 0.25; r2 = 0.75;
-      xc = x(0) - r1, yc = x(1) - r2;
-      r = sqrt(xc*xc+yc*yc);
-      val += 0.5*(1+std::tanh(sf*(r+rval))) -
-             0.5*(1+std::tanh(sf*(r-rval)));
-   }
-   else if (opt == 5)
-   {
-      // cross
-      double X = x(0)-0.5, Y = x(1)-0.5;
-      double rval = std::sqrt(X*X + Y*Y);
-      double thval = 60.*M_PI/180.;
-      double Xmod,Ymod;
-      Xmod = X*std::cos(thval) + Y*std::sin(thval);
-      Ymod= -X*std::sin(thval) + Y*std::cos(thval);
-      X = Xmod+0.5; Y = Ymod+0.5;
-      double r1 = 0.45; double r2 = 0.55; double sf=30.0;
-      val = (0.5*(1+std::tanh(sf*(X-r1))) - 0.5*(1+std::tanh(sf*(X-r2))) +
-             0.5*(1+std::tanh(sf*(Y-r1))) - 0.5*(1+std::tanh(sf*(Y-r2))));
-      if (rval > 0.4) { val = 0.; }
-   }
-   else if (opt == 6)
-   {
-      const double xc = x(0) - 0.0, yc = x(1) - 0.5;
-      const double r = sqrt(xc*xc + yc*yc);
-      double r1 = 0.45; double r2 = 0.55; double sf=30.0;
-      val = 0.5*(1+std::tanh(sf*(r-r1))) - 0.5*(1+std::tanh(sf*(r-r2)));
-   }
-
-   val = std::max(0.,val);
-   val = std::min(1.,val);
-
-   return val * small + (1.0 - val) * big;
-}
-
-class HessianCoefficient : public MatrixCoefficient
-{
-private:
-   int metric;
-
-public:
-   HessianCoefficient(int dim, int metric_id)
-      : MatrixCoefficient(dim), metric(metric_id) { }
-
-   virtual void Eval(DenseMatrix &K, ElementTransformation &T,
-                     const IntegrationPoint &ip)
-   {
-      Vector pos(3);
-      T.Transform(ip, pos);
-      if (metric != 14 && metric != 87)
-      {
-         const double xc = pos(0) - 0.5, yc = pos(1) - 0.5;
-         const double r = sqrt(xc*xc + yc*yc);
-         double r1 = 0.15; double r2 = 0.35; double sf=30.0;
-         const double eps = 0.5;
-
-         const double tan1 = std::tanh(sf*(r-r1)),
-                      tan2 = std::tanh(sf*(r-r2));
-
-         K(0, 0) = eps + 1.0 * (tan1 - tan2);
-         K(0, 1) = 0.0;
-         K(1, 0) = 0.0;
-         K(1, 1) = 1.0;
-      }
-      else if (metric == 14) // Size + Alignment
-      {
-         const double xc = pos(0), yc = pos(1);
-         double theta = M_PI * yc * (1.0 - yc) * cos(2 * M_PI * xc);
-         double alpha_bar = 0.1;
-
-         K(0, 0) =  cos(theta);
-         K(1, 0) =  sin(theta);
-         K(0, 1) = -sin(theta);
-         K(1, 1) =  cos(theta);
-
-         K *= alpha_bar;
-      }
-      else if (metric == 87) // Shape + Size + Alignment
-      {
-         Vector x = pos;
-         double xc = x(0)-0.5, yc = x(1)-0.5;
-         double th = 22.5*M_PI/180.;
-         double xn =  cos(th)*xc + sin(th)*yc;
-         double yn = -sin(th)*xc + cos(th)*yc;
-         double th2 = (th > 45.*M_PI/180) ? M_PI/2 - th : th;
-         double stretch = 1/cos(th2);
-         xc = xn/stretch; yc = yn/stretch;
-         xc = xn; yc=yn;
-
-         double tfac = 20;
-         double s1 = 3;
-         double s2 = 2;
-         double wgt = std::tanh((tfac*(yc) + s2*std::sin(s1*M_PI*xc)) + 1)
-                      - std::tanh((tfac*(yc) + s2*std::sin(s1*M_PI*xc)) - 1);
-         if (wgt > 1) { wgt = 1; }
-         if (wgt < 0) { wgt = 0; }
-         double  val = wgt;
-
-         xc = pos(0), yc = pos(1);
-         double theta = M_PI * (yc) * (1.0 - yc) * cos(2 * M_PI * xc);
-
-         K(0, 0) =  cos(theta);
-         K(1, 0) =  sin(theta);
-         K(0, 1) = -sin(theta);
-         K(1, 1) =  cos(theta);
-
-         double asp_ratio_tar = 0.1 + 1*(1-val)*(1-val);
-
-         K(0, 0) *=  1/pow(asp_ratio_tar,0.5);
-         K(1, 0) *=  1/pow(asp_ratio_tar,0.5);
-         K(0, 1) *=  pow(asp_ratio_tar,0.5);
-         K(1, 1) *=  pow(asp_ratio_tar,0.5);
-      }
-   }
-};
-
-
-// Additional IntegrationRules that can be used with the --quad-type option.
-IntegrationRules IntRulesLo(0, Quadrature1D::GaussLobatto);
-IntegrationRules IntRulesCU(0, Quadrature1D::ClosedUniform);
-
 int main (int argc, char *argv[])
 {
    // 0. Initialize MPI.
@@ -265,6 +103,7 @@ int main (int argc, char *argv[])
    int metric_id         = 1;
    int target_id         = 1;
    double lim_const      = 0.0;
+   double adapt_lim_const = 0.0;
    int quad_type         = 1;
    int quad_order        = 8;
    int newton_iter       = 10;
@@ -276,7 +115,8 @@ int main (int argc, char *argv[])
    bool normalization    = false;
    bool visualization    = true;
    int verbosity_level   = 0;
-   int fdscheme          = 0;
+   bool fdscheme         = false;
+   int adapt_eval        = 0;
 
    // 2. Parse command-line options.
    OptionsParser args(argc, argv);
@@ -319,6 +159,8 @@ int main (int argc, char *argv[])
                   "4: Given full analytic Jacobian (in physical space)\n\t"
                   "5: Ideal shape, given size (in physical space)");
    args.AddOption(&lim_const, "-lc", "--limit-const", "Limiting constant.");
+   args.AddOption(&adapt_lim_const, "-alc", "--adapt-limit-const",
+                  "Adaptive limiting coefficient constant.");
    args.AddOption(&quad_type, "-qt", "--quad-type",
                   "Quadrature rule type:\n\t"
                   "1: Gauss-Lobatto\n\t"
@@ -346,12 +188,15 @@ int main (int argc, char *argv[])
                   "--no-normalization",
                   "Make all terms in the optimization functional unitless.");
    args.AddOption(&fdscheme, "-fd", "--fd_approximation",
+                  "-no-fd", "--no-fd-approx",
                   "Enable finite difference based derivative computations.");
    args.AddOption(&visualization, "-vis", "--visualization", "-no-vis",
                   "--no-visualization",
                   "Enable or disable GLVis visualization.");
    args.AddOption(&verbosity_level, "-vl", "--verbosity-level",
                   "Set the verbosity level - 0, 1, or 2.");
+   args.AddOption(&adapt_eval, "-ae", "--adaptivity-evaluator",
+                  "0 - Advection based (DEFAULT), 1 - GSLIB.");
    args.Parse();
    if (!args.Good())
    {
@@ -374,6 +219,7 @@ int main (int argc, char *argv[])
       else { cout << "(NONE)"; }
       cout << endl;
    }
+
    ParMesh *pmesh = new ParMesh(MPI_COMM_WORLD, *mesh);
 
    delete mesh;
@@ -469,10 +315,10 @@ int main (int argc, char *argv[])
    //     num_mpi_tasks".
    {
       ostringstream mesh_name;
-      mesh_name << "perturbed." << setfill('0') << setw(6) << myid;
+      mesh_name << "perturbed.mesh";
       ofstream mesh_ofs(mesh_name.str().c_str());
       mesh_ofs.precision(8);
-      pmesh->Print(mesh_ofs);
+      pmesh->PrintAsOne(mesh_ofs);
    }
 
    // 11. Store the starting (prior to the optimization) positions.
@@ -514,7 +360,10 @@ int main (int argc, char *argv[])
    HessianCoefficient *adapt_coeff = NULL;
    H1_FECollection ind_fec(mesh_poly_deg, dim);
    ParFiniteElementSpace ind_fes(pmesh, &ind_fec);
-   ParGridFunction size;
+   ParFiniteElementSpace ind_fesv(pmesh, &ind_fec, dim);
+   ParGridFunction size(&ind_fes), aspr(&ind_fes), disc(&ind_fes), ori(&ind_fes);
+   ParGridFunction aspr3d(&ind_fesv), size3d(&ind_fesv);
+
    switch (target_id)
    {
       case 1: target_t = TargetConstructor::IDEAL_SHAPE_UNIT_SIZE; break;
@@ -533,15 +382,183 @@ int main (int argc, char *argv[])
       {
          target_t = TargetConstructor::IDEAL_SHAPE_GIVEN_SIZE;
          DiscreteAdaptTC *tc = new DiscreteAdaptTC(target_t);
-         size.SetSpace(&ind_fes);
-         FunctionCoefficient ind_coeff(ind_values);
+         if (adapt_eval == 0)
+         {
+            tc->SetAdaptivityEvaluator(new AdvectorCG);
+         }
+         else
+         {
+#ifdef MFEM_USE_GSLIB
+            tc->SetAdaptivityEvaluator(new InterpolatorFP);
+#else
+            MFEM_ABORT("MFEM is not built with GSLIB.");
+#endif
+         }
+         FunctionCoefficient ind_coeff(discrete_size_2d);
          size.ProjectCoefficient(ind_coeff);
+         tc->SetParDiscreteTargetSize(size);
+         target_c = tc;
+         break;
+      }
+      case 6: //material indicator 2D
+      {
+         ParGridFunction d_x(&ind_fes), d_y(&ind_fes);
+
+         target_t = TargetConstructor::GIVEN_SHAPE_AND_SIZE;
+         DiscreteAdaptTC *tc = new DiscreteAdaptTC(target_t);
+         FunctionCoefficient ind_coeff(material_indicator_2d);
+         disc.ProjectCoefficient(ind_coeff);
+         if (adapt_eval == 0)
+         {
+            tc->SetAdaptivityEvaluator(new AdvectorCG);
+         }
+         else
+         {
+#ifdef MFEM_USE_GSLIB
+            tc->SetAdaptivityEvaluator(new InterpolatorFP);
+#else
+            MFEM_ABORT("MFEM is not built with GSLIB.");
+#endif
+         }
+         // Diffuse the interface
+         DiffuseField(disc,2);
+
+         // Get the partial derivatives of the grid function in x and y
+         disc.GetDerivative(1,0,d_x);
+         disc.GetDerivative(1,1,d_y);
+
+         // Compute the squared magnitude of the gradient
+         for (int i = 0; i < size.Size(); i++)
+         {
+            size(i) = std::pow(d_x(i),2)+std::pow(d_y(i),2);
+         }
+         const double max = size.Max();
+         double max_all;
+         MPI_Allreduce(&max, &max_all, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+
+         for (int i = 0; i < d_x.Size(); i++)
+         {
+            d_x(i) = std::abs(d_x(i));
+            d_y(i) = std::abs(d_y(i));
+         }
+         const double eps = 0.01;
+         const double aspr_ratio = 20.0;
+         const double size_ratio = 40.0;
+
+         for (int i = 0; i < size.Size(); i++)
+         {
+            size(i) = (size(i)/max_all);
+            aspr(i) = (d_x(i)+eps)/(d_y(i)+eps);
+            aspr(i) = 0.1 + 0.9*(1-size(i))*(1-size(i));
+            if (aspr(i) > aspr_ratio) {aspr(i) = aspr_ratio;}
+            if (aspr(i) < 1.0/aspr_ratio) {aspr(i) = 1.0/aspr_ratio;}
+         }
+         Vector vals;
+         const int NE = pmesh->GetNE();
+         double volume = 0.0, volume_ind = 0.0;
+
+         for (int i = 0; i < NE; i++)
+         {
+            ElementTransformation *Tr = pmesh->GetElementTransformation(i);
+            const IntegrationRule &ir =
+               IntRules.Get(pmesh->GetElementBaseGeometry(i), Tr->OrderJ());
+            size.GetValues(i, ir, vals);
+            for (int j = 0; j < ir.GetNPoints(); j++)
+            {
+               const IntegrationPoint &ip = ir.IntPoint(j);
+               Tr->SetIntPoint(&ip);
+               volume     += ip.weight * Tr->Weight();
+               volume_ind += vals(j) * ip.weight * Tr->Weight();
+            }
+         }
+         double volume_all, volume_ind_all;
+         int NE_ALL;
+         MPI_Allreduce(&volume, &volume_all, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+         MPI_Allreduce(&volume_ind, &volume_ind_all, 1, MPI_DOUBLE, MPI_SUM,
+                       MPI_COMM_WORLD);
+         MPI_Allreduce(&NE, &NE_ALL, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+
+         const double avg_zone_size = volume_all / NE_ALL;
+
+         const double small_avg_ratio =
+            (volume_ind_all + (volume_all - volume_ind_all) / size_ratio)
+            / volume_all;
+
+         const double small_zone_size = small_avg_ratio * avg_zone_size;
+         const double big_zone_size   = size_ratio * small_zone_size;
+
+         for (int i = 0; i < size.Size(); i++)
+         {
+            const double val = size(i);
+            const double a = (big_zone_size - small_zone_size) / small_zone_size;
+            size(i) = big_zone_size / (1.0+a*val);
+         }
+
+         DiffuseField(size, 2);
+         DiffuseField(aspr, 2);
+
+         tc->SetParDiscreteTargetSize(size);
+         tc->SetParDiscreteTargetAspectRatio(aspr);
+         target_c = tc;
+         break;
+      }
+      case 7: // aspect-ratio 3D
+      {
+         target_t = TargetConstructor::GIVEN_SHAPE_AND_SIZE;
+         DiscreteAdaptTC *tc = new DiscreteAdaptTC(target_t);
+         if (adapt_eval == 0)
+         {
+            tc->SetAdaptivityEvaluator(new AdvectorCG);
+         }
+         else
+         {
+#ifdef MFEM_USE_GSLIB
+            tc->SetAdaptivityEvaluator(new InterpolatorFP);
+#else
+            MFEM_ABORT("MFEM is not built with GSLIB.");
+#endif
+         }
+         VectorFunctionCoefficient fd_aspr3d(dim, discrete_aspr_3d);
+         aspr3d.ProjectCoefficient(fd_aspr3d);
+         tc->SetParDiscreteTargetAspectRatio(aspr3d);
+         target_c = tc;
+         break;
+      }
+      case 8: // shape/size + orientation 2D
+      {
+         target_t = TargetConstructor::GIVEN_SHAPE_AND_SIZE;
+         DiscreteAdaptTC *tc = new DiscreteAdaptTC(target_t);
+         if (adapt_eval == 0)
+         {
+            tc->SetAdaptivityEvaluator(new AdvectorCG);
+         }
+         else
+         {
 #ifdef MFEM_USE_GSLIB
-         tc->SetAdaptivityEvaluator(new InterpolatorFP);
+            tc->SetAdaptivityEvaluator(new InterpolatorFP);
 #else
-         tc->SetAdaptivityEvaluator(new AdvectorCG);
+            MFEM_ABORT("MFEM is not built with GSLIB.");
 #endif
-         tc->SetParDiscreteTargetSpec(size);
+         }
+
+         if (metric_id == 14)
+         {
+            ConstantCoefficient ind_coeff(0.1*0.1);
+            size.ProjectCoefficient(ind_coeff);
+            tc->SetParDiscreteTargetSize(size);
+         }
+
+         if (metric_id == 87)
+         {
+            FunctionCoefficient aspr_coeff(discrete_aspr_2d);
+            aspr.ProjectCoefficient(aspr_coeff);
+            DiffuseField(aspr,2);
+            tc->SetParDiscreteTargetAspectRatio(aspr);
+         }
+
+         FunctionCoefficient ori_coeff(discrete_ori_2d);
+         ori.ProjectCoefficient(ori_coeff);
+         tc->SetParDiscreteTargetOrientation(ori);
          target_c = tc;
          break;
       }
@@ -585,6 +602,35 @@ int main (int argc, char *argv[])
    ConstantCoefficient lim_coeff(lim_const);
    if (lim_const != 0.0) { he_nlf_integ->EnableLimiting(x0, dist, lim_coeff); }
 
+   // Adaptive limiting.
+   ParGridFunction zeta_0(&ind_fes);
+   ConstantCoefficient coef_zeta(adapt_lim_const);
+   AdaptivityEvaluator *adapt_evaluator = NULL;
+   if (adapt_lim_const > 0.0)
+   {
+      FunctionCoefficient alim_coeff(adapt_lim_fun);
+      zeta_0.ProjectCoefficient(alim_coeff);
+
+      if (adapt_eval == 0) { adapt_evaluator = new AdvectorCG; }
+      else if (adapt_eval == 1)
+      {
+#ifdef MFEM_USE_GSLIB
+         adapt_evaluator = new InterpolatorFP;
+#else
+         MFEM_ABORT("MFEM is not built with GSLIB support!");
+#endif
+      }
+      else { MFEM_ABORT("Bad interpolation option."); }
+
+      he_nlf_integ->EnableAdaptiveLimiting(zeta_0, coef_zeta, *adapt_evaluator);
+      if (visualization)
+      {
+         socketstream vis1;
+         common::VisualizeField(vis1, "localhost", 19916, zeta_0, "Zeta 0",
+                                300, 600, 300, 300);
+      }
+   }
+
    // 15. Setup the final NonlinearForm (which defines the integral of interest,
    //     its first and second derivatives). Here we can use a combination of
    //     metrics, i.e., optimize the sum of two integrals, where both are
@@ -783,20 +829,22 @@ int main (int argc, char *argv[])
    //     using GLVis: "glvis -m optimized -np num_mpi_tasks".
    {
       ostringstream mesh_name;
-      mesh_name << "optimized." << setfill('0') << setw(6) << myid;
+      mesh_name << "optimized.mesh";
       ofstream mesh_ofs(mesh_name.str().c_str());
       mesh_ofs.precision(8);
-      pmesh->Print(mesh_ofs);
+      pmesh->PrintAsOne(mesh_ofs);
    }
 
    // 22. Compute the amount of energy decrease.
    const double fin_energy = a.GetParGridFunctionEnergy(x);
    double metric_part = fin_energy;
-   if (lim_const != 0.0)
+   if (lim_const > 0.0 || adapt_lim_const > 0.0)
    {
       lim_coeff.constant = 0.0;
+      coef_zeta.constant = 0.0;
       metric_part = a.GetParGridFunctionEnergy(x);
       lim_coeff.constant = lim_const;
+      coef_zeta.constant = adapt_lim_const;
    }
    if (myid == 0)
    {
@@ -817,6 +865,13 @@ int main (int argc, char *argv[])
       vis_tmop_metric_p(mesh_poly_deg, *metric, *target_c, *pmesh, title, 600);
    }
 
+   if (adapt_lim_const > 0.0 && visualization)
+   {
+      socketstream vis0;
+      common::VisualizeField(vis0, "localhost", 19916, zeta_0, "Xi 0",
+                             600, 600, 300, 300);
+   }
+
    // 23. Visualize the mesh displacement.
    if (visualization)
    {
@@ -843,6 +898,7 @@ int main (int argc, char *argv[])
    delete target_c2;
    delete metric2;
    delete coeff1;
+   delete adapt_evaluator;
    delete target_c;
    delete adapt_coeff;
    delete metric;
@@ -853,13 +909,3 @@ int main (int argc, char *argv[])
    MPI_Finalize();
    return 0;
 }
-
-// Defined with respect to the icf mesh.
-double weight_fun(const Vector &x)
-{
-   const double r = sqrt(x(0)*x(0) + x(1)*x(1) + 1e-12);
-   const double den = 0.002;
-   double l2 = 0.2 + 0.5 * (std::tanh((r-0.16)/den) - std::tanh((r-0.17)/den)
-                            + std::tanh((r-0.23)/den) - std::tanh((r-0.24)/den));
-   return l2;
-}
diff --git a/miniapps/meshing/stretched2D.mesh b/miniapps/meshing/stretched2D.mesh
new file mode 100644
index 00000000000..6fab492c72e
--- /dev/null
+++ b/miniapps/meshing/stretched2D.mesh
@@ -0,0 +1,930 @@
+MFEM mesh v1.0
+
+#
+# MFEM Geometry Types (see mesh/geom.hpp):
+#
+# POINT       = 0
+# SEGMENT     = 1
+# TRIANGLE    = 2
+# SQUARE      = 3
+# TETRAHEDRON = 4
+# CUBE        = 5
+# PRISM       = 6
+#
+
+dimension
+2
+
+elements
+256
+1 3 0 81 225 84
+1 3 81 25 82 225
+1 3 225 82 65 83
+1 3 84 225 83 28
+1 3 25 85 226 82
+1 3 85 1 86 226
+1 3 226 86 26 87
+1 3 82 226 87 65
+1 3 65 87 227 90
+1 3 87 26 88 227
+1 3 227 88 6 89
+1 3 90 227 89 27
+1 3 28 83 228 92
+1 3 83 65 90 228
+1 3 228 90 27 91
+1 3 92 228 91 5
+1 3 1 93 229 86
+1 3 93 29 94 229
+1 3 229 94 66 95
+1 3 86 229 95 26
+1 3 29 96 230 94
+1 3 96 2 97 230
+1 3 230 97 30 98
+1 3 94 230 98 66
+1 3 66 98 231 101
+1 3 98 30 99 231
+1 3 231 99 7 100
+1 3 101 231 100 31
+1 3 26 95 232 88
+1 3 95 66 101 232
+1 3 232 101 31 102
+1 3 88 232 102 6
+1 3 2 103 233 97
+1 3 103 32 104 233
+1 3 233 104 67 105
+1 3 97 233 105 30
+1 3 32 106 234 104
+1 3 106 3 107 234
+1 3 234 107 33 108
+1 3 104 234 108 67
+1 3 67 108 235 111
+1 3 108 33 109 235
+1 3 235 109 8 110
+1 3 111 235 110 34
+1 3 30 105 236 99
+1 3 105 67 111 236
+1 3 236 111 34 112
+1 3 99 236 112 7
+1 3 3 113 237 107
+1 3 113 35 114 237
+1 3 237 114 68 115
+1 3 107 237 115 33
+1 3 35 116 238 114
+1 3 116 4 117 238
+1 3 238 117 36 118
+1 3 114 238 118 68
+1 3 68 118 239 121
+1 3 118 36 119 239
+1 3 239 119 9 120
+1 3 121 239 120 37
+1 3 33 115 240 109
+1 3 115 68 121 240
+1 3 240 121 37 122
+1 3 109 240 122 8
+1 3 5 91 241 125
+1 3 91 27 123 241
+1 3 241 123 69 124
+1 3 125 241 124 40
+1 3 27 89 242 123
+1 3 89 6 126 242
+1 3 242 126 38 127
+1 3 123 242 127 69
+1 3 69 127 243 130
+1 3 127 38 128 243
+1 3 243 128 11 129
+1 3 130 243 129 39
+1 3 40 124 244 132
+1 3 124 69 130 244
+1 3 244 130 39 131
+1 3 132 244 131 10
+1 3 6 102 245 126
+1 3 102 31 133 245
+1 3 245 133 70 134
+1 3 126 245 134 38
+1 3 31 100 246 133
+1 3 100 7 135 246
+1 3 246 135 41 136
+1 3 133 246 136 70
+1 3 70 136 247 139
+1 3 136 41 137 247
+1 3 247 137 12 138
+1 3 139 247 138 42
+1 3 38 134 248 128
+1 3 134 70 139 248
+1 3 248 139 42 140
+1 3 128 248 140 11
+1 3 7 112 249 135
+1 3 112 34 141 249
+1 3 249 141 71 142
+1 3 135 249 142 41
+1 3 34 110 250 141
+1 3 110 8 143 250
+1 3 250 143 43 144
+1 3 141 250 144 71
+1 3 71 144 251 147
+1 3 144 43 145 251
+1 3 251 145 13 146
+1 3 147 251 146 44
+1 3 41 142 252 137
+1 3 142 71 147 252
+1 3 252 147 44 148
+1 3 137 252 148 12
+1 3 8 122 253 143
+1 3 122 37 149 253
+1 3 253 149 72 150
+1 3 143 253 150 43
+1 3 37 120 254 149
+1 3 120 9 151 254
+1 3 254 151 45 152
+1 3 149 254 152 72
+1 3 72 152 255 155
+1 3 152 45 153 255
+1 3 255 153 14 154
+1 3 155 255 154 46
+1 3 43 150 256 145
+1 3 150 72 155 256
+1 3 256 155 46 156
+1 3 145 256 156 13
+1 3 10 131 257 159
+1 3 131 39 157 257
+1 3 257 157 73 158
+1 3 159 257 158 49
+1 3 39 129 258 157
+1 3 129 11 160 258
+1 3 258 160 47 161
+1 3 157 258 161 73
+1 3 73 161 259 164
+1 3 161 47 162 259
+1 3 259 162 16 163
+1 3 164 259 163 48
+1 3 49 158 260 166
+1 3 158 73 164 260
+1 3 260 164 48 165
+1 3 166 260 165 15
+1 3 11 140 261 160
+1 3 140 42 167 261
+1 3 261 167 74 168
+1 3 160 261 168 47
+1 3 42 138 262 167
+1 3 138 12 169 262
+1 3 262 169 50 170
+1 3 167 262 170 74
+1 3 74 170 263 173
+1 3 170 50 171 263
+1 3 263 171 17 172
+1 3 173 263 172 51
+1 3 47 168 264 162
+1 3 168 74 173 264
+1 3 264 173 51 174
+1 3 162 264 174 16
+1 3 12 148 265 169
+1 3 148 44 175 265
+1 3 265 175 75 176
+1 3 169 265 176 50
+1 3 44 146 266 175
+1 3 146 13 177 266
+1 3 266 177 52 178
+1 3 175 266 178 75
+1 3 75 178 267 181
+1 3 178 52 179 267
+1 3 267 179 18 180
+1 3 181 267 180 53
+1 3 50 176 268 171
+1 3 176 75 181 268
+1 3 268 181 53 182
+1 3 171 268 182 17
+1 3 13 156 269 177
+1 3 156 46 183 269
+1 3 269 183 76 184
+1 3 177 269 184 52
+1 3 46 154 270 183
+1 3 154 14 185 270
+1 3 270 185 54 186
+1 3 183 270 186 76
+1 3 76 186 271 189
+1 3 186 54 187 271
+1 3 271 187 19 188
+1 3 189 271 188 55
+1 3 52 184 272 179
+1 3 184 76 189 272
+1 3 272 189 55 190
+1 3 179 272 190 18
+1 3 15 165 273 193
+1 3 165 48 191 273
+1 3 273 191 77 192
+1 3 193 273 192 58
+1 3 48 163 274 191
+1 3 163 16 194 274
+1 3 274 194 56 195
+1 3 191 274 195 77
+1 3 77 195 275 198
+1 3 195 56 196 275
+1 3 275 196 21 197
+1 3 198 275 197 57
+1 3 58 192 276 200
+1 3 192 77 198 276
+1 3 276 198 57 199
+1 3 200 276 199 20
+1 3 16 174 277 194
+1 3 174 51 201 277
+1 3 277 201 78 202
+1 3 194 277 202 56
+1 3 51 172 278 201
+1 3 172 17 203 278
+1 3 278 203 59 204
+1 3 201 278 204 78
+1 3 78 204 279 207
+1 3 204 59 205 279
+1 3 279 205 22 206
+1 3 207 279 206 60
+1 3 56 202 280 196
+1 3 202 78 207 280
+1 3 280 207 60 208
+1 3 196 280 208 21
+1 3 17 182 281 203
+1 3 182 53 209 281
+1 3 281 209 79 210
+1 3 203 281 210 59
+1 3 53 180 282 209
+1 3 180 18 211 282
+1 3 282 211 61 212
+1 3 209 282 212 79
+1 3 79 212 283 215
+1 3 212 61 213 283
+1 3 283 213 23 214
+1 3 215 283 214 62
+1 3 59 210 284 205
+1 3 210 79 215 284
+1 3 284 215 62 216
+1 3 205 284 216 22
+1 3 18 190 285 211
+1 3 190 55 217 285
+1 3 285 217 80 218
+1 3 211 285 218 61
+1 3 55 188 286 217
+1 3 188 19 219 286
+1 3 286 219 63 220
+1 3 217 286 220 80
+1 3 80 220 287 223
+1 3 220 63 221 287
+1 3 287 221 24 222
+1 3 223 287 222 64
+1 3 61 218 288 213
+1 3 218 80 223 288
+1 3 288 223 64 224
+1 3 213 288 224 23
+
+boundary
+64
+2 1 0 81
+2 1 81 25
+2 1 25 85
+2 1 85 1
+2 1 1 93
+2 1 93 29
+2 1 29 96
+2 1 96 2
+2 1 2 103
+2 1 103 32
+2 1 32 106
+2 1 106 3
+2 1 3 113
+2 1 113 35
+2 1 35 116
+2 1 116 4
+2 1 21 197
+2 1 197 57
+2 1 57 199
+2 1 199 20
+2 1 22 206
+2 1 206 60
+2 1 60 208
+2 1 208 21
+2 1 23 214
+2 1 214 62
+2 1 62 216
+2 1 216 22
+2 1 24 222
+2 1 222 64
+2 1 64 224
+2 1 224 23
+1 1 5 92
+1 1 92 28
+1 1 28 84
+1 1 84 0
+1 1 10 132
+1 1 132 40
+1 1 40 125
+1 1 125 5
+1 1 15 166
+1 1 166 49
+1 1 49 159
+1 1 159 10
+1 1 20 200
+1 1 200 58
+1 1 58 193
+1 1 193 15
+1 1 4 117
+1 1 117 36
+1 1 36 119
+1 1 119 9
+1 1 9 151
+1 1 151 45
+1 1 45 153
+1 1 153 14
+1 1 14 185
+1 1 185 54
+1 1 54 187
+1 1 187 19
+1 1 19 219
+1 1 219 63
+1 1 63 221
+1 1 221 24
+
+vertices
+289
+
+nodes
+FiniteElementSpace
+FiniteElementCollection: H1_2D_P1
+VDim: 2
+Ordering: 0
+
+0
+0.25
+0.5
+0.75
+1
+0
+0.25
+0.5
+0.75
+1
+0
+0.25
+0.5
+0.75
+1
+0
+0.25
+0.5
+0.75
+1
+0
+0.25
+0.5
+0.75
+1
+0.125
+0.25
+0.125
+0
+0.375
+0.5
+0.375
+0.625
+0.75
+0.625
+0.875
+1
+0.875
+0.25
+0.125
+0
+0.5
+0.375
+0.75
+0.625
+1
+0.875
+0.25
+0.125
+0
+0.5
+0.375
+0.75
+0.625
+1
+0.875
+0.25
+0.125
+0
+0.5
+0.375
+0.75
+0.625
+1
+0.875
+0.125
+0.375
+0.625
+0.875
+0.125
+0.375
+0.625
+0.875
+0.125
+0.375
+0.625
+0.875
+0.125
+0.375
+0.625
+0.875
+0.0625
+0.125
+0.0625
+0
+0.1875
+0.25
+0.1875
+0.25
+0.1875
+0.125
+0.0625
+0
+0.3125
+0.375
+0.3125
+0.4375
+0.5
+0.4375
+0.5
+0.4375
+0.375
+0.3125
+0.5625
+0.625
+0.5625
+0.6875
+0.75
+0.6875
+0.75
+0.6875
+0.625
+0.5625
+0.8125
+0.875
+0.8125
+0.9375
+1
+0.9375
+1
+0.9375
+0.875
+0.8125
+0.125
+0.0625
+0
+0.25
+0.1875
+0.25
+0.1875
+0.125
+0.0625
+0
+0.375
+0.3125
+0.5
+0.4375
+0.5
+0.4375
+0.375
+0.3125
+0.625
+0.5625
+0.75
+0.6875
+0.75
+0.6875
+0.625
+0.5625
+0.875
+0.8125
+1
+0.9375
+1
+0.9375
+0.875
+0.8125
+0.125
+0.0625
+0
+0.25
+0.1875
+0.25
+0.1875
+0.125
+0.0625
+0
+0.375
+0.3125
+0.5
+0.4375
+0.5
+0.4375
+0.375
+0.3125
+0.625
+0.5625
+0.75
+0.6875
+0.75
+0.6875
+0.625
+0.5625
+0.875
+0.8125
+1
+0.9375
+1
+0.9375
+0.875
+0.8125
+0.125
+0.0625
+0
+0.25
+0.1875
+0.25
+0.1875
+0.125
+0.0625
+0
+0.375
+0.3125
+0.5
+0.4375
+0.5
+0.4375
+0.375
+0.3125
+0.625
+0.5625
+0.75
+0.6875
+0.75
+0.6875
+0.625
+0.5625
+0.875
+0.8125
+1
+0.9375
+1
+0.9375
+0.875
+0.8125
+0.0625
+0.1875
+0.1875
+0.0625
+0.3125
+0.4375
+0.4375
+0.3125
+0.5625
+0.6875
+0.6875
+0.5625
+0.8125
+0.9375
+0.9375
+0.8125
+0.0625
+0.1875
+0.1875
+0.0625
+0.3125
+0.4375
+0.4375
+0.3125
+0.5625
+0.6875
+0.6875
+0.5625
+0.8125
+0.9375
+0.9375
+0.8125
+0.0625
+0.1875
+0.1875
+0.0625
+0.3125
+0.4375
+0.4375
+0.3125
+0.5625
+0.6875
+0.6875
+0.5625
+0.8125
+0.9375
+0.9375
+0.8125
+0.0625
+0.1875
+0.1875
+0.0625
+0.3125
+0.4375
+0.4375
+0.3125
+0.5625
+0.6875
+0.6875
+0.5625
+0.8125
+0.9375
+0.9375
+0.8125
+0
+0
+0
+0
+0
+0.015625
+0.015625
+0.015625
+0.015625
+0.015625
+0.125
+0.125
+0.125
+0.125
+0.125
+0.421875
+0.421875
+0.421875
+0.421875
+0.421875
+1
+1
+1
+1
+1
+0
+0.001953125
+0.015625
+0.001953125
+0
+0.001953125
+0.015625
+0
+0.001953125
+0.015625
+0
+0.001953125
+0.015625
+0.052734375
+0.125
+0.052734375
+0.052734375
+0.125
+0.052734375
+0.125
+0.052734375
+0.125
+0.24414062
+0.421875
+0.24414062
+0.24414062
+0.421875
+0.24414062
+0.421875
+0.24414062
+0.421875
+0.66992188
+1
+0.66992188
+0.66992188
+1
+0.66992188
+1
+0.66992188
+1
+0.001953125
+0.001953125
+0.001953125
+0.001953125
+0.052734375
+0.052734375
+0.052734375
+0.052734375
+0.24414062
+0.24414062
+0.24414062
+0.24414062
+0.66992188
+0.66992188
+0.66992188
+0.66992188
+0
+0.00024414062
+0.001953125
+0.00024414062
+0
+0.00024414062
+0.001953125
+0.0065917969
+0.015625
+0.0065917969
+0.015625
+0.0065917969
+0
+0.00024414062
+0.001953125
+0
+0.00024414062
+0.001953125
+0.0065917969
+0.015625
+0.0065917969
+0.015625
+0
+0.00024414062
+0.001953125
+0
+0.00024414062
+0.001953125
+0.0065917969
+0.015625
+0.0065917969
+0.015625
+0
+0.00024414062
+0.001953125
+0
+0.00024414062
+0.001953125
+0.0065917969
+0.015625
+0.0065917969
+0.015625
+0.030517578
+0.052734375
+0.030517578
+0.030517578
+0.052734375
+0.083740234
+0.125
+0.083740234
+0.125
+0.083740234
+0.030517578
+0.052734375
+0.030517578
+0.052734375
+0.083740234
+0.125
+0.083740234
+0.125
+0.030517578
+0.052734375
+0.030517578
+0.052734375
+0.083740234
+0.125
+0.083740234
+0.125
+0.030517578
+0.052734375
+0.030517578
+0.052734375
+0.083740234
+0.125
+0.083740234
+0.125
+0.17797852
+0.24414062
+0.17797852
+0.17797852
+0.24414062
+0.32495117
+0.421875
+0.32495117
+0.421875
+0.32495117
+0.17797852
+0.24414062
+0.17797852
+0.24414062
+0.32495117
+0.421875
+0.32495117
+0.421875
+0.17797852
+0.24414062
+0.17797852
+0.24414062
+0.32495117
+0.421875
+0.32495117
+0.421875
+0.17797852
+0.24414062
+0.17797852
+0.24414062
+0.32495117
+0.421875
+0.32495117
+0.421875
+0.53637695
+0.66992188
+0.53637695
+0.53637695
+0.66992188
+0.82397461
+1
+0.82397461
+1
+0.82397461
+0.53637695
+0.66992188
+0.53637695
+0.66992188
+0.82397461
+1
+0.82397461
+1
+0.53637695
+0.66992188
+0.53637695
+0.66992188
+0.82397461
+1
+0.82397461
+1
+0.53637695
+0.66992188
+0.53637695
+0.66992188
+0.82397461
+1
+0.82397461
+1
+0.00024414062
+0.00024414062
+0.0065917969
+0.0065917969
+0.00024414062
+0.00024414062
+0.0065917969
+0.0065917969
+0.00024414062
+0.00024414062
+0.0065917969
+0.0065917969
+0.00024414062
+0.00024414062
+0.0065917969
+0.0065917969
+0.030517578
+0.030517578
+0.083740234
+0.083740234
+0.030517578
+0.030517578
+0.083740234
+0.083740234
+0.030517578
+0.030517578
+0.083740234
+0.083740234
+0.030517578
+0.030517578
+0.083740234
+0.083740234
+0.17797852
+0.17797852
+0.32495117
+0.32495117
+0.17797852
+0.17797852
+0.32495117
+0.32495117
+0.17797852
+0.17797852
+0.32495117
+0.32495117
+0.17797852
+0.17797852
+0.32495117
+0.32495117
+0.53637695
+0.53637695
+0.82397461
+0.82397461
+0.53637695
+0.53637695
+0.82397461
+0.82397461
+0.53637695
+0.53637695
+0.82397461
+0.82397461
+0.53637695
+0.53637695
+0.82397461
+0.82397461
diff --git a/miniapps/nurbs/nurbs_ex1.cpp b/miniapps/nurbs/nurbs_ex1.cpp
index f6f57c18736..808c4b5ae7c 100644
--- a/miniapps/nurbs/nurbs_ex1.cpp
+++ b/miniapps/nurbs/nurbs_ex1.cpp
@@ -61,7 +61,7 @@ class Diffusion2Integrator: public BilinearFormIntegrator
       double w;
 
 #ifdef MFEM_THREAD_SAFE
-      Vector shape[nd];
+      Vector shape(nd);
       Vector laplace(nd);
 #else
       shape.SetSize(nd);
diff --git a/miniapps/nurbs/nurbs_ex1p.cpp b/miniapps/nurbs/nurbs_ex1p.cpp
index 8d37e8d2e8c..652cff537e4 100644
--- a/miniapps/nurbs/nurbs_ex1p.cpp
+++ b/miniapps/nurbs/nurbs_ex1p.cpp
@@ -75,7 +75,7 @@ class Diffusion2Integrator: public BilinearFormIntegrator
       double w;
 
 #ifdef MFEM_THREAD_SAFE
-      Vector shape[nd];
+      Vector shape(nd);
       Vector laplace(nd);
 #else
       shape.SetSize(nd);
diff --git a/miniapps/performance/CMakeLists.txt b/miniapps/performance/CMakeLists.txt
index d8053bea7dd..375e6874c5a 100644
--- a/miniapps/performance/CMakeLists.txt
+++ b/miniapps/performance/CMakeLists.txt
@@ -18,22 +18,23 @@ endif()
 set(PERFORMANCE_CXX_OPTIONS)
 if (${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
   list(APPEND PERFORMANCE_CXX_OPTIONS
-    ${MFEM_PERF_CXX_ARCH_FLAGS}
-    "-Wpedantic"
-    "-Wall"
+    "-march=native"
     "-fcolor-diagnostics"
     "-fvectorize"
     "-fslp-vectorize"
-    "-fslp-vectorize-aggressive"
     "-ffp-contract=fast")
 elseif (${CMAKE_CXX_COMPILER_ID} MATCHES "GNU")
   list(APPEND PERFORMANCE_CXX_OPTIONS
     ${MFEM_PERF_CXX_ARCH_FLAGS}
-    "-Wall"
-    "--param" "max-completely-peel-times=3")
+    "-Wall")
   if (NOT MFEM_USE_CUDA)
      list(APPEND PERFORMANCE_CXX_OPTIONS "-pedantic")
   endif()
+  if (NOT MFEM_USE_SIMD)
+    list(APPEND PERFORMANCE_CXX_OPTIONS "--param" "max-completely-peel-times=3")
+  endif()
+elseif (${CMAKE_CXX_COMPILER_ID} MATCHES "Intel")
+  list(APPEND PERFORMANCE_CXX_OPTIONS "-xHost")
 endif()
 
 add_mfem_miniapp(performance_ex1
diff --git a/miniapps/performance/ex1.cpp b/miniapps/performance/ex1.cpp
index c0b3ab69a33..97bcc273c94 100644
--- a/miniapps/performance/ex1.cpp
+++ b/miniapps/performance/ex1.cpp
@@ -124,6 +124,9 @@ int main(int argc, char *argv[])
       return 3;
    }
 
+   cout << "\nMFEM SIMD width: " << MFEM_SIMD_BYTES/sizeof(double)
+        << " doubles\n" << endl;
+
    // See class BasisType in fem/fe_coll.hpp for available basis types
    int basis = BasisType::GetType(basis_type[0]);
    cout << "Using " << BasisType::Name(basis) << " basis ..." << endl;
diff --git a/miniapps/performance/ex1p.cpp b/miniapps/performance/ex1p.cpp
index ecf9e92c6b0..c55819e88b2 100644
--- a/miniapps/performance/ex1p.cpp
+++ b/miniapps/performance/ex1p.cpp
@@ -144,6 +144,12 @@ int main(int argc, char *argv[])
       return 3;
    }
 
+   if (myid == 0)
+   {
+      cout << "\nMFEM SIMD width: " << MFEM_SIMD_BYTES/sizeof(double)
+           << " doubles\n" << endl;
+   }
+
    // See class BasisType in fem/fe_coll.hpp for available basis types
    int basis = BasisType::GetType(basis_type[0]);
    if (myid == 0)
diff --git a/miniapps/performance/makefile b/miniapps/performance/makefile
index 25b9f67d418..a98d303a6e4 100644
--- a/miniapps/performance/makefile
+++ b/miniapps/performance/makefile
@@ -24,48 +24,90 @@ MFEM_LIB_FILE = mfem_is_not_built
 # Distinguish x86 from PowerPC systems
 MFEM_MACHINE ?= $(shell uname -m)
 
-# Choose the switch MFEM_PERF_SW: gcc_x86_64, gcc_ppc64, or clang.
+# Choose the switch MFEM_PERF_SW: gcc_x86_64, gcc_ppc64, clang, etc.
 # The value of MFEM_PERF_SW is used to select MFEM_PERF_CXXFLAGS below.
-ifneq (,$(MFEM_PERF_SW))
-   # Use the value of MFEM_PERF_SW if already defined
-else ifneq (,$(filter %clang++ %mpiclang++,$(MFEM_CXX)))
-   MFEM_PERF_SW = clang
-else ifneq (,$(filter %g++ %mpicxx %mpic++,$(MFEM_CXX)))
-   ifeq ($(MFEM_MACHINE),x86_64)
+define cxx_detect
+cxx_v="$$($(MFEM_HOST_CXX) --version -c 2>&1)";
+if [ 0 -ne $$? ]; then
+  cxx_id="unknown";
+elif [ -z "$${cxx_v##g++*}" ]; then
+  cxx_id="gcc";
+elif [ -z "$${cxx_v##*clang version*}" -o -z "$${cxx_v##*LLVM version*}" ]; then
+  cxx_id="clang";
+elif [ -z "$${cxx_v##*icpc*}" ]; then
+    cxx_id="icc";
+elif [ -z "$${cxx_v##*IBM XL*}" ]; then
+  cxx_id="xlc";
+elif [ -z "$${cxx_v##*pgc++*}" ]; then
+  cxx_id="pgi";
+else
+  cxx_id="unknown";
+fi;
+printf "%s" "$$cxx_id"
+endef
+define DETECT_PERF_CXXFLAGS
+ifneq (,$$(MFEM_HOST_CXX))
+   MFEM_PERF_SW := $$(shell $$(cxx_detect))
+   $$(info Detected host compiler: $$(MFEM_PERF_SW))
+   ifeq (unknown,$$(MFEM_PERF_SW))
+      $$(info -------------------------------------------)
+      $$(info Output from '$$(MFEM_HOST_CXX) --version -c')
+      $$(info -------------------------------------------)
+      $$(shell $$(MFEM_HOST_CXX) --version -c 1>&2)
+      $$(info -------------------------------------------)
+   endif
+endif
+ifeq (gcc,$$(MFEM_PERF_SW))
+   ifeq ($$(MFEM_MACHINE),x86_64)
       MFEM_PERF_SW = gcc_x86_64
-   else ifneq (,$(findstring ppc64,$(MFEM_MACHINE)))
+   else ifneq (,$$(findstring ppc64,$$(MFEM_MACHINE)))
       MFEM_PERF_SW = gcc_ppc64
    endif
 endif
+# Choose MFEM_PERF_CXXFLAGS based on MFEM_PERF_SW:
+MFEM_PERF_CXXFLAGS = $$(MFEM_PERF_CXXFLAGS_$$(MFEM_PERF_SW))
+# Add MFEM_PERF_CXXFLAGS to MFEM_CXXFLAGS:
+ifeq (YES,$$(MFEM_USE_CUDA))
+  ifneq (,$$(MFEM_PERF_CXXFLAGS))
+    MFEM_CXXFLAGS += -Xcompiler="$$(MFEM_PERF_CXXFLAGS)"
+  endif
+else
+  MFEM_CXXFLAGS += $$(MFEM_PERF_CXXFLAGS)
+endif
+DETECT_PERF_CXXFLAGS_DONE = YES
+endef
 
 # Compiler specific optimizations.
 # For best performance, GCC 5 (or newer) is recommended.
 
+ifneq (YES,$(MFEM_USE_CUDA))
+  PEDANTIC_FLAG = -pedantic
+endif
+
 # - GCC extra options:
-# MFEM_PERF_CXXFLAGS_gcc_common += -std=c++03
-MFEM_PERF_CXXFLAGS_gcc_common += -std=c++11
-MFEM_PERF_CXXFLAGS_gcc_common += -pedantic -Wall
-MFEM_PERF_CXXFLAGS_gcc_common += --param max-completely-peel-times=3
-# MFEM_PERF_CXXFLAGS_gcc_common += -fdump-tree-optimized-blocks
+MFEM_PERF_CXXFLAGS_gcc_common += $(PEDANTIC_FLAG) -Wall
+ifeq ($(MFEM_USE_SIMD),NO)
+  MFEM_PERF_CXXFLAGS_gcc_common += --param max-completely-peel-times=3
+endif
+#MFEM_PERF_CXXFLAGS_gcc_common += -fdump-tree-optimized-blocks
 MFEM_PERF_CXXFLAGS_gcc_x86_64 = -march=native $(MFEM_PERF_CXXFLAGS_gcc_common)
 MFEM_PERF_CXXFLAGS_gcc_ppc64 = -mcpu=native -mtune=native\
  $(MFEM_PERF_CXXFLAGS_gcc_common)
 
+# - XLC extra options:
+MFEM_PERF_CXXFLAGS_xlc = -mcpu=native
+
 # - Clang extra options:
 MFEM_PERF_CXXFLAGS_clang += -march=native
-# MFEM_PERF_CXXFLAGS_clang += -std=c++03
-MFEM_PERF_CXXFLAGS_clang += -std=c++11
-MFEM_PERF_CXXFLAGS_clang += -pedantic -Wall
+MFEM_PERF_CXXFLAGS_clang += $(PEDANTIC_FLAG) -Wall
 MFEM_PERF_CXXFLAGS_clang += -fcolor-diagnostics
 MFEM_PERF_CXXFLAGS_clang += -fvectorize
 MFEM_PERF_CXXFLAGS_clang += -fslp-vectorize
-MFEM_PERF_CXXFLAGS_clang += -fslp-vectorize-aggressive
 MFEM_PERF_CXXFLAGS_clang += -ffp-contract=fast
 
-# Choose MFEM_PERF_CXXFLAGS based on MFEM_PERF_SW:
-MFEM_PERF_CXXFLAGS = $(MFEM_PERF_CXXFLAGS_$(MFEM_PERF_SW))
-# Add MFEM_PERF_CXXFLAGS to MFEM_CXXFLAGS:
-MFEM_CXXFLAGS += $(MFEM_PERF_CXXFLAGS)
+# - Intel C++ compiler extra options:
+MFEM_PERF_CXXFLAGS_icc += -xHost
+
 
 SEQ_MINIAPPS = ex1
 PAR_MINIAPPS = ex1p
@@ -84,6 +126,7 @@ endif
 
 # Replace the default implicit rule for *.cpp files
 %: $(SRC)%.cpp $(MFEM_LIB_FILE) $(CONFIG_MK)
+	$(if $(DETECT_PERF_CXXFLAGS_DONE),,$(eval $(DETECT_PERF_CXXFLAGS)))
 	$(MFEM_CXX) $(MFEM_FLAGS) $< -o $@ $(MFEM_LIBS)
 
 all: $(MINIAPPS)
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 31c0b224197..5b25bf42d41 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -48,6 +48,7 @@ set(UNIT_TESTS_SRCS
   fem/test_operatorjacobismoother.cpp
   fem/test_pa_coeff.cpp
   fem/test_pa_kernels.cpp
+  fem/test_quadf_coef.cpp
   fem/test_quadraturefunc.cpp
   miniapps/test_sedov.cpp
 )
diff --git a/tests/unit/fem/test_1d_bilininteg.cpp b/tests/unit/fem/test_1d_bilininteg.cpp
index c738a61b686..38f6124ef80 100644
--- a/tests/unit/fem/test_1d_bilininteg.cpp
+++ b/tests/unit/fem/test_1d_bilininteg.cpp
@@ -55,7 +55,7 @@ TEST_CASE("1D Bilinear Mass Integrators",
 
       SECTION("Mapping H1 to L2")
       {
-         L2_FECollection    fec_l2(order - 1, dim);
+         L2_FECollection    fec_l2(order, dim);
          FiniteElementSpace fespace_l2(&mesh, &fec_l2);
 
          BilinearForm m_l2(&fespace_l2);
diff --git a/tests/unit/fem/test_2d_bilininteg.cpp b/tests/unit/fem/test_2d_bilininteg.cpp
index ea53d7a03d3..936f58d3bf7 100644
--- a/tests/unit/fem/test_2d_bilininteg.cpp
+++ b/tests/unit/fem/test_2d_bilininteg.cpp
@@ -183,7 +183,7 @@ TEST_CASE("2D Bilinear Mass Integrators",
 
       SECTION("Mapping H1 to L2")
       {
-         L2_FECollection    fec_l2(order - 1, dim);
+         L2_FECollection    fec_l2(order, dim);
          FiniteElementSpace fespace_l2(&mesh, &fec_l2);
 
          BilinearForm m_l2(&fespace_l2);
@@ -2768,7 +2768,7 @@ TEST_CASE("2D Bilinear Dot Product Integrators",
       }
       SECTION("Mapping ND to L2")
       {
-         L2_FECollection    fec_l2(order - 1, dim);
+         L2_FECollection    fec_l2(order, dim);
          FiniteElementSpace fespace_l2(&mesh, &fec_l2);
 
          BilinearForm m_l2(&fespace_l2);
@@ -2832,7 +2832,7 @@ TEST_CASE("2D Bilinear Dot Product Integrators",
       }
       SECTION("Mapping RT to L2")
       {
-         L2_FECollection    fec_l2(order - 1, dim);
+         L2_FECollection    fec_l2(order, dim);
          FiniteElementSpace fespace_l2(&mesh, &fec_l2);
 
          BilinearForm m_l2(&fespace_l2);
diff --git a/tests/unit/fem/test_3d_bilininteg.cpp b/tests/unit/fem/test_3d_bilininteg.cpp
index 57f9c4dd4d9..94bad6ea10b 100644
--- a/tests/unit/fem/test_3d_bilininteg.cpp
+++ b/tests/unit/fem/test_3d_bilininteg.cpp
@@ -528,7 +528,7 @@ TEST_CASE("3D Bilinear Mass Integrators",
 
          SECTION("Mapping H1 to L2")
          {
-            L2_FECollection    fec_l2(order - 1, dim);
+            L2_FECollection    fec_l2(order, dim);
             FiniteElementSpace fespace_l2(&mesh, &fec_l2);
 
             BilinearForm m_l2(&fespace_l2);
@@ -741,6 +741,7 @@ TEST_CASE("3D Bilinear Mass Integrators",
 }
 
 TEST_CASE("3D Bilinear Vector Mass Integrators",
+          "[VectorFEMassIntegrator]"
           "[MixedVectorMassIntegrator]"
           "[MixedVectorIntegrator]"
           "[BilinearFormIntegrator]"
@@ -790,7 +791,7 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                Vector tmp_rt(fespace_rt.GetNDofs());
 
-               SECTION("Without Coefficient")
+               SECTION("Without Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_nd, &fespace_rt);
                   blf.AddDomainIntegrator(new MixedVectorMassIntegrator());
@@ -826,6 +827,31 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                   delete diffv;
                }
+               SECTION("Without Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_nd, &fespace_rt);
+                  blf.AddDomainIntegrator(new VectorFEMassIntegrator());
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_nd,tmp_rt); g_rt = 0.0;
+                  PCG(m_rt, s_rt, tmp_rt, g_rt, 0, 200, cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_rt.ComputeL2Error(F3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_rt, &fespace_nd);
+                  blfw.AddDomainIntegrator(new VectorFEMassIntegrator());
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
+                  delete blfT;
+                  delete diff;
+               }
             }
             {
                // Tests requiring a higher order RT space
@@ -843,7 +869,7 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                Vector tmp_rt(fespace_rt.GetNDofs());
 
-               SECTION("With Scalar Coefficient")
+               SECTION("With Scalar Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_nd, &fespace_rt);
                   blf.AddDomainIntegrator(
@@ -870,7 +896,32 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
                   delete blfT;
                   delete diff;
                }
-               SECTION("With Diagonal Matrix Coefficient")
+               SECTION("With Scalar Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_nd, &fespace_rt);
+                  blf.AddDomainIntegrator(new VectorFEMassIntegrator(q3_coef));
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_nd,tmp_rt); g_rt = 0.0;
+                  PCG(m_rt, s_rt, tmp_rt, g_rt, 0, 200, cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_rt.ComputeL2Error(qF3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_rt, &fespace_nd);
+                  blfw.AddDomainIntegrator(new VectorFEMassIntegrator(q3_coef));
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
+                  delete blfT;
+                  delete diff;
+               }
+               SECTION("With Diagonal Matrix Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_nd, &fespace_rt);
                   blf.AddDomainIntegrator(
@@ -897,7 +948,34 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
                   delete blfT;
                   delete diff;
                }
-               SECTION("With Matrix Coefficient")
+               SECTION("With Diagonal Matrix Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_nd, &fespace_rt);
+                  blf.AddDomainIntegrator(
+                     new VectorFEMassIntegrator(D3_coef));
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_nd,tmp_rt); g_rt = 0.0;
+                  PCG(m_rt, s_rt, tmp_rt, g_rt, 0, 200, cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_rt.ComputeL2Error(DF3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_rt, &fespace_nd);
+                  blfw.AddDomainIntegrator(
+                     new VectorFEMassIntegrator(D3_coef));
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
+                  delete blfT;
+                  delete diff;
+               }
+               SECTION("With Matrix Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_nd, &fespace_rt);
                   blf.AddDomainIntegrator(
@@ -921,6 +999,32 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                   REQUIRE( diff->MaxNorm() < tol );
 
+                  delete blfT;
+                  delete diff;
+               }
+               SECTION("With Matrix Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_nd, &fespace_rt);
+                  blf.AddDomainIntegrator(new VectorFEMassIntegrator(M3_coef));
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_nd,tmp_rt); g_rt = 0.0;
+                  PCG(m_rt, s_rt, tmp_rt, g_rt, 0, 200, cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_rt.ComputeL2Error(MF3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_rt, &fespace_nd);
+                  blfw.AddDomainIntegrator(
+                     new VectorFEMassIntegrator(MT3_coef));
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
                   delete blfT;
                   delete diff;
                }
@@ -943,7 +1047,7 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                Vector tmp_nd(fespace_nd.GetNDofs());
 
-               SECTION("Without Coefficient")
+               SECTION("Without Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_nd, &fespace_nd);
                   blf.AddDomainIntegrator(new MixedVectorMassIntegrator());
@@ -979,6 +1083,31 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                   delete diffv;
                }
+               SECTION("Without Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_nd, &fespace_nd);
+                  blf.AddDomainIntegrator(new VectorFEMassIntegrator());
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_nd,tmp_nd); g_nd = 0.0;
+                  PCG(m_nd, s_nd, tmp_nd, g_nd, 0, 200, cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_nd.ComputeL2Error(F3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_nd, &fespace_nd);
+                  blfw.AddDomainIntegrator(new VectorFEMassIntegrator());
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
+                  delete blfT;
+                  delete diff;
+               }
             }
             {
                // Tests requiring a higher order ND space
@@ -996,7 +1125,7 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                Vector tmp_ndp(fespace_ndp.GetNDofs());
 
-               SECTION("With Scalar Coefficient")
+               SECTION("With Scalar Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_nd, &fespace_ndp);
                   blf.AddDomainIntegrator(
@@ -1024,7 +1153,33 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
                   delete blfT;
                   delete diff;
                }
-               SECTION("With Diagonal Matrix Coefficient")
+               SECTION("With Scalar Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_nd, &fespace_ndp);
+                  blf.AddDomainIntegrator(new VectorFEMassIntegrator(q3_coef));
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_nd,tmp_ndp); g_ndp = 0.0;
+                  PCG(m_ndp, s_ndp, tmp_ndp, g_ndp, 0, 200,
+                      cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_ndp.ComputeL2Error(qF3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_ndp, &fespace_nd);
+                  blfw.AddDomainIntegrator(new VectorFEMassIntegrator(q3_coef));
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
+                  delete blfT;
+                  delete diff;
+               }
+               SECTION("With Diagonal Matrix Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_nd, &fespace_ndp);
                   blf.AddDomainIntegrator(
@@ -1052,7 +1207,35 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
                   delete blfT;
                   delete diff;
                }
-               SECTION("With Matrix Coefficient")
+               SECTION("With Diagonal Matrix Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_nd, &fespace_ndp);
+                  blf.AddDomainIntegrator(
+                     new VectorFEMassIntegrator(D3_coef));
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_nd,tmp_ndp); g_ndp = 0.0;
+                  PCG(m_ndp, s_ndp, tmp_ndp, g_ndp, 0, 200,
+                      cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_ndp.ComputeL2Error(DF3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_ndp, &fespace_nd);
+                  blfw.AddDomainIntegrator(
+                     new VectorFEMassIntegrator(D3_coef));
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
+                  delete blfT;
+                  delete diff;
+               }
+               SECTION("With Matrix Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_nd, &fespace_ndp);
                   blf.AddDomainIntegrator(
@@ -1077,6 +1260,33 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                   REQUIRE( diff->MaxNorm() < tol );
 
+                  delete blfT;
+                  delete diff;
+               }
+               SECTION("With Matrix Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_nd, &fespace_ndp);
+                  blf.AddDomainIntegrator(new VectorFEMassIntegrator(M3_coef));
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_nd,tmp_ndp); g_ndp = 0.0;
+                  PCG(m_ndp, s_ndp, tmp_ndp, g_ndp, 0, 200,
+                      cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_ndp.ComputeL2Error(MF3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_ndp, &fespace_nd);
+                  blfw.AddDomainIntegrator(
+                     new VectorFEMassIntegrator(MT3_coef));
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
                   delete blfT;
                   delete diff;
                }
@@ -1109,7 +1319,7 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                Vector tmp_nd(fespace_nd.GetNDofs());
 
-               SECTION("Without Coefficient")
+               SECTION("Without Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_rt, &fespace_nd);
                   blf.AddDomainIntegrator(new MixedVectorMassIntegrator());
@@ -1145,6 +1355,31 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                   delete diffv;
                }
+               SECTION("Without Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_rt, &fespace_nd);
+                  blf.AddDomainIntegrator(new VectorFEMassIntegrator());
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_rt,tmp_nd); g_nd = 0.0;
+                  PCG(m_nd, s_nd, tmp_nd, g_nd, 0, 200, cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_nd.ComputeL2Error(F3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_nd, &fespace_rt);
+                  blfw.AddDomainIntegrator(new VectorFEMassIntegrator());
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
+                  delete blfT;
+                  delete diff;
+               }
             }
             {
                // Tests requiring a higher order ND space
@@ -1162,7 +1397,7 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                Vector tmp_nd(fespace_nd.GetNDofs());
 
-               SECTION("With Scalar Coefficient")
+               SECTION("With Scalar Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_rt, &fespace_nd);
                   blf.AddDomainIntegrator(
@@ -1189,7 +1424,32 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
                   delete blfT;
                   delete diff;
                }
-               SECTION("With Diagonal Matrix Coefficient")
+               SECTION("With Scalar Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_rt, &fespace_nd);
+                  blf.AddDomainIntegrator(new VectorFEMassIntegrator(q3_coef));
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_rt,tmp_nd); g_nd = 0.0;
+                  PCG(m_nd, s_nd, tmp_nd, g_nd, 0, 200, cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_nd.ComputeL2Error(qF3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_nd, &fespace_rt);
+                  blfw.AddDomainIntegrator(new VectorFEMassIntegrator(q3_coef));
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
+                  delete blfT;
+                  delete diff;
+               }
+               SECTION("With Diagonal Matrix Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_rt, &fespace_nd);
                   blf.AddDomainIntegrator(
@@ -1216,7 +1476,34 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
                   delete blfT;
                   delete diff;
                }
-               SECTION("With Matrix Coefficient")
+               SECTION("With Diagonal Matrix Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_rt, &fespace_nd);
+                  blf.AddDomainIntegrator(
+                     new VectorFEMassIntegrator(D3_coef));
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_rt,tmp_nd); g_nd = 0.0;
+                  PCG(m_nd, s_nd, tmp_nd, g_nd, 0, 200, cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_nd.ComputeL2Error(DF3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_nd, &fespace_rt);
+                  blfw.AddDomainIntegrator(
+                     new VectorFEMassIntegrator(D3_coef));
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
+                  delete blfT;
+                  delete diff;
+               }
+               SECTION("With Matrix Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_rt, &fespace_nd);
                   blf.AddDomainIntegrator(
@@ -1240,6 +1527,32 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                   REQUIRE( diff->MaxNorm() < tol );
 
+                  delete blfT;
+                  delete diff;
+               }
+               SECTION("With Matrix Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_rt, &fespace_nd);
+                  blf.AddDomainIntegrator(new VectorFEMassIntegrator(M3_coef));
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_rt,tmp_nd); g_nd = 0.0;
+                  PCG(m_nd, s_nd, tmp_nd, g_nd, 0, 200, cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_nd.ComputeL2Error(MF3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_nd, &fespace_rt);
+                  blfw.AddDomainIntegrator(
+                     new VectorFEMassIntegrator(MT3_coef));
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
                   delete blfT;
                   delete diff;
                }
@@ -1262,7 +1575,7 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                Vector tmp_rt(fespace_rt.GetNDofs());
 
-               SECTION("Without Coefficient")
+               SECTION("Without Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_rt, &fespace_rt);
                   blf.AddDomainIntegrator(new MixedVectorMassIntegrator());
@@ -1298,6 +1611,31 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                   delete diffv;
                }
+               SECTION("Without Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_rt, &fespace_rt);
+                  blf.AddDomainIntegrator(new VectorFEMassIntegrator());
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_rt,tmp_rt); g_rt = 0.0;
+                  PCG(m_rt, s_rt, tmp_rt, g_rt, 0, 200, cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_rt.ComputeL2Error(F3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_rt, &fespace_rt);
+                  blfw.AddDomainIntegrator(new VectorFEMassIntegrator());
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
+                  delete blfT;
+                  delete diff;
+               }
             }
             {
                // Tests requiring a higher order RT space
@@ -1315,7 +1653,7 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                Vector tmp_rtp(fespace_rtp.GetNDofs());
 
-               SECTION("With Scalar Coefficient")
+               SECTION("With Scalar Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_rt, &fespace_rtp);
                   blf.AddDomainIntegrator(
@@ -1343,7 +1681,33 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
                   delete blfT;
                   delete diff;
                }
-               SECTION("With Diagonal Matrix Coefficient")
+               SECTION("With Scalar Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_rt, &fespace_rtp);
+                  blf.AddDomainIntegrator(new VectorFEMassIntegrator(q3_coef));
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_rt,tmp_rtp); g_rtp = 0.0;
+                  PCG(m_rtp, s_rtp, tmp_rtp, g_rtp, 0, 200,
+                      cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_rtp.ComputeL2Error(qF3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_rtp, &fespace_rt);
+                  blfw.AddDomainIntegrator(new VectorFEMassIntegrator(q3_coef));
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
+                  delete blfT;
+                  delete diff;
+               }
+               SECTION("With Diagonal Matrix Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_rt, &fespace_rtp);
                   blf.AddDomainIntegrator(
@@ -1371,7 +1735,35 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
                   delete blfT;
                   delete diff;
                }
-               SECTION("With Matrix Coefficient")
+               SECTION("With Diagonal Matrix Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_rt, &fespace_rtp);
+                  blf.AddDomainIntegrator(
+                     new VectorFEMassIntegrator(D3_coef));
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_rt,tmp_rtp); g_rtp = 0.0;
+                  PCG(m_rtp, s_rtp, tmp_rtp, g_rtp, 0, 200,
+                      cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_rtp.ComputeL2Error(DF3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_rtp, &fespace_rt);
+                  blfw.AddDomainIntegrator(
+                     new VectorFEMassIntegrator(D3_coef));
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
+                  delete blfT;
+                  delete diff;
+               }
+               SECTION("With Matrix Coefficient (MixedVector)")
                {
                   MixedBilinearForm blf(&fespace_rt, &fespace_rtp);
                   blf.AddDomainIntegrator(
@@ -1396,6 +1788,33 @@ TEST_CASE("3D Bilinear Vector Mass Integrators",
 
                   REQUIRE( diff->MaxNorm() < tol );
 
+                  delete blfT;
+                  delete diff;
+               }
+               SECTION("With Matrix Coefficient (VectorFE)")
+               {
+                  MixedBilinearForm blf(&fespace_rt, &fespace_rtp);
+                  blf.AddDomainIntegrator(new VectorFEMassIntegrator(M3_coef));
+                  blf.Assemble();
+                  blf.Finalize();
+
+                  blf.Mult(f_rt,tmp_rtp); g_rtp = 0.0;
+                  PCG(m_rtp, s_rtp, tmp_rtp, g_rtp, 0, 200,
+                      cg_rtol * cg_rtol, 0.0);
+
+                  REQUIRE( g_rtp.ComputeL2Error(MF3_coef) < tol );
+
+                  MixedBilinearForm blfw(&fespace_rtp, &fespace_rt);
+                  blfw.AddDomainIntegrator(
+                     new VectorFEMassIntegrator(MT3_coef));
+                  blfw.Assemble();
+                  blfw.Finalize();
+
+                  SparseMatrix * blfT = Transpose(blfw.SpMat());
+                  SparseMatrix * diff = Add(1.0,blf.SpMat(),-1.0,*blfT);
+
+                  REQUIRE( diff->MaxNorm() < tol );
+
                   delete blfT;
                   delete diff;
                }
@@ -2588,7 +3007,7 @@ TEST_CASE("3D Bilinear Vector Dot Product Integrators",
          }
          SECTION("Mapping ND to L2")
          {
-            L2_FECollection    fec_l2(order - 1, dim);
+            L2_FECollection    fec_l2(order, dim);
             FiniteElementSpace fespace_l2(&mesh, &fec_l2);
 
             BilinearForm m_l2(&fespace_l2);
@@ -2652,7 +3071,7 @@ TEST_CASE("3D Bilinear Vector Dot Product Integrators",
          }
          SECTION("Mapping RT to L2")
          {
-            L2_FECollection    fec_l2(order - 1, dim);
+            L2_FECollection    fec_l2(order, dim);
             FiniteElementSpace fespace_l2(&mesh, &fec_l2);
 
             BilinearForm m_l2(&fespace_l2);
diff --git a/tests/unit/fem/test_assemblediagonalpa.cpp b/tests/unit/fem/test_assemblediagonalpa.cpp
index a847ce1662a..267bc6fdbd3 100644
--- a/tests/unit/fem/test_assemblediagonalpa.cpp
+++ b/tests/unit/fem/test_assemblediagonalpa.cpp
@@ -197,4 +197,83 @@ TEST_CASE("Vector Diffusion Diagonal PA",
    }
 }
 
+TEST_CASE("Hcurl/Hdiv diagonal PA")
+{
+   // Compares the partial-assembly diagonal with the diagonal of the fully
+   // assembled matrix on 2D quad / 3D hex meshes, for ND (spaceType 0) and RT
+   // spaces of order 1-3, with the mass (integrator 0) or curl-curl/div-div term.
+   for (int dimension = 2; dimension < 4; ++dimension)
+   {
+      for (int spaceType = 0; spaceType < 2; ++spaceType)
+      {
+         const bool hcurl = (spaceType == 0);
+         for (int integrator = 0; integrator < 2; ++integrator)
+         {
+            for (int ne = 1; ne < 3; ++ne)
+            {
+               std::cout << "Testing " << dimension << "D partial assembly "
+                         << (hcurl ? "H(curl)" : "H(div)")
+                         << " diagonal for integrator " << integrator << ": "
+                         << std::pow(ne, dimension) << " elements." << std::endl;
+
+               for (int order = 1; order < 4; ++order)
+               {
+                  Mesh * mesh;
+                  if (dimension == 2)
+                  {
+                     mesh = new Mesh(ne, ne, Element::QUADRILATERAL, 1, 1.0, 1.0);
+                  }
+                  else
+                  {
+                     mesh = new Mesh(ne, ne, ne, Element::HEXAHEDRON, 1, 1.0, 1.0, 1.0);
+                  }
+
+                  FiniteElementCollection* fec = hcurl ?
+                                                 (FiniteElementCollection*) new ND_FECollection(order, dimension) :
+                                                 (FiniteElementCollection*) new RT_FECollection(order, dimension);
+
+                  FiniteElementSpace fespace(mesh, fec);
+                  BilinearForm paform(&fespace);
+                  BilinearForm faform(&fespace);
+                  ConstantCoefficient one(1.0);
+                  paform.SetAssemblyLevel(AssemblyLevel::PARTIAL);
+                  if (integrator == 0)
+                  {
+                     paform.AddDomainIntegrator(new VectorFEMassIntegrator(one));
+                     faform.AddDomainIntegrator(new VectorFEMassIntegrator(one));
+                  }
+                  else if (hcurl)
+                  {
+                     paform.AddDomainIntegrator(new CurlCurlIntegrator(one));
+                     faform.AddDomainIntegrator(new CurlCurlIntegrator(one));
+                  }
+                  else
+                  {
+                     paform.AddDomainIntegrator(new DivDivIntegrator(one));
+                     faform.AddDomainIntegrator(new DivDivIntegrator(one));
+                  }
+                  paform.Assemble();
+                  Vector pa_diag(fespace.GetVSize());
+                  paform.AssembleDiagonal(pa_diag);
+
+                  faform.Assemble();
+                  faform.Finalize();
+                  Vector assembly_diag(fespace.GetVSize());
+                  faform.SpMat().GetDiag(assembly_diag);
+
+                  assembly_diag -= pa_diag;
+                  const double error = assembly_diag.Norml2();
+                  std::cout << "    order: " << order << ", error norm: " << error << std::endl;
+
+                  // Free before the check: a failing REQUIRE would skip deletes placed after it.
+                  delete mesh;
+                  delete fec;
+                  REQUIRE(error < 1.e-12);
+               }
+            }
+         }
+      }
+   }
+}
+
 } // namespace assemblediagonalpa
diff --git a/tests/unit/fem/test_ea_kernels.cpp b/tests/unit/fem/test_ea_kernels.cpp
new file mode 100644
index 00000000000..d09cffcf93e
--- /dev/null
+++ b/tests/unit/fem/test_ea_kernels.cpp
@@ -0,0 +1,148 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#include "catch.hpp"
+#include "mfem.hpp"
+#include <fstream>
+#include <iostream>
+
+using namespace mfem;
+
+namespace ea_kernels
+{
+
+// Velocity field for the convection case; the components set depend on x.Size().
+void velocity_function(const Vector &x, Vector &v)
+{
+   const int sdim = x.Size();
+   if (sdim == 1) { v(0) = 1.0; return; }
+   if (sdim != 2 && sdim != 3) { return; }
+   v(0) = x(1);
+   v(1) = -x(0);
+   if (sdim == 3) { v(2) = x(0); }
+}
+
+// Adds the convection domain integrator to k; when dg is set, also adds the
+// transposed DG trace integrator on interior and on boundary faces.
+void AddConvectionIntegrators(BilinearForm &k, VectorCoefficient &velocity,
+                              bool dg)
+{
+   k.AddDomainIntegrator(new ConvectionIntegrator(velocity, -1.0));
+   if (!dg) { return; }
+   const double alpha = 1.0, beta = -0.5;
+   k.AddInteriorFaceIntegrator(
+      new TransposeIntegrator(new DGTraceIntegrator(velocity, alpha, beta)));
+   k.AddBdrFaceIntegrator(
+      new TransposeIntegrator(new DGTraceIntegrator(velocity, alpha, beta)));
+}
+
+// Applies problem pb (0: mass, 1: convection, 2: diffusion) in fully assembled
+// and element-assembled form to the same random vector on mesh and requires
+// the results to agree; dg selects an L2 space instead of H1.
+void test_ea(Mesh &&mesh, int order, bool dg, const int pb)
+{
+   mesh.EnsureNodes();
+   mesh.SetCurvature(mesh.GetNodalFESpace()->GetOrder(0));
+   const int dim = mesh.Dimension();
+
+   FiniteElementCollection *fec = dg ?
+      static_cast<FiniteElementCollection*>(
+         new L2_FECollection(order, dim, BasisType::GaussLobatto)) :
+      new H1_FECollection(order, dim);
+   FiniteElementSpace fespace(&mesh, fec);
+
+   BilinearForm k_ea(&fespace);
+   BilinearForm k_fa(&fespace);
+
+   ConstantCoefficient one(1.0);
+   VectorFunctionCoefficient vel_coeff(dim, velocity_function);
+
+   switch (pb)
+   {
+      case 0: // Mass
+         k_fa.AddDomainIntegrator(new MassIntegrator(one));
+         k_ea.AddDomainIntegrator(new MassIntegrator(one));
+         break;
+      case 1: // Convection
+         AddConvectionIntegrators(k_fa, vel_coeff, dg);
+         AddConvectionIntegrators(k_ea, vel_coeff, dg);
+         break;
+      case 2: // Diffusion
+         k_fa.AddDomainIntegrator(new DiffusionIntegrator(one));
+         k_ea.AddDomainIntegrator(new DiffusionIntegrator(one));
+         break;
+      default: // any other value adds no integrator
+         break;
+   }
+
+   k_fa.Assemble();
+   k_fa.Finalize();
+
+   k_ea.SetAssemblyLevel(AssemblyLevel::ELEMENT);
+   k_ea.Assemble();
+
+   GridFunction x(&fespace), y_fa(&fespace), y_ea(&fespace);
+
+   x.Randomize(1);
+
+   k_fa.Mult(x,y_fa);
+   k_ea.Mult(x,y_ea);
+
+   y_ea -= y_fa;
+   const double diff_norm = y_ea.Norml2();
+
+   // Free fec before the check: a failing REQUIRE would skip a later delete.
+   delete fec;
+   REQUIRE(diff_norm < 1.e-12);
+}
+
+// Basic unit test for element assembly (mass, convection and diffusion)
+TEST_CASE("Element Assembly", "[ElementAssembly]")
+{
+   for (int pb : {0, 1, 2})
+   {
+      for (bool dg : {true, false})
+      {
+         // Unique name per (pb, dg): Catch enters a repeated section name only once.
+         const std::string tag = "pb " + std::to_string(pb) + (dg ? ", DG" : ", H1");
+         SECTION("2D: " + tag)
+         {
+            for (int order : {2, 3, 4})
+            {
+               test_ea(Mesh("../../data/periodic-square.mesh", 1, 1), order, dg, pb);
+               test_ea(Mesh("../../data/periodic-hexagon.mesh", 1, 1), order, dg, pb);
+               test_ea(Mesh("../../data/star-q3.mesh", 1, 1), order, dg, pb);
+            }
+         }
+         SECTION("3D: " + tag)
+         {
+            int order = 2;
+            test_ea(Mesh("../../data/periodic-cube.mesh", 1, 1), order, dg, pb);
+            test_ea(Mesh("../../data/fichera-q3.mesh", 1, 1), order, dg, pb);
+         }
+      }
+   }
+   // Test AMR cases (DG not implemented); mass only, so outside the pb/dg loops.
+   SECTION("AMR 2D")
+   {
+      for (int order : {2, 3, 4})
+      {
+         test_ea(Mesh("../../data/amr-quad.mesh", 1, 1), order, false, 0);
+      }
+   }
+   SECTION("AMR 3D")
+   {
+      int order = 2;
+      test_ea(Mesh("../../data/fichera-amr.mesh", 1, 1), order, false, 0);
+   }
+}//test case
+
+}// namespace ea_kernels
diff --git a/tests/unit/fem/test_face_elem_trans.cpp b/tests/unit/fem/test_face_elem_trans.cpp
new file mode 100644
index 00000000000..70fc8bed564
--- /dev/null
+++ b/tests/unit/fem/test_face_elem_trans.cpp
@@ -0,0 +1,95 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#include "mfem.hpp"
+#include "catch.hpp"
+
+using namespace mfem;
+
+namespace face_elem_trans
+{
+
+TEST_CASE("3D FaceElementTransformations",
+          "[FaceElementTransformations]")
+{
+   // At each quadrature point of every interior face, the physical point from
+   // the face transformation must match the one reached through Loc1/Elem1
+   // and, when present, Loc2/Elem2.
+   const int verbosity = 0;
+   const int n = 1;
+   const int order = 1;
+
+   Mesh mesh(n, n, n, Element::TETRAHEDRON, 1, 2.0, 3.0, 5.0);
+
+   SECTION("Transform")
+   {
+      // Scratch storage for the physical points (face, element 1, element 2).
+      double face_xyz[3], elem1_xyz[3], elem2_xyz[3];
+      Vector face_pt(face_xyz, 3);
+      Vector elem1_pt(elem1_xyz, 3);
+      Vector elem2_pt(elem2_xyz, 3);
+
+      int npts = 0;
+      const int num_faces = mesh.GetNFaces();
+      for (int f = 0; f < num_faces; ++f)
+      {
+         if (verbosity > 0)
+         {
+            std::cout << "Getting trans for face " << f << std::endl;
+         }
+         FaceElementTransformations *T =
+            mesh.GetInteriorFaceTransformations(f);
+
+         if (T != NULL)
+         {
+            if (verbosity > 0)
+            {
+               std::cout << f << " " << T->Elem1No
+                         << " " << T->Elem2No << std::endl;
+            }
+
+            const IntegrationRule &ir = IntRules.Get(T->GetGeometryType(),
+                                                     2*order + 2);
+            const int nq = ir.GetNPoints();
+            for (int q = 0; q < nq; ++q)
+            {
+               npts++;
+               const IntegrationPoint &ip = ir.IntPoint(q);
+               IntegrationPoint eip1, eip2;
+
+               T->SetIntPoint(&ip);
+               T->Transform(ip, face_pt);
+
+               T->Loc1.Transform(ip, eip1);
+               T->Elem1->Transform(eip1, elem1_pt);
+               elem1_pt -= face_pt;
+
+               REQUIRE(elem1_pt.Norml2() == Approx(0.0));
+
+               if (!T->Elem2) { continue; }
+
+               T->Loc2.Transform(ip, eip2);
+               T->Elem2->Transform(eip2, elem2_pt);
+               elem2_pt -= face_pt;
+
+               REQUIRE(elem2_pt.Norml2() == Approx(0.0));
+            }
+         }
+         if (verbosity > 0)
+         {
+            std::cout << "Checked " << npts << " points within face "
+                      << f << std::endl;
+         }
+      }
+   }
+}
+
+} // namespace face_elem_trans
diff --git a/tests/unit/fem/test_get_value.cpp b/tests/unit/fem/test_get_value.cpp
new file mode 100644
index 00000000000..af225bb6750
--- /dev/null
+++ b/tests/unit/fem/test_get_value.cpp
@@ -0,0 +1,1832 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#include "mfem.hpp"
+#include "catch.hpp"
+
+using namespace mfem;
+
+namespace get_value
+{
+
+// Linear scalar field of one coordinate:
+//
+//    f(x) = x_0
+double func_1D_lin(const Vector &x) { return x(0); }
+
+// Linear scalar field of two coordinates:
+//
+//    f(x) = x_0 + 2 x_1
+double func_2D_lin(const Vector &x) { return x(0) + 2.0 * x(1); }
+
+// Linear scalar field of three coordinates:
+//
+//    f(x) = x_0 + 2 x_1 + 3 x_2
+double func_3D_lin(const Vector &x) { return x(0) + 2.0 * x(1) + 3.0 * x(2); }
+
+void Func_2D_lin(const Vector &x, Vector &v) // v = A x, A a fixed 2x2 matrix
+{
+   v.SetSize(2);
+   v(0) =  1.234 * x(0) - 2.357 * x(1);
+   v(1) =  2.537 * x(0) + 4.321 * x(1);
+}
+
+void Func_3D_lin(const Vector &x, Vector &v) // v = A x, A a fixed 3x3 matrix
+{
+   v.SetSize(3);
+   v(0) =  1.234 * x(0) - 2.357 * x(1) + 3.572 * x(2);
+   v(1) =  2.537 * x(0) + 4.321 * x(1) - 1.234 * x(2);
+   v(2) = -2.572 * x(0) + 1.321 * x(1) + 3.234 * x(2);
+}
+
+TEST_CASE("1D GetValue", // compares coefficient evaluation with func_1D_lin
+          "[GridFunction]"
+          "[GridFunctionCoefficient]")
+{
+   int log = 1; // > 0: print each point whose error exceeds tol
+   int n = 1;
+   int dim = 1;
+   int order = 1;
+   int npts = 0; // number of evaluated points, printed at the end
+
+   double tol = 1e-6; // used only for the log output, not by the REQUIREs
+
+   for (int type = (int)Element::SEGMENT;
+        type <= (int)Element::SEGMENT; type++) // bounds give a single pass
+   {
+      Mesh mesh(n, 2.0); // presumably (number of elements, length) - confirm
+
+      FunctionCoefficient linCoef(func_1D_lin); // field projected below
+
+      SECTION("1D GetValue tests for element type " + std::to_string(type))
+      {
+         H1_FECollection h1_fec(order, dim);
+         DG_FECollection dgv_fec(order, dim, BasisType::GaussLegendre,
+                                 FiniteElement::VALUE); // DG, VALUE map type
+         DG_FECollection dgi_fec(order, dim, BasisType::GaussLegendre,
+                                 FiniteElement::INTEGRAL); // DG, INTEGRAL map type
+
+         FiniteElementSpace h1_fespace(&mesh, &h1_fec);
+         FiniteElementSpace dgv_fespace(&mesh, &dgv_fec);
+         FiniteElementSpace dgi_fespace(&mesh, &dgi_fec);
+
+         GridFunction h1_x(&h1_fespace);
+         GridFunction dgv_x(&dgv_fespace);
+         GridFunction dgi_x(&dgi_fespace);
+
+         GridFunctionCoefficient h1_xCoef(&h1_x); // coefficients under test
+         GridFunctionCoefficient dgv_xCoef(&dgv_x);
+         GridFunctionCoefficient dgi_xCoef(&dgi_x);
+
+         h1_x.ProjectCoefficient(linCoef);
+         dgv_x.ProjectCoefficient(linCoef);
+         dgi_x.ProjectCoefficient(linCoef);
+
+         SECTION("Domain Evaluation 1D") // loops over mesh elements
+         {
+            std::cout << "Domain Evaluation 1D" << std::endl;
+            for (int e = 0; e < mesh.GetNE(); e++)
+            {
+               ElementTransformation *T = mesh.GetElementTransformation(e);
+               const FiniteElement   *fe = h1_fespace.GetFE(e);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double h1_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+
+               double tip_data[1]; // storage wrapped by tip
+               Vector tip(tip_data, 1);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip); // tip: point where func_1D_lin is evaluated
+
+                  double f_val = func_1D_lin(tip);
+
+                  double h1_gf_val = h1_xCoef.Eval(*T, ip);
+                  double dgv_gf_val = dgv_xCoef.Eval(*T, ip);
+                  double dgi_gf_val = dgi_xCoef.Eval(*T, ip);
+
+                  h1_err += fabs(f_val - h1_gf_val); // accumulate absolute errors
+                  dgv_err += fabs(f_val - dgv_gf_val);
+                  dgi_err += fabs(f_val - dgi_gf_val);
+
+                  if (log > 0 && fabs(f_val - h1_gf_val) > tol)
+                  {
+                     std::cout << e << ":" << j << " h1  " << f_val << " "
+                               << h1_gf_val << " " << fabs(f_val - h1_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgv_gf_val) > tol)
+                  {
+                     std::cout << e << ":" << j << " dgv " << f_val << " "
+                               << dgv_gf_val << " " << fabs(f_val - dgv_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgi_gf_val) > tol)
+                  {
+                     std::cout << e << ":" << j << " dgi " << f_val << " "
+                               << dgi_gf_val << " " << fabs(f_val - dgi_gf_val)
+                               << std::endl;
+                  }
+               }
+               h1_err /= ir.GetNPoints(); // mean absolute error on this element
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE(h1_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+
+         SECTION("Boundary Evaluation 1D (H1 Context)") // boundary element transformations
+         {
+            std::cout << "Boundary Evaluation 1D (H1 Context)" << std::endl;
+            for (int be = 0; be < mesh.GetNBE(); be++)
+            {
+               ElementTransformation *T = mesh.GetBdrElementTransformation(be);
+               const FiniteElement   *fe = h1_fespace.GetBE(be);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double h1_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+
+               double tip_data[1]; // storage wrapped by tip
+               Vector tip(tip_data, 1);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip); // tip: point where func_1D_lin is evaluated
+
+                  double f_val = func_1D_lin(tip);
+                  double h1_gf_val = h1_xCoef.Eval(*T, ip);
+                  double dgv_gf_val = dgv_xCoef.Eval(*T, ip);
+                  double dgi_gf_val = dgi_xCoef.Eval(*T, ip);
+
+                  h1_err += fabs(f_val - h1_gf_val); // accumulate absolute errors
+                  dgv_err += fabs(f_val - dgv_gf_val);
+                  dgi_err += fabs(f_val - dgi_gf_val);
+
+                  if (log > 0 && fabs(f_val - h1_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " h1  " << f_val << " "
+                               << h1_gf_val << " " << fabs(f_val - h1_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgv_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " dgv " << f_val << " "
+                               << dgv_gf_val << " " << fabs(f_val - dgv_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgi_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " dgi " << f_val << " "
+                               << dgi_gf_val << " " << fabs(f_val - dgi_gf_val)
+                               << std::endl;
+                  }
+               }
+               h1_err /= ir.GetNPoints(); // mean absolute error on this boundary element
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE(h1_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+
+         SECTION("Boundary Evaluation 1D (DG Context)") // boundary face transformations
+         {
+            std::cout << "Boundary Evaluation 1D (DG Context)" << std::endl;
+            for (int be = 0; be < mesh.GetNBE(); be++)
+            {
+               FaceElementTransformations *T =
+                  mesh.GetBdrFaceTransformations(be);
+               const IntegrationRule &ir = IntRules.Get(T->GetGeometryType(),
+                                                        2*order + 2);
+
+               double h1_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+
+               double tip_data[1]; // storage wrapped by tip
+               Vector tip(tip_data, 1);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip); // tip: point where func_1D_lin is evaluated
+
+                  double f_val = func_1D_lin(tip);
+
+                  double h1_gf_val = h1_xCoef.Eval(*T, ip);
+                  double dgv_gf_val = dgv_xCoef.Eval(*T, ip);
+                  double dgi_gf_val = dgi_xCoef.Eval(*T, ip);
+
+                  h1_err += fabs(f_val - h1_gf_val); // accumulate absolute errors
+                  dgv_err += fabs(f_val - dgv_gf_val);
+                  dgi_err += fabs(f_val - dgi_gf_val);
+
+                  if (log > 0 && fabs(f_val - h1_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " h1  " << f_val << " "
+                               << h1_gf_val << " " << fabs(f_val - h1_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgv_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " dgv " << f_val << " "
+                               << dgv_gf_val << " " << fabs(f_val - dgv_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgi_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " dgi " << f_val << " "
+                               << dgi_gf_val << " " << fabs(f_val - dgi_gf_val)
+                               << std::endl;
+                  }
+               }
+               h1_err /= ir.GetNPoints(); // mean absolute error on this boundary face
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE(h1_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+      }
+   }
+   std::cout << "Checked GridFunction::GetValue at "
+             << npts << " 1D points" << std::endl;
+}
+
+TEST_CASE("2D GetValue",
+          "[GridFunction]"
+          "[GridFunctionCoefficient]")
+{
+   int log = 1;
+   int n = 1;
+   int dim = 2;
+   int order = 1;
+   int npts = 0;
+
+   double tol = 1e-6;
+
+   for (int type = (int)Element::TRIANGLE;
+        type <= (int)Element::QUADRILATERAL; type++)
+   {
+      Mesh mesh(n, n, (Element::Type)type, 1, 2.0, 3.0);
+
+      FunctionCoefficient linCoef(func_2D_lin);
+
+      SECTION("2D GetValue tests for element type " + std::to_string(type))
+      {
+         H1_FECollection h1_fec(order, dim);
+         DG_FECollection dgv_fec(order, dim, BasisType::GaussLegendre,
+                                 FiniteElement::VALUE);
+         DG_FECollection dgi_fec(order, dim, BasisType::GaussLegendre,
+                                 FiniteElement::INTEGRAL);
+
+         FiniteElementSpace h1_fespace(&mesh, &h1_fec);
+         FiniteElementSpace dgv_fespace(&mesh, &dgv_fec);
+         FiniteElementSpace dgi_fespace(&mesh, &dgi_fec);
+
+         GridFunction h1_x(&h1_fespace);
+         GridFunction dgv_x(&dgv_fespace);
+         GridFunction dgi_x(&dgi_fespace);
+
+         GridFunctionCoefficient h1_xCoef(&h1_x);
+         GridFunctionCoefficient dgv_xCoef(&dgv_x);
+         GridFunctionCoefficient dgi_xCoef(&dgi_x);
+
+         h1_x.ProjectCoefficient(linCoef);
+         dgv_x.ProjectCoefficient(linCoef);
+         dgi_x.ProjectCoefficient(linCoef);
+
+         SECTION("Domain Evaluation 2D")
+         {
+            std::cout << "Domain Evaluation 2D" << std::endl;
+            for (int e = 0; e < mesh.GetNE(); e++)
+            {
+               ElementTransformation *T = mesh.GetElementTransformation(e);
+               const FiniteElement   *fe = h1_fespace.GetFE(e);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double h1_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  double f_val = func_2D_lin(tip);
+
+                  double h1_gf_val = h1_xCoef.Eval(*T, ip);
+                  double dgv_gf_val = dgv_xCoef.Eval(*T, ip);
+                  double dgi_gf_val = dgi_xCoef.Eval(*T, ip);
+
+                  h1_err += fabs(f_val - h1_gf_val);
+                  dgv_err += fabs(f_val - dgv_gf_val);
+                  dgi_err += fabs(f_val - dgi_gf_val);
+
+                  if (log > 0 && fabs(f_val - h1_gf_val) > tol)
+                  {
+                     std::cout << e << ":" << j << " h1  " << f_val << " "
+                               << h1_gf_val << " " << fabs(f_val - h1_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgv_gf_val) > tol)
+                  {
+                     std::cout << e << ":" << j << " dgv " << f_val << " "
+                               << dgv_gf_val << " " << fabs(f_val - dgv_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgi_gf_val) > tol)
+                  {
+                     std::cout << e << ":" << j << " dgi " << f_val << " "
+                               << dgi_gf_val << " " << fabs(f_val - dgi_gf_val)
+                               << std::endl;
+                  }
+               }
+               h1_err /= ir.GetNPoints();
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE(h1_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+
+         SECTION("Boundary Evaluation 2D (H1 Context)")
+         {
+            std::cout << "Boundary Evaluation 2D (H1 Context)" << std::endl;
+            for (int be = 0; be < mesh.GetNBE(); be++)
+            {
+               ElementTransformation *T = mesh.GetBdrElementTransformation(be);
+               const FiniteElement   *fe = h1_fespace.GetBE(be);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double h1_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  double f_val = func_2D_lin(tip);
+                  double h1_gf_val = h1_xCoef.Eval(*T, ip);
+                  double dgv_gf_val = dgv_xCoef.Eval(*T, ip);
+                  double dgi_gf_val = dgi_xCoef.Eval(*T, ip);
+
+                  h1_err += fabs(f_val - h1_gf_val);
+                  dgv_err += fabs(f_val - dgv_gf_val);
+                  dgi_err += fabs(f_val - dgi_gf_val);
+
+                  if (log > 0 && fabs(f_val - h1_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " h1  " << f_val << " "
+                               << h1_gf_val << " " << fabs(f_val - h1_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgv_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " dgv " << f_val << " "
+                               << dgv_gf_val << " " << fabs(f_val - dgv_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgi_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " dgi " << f_val << " "
+                               << dgi_gf_val << " " << fabs(f_val - dgi_gf_val)
+                               << std::endl;
+                  }
+               }
+               h1_err /= ir.GetNPoints();
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE(h1_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+
+         SECTION("Boundary Evaluation 2D (DG Context)")
+         {
+            std::cout << "Boundary Evaluation 2D (DG Context)" << std::endl;
+            for (int be = 0; be < mesh.GetNBE(); be++)
+            {
+               FaceElementTransformations *T =
+                  mesh.GetBdrFaceTransformations(be);
+               const IntegrationRule &ir = IntRules.Get(T->GetGeometryType(),
+                                                        2*order + 2);
+
+               double h1_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  double f_val = func_2D_lin(tip);
+
+                  double h1_gf_val = h1_xCoef.Eval(*T, ip);
+                  double dgv_gf_val = dgv_xCoef.Eval(*T, ip);
+                  double dgi_gf_val = dgi_xCoef.Eval(*T, ip);
+
+                  h1_err += fabs(f_val - h1_gf_val);
+                  dgv_err += fabs(f_val - dgv_gf_val);
+                  dgi_err += fabs(f_val - dgi_gf_val);
+
+                  if (log > 0 && fabs(f_val - h1_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " h1  " << f_val << " "
+                               << h1_gf_val << " " << fabs(f_val - h1_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgv_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " dgv " << f_val << " "
+                               << dgv_gf_val << " " << fabs(f_val - dgv_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgi_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " dgi " << f_val << " "
+                               << dgi_gf_val << " " << fabs(f_val - dgi_gf_val)
+                               << std::endl;
+                  }
+               }
+               h1_err /= ir.GetNPoints();
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE(h1_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+
+         SECTION("Edge Evaluation 2D (H1 Context)")
+         {
+            std::cout << "Edge Evaluation 2D (H1 Context)" << std::endl;
+            for (int e = 0; e < mesh.GetNEdges(); e++)
+            {
+               ElementTransformation *T = mesh.GetEdgeTransformation(e);
+               const FiniteElement   *fe = h1_fespace.GetEdgeElement(e);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double h1_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  double f_val = func_3D_lin(tip);
+                  double h1_gf_val = h1_xCoef.Eval(*T, ip);
+
+                  h1_err += fabs(f_val - h1_gf_val);
+
+                  if (log > 0 && fabs(f_val - h1_gf_val) > tol)
+                  {
+                     std::cout << e << ":" << j << " h1  " << f_val << " "
+                               << h1_gf_val << " " << fabs(f_val - h1_gf_val)
+                               << std::endl;
+                  }
+               }
+               h1_err /= ir.GetNPoints();
+
+               REQUIRE(h1_err == Approx(0.0));
+            }
+         }
+      }
+   }
+   std::cout << "Checked GridFunction::GetValue at "
+             << npts << " 2D points" << std::endl;
+}
+
+TEST_CASE("3D GetValue",
+          "[GridFunction]"
+          "[GridFunctionCoefficient]")
+{
+   // Check GridFunctionCoefficient::Eval against func_3D_lin on 3D meshes.
+   const int log = 1;   // > 0: print each point whose error exceeds tol
+   const int n = 1;
+   const int dim = 3;   // const: tip_data[dim] below is then not a VLA
+   const int order = 1;
+   int npts = 0;        // points evaluated in this run, reported at the end
+   const double tol = 1e-6;
+
+   for (int type = (int)Element::TETRAHEDRON;
+        type <= (int)Element::WEDGE; type++)
+   {
+      Mesh mesh(n, n, n, (Element::Type)type, 1, 2.0, 3.0, 5.0);
+
+      FunctionCoefficient linCoef(func_3D_lin);
+
+      SECTION("3D GetValue tests for element type " + std::to_string(type))
+      {
+         H1_FECollection h1_fec(order, dim);
+         DG_FECollection dgv_fec(order, dim, BasisType::GaussLegendre,
+                                 FiniteElement::VALUE);
+         DG_FECollection dgi_fec(order, dim, BasisType::GaussLegendre,
+                                 FiniteElement::INTEGRAL);
+
+         FiniteElementSpace h1_fespace(&mesh, &h1_fec);
+         FiniteElementSpace dgv_fespace(&mesh, &dgv_fec);
+         FiniteElementSpace dgi_fespace(&mesh, &dgi_fec);
+
+         GridFunction h1_x(&h1_fespace);
+         GridFunction dgv_x(&dgv_fespace);
+         GridFunction dgi_x(&dgi_fespace);
+
+         GridFunctionCoefficient h1_xCoef(&h1_x);
+         GridFunctionCoefficient dgv_xCoef(&dgv_x);
+         GridFunctionCoefficient dgi_xCoef(&dgi_x);
+
+         h1_x.ProjectCoefficient(linCoef);
+         dgv_x.ProjectCoefficient(linCoef);
+         dgi_x.ProjectCoefficient(linCoef);
+
+         SECTION("Domain Evaluation 3D")
+         {
+            std::cout << "Domain Evaluation 3D" << std::endl;
+            for (int e = 0; e < mesh.GetNE(); e++)
+            {
+               ElementTransformation *T = mesh.GetElementTransformation(e);
+               const FiniteElement   *fe = h1_fespace.GetFE(e);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double h1_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  double f_val = func_3D_lin(tip);
+
+                  double h1_gf_val = h1_xCoef.Eval(*T, ip);
+                  double dgv_gf_val = dgv_xCoef.Eval(*T, ip);
+                  double dgi_gf_val = dgi_xCoef.Eval(*T, ip);
+
+                  h1_err += fabs(f_val - h1_gf_val);
+                  dgv_err += fabs(f_val - dgv_gf_val);
+                  dgi_err += fabs(f_val - dgi_gf_val);
+
+                  if (log > 0 && fabs(f_val - h1_gf_val) > tol)
+                  {
+                     std::cout << e << ":" << j << " h1  " << f_val << " "
+                               << h1_gf_val << " " << fabs(f_val - h1_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgv_gf_val) > tol)
+                  {
+                     std::cout << e << ":" << j << " dgv " << f_val << " "
+                               << dgv_gf_val << " " << fabs(f_val - dgv_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgi_gf_val) > tol)
+                  {
+                     std::cout << e << ":" << j << " dgi " << f_val << " "
+                               << dgi_gf_val << " " << fabs(f_val - dgi_gf_val)
+                               << std::endl;
+                  }
+               }
+               h1_err /= ir.GetNPoints();  // mean absolute error per point
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE(h1_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+
+         SECTION("Boundary Evaluation 3D (H1 Context)")
+         {
+            std::cout << "Boundary Evaluation 3D (H1 Context)" << std::endl;
+            for (int be = 0; be < mesh.GetNBE(); be++)
+            {
+               ElementTransformation *T = mesh.GetBdrElementTransformation(be);
+               const FiniteElement   *fe = h1_fespace.GetBE(be);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double h1_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  double f_val = func_3D_lin(tip);
+                  double h1_gf_val = h1_xCoef.Eval(*T, ip);
+                  double dgv_gf_val = dgv_xCoef.Eval(*T, ip);
+                  double dgi_gf_val = dgi_xCoef.Eval(*T, ip);
+
+                  h1_err += fabs(f_val - h1_gf_val);
+                  dgv_err += fabs(f_val - dgv_gf_val);
+                  dgi_err += fabs(f_val - dgi_gf_val);
+
+                  if (log > 0 && fabs(f_val - h1_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " h1  " << f_val << " "
+                               << h1_gf_val << " " << fabs(f_val - h1_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgv_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " dgv " << f_val << " "
+                               << dgv_gf_val << " " << fabs(f_val - dgv_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgi_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " dgi " << f_val << " "
+                               << dgi_gf_val << " " << fabs(f_val - dgi_gf_val)
+                               << std::endl;
+                  }
+               }
+               h1_err /= ir.GetNPoints();
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE(h1_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+
+         SECTION("Boundary Evaluation 3D (DG Context)")
+         {
+            std::cout << "Boundary Evaluation 3D (DG Context)" << std::endl;
+            for (int be = 0; be < mesh.GetNBE(); be++)
+            {
+               FaceElementTransformations *T =
+                  mesh.GetBdrFaceTransformations(be);
+               const IntegrationRule &ir = IntRules.Get(T->GetGeometryType(),
+                                                        2*order + 2);
+
+               double h1_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  double f_val = func_3D_lin(tip);
+
+                  double h1_gf_val = h1_xCoef.Eval(*T, ip);
+                  double dgv_gf_val = dgv_xCoef.Eval(*T, ip);
+                  double dgi_gf_val = dgi_xCoef.Eval(*T, ip);
+
+                  h1_err += fabs(f_val - h1_gf_val);
+                  dgv_err += fabs(f_val - dgv_gf_val);
+                  dgi_err += fabs(f_val - dgi_gf_val);
+
+                  if (log > 0 && fabs(f_val - h1_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " h1  " << f_val << " "
+                               << h1_gf_val << " " << fabs(f_val - h1_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgv_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " dgv " << f_val << " "
+                               << dgv_gf_val << " " << fabs(f_val - dgv_gf_val)
+                               << std::endl;
+                  }
+                  if (log > 0 && fabs(f_val - dgi_gf_val) > tol)
+                  {
+                     std::cout << be << ":" << j << " dgi " << f_val << " "
+                               << dgi_gf_val << " " << fabs(f_val - dgi_gf_val)
+                               << std::endl;
+                  }
+               }
+               h1_err /= ir.GetNPoints();
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE(h1_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+
+         SECTION("Edge Evaluation 3D (H1 Context)")
+         {
+            std::cout << "Edge Evaluation 3D (H1 Context)" << std::endl;
+            for (int e = 0; e < mesh.GetNEdges(); e++)
+            {
+               ElementTransformation *T = mesh.GetEdgeTransformation(e);
+               const FiniteElement   *fe = h1_fespace.GetEdgeElement(e);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double h1_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  double f_val = func_3D_lin(tip);
+                  double h1_gf_val = h1_xCoef.Eval(*T, ip);
+
+                  h1_err += fabs(f_val - h1_gf_val);
+
+                  if (log > 0 && fabs(f_val - h1_gf_val) > tol)
+                  {
+                     std::cout << e << ":" << j << " h1  " << f_val << " "
+                               << h1_gf_val << " " << fabs(f_val - h1_gf_val)
+                               << std::endl;
+                  }
+               }
+               h1_err /= ir.GetNPoints();
+
+               REQUIRE(h1_err == Approx(0.0));
+            }
+         }
+
+         SECTION("Face Evaluation 3D (H1 Context)")
+         {
+            std::cout << "Face Evaluation 3D (H1 Context)" << std::endl;
+            for (int f = 0; f < mesh.GetNFaces(); f++)
+            {
+               ElementTransformation *T = mesh.GetFaceTransformation(f);
+               const FiniteElement   *fe = h1_fespace.GetFaceElement(f);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double h1_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  double f_val = func_3D_lin(tip);
+                  double h1_gf_val = h1_xCoef.Eval(*T, ip);
+
+                  h1_err += fabs(f_val - h1_gf_val);
+
+                  if (log > 0 && fabs(f_val - h1_gf_val) > tol)
+                  {
+                     std::cout << f << ":" << j << " h1  " << f_val << " "
+                               << h1_gf_val << " " << fabs(f_val - h1_gf_val)
+                               << std::endl;
+                  }
+               }
+               h1_err /= ir.GetNPoints();
+
+               REQUIRE(h1_err == Approx(0.0));
+            }
+         }
+      }
+   }
+   std::cout << "Checked GridFunction::GetValue at "
+             << npts << " 3D points" << std::endl;
+}
+
+TEST_CASE("2D GetVectorValue",
+          "[GridFunction]"
+          "[VectorGridFunctionCoefficient]")
+{
+   int log = 1;
+   int n = 1;
+   int dim = 2;
+   int order = 1;
+   int npts = 0;
+
+   double tol = 1e-6;
+
+   for (int type = (int)Element::TRIANGLE;
+        type <= (int)Element::QUADRILATERAL; type++)
+   {
+      Mesh mesh(n, n, (Element::Type)type, 1, 2.0, 3.0);
+
+      VectorFunctionCoefficient linCoef(dim, Func_2D_lin);
+
+      SECTION("2D GetVectorValue tests for element type " +
+              std::to_string(type))
+      {
+         H1_FECollection  h1_fec(order, dim);
+         ND_FECollection  nd_fec(order+1, dim);
+         RT_FECollection  rt_fec(order+1, dim);
+         L2_FECollection  l2_fec(order, dim);
+         DG_FECollection dgv_fec(order, dim, BasisType::GaussLegendre,
+                                 FiniteElement::VALUE);
+         DG_FECollection dgi_fec(order, dim, BasisType::GaussLegendre,
+                                 FiniteElement::INTEGRAL);
+
+         FiniteElementSpace  h1_fespace(&mesh,  &h1_fec, dim);
+         FiniteElementSpace  nd_fespace(&mesh,  &nd_fec);
+         FiniteElementSpace  rt_fespace(&mesh,  &rt_fec);
+         FiniteElementSpace  l2_fespace(&mesh,  &l2_fec, dim);
+         FiniteElementSpace dgv_fespace(&mesh, &dgv_fec, dim);
+         FiniteElementSpace dgi_fespace(&mesh, &dgi_fec, dim);
+
+         GridFunction  h1_x( &h1_fespace);
+         GridFunction  nd_x( &nd_fespace);
+         GridFunction  rt_x( &rt_fespace);
+         GridFunction  l2_x( &l2_fespace);
+         GridFunction dgv_x(&dgv_fespace);
+         GridFunction dgi_x(&dgi_fespace);
+
+         VectorGridFunctionCoefficient  h1_xCoef( &h1_x);
+         VectorGridFunctionCoefficient  nd_xCoef( &nd_x);
+         VectorGridFunctionCoefficient  rt_xCoef( &rt_x);
+         VectorGridFunctionCoefficient  l2_xCoef( &l2_x);
+         VectorGridFunctionCoefficient dgv_xCoef(&dgv_x);
+         VectorGridFunctionCoefficient dgi_xCoef(&dgi_x);
+
+         h1_x.ProjectCoefficient(linCoef);
+         nd_x.ProjectCoefficient(linCoef);
+         rt_x.ProjectCoefficient(linCoef);
+         l2_x.ProjectCoefficient(linCoef);
+         dgv_x.ProjectCoefficient(linCoef);
+         dgi_x.ProjectCoefficient(linCoef);
+
+         Vector      f_val(dim);      f_val = 0.0;
+         Vector  h1_gf_val(dim);  h1_gf_val = 0.0;
+         Vector  nd_gf_val(dim);  nd_gf_val = 0.0;
+         Vector  rt_gf_val(dim);  rt_gf_val = 0.0;
+         Vector  l2_gf_val(dim);  l2_gf_val = 0.0;
+         Vector dgv_gf_val(dim); dgv_gf_val = 0.0;
+         Vector dgi_gf_val(dim); dgi_gf_val = 0.0;
+
+         SECTION("Domain Evaluation 2D")
+         {
+            std::cout << "Domain Evaluation 2D" << std::endl;
+            for (int e = 0; e < mesh.GetNE(); e++)
+            {
+               ElementTransformation *T = mesh.GetElementTransformation(e);
+               const FiniteElement   *fe = h1_fespace.GetFE(e);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double  h1_err = 0.0;
+               double  nd_err = 0.0;
+               double  rt_err = 0.0;
+               double  l2_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  Func_2D_lin(tip, f_val);
+
+                  h1_xCoef.Eval(h1_gf_val, *T, ip);
+                  nd_xCoef.Eval(nd_gf_val, *T, ip);
+                  rt_xCoef.Eval(rt_gf_val, *T, ip);
+                  l2_xCoef.Eval(l2_gf_val, *T, ip);
+                  dgv_xCoef.Eval(dgv_gf_val, *T, ip);
+                  dgi_xCoef.Eval(dgi_gf_val, *T, ip);
+
+                  double  h1_dist = Distance(f_val,  h1_gf_val, 2);
+                  double  nd_dist = Distance(f_val,  nd_gf_val, 2);
+                  double  rt_dist = Distance(f_val,  rt_gf_val, 2);
+                  double  l2_dist = Distance(f_val,  l2_gf_val, 2);
+                  double dgv_dist = Distance(f_val, dgv_gf_val, 2);
+                  double dgi_dist = Distance(f_val, dgi_gf_val, 2);
+
+                  h1_err  +=  h1_dist;
+                  nd_err  +=  nd_dist;
+                  rt_err  +=  rt_dist;
+                  l2_err  +=  l2_dist;
+                  dgv_err += dgv_dist;
+                  dgi_err += dgi_dist;
+
+                  if (log > 0 && h1_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " h1  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << h1_gf_val[0] << "," << h1_gf_val[1] << ") "
+                               << h1_dist << std::endl;
+                  }
+                  if (log > 0 && nd_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " nd  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << nd_gf_val[0] << "," << nd_gf_val[1] << ") "
+                               << nd_dist << std::endl;
+                  }
+                  if (log > 0 && rt_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " rt  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << rt_gf_val[0] << "," << rt_gf_val[1] << ") "
+                               << rt_dist << std::endl;
+                  }
+                  if (log > 0 && l2_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " l2  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << l2_gf_val[0] << "," << l2_gf_val[1] << ") "
+                               << l2_dist << std::endl;
+                  }
+                  if (log > 0 && dgv_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " dgv ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << dgv_gf_val[0] << "," << dgv_gf_val[1] << ") "
+                               << dgv_dist << std::endl;
+                  }
+                  if (log > 0 && dgi_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " dgi ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << dgi_gf_val[0] << "," << dgi_gf_val[1] << ") "
+                               << dgi_dist << std::endl;
+                  }
+               }
+               h1_err  /= ir.GetNPoints();
+               nd_err  /= ir.GetNPoints();
+               rt_err  /= ir.GetNPoints();
+               l2_err  /= ir.GetNPoints();
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE( h1_err == Approx(0.0));
+               REQUIRE( nd_err == Approx(0.0));
+               REQUIRE( rt_err == Approx(0.0));
+               REQUIRE( l2_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+
+         SECTION("Boundary Evaluation 2D (H1 Context)")
+         {
+            std::cout << "Boundary Evaluation 2D (H1 Context)" << std::endl;
+            for (int be = 0; be < mesh.GetNBE(); be++)
+            {
+               ElementTransformation *T = mesh.GetBdrElementTransformation(be);
+               const FiniteElement   *fe = h1_fespace.GetBE(be);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double  h1_err = 0.0;
+               double  nd_err = 0.0;
+               double  rt_err = 0.0;
+               double  l2_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  Func_2D_lin(tip, f_val);
+
+                  h1_xCoef.Eval(h1_gf_val, *T, ip);
+                  nd_xCoef.Eval(nd_gf_val, *T, ip);
+                  rt_xCoef.Eval(rt_gf_val, *T, ip);
+                  l2_xCoef.Eval(l2_gf_val, *T, ip);
+                  dgv_xCoef.Eval(dgv_gf_val, *T, ip);
+                  dgi_xCoef.Eval(dgi_gf_val, *T, ip);
+
+                  double  h1_dist = Distance(f_val,  h1_gf_val, 2);
+                  double  nd_dist = Distance(f_val,  nd_gf_val, 2);
+                  double  rt_dist = Distance(f_val,  rt_gf_val, 2);
+                  double  l2_dist = Distance(f_val,  l2_gf_val, 2);
+                  double dgv_dist = Distance(f_val, dgv_gf_val, 2);
+                  double dgi_dist = Distance(f_val, dgi_gf_val, 2);
+
+                  h1_err  +=  h1_dist;
+                  nd_err  +=  nd_dist;
+                  rt_err  +=  rt_dist;
+                  l2_err  +=  l2_dist;
+                  dgv_err += dgv_dist;
+                  dgi_err += dgi_dist;
+
+                  if (log > 0 && h1_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " h1  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << h1_gf_val[0] << "," << h1_gf_val[1] << ") "
+                               << h1_dist << std::endl;
+                  }
+                  if (log > 0 && nd_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " nd  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << nd_gf_val[0] << "," << nd_gf_val[1] << ") "
+                               << nd_dist << std::endl;
+                  }
+                  if (log > 0 && rt_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " rt  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << rt_gf_val[0] << "," << rt_gf_val[1] << ") "
+                               << rt_dist << std::endl;
+                  }
+                  if (log > 0 && l2_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " l2  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << l2_gf_val[0] << "," << l2_gf_val[1] << ") "
+                               << l2_dist << std::endl;
+                  }
+                  if (log > 0 && dgv_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " dgv ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << dgv_gf_val[0] << "," << dgv_gf_val[1] << ") "
+                               << dgv_dist << std::endl;
+                  }
+                  if (log > 0 && dgi_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " dgi ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << dgi_gf_val[0] << "," << dgi_gf_val[1] << ") "
+                               << dgi_dist << std::endl;
+                  }
+               }
+               h1_err  /= ir.GetNPoints();
+               nd_err  /= ir.GetNPoints();
+               rt_err  /= ir.GetNPoints();
+               l2_err  /= ir.GetNPoints();
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE( h1_err == Approx(0.0));
+               REQUIRE( nd_err == Approx(0.0));
+               REQUIRE( rt_err == Approx(0.0));
+               REQUIRE( l2_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+
+         SECTION("Boundary Evaluation 2D (DG Context)")
+         {
+            std::cout << "Boundary Evaluation 2D (DG Context)" << std::endl;
+            for (int be = 0; be < mesh.GetNBE(); be++)
+            {
+               FaceElementTransformations *T =
+                  mesh.GetBdrFaceTransformations(be);
+               const IntegrationRule &ir = IntRules.Get(T->GetGeometryType(),
+                                                        2*order + 2);
+
+               double  h1_err = 0.0;
+               double  nd_err = 0.0;
+               double  rt_err = 0.0;
+               double  l2_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  Func_2D_lin(tip, f_val);
+
+                  h1_xCoef.Eval(h1_gf_val, *T, ip);
+                  nd_xCoef.Eval(nd_gf_val, *T, ip);
+                  rt_xCoef.Eval(rt_gf_val, *T, ip);
+                  l2_xCoef.Eval(l2_gf_val, *T, ip);
+                  dgv_xCoef.Eval(dgv_gf_val, *T, ip);
+                  dgi_xCoef.Eval(dgi_gf_val, *T, ip);
+
+                  double  h1_dist = Distance(f_val,  h1_gf_val, 2);
+                  double  nd_dist = Distance(f_val,  nd_gf_val, 2);
+                  double  rt_dist = Distance(f_val,  rt_gf_val, 2);
+                  double  l2_dist = Distance(f_val,  l2_gf_val, 2);
+                  double dgv_dist = Distance(f_val, dgv_gf_val, 2);
+                  double dgi_dist = Distance(f_val, dgi_gf_val, 2);
+
+                  h1_err  +=  h1_dist;
+                  nd_err  +=  nd_dist;
+                  rt_err  +=  rt_dist;
+                  l2_err  +=  l2_dist;
+                  dgv_err += dgv_dist;
+                  dgi_err += dgi_dist;
+
+                  if (log > 0 && h1_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " h1  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << h1_gf_val[0] << "," << h1_gf_val[1] << ") "
+                               << h1_dist << std::endl;
+                  }
+                  if (log > 0 && nd_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " nd  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << nd_gf_val[0] << "," << nd_gf_val[1] << ") "
+                               << nd_dist << std::endl;
+                  }
+                  if (log > 0 && rt_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " rt  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << rt_gf_val[0] << "," << rt_gf_val[1] << ") "
+                               << rt_dist << std::endl;
+                  }
+                  if (log > 0 && l2_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " l2  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << l2_gf_val[0] << "," << l2_gf_val[1] << ") "
+                               << l2_dist << std::endl;
+                  }
+                  if (log > 0 && dgv_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " dgv ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << dgv_gf_val[0] << "," << dgv_gf_val[1] << ") "
+                               << dgv_dist << std::endl;
+                  }
+                  if (log > 0 && dgi_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " dgi ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << dgi_gf_val[0] << "," << dgi_gf_val[1] << ") "
+                               << dgi_dist << std::endl;
+                  }
+               }
+               h1_err  /= ir.GetNPoints();
+               nd_err  /= ir.GetNPoints();
+               rt_err  /= ir.GetNPoints();
+               l2_err  /= ir.GetNPoints();
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE( h1_err == Approx(0.0));
+               REQUIRE( nd_err == Approx(0.0));
+               REQUIRE( rt_err == Approx(0.0));
+               REQUIRE( l2_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+
+         SECTION("Edge Evaluation 2D")
+         {
+            std::cout << "Edge Evaluation 2D" << std::endl;
+            for (int e = 0; e < mesh.GetNEdges(); e++)
+            {
+               ElementTransformation *T = mesh.GetEdgeTransformation(e);
+               const FiniteElement   *fe = h1_fespace.GetEdgeElement(e);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double  h1_err = 0.0;
+
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  Func_2D_lin(tip, f_val);
+
+                  h1_xCoef.Eval(h1_gf_val, *T, ip);
+
+                  double  h1_dist = Distance(f_val,  h1_gf_val, 2);
+
+                  h1_err  +=  h1_dist;
+
+                  if (log > 0 && h1_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " h1  ("
+                               << f_val[0] << "," << f_val[1] << ") vs. ("
+                               << h1_gf_val[0] << "," << h1_gf_val[1] << ") "
+                               << h1_dist << std::endl;
+                  }
+               }
+               h1_err  /= ir.GetNPoints();
+
+               REQUIRE( h1_err == Approx(0.0));
+            }
+         }
+      }
+   }
+   std::cout << "Checked GridFunction::GetVectorValue at "
+             << npts << " 2D points" << std::endl;
+}
+
+TEST_CASE("3D GetVectorValue",
+          "[GridFunction]"
+          "[VectorGridFunctionCoefficient]")
+{
+   int log = 1;         // > 0: print each point whose distance exceeds tol
+   int n = 1;           // element count passed to Mesh for each direction
+   const int dim = 3;   // const so it is a valid (non-VLA) array bound below
+   int order = 1;       // order for H1/L2/DG; ND and RT use order+1
+   int npts = 0;        // number of points evaluated, reported at the end
+
+   double tol = 1e-6;   // logging threshold only; REQUIRE uses Approx(0.0)
+
+   for (int type = (int)Element::TETRAHEDRON;
+        type <= (int)Element::HEXAHEDRON; type++)
+   {
+      Mesh mesh(n, n, n, (Element::Type)type, 1, 2.0, 3.0, 5.0);
+      // Wraps Func_3D_lin, the field projected onto every space below.
+      VectorFunctionCoefficient linCoef(dim, Func_3D_lin);
+
+      SECTION("3D GetVectorValue tests for element type " +
+              std::to_string(type))
+      {
+         H1_FECollection  h1_fec(order, dim);
+         ND_FECollection  nd_fec(order+1, dim);
+         RT_FECollection  rt_fec(order+1, dim);
+         L2_FECollection  l2_fec(order, dim);
+         DG_FECollection dgv_fec(order, dim, BasisType::GaussLegendre,
+                                 FiniteElement::VALUE);
+         DG_FECollection dgi_fec(order, dim, BasisType::GaussLegendre,
+                                 FiniteElement::INTEGRAL);
+         // H1/L2/DG pass dim as a third argument (presumably vdim); ND/RT do not.
+         FiniteElementSpace  h1_fespace(&mesh,  &h1_fec, dim);
+         FiniteElementSpace  nd_fespace(&mesh,  &nd_fec);
+         FiniteElementSpace  rt_fespace(&mesh,  &rt_fec);
+         FiniteElementSpace  l2_fespace(&mesh,  &l2_fec, dim);
+         FiniteElementSpace dgv_fespace(&mesh, &dgv_fec, dim);
+         FiniteElementSpace dgi_fespace(&mesh, &dgi_fec, dim);
+
+         GridFunction  h1_x( &h1_fespace);
+         GridFunction  nd_x( &nd_fespace);
+         GridFunction  rt_x( &rt_fespace);
+         GridFunction  l2_x( &l2_fespace);
+         GridFunction dgv_x(&dgv_fespace);
+         GridFunction dgi_x(&dgi_fespace);
+
+         VectorGridFunctionCoefficient  h1_xCoef( &h1_x);
+         VectorGridFunctionCoefficient  nd_xCoef( &nd_x);
+         VectorGridFunctionCoefficient  rt_xCoef( &rt_x);
+         VectorGridFunctionCoefficient  l2_xCoef( &l2_x);
+         VectorGridFunctionCoefficient dgv_xCoef(&dgv_x);
+         VectorGridFunctionCoefficient dgi_xCoef(&dgi_x);
+         // Project the linear field onto each of the six spaces.
+         h1_x.ProjectCoefficient(linCoef);
+         nd_x.ProjectCoefficient(linCoef);
+         rt_x.ProjectCoefficient(linCoef);
+         l2_x.ProjectCoefficient(linCoef);
+         dgv_x.ProjectCoefficient(linCoef);
+         dgi_x.ProjectCoefficient(linCoef);
+         // Work vectors: exact value and the value returned by each coefficient.
+         Vector      f_val(dim);      f_val = 0.0;
+         Vector  h1_gf_val(dim);  h1_gf_val = 0.0;
+         Vector  nd_gf_val(dim);  nd_gf_val = 0.0;
+         Vector  rt_gf_val(dim);  rt_gf_val = 0.0;
+         Vector  l2_gf_val(dim);  l2_gf_val = 0.0;
+         Vector dgv_gf_val(dim); dgv_gf_val = 0.0;
+         Vector dgi_gf_val(dim); dgi_gf_val = 0.0;
+         // Compare all six fields at the integration points of each element.
+         SECTION("Domain Evaluation 3D")
+         {
+            std::cout << "Domain Evaluation 3D" << std::endl;
+            for (int e = 0; e < mesh.GetNE(); e++)
+            {
+               ElementTransformation *T = mesh.GetElementTransformation(e);
+               const FiniteElement   *fe = h1_fespace.GetFE(e);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double  h1_err = 0.0;
+               double  nd_err = 0.0;
+               double  rt_err = 0.0;
+               double  l2_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+               // Transformed coordinates of the current point (stack buffer).
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  Func_3D_lin(tip, f_val);
+
+                  h1_xCoef.Eval(h1_gf_val, *T, ip);
+                  nd_xCoef.Eval(nd_gf_val, *T, ip);
+                  rt_xCoef.Eval(rt_gf_val, *T, ip);
+                  l2_xCoef.Eval(l2_gf_val, *T, ip);
+                  dgv_xCoef.Eval(dgv_gf_val, *T, ip);
+                  dgi_xCoef.Eval(dgi_gf_val, *T, ip);
+
+                  double  h1_dist = Distance(f_val,  h1_gf_val, dim);
+                  double  nd_dist = Distance(f_val,  nd_gf_val, dim);
+                  double  rt_dist = Distance(f_val,  rt_gf_val, dim);
+                  double  l2_dist = Distance(f_val,  l2_gf_val, dim);
+                  double dgv_dist = Distance(f_val, dgv_gf_val, dim);
+                  double dgi_dist = Distance(f_val, dgi_gf_val, dim);
+
+                  h1_err  +=  h1_dist;
+                  nd_err  +=  nd_dist;
+                  rt_err  +=  rt_dist;
+                  l2_err  +=  l2_dist;
+                  dgv_err += dgv_dist;
+                  dgi_err += dgi_dist;
+
+                  if (log > 0 && h1_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " h1  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << h1_gf_val[0] << "," << h1_gf_val[1] << ","
+                               << h1_gf_val[2] << ") " << h1_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && nd_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " nd  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << nd_gf_val[0] << "," << nd_gf_val[1] << ","
+                               << nd_gf_val[2] << ") " << nd_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && rt_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " rt  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << rt_gf_val[0] << "," << rt_gf_val[1] << ","
+                               << rt_gf_val[2] << ") " << rt_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && l2_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " l2  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << l2_gf_val[0] << "," << l2_gf_val[1] << ","
+                               << l2_gf_val[2] << ") " << l2_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && dgv_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " dgv ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << dgv_gf_val[0] << "," << dgv_gf_val[1] << ","
+                               << dgv_gf_val[2] << ") " << dgv_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && dgi_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " dgi ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << dgi_gf_val[0] << "," << dgi_gf_val[1] << ","
+                               << dgi_gf_val[2] << ") " << dgi_dist
+                               << std::endl;
+                  }
+               }
+               h1_err  /= ir.GetNPoints();   // mean distance over the rule
+               nd_err  /= ir.GetNPoints();
+               rt_err  /= ir.GetNPoints();
+               l2_err  /= ir.GetNPoints();
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE( h1_err == Approx(0.0));
+               REQUIRE( nd_err == Approx(0.0));
+               REQUIRE( rt_err == Approx(0.0));
+               REQUIRE( l2_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+         // Same comparison on boundary elements via GetBdrElementTransformation.
+         SECTION("Boundary Evaluation 3D (H1 Context)")
+         {
+            std::cout << "Boundary Evaluation 3D (H1 Context)" << std::endl;
+            for (int be = 0; be < mesh.GetNBE(); be++)
+            {
+               ElementTransformation *T = mesh.GetBdrElementTransformation(be);
+               const FiniteElement   *fe = h1_fespace.GetBE(be);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double  h1_err = 0.0;
+               double  nd_err = 0.0;
+               double  rt_err = 0.0;
+               double  l2_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+               // Transformed coordinates of the current point (stack buffer).
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  Func_3D_lin(tip, f_val);
+
+                  h1_xCoef.Eval(h1_gf_val, *T, ip);
+                  nd_xCoef.Eval(nd_gf_val, *T, ip);
+                  rt_xCoef.Eval(rt_gf_val, *T, ip);
+                  l2_xCoef.Eval(l2_gf_val, *T, ip);
+                  dgv_xCoef.Eval(dgv_gf_val, *T, ip);
+                  dgi_xCoef.Eval(dgi_gf_val, *T, ip);
+
+                  double  h1_dist = Distance(f_val,  h1_gf_val, dim);
+                  double  nd_dist = Distance(f_val,  nd_gf_val, dim);
+                  double  rt_dist = Distance(f_val,  rt_gf_val, dim);
+                  double  l2_dist = Distance(f_val,  l2_gf_val, dim);
+                  double dgv_dist = Distance(f_val, dgv_gf_val, dim);
+                  double dgi_dist = Distance(f_val, dgi_gf_val, dim);
+
+                  h1_err  +=  h1_dist;
+                  nd_err  +=  nd_dist;
+                  rt_err  +=  rt_dist;
+                  l2_err  +=  l2_dist;
+                  dgv_err += dgv_dist;
+                  dgi_err += dgi_dist;
+
+                  if (log > 0 && h1_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " h1  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << h1_gf_val[0] << "," << h1_gf_val[1] << ","
+                               << h1_gf_val[2] << ") " << h1_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && nd_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " nd  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << nd_gf_val[0] << "," << nd_gf_val[1] << ","
+                               << nd_gf_val[2] << ") " << nd_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && rt_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " rt  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << rt_gf_val[0] << "," << rt_gf_val[1] << ","
+                               << rt_gf_val[2] << ") " << rt_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && l2_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " l2  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << l2_gf_val[0] << "," << l2_gf_val[1] << ","
+                               << l2_gf_val[2] << ") " << l2_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && dgv_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " dgv ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << dgv_gf_val[0] << "," << dgv_gf_val[1] << ","
+                               << dgv_gf_val[2] << ") " << dgv_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && dgi_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " dgi ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << dgi_gf_val[0] << "," << dgi_gf_val[1] << ","
+                               << dgi_gf_val[2] << ") " << dgi_dist
+                               << std::endl;
+                  }
+               }
+               h1_err  /= ir.GetNPoints();   // mean distance over the rule
+               nd_err  /= ir.GetNPoints();
+               rt_err  /= ir.GetNPoints();
+               l2_err  /= ir.GetNPoints();
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE( h1_err == Approx(0.0));
+               REQUIRE( nd_err == Approx(0.0));
+               REQUIRE( rt_err == Approx(0.0));
+               REQUIRE( l2_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+         // Same comparison on boundary elements via GetBdrFaceTransformations.
+         SECTION("Boundary Evaluation 3D (DG Context)")
+         {
+            std::cout << "Boundary Evaluation 3D (DG Context)" << std::endl;
+            for (int be = 0; be < mesh.GetNBE(); be++)
+            {
+               FaceElementTransformations *T =
+                  mesh.GetBdrFaceTransformations(be);
+               const IntegrationRule &ir = IntRules.Get(T->GetGeometryType(),
+                                                        2*order + 2);
+
+               double  h1_err = 0.0;
+               double  nd_err = 0.0;
+               double  rt_err = 0.0;
+               double  l2_err = 0.0;
+               double dgv_err = 0.0;
+               double dgi_err = 0.0;
+               // Transformed coordinates of the current point (stack buffer).
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  Func_3D_lin(tip, f_val);
+
+                  h1_xCoef.Eval(h1_gf_val, *T, ip);
+                  nd_xCoef.Eval(nd_gf_val, *T, ip);
+                  rt_xCoef.Eval(rt_gf_val, *T, ip);
+                  l2_xCoef.Eval(l2_gf_val, *T, ip);
+                  dgv_xCoef.Eval(dgv_gf_val, *T, ip);
+                  dgi_xCoef.Eval(dgi_gf_val, *T, ip);
+
+                  double  h1_dist = Distance(f_val,  h1_gf_val, dim);
+                  double  nd_dist = Distance(f_val,  nd_gf_val, dim);
+                  double  rt_dist = Distance(f_val,  rt_gf_val, dim);
+                  double  l2_dist = Distance(f_val,  l2_gf_val, dim);
+                  double dgv_dist = Distance(f_val, dgv_gf_val, dim);
+                  double dgi_dist = Distance(f_val, dgi_gf_val, dim);
+
+                  h1_err  +=  h1_dist;
+                  nd_err  +=  nd_dist;
+                  rt_err  +=  rt_dist;
+                  l2_err  +=  l2_dist;
+                  dgv_err += dgv_dist;
+                  dgi_err += dgi_dist;
+
+                  if (log > 0 && h1_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " h1  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << h1_gf_val[0] << "," << h1_gf_val[1] << ","
+                               << h1_gf_val[2] << ") " << h1_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && nd_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " nd  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << nd_gf_val[0] << "," << nd_gf_val[1] << ","
+                               << nd_gf_val[2] << ") " << nd_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && rt_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " rt  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << rt_gf_val[0] << "," << rt_gf_val[1] << ","
+                               << rt_gf_val[2] << ") " << rt_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && l2_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " l2  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << l2_gf_val[0] << "," << l2_gf_val[1] << ","
+                               << l2_gf_val[2] << ") " << l2_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && dgv_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " dgv ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << dgv_gf_val[0] << "," << dgv_gf_val[1] << ","
+                               << dgv_gf_val[2] << ") " << dgv_dist
+                               << std::endl;
+                  }
+                  if (log > 0 && dgi_dist > tol)
+                  {
+                     std::cout << be << ":" << j << " dgi ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << dgi_gf_val[0] << "," << dgi_gf_val[1] << ","
+                               << dgi_gf_val[2] << ") " << dgi_dist
+                               << std::endl;
+                  }
+               }
+               h1_err  /= ir.GetNPoints();   // mean distance over the rule
+               nd_err  /= ir.GetNPoints();
+               rt_err  /= ir.GetNPoints();
+               l2_err  /= ir.GetNPoints();
+               dgv_err /= ir.GetNPoints();
+               dgi_err /= ir.GetNPoints();
+
+               REQUIRE( h1_err == Approx(0.0));
+               REQUIRE( nd_err == Approx(0.0));
+               REQUIRE( rt_err == Approx(0.0));
+               REQUIRE( l2_err == Approx(0.0));
+               REQUIRE(dgv_err == Approx(0.0));
+               REQUIRE(dgi_err == Approx(0.0));
+            }
+         }
+         // Edges: only the H1 field is evaluated and checked.
+         SECTION("Edge Evaluation 3D")
+         {
+            std::cout << "Edge Evaluation 3D" << std::endl;
+            for (int e = 0; e < mesh.GetNEdges(); e++)
+            {
+               ElementTransformation *T = mesh.GetEdgeTransformation(e);
+               const FiniteElement   *fe = h1_fespace.GetEdgeElement(e);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double  h1_err = 0.0;
+               // Transformed coordinates of the current point (stack buffer).
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  Func_3D_lin(tip, f_val);
+
+                  h1_xCoef.Eval(h1_gf_val, *T, ip);
+
+                  double  h1_dist = Distance(f_val,  h1_gf_val, dim);
+
+                  h1_err  +=  h1_dist;
+
+                  if (log > 0 && h1_dist > tol)
+                  {
+                     std::cout << e << ":" << j << " h1  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << h1_gf_val[0] << "," << h1_gf_val[1] << ","
+                               << h1_gf_val[2] << ") " << h1_dist
+                               << std::endl;
+                  }
+               }
+               h1_err  /= ir.GetNPoints();   // mean distance over the rule
+
+               REQUIRE( h1_err == Approx(0.0));
+            }
+         }
+         // Faces: only the H1 field is evaluated and checked.
+         SECTION("Face Evaluation 3D")
+         {
+            std::cout << "Face Evaluation 3D" << std::endl;
+            for (int f = 0; f < mesh.GetNFaces(); f++)
+            {
+               ElementTransformation *T = mesh.GetFaceTransformation(f);
+               const FiniteElement   *fe = h1_fespace.GetFaceElement(f);
+               const IntegrationRule &ir = IntRules.Get(fe->GetGeomType(),
+                                                        2*order + 2);
+
+               double  h1_err = 0.0;
+               // Transformed coordinates of the current point (stack buffer).
+               double tip_data[dim];
+               Vector tip(tip_data, dim);
+               for (int j=0; j<ir.GetNPoints(); j++)
+               {
+                  npts++;
+                  const IntegrationPoint &ip = ir.IntPoint(j);
+                  T->SetIntPoint(&ip);
+                  T->Transform(ip, tip);
+
+                  Func_3D_lin(tip, f_val);
+
+                  h1_xCoef.Eval(h1_gf_val, *T, ip);
+
+                  double  h1_dist = Distance(f_val,  h1_gf_val, dim);
+
+                  h1_err  +=  h1_dist;
+
+                  if (log > 0 && h1_dist > tol)
+                  {
+                     std::cout << f << ":" << j << " h1  ("
+                               << f_val[0] << "," << f_val[1] << ","
+                               << f_val[2] << ") vs. ("
+                               << h1_gf_val[0] << "," << h1_gf_val[1] << ","
+                               << h1_gf_val[2] << ") " << h1_dist
+                               << std::endl;
+                  }
+               }
+               h1_err  /= ir.GetNPoints();   // mean distance over the rule
+
+               REQUIRE( h1_err == Approx(0.0));
+            }
+         }
+      }
+   }
+   std::cout << "Checked GridFunction::GetVectorValue at "
+             << npts << " 3D points" << std::endl;
+}
+
+} // namespace get_value
diff --git a/tests/unit/fem/test_pa_coeff.cpp b/tests/unit/fem/test_pa_coeff.cpp
index 74734dfefa7..b651514d9cd 100644
--- a/tests/unit/fem/test_pa_coeff.cpp
+++ b/tests/unit/fem/test_pa_coeff.cpp
@@ -156,7 +156,7 @@ TEST_CASE("H1 pa_coeff")
    }
 }
 
-TEST_CASE("Hcurl pa_coeff")
+TEST_CASE("Hcurl/Hdiv pa_coeff")
 {
    for (dimension = 2; dimension < 4; ++dimension)
    {
@@ -174,141 +174,156 @@ TEST_CASE("Hcurl pa_coeff")
       for (int coeffType = 0; coeffType < 2; ++coeffType)
       {
          Coefficient* coeff = nullptr;
-         Coefficient* curlCoeff = nullptr;
+         Coefficient* coeff2 = nullptr;
          if (coeffType == 0)
          {
             coeff = new ConstantCoefficient(12.34);
-            curlCoeff = new ConstantCoefficient(12.34);
+            coeff2 = new ConstantCoefficient(12.34);
          }
          else if (coeffType == 1)
          {
             coeff = new FunctionCoefficient(&coeffFunction);
-            curlCoeff = new FunctionCoefficient(&linearFunction);
+            coeff2 = new FunctionCoefficient(&linearFunction);
          }
 
-         for (int integrator = 0; integrator < 3; ++integrator)
+         for (int spaceType = 0; spaceType < 2; ++spaceType)
          {
-            std::cout << "Testing " << dimension << "D ND partial assembly with "
-                      << "coeffType " << coeffType << " and "
-                      << "integrator " << integrator << std::endl;
-            for (int order = 1; order < 4; ++order)
+            for (int integrator = 0; integrator < 3; ++integrator)
             {
-               FiniteElementCollection* ND_fec =
-                  new ND_FECollection(order, dimension);
-               FiniteElementSpace ND_fespace(mesh, ND_fec);
+               if (spaceType == 0)
+                  std::cout << "Testing " << dimension
+                            << "D ND partial assembly with " << "coeffType "
+                            << coeffType << " and " << "integrator "
+                            << integrator << std::endl;
+               else
+                  std::cout << "Testing " << dimension
+                            << "D RT partial assembly with " << "coeffType "
+                            << coeffType << " and " << "integrator "
+                            << integrator << std::endl;
 
-               // Set essential boundary conditions on the entire boundary.
-               Array<int> tdof_ess(ND_fespace.GetVSize());
-               for (int i=0; i<ND_fespace.GetVSize(); ++i)
+               for (int order = 1; order < 4; ++order)
                {
-                  tdof_ess[i] = 0;
-               }
+                  FiniteElementCollection* fec = (spaceType == 0) ?
+                                                 (FiniteElementCollection*) new ND_FECollection(order, dimension) :
+                                                 (FiniteElementCollection*) new RT_FECollection(order, dimension);
 
-               for (int i=0; i<mesh->GetNBE(); ++i)
-               {
-                  Array<int> dofs;
-                  ND_fespace.GetBdrElementDofs(i, dofs);
-                  for (int j=0; j<dofs.Size(); ++j)
+                  FiniteElementSpace fespace(mesh, fec);
+
+                  // Set essential boundary conditions on the entire boundary.
+                  Array<int> tdof_ess(fespace.GetVSize());
+                  for (int i=0; i<fespace.GetVSize(); ++i)
                   {
-                     const int dof_j = (dofs[j] >= 0) ? dofs[j] : -1 - dofs[j];
-                     tdof_ess[dof_j] = 1;
+                     tdof_ess[i] = 0;
                   }
-               }
 
-               int num_ess = 0;
-               for (int i=0; i<ND_fespace.GetVSize(); ++i)
-               {
-                  if (tdof_ess[i] == 1)
+                  for (int i=0; i<mesh->GetNBE(); ++i)
                   {
-                     num_ess++;
+                     Array<int> dofs;
+                     fespace.GetBdrElementDofs(i, dofs);
+                     for (int j=0; j<dofs.Size(); ++j)
+                     {
+                        const int dof_j = (dofs[j] >= 0) ? dofs[j] : -1 - dofs[j];
+                        tdof_ess[dof_j] = 1;
+                     }
                   }
-               }
 
-               Array<int> ess_tdof_list(num_ess);
-               num_ess = 0;
-               for (int i=0; i<ND_fespace.GetVSize(); ++i)
-               {
-                  if (tdof_ess[i] == 1)
+                  int num_ess = 0;
+                  for (int i=0; i<fespace.GetVSize(); ++i)
                   {
-                     ess_tdof_list[num_ess] = i;
-                     num_ess++;
+                     if (tdof_ess[i] == 1)
+                     {
+                        num_ess++;
+                     }
                   }
-               }
 
-               BilinearForm paform(&ND_fespace);
-               paform.SetAssemblyLevel(AssemblyLevel::PARTIAL);
-               if (integrator < 2)
-               {
-                  paform.AddDomainIntegrator(new VectorFEMassIntegrator(*coeff));
-               }
-               if (integrator > 0)
-               {
-                  paform.AddDomainIntegrator(new CurlCurlIntegrator(*curlCoeff));
-               }
-               paform.Assemble();
-               OperatorHandle paopr;
-               paform.FormSystemMatrix(ess_tdof_list, paopr);
+                  Array<int> ess_tdof_list(num_ess);
+                  num_ess = 0;
+                  for (int i=0; i<fespace.GetVSize(); ++i)
+                  {
+                     if (tdof_ess[i] == 1)
+                     {
+                        ess_tdof_list[num_ess] = i;
+                        num_ess++;
+                     }
+                  }
 
-               BilinearForm assemblyform(&ND_fespace);
-               if (integrator < 2)
-               {
-                  assemblyform.AddDomainIntegrator(
-                     new VectorFEMassIntegrator(*coeff));
-               }
-               if (integrator > 0)
-               {
-                  assemblyform.AddDomainIntegrator(new CurlCurlIntegrator(*curlCoeff));
+                  BilinearForm paform(&fespace);
+                  paform.SetAssemblyLevel(AssemblyLevel::PARTIAL);
+                  BilinearForm assemblyform(&fespace);
+                  if (integrator < 2)
+                  {
+                     paform.AddDomainIntegrator(new VectorFEMassIntegrator(*coeff));
+                     assemblyform.AddDomainIntegrator(
+                        new VectorFEMassIntegrator(*coeff));
+                  }
+                  if (integrator > 0)
+                  {
+                     if (spaceType == 0)
+                     {
+                        paform.AddDomainIntegrator(new CurlCurlIntegrator(*coeff2));
+                        assemblyform.AddDomainIntegrator(new CurlCurlIntegrator(*coeff2));
+                     }
+                     else
+                     {
+                        paform.AddDomainIntegrator(new DivDivIntegrator(*coeff2));
+                        assemblyform.AddDomainIntegrator(new DivDivIntegrator(*coeff2));
+                     }
+                  }
+                  paform.Assemble();
+                  OperatorHandle paopr;
+                  paform.FormSystemMatrix(ess_tdof_list, paopr);
+
+                  assemblyform.SetDiagonalPolicy(Matrix::DIAG_ONE);
+                  assemblyform.Assemble();
+                  assemblyform.Finalize();
+                  SparseMatrix A_explicit;
+                  assemblyform.FormSystemMatrix(ess_tdof_list, A_explicit);
+
+                  Vector xin(fespace.GetTrueVSize());
+                  xin.Randomize();
+                  Vector y_mat(xin);
+                  y_mat = 0.0;
+                  Vector y_assembly(xin);
+                  y_assembly = 0.0;
+                  Vector y_pa(xin);
+                  y_pa = 0.0;
+
+                  paopr->Mult(xin, y_pa);
+                  assemblyform.Mult(xin, y_assembly);
+                  A_explicit.Mult(xin, y_mat);
+
+                  y_pa -= y_mat;
+                  double pa_error = y_pa.Norml2();
+                  std::cout << "  order: " << order
+                            << ", pa error norm: " << pa_error << std::endl;
+                  REQUIRE(pa_error < 1.e-10);
+
+                  y_assembly -= y_mat;
+                  double assembly_error = y_assembly.Norml2();
+                  std::cout << "  order: " << order
+                            << ", assembly error norm: " << assembly_error
+                            << std::endl;
+                  REQUIRE(assembly_error < 1.e-12);
+
+                  delete fec;
                }
-               assemblyform.SetDiagonalPolicy(Matrix::DIAG_ONE);
-               assemblyform.Assemble();
-               assemblyform.Finalize();
-               SparseMatrix A_explicit;
-               assemblyform.FormSystemMatrix(ess_tdof_list, A_explicit);
-
-               Vector xin(ND_fespace.GetTrueVSize());
-               xin.Randomize();
-               Vector y_mat(xin);
-               y_mat = 0.0;
-               Vector y_assembly(xin);
-               y_assembly = 0.0;
-               Vector y_pa(xin);
-               y_pa = 0.0;
-
-               paopr->Mult(xin, y_pa);
-               assemblyform.Mult(xin, y_assembly);
-               A_explicit.Mult(xin, y_mat);
-
-               y_pa -= y_mat;
-               double pa_error = y_pa.Norml2();
-               std::cout << "  order: " << order
-                         << ", pa error norm: " << pa_error << std::endl;
-               REQUIRE(pa_error < 1.e-11);
-
-               y_assembly -= y_mat;
-               double assembly_error = y_assembly.Norml2();
-               std::cout << "  order: " << order
-                         << ", assembly error norm: " << assembly_error
-                         << std::endl;
-               REQUIRE(assembly_error < 1.e-12);
-
-               delete ND_fec;
             }
          }
 
          delete coeff;
-         delete curlCoeff;
+         delete coeff2;
       }
 
       delete mesh;
    }
 }
 
-TEST_CASE("Hcurl H1 mixed pa_coeff")
+TEST_CASE("Hcurl/Hdiv mixed pa_coeff")
 {
    for (dimension = 2; dimension < 4; ++dimension)
    {
       Mesh* mesh;
-      const int ne = 2;
+      const int ne = 3;
       if (dimension == 2)
       {
          mesh = new Mesh(ne, ne, Element::QUADRILATERAL, 1, 1.0, 1.0);
@@ -330,64 +345,132 @@ TEST_CASE("Hcurl H1 mixed pa_coeff")
             coeff = new FunctionCoefficient(&coeffFunction);
          }
 
-         // Currently, we test only one integrator. More could be tested here
-         // when they are implemented, using different test spaces (e.g. vector L2, H(div)).
-         for (int integrator = 0; integrator < 1; ++integrator)
+         for (int spaceType = 0; spaceType < 2; ++spaceType)
          {
-            std::cout << "Testing " << dimension << "D ND H1 mixed partial assembly with "
-                      << "coeffType " << coeffType << " and "
-                      << "integrator " << integrator << std::endl;
-            for (int order = 1; order < 4; ++order)
+            if (spaceType == 1 && coeffType == 1)
             {
-               FiniteElementCollection* ND_fec =
-                  new ND_FECollection(order, dimension);
-               FiniteElementSpace ND_fespace(mesh, ND_fec);
+               continue;  // This case fails, maybe because of insufficient quadrature.
+            }
 
-               FiniteElementCollection* h1_fec =
-                  new H1_FECollection(order, dimension);
-               FiniteElementSpace h1_fespace(mesh, h1_fec);
+            // Currently, we test only one integrator.
+            for (int integrator = 0; integrator < 1; ++integrator)
+            {
+               if (spaceType == 0)
+                  std::cout << "Testing " << dimension << "D ND H1 mixed partial assembly with "
+                            << "coeffType " << coeffType << " and "
+                            << "integrator " << integrator << std::endl;
+               else
+                  std::cout << "Testing " << dimension << "D RT L2 mixed partial assembly with "
+                            << "coeffType " << coeffType << " and "
+                            << "integrator " << integrator << std::endl;
 
-               Array<int> ess_tdof_list;
+               for (int order = 1; order < 4; ++order)
+               {
+                  FiniteElementCollection* vec_fec = (spaceType == 0) ?
+                                                     (FiniteElementCollection*) new ND_FECollection(order, dimension) :
+                                                     (FiniteElementCollection*) new RT_FECollection(order-1, dimension);
 
-               MixedBilinearForm paform(&h1_fespace, &ND_fespace);
-               paform.SetAssemblyLevel(AssemblyLevel::PARTIAL);
-               paform.AddDomainIntegrator(new MixedVectorGradientIntegrator(*coeff));
-               paform.Assemble();
+                  FiniteElementCollection* scalar_fec = (spaceType == 0) ?
+                                                        (FiniteElementCollection*) new H1_FECollection(order, dimension) :
+                                                        (FiniteElementCollection*) new L2_FECollection(order-1, dimension);
 
-               MixedBilinearForm assemblyform(&h1_fespace, &ND_fespace);
-               assemblyform.AddDomainIntegrator(new MixedVectorGradientIntegrator(*coeff));
-               assemblyform.Assemble();
-               assemblyform.Finalize();
-               const SparseMatrix& A_explicit = assemblyform.SpMat();
+                  FiniteElementSpace v_fespace(mesh, vec_fec);
+                  FiniteElementSpace s_fespace(mesh, scalar_fec);
 
-               Vector xin(h1_fespace.GetTrueVSize());
-               xin.Randomize();
-               Vector y_mat(ND_fespace.GetTrueVSize());
-               y_mat = 0.0;
-               Vector y_assembly(ND_fespace.GetTrueVSize());
-               y_assembly = 0.0;
-               Vector y_pa(ND_fespace.GetTrueVSize());
-               y_pa = 0.0;
+                  Array<int> ess_tdof_list;
 
-               paform.Mult(xin, y_pa);
-               assemblyform.Mult(xin, y_assembly);
-               A_explicit.Mult(xin, y_mat);
+                  MixedBilinearForm *paform = NULL;
+                  MixedBilinearForm *assemblyform = NULL;
 
-               y_pa -= y_mat;
-               double pa_error = y_pa.Norml2();
-               std::cout << "  order: " << order
-                         << ", pa error norm: " << pa_error << std::endl;
-               REQUIRE(pa_error < 1.e-12);
+                  if (spaceType == 0)
+                  {
+                     assemblyform = new MixedBilinearForm(&s_fespace, &v_fespace);
+                     assemblyform->AddDomainIntegrator(new MixedVectorGradientIntegrator(*coeff));
 
-               y_assembly -= y_mat;
-               double assembly_error = y_assembly.Norml2();
-               std::cout << "  order: " << order
-                         << ", assembly error norm: " << assembly_error
-                         << std::endl;
-               REQUIRE(assembly_error < 1.e-12);
+                     paform = new MixedBilinearForm(&s_fespace, &v_fespace);
+                     paform->SetAssemblyLevel(AssemblyLevel::PARTIAL);
+                     paform->AddDomainIntegrator(new MixedVectorGradientIntegrator(*coeff));
+                  }
+                  else
+                  {
+                     assemblyform = new MixedBilinearForm(&v_fespace, &s_fespace);
+                     assemblyform->AddDomainIntegrator(new VectorFEDivergenceIntegrator(*coeff));
 
-               delete ND_fec;
-               delete h1_fec;
+                     paform = new MixedBilinearForm(&v_fespace, &s_fespace);
+                     paform->SetAssemblyLevel(AssemblyLevel::PARTIAL);
+                     paform->AddDomainIntegrator(new VectorFEDivergenceIntegrator(*coeff));
+                  }
+
+                  assemblyform->Assemble();
+                  assemblyform->Finalize();
+
+                  paform->Assemble();
+
+                  const SparseMatrix& A_explicit = assemblyform->SpMat();
+
+                  Vector xin((spaceType == 0) ? s_fespace.GetTrueVSize() :
+                             v_fespace.GetTrueVSize());
+                  xin.Randomize();
+                  Vector y_mat((spaceType == 0) ? v_fespace.GetTrueVSize() :
+                               s_fespace.GetTrueVSize());
+                  y_mat = 0.0;
+                  Vector y_assembly(y_mat.Size());
+                  y_assembly = 0.0;
+                  Vector y_pa(y_mat.Size());
+                  y_pa = 0.0;
+
+                  paform->Mult(xin, y_pa);
+                  assemblyform->Mult(xin, y_assembly);
+                  A_explicit.Mult(xin, y_mat);
+
+                  y_pa -= y_mat;
+                  double pa_error = y_pa.Norml2();
+                  std::cout << "  order: " << order
+                            << ", pa error norm: " << pa_error << std::endl;
+                  REQUIRE(pa_error < 1.e-12);
+
+                  y_assembly -= y_mat;
+                  double assembly_error = y_assembly.Norml2();
+                  std::cout << "  order: " << order
+                            << ", assembly error norm: " << assembly_error
+                            << std::endl;
+                  REQUIRE(assembly_error < 1.e-12);
+
+                  if (spaceType == 1)
+                  {
+                     // Test the transpose.
+                     xin.SetSize((spaceType == 0) ? v_fespace.GetTrueVSize() :
+                                 s_fespace.GetTrueVSize());
+                     xin.Randomize();
+
+                     y_mat.SetSize((spaceType == 0) ? s_fespace.GetTrueVSize() :
+                                   v_fespace.GetTrueVSize());
+                     y_assembly.SetSize(y_mat.Size());
+                     y_pa.SetSize(y_mat.Size());
+
+                     paform->MultTranspose(xin, y_pa);
+                     assemblyform->MultTranspose(xin, y_assembly);
+                     A_explicit.MultTranspose(xin, y_mat);
+
+                     y_pa -= y_mat;
+                     pa_error = y_pa.Norml2();
+                     std::cout << "  order: " << order
+                               << ", pa transpose error norm: " << pa_error << std::endl;
+                     REQUIRE(pa_error < 1.e-12);
+
+                     y_assembly -= y_mat;
+                     assembly_error = y_assembly.Norml2();
+                     std::cout << "  order: " << order
+                               << ", assembly transpose error norm: " << assembly_error
+                               << std::endl;
+                     REQUIRE(assembly_error < 1.e-12);
+                  }
+
+                  delete paform;
+                  delete assemblyform;
+                  delete vec_fec;
+                  delete scalar_fec;
+               }
             }
          }
 
diff --git a/tests/unit/fem/test_quadf_coef.cpp b/tests/unit/fem/test_quadf_coef.cpp
new file mode 100644
index 00000000000..334d016f3b8
--- /dev/null
+++ b/tests/unit/fem/test_quadf_coef.cpp
@@ -0,0 +1,162 @@
+// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
+// LICENSE and NOTICE for details. LLNL-CODE-806117.
+//
+// This file is part of the MFEM library. For more information and source code
+// availability visit https://mfem.org.
+//
+// MFEM is free software; you can redistribute it and/or modify it under the
+// terms of the BSD-3 license. We welcome feedback and contributions, see file
+// CONTRIBUTING.md for details.
+
+#include "mfem.hpp"
+#include "catch.hpp"
+
+using namespace mfem;
+
+namespace qf_coeff
+{
+
+TEST_CASE("Quadrature Function Coefficients",
+          "[Quadrature Function Coefficients]")
+{
+   int order_h1 = 2, n = 4, dim = 3;
+   double tol = 1e-14;
+   // n x n x n hexahedral mesh whose nodes are raised to order order_h1.
+   Mesh mesh(n, n, n, Element::HEXAHEDRON, false, 1.0, 1.0, 1.0);
+   mesh.SetCurvature(order_h1);
+
+   int intOrder = 2 * order_h1 + 1;
+
+   QuadratureSpace qspace(&mesh, intOrder);
+   QuadratureFunction quadf_coeff(&qspace, 1);
+   QuadratureFunction quadf_vcoeff(&qspace, dim);
+   // Bind by reference: nothing below needs a private copy of the rule.
+   const IntegrationRule &ir = qspace.GetElementIntRule(0);
+
+   const GeometricFactors *geom_facts =
+      mesh.GetGeometricFactors(ir, GeometricFactors::COORDINATES);
+   // Scalar data: coordinate component 2 at every quadrature point.
+   {
+      int nqpts = ir.GetNPoints();
+      int nelems = quadf_coeff.Size() / quadf_coeff.GetVDim() / nqpts;
+
+      for (int i = 0; i < nelems; i++)
+      {
+         for (int j = 0; j < nqpts; j++)
+         {
+            //X has dims nqpts x sdim x ne
+            quadf_coeff((i * nqpts) + j) =
+               geom_facts->X((i * nqpts * dim) + (nqpts * 2) + j );
+         }
+      }
+   }
+   // Vector data: every coordinate component, component index running fastest.
+   {
+      int nqpts = ir.GetNPoints();
+      int nelems = quadf_vcoeff.Size() / quadf_vcoeff.GetVDim() / nqpts;
+      int vdim = quadf_vcoeff.GetVDim();
+
+      for (int i = 0; i < nelems; i++)
+      {
+         for (int j = 0; j < vdim; j++)
+         {
+            for (int k = 0; k < nqpts; k++)
+            {
+               //X has dims nqpts x sdim x ne
+               quadf_vcoeff((i * nqpts * vdim) + (k * vdim ) + j) =
+                  geom_facts->X((i * nqpts * vdim) + (j * nqpts) + k);
+            }
+         }
+      }
+   }
+
+   QuadratureFunctionCoefficient qfc(quadf_coeff);
+   VectorQuadratureFunctionCoefficient qfvc(quadf_vcoeff);
+
+   SECTION("Operators on VecQuadFuncCoeff")
+   {
+      std::cout << "Testing VecQuadFuncCoeff: " << std::endl;
+#ifdef MFEM_USE_EXCEPTIONS
+      std::cout << " Setting Component" << std::endl;
+      REQUIRE_THROWS(qfvc.SetComponent(3, 1));
+      REQUIRE_THROWS(qfvc.SetComponent(-1, 1));
+      REQUIRE_NOTHROW(qfvc.SetComponent(1, 2));
+      REQUIRE_THROWS(qfvc.SetComponent(0, 4));
+      REQUIRE_THROWS(qfvc.SetComponent(1, 3));
+      REQUIRE_NOTHROW(qfvc.SetComponent(0, 2));
+      REQUIRE_THROWS(qfvc.SetComponent(0, 0));
+#endif
+      qfvc.SetComponent(0, 3);
+   }
+
+   SECTION("Operators on VectorQuadratureLFIntegrator")
+   {
+      std::cout << "Testing VectorQuadratureLFIntegrator: " << std::endl;
+      H1_FECollection    fec_h1(order_h1, dim);
+      FiniteElementSpace fespace_h1(&mesh, &fec_h1, dim);
+
+      GridFunction nodes(&fespace_h1);
+      mesh.GetNodes(nodes);
+
+      Vector output(nodes.Size());
+      output = 0.0;
+
+      LinearForm lf(&fespace_h1);
+      lf.AddDomainIntegrator(new VectorQuadratureLFIntegrator(qfvc, NULL));
+
+      lf.Assemble();
+      // Reference: vector mass matrix applied to the nodal coordinates.
+      BilinearForm L2(&fespace_h1);
+
+      L2.AddDomainIntegrator(new VectorMassIntegrator());
+      L2.Assemble();
+      // The matrix is only read, so a const reference avoids a deep copy.
+      const SparseMatrix &mat = L2.SpMat();
+
+      mat.Mult(nodes, output);
+
+      output -= lf;
+
+      REQUIRE(output.Norml2() < tol);
+   }
+
+   SECTION("Operators on QuadratureLFIntegrator")
+   {
+      std::cout << "Testing QuadratureLFIntegrator: " << std::endl;
+      H1_FECollection    fec_h1(order_h1, dim);
+      FiniteElementSpace fespace_h1(&mesh, &fec_h1, 1);
+      FiniteElementSpace fespace_h3(&mesh, &fec_h1, 3);
+
+      GridFunction nodes(&fespace_h3);
+      mesh.GetNodes(nodes);
+
+      Vector output(nodes.Size() / dim);
+      Vector nz(nodes.Size() / dim);
+      output = 0.0;
+      // Re-point nz at the entries of nodes starting at offset 2 * nz.Size().
+      nz.MakeRef(nodes, nz.Size() * 2);
+
+      LinearForm lf(&fespace_h1);
+      lf.AddDomainIntegrator(new QuadratureLFIntegrator(qfc, NULL));
+
+      lf.Assemble();
+      // Reference: scalar mass matrix (same rule ir) applied to nz.
+      BilinearForm L2(&fespace_h1);
+
+      L2.AddDomainIntegrator(new MassIntegrator(&ir));
+      L2.Assemble();
+      // The matrix is only read, so a const reference avoids a deep copy.
+      const SparseMatrix &mat = L2.SpMat();
+
+      mat.Mult(nz, output);
+
+      output -= lf;
+
+      REQUIRE(output.Norml2() < tol);
+   }
+
+}
+
+} // namespace qf_coeff
+
diff --git a/tests/unit/linalg/test_ilu.cpp b/tests/unit/linalg/test_ilu.cpp
index 76f4e1c8694..7fcfa699fb2 100644
--- a/tests/unit/linalg/test_ilu.cpp
+++ b/tests/unit/linalg/test_ilu.cpp
@@ -144,28 +144,28 @@ TEST_CASE("ILU Factorization", "[ILU]")
    REQUIRE(AB(0,1,1) == Approx(5.0));
    REQUIRE(AB(1,1,1) == Approx(9.0));
 
-   REQUIRE(AB(0,0,2) == Approx(0.4));
-   REQUIRE(AB(1,0,2) == Approx(3.4));
-   REQUIRE(AB(0,1,2) == Approx(0.6));
-   REQUIRE(AB(1,1,2) == Approx(-11.4));
-
-   REQUIRE(AB(0,0,3) == Approx(-5.4));
-   REQUIRE(AB(1,0,3) == Approx(84.6));
-   REQUIRE(AB(0,1,3) == Approx(-5.4));
-   REQUIRE(AB(1,1,3) == Approx(93.6));
-
-   REQUIRE(AB(0,0,4) == Approx(5.0));
-   REQUIRE(AB(1,0,4) == Approx(2.0));
-   REQUIRE(AB(0,1,4) == Approx(6.0));
-   REQUIRE(AB(1,1,4) == Approx(3.0));
-
-   REQUIRE(AB(0,0,5) == Approx(32.0/27.0));
-   REQUIRE(AB(1,0,5) == Approx(4.0/9.0));
-   REQUIRE(AB(0,1,5) == Approx(1.0/9.0));
-   REQUIRE(AB(1,1,5) == Approx(1.0/9.0));
-
-   REQUIRE(AB(0,0,6) == Approx(-31.0/27.0));
-   REQUIRE(AB(1,0,6) == Approx(59.0/9.0));
-   REQUIRE(AB(0,1,6) == Approx(-13.0/9.0));
-   REQUIRE(AB(1,1,6) == Approx(-2.0));
+   REQUIRE(AB(0,0,2) == Approx(5.0));
+   REQUIRE(AB(1,0,2) == Approx(9.0));
+   REQUIRE(AB(0,1,2) == Approx(6.0));
+   REQUIRE(AB(1,1,2) == Approx(1.0));
+
+   REQUIRE(AB(0,0,3) == Approx(3.0));
+   REQUIRE(AB(1,0,3) == Approx(7.0));
+   REQUIRE(AB(0,1,3) == Approx(4.0));
+   REQUIRE(AB(1,1,3) == Approx(8.0));
+
+   REQUIRE(AB(0,0,4) == Approx(0.4));
+   REQUIRE(AB(1,0,4) == Approx(3.4));
+   REQUIRE(AB(0,1,4) == Approx(0.6));
+   REQUIRE(AB(1,1,4) == Approx(-11.4));
+
+   REQUIRE(AB(0,0,5) == Approx(1.0));
+   REQUIRE(AB(1,0,5) == Approx(25.0/49.0));
+   REQUIRE(AB(0,1,5) == Approx(0.0));
+   REQUIRE(AB(1,1,5) == Approx(-3.0/49.0));
+
+   REQUIRE(AB(0,0,6) == Approx(-8.4));
+   REQUIRE(AB(1,0,6) == Approx(20457.0/245.0));
+   REQUIRE(AB(0,1,6) == Approx(-9.4));
+   REQUIRE(AB(1,1,6) == Approx(22552.0/245.0));
 }
diff --git a/tests/unit/linalg/test_matrix_rectangular.cpp b/tests/unit/linalg/test_matrix_rectangular.cpp
index c728176b702..bed668b1f01 100644
--- a/tests/unit/linalg/test_matrix_rectangular.cpp
+++ b/tests/unit/linalg/test_matrix_rectangular.cpp
@@ -159,6 +159,111 @@ TEST_CASE("ParallelFormRectangular",
    }
 }
 
+TEST_CASE("HypreParMatrixBlocks",
+          "[Parallel], [BlockMatrix]")
+{
+   // Checks that the matrix built by HypreParMatrixFromBlocks acts on a random
+   // vector like a BlockOperator assembled from the same blocks and scalings.
+   SECTION("HypreParMatrixFromBlocks")
+   {
+      int rank;
+      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+      Mesh mesh(10, 10, Element::QUADRILATERAL, 0, 1.0, 1.0);
+      int dim = mesh.Dimension();
+      int order = 2;
+
+      ParMesh pmesh(MPI_COMM_WORLD, mesh);
+
+      // Stack-allocated collections are destroyed automatically, after the
+      // spaces that point to them.
+      RT_FECollection hdiv_coll(order, dim);
+      L2_FECollection l2_coll(order, dim);
+
+      ParFiniteElementSpace R_space(&pmesh, &hdiv_coll);
+      ParFiniteElementSpace W_space(&pmesh, &l2_coll);
+
+      ParBilinearForm RmVarf(&R_space);
+      ParBilinearForm WmVarf(&W_space);
+      ParMixedBilinearForm bVarf(&R_space, &W_space);
+
+      RmVarf.AddDomainIntegrator(new VectorFEMassIntegrator());
+      RmVarf.Assemble();
+      RmVarf.Finalize();
+      HypreParMatrix *MR = RmVarf.ParallelAssemble();
+
+      WmVarf.AddDomainIntegrator(new MassIntegrator());
+      WmVarf.Assemble();
+      WmVarf.Finalize();
+      HypreParMatrix *MW = WmVarf.ParallelAssemble();
+
+      bVarf.AddDomainIntegrator(new VectorFEDivergenceIntegrator);
+      bVarf.Assemble();
+      bVarf.Finalize();
+      HypreParMatrix *B = bVarf.ParallelAssemble();
+      (*B) *= -1;
+
+      HypreParMatrix *BT = B->Transpose();
+
+      Array<int> blockRow_trueOffsets(3); // number of variables + 1
+      blockRow_trueOffsets[0] = 0;
+      blockRow_trueOffsets[1] = R_space.TrueVSize();
+      blockRow_trueOffsets[2] = W_space.TrueVSize();
+      blockRow_trueOffsets.PartialSum();
+
+      Array<int> blockCol_trueOffsets(4); // number of variables + 1
+      blockCol_trueOffsets[0] = 0;
+      blockCol_trueOffsets[1] = R_space.TrueVSize();
+      blockCol_trueOffsets[2] = W_space.TrueVSize();
+      blockCol_trueOffsets[3] = W_space.TrueVSize();
+      blockCol_trueOffsets.PartialSum();
+
+      BlockOperator blockOper(blockRow_trueOffsets, blockCol_trueOffsets);
+      blockOper.SetBlock(0, 0, MR);
+      blockOper.SetBlock(0, 1, BT);
+      blockOper.SetBlock(1, 0, B);
+      blockOper.SetBlock(0, 2, BT, 3.14);
+      blockOper.SetBlock(1, 2, MW);
+
+      Array2D<HypreParMatrix*> hBlocks(2,3);
+      hBlocks = NULL;
+      hBlocks(0, 0) = MR;
+      hBlocks(0, 1) = BT;
+      hBlocks(1, 0) = B;
+      hBlocks(0, 2) = BT;
+      hBlocks(1, 2) = MW;
+
+      Array2D<double> blockCoeff(2,3);
+      blockCoeff = 1.0;
+      blockCoeff(0, 2) = 3.14;
+      HypreParMatrix *H = HypreParMatrixFromBlocks(hBlocks, &blockCoeff);
+
+      Vector x(blockCol_trueOffsets[3]);
+      Vector yB(blockRow_trueOffsets[2]);
+      Vector yH(blockRow_trueOffsets[2]);
+
+      x.Randomize();
+      yB = 0.0;
+      yH = 0.0;
+
+      blockOper.Mult(x, yB);
+      H->Mult(x, yH);
+
+      yH -= yB;
+      double error = yH.Norml2();
+      std::cout << "  order: " << order
+                << ", block matrix error norm on rank " << rank << ": " << error << std::endl;
+      REQUIRE(error < 1.e-12);
+
+      // Neither blockOper nor hBlocks owns the matrices: free them here.
+      delete H;
+      delete BT;
+      delete B;
+      delete MW;
+      delete MR;
+   }
+}
+
 #endif
 
 } // namespace mfem