Replace hadoop minicluster tests with testcontainers (#3082)
* Fix CI continue on test failure
* Increase timeout in postgis test to reduce CI failures
elahrvivaz committed Mar 25, 2024
1 parent 46efa9c commit 4ae4192
Showing 14 changed files with 142 additions and 104 deletions.
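In effect, the commit replaces the per-suite, in-process Hadoop minicluster (MiniDFSCluster) with a single Docker-based Hadoop container shared across the whole test JVM: a new HadoopSharedCluster singleton manages the container, suites isolate themselves under per-class HDFS paths, and downstream modules reuse the helper through a new geomesa-fs-datastore test-jar. The workflow changes are separate CI fixes to the unit-test retry steps.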
7 changes: 4 additions & 3 deletions .github/workflows/build-and-test-2.12.yml
@@ -28,23 +28,24 @@ jobs:
         run: ./build/mvn clean install $MAVEN_CLI_OPTS -DskipTests -T4
       - name: Unit tests
         id: test
+        continue-on-error: true
         run: |
           set -o pipefail
           mvn surefire:test $MAVEN_CLI_OPTS $MAVEN_TEST_OPTS | tee -a test.log
-        continue-on-error: true
      - name: Unit tests (retry)
         id: test-retry
         if: steps.test.outcome=='failure'
+        continue-on-error: true
         run: |
           set -o pipefail
-          RESUME_FROM="$(tail -n2 test.log | grep 'rf' | sed 's/.*-rf/-rf/')"
+          RESUME_FROM="$(grep --text 'mvn <args> -rf ' test.log | tail -n1 | sed 's/.*-rf/-rf/')"
           mvn surefire:test $MAVEN_CLI_OPTS $MAVEN_TEST_OPTS $RESUME_FROM | tee -a test.log
       - name: Unit tests (retry)
         id: test-retry-retry
         if: steps.test-retry.outcome=='failure'
         run: |
           set -o pipefail
-          RESUME_FROM="$(tail -n2 test.log | grep 'rf' | sed 's/.*-rf/-rf/')"
+          RESUME_FROM="$(grep --text 'mvn <args> -rf ' test.log | tail -n1 | sed 's/.*-rf/-rf/')"
           mvn surefire:test $MAVEN_CLI_OPTS $MAVEN_TEST_OPTS $RESUME_FROM | tee -a test.log
       - name: Remove geomesa artifacts
         if: success() || failure()
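Two distinct fixes land in this hunk. First, `continue-on-error: true` is added to the first retry step (and moved above `run:` on the initial step), so a failed first retry no longer fails the job before the second retry can run; the final attempt deliberately omits it, so persistent failures still fail the build. Second, the resume point is now found by searching the whole log for Maven's literal resume hint instead of inspecting only the last two lines, which missed the hint whenever other output followed it. A rough Scala restatement of what the new pipeline computes (a hypothetical helper, for illustration only; the 2.13 workflow below receives the identical change):

```scala
object ResumeFrom {
  // mirrors: grep --text 'mvn <args> -rf ' test.log | tail -n1 | sed 's/.*-rf/-rf/'
  def apply(logLines: Seq[String]): Option[String] =
    logLines
      .filter(_.contains("mvn <args> -rf "))       // every resume hint Maven printed
      .lastOption                                  // only the most recent one matters
      .map(l => l.substring(l.lastIndexOf("-rf"))) // drop everything before the final "-rf"
}
```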
7 changes: 4 additions & 3 deletions .github/workflows/build-and-test-2.13.yml
@@ -30,23 +30,24 @@ jobs:
         run: ./build/mvn clean install $MAVEN_CLI_OPTS -DskipTests -T4
       - name: Unit tests
         id: test
+        continue-on-error: true
         run: |
           set -o pipefail
           mvn surefire:test $MAVEN_CLI_OPTS $MAVEN_TEST_OPTS | tee -a test.log
-        continue-on-error: true
      - name: Unit tests (retry)
         id: test-retry
         if: steps.test.outcome=='failure'
+        continue-on-error: true
         run: |
           set -o pipefail
-          RESUME_FROM="$(tail -n2 test.log | grep 'rf' | sed 's/.*-rf/-rf/')"
+          RESUME_FROM="$(grep --text 'mvn <args> -rf ' test.log | tail -n1 | sed 's/.*-rf/-rf/')"
           mvn surefire:test $MAVEN_CLI_OPTS $MAVEN_TEST_OPTS $RESUME_FROM | tee -a test.log
       - name: Unit tests (retry)
         id: test-retry-retry
         if: steps.test-retry.outcome=='failure'
         run: |
           set -o pipefail
-          RESUME_FROM="$(tail -n2 test.log | grep 'rf' | sed 's/.*-rf/-rf/')"
+          RESUME_FROM="$(grep --text 'mvn <args> -rf ' test.log | tail -n1 | sed 's/.*-rf/-rf/')"
           mvn surefire:test $MAVEN_CLI_OPTS $MAVEN_TEST_OPTS $RESUME_FROM | tee -a test.log
       - name: Remove geomesa artifacts
         if: success() || failure()
5 changes: 2 additions & 3 deletions build/cqs.tsv
@@ -275,8 +275,8 @@ org.apache.hadoop:hadoop-client 3.3.6 provided
 org.apache.hadoop:hadoop-common 3.3.6 provided
 org.apache.hadoop:hadoop-distcp 3.3.6 provided
 org.apache.hadoop:hadoop-hdfs 3.3.6 provided
-org.apache.hadoop:hadoop-mapreduce-client-common 3.3.6 provided
 org.apache.hadoop:hadoop-mapreduce-client-core 3.3.6 provided
+org.apache.hadoop:hadoop-mapreduce-client-jobclient 3.3.6 provided
 org.apache.hadoop:hadoop-yarn-api 3.3.6 provided
 org.apache.hadoop:hadoop-yarn-common 3.3.6 provided
 org.apache.hbase:hbase-server 2.5.7-hadoop3 provided
@@ -302,7 +302,6 @@ org.apache.arrow:arrow-vector tests:15.0.2 test
 org.apache.cassandra:cassandra-all 3.11.14 test
 org.apache.cassandra:cassandra-thrift 3.11.14 test
 org.apache.curator:curator-test 5.6.0 test
-org.apache.hadoop:hadoop-minicluster 3.3.6 test
 org.apache.hbase:hbase-testing-util 2.5.7-hadoop3 test
 org.apache.kafka:kafka-clients test:3.7.0 test
 org.apache.kafka:kafka-streams-test-utils 3.7.0 test
@@ -311,7 +310,7 @@ org.apache.logging.log4j:log4j-core 2.22.1 test
 org.apache.sedona:sedona-common 1.5.0 test
 org.cassandraunit:cassandra-unit 3.7.1.0 test
 org.codehaus.groovy:groovy-jsr223 3.0.20 test
-org.geomesa.testcontainers:testcontainers-accumulo 1.1.0 test
+org.geomesa.testcontainers:testcontainers-accumulo 1.3.0 test
 org.geotools:gt-epsg-hsql 30.2 test
 org.jruby:jruby 9.4.5.0 test
 org.mockito:mockito-core 2.28.2 test
14 changes: 0 additions & 14 deletions geomesa-accumulo/geomesa-accumulo-jobs/pom.xml
@@ -82,20 +82,6 @@
       <groupId>org.geomesa.testcontainers</groupId>
       <artifactId>testcontainers-accumulo</artifactId>
     </dependency>
-    <dependency>
-      <!-- 'works with' due to license issues -->
-      <groupId>org.xerial.snappy</groupId>
-      <artifactId>snappy-java</artifactId>
-      <version>${snappy.java.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <!-- used by hadoop-minicluster -->
-    <dependency>
-      <groupId>org.mockito</groupId>
-      <artifactId>mockito-core</artifactId>
-      <version>${hadoop.minicluster.mockito.version}</version>
-      <scope>test</scope>
-    </dependency>
   </dependencies>

 </project>
26 changes: 26 additions & 0 deletions geomesa-fs/geomesa-fs-datastore/pom.xml
@@ -64,6 +64,7 @@
       <artifactId>hadoop-mapreduce-client-core</artifactId>
     </dependency>

+    <!-- test dependencies -->
     <dependency>
       <groupId>org.specs2</groupId>
       <artifactId>specs2-core_${scala.binary.version}</artifactId>
@@ -82,6 +83,31 @@
       <artifactId>geomesa-fs-storage-orc_${scala.binary.version}</artifactId>
       <scope>test</scope>
     </dependency>
+
+    <dependency>
+      <groupId>org.testcontainers</groupId>
+      <artifactId>testcontainers</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.geomesa.testcontainers</groupId>
+      <artifactId>testcontainers-accumulo</artifactId>
+    </dependency>
   </dependencies>

+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <executions>
+          <execution>
+            <goals>
+              <goal>test-jar</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
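The maven-jar-plugin test-jar execution added here is what lets the spark-runtime and tools modules later in this diff depend on this module with `<classifier>tests</classifier>` and so share the HadoopSharedCluster helper defined next.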
59 changes: 59 additions & 0 deletions geomesa-fs/geomesa-fs-datastore/src/test/scala/org/locationtech/geomesa/fs/HadoopSharedCluster.scala
@@ -0,0 +1,59 @@
+/***********************************************************************
+ * Copyright (c) 2013-2024 Commonwealth Computer Research, Inc.
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Apache License, Version 2.0
+ * which accompanies this distribution and is available at
+ * http://www.opensource.org/licenses/apache2.0.php.
+ ***********************************************************************/
+
+package org.locationtech.geomesa.fs
+
+import com.typesafe.scalalogging.StrictLogging
+import org.apache.hadoop.conf.Configuration
+import org.geomesa.testcontainers.HadoopContainer
+import org.testcontainers.utility.DockerImageName
+
+import java.io.{ByteArrayInputStream, StringWriter}
+import java.nio.charset.StandardCharsets
+import java.util.concurrent.atomic.AtomicBoolean
+import scala.util.Try
+
+/**
+ * Hadoop cluster for testing. Singleton object that is shared between all test classes in the jvm.
+ */
+object HadoopSharedCluster extends StrictLogging {
+
+  val ImageName =
+    DockerImageName.parse("ghcr.io/geomesa/accumulo-uno")
+        .withTag(sys.props.getOrElse("accumulo.docker.tag", "2.1.2"))
+
+  lazy val Container: HadoopContainer = tryContainer.get
+
+  lazy val ContainerConfig: String = {
+    val conf = new Configuration(false)
+    conf.addResource(new ByteArrayInputStream(Container.getConfigurationXml.getBytes(StandardCharsets.UTF_8)), "")
+    conf.set("parquet.compression", "GZIP", "") // default is snappy which is not on our classpath
+    val writer = new StringWriter()
+    conf.writeXml(writer)
+    writer.toString
+  }
+
+  private lazy val tryContainer: Try[HadoopContainer] = Try {
+    logger.info("Starting Hadoop container")
+    val container = new HadoopContainer(ImageName)
+    initialized.getAndSet(true)
+    container.start()
+    logger.info("Started Hadoop container")
+    container
+  }
+
+  private val initialized = new AtomicBoolean(false)
+
+  sys.addShutdownHook({
+    if (initialized.get) {
+      logger.info("Stopping Hadoop container")
+      tryContainer.foreach(_.stop())
+      logger.info("Stopped Hadoop container")
+    }
+  })
+}
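A minimal sketch of how a suite consumes the shared container. It assumes only the members defined above plus the getHdfsUrl accessor used by FileSystemRDDProviderTest later in this diff; the class name and assertion are hypothetical:

```scala
import org.geotools.api.data.DataStoreFinder
import org.locationtech.geomesa.fs.HadoopSharedCluster
import org.specs2.mutable.Specification

import scala.collection.JavaConverters._

class SharedClusterUsageTest extends Specification {

  // each suite writes under its own directory in the shared HDFS instance,
  // so suites stay isolated without owning the cluster
  lazy val path = s"${HadoopSharedCluster.Container.getHdfsUrl}/${getClass.getSimpleName}/"

  // touching Container lazily starts the one Hadoop container for this JVM;
  // the shutdown hook in HadoopSharedCluster stops it when the JVM exits
  lazy val ds = DataStoreFinder.getDataStore(Map("fs.path" -> path).asJava)

  "FileSystemDataStore" should {
    "be available via the shared cluster" in {
      ds must not(beNull)
    }
  }

  step {
    ds.dispose()
  }
}
```

Because Container is lazy and stopped from a shutdown hook, the first suite to touch it pays the startup cost and every later suite in the JVM reuses the same instance.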
17 changes: 9 additions & 8 deletions geomesa-fs/geomesa-fs-spark-runtime/pom.xml
@@ -93,16 +93,17 @@
       <scope>test</scope>
     </dependency>
     <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-minicluster</artifactId>
-      <scope>test</scope>
+      <groupId>org.locationtech.geomesa</groupId>
+      <artifactId>geomesa-fs-datastore_${scala.binary.version}</artifactId>
+      <classifier>tests</classifier>
     </dependency>
-    <!-- used by hadoop-minicluster -->
     <dependency>
-      <groupId>org.mockito</groupId>
-      <artifactId>mockito-core</artifactId>
-      <version>${hadoop.minicluster.mockito.version}</version>
-      <scope>test</scope>
+      <groupId>org.testcontainers</groupId>
+      <artifactId>testcontainers</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.geomesa.testcontainers</groupId>
+      <artifactId>testcontainers-accumulo</artifactId>
     </dependency>
   </dependencies>

22 changes: 3 additions & 19 deletions geomesa-fs/geomesa-fs-spark-runtime/src/test/scala/org/locationtech/geomesa/fs/spark/FileSystemRDDProviderTest.scala
@@ -9,13 +9,12 @@
 package org.locationtech.geomesa.fs.spark

 import com.typesafe.scalalogging.LazyLogging
-import org.apache.commons.io.FileUtils
-import org.apache.hadoop.hdfs.{HdfsConfiguration, MiniDFSCluster}
 import org.apache.spark.sql.{SQLContext, SparkSession}
 import org.geotools.api.data.{DataStore, DataStoreFinder, Transaction}
 import org.geotools.filter.text.ecql.ECQL
 import org.junit.runner.RunWith
 import org.locationtech.geomesa.features.ScalaSimpleFeature
+import org.locationtech.geomesa.fs.HadoopSharedCluster
 import org.locationtech.geomesa.spark.SparkSQLTestUtils
 import org.locationtech.geomesa.spark.sql.SQLTypes
 import org.locationtech.geomesa.utils.geotools.{FeatureUtils, SimpleFeatureTypes}
@@ -24,8 +23,6 @@ import org.locationtech.geomesa.utils.text.WKTUtils
 import org.specs2.mutable.Specification
 import org.specs2.runner.JUnitRunner

-import java.nio.file.{Files, Path}
-

 @RunWith(classOf[JUnitRunner])
 class FileSystemRDDProviderTest extends Specification with LazyLogging {
@@ -36,26 +33,16 @@ class FileSystemRDDProviderTest extends Specification with LazyLogging {

   sequential

-  val tempDir: Path = Files.createTempDirectory("fsSparkTest")
-
-  var cluster: MiniDFSCluster = _
-  var directory: String = _
-
   var spark: SparkSession = _
   var sc: SQLContext = _

-  lazy val params = Map("fs.path" -> directory)
+  lazy val path = s"${HadoopSharedCluster.Container.getHdfsUrl}/${getClass.getSimpleName}/"
+  lazy val params = Map("fs.path" -> path)
   lazy val ds: DataStore = DataStoreFinder.getDataStore(params.asJava)

   val formats = Seq("orc", "parquet")

   step {
-    // Start MiniCluster
-    val conf = new HdfsConfiguration()
-    conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, tempDir.toFile.getAbsolutePath)
-    cluster = new MiniDFSCluster.Builder(conf).build()
-    directory = cluster.getURI + "/data/chicago"
-
     formats.foreach { format =>
       val sft = SimpleFeatureTypes.createType(format,
         "arrest:String,case_number:Int:index=full:cardinality=high,dtg:Date,*geom:Point:srid=4326")
@@ -216,8 +203,5 @@

   step {
     ds.dispose()
-    // Stop MiniCluster
-    cluster.shutdown()
-    FileUtils.deleteDirectory(tempDir.toFile)
   }
 }
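FileSystemRDDProviderTest only needs an HDFS path, but a test that needs full Hadoop settings could rebuild a Configuration from ContainerConfig the same way HadoopSharedCluster itself constructs it; a sketch under that assumption:

```scala
import java.io.ByteArrayInputStream
import java.nio.charset.StandardCharsets

import org.apache.hadoop.conf.Configuration
import org.locationtech.geomesa.fs.HadoopSharedCluster

object SharedClusterConf {
  // rebuild a Configuration from the shared cluster's serialized XML,
  // mirroring the parsing done in HadoopSharedCluster.ContainerConfig
  lazy val conf: Configuration = {
    val c = new Configuration(false)
    c.addResource(
      new ByteArrayInputStream(HadoopSharedCluster.ContainerConfig.getBytes(StandardCharsets.UTF_8)),
      "shared-cluster")
    c
  }
}
```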
20 changes: 11 additions & 9 deletions geomesa-fs/geomesa-fs-tools/pom.xml
@@ -47,6 +47,10 @@
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-mapreduce-client-core</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
+    </dependency>

     <dependency>
       <groupId>org.slf4j</groupId>
@@ -69,19 +73,17 @@
       <artifactId>specs2-junit_${scala.binary.version}</artifactId>
     </dependency>
     <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-mapreduce-client-common</artifactId>
+      <groupId>org.locationtech.geomesa</groupId>
+      <artifactId>geomesa-fs-datastore_${scala.binary.version}</artifactId>
+      <classifier>tests</classifier>
     </dependency>
     <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-minicluster</artifactId>
+      <groupId>org.testcontainers</groupId>
+      <artifactId>testcontainers</artifactId>
     </dependency>
-    <!-- used by hadoop-minicluster -->
     <dependency>
-      <groupId>org.mockito</groupId>
-      <artifactId>mockito-core</artifactId>
-      <version>${hadoop.minicluster.mockito.version}</version>
-      <scope>test</scope>
+      <groupId>org.geomesa.testcontainers</groupId>
+      <artifactId>testcontainers-accumulo</artifactId>
     </dependency>
   </dependencies>

