diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 83a63d85..f6dee466 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -32,4 +32,50 @@ jobs:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
       - name: Run Tests
-        run: docker-compose run conceptql
\ No newline at end of file
+        run: docker-compose run conceptql
+  Run-Spark-Tests:
+    strategy:
+      matrix:
+        include:
+          - vocab: gdm
+          - vocab: ohdsi
+    runs-on: ubuntu-22.04
+    env:
+      SPARK_VERSION: 3.5.0
+      CONCEPTQL_DATA_MODEL: gdm
+    steps:
+      - uses: actions/checkout@v3
+      - uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: 3.2
+          bundler-cache: true
+
+      - uses: actions/cache@v3
+        with:
+          path: ~/spark
+          key: spark-${{ env.SPARK_VERSION }}
+        id: cache-spark
+      - name: Download Spark
+        if: steps.cache-spark.outputs.cache-hit != 'true'
+        run: |
+          wget -q https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz
+          tar xzf spark-$SPARK_VERSION-bin-hadoop3.tgz
+          mv spark-$SPARK_VERSION-bin-hadoop3 ~/spark
+
+      - uses: actions/cache@v3
+        with:
+          path: /tmp/synpuf_test_data
+          key: synpuf-test-data
+        id: cache-synpuf-test-data
+      - name: Download Data
+        if: steps.cache-synpuf-test-data.outputs.cache-hit != 'true'
+        run: |
+          cd /tmp
+          curl -sSL "https://www.dropbox.com/scl/fi/hha5zjm9d5ezkk8bfvtnc/synpuf_test_data.tgz?rlkey=lythw2s6342609ave66cam2ms&dl=1" > synpuf_test_data.tgz
+          tar xzf synpuf_test_data.tgz
+
+      - run: ~/spark/sbin/start-thriftserver.sh --driver-memory 5G && sleep 20
+      - run: bundle exec ruby test/all.rb
+        env:
+          CONCEPTQL_PARQUET_TEST_DIR: /tmp/synpuf_test_data/${{ matrix.vocab }}
+          SEQUELIZER_URL: hexspace://localhost:10000/default
\ No newline at end of file
diff --git a/Gemfile b/Gemfile
index f1b19781..b01040e7 100644
--- a/Gemfile
+++ b/Gemfile
@@ -3,7 +3,7 @@ source 'https://rubygems.org'
 # Specify your gem's dependencies in conceptql.gemspec
 gemspec
 gem "pg"
-gem "sequel-hexspace", path: "../sequel-hexspace"
+gem "sequel-hexspace", github: "outcomesinsights/sequel-hexspace"
 
 group :test, :development do
   gem "nokogiri"
diff --git a/Gemfile.lock b/Gemfile.lock
index a28a8b25..ec9cfd1f 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,5 +1,6 @@
-PATH
-  remote: ../sequel-hexspace
+GIT
+  remote: https://github.com/outcomesinsights/sequel-hexspace.git
+  revision: a6cddba0a45283581a8bf3e1a4580a146ea4eb1c
   specs:
     sequel-hexspace (1.0.0)
       hexspace
diff --git a/dockers/standard/Dockerfile b/dockers/standard/Dockerfile
index 7d2ebc04..9e5058ea 100644
--- a/dockers/standard/Dockerfile
+++ b/dockers/standard/Dockerfile
@@ -1,4 +1,4 @@
-FROM ruby:2.7-slim
+FROM ruby:3.2-slim-bullseye
 
 ENV PATH="/root/.local/bin:${PATH}"
 
diff --git a/lib/conceptql/spark_prepper.rb b/lib/conceptql/spark_prepper.rb
index aa90fce1..66b51bbf 100644
--- a/lib/conceptql/spark_prepper.rb
+++ b/lib/conceptql/spark_prepper.rb
@@ -20,5 +20,9 @@ def prep
         db.create_view(table_name, temp: true, if_not_exists: true, using: 'org.apache.spark.sql.parquet', options: { path: parquet_file.expand_path })
       end
     end
+    if ENV["CI"].present?
+      # Broadcast joins are running out of memory in GitHub Actions
+      #db.run("SET spark.sql.autoBroadcastJoinThreshold=6134169")
+    end
   end
 end
\ No newline at end of file