name: warehouse-run-data-pipeline
env:
X_GITHUB_GRAPHQL_API: ${{ vars.X_GITHUB_GRAPHQL_API }}
X_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CLOUDQUERY_VERSION: 5.5.0
CLOUDQUERY_FILE_DIRECTORY: /tmp/cloudquery
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CLOUDSQL_DB_USER: ${{ secrets.CLOUDSQL_DB_USER }}
CLOUDSQL_DB_PASSWORD: ${{ secrets.CLOUDSQL_DB_PASSWORD }}
CLOUDSQL_DB_NAME: ${{ vars.CLOUDSQL_DB_NAME }}
CLOUDSQL_REGION: ${{ vars.CLOUDSQL_REGION }}
CLOUDSQL_INSTANCE_ID: ${{ vars.CLOUDSQL_INSTANCE_ID }}
GOOGLE_PROJECT_ID: ${{ vars.GOOGLE_PROJECT_ID }}
CLOUDSTORAGE_BUCKET_NAME: ${{ vars.CLOUDSTORAGE_BUCKET_NAME }}
BIGQUERY_DATASET_ID: ${{ vars.BIGQUERY_DATASET_ID }}
# For now this only runs on a schedule once a day. Once we have made some of the
# plugin workflows more incremental we will run this on _every_ commit to main
# (see the commented-out push trigger sketched below)
on:
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
inputs:
docker_tag:
description: The docker tag to use for cloudquery plugins (only)
skip_cloudquery_plugins:
description: Skip CloudQuery plugins (run dbt only)
default: 'false'
required: false
schedule:
# Schedule every day at 2AM UTC. This is so we can be sure that anything
# committed daily has finished writing from whatever data source. This likely
# won't be necessary in the future once we do everything incrementally
- cron: '0 2 * * *'
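# A sketch of the eventual push trigger mentioned above (not enabled yet; assumes
# main is the default branch):
# push:
#   branches:
#     - main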
jobs:
warehouse-run-data-pipeline:
name: warehouse-run-data-pipeline
environment: indexer
runs-on: ubuntu-latest
permissions:
contents: 'read'
id-token: 'write'
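# Use the manually supplied docker_tag if given, otherwise fall back to the commit
# SHA (the && / || expression emulates a ternary)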
env:
DOCKER_TAG: ${{ inputs.docker_tag != '' && inputs.docker_tag || github.sha }}
steps:
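# A shallow clone is enough; the pipeline only needs the files at HEAD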
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 1
- name: 'Login to GitHub Container Registry'
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: "Setup Python, Poetry and Dependencies"
uses: packetcoders/action-setup-cache-python-poetry@main
with:
python-version: 3.12
poetry-version: 1.7.1
- name: Run poetry install
run: |
poetry install
# At this time this auth isn't working for dbt
# - uses: 'google-github-actions/auth@v2'
# with:
# service_account: oso-github-actions@oso-production.iam.gserviceaccount.com
# workload_identity_provider: projects/1054148520225/locations/global/workloadIdentityPools/github/providers/oso-github-actions
# create_credentials_file: true
# access_token_lifetime: 3600s
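# For now we authenticate with a service-account key instead. create_credentials_file
# writes the key to disk and exports GOOGLE_APPLICATION_CREDENTIALS, which the dbt
# profile setup below relies on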
- uses: 'google-github-actions/auth@v2'
with:
credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_JSON }}'
create_credentials_file: true
- name: 'Set up Cloud SDK'
uses: 'google-github-actions/setup-gcloud@v2'
with:
version: '>= 363.0.0'
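# The cloudquery steps below only run when skip_cloudquery_plugins is not 'true'.
# On scheduled runs the inputs context is empty, so they always run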
- name: Download and install cloudquery
if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
run: |
curl -L https://github.com/cloudquery/cloudquery/releases/download/cli-v${CLOUDQUERY_VERSION}/cloudquery_linux_amd64 -o /tmp/cloudquery &&
chmod a+x /tmp/cloudquery &&
mv /tmp/cloudquery /usr/local/bin/cloudquery
# For now this is a bit of a hack for the oss-directory plugins, as the output from one
# plugin is the input to another. Ideally we would simply tell the system what to run
# and it would handle the dependencies
- name: Run cloudquery for oss-directory
if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
run: |
cloudquery sync .github/workflows/cloudquery/oss-directory.yml --log-level debug --log-console
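# The file destination may shard its output across several JSON files, so the next
# step concatenates them into a single projects.json for the downstream plugin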
- name: Concat the project jsonl files (if there are many)
if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
run: |
ls -laht ${CLOUDQUERY_FILE_DIRECTORY}/ &&
find ${CLOUDQUERY_FILE_DIRECTORY}/projects_ossd -name "*.json" -type f -exec cat {} \; > ${CLOUDQUERY_FILE_DIRECTORY}/projects.json &&
head -n 5 ${CLOUDQUERY_FILE_DIRECTORY}/projects.json
- uses: actions/upload-artifact@v4
if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
with:
name: projects.json
path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/projects.json
# This is currently a punt on how to run this properly, because one cloudquery
# plugin's output can't be used directly as the input to a different one.
# We start the github-resolve-repos container with a volume that can access the
# project file and use that file as the plugin's input.
# Ideally we'd either have a plugin that can act as both a destination and a source
# (so we could chain multiple plugins), or we'd use something else that can achieve
# a similar thing
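# The container below serves the source plugin over gRPC on 0.0.0.0:7777; the volume
# mount lets it read the projects.json produced above, and its logs are captured as a
# build artifact for debugging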
- name: Run cloudquery for github-resolve-directory
if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
run: |
docker run -d --rm -p 7777:7777 \
-v ${CLOUDQUERY_FILE_DIRECTORY}:${CLOUDQUERY_FILE_DIRECTORY} \
--name github-resolve-repos \
ghcr.io/opensource-observer/cloudquery-github-resolve-repos:${DOCKER_TAG} \
serve --address 0.0.0.0:7777 &&
cloudquery sync .github/workflows/cloudquery/github-resolve-repos.yml --log-level debug --log-console &&
docker logs github-resolve-repos 2>&1 | tee ${CLOUDQUERY_FILE_DIRECTORY}/github-resolve-repos.log &&
docker stop github-resolve-repos
- uses: actions/upload-artifact@v4
if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
with:
name: github-resolve-repos.log
path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/github-resolve-repos.log
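# create-dbt-profile.sh presumably generates a profiles.yml whose targets point at
# BigQuery using the service-account credentials file from the auth step (the path is
# passed in via GOOGLE_APPLICATION_CREDENTIALS)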
- name: Setup dbt
run: |
bash .github/scripts/create-dbt-profile.sh ${GOOGLE_APPLICATION_CREDENTIALS} &&
gcloud auth list
- name: Run dbt for production
run: |
poetry run dbt run --target production
- name: Run dbt for the base_playground
run: |
poetry run dbt run --target base_playground
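# PLAYGROUND_DAYS presumably limits the playground models to the most recent 30 days
# of data (an assumption based on the variable name); --full-refresh rebuilds any
# incremental models from scratch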
- name: Run dbt for the playground
run: |
poetry run dbt run --target playground --full-refresh
env:
PLAYGROUND_DAYS: 30
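# bq2cloudsql is a script exposed through Poetry; it copies the dbt production tables
# from BigQuery into Cloud SQL, presumably using the CLOUDSQL_* and
# CLOUDSTORAGE_BUCKET_NAME settings defined at the top (assumption)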
- name: Copy the bigquery tables to cloudsql
run: |
poetry run bq2cloudsql
env:
DBT_TARGET: production