name: warehouse-run-data-pipeline
env:
X_GITHUB_GRAPHQL_API: ${{ vars.X_GITHUB_GRAPHQL_API }}
X_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CLOUDQUERY_VERSION: 5.5.0
CLOUDQUERY_FILE_DIRECTORY: /tmp/cloudquery
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CLOUDSQL_DB_USER: ${{ secrets.CLOUDSQL_DB_USER }}
CLOUDSQL_DB_PASSWORD: ${{ secrets.CLOUDSQL_DB_PASSWORD }}
CLOUDSQL_DB_NAME: ${{ vars.CLOUDSQL_DB_NAME }}
CLOUDSQL_REGION: ${{ vars.CLOUDSQL_REGION }}
CLOUDSQL_INSTANCE_ID: ${{ vars.CLOUDSQL_INSTANCE_ID }}
GOOGLE_PROJECT_ID: ${{ vars.GOOGLE_PROJECT_ID }}
CLOUDSTORAGE_BUCKET_NAME: ${{ vars.CLOUDSTORAGE_BUCKET_NAME }}
BIGQUERY_DATASET_ID: ${{ vars.BIGQUERY_DATASET_ID }}
# For now this only runs on a schedule once a day. Once we have made some of the
# plugin workflows more incremental we will run this on _every_ commit to main
# (see the commented-out push trigger sketched below)
on:
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
inputs:
docker_tag:
description: The docker tag to use for cloudquery plugins (only)
skip_cloudquery_plugins:
description: Skip CloudQuery plugins (run dbt only)
default: 'false'
required: false
schedule:
# Schedule every day at 2AM UTC. This is so we can be sure that anything
# committed daily has finished writing from whatever data source. This likely
# won't be necessary in the future once we do everything incrementally
- cron: '0 2 * * *'
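# A sketch of the eventual push trigger mentioned above (not enabled yet; assumes
# main is the default branch):
# push:
#   branches:
#     - main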
jobs:
warehouse-run-data-pipeline:
name: warehouse-run-data-pipeline
environment: indexer
runs-on: ubuntu-latest
permissions:
contents: 'read'
id-token: 'write'
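# Use the manually supplied docker_tag if given, otherwise fall back to the commit
# SHA (the && / || expression emulates a ternary)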
env:
DOCKER_TAG: ${{ inputs.docker_tag != '' && inputs.docker_tag || github.sha }}
steps:
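# A shallow clone is enough; the pipeline only needs the files at HEAD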
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 1
- name: 'Login to GitHub Container Registry'
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: "Setup Python, Poetry and Dependencies"
uses: packetcoders/action-setup-cache-python-poetry@main
with:
python-version: 3.12
poetry-version: 1.7.1
- name: Run poetry install
run: |
poetry install
# At this time this auth isn't working for dbt
# - uses: 'google-github-actions/auth@v2'
# with:
# service_account: oso-github-actions@oso-production.iam.gserviceaccount.com
# workload_identity_provider: projects/1054148520225/locations/global/workloadIdentityPools/github/providers/oso-github-actions
# create_credentials_file: true
# access_token_lifetime: 3600s
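# For now we authenticate with a service-account key instead. create_credentials_file
# writes the key to disk and exports GOOGLE_APPLICATION_CREDENTIALS, which the dbt
# profile setup below relies on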
- uses: 'google-github-actions/auth@v2'
with:
credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_JSON }}'
create_credentials_file: true
- name: 'Set up Cloud SDK'
uses: 'google-github-actions/setup-gcloud@v2'
with:
version: '>= 363.0.0'
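# The cloudquery steps below only run when skip_cloudquery_plugins is not 'true'.
# On scheduled runs the inputs context is empty, so they always run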
- name: Download and install cloudquery
if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
run: |
curl -L https://github.com/cloudquery/cloudquery/releases/download/cli-v${CLOUDQUERY_VERSION}/cloudquery_linux_amd64 -o /tmp/cloudquery &&
chmod a+x /tmp/cloudquery &&
mv /tmp/cloudquery /usr/local/bin/cloudquery
# For now this is a bit of a hack for the oss-directory plugins, as the output from one
# plugin is the input to another. Ideally we would simply tell the system what to run
# and it would handle the dependencies
- name: Run cloudquery for oss-directory
if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
run: |
cloudquery sync .github/workflows/cloudquery/oss-directory.yml --log-level debug --log-console
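# The file destination may shard its output across several JSON files, so the next
# step concatenates them into a single projects.json for the downstream plugin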
- name: Concat the project jsonl files (if there are many)
if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
run: |
ls -laht ${CLOUDQUERY_FILE_DIRECTORY}/ &&
find ${CLOUDQUERY_FILE_DIRECTORY}/projects_ossd -name "*.json" -type f -exec cat {} \; > ${CLOUDQUERY_FILE_DIRECTORY}/projects.json &&
head -n 5 ${CLOUDQUERY_FILE_DIRECTORY}/projects.json
- uses: actions/upload-artifact@v4
if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
with:
name: projects.json
path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/projects.json
# This is currently a punt on how to run this properly, because one cloudquery
# plugin's output can't be used directly as the input to a different one.
# We start the github-resolve-repos container with a volume that can access the
# project file and use that file as the plugin's input.
# Ideally we'd either have a plugin that can act as both a destination and a source
# (so we could chain multiple plugins), or we'd use something else that can achieve
# a similar thing
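# The container below serves the source plugin over gRPC on 0.0.0.0:7777; the volume
# mount lets it read the projects.json produced above, and its logs are captured as a
# build artifact for debugging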
- name: Run cloudquery for github-resolve-directory
if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
run: |
docker run -d --rm -p 7777:7777 \
-v ${CLOUDQUERY_FILE_DIRECTORY}:${CLOUDQUERY_FILE_DIRECTORY} \
--name github-resolve-repos \
ghcr.io/opensource-observer/cloudquery-github-resolve-repos:${DOCKER_TAG} \
serve --address 0.0.0.0:7777 &&
cloudquery sync .github/workflows/cloudquery/github-resolve-repos.yml --log-level debug --log-console &&
docker logs github-resolve-repos 2>&1 | tee ${CLOUDQUERY_FILE_DIRECTORY}/github-resolve-repos.log &&
docker stop github-resolve-repos
- uses: actions/upload-artifact@v4
if: ${{ inputs.skip_cloudquery_plugins != 'true' }}
with:
name: github-resolve-repos.log
path: ${{ env.CLOUDQUERY_FILE_DIRECTORY }}/github-resolve-repos.log
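# create-dbt-profile.sh presumably generates a profiles.yml whose targets point at
# BigQuery using the service-account credentials file from the auth step (the path is
# passed in via GOOGLE_APPLICATION_CREDENTIALS)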
- name: Setup dbt
run: |
bash .github/scripts/create-dbt-profile.sh ${GOOGLE_APPLICATION_CREDENTIALS} &&
gcloud auth list
- name: Run dbt for production
run: |
poetry run dbt run --target production
- name: Run dbt for the base_playground
run: |
poetry run dbt run --target base_playground
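# PLAYGROUND_DAYS presumably limits the playground models to the most recent 30 days
# of data (an assumption based on the variable name); --full-refresh rebuilds any
# incremental models from scratch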
- name: Run dbt for the playground
run: |
poetry run dbt run --target playground --full-refresh
env:
PLAYGROUND_DAYS: 30
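# bq2cloudsql is a script exposed through Poetry; it copies the dbt production tables
# from BigQuery into Cloud SQL, presumably using the CLOUDSQL_* and
# CLOUDSTORAGE_BUCKET_NAME settings defined at the top (assumption)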
- name: Copy the bigquery tables to cloudsql
run: |
poetry run bq2cloudsql
env:
DBT_TARGET: production