mozilla · KevinMind · Mar 14, 2025 · Mar 13, 2025 · Mar 14, 2025 · Mar 14, 2025
diff --git a/.github/workflows/health_check.yml b/.github/workflows/health_check.yml
@@ -9,6 +9,10 @@ on:
     # Every 5 minutes
     - cron: '*/5 * * * *'
 
+# File names shared by the steps below through the `env` context.
+env:
+  # Passed as --output to scripts/health_check.py
+  health_check_file: health_check.json
+  # Passed as --output to scripts/health_check_blocks.py
+  health_check_blocks_file: health_check_blocks.json
+
 jobs:
   health_check:
     runs-on: ubuntu-latest
@@ -20,11 +24,47 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - uses: ./.github/actions/run-docker
+      - name: Run health check
+        id: health_check
+        # Do not fail the job at this step: the following steps branch on
+        # steps.health_check.outcome to report a failure.
+        continue-on-error: true
+        uses: ./.github/actions/run-docker
         with:
           target: development
           version: local
-          run: ./scripts/health_check.py --env ${{ matrix.environment }} --verbose
+          run: |
+            ./scripts/health_check.py \
+            --env ${{ matrix.environment }} \
+            --verbose \
+            --output ${{ env.health_check_file }}
 
+      - name: Set message blocks
+        id: blocks
+        # Only runs when the health check step reported a failure.
+        if: steps.health_check.outcome == 'failure'
+        shell: bash
+        run: |
+          # Create the message blocks file
+          ./scripts/health_check_blocks.py \
+          --input "${{ env.health_check_file }}" \
+          --output "${{ env.health_check_blocks_file }}"
+          # Multiline output needs to use a delimiter to be passed to
+          # the GITHUB_OUTPUT file. Every expansion is quoted so the JSON
+          # is written verbatim (no word splitting or glob expansion).
+          blocks=$(cat "${{ env.health_check_blocks_file }}")
+          printf 'blocks<<EOF\n%s\nEOF\n' "$blocks" >> "$GITHUB_OUTPUT"
+          cat "$GITHUB_OUTPUT"
 
+      - uses: mozilla/addons/.github/actions/slack@main
+        # Notify only for cron-triggered runs whose health check failed.
+        # GitHub reports cron triggers with the event name `schedule`.
+        if: |
+          github.event_name == 'schedule' &&
+          steps.health_check.outcome == 'failure'
+        with:
+          slack_token: ${{ secrets.SLACK_TOKEN }}
+          # The payload is kept as strict JSON (no comments, no trailing
+          # comma). The unfurl_* flags disable link and media previews.
+          payload: |
+            {
+              "channel": "${{ secrets.SLACK_ADDONS_PRODUCTION_CHANNEL }}",
+              "blocks": ${{ toJson(steps.blocks.outputs.blocks) }},
+              "text": "Health check failed",
+              "unfurl_links": false,
+              "unfurl_media": false
+            }
 
diff --git a/.github/workflows/health_check_completed.yml b/.github/workflows/health_check_completed.yml
@@ -0,0 +1,45 @@
+name: Health Check Completed
+
+# Runs after a run of the workflow named "Health Check" has completed.
+on:
+  workflow_run:
+    workflows: Health Check
+    types: [completed]
+
+jobs:
+  # Checks out the repository and runs the local context action.
+  context:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Context
+        id: context
+        uses: ./.github/actions/context
+
+  # Posts a Slack notification when a cron-triggered Health Check run failed.
+  health_check_failure_notification:
+    # `workflow_run.event` carries the triggering run's event name, which is
+    # `schedule` for cron triggers.
+    if: |
+      github.event.workflow_run.event == 'schedule' &&
+      github.event.workflow_run.conclusion == 'failure'
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Context
+        id: context
+        uses: ./.github/actions/context
+
+      - uses: mozilla/addons/.github/actions/slack-workflow-notification@main
+        with:
+          slack_token: ${{ secrets.SLACK_TOKEN }}
+          slack_channel: ${{ secrets.SLACK_ADDONS_PRODUCTION_CHANNEL }}
+          emoji: ':x:'
+          actor: ${{ vars.slack_actor }}
+          conclusion: ${{ github.event.workflow_run.conclusion }}
+          workflow_id: ${{ github.event.workflow_run.id }}
+          workflow_url: ${{ github.event.workflow_run.url }}
+          event: ${{ github.event.workflow_run.event }}
+          env: ci
+          ref: ''
+          ref_link: ''
+
+
diff --git a/scripts/health_check.py b/scripts/health_check.py
@@ -25,7 +25,7 @@ def __init__(self, env: ENV_ENUM, verbose: bool = False):
         self.environment = ENV_ENUM[env]
         self.verbose = verbose
 
-    def _fetch(self, path: str) -> dict[str, str] | None:
+    def _fetch(self, path: str):
         url = f'{self.environment.value}/{path}'
         if self.verbose:
             print(f'Requesting {url} for {self.environment.name}')
@@ -47,10 +47,13 @@ def _fetch(self, path: str) -> dict[str, str] | None:
                     }
                 )
 
-        if self.verbose and data is not None:
+        if data is None:
+            return {}
+
+        if self.verbose:
             print(json.dumps(data, indent=2))
 
-        return data
+        return {'url': url, 'data': data}
 
     def version(self):
         return self._fetch('__version__')
@@ -62,49 +65,58 @@ def monitors(self):
         return self._fetch('services/__heartbeat__')
 
 
-def main(env: ENV_ENUM, verbose: bool = False):
+def main(env: ENV_ENUM, verbose: bool, retries: int = 0, attempt: int = 0):
+    """Fetch the health endpoints for ``env`` and report whether any monitor fails.
+
+    The version, heartbeat and monitors endpoints are fetched on every call.
+    Only heartbeat and monitors entries are inspected for ``state is False``.
+    While failures are found and ``attempt < retries``, the function sleeps
+    ``2**attempt`` seconds and calls itself again with ``attempt + 1``.
+
+    Returns:
+        A ``(results, has_failures)`` tuple, where ``results`` maps
+        ``'version'``, ``'heartbeat'`` and ``'monitors'`` to their fetch results.
+    """
     fetcher = Fetcher(env, verbose)
 
     version_data = fetcher.version()
     heartbeat_data = fetcher.heartbeat()
     monitors_data = fetcher.monitors()
 
-    if version_data is None:
-        raise ValueError('Error fetching version data')
+    # Version data is reported but not part of the failure check.
+    combined_data = {
+        'heartbeat': heartbeat_data,
+        'monitors': monitors_data,
+    }
 
-    if heartbeat_data is None:
-        raise ValueError('Error fetching heartbeat data')
+    # A fetch result without a 'data' key contributes no monitors here.
+    has_failures = any(
+        monitor['state'] is False
+        for data in combined_data.values()
+        for monitor in data.get('data', {}).values()
+    )
 
-    if monitors_data is None:
-        raise ValueError('Error fetching monitors data')
+    if has_failures and attempt < retries:
+        # Exponential backoff: 1, 2, 4, ... seconds between attempts.
+        wait_for = 2**attempt
+        if verbose:
+            print(f'waiting for {wait_for} seconds')
+        time.sleep(wait_for)
+        return main(env, verbose, retries, attempt + 1)
 
-    combined_data = {**heartbeat_data, **monitors_data}
-    failing_monitors = [
-        name for name, monitor in combined_data.items() if monitor['state'] is False
-    ]
+    results = {
+        'version': version_data,
+        'heartbeat': heartbeat_data,
+        'monitors': monitors_data,
+    }
 
-    if len(failing_monitors) > 0:
-        raise ValueError(f'Some monitors are failing {failing_monitors}')
+    return results, has_failures
 
 
 if __name__ == '__main__':
     args = argparse.ArgumentParser()
     args.add_argument(
         '--env', type=str, choices=list(ENV_ENUM.__members__.keys()), required=True
     )
+    # Optional path of a JSON file that receives the collected results.
+    args.add_argument('--output', type=str)
     args.add_argument('--verbose', action='store_true')
     args.add_argument('--retries', type=int, default=3)
     args = args.parse_args()
 
-    attempt = 1
+    # Retrying is handled inside main().
+    data, has_failures = main(args.env, args.verbose, args.retries)
 
-    while attempt <= args.retries:
-        try:
-            main(args.env, args.verbose)
-            break
-        except Exception as e:
-            print(f'Error: {e}')
-            if attempt == args.retries:
-                raise
-            time.sleep(2**attempt)
-            attempt += 1
+    # Write the results before raising so the file exists even on failure.
+    if args.output:
+        with open(args.output, 'w') as f:
+            json_data = json.dumps(data, indent=2)
+            f.write(json_data)
+        if args.verbose:
+            print(f'Health check data saved to {args.output}')
+
+    # An uncaught exception ends the script with a traceback.
+    if has_failures:
+        raise ValueError(f'Health check failed: {data}')
diff --git a/scripts/health_check_blocks.py b/scripts/health_check_blocks.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+
+
+def format_monitors(data: dict, source: str):
+    """Build a Slack ``rich_text`` block listing the failing monitors of a source.
+
+    Args:
+        data: Fetch result for the source. Monitors are read from its ``data``
+            key as a mapping of name -> monitor dict with ``state`` and
+            ``status`` keys. A result without ``data`` (for example an empty
+            dict) is treated as having no monitors.
+        source: Label used for the section heading (capitalized in the output).
+
+    Returns:
+        A ``rich_text`` block dict when at least one monitor has a falsy
+        ``state``; otherwise ``None``.
+    """
+    # An empty fetch result has no 'data' key; do not raise KeyError for it.
+    monitors = data.get('data') or {}
+    failures = []
+
+    for name, monitor in monitors.items():
+        if not monitor['state']:
+            failures.append(
+                {
+                    'type': 'rich_text_section',
+                    'elements': [
+                        {
+                            'type': 'text',
+                            'text': f'{name}: ',
+                            'style': {
+                                'bold': True,
+                            },
+                        },
+                        {
+                            'type': 'text',
+                            'text': f'{monitor["status"]}',
+                        },
+                    ],
+                }
+            )
+
+    if failures:
+        return {
+            'type': 'rich_text',
+            'elements': [
+                {
+                    'type': 'rich_text_section',
+                    'elements': [
+                        {
+                            'type': 'text',
+                            'text': f'{source.capitalize()}:',
+                            'style': {
+                                'bold': True,
+                            },
+                        }
+                    ],
+                },
+                {
+                    'type': 'rich_text_list',
+                    'elements': failures,
+                    'style': 'bullet',
+                    'indent': 0,
+                    'border': 1,
+                },
+            ],
+        }
+
+    # No failing monitors: nothing to report for this source.
+    return None
+
+
+def format_context(data: dict):
+    """Build the Slack ``context`` block with version details and endpoint links.
+
+    Args:
+        data: Health check results keyed by source name. Version details are
+            read from ``data['version']['data']``; each result's ``url`` is
+            rendered as a link labelled with the capitalized source name.
+
+    Returns:
+        A ``context`` block dict whose elements are the non-empty ``version``,
+        ``commit`` and ``build`` values followed by one link per result that
+        has a URL.
+    """
+    version_data = data.get('version', {}).get('data', {})
+    version_elements = [
+        {'type': 'mrkdwn', 'text': f'{key.capitalize()}: {value} |'}
+        for key, value in version_data.items()
+        if value and key in ['version', 'commit', 'build']
+    ]
+    # A result without a URL (e.g. an empty fetch result) has nothing to link.
+    url_elements = [
+        {'type': 'mrkdwn', 'text': f'<{result["url"]}|{name.capitalize()}> |'}
+        for name, result in data.items()
+        if result.get('url')
+    ]
+    return {'type': 'context', 'elements': version_elements + url_elements}
+
+
+def format_header(emoji: str, text: str):
+    """Build the Slack ``rich_text`` header block of a health check alert.
+
+    Args:
+        emoji: Name of the emoji shown first.
+        text: Text shown after the bold ``Health Check Alert: `` label.
+
+    Returns:
+        A ``rich_text`` block dict with one section holding the emoji, the
+        bold label and ``text``.
+    """
+    return {
+        'type': 'rich_text',
+        'elements': [
+            {
+                'type': 'rich_text_section',
+                'elements': [
+                    {
+                        'type': 'emoji',
+                        'name': emoji,
+                    },
+                    {
+                        'type': 'text',
+                        'text': 'Health Check Alert: ',
+                        'style': {'bold': True},
+                    },
+                    {
+                        'type': 'text',
+                        'text': text,
+                    },
+                ],
+            }
+        ],
+    }
+
+
+def create_blocks(health_data: dict):
+    """Turn health check results into the list of Slack blocks for an alert.
+
+    Only the ``monitors`` and ``heartbeat`` entries are scanned for failures.
+    Returns an empty list when neither has a failing monitor; otherwise the
+    header, one section per failing source and the context block.
+    """
+    sections = []
+    for source, payload in health_data.items():
+        if source not in ['monitors', 'heartbeat']:
+            continue
+        section = format_monitors(payload, source)
+        if section:
+            sections.append(section)
+
+    if sections:
+        header = format_header('x', 'Issues Detected')
+        footer = format_context(health_data)
+        return [header, *sections, footer]
+
+    return []
+
+
+def main():
+    """Command-line entry point: convert a health check JSON file to Slack blocks.
+
+    Reads the file given by ``--input``, builds the blocks with
+    ``create_blocks`` and writes them as JSON to ``--output``. With
+    ``--verbose`` a line is printed after loading and after saving.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input', type=str, required=True)
+    parser.add_argument('--output', type=str, required=True)
+    parser.add_argument('--verbose', action='store_true')
+    options = parser.parse_args()
+
+    with open(options.input) as source_file:
+        health_data = json.loads(source_file.read())
+    if options.verbose:
+        print(f'Health data loaded from {options.input}')
+
+    blocks = create_blocks(health_data)
+    with open(options.output, 'w') as target_file:
+        json.dump(blocks, target_file)
+    if options.verbose:
+        print(f'Blocks saved to {options.output}')
+
+
+# Run the converter only when executed as a script.
+if __name__ == '__main__':
+    main()