lyft · achantavy · Feb 11, 2022 · Feb 9, 2022 · Feb 9, 2022 · Feb 9, 2022
diff --git a/cartography/intel/aws/s3.py b/cartography/intel/aws/s3.py
@@ -15,6 +15,7 @@
 from botocore.exceptions import EndpointConnectionError
 from policyuniverse.policy import Policy
 
+from cartography.util import merge_module_sync_metadata
 from cartography.util import run_analysis_job
 from cartography.util import run_cleanup_job
 from cartography.util import timeit
@@ -619,6 +620,13 @@ def load_s3_buckets(neo4j_session: neo4j.Session, data: Dict, current_aws_accoun
             AWS_ACCOUNT_ID=current_aws_account_id,
             aws_update_tag=aws_update_tag,
         )
+    merge_module_sync_metadata(
+        neo4j_session,
+        group_type='AWSAccount',
+        group_id=current_aws_account_id,
+        synced_type='S3Bucket',
+        update_tag=aws_update_tag,
+    )
 
 
 @timeit

diff --git a/cartography/util.py b/cartography/util.py
@@ -2,8 +2,10 @@
 import re
 import sys
 from functools import wraps
+from string import Template
 from typing import Dict
 from typing import Optional
+from typing import Union
 
 import botocore
 import neo4j
@@ -47,6 +49,40 @@ def run_cleanup_job(
     )
 
 
+def merge_module_sync_metadata(
+    neo4j_session: neo4j.Session,
+    group_type: str,
+    group_id: Union[str, int],
+    synced_type: str,
+    update_tag: int,
+) -> None:
+    '''
+    Create or update the `ModuleSyncMetadata` node for one (group_type, group_id, synced_type) combination.
+    The 'types' used here should be actual node labels. For example, if we did sync a particular AWSAccount's S3Buckets,
+    the `grouptype` is 'AWSAccount', the `groupid` is the particular account's `id`, and the `syncedtype` is 'S3Bucket'.
+    Values are bound as query parameters, never spliced into the Cypher text, so embedded quotes cannot break it.
+
+    :param neo4j_session: Neo4j session object
+    :param group_type: The parent module's type
+    :param group_id: The parent module's id
+    :param synced_type: The sub-module's type
+    :param update_tag: Timestamp used to determine data freshness
+    '''
+    query = """
+        MERGE (n:ModuleSyncMetadata{id: {node_id}})
+        ON CREATE SET
+            n:SyncMetadata, n.firstseen=timestamp()
+        SET n.syncedtype={synced_type},
+            n.grouptype={group_type},
+            n.groupid={group_id},
+            n.lastupdated={UPDATE_TAG}
+    """
+    neo4j_session.run(
+        query, node_id=f'{group_type}_{group_id}_{synced_type}',
+        group_type=group_type, group_id=group_id, synced_type=synced_type, UPDATE_TAG=update_tag,
+    )
+
+
 def load_resource_binary(package, resource_name):
     return open_binary(package, resource_name)
 

diff --git a/docs/schema/README.md b/docs/schema/README.md
@@ -56,5 +56,8 @@
 ## Kubernetes
 - Click [here](kubernetes.md)
 
+## SyncMetadata
+- Click [here](syncmetadata.md)
+
 ## More to come!
 👍
diff --git a/docs/schema/syncmetadata.md b/docs/schema/syncmetadata.md
@@ -0,0 +1,17 @@
+## SyncMetadata
+
+SyncMetadata nodes are created by sync jobs to convey information about the job itself. See this doc for how this is
+used.
+
+## SyncMetadata:ModuleSyncMetadata
+
+This is a node to represent some metadata about the sync job of a particular module or sub-module. Its existence should suggest that a particular sync job did happen.
+The 'types' used here should be actual node labels. For example, if we did sync a particular AWSAccount's S3Buckets,
+the `grouptype` is 'AWSAccount', the `groupid` is the particular account's `id`, and the `syncedtype` is 'S3Bucket'.
+
+| Field | Description | Source|
+|-------|-------------|------|
+|**id**|`{group_type}_{group_id}_{synced_type}`|util.py|
+|grouptype| The parent module's type |util.py|
+|groupid|The parent module's id|util.py|
+|syncedtype|The sub-module's type|util.py|
diff --git a/tests/integration/cartography/data/jobs/test_syntax.py b/tests/integration/cartography/data/jobs/test_syntax.py
@@ -11,9 +11,9 @@
 
 def test_analysis_jobs_cypher_syntax(neo4j_session):
     parameters = {
-        'AWS_ID': None,
-        'UPDATE_TAG': None,
-        'OKTA_ORG_ID': None,
+        'AWS_ID': 'my_aws_account_id',
+        'UPDATE_TAG': 'my_update_tag',
+        'OKTA_ORG_ID': 'my_okta_org_id',
     }
 
     for job_name in contents('cartography.data.jobs.analysis'):

diff --git a/tests/integration/cartography/intel/aws/test_s3.py b/tests/integration/cartography/intel/aws/test_s3.py
@@ -48,6 +48,43 @@ def test_load_s3_buckets(neo4j_session, *args):
     assert actual_nodes == expected_nodes
 
 
+def test_load_s3_buckets_sync_metadata(neo4j_session, *args):
+    """
+    Ensure that loading S3 buckets also writes the expected ModuleSyncMetadata node.
+    """
+    # Arrange
+    bucket_data = tests.data.aws.s3.LIST_BUCKETS
+    expected_nodes = {
+        (
+            f'AWSAccount_{TEST_ACCOUNT_ID}_S3Bucket',
+            'AWSAccount',
+            TEST_ACCOUNT_ID,
+            'S3Bucket',
+            TEST_UPDATE_TAG,
+        ),
+    }
+
+    # Act
+    cartography.intel.aws.s3.load_s3_buckets(neo4j_session, bucket_data, TEST_ACCOUNT_ID, TEST_UPDATE_TAG)
+    result = neo4j_session.run(f"""
+        MATCH (m:ModuleSyncMetadata{{id:'AWSAccount_{TEST_ACCOUNT_ID}_S3Bucket'}})
+        RETURN
+            m.id,
+            m.syncedtype,
+            m.grouptype,
+            m.groupid,
+            m.lastupdated
+    """)
+
+    # Assert: tuple order mirrors the order used in expected_nodes, not the RETURN clause.
+    returned_fields = ('m.id', 'm.grouptype', 'm.groupid', 'm.syncedtype', 'm.lastupdated')
+    actual_nodes = {
+        tuple(record[field] for field in returned_fields)
+        for record in result
+    }
+    assert actual_nodes == expected_nodes
+
+
 def test_load_s3_encryption(neo4j_session, *args):
     """
     Ensure that expected bucket gets loaded with their encryption fields.