lyft · achantavy · May 12, 2023 · May 5, 2023 · May 5, 2023 · May 12, 2023
diff --git a/README.md b/README.md
@@ -24,7 +24,7 @@ Start [here](https://lyft.github.io/cartography/install.html).
 - [Duo CRXcavator](https://lyft.github.io/cartography/modules/crxcavator/index.html) - Chrome extensions, GSuite users
 - [Oracle Cloud Infrastructure](docs/setup/config/oci.md) - IAM
 - [Okta](https://lyft.github.io/cartography/modules/okta/index.html) - users, groups, organizations, roles, applications, factors, trusted origins, reply URIs
-- [Github](https://lyft.github.io/cartography/modules/github/index.html) - repos, branches, users
+- [GitHub](https://lyft.github.io/cartography/modules/github/index.html) - repos, branches, users, teams
 - [DigitalOcean](https://lyft.github.io/cartography/modules/digitalocean/index.html)
 - [Microsoft Azure](https://lyft.github.io/cartography/modules/azure/index.html) -  CosmosDB, SQL, Storage, Virtual Machine
 - [Kubernetes](https://lyft.github.io/cartography/modules/kubernetes/index.html) - Cluster, Namespace, Service, Pod, Container

diff --git a/cartography/intel/github/__init__.py b/cartography/intel/github/__init__.py
@@ -6,6 +6,7 @@
 from requests import exceptions
 
 import cartography.intel.github.repos
+import cartography.intel.github.teams
 import cartography.intel.github.users
 from cartography.config import Config
 from cartography.util import timeit
@@ -46,5 +47,12 @@ def start_github_ingestion(neo4j_session: neo4j.Session, config: Config) -> None
                 auth_data['url'],
                 auth_data['name'],
             )
+            cartography.intel.github.teams.sync_github_teams(
+                neo4j_session,
+                common_job_parameters,
+                auth_data['token'],
+                auth_data['url'],
+                auth_data['name'],
+            )
         except exceptions.RequestException as e:
             logger.error("Could not complete request to the GitHub API: %s", e)
diff --git a/cartography/intel/github/repos.py b/cartography/intel/github/repos.py
@@ -101,8 +101,14 @@ def get(token: str, api_url: str, organization: str) -> List[Dict]:
     :return: A list of dicts representing repos. See tests.data.github.repos for data shape.
     """
     # TODO: link the Github organization to the repositories
-    repos, _ = fetch_all(token, api_url, organization, GITHUB_ORG_REPOS_PAGINATED_GRAPHQL, 'repositories', 'nodes')
-    return repos
+    repos, _ = fetch_all(
+        token,
+        api_url,
+        organization,
+        GITHUB_ORG_REPOS_PAGINATED_GRAPHQL,
+        'repositories',
+    )
+    return repos.nodes
 
 
 def transform(repos_json: List[Dict]) -> Dict:
@@ -539,8 +545,11 @@ def load_python_requirements(neo4j_session: neo4j.Session, update_tag: int, requ
 
 
 def sync(
-    neo4j_session: neo4j.Session, common_job_parameters: Dict, github_api_key: str, github_url: str,
-    organization: str,
+        neo4j_session: neo4j.Session,
+        common_job_parameters: Dict[str, Any],
+        github_api_key: str,
+        github_url: str,
+        organization: str,
 ) -> None:
     """
     Performs the sequential tasks to collect, transform, and sync github data
@@ -554,5 +563,6 @@ def sync(
     logger.info("Syncing GitHub repos")
     repos_json = get(github_api_key, github_url, organization)
     repo_data = transform(repos_json)
+    # `load` is called as (session, job parameters, repo data); its definition is untouched, so the call stays as-is.
     load(neo4j_session, common_job_parameters, repo_data)
     run_cleanup_job('github_repos_cleanup.json', neo4j_session, common_job_parameters)
diff --git a/cartography/intel/github/teams.py b/cartography/intel/github/teams.py
@@ -0,0 +1,175 @@
+import logging
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Tuple
+
+import neo4j
+
+from cartography.client.core.tx import load
+from cartography.graph.job import GraphJob
+from cartography.intel.github.util import fetch_all
+from cartography.intel.github.util import PaginatedGraphqlData
+from cartography.models.github.teams import GitHubTeamSchema
+from cartography.util import timeit
+
+logger = logging.getLogger(__name__)
+
+
+@timeit
+def get_teams(org: str, api_url: str, token: str) -> Tuple[PaginatedGraphqlData, Dict[str, Any]]:
+    org_teams_gql = """
+        query($login: String!, $cursor: String) {
+            organization(login: $login) {
+                login
+                url
+                teams(first:100, after: $cursor) {
+                    nodes {
+                        slug
+                        url
+                        description
+                        repositories(first: 100) {
+                            totalCount
+                        }
+                    }
+                    pageInfo{
+                        endCursor
+                        hasNextPage
+                    }
+                }
+            }
+        }
+    """
+    return fetch_all(token, api_url, org, org_teams_gql, 'teams')
+
+
+@timeit
+def _get_team_repos_for_multiple_teams(
+        team_raw_data: List[Dict[str, Any]],
+        org: str,
+        api_url: str,
+        token: str,
+) -> Dict[str, Any]:
+    result = {}
+    for team in team_raw_data:
+        team_name = team['slug']
+        repo_count = team['repositories']['totalCount']
+
+        team_repos = _get_team_repos(org, api_url, token, team_name) if repo_count > 0 else None
+
+        # Shape = [(repo_url, 'WRITE'), ...]]
+        repo_urls = [t['url'] for t in team_repos.nodes] if team_repos else []
+        repo_permissions = [t['permission'] for t in team_repos.edges] if team_repos else []
+
+        result[team_name] = list(zip(repo_urls, repo_permissions))
+    return result
+
+
+@timeit
+def _get_team_repos(org: str, api_url: str, token: str, team: str) -> PaginatedGraphqlData:
+    team_repos_gql = """
+    query($login: String!, $team: String!, $cursor: String) {
+        organization(login: $login) {
+            url
+            login
+            team(slug: $team) {
+                slug
+                repositories(first:100, after: $cursor) {
+                    edges {
+                        permission
+                    }
+                    nodes {
+                        url
+                    }
+                    pageInfo {
+                        endCursor
+                        hasNextPage
+                    }
+                }
+            }
+        }
+        rateLimit {
+            limit
+            cost
+            remaining
+            resetAt
+        }
+    }
+    """
+    team_repos, _ = fetch_all(
+        token,
+        api_url,
+        org,
+        team_repos_gql,
+        'team',
+        resource_inner_type='repositories',
+        team=team,
+    )
+    return team_repos
+
+
+def transform_teams(
+        team_paginated_data: PaginatedGraphqlData,
+        org_data: Dict[str, Any],
+        team_repo_data: Dict[str, Any],
+) -> List[Dict[str, Any]]:
+    result = []
+    for team in team_paginated_data.nodes:
+        team_name = team['slug']
+        repo_info = {
+            'name': team_name,
+            'url': team['url'],
+            'description': team['description'],
+            'repo_count': team['repositories']['totalCount'],
+            'org_url': org_data['url'],
+            'org_login': org_data['login'],
+        }
+        repo_permissions = team_repo_data[team_name]
+        if not repo_permissions:
+            result.append(repo_info)
+            continue
+
+        # `permission` can be one of ADMIN, READ, WRITE, TRIAGE, or MAINTAIN
+        for repo_url, permission in repo_permissions:
+            repo_info_copy = repo_info.copy()
+            repo_info_copy[permission] = repo_url
+            result.append(repo_info_copy)
+    return result
+
+
+@timeit
+def load_team_repos(
+        neo4j_session: neo4j.Session,
+        data: List[Dict[str, Any]],
+        update_tag: int,
+        organization_url: str,
+) -> None:
+    logger.info(f"Loading {len(data)} GitHub team-repos to the graph")
+    load(
+        neo4j_session,
+        GitHubTeamSchema(),
+        data,
+        lastupdated=update_tag,
+        org_url=organization_url,
+    )
+
+
+@timeit
+def cleanup(neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any]) -> None:
+    GraphJob.from_node_schema(GitHubTeamSchema(), common_job_parameters).run(neo4j_session)
+
+
+@timeit
+def sync_github_teams(
+        neo4j_session: neo4j.Session,
+        common_job_parameters: Dict[str, Any],
+        github_api_key: str,
+        github_url: str,
+        organization: str,
+) -> None:
+    teams_paginated, org_data = get_teams(organization, github_url, github_api_key)
+    team_repos = _get_team_repos_for_multiple_teams(teams_paginated.nodes, organization, github_url, github_api_key)
+    processed_data = transform_teams(teams_paginated, org_data, team_repos)
+    load_team_repos(neo4j_session, processed_data, common_job_parameters['UPDATE_TAG'], org_data['url'])
+    common_job_parameters['org_url'] = org_data['url']
+    cleanup(neo4j_session, common_job_parameters)
diff --git a/cartography/intel/github/users.py b/cartography/intel/github/users.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Any
 from typing import Dict
 from typing import List
 from typing import Tuple
@@ -55,8 +56,14 @@ def get(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
     :return: A 2-tuple containing 1. a list of dicts representing users - see tests.data.github.users.GITHUB_USER_DATA
     for shape, and 2. data on the owning GitHub organization - see tests.data.github.users.GITHUB_ORG_DATA for shape.
     """
-    users, org = fetch_all(token, api_url, organization, GITHUB_ORG_USERS_PAGINATED_GRAPHQL, 'membersWithRole', 'edges')
-    return users, org
+    users, org = fetch_all(
+        token,
+        api_url,
+        organization,
+        GITHUB_ORG_USERS_PAGINATED_GRAPHQL,
+        'membersWithRole',
+    )
+    return users.edges, org
 
 
 @timeit
@@ -99,8 +106,11 @@ def load_organization_users(
 
 @timeit
 def sync(
-    neo4j_session: neo4j.Session, common_job_parameters: Dict, github_api_key: str, github_url: str,
-    organization: str,
+        neo4j_session: neo4j.Session,
+        common_job_parameters: Dict[str, Any],
+        github_api_key: str,
+        github_url: str,
+        organization: str,
 ) -> None:
     logger.info("Syncing GitHub users")
     user_data, org_data = get(github_api_key, github_url, organization)

diff --git a/cartography/intel/github/util.py b/cartography/intel/github/util.py
@@ -1,8 +1,10 @@
 import json
 import logging
 import time
+from typing import Any
 from typing import Dict
 from typing import List
+from typing import NamedTuple
 from typing import Optional
 from typing import Tuple
 
@@ -13,6 +15,11 @@
 _TIMEOUT = (60, 60)
 
 
+class PaginatedGraphqlData(NamedTuple):  # Items that `fetch_all` accumulates across the pages of one resource.
+    nodes: List[Dict[str, Any]]  # Entries collected from each page's `nodes` field (empty if the query has none).
+    edges: List[Dict[str, Any]]  # Entries collected from each page's `edges` field (empty if the query has none).
+
+
 def call_github_api(query: str, variables: str, token: str, api_url: str) -> Dict:
     """
     Calls the GitHub v4 API and executes a query
@@ -44,7 +51,14 @@ def call_github_api(query: str, variables: str, token: str, api_url: str) -> Dic
     return response_json  # type: ignore
 
 
-def fetch_page(token: str, api_url: str, organization: str, query: str, cursor: Optional[str] = None) -> Dict:
+def fetch_page(
+        token: str,
+        api_url: str,
+        organization: str,
+        query: str,
+        cursor: Optional[str] = None,
+        **kwargs: Any,
+) -> Dict[str, Any]:
     """
     Return a single page of max size 100 elements from the Github api_url using the given `query` and `cursor` params.
     :param token: The API token as string. Must have permission for the object being paginated.
@@ -53,9 +67,11 @@ def fetch_page(token: str, api_url: str, organization: str, query: str, cursor:
     :param query: The GraphQL query, e.g. `GITHUB_ORG_USERS_PAGINATED_GRAPHQL`
     :param cursor: The GraphQL cursor string (behaves like a page number) for Github objects in the given
     organization. If None, the Github API will return the first page of repos.
+    :param kwargs: Other keyword args to add as key-value pairs to the GraphQL query.
     :return: The raw response object from the requests.get().json() call.
     """
     gql_vars = {
+        **kwargs,
         'login': organization,
         'cursor': cursor,
     }
@@ -65,8 +81,15 @@ def fetch_page(token: str, api_url: str, organization: str, query: str, cursor:
 
 
 def fetch_all(
-    token: str, api_url: str, organization: str, query: str, resource_type: str, field_name: str, retries: int = 5,
-) -> Tuple[List[Dict], Dict]:
+        token: str,
+        api_url: str,
+        organization: str,
+        query: str,
+        resource_type: str,
+        retries: int = 5,
+        resource_inner_type: Optional[str] = None,
+        **kwargs: Any,
+) -> Tuple[PaginatedGraphqlData, Dict[str, Any]]:
     """
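-    Fetch and return all data items of the given `resource_type` and `field_name` from Github's paginated GraphQL API as
-    a list, along with information on the organization that they belong to.
+    Fetch and return all `nodes` and `edges` items of the given `resource_type` from Github's paginated GraphQL API,
+    along with information on the organization that they belong to.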
@@ -77,19 +100,22 @@ def fetch_all(
     :param resource_type: The name of the paginated resource under the organization e.g. `membersWithRole` or
     `repositories`. See the fields under https://docs.github.com/en/graphql/reference/objects#organization for a full
     list.
-    :param field_name: The field name of the resource_type to append items from - this is usually "nodes" or "edges".
-    See the field list in https://docs.github.com/en/graphql/reference/objects#repositoryconnection for other examples.
     :param retries: Number of retries to perform.  Github APIs are often flakey and retrying the request helps.
+    :param resource_inner_type: Optional str. Default = None. Sometimes we need to paginate a field that is inside
+    `resource_type` - for example: organization['team']['repositories']. In this case, we specify 'repositories' as the
+    `resource_inner_type`.
+    :param kwargs: Additional key-value args (other than `login` and `cursor`) to pass to the GraphQL query variables.
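-    :return: A 2-tuple containing 1. A list of data items of the given `resource_type` and `field_name`,  and 2. a dict
-    containing the `url` and the `login` fields of the organization that the items belong to.
+    :return: A 2-tuple containing 1. a `PaginatedGraphqlData` of the `nodes` and `edges` items of the given
+    `resource_type`, and 2. a dict containing the `url` and the `login` fields of the organization they belong to.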
     """
     cursor = None
     has_next_page = True
-    data: List[Dict] = []
+    data: PaginatedGraphqlData = PaginatedGraphqlData(nodes=[], edges=[])
     retry = 0
+
     while has_next_page:
         try:
-            resp = fetch_page(token, api_url, organization, query, cursor)
+            resp = fetch_page(token, api_url, organization, query, cursor, **kwargs)
             retry = 0
         except requests.exceptions.Timeout:
             retry += 1
@@ -109,7 +135,13 @@ def fetch_all(
             continue
 
         resource = resp['data']['organization'][resource_type]
-        data.extend(resource[field_name])
+        if resource_inner_type:
+            resource = resp['data']['organization'][resource_type][resource_inner_type]
+
+        # Allow for paginating both nodes and edges fields of the GitHub GQL structure.
+        data.nodes.extend(resource.get('nodes', []))
+        data.edges.extend(resource.get('edges', []))
+
         cursor = resource['pageInfo']['endCursor']
         has_next_page = resource['pageInfo']['hasNextPage']
     org_data = {'url': resp['data']['organization']['url'], 'login': resp['data']['organization']['login']}

diff --git a/cartography/models/github/__init__.py b/cartography/models/github/__init__.py