Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for GitHub teams and team-repos #1167

Merged
merged 4 commits into from
May 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -24,7 +24,7 @@ Start [here](https://lyft.github.io/cartography/install.html).
- [Duo CRXcavator](https://lyft.github.io/cartography/modules/crxcavator/index.html) - Chrome extensions, GSuite users
- [Oracle Cloud Infrastructure](docs/setup/config/oci.md) - IAM
- [Okta](https://lyft.github.io/cartography/modules/okta/index.html) - users, groups, organizations, roles, applications, factors, trusted origins, reply URIs
- [Github](https://lyft.github.io/cartography/modules/github/index.html) - repos, branches, users
- [Github](https://lyft.github.io/cartography/modules/github/index.html) - repos, branches, users, teams
- [DigitalOcean](https://lyft.github.io/cartography/modules/digitalocean/index.html)
- [Microsoft Azure](https://lyft.github.io/cartography/modules/azure/index.html) - CosmosDB, SQL, Storage, Virtual Machine
- [Kubernetes](https://lyft.github.io/cartography/modules/kubernetes/index.html) - Cluster, Namespace, Service, Pod, Container
Expand Down
8 changes: 8 additions & 0 deletions cartography/intel/github/__init__.py
Expand Up @@ -6,6 +6,7 @@
from requests import exceptions

import cartography.intel.github.repos
import cartography.intel.github.teams
import cartography.intel.github.users
from cartography.config import Config
from cartography.util import timeit
Expand Down Expand Up @@ -46,5 +47,12 @@ def start_github_ingestion(neo4j_session: neo4j.Session, config: Config) -> None
auth_data['url'],
auth_data['name'],
)
cartography.intel.github.teams.sync_github_teams(
neo4j_session,
common_job_parameters,
auth_data['token'],
auth_data['url'],
auth_data['name'],
)
except exceptions.RequestException as e:
logger.error("Could not complete request to the GitHub API: %s", e)
19 changes: 14 additions & 5 deletions cartography/intel/github/repos.py
Expand Up @@ -101,8 +101,14 @@ def get(token: str, api_url: str, organization: str) -> List[Dict]:
:return: A list of dicts representing repos. See tests.data.github.repos for data shape.
"""
# TODO: link the Github organization to the repositories
repos, _ = fetch_all(token, api_url, organization, GITHUB_ORG_REPOS_PAGINATED_GRAPHQL, 'repositories', 'nodes')
return repos
repos, _ = fetch_all(
token,
api_url,
organization,
GITHUB_ORG_REPOS_PAGINATED_GRAPHQL,
'repositories',
)
return repos.nodes


def transform(repos_json: List[Dict]) -> Dict:
Expand Down Expand Up @@ -539,8 +545,11 @@ def load_python_requirements(neo4j_session: neo4j.Session, update_tag: int, requ


def sync(
neo4j_session: neo4j.Session, common_job_parameters: Dict, github_api_key: str, github_url: str,
organization: str,
neo4j_session: neo4j.Session,
common_job_parameters: Dict[str, Any],
github_api_key: str,
github_url: str,
organization: str,
) -> None:
"""
Performs the sequential tasks to collect, transform, and sync github data
Expand All @@ -554,5 +563,5 @@ def sync(
logger.info("Syncing GitHub repos")
repos_json = get(github_api_key, github_url, organization)
repo_data = transform(repos_json)
load(neo4j_session, common_job_parameters, repo_data)
load(neo4j_session, repo_data, common_job_parameters['UPDATE_TAG'])
run_cleanup_job('github_repos_cleanup.json', neo4j_session, common_job_parameters)
175 changes: 175 additions & 0 deletions cartography/intel/github/teams.py
@@ -0,0 +1,175 @@
import logging
from typing import Any
from typing import Dict
from typing import List
from typing import Tuple

import neo4j

from cartography.client.core.tx import load
from cartography.graph.job import GraphJob
from cartography.intel.github.util import fetch_all
from cartography.intel.github.util import PaginatedGraphqlData
from cartography.models.github.teams import GitHubTeamSchema
from cartography.util import timeit

logger = logging.getLogger(__name__)


@timeit
def get_teams(org: str, api_url: str, token: str) -> Tuple[PaginatedGraphqlData, Dict[str, Any]]:
    """
    Retrieve all teams of a GitHub organization through the paginated GraphQL API.

    For each team the query asks for its slug, url, description and the total number of repositories
    attached to it (only the count - the repositories themselves are not listed by this query).

    :param org: Login of the GitHub organization, passed to the query as `$login`.
    :param api_url: URL of the GitHub GraphQL endpoint.
    :param token: API token used to authenticate the requests.
    :return: Whatever `fetch_all` returns for the `teams` resource: a 2-tuple of the accumulated team data
        and a dict describing the owning organization.
    """
    # The query text (including its whitespace) is sent to GitHub as-is.
    teams_query = """
query($login: String!, $cursor: String) {
organization(login: $login) {
login
url
teams(first:100, after: $cursor) {
nodes {
slug
url
description
repositories(first: 100) {
totalCount
}
}
pageInfo{
endCursor
hasNextPage
}
}
}
}
"""
    teams_and_org = fetch_all(token, api_url, org, teams_query, 'teams')
    return teams_and_org


@timeit
def _get_team_repos_for_multiple_teams(
    team_raw_data: List[Dict[str, Any]],
    org: str,
    api_url: str,
    token: str,
) -> Dict[str, Any]:
    """
    Build a mapping from team slug to the repositories that team can access.

    :param team_raw_data: Team dicts; each must carry `slug` and `repositories.totalCount`.
    :param org: Login of the GitHub organization that owns the teams.
    :param api_url: URL of the GitHub GraphQL endpoint.
    :param token: API token used to authenticate the requests.
    :return: Dict keyed by team slug. Each value is a list of `(repo_url, permission)` tuples,
        e.g. `[(repo_url, 'WRITE'), ...]`; the list is empty for teams without repositories.
    """
    repos_by_team = {}
    for team in team_raw_data:
        slug = team['slug']

        # Teams that report zero repositories are not queried at all.
        if team['repositories']['totalCount'] > 0:
            fetched = _get_team_repos(org, api_url, token, slug)
        else:
            fetched = None

        if fetched:
            # URLs come from `nodes` and permissions from `edges`; the two lists are paired by position.
            urls = [node['url'] for node in fetched.nodes]
            permissions = [edge['permission'] for edge in fetched.edges]
        else:
            urls = []
            permissions = []

        repos_by_team[slug] = list(zip(urls, permissions))
    return repos_by_team


@timeit
def _get_team_repos(org: str, api_url: str, token: str, team: str) -> PaginatedGraphqlData:
    """
    Retrieve every repository attached to one team, together with the team's permission on each.

    The paginated field is `organization.team.repositories`, so `fetch_all` is told to look at the
    `team` resource and then step into its inner `repositories` field.

    :param org: Login of the GitHub organization, passed to the query as `$login`.
    :param api_url: URL of the GitHub GraphQL endpoint.
    :param token: API token used to authenticate the requests.
    :param team: Slug of the team, passed to the query as `$team`.
    :return: The accumulated data for the team's repositories: `nodes` hold the repo `url`s and
        `edges` hold the `permission`s. The organization info returned by `fetch_all` is discarded.
    """
    # The query text (including its whitespace) is sent to GitHub as-is.
    repos_of_team_query = """
query($login: String!, $team: String!, $cursor: String) {
organization(login: $login) {
url
login
team(slug: $team) {
slug
repositories(first:100, after: $cursor) {
edges {
permission
}
nodes {
url
}
pageInfo {
endCursor
hasNextPage
}
}
}
}
rateLimit {
limit
cost
remaining
resetAt
}
}
"""
    repos_data, _org_data = fetch_all(
        token, api_url, org, repos_of_team_query, 'team',
        resource_inner_type='repositories',
        team=team,
    )
    return repos_data


def transform_teams(
    team_paginated_data: PaginatedGraphqlData,
    org_data: Dict[str, Any],
    team_repo_data: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """
    Flatten team data and team-repo permissions into a list of dicts, one per (team, repo) pair.

    :param team_paginated_data: Object whose `nodes` attribute lists team dicts with the keys
        `slug`, `url`, `description` and `repositories.totalCount`.
    :param org_data: Dict with the `url` and `login` of the owning organization.
    :param team_repo_data: Dict keyed by team slug; each value is a list of `(repo_url, permission)`
        tuples. Every team slug in `team_paginated_data` must be present as a key.
    :return: A list of dicts. A team without repos contributes exactly one dict holding only the team
        fields. A team with repos contributes one dict per repo: the team fields plus one extra key
        named after the permission (ADMIN, READ, WRITE, TRIAGE, or MAINTAIN) whose value is the repo url.
    """
    rows = []
    for node in team_paginated_data.nodes:
        slug = node['slug']
        team_fields = {
            'name': slug,
            'url': node['url'],
            'description': node['description'],
            'repo_count': node['repositories']['totalCount'],
            'org_url': org_data['url'],
            'org_login': org_data['login'],
        }
        repo_grants = team_repo_data[slug]
        if repo_grants:
            # One row per repo; the permission name itself becomes the key that carries the repo url.
            rows.extend(
                {**team_fields, permission: repo_url}
                for repo_url, permission in repo_grants
            )
        else:
            rows.append(team_fields)
    return rows


@timeit
def load_team_repos(
    neo4j_session: neo4j.Session,
    data: List[Dict[str, Any]],
    update_tag: int,
    organization_url: str,
) -> None:
    """
    Write the transformed team / team-repo dicts to the graph using `GitHubTeamSchema`.

    :param neo4j_session: The Neo4j session to write with.
    :param data: List of dicts to load; see `transform_teams` for the shape.
    :param update_tag: Value passed to the loader as `lastupdated`.
    :param organization_url: Value passed to the loader as `org_url`.
    """
    logger.info(f"Loading {len(data)} GitHub team-repos to the graph")
    team_schema = GitHubTeamSchema()
    load(neo4j_session, team_schema, data, lastupdated=update_tag, org_url=organization_url)


@timeit
def cleanup(neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any]) -> None:
    """
    Build a cleanup job from `GitHubTeamSchema` and the given parameters, then run it on the session.

    :param neo4j_session: The Neo4j session to run the cleanup job on.
    :param common_job_parameters: Parameters handed to `GraphJob.from_node_schema`.
    """
    team_schema = GitHubTeamSchema()
    cleanup_job = GraphJob.from_node_schema(team_schema, common_job_parameters)
    cleanup_job.run(neo4j_session)


@timeit
def sync_github_teams(
    neo4j_session: neo4j.Session,
    common_job_parameters: Dict[str, Any],
    github_api_key: str,
    github_url: str,
    organization: str,
) -> None:
    """
    Run the full teams sync for one organization: fetch teams, fetch each team's repos, transform,
    load to the graph and finally clean up.

    :param neo4j_session: The Neo4j session to load into and clean up on.
    :param common_job_parameters: Must contain `UPDATE_TAG`. NOTE: this dict is modified in place -
        the key `org_url` is set to the organization's url before cleanup runs, and stays set for the caller.
    :param github_api_key: API token used to authenticate against GitHub.
    :param github_url: URL of the GitHub GraphQL endpoint.
    :param organization: Login of the GitHub organization to sync.
    """
    teams_data, org_data = get_teams(organization, github_url, github_api_key)
    repos_by_team = _get_team_repos_for_multiple_teams(
        teams_data.nodes,
        organization,
        github_url,
        github_api_key,
    )
    rows = transform_teams(teams_data, org_data, repos_by_team)

    update_tag = common_job_parameters['UPDATE_TAG']
    org_url = org_data['url']
    load_team_repos(neo4j_session, rows, update_tag, org_url)

    # Presumably the cleanup job needs `org_url` to scope itself to this organization - confirm against
    # GitHubTeamSchema.
    common_job_parameters['org_url'] = org_data['url']
    cleanup(neo4j_session, common_job_parameters)
18 changes: 14 additions & 4 deletions cartography/intel/github/users.py
@@ -1,4 +1,5 @@
import logging
from typing import Any
from typing import Dict
from typing import List
from typing import Tuple
Expand Down Expand Up @@ -55,8 +56,14 @@ def get(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
:return: A 2-tuple containing 1. a list of dicts representing users - see tests.data.github.users.GITHUB_USER_DATA
for shape, and 2. data on the owning GitHub organization - see tests.data.github.users.GITHUB_ORG_DATA for shape.
"""
users, org = fetch_all(token, api_url, organization, GITHUB_ORG_USERS_PAGINATED_GRAPHQL, 'membersWithRole', 'edges')
return users, org
users, org = fetch_all(
token,
api_url,
organization,
GITHUB_ORG_USERS_PAGINATED_GRAPHQL,
'membersWithRole',
)
return users.edges, org


@timeit
Expand Down Expand Up @@ -99,8 +106,11 @@ def load_organization_users(

@timeit
def sync(
neo4j_session: neo4j.Session, common_job_parameters: Dict, github_api_key: str, github_url: str,
organization: str,
neo4j_session: neo4j.Session,
common_job_parameters: Dict[str, Any],
github_api_key: str,
github_url: str,
organization: str,
) -> None:
logger.info("Syncing GitHub users")
user_data, org_data = get(github_api_key, github_url, organization)
Expand Down
48 changes: 40 additions & 8 deletions cartography/intel/github/util.py
@@ -1,8 +1,10 @@
import json
import logging
import time
from typing import Any
from typing import Dict
from typing import List
from typing import NamedTuple
from typing import Optional
from typing import Tuple

Expand All @@ -13,6 +15,11 @@
_TIMEOUT = (60, 60)


class PaginatedGraphqlData(NamedTuple):
    """
    Container for the items accumulated while paginating a GitHub GraphQL resource.

    A query may ask for `nodes`, `edges`, or both; `fetch_all` appends whatever each page returns to the
    matching list, so a field that was not requested simply stays empty.
    """
    # Items taken from the `nodes` field of every fetched page.
    nodes: List[Dict[str, Any]]
    # Items taken from the `edges` field of every fetched page.
    edges: List[Dict[str, Any]]


def call_github_api(query: str, variables: str, token: str, api_url: str) -> Dict:
"""
Calls the GitHub v4 API and executes a query
Expand Down Expand Up @@ -44,7 +51,14 @@ def call_github_api(query: str, variables: str, token: str, api_url: str) -> Dic
return response_json # type: ignore


def fetch_page(token: str, api_url: str, organization: str, query: str, cursor: Optional[str] = None) -> Dict:
def fetch_page(
token: str,
api_url: str,
organization: str,
query: str,
cursor: Optional[str] = None,
**kwargs: Any,
) -> Dict[str, Any]:
"""
Return a single page of max size 100 elements from the Github api_url using the given `query` and `cursor` params.
:param token: The API token as string. Must have permission for the object being paginated.
Expand All @@ -53,9 +67,11 @@ def fetch_page(token: str, api_url: str, organization: str, query: str, cursor:
:param query: The GraphQL query, e.g. `GITHUB_ORG_USERS_PAGINATED_GRAPHQL`
:param cursor: The GraphQL cursor string (behaves like a page number) for Github objects in the given
organization. If None, the Github API will return the first page of repos.
:param kwargs: Other keyword args to add as key-value pairs to the GraphQL query.
:return: The raw response object from the requests.get().json() call.
"""
gql_vars = {
**kwargs,
'login': organization,
'cursor': cursor,
}
Expand All @@ -65,8 +81,15 @@ def fetch_page(token: str, api_url: str, organization: str, query: str, cursor:


def fetch_all(
token: str, api_url: str, organization: str, query: str, resource_type: str, field_name: str, retries: int = 5,
) -> Tuple[List[Dict], Dict]:
token: str,
api_url: str,
organization: str,
query: str,
resource_type: str,
retries: int = 5,
resource_inner_type: Optional[str] = None,
**kwargs: Any,
) -> Tuple[PaginatedGraphqlData, Dict[str, Any]]:
"""
Fetch and return all data items of the given `resource_type` and `field_name` from Github's paginated GraphQL API as
a list, along with information on the organization that they belong to.
Expand All @@ -77,19 +100,22 @@ def fetch_all(
:param resource_type: The name of the paginated resource under the organization e.g. `membersWithRole` or
`repositories`. See the fields under https://docs.github.com/en/graphql/reference/objects#organization for a full
list.
:param field_name: The field name of the resource_type to append items from - this is usually "nodes" or "edges".
See the field list in https://docs.github.com/en/graphql/reference/objects#repositoryconnection for other examples.
:param retries: Number of retries to perform. Github APIs are often flakey and retrying the request helps.
:param resource_inner_type: Optional str. Default = None. Sometimes we need to paginate a field that is inside
`resource_type` - for example: organization['team']['repositories']. In this case, we specify 'repositories' as the
`resource_inner_type`.
:param kwargs: Additional key-value args (other than `login` and `cursor`) to pass to the GraphQL query variables.
:return: A 2-tuple containing 1. A list of data items of the given `resource_type` and `field_name`, and 2. a dict
containing the `url` and the `login` fields of the organization that the items belong to.
"""
cursor = None
has_next_page = True
data: List[Dict] = []
data: PaginatedGraphqlData = PaginatedGraphqlData(nodes=[], edges=[])
retry = 0

while has_next_page:
try:
resp = fetch_page(token, api_url, organization, query, cursor)
resp = fetch_page(token, api_url, organization, query, cursor, **kwargs)
retry = 0
except requests.exceptions.Timeout:
retry += 1
Expand All @@ -109,7 +135,13 @@ def fetch_all(
continue

resource = resp['data']['organization'][resource_type]
data.extend(resource[field_name])
if resource_inner_type:
resource = resp['data']['organization'][resource_type][resource_inner_type]

# Allow for paginating both nodes and edges fields of the GitHub GQL structure.
data.nodes.extend(resource.get('nodes', []))
data.edges.extend(resource.get('edges', []))

cursor = resource['pageInfo']['endCursor']
has_next_page = resource['pageInfo']['hasNextPage']
org_data = {'url': resp['data']['organization']['url'], 'login': resp['data']['organization']['login']}
Expand Down
Empty file.