# gfe-db Pipeline Trigger Development Notebook

Runs on a schedule, checks the IMGT/HLA repository for new release branches and triggers the update pipeline if one is found.

Activites:
- Create initial state: scrape IMGTHLA repo for list of release branches
 - Reduce to only 4 digit integers that increment by 10
- Lambda strategy
 - Fetch initial release list from S3
 - Fetch current list of branches from the IMGTHLA repo
  - look for new branches and validate: 1) follows the release version format of 4 digits and 2) is equal to the previous version plus 10

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import datetime
import copy
import json
import re
import requests
import numpy as np
import boto3

In [3]:
# Notebook libraries
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv());

GITHUB_PERSONAL_ACCESS_TOKEN = os.environ["GITHUB_PERSONAL_ACCESS_TOKEN"]
GITHUB_REPOSITORY_OWNER = os.environ["GITHUB_REPOSITORY_OWNER"]
GITHUB_REPOSITORY_NAME = os.environ["GITHUB_REPOSITORY_NAME"]
GFE_BUCKET = os.environ["GFE_BUCKET"]
UPDATE_PIPELINE_STATE_MACHINE_ARN = os.environ["UPDATE_PIPELINE_STATE_MACHINE_ARN"]

In [4]:
print(f"GITHUB_REPOSITORY_OWNER: {GITHUB_REPOSITORY_OWNER}")
print(f"GITHUB_REPOSITORY_NAME: {GITHUB_REPOSITORY_NAME}")
print(f"GFE_BUCKET: {GFE_BUCKET}")
print(f"UPDATE_PIPELINE_STATE_MACHINE_ARN: {UPDATE_PIPELINE_STATE_MACHINE_ARN}")

GITHUB_REPOSITORY_OWNER: ANHIG
GITHUB_REPOSITORY_NAME: IMGTHLA
GFE_BUCKET: dev-gfe-db-531868584498-us-east-1
UPDATE_PIPELINE_STATE_MACHINE_ARN: arn:aws:states:us-east-1:531868584498:stateMachine:dev-gfe-db-pipeline


## Utility Functions

In [5]:
def get_branches(owner, repo):
    """Return a list of GitHub branches for the specified repository"""

    base_url = 'https://api.github.com'

    # Endpoint
    endpoint = f'/repos/{owner}/{repo}/branches?per_page=100'

    url = base_url + endpoint

    # Headers
    headers = {
        'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}', 
        'Content-Type': 'application/json',
        'Accept': 'application/vnd.github.v3+json'
    }
    
    response = requests.get(url)
    branches = json.loads(response.content)
    
    return [branch["name"] for branch in branches]

In [6]:
def is_valid_release(branch):
    """Returns True if the branch is a valid release, False if not"""

    # Checks for a pattern corresponding to 3 digits followed by one zero, ie., 3460
    release_pattern = r'^\d{3}0$'
    p = re.compile(release_pattern)
    match = p.match(branch)

    if match:
        return True
    else:
        return False

In [7]:
def get_releases(owner, repo):
    return list(filter(is_valid_release, get_branches(owner, repo)))

In [31]:
def write_config(path):
    """Writes config file containing the current state of branches in 
    a GitHub repo"""
    
    branches_config = {
        "timestamp": str(datetime.datetime.utcnow())[:-7],
        "repository_url": f"https://github.com/{GITHUB_REPOSITORY_OWNER}/{GITHUB_REPOSITORY_NAME}",
        "releases": get_releases(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME)[:-1]
    }
    
    s3_key = "/".join(path.split("/")[3:])
    
    # Reconstructs the path and logs it to make sure it is correct
    validation_path = "/".join(["s3:/", GFE_BUCKET, key])
    
    try:
        print(f'Writing to {validation_path}')
        response = s3.put_object(
             Body=json.dumps(branches_config),
             Bucket=GFE_BUCKET,
             Key=s3_key)
        
        if response['ResponseMetadata']['HTTPStatusCode'] == 200:
            print(f"HTTPStatusCode: {response['ResponseMetadata']['HTTPStatusCode']}")
            return
        else:
            print(f'Failed to write config file to {path}. HTTPStatusCode: {response["ResponseMetadata"]["HTTPStatusCode"]}')
            return
        
    except Exception as err:
        raise err

In [32]:
s3 = boto3.client('s3')
path = f"s3://{GFE_BUCKET}/config/trigger/IMGTHLA-repository-state.json"
response = write_config(path)

Writing to s3://dev-gfe-db-531868584498-us-east-1/config/trigger/IMGTHLA-repository-state.json
HTTPStatusCode: 200


In [33]:
def read_config(path):
    """Reads config file containing the current state of branches in 
    a GitHub repo"""
    
    s3_key = "/".join(path.split("/")[3:])
    
    # Reconstructs the path and logs it to make sure it is correct
    validation_path = "/".join(["s3:/", GFE_BUCKET, key])
    
    try:
        print(f'Writing to {validation_path}')
        response = s3.get_object(
            Bucket=GFE_BUCKET, 
            Key=s3_key)
        
        if response['ResponseMetadata']['HTTPStatusCode'] == 200:
            print(f"Read config file from {path}")
            return json.loads(response["Body"].read().decode())
        else:
            print(f'Failed to read config file to {path}. HTTPStatusCode: {response["ResponseMetadata"]["HTTPStatusCode"]}')
            return
        
    except Exception as err:
        raise err

In [34]:
def check_new_releases(previous_state, current_state):
    """Checks for new IMGT/HLA releases and triggers the update
    pipeline if any are found"""

    # Check if any branches have been added
    new_branches_count = len(current_state) - len(previous_state)
    branches_added = (new_branches_count > 0)

    if branches_added:

        print(f"New branches: {current_state[-new_branches_count:]}")

        # Get the new branches
        new_releases = sorted([int(release) for release in list(set(current_state).difference(previous_state))])
        last_release = [int(previous_state[-1])]

        # Check that the last release and the new releases differ by 10
        elementwise_difference = list(set(np.diff([release for release in last_release + new_releases])))
        new_branches_are_valid_releases = (len(elementwise_difference) == 1 and elementwise_difference[0] == 10)

        if new_branches_are_valid_releases:
            
            return new_releases
    else:
        print("No new branches detected")
        
        return

In [35]:
def check_current_executions(state_machine_arn):
    
    response = sfn.list_executions(
        stateMachineArn=state_machine_arn,
        statusFilter='RUNNING')

    # Extract executions
    executions_arns = [execution['executionArn'] for execution in response['executions']]
    
    releases_processing = []

    for executions_arn in executions_arns:

        response = sfn.describe_execution(
            executionArn=executions_arn)

        releases_processing = releases_processing + [params["RELEASES"] for params in json.loads(response['input'])]

    return releases_processing
    

In [40]:
branches_config = {
    "timestamp": str(datetime.datetime.utcnow())[:-7],
    "repository_url": f"https://github.com/{GITHUB_REPOSITORY_OWNER}/{GITHUB_REPOSITORY_NAME}",
    "releases": get_releases(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME)[:-1]
}

In [41]:
json.dumps(branches_config)

'{"timestamp": "2021-12-09 02:36:59", "repository_url": "https://github.com/ANHIG/IMGTHLA", "releases": ["3100", "3110", "3120", "3130", "3140", "3150", "3160", "3170", "3180", "3190", "3200", "3210", "3220", "3230", "3240", "3250", "3260", "3270", "3280", "3290", "3300", "3310", "3320", "3330", "3340", "3350", "3360", "3370", "3380", "3390", "3400", "3410", "3420", "3430", "3440", "3450"]}'

## Compare previous repo state with current repo state
- Fetch the previous list of branches
- Fetch the current list of branches
- Compare

In [37]:
s3 = boto3.client('s3')
sfn = boto3.client('stepfunctions')

In [38]:
path = f"s3://{GFE_BUCKET}/config/trigger/IMGTHLA-repository-state.json"
response = write_config(path)

Writing to s3://dev-gfe-db-531868584498-us-east-1/config/trigger/IMGTHLA-repository-state.json
HTTPStatusCode: 200


In [39]:
branches_config_path = f"s3://{GFE_BUCKET}/config/trigger/IMGTHLA-repository-state.json"
branches_config = read_config(branches_config_path)

params_path = f"s3://{GFE_BUCKET}/config/pipeline/params.json"
params = read_config(params_path)

Writing to s3://dev-gfe-db-531868584498-us-east-1/config/trigger/IMGTHLA-repository-state.json
Read config file from s3://dev-gfe-db-531868584498-us-east-1/config/trigger/IMGTHLA-repository-state.json
Writing to s3://dev-gfe-db-531868584498-us-east-1/config/trigger/IMGTHLA-repository-state.json


NoSuchKey: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.

In [22]:
params

{'params': {'environment': {'ALIGN': 'False',
   'KIR': 'False',
   'MEM_PROFILE': 'False',
   'LIMIT': '100'}}}

In [23]:
# Set up test conditions for different repo states (no new releases, one, two or three new releases)
test_conditions = {
    "condition1": previous_state["releases"],
    "condition2": previous_state["releases"][:-1],
    "condition3": previous_state["releases"][:-2],
    "condition4": previous_state["releases"][:-3],
}

In [24]:
current_state = get_releases(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME)

In [25]:
# for loop is for testing different conditions
for condition, state in test_conditions.items():
    new_releases = check_new_releases(state, current_state)
    
    if new_releases:
    
        print(f'Pipeline triggered:')

        state_machine_input = []

        for release in new_releases:
            
            params_input = copy.deepcopy(params["params"]["environment"])
            params_input["RELEASES"] = release
            print(f'{params_input}')            
            state_machine_input.append(params_input)
        
    print("\n")

No new branches detected


New branches: ['3460']
Pipeline triggered:
{'ALIGN': 'False', 'KIR': 'False', 'MEM_PROFILE': 'False', 'LIMIT': '100', 'RELEASES': 3460}


New branches: ['3450', '3460']
Pipeline triggered:
{'ALIGN': 'False', 'KIR': 'False', 'MEM_PROFILE': 'False', 'LIMIT': '100', 'RELEASES': 3450}
{'ALIGN': 'False', 'KIR': 'False', 'MEM_PROFILE': 'False', 'LIMIT': '100', 'RELEASES': 3460}


New branches: ['3440', '3450', '3460']
Pipeline triggered:
{'ALIGN': 'False', 'KIR': 'False', 'MEM_PROFILE': 'False', 'LIMIT': '100', 'RELEASES': 3440}
{'ALIGN': 'False', 'KIR': 'False', 'MEM_PROFILE': 'False', 'LIMIT': '100', 'RELEASES': 3450}
{'ALIGN': 'False', 'KIR': 'False', 'MEM_PROFILE': 'False', 'LIMIT': '100', 'RELEASES': 3460}




In [18]:
state_machine_input

[{'ALIGN': 'False',
  'KIR': 'False',
  'MEM_PROFILE': 'False',
  'LIMIT': '100',
  'RELEASES': 3440},
 {'ALIGN': 'False',
  'KIR': 'False',
  'MEM_PROFILE': 'False',
  'LIMIT': '100',
  'RELEASES': 3450},
 {'ALIGN': 'False',
  'KIR': 'False',
  'MEM_PROFILE': 'False',
  'LIMIT': '100',
  'RELEASES': 3460}]

In [19]:
state_machine_input

[{'ALIGN': 'False',
  'KIR': 'False',
  'MEM_PROFILE': 'False',
  'LIMIT': '100',
  'RELEASES': 3440},
 {'ALIGN': 'False',
  'KIR': 'False',
  'MEM_PROFILE': 'False',
  'LIMIT': '100',
  'RELEASES': 3450},
 {'ALIGN': 'False',
  'KIR': 'False',
  'MEM_PROFILE': 'False',
  'LIMIT': '100',
  'RELEASES': 3460}]

In [11]:
write_config(branches_config_path)

NameError: name 'branches_config_path' is not defined

In [42]:
json.dumps({'ALIGN': 'False', 'KIR': 'False', 'MEM_PROFILE': 'False', 'LIMIT': '100', 'RELEASES': 3460})

'{"ALIGN": "False", "KIR": "False", "MEM_PROFILE": "False", "LIMIT": "100", "RELEASES": 3460}'