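# Build Source Config

Builds `source-config.json` for the pipeline from the commit history of the tracked GitHub repository. The target structure is shown below (values are illustrative):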
```json
{
    "created_at_utc": "2022-04-14T11:00:42Z",
    "updated_at_utc": "2023-04-25T01:50:12Z",
    "repositories": {
        "ANHIG/IMGTHLA": {
            "owner": "ANHIG",
            "name": "IMGTHLA",
            "url": "https://github.com/ANHIG/IMGTHLA",
            "tracked_assets": [
                "hla.dat",
                "msf/"
            ],
            "default_input_parameters": {
                "align": "False",
                "kir": "False",
                "mem_profile": "False",
                "limit": "1000"
            },
            "execution_history": [
                {
                    "version": 3480,
                    "execution_date_utc": "2022-04-14T11:00:42Z",
                    "commit": {
                        "sha": "ecd63776c6225af0cf8bcc9fa9c6998d3129fb14",
                        "message": "<message>",
                        "date_utc": "2022-04-14T11:00:42Z",
                        "html_url": "url"
                    },
                    "input_parameters": {
                        "align": "False",
                        "kir": "False",
                        "mem_profile": "False",
                        "limit": "1000"
                    },
                    "status": "SUCCESS"                    
                },
                {
                    "version": 3470,
                    "execution_date_utc": "2022-01-13T16:52:15Z",
                    "commit": {
                        "sha": "06ceff14b2db920d458dc337b1100dced992e627",
                        "message": "<message>",
                        "date_utc": "2022-01-13T16:52:15Z",
                        "html_url": "url"
                    },
                    "input_parameters": {
                        "align": "False",
                        "kir": "False",
                        "mem_profile": "False",
                        "limit": "1000"
                    },
                    "status": "SUCCESS"
                }
            ]
        }
    }
}
```

In [1]:
# --- Standard library ---
import json
import logging
import os
import re
import sys
from datetime import datetime, timezone

# --- Third-party ---
import pandas as pd
from botocore.exceptions import ClientError

# Single timestamp for this run, reused for created_at_utc / updated_at_utc below.
# datetime.utcnow() is deprecated; an aware "now" in UTC formats to the same string.
utc_now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# The project-local `src` package is imported from the get_repo_updates function
# directory. Set GET_REPO_UPDATES_PATH to point at your own checkout; when it is unset
# the original author's local path is used, so existing behavior is unchanged.
FUNCTION_SRC_PATH = os.environ.get(
    "GET_REPO_UPDATES_PATH",
    '/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/gfe-db/pipeline/functions/get_repo_updates/',
)
if FUNCTION_SRC_PATH not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(FUNCTION_SRC_PATH)

# --- Project-local (requires FUNCTION_SRC_PATH on sys.path) ---
from src.utils.types import (
    Commit,
    InputParameters,
    ExecutionHistoryItem,
    RepositoryConfig,
    SourceConfig
)
from src.utils import (
    read_source_config,
    paginate_commits,
    flatten_json,
    get_repo_asset
)

# logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Pandas display options: None removes the row/column/width limits
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

ModuleNotFoundError: No module named 'src'

In [2]:
# Show the run timestamp captured in the setup cell (UTC, formatted %Y-%m-%dT%H:%M:%SZ)
utc_now

'2023-04-25T02:35:07Z'

In [3]:
# Target GitHub repository, pinned to literal values in this notebook.
# NOTE(review): the trailing comments suggest the deployed code reads these from the
# environment instead — confirm before reusing this cell elsewhere.
GITHUB_REPOSITORY_OWNER = "ANHIG" # os.environ["GITHUB_REPOSITORY_OWNER"]
GITHUB_REPOSITORY_NAME = "IMGTHLA" # os.environ["GITHUB_REPOSITORY_NAME"]
# Required environment variables: os.environ[...] raises KeyError when one is unset.
AWS_REGION = os.environ["AWS_REGION"]
DATA_BUCKET_NAME = os.environ["DATA_BUCKET_NAME"]
PIPELINE_CONFIG_S3_PATH = os.environ["PIPELINE_CONFIG_S3_PATH"]

## App State

In [4]:
# source config file in S3 must be up to date
# Bind the name up front so it is always defined after this cell, even when the read
# fails (previously a failed read left `source_config` unbound on a fresh kernel).
source_config = None
try:
    source_config = read_source_config(DATA_BUCKET_NAME, PIPELINE_CONFIG_S3_PATH)
except ClientError as e:
    # ClientError is raised for every kind of S3 failure, not only a missing object,
    # so report the actual error code alongside the "not found" message.
    s3_error_code = e.response.get("Error", {}).get("Code", "Unknown")
    logger.info("'source-config.json' not found. Building source config from repository.")
    logger.info(f"S3 error code reported while reading the config: {s3_error_code}")

ERROR:src.utils.utils:Failed to read config file to s3://dev-gfe-db-531868584498-us-east-1/config/pipeline/source-config.json
INFO:__main__:'source-config.json' not found. Building source config from repository.


## Commits

In [5]:
# Fetch all commits from repo using GitHub API
# The result is treated below as a list of dict records (each one is filtered via .items()).
all_commits = paginate_commits(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME)

In [6]:
# Top-level fields of each commit record that the following cells use
select_keys = ["sha", "commit", "html_url"]

# Drop every other field; the remaining keys keep their original order
all_commits = [
    dict(pair for pair in record.items() if pair[0] in select_keys)
    for record in all_commits
]

In [7]:
# all_commits[0]

In [8]:
# # For reference only, not using pandas in deployment
# select_cols = ["sha", "commit.author.date", "commit.message", "html_url"]
# df = pd.DataFrame([flatten_json(commit) for commit in all_commits])[select_cols]

In [9]:
# df.head()

In [10]:
# Apply flatten_json to every commit record, producing one flattened dict per commit
all_commits_flat = list(map(flatten_json, all_commits))

In [11]:
# Flattened field names to keep (this rebinds select_keys from the earlier filter cell)
select_keys = ["sha", "commit.author.date", "commit.message", "html_url"]
all_commits_flat = [
    dict(pair for pair in flat_record.items() if pair[0] in select_keys)
    for flat_record in all_commits_flat
]

In [12]:
# Map each flattened GitHub field name to the key name used in the source config
rename_keys = {
    "sha": "sha",
    "commit.author.date": "date_utc",
    "commit.message": "message",
    "html_url": "html_url",
}

# Rebuild every record under the new key names; a key missing from rename_keys raises KeyError
commits = [
    dict((rename_keys[old_key], value) for old_key, value in flat_record.items())
    for flat_record in all_commits_flat
]

In [13]:
# Inspect the first normalized commit record (keys: sha, date_utc, message, html_url)
commits[0]

{'sha': '38398a75e9762ff070d8e9bd714d074332646cd7',
 'date_utc': '2023-04-17T16:03:52Z',
 'message': 'Merge pull request #334 from ANHIG/3520\n\nIPD-IMGT/HLA Release 3.52.0',
 'html_url': 'https://github.com/ANHIG/IMGTHLA/commit/38398a75e9762ff070d8e9bd714d074332646cd7'}

## Execution History Items

In [14]:
# TODO use multithreading to speed up
# Next we get the release version for each commit by reading the version header of
# Allelelist.txt as fetched for that commit's sha.
release_version_re = r"# version: IPD-IMGT/HLA (\d+\.\d+\.\d+)"
execution_history_items = []
errors = 0
max_errors = 5  # stop the loop once this many commits have failed
limit = None  # int: process at most that many commits; None: process all of them


def parse_release_version(allele_list_text):
    """Return the release version found in Allelelist.txt content as an int.

    The dotted version captured by ``release_version_re`` is collapsed by removing
    the dots, e.g. "3.52.0" becomes 3520.

    Raises:
        ValueError: if no line matching ``release_version_re`` is present.
    """
    match = re.search(release_version_re, allele_list_text)
    if match is None:
        # Explicit error instead of an opaque AttributeError on None.group(...)
        raise ValueError("release version header not found in Allelelist.txt")
    return int(match.group(1).replace(".", ""))


for idx, commit in enumerate(commits):
    # Check the limit before doing any work so that limit=0 processes nothing
    if limit is not None and idx >= limit:
        break

    try:
        sha = commit['sha']
        date = commit['date_utc']
        logger.debug(f"Getting release version for sha {sha} and date {date}")
        allele_list = get_repo_asset(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, "Allelelist.txt", sha)
        release_version = parse_release_version(allele_list)

        # Execution fields stay None here: only version and commit are known at this point
        execution_history_items.append({
            "version": release_version,
            "execution_date_utc": None,
            "commit": commit,
            "input_parameters": None,
            "status": None
        })
    except Exception as e:
        # Best effort: a failing commit is logged and counted, not fatal.
        # .get() keeps the log call itself from raising when the record has no sha.
        errors += 1
        logger.error(f"Error processing commit {commit.get('sha')}: {e}")
        if errors >= max_errors:
            logger.error("Max errors reached. Exiting loop.")
            break

KeyboardInterrupt: 

In [15]:
# Number of commits whose release version was resolved (failed commits are not appended)
len(execution_history_items)

39

## RepositoryConfig

In [16]:
# Assets recorded under "tracked_assets" for this repository
tracked_assets = ["hla.dat", "msf/"]
# "<owner>/<name>" identifier, used as the key under "repositories" and in the URL
repository_path = "/".join([GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME])
repository_path


'ANHIG/IMGTHLA'

In [17]:
# Skeleton of the source config; the execution history is attached in a later cell.
# Both timestamps use the single run timestamp from the setup cell.
base_source_config = dict(
    created_at_utc=utc_now,
    updated_at_utc=utc_now,
    repositories={
        repository_path: dict(
            owner=GITHUB_REPOSITORY_OWNER,
            name=GITHUB_REPOSITORY_NAME,
            url="https://github.com/" + repository_path,
            tracked_assets=tracked_assets,
            # Parameter values are kept as strings
            default_input_parameters=dict(
                align="False",
                kir="False",
                mem_profile="False",
                limit="1000",
            ),
        )
    },
)

In [18]:
# Inspect the skeleton config before the execution history is attached
base_source_config

{'created_at_utc': '2023-04-25T02:35:07Z',
 'updated_at_utc': '2023-04-25T02:35:07Z',
 'repositories': {'ANHIG/IMGTHLA': {'owner': 'ANHIG',
   'name': 'IMGTHLA',
   'url': 'https://github.com/ANHIG/IMGTHLA',
   'tracked_assets': ['hla.dat', 'msf/'],
   'default_input_parameters': {'align': 'False',
    'kir': 'False',
    'mem_profile': 'False',
    'limit': '1000'}}}}

In [19]:
# Attach the collected history items to this repository's entry (mutates base_source_config in place)
base_source_config["repositories"][repository_path].update({"execution_history": execution_history_items})

In [20]:
# Build the SourceConfig object from the assembled dict, passing its top-level keys as
# keyword arguments. This rebinds `source_config`, replacing any value bound by the
# S3 read earlier in the notebook.
source_config = SourceConfig(**base_source_config)

In [21]:
# Inspect the resulting model; the repr includes every execution history item, so the output is long
source_config

SourceConfig(created_at_utc='2023-04-25T02:35:07Z', updated_at_utc='2023-04-25T02:35:07Z', repositories={'ANHIG/IMGTHLA': RepositoryConfig(owner='ANHIG', name='IMGTHLA', url='https://github.com/ANHIG/IMGTHLA', tracked_assets=['hla.dat', 'msf/'], default_input_parameters=InputParameters(align=False, kir=False, mem_profile=False, limit='1000'), execution_history=[ExecutionHistoryItem(version=3520, execution_date_utc=None, commit=Commit(sha='38398a75e9762ff070d8e9bd714d074332646cd7', date_utc='2023-04-17T16:03:52Z', message='Merge pull request #334 from ANHIG/3520\n\nIPD-IMGT/HLA Release 3.52.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/38398a75e9762ff070d8e9bd714d074332646cd7'), input_parameters=None, status=None), ExecutionHistoryItem(version=3520, execution_date_utc=None, commit=Commit(sha='def376dc6955b339b17f0a4b840e80eb6b9c744b', date_utc='2023-04-17T16:01:01Z', message='IPD-IMGT/HLA Release 3.52.0', html_url='https://github.com/ANHIG/IMGTHLA/commit/def376dc6955b339b17f0a4b

In [22]:
# Save the config as 4-space-indented JSON in the current working directory
with open("source-config.json", "w") as config_file:
    config_file.write(json.dumps(source_config.dict(), indent=4))