# GitHub API EDA

Application state variables:
```json
{
    "current_release": "3510",
    "last_processed_commit": {
        "sha": "5f2c562056f8ffa89aeea0631f2a52300ee0de17",
        "date": "2023-01-13T10:04:48Z"
    },
    "tracked_assets": [
        "hla.dat",
        "msf/"
    ]
}
```

## Strategy
### 1. App State
* Fetch app state from S3
* Update app state at end of execution
### 2. Asset Processing
* For each tracked asset:
  * Fetch the commits for the asset
  * Filter by the last processed commit date
* Merge the commits for each asset into a single list &rarr; array of commits
* If array is empty (no commits found), exit
* If array is not empty (commits are found)
  * Get the release version for each commit &larr; needs strategy
  * Build the release

In [1]:
import sys
sys.path.append('/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/gfe-db/pipeline/functions/get_repo_updates/')
import logging
from datetime import datetime
import re
import json
import pandas as pd
from src.utils import (
    load_state,
    get_commits_for_asset,
    get_repo_asset,
    get_commits,
    flatten_json
)

# logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Pandas display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)


In [2]:
GITHUB_REPOSITORY_OWNER = "ANHIG" # os.environ["GITHUB_REPOSITORY_OWNER"]
GITHUB_REPOSITORY_NAME = "IMGTHLA" # os.environ["GITHUB_REPOSITORY_NAME"]

### App State

In [32]:
app_state = {
    "last_processed_release": {
        "release": 3510,
        "sha": "ecd63776c6225af0cf8bcc9fa9c6998d3129fb14",
        "date": "2022-04-14T11:00:42Z",
        "status": "SUCCESS"
    },
    "tracked_assets": [
        "hla.dat",
        "msf/"
    ]
}

In [34]:
state = load_state(app_state)

### Asset Processing

In [35]:
asset_commits = []
for asset in state['tracked_assets']:
    commits = get_commits_for_asset(
        owner=GITHUB_REPOSITORY_OWNER,
        repo=GITHUB_REPOSITORY_NAME,
        path=asset,
        since=state['last_processed_commit']['date']
    )
    asset_commits.extend(commits)

In [36]:
# Filter out commits before last processed commit
unique_shas = list(set([(item["sha"], item["commit"]["author"]["date"]) for item in asset_commits \
                        if datetime.strptime(item["commit"]["author"]["date"], "%Y-%m-%dT%H:%M:%SZ") > state['last_processed_commit']['date']]))

In [37]:
unique_shas

[('def376dc6955b339b17f0a4b840e80eb6b9c744b', '2023-04-17T16:01:01Z'),
 ('2d38d3313229fdc5f8aa00052a2db21b35be3d2d', '2022-10-14T08:46:01Z'),
 ('72a9e28a52c9629dd63dfad5f215cdc562e2fd7e', '2022-07-14T12:28:41Z'),
 ('2c631a4b61d529ff1c0635750888f6f6d79c2703', '2022-10-13T12:58:37Z'),
 ('8f80f24d49797595d8a18b8d4d1f59846fbf3fe1', '2022-08-25T15:43:12Z'),
 ('1a3be9a5d01a414854ff3bfacd5257c14adeefa2', '2022-07-14T13:40:17Z'),
 ('4486f5c623705c6a14d9eeaba7d155cff30cdb43', '2023-01-12T14:36:43Z')]

In [39]:
# get the releases for each unique commit from Allelelist.txt
# can produce duplicate release versions if the same release is updated more than once
# makes the assumption that the release version branch is up to date for that release, since the build process targets the release version branch and not the specific commit sha
release_version_re = r"# version: IPD-IMGT/HLA (\d+\.\d+\.\d+)"
release_versions = []
for sha, _ in unique_shas:
    allele_list = get_repo_asset(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, "Allelelist.txt", sha)
    release_version = int(re.search(release_version_re, allele_list).group(1).replace(".", ""))
    release_versions.append((sha, release_version))

release_versions.sort(key=lambda x: x[1], reverse=False)
logger.info(f"Release versions by sha:\n{json.dumps(release_versions)}")

unique_release_versions = list(set([version[1] for version in release_versions]))
logger.info(f"Unique releases:\n {json.dumps(unique_release_versions)}")

INFO:__main__:Release versions by sha[["def376dc6955b339b17f0a4b840e80eb6b9c744b", 3520], ["4486f5c623705c6a14d9eeaba7d155cff30cdb43", 3510], ["2d38d3313229fdc5f8aa00052a2db21b35be3d2d", 3500], ["2c631a4b61d529ff1c0635750888f6f6d79c2703", 3500], ["8f80f24d49797595d8a18b8d4d1f59846fbf3fe1", 3490], ["1a3be9a5d01a414854ff3bfacd5257c14adeefa2", 3490], ["72a9e28a52c9629dd63dfad5f215cdc562e2fd7e", 3480]]
INFO:__main__:Unique releases:
 [3520, 3490, 3500, 3510, 3480]


In [41]:
# send these to the state machine
unique_release_versions

[3520, 3490, 3500, 3510, 3480]

## Testing

In [42]:
# sort commits by most recent date on [].commit.author.date
asset_commits.sort(key=lambda x: x["commit"]["author"]["date"], reverse=True)

In [30]:
all_commits = get_commits(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME)

In [32]:
select_keys = ["sha", "commit"]

# filter by select_keys
all_commits = [{k: v for k, v in x.items() if k in select_keys} for x in all_commits]

In [38]:
df = pd.DataFrame([flatten_json(commit) for commit in all_commits])[["sha", "commit.author.date"]]

In [39]:
df

Unnamed: 0,sha,commit.author.date
0,38398a75e9762ff070d8e9bd714d074332646cd7,2023-04-17T16:03:52Z
1,def376dc6955b339b17f0a4b840e80eb6b9c744b,2023-04-17T16:01:01Z
2,5f2c562056f8ffa89aeea0631f2a52300ee0de17,2023-01-13T10:04:48Z
3,4b8432c7d56121c84d6ef1d75a1c7185c628c13d,2023-01-12T14:47:00Z
4,4486f5c623705c6a14d9eeaba7d155cff30cdb43,2023-01-12T14:36:43Z
5,50b790037030d958b662085c3f4cf34ba72a32ec,2022-12-14T10:02:54Z
6,36220a1c5c2d6954f4873a552544cc0e55b61d0a,2022-12-14T10:02:28Z
7,e941759874365cb152a3562c22d10847d10db326,2022-10-14T08:47:37Z
8,2d38d3313229fdc5f8aa00052a2db21b35be3d2d,2022-10-14T08:46:01Z
9,1ce31fc9e2805034578eff60a269c02176f03252,2022-10-13T13:06:12Z
