# GitHub API EDA

In [1]:
# Standard library
import json
import os
from itertools import chain, starmap
from pathlib import Path

# Third party
import pandas as pd
import requests
from dotenv import find_dotenv, load_dotenv

# Locate the env file by name, walking up from the working directory, instead of
# hardcoding one machine's absolute path. Fail loudly when it cannot be found so
# a missing file does not surface later as a KeyError on os.environ.
load_dotenv(find_dotenv(".env.nmdpf", raise_error_if_not_found=True, usecwd=True))

# Pandas display options: remove every truncation limit on rendered frames
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Environment variables
# Required settings: os.environ[...] raises KeyError when a variable is not set.
AWS_REGION = os.environ["AWS_REGION"] 
# Token sent in the Authorization header by the GitHub request helpers; never print it.
GITHUB_PERSONAL_ACCESS_TOKEN = os.environ["GITHUB_PERSONAL_ACCESS_TOKEN"]
# The target repository is pinned to ANHIG/IMGTHLA; the environment lookups are left commented out.
GITHUB_REPOSITORY_OWNER = "ANHIG" # os.environ["GITHUB_REPOSITORY_OWNER"]
GITHUB_REPOSITORY_NAME = "IMGTHLA" # os.environ["GITHUB_REPOSITORY_NAME"]


In [3]:
# Parent of the current working directory; the JSON files used by this notebook are read from and written to it.
root_dir = Path('.').resolve().parent

In [4]:
def flatten_json(dictionary, sep='.', skip_fields=[]):
    """Collapse a nested JSON-like dict into a single-level dict.

    Child keys are appended to their parent key with ``sep`` and list items are
    keyed by their position, e.g. ``{"a": {"b": [1, 2]}}`` becomes
    ``{"a.b.0": 1, "a.b.1": 2}``. For a list of records, call this on each
    record before building a pandas DataFrame.

    Args:
        dictionary: Mapping to flatten. A new dict is returned.
        sep: String placed between a parent key and a child key or list index.
        skip_fields: Accepted but not used by the current implementation.

    Returns:
        A dict in which no value is a dict or a list. Empty dicts and empty
        lists produce no entries, so their keys do not appear in the result.
    """

    def expand_one_level(key, value):
        """Return the (key, value) pairs obtained by removing one nesting level."""
        if isinstance(value, dict):
            return [(key + sep + child_key, child) for child_key, child in value.items()]
        if isinstance(value, list):
            return [(key + sep + str(position), child) for position, child in enumerate(value)]
        # Atomic values pass through untouched
        return [(key, value)]

    flattened = dictionary
    has_nested_values = True
    while has_nested_values:
        # Every pass peels exactly one level off each nested value
        flattened = {
            new_key: new_value
            for key, value in flattened.items()
            for new_key, new_value in expand_one_level(key, value)
        }
        # Stop once nothing left in the mapping is a dict or a list
        has_nested_values = any(
            isinstance(value, (dict, list)) for value in flattened.values()
        )

    return flattened




In [5]:
def get_commits(owner, repo, per_page=100, timeout=30):
    """Return one page of commits for the specified GitHub repository.

    Sends a single GET to ``/repos/{owner}/{repo}/commits``. Only that one
    request is made, so results beyond the first page are not fetched.

    Args:
        owner: Repository owner (user or organisation name).
        repo: Repository name.
        per_page: Value sent as the ``per_page`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely and hang the kernel.

    Returns:
        The decoded JSON body of the response. The HTTP status is not checked,
        so an error response is returned as its decoded error payload.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """

    base_url = 'https://api.github.com'

    # Endpoint
    endpoint = f'/repos/{owner}/{repo}/commits?per_page={per_page}'

    url = base_url + endpoint

    # Headers
    headers = {
        'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',
        'Content-Type': 'application/json',
        'Accept': 'application/vnd.github.v3+json',
        'X-GitHub-Api-Version': '2022-11-28'
    }

    response = requests.get(url, headers=headers, timeout=timeout)

    return response.json()

In [6]:
def get_commit(owner, repo, commit_sha, timeout=30):
    """Return the commit for the specified repository and commit SHA.

    Sends a single GET to ``/repos/{owner}/{repo}/commits/{commit_sha}``.

    Args:
        owner: Repository owner (user or organisation name).
        repo: Repository name.
        commit_sha: SHA of the commit to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely and hang the kernel.

    Returns:
        The decoded JSON body of the response. The HTTP status is not checked,
        so an error response is returned as its decoded error payload.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """

    base_url = 'https://api.github.com'

    # Endpoint
    endpoint = f'/repos/{owner}/{repo}/commits/{commit_sha}'
    url = base_url + endpoint

    # Headers
    headers = {
        'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',
        'Content-Type': 'application/json',
        'Accept': 'application/vnd.github.v3+json',
        'X-GitHub-Api-Version': '2022-11-28'
    }

    response = requests.get(url, headers=headers, timeout=timeout)

    return response.json()

In [7]:
def get_branches(owner, repo, timeout=30):
    """Fetch one page of branches for a GitHub repository.

    Sends a single GET to ``/repos/{owner}/{repo}/branches``. Only that one
    request is made, so results beyond the first page are not fetched.

    Args:
        owner: Repository owner (user or organisation name).
        repo: Repository name.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely and hang the kernel.

    Returns:
        The decoded JSON body of the response. The HTTP status is not checked,
        so an error response is returned as its decoded error payload.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """

    base_url = 'https://api.github.com'

    # Endpoint
    endpoint = f'/repos/{owner}/{repo}/branches'
    url = base_url + endpoint

    # Headers
    headers = {
        'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',
        'Content-Type': 'application/json',
        'Accept': 'application/vnd.github.v3+json',
        'X-GitHub-Api-Version': '2022-11-28'
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    branches = response.json()

    return branches

In [8]:
def get_branch(owner, repo, branch_name, timeout=30):
    """Fetch a single branch of a GitHub repository by name.

    Sends a single GET to ``/repos/{owner}/{repo}/branches/{branch_name}``.

    Args:
        owner: Repository owner (user or organisation name).
        repo: Repository name.
        branch_name: Name of the branch to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely and hang the kernel.

    Returns:
        The decoded JSON body of the response. The HTTP status is not checked,
        so an error response is returned as its decoded error payload.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """

    base_url = 'https://api.github.com'

    # Endpoint
    endpoint = f'/repos/{owner}/{repo}/branches/{branch_name}'
    url = base_url + endpoint

    # Headers
    headers = {
        'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',
        'Content-Type': 'application/json',
        'Accept': 'application/vnd.github.v3+json',
        'X-GitHub-Api-Version': '2022-11-28'
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    branch = response.json()

    return branch

In [9]:
def fetch_pull_requests(owner, repo, timeout=30):
    """Fetch one page of pull requests, in any state, for a GitHub repository.

    Sends a single GET to ``/repos/{owner}/{repo}/pulls?state=all``. Only that
    one request is made, so results beyond the first page are not fetched.

    Args:
        owner: Repository owner (user or organisation name).
        repo: Repository name.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely and hang the kernel.

    Returns:
        The decoded JSON body when the response status is 200. For any other
        status the code is printed and an empty list is returned.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/pulls?state=all"

    # Headers
    headers = {
        'Authorization': f'token {GITHUB_PERSONAL_ACCESS_TOKEN}',
        'Content-Type': 'application/json',
        'Accept': 'application/vnd.github.v3+json',
        'X-GitHub-Api-Version': '2022-11-28'
    }
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        return []

## Commits by Branch
This data was previously downloaded as a JSON file.

In [10]:
from pygethub import list_branches, list_commits, GitHubPaginator

In [11]:
# paginator = GitHubPaginator(GITHUB_PERSONAL_ACCESS_TOKEN)

# # BRANCHES
# branch_pages = paginator.get_paginator(list_branches, owner=GITHUB_REPOSITORY_OWNER, repo=GITHUB_REPOSITORY_NAME)
# all_branches = list(branch_pages)

# # TODO 2/10/24
# # TODO extract the branch names
# branch_names = [branch["name"] for branch in all_branches]

In [12]:
# commits_by_branch = {}
# for branch in branch_names:
#     list_commits_params = {
#         "owner": GITHUB_REPOSITORY_OWNER,
#         "repo": GITHUB_REPOSITORY_NAME,
#         "sha": branch,
#     }
#     branch_commit_pages = paginator.get_paginator(
#         list_commits, 
#         **list_commits_params,
#         user_agent="nmdp-bioinformatics-gfe-db-state-builder/1.0")
#     commits_by_branch[branch] = list(branch_commit_pages)

In [13]:
# with open(root_dir / "commits-by-branch.json", "w") as f:
#     json.dump(commits_by_branch, f, indent=4)

In [14]:
# Read the previously saved branch -> commits mapping back from disk
commits_by_branch = json.loads((root_dir / "commits-by-branch.json").read_text())

### Commits by Branch EDA

Reshape the `commits_by_branch` JSON using dict and list comprehensions, so that each commit becomes one record tagged with its branch name.

Example of input structure:
```json
{
    "300": [
        {
            "sha": "ba5cb3d05c7b3ba5024cdafa192d89af186f08a9",
            "node_id": "MDY6Q29tbWl0MjQ1NDAxMzY6YmE1Y2IzZDA1YzdiM2JhNTAyNGNkYWZhMTkyZDg5YWYxODZmMDhhOQ==",
            "commit": {
                "author": {
                    "name": "anhig",
                    "email": "james.robinson@anthonynolan.org",
                    "date": "2017-06-07T13:49:28Z"
                },
                "committer": {
                    "name": "anhig",
                    "email": "james.robinson@anthonynolan.org",
                    "date": "2017-06-07T13:49:28Z"
                },
                "message": "Addition of historical WMDA files\n\nAddition of historical WMDA files",
                "tree": {
                    "sha": "9eafc92b0944c5e08f7c4b9faeb75c491d293a8a",
                    "url": "https://api.github.com/repos/ANHIG/IMGTHLA/git/trees/9eafc92b0944c5e08f7c4b9faeb75c491d293a8a"
                },
                "url": "https://api.github.com/repos/ANHIG/IMGTHLA/git/commits/ba5cb3d05c7b3ba5024cdafa192d89af186f08a9",
                "comment_count": 0,
                "verification": {
                    "verified": false,
                    "reason": "unsigned",
                    "signature": null,
                    "payload": null
                }
            }
        },
        ...
    ],
    ...
}
```
Example of output structure:
```json
[
    {
        "branch": "300",
        "sha": "ba5cb3d05c7b3ba5024cdafa192d89af186f08a9",
        "node_id": "MDY6Q29tbWl0MjQ1NDAxMzY6YmE1Y2IzZDA1YzdiM2JhNTAyNGNkYWZhMTkyZDg5YWYxODZmMDhhOQ==",
        "commit": {
            "author": {
                "name": "anhig",
                "email": "james.robinson@anthonynolan.org",
                "date": "2017-06-07T13:49:28Z"
            },
            "committer": {
                "name": "anhig",
                "email": "james.robinson@anthonynolan.org",
                "date": "2017-06-07T13:49:28Z"
            },
            "message": "Addition of historical WMDA files\n\nAddition of historical WMDA files",
            "tree": {
                "sha": "9eafc92b0944c5e08f7c4b9faeb75c491d293a8a",
                "url": "https://api.github.com/repos/ANHIG/IMGTHLA/git/trees/9eafc92b0944c5e08f7c4b9faeb75c491d293a8a"
            },
            "url": "https://api.github.com/repos/ANHIG/IMGTHLA/git/commits/ba5cb3d05c7b3ba5024cdafa192d89af186f08a9",
            "comment_count": 0,
            "verification": {
                "verified": false,
                "reason": "unsigned",
                "signature": null,
                "payload": null
            }
        }
    },
    ...
]
```

In [15]:
# One record per (branch, commit) pair: the commit's own fields plus a "branch" field.
# A "branch" key already present on the commit would take precedence.
commits_by_branch_list = list(
    chain.from_iterable(
        ({"branch": branch_name, **commit_record} for commit_record in branch_commits)
        for branch_name, branch_commits in commits_by_branch.items()
    )
)
 

In [16]:
# Flatten each branch-tagged commit record into a single-level dict
commits_by_branch_list_flat = list(map(flatten_json, commits_by_branch_list))

In [17]:
# Load the flattened records into a pandas DataFrame: one row per (branch, commit)
# record, one column per flattened key.
commits_by_branch_df = pd.DataFrame(commits_by_branch_list_flat)

The aggregation below shows that a single sha can be associated with multiple branches. This means that we cannot rely on the branch name to indicate the release version the commit was made for.

Also notice that some commits are associated with only one branch. These are the commits that were missing when `list_commits` was called without a branch, because it defaults to the master branch, which in this case is called 'Latest'.

In [18]:
# One row per sha: the unique branches it appears on, how many there are, and the
# first author date seen for it. Rows are ordered by that date.
commits_by_sha = (
    commits_by_branch_df
    .groupby("sha")
    .agg(
        branches=pd.NamedAgg(column="branch", aggfunc="unique"),
        date=pd.NamedAgg(column="commit.author.date", aggfunc="first"),
        num_branches=pd.NamedAgg(column="branch", aggfunc="nunique"),
        # html_url=("html_url", "first"),
    )
    .sort_values("date")
    .reset_index()
)

In [19]:
# commits_by_sha

In [20]:
# Look up one of the SHAs listed under "Handling Error SHAs"
commits_by_sha.loc[commits_by_sha["sha"].eq("e1cd1ec3e66f4ab2b218f6758ed315f557778655")]

Unnamed: 0,sha,branches,date,num_branches
270,e1cd1ec3e66f4ab2b218f6758ed315f557778655,[3130],2017-06-21T14:40:46Z,1


In [21]:
# Number of distinct SHAs (the groupby yields one row per sha)
len(commits_by_sha)

739

### Commits by Branch Processing
Reduce commits-by-branch JSON to a list of unique commits.

In [22]:
# Collect every distinct commit across all branches, omitting the branch information.
# Commits are dicts and therefore unhashable, so each one is serialised to a JSON
# string for de-duplication. dict.fromkeys keeps first-seen order, which makes the
# result identical on every run; a set's iteration order changes between kernels.
unique_commit_blobs = dict.fromkeys(
    json.dumps(commit)
    for commits in commits_by_branch.values()
    for commit in commits
)

# convert back to dict
unique_commits = [json.loads(blob) for blob in unique_commit_blobs]

In [23]:
# Number of commits left after de-duplication
len(unique_commits)

739

In [24]:
# Flatten every unique commit, then build a frame ordered by author date
unique_commits_flat = list(map(flatten_json, unique_commits))
unique_commits_df = (
    pd.DataFrame(unique_commits_flat)
    .sort_values(by="commit.author.date")
    .reset_index(drop=True)
)

In [25]:
# Row count of the de-duplicated frame (already sorted by author date when it was built)
len(unique_commits_df)

739

In [26]:
# unique_commits_df.columns

In [27]:
# unique_commits_df[['sha', 'commit.author.date', 'commit.message', 'html_url']]

### Handling Error SHAs
Some SHAs do not allow files to be retrieved using the GitHub REST API:
```json
[
    "8d77b3dd93959663d58ae5b626289d0746edd0e7",
    "252d7c5dc9d2f7671447fd11fe6bb004c438f34b",
    "e1cd1ec3e66f4ab2b218f6758ed315f557778655",
    "fa208da83a7f96d62c1e4efee2018074bbd805e0",
    "09ed08b9abcd97622d59ec37e31b4706dc9a9391",
    "8db938b1eb58dd8c77cba9b7524f84cf8ffe719c",
    "041318439bf0ba291f990faaa27cd6ad0a062d13",
    "ba5cb3d05c7b3ba5024cdafa192d89af186f08a9",
    "7ca4eb239a96884142d3ef0b0182d3bc84ec1bba",
    "3abe7e12dcbc3824315959af4428c53bd760c6e7",
    "c4d3f67ef7ef4b5f6571b4f1d4aa5b928d2a3d56",
    "23044ee80c27f75bb34c9f9ac689b1c68cd65914"
]
```

In this case version 300 is still missing.

## Most Recent Commit by Branch
Evaluating API responses with the objective of finding the most recent data for a given release.
- All releases are available as branches

In [28]:
# Paginator built from the GitHub personal access token
paginator = GitHubPaginator(GITHUB_PERSONAL_ACCESS_TOKEN)

### BRANCHES ###
# Request the repository's branches through the paginator; list() below consumes
# every page and collects the items into a single list.
branch_pages = paginator.get_paginator(
    list_branches, 
    owner=GITHUB_REPOSITORY_OWNER, 
    repo=GITHUB_REPOSITORY_NAME,
    user_agent="nmdp-bioinformatics-gfe-db-state-builder/1.0"
)
all_branches = list(branch_pages)

# # extract the branch names
# branch_names = [branch["name"] for branch in all_branches]

Page 1: 57 items


In [29]:
# Second-to-last entry of the branch list
all_branches[-2]

{'name': '3550',
 'commit': {'sha': 'e4fd1e39a4d9f1da8e7efe4a7f699320e287dcdb',
  'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/e4fd1e39a4d9f1da8e7efe4a7f699320e287dcdb'},
 'protected': False}

In [30]:
# Last five entries of the branch list
all_branches[-5:]

[{'name': '3520',
  'commit': {'sha': '62945381d236dcdb2770daf1fa861b216b99635c',
   'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/62945381d236dcdb2770daf1fa861b216b99635c'},
  'protected': False},
 {'name': '3530',
  'commit': {'sha': '83aa94b540407ccdfcb452c77439b86c543f849d',
   'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/83aa94b540407ccdfcb452c77439b86c543f849d'},
  'protected': False},
 {'name': '3540',
  'commit': {'sha': '7d00d7b49cbcc987e07752845bd8b14986316ab4',
   'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/7d00d7b49cbcc987e07752845bd8b14986316ab4'},
  'protected': False},
 {'name': '3550',
  'commit': {'sha': 'e4fd1e39a4d9f1da8e7efe4a7f699320e287dcdb',
   'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/e4fd1e39a4d9f1da8e7efe4a7f699320e287dcdb'},
  'protected': False},
 {'name': 'Latest',
  'commit': {'sha': 'df6ba6f80a2c5f999590f06fced6c4c4ff56b89d',
   'url': 'https://api.github.com/repos/ANHIG/IMGTHLA/commits/df6ba6f80a

In [31]:
# Persist the branch list as indented JSON in the project root
all_branches_path = root_dir / "all-branches.json"
with all_branches_path.open("w") as branches_file:
    branches_file.write(json.dumps(all_branches, indent=4))