# Automated Data Science Asset Register

[Code adapted from https://github.com/moj-analytical-services/data-science-assets]

This notebook uses the GitHub API to pull information from the repos listed in `assets.yaml`. \
For each repo, the first `yaml` block in its README is extracted and used to populate a dataframe. \
The GOV.UK Notify service is then used to send reminder emails for projects past their review date. \
The dataframe is formatted into HTML and saved. (The ambition is to automate adding this to SharePoint, subject to permissions being granted.)

In [1]:
# Load packages
from github import Github
import yaml
import re
import pandas as pd

In [2]:
## Function takes markdown content (e.g. from a GitHub Readme) and returns the contents of the first yaml code block.

def extract_yaml_from_md(md_content):
    """Parse the first fenced ``yaml`` code block found in markdown text.

    Args:
        md_content: Markdown source as a string (e.g. a GitHub README).

    Returns:
        Whatever ``yaml.safe_load`` produces for the text of the first
        block opened with a yaml fence, or ``None`` when there is no such
        block or its contents are not valid YAML (the parse error is printed).
    """
    # Non-greedy capture so only the FIRST fenced block is taken; DOTALL lets
    # the capture run across line breaks.
    first_block = re.search(r'```yaml(.*?)```', md_content, flags=re.DOTALL)
    if first_block is None:
        return None

    block_text = first_block.group(1).strip()
    try:
        return yaml.safe_load(block_text)
    except yaml.YAMLError as e:
        print("Error parsing YAML content:", e)
        return None

In [None]:
# Read API credentials and organisation names from the local .env file

import dotenv

secrets = dotenv.dotenv_values("../.env")
user_agent = secrets["AGENT"]
pat = secrets["PAT"] # TODO: Implement OAuth
org_nm1 = secrets["ORG_NM1"]
org_nm2 = secrets["ORG_NM2"]

github_api_token = pat
# NOTE: the token is deliberately NOT left as the cell's final expression.
# Echoing it would write the secret into the notebook's saved output.


## Github scrape

In [None]:
# GitHub client authenticated with the token read from .env
g = Github(github_api_token)

# Load in list of repos with assets and their organisation.
# The encoding is stated explicitly so the file is decoded the same way on
# every platform rather than depending on the locale default.
with open('../data/assets.yaml', 'r', encoding='utf-8') as file:
    data_list = yaml.safe_load(file)

# Each entry supplies a 'name' and an 'organisation'; flatten them to
# (repo name, organisation name) tuples for the scraping loop.
repo_names = [(item['name'], item['organisation']) for item in data_list]

print(repo_names)

In [None]:
# Loop through the asset repositories and scrape info from Github.
# Each repo contributes one dict to table_data holding its name and URL, the
# fields of the first YAML block in its README, and its last commit date.
# README and commit lookups are best-effort: failures are printed and the
# repo is still added with whatever was collected.

table_data = []
org_cache = {}  # organisation name -> organisation object, fetched once each

for repo_name, org_name in repo_names:

    # Several repos usually share an organisation; avoid re-requesting it
    if org_name not in org_cache:
        org_cache[org_name] = g.get_organization(org_name)
    org = org_cache[org_name]
    repo = org.get_repo(repo_name)

    # Get repo name and url and initialise data object for this repo
    data = {
        'Name': repo.name,
        'Github': repo.html_url,
    }

    # Get YAML contents from README (note Name (repo name) is overwritten by the name in the yaml if it exists)
    try:
        readme = repo.get_readme()
        readme_content = readme.decoded_content.decode("utf-8")

        # Extract yaml from the readme using function. It returns None when
        # there is no (valid) yaml block, so check before merging instead of
        # relying on dict.update() raising a TypeError.
        readme_yaml = extract_yaml_from_md(readme_content)
        if isinstance(readme_yaml, dict):
            data.update(readme_yaml)
        else:
            print(f"No YAML mapping found in README for {repo_name}")

    except Exception as e:  # e.g. the repo has no readme
        print(f"Error retrieving YAML for {repo_name}: {e}")

    # Get the date of the last commit and append to data
    try:
        commits = repo.get_commits()
        last_commit_date = commits[0].commit.committer.date

        # Convert the last commit date to a string format, e.g., YYYY-MM-DD
        formatted_date = last_commit_date.strftime('%Y-%m-%d')

        # Process the last commit date
        data['Last Commit Date'] = formatted_date

        print(f"The last commit date for {repo_name} is: {last_commit_date}")
    except Exception as e:
        print(f"Error retrieving commits for {repo_name}: {e}")

    print(data)
    table_data.append(data)

In [None]:
# Normalise every key to lower case so field names match regardless of the
# capitalisation used when they were scraped.
table_data = [
    dict((key.lower(), value) for key, value in row.items())
    for row in table_data
]

In [None]:
# Convert dictionary data to a pandas df
df = pd.DataFrame(table_data)

# Get next review date as datetime (expected format is e.g. "Jan-25").
# errors='coerce' turns a value that does not match the format into NaT, so
# one malformed README entry no longer aborts the whole run.
df['review_date'] = pd.to_datetime(df['next review date'], format='%b-%y', errors='coerce')

# Report values that were present but could not be parsed, so they get fixed
# at source rather than silently ignored.
unparsed_review_dates = df['review_date'].isna() & df['next review date'].notna()
if unparsed_review_dates.any():
    print(
        "WARNING: could not parse 'next review date' for:",
        df.loc[unparsed_review_dates, 'name'].tolist(),
    )

In [None]:
# Sanity check on the scraped schema: 17 columns are expected at this point.
# Any other count is reported with a warning rather than raised.
expected_column_count = 17
if df.shape[1] != expected_column_count:
    print("WARNING: Unexpected or missing columns")

In [None]:
# Show the scraped dataframe as the cell output, before columns are renamed
df

In [None]:
# Rename and re-order columns for final output table.
# Keys are the lower-cased scraped column names, values are the display
# headings; the dict's order is the column order of the final table.
new_columns = {
    'name': 'Name',
    'category': 'Category',
    'description': 'Description',
    'impact': 'Impact',
    'g6 lead': 'G6 lead',
    'sro': 'SRO',
    'technical lead': 'Technical lead',
    'business lead': 'Business lead',
    'last review date': 'Last review date',
    'next review date': 'Next review date',
    'outage impact': 'Outage impact',
    'maintenance (fte)': 'Maintenance (FTE)',
    'documentation': 'Documentation',
    'github': 'Github',
    'last commit date': 'Last Commit Date',
}

# Set the DataFrame to the new order and rename.
# reindex is used instead of df[[...]] so that a field missing from every
# README becomes an empty column rather than raising KeyError. Columns not
# listed above are dropped, as before.
df = df.reindex(columns=list(new_columns)).rename(columns=new_columns)


# # Add hyperlinks
# df['Github'] = df['Github'].apply(lambda x: f'<a href="{x}">Repo</a>')
# df['Documentation'] = df['Documentation'].apply(lambda x: f'<a href="{x}">Link</a>')

In [None]:
# Show the renamed and re-ordered table as the cell output
df