# Github Metrics Collection
This notebook is used to collect github metrics about each RMW. 

In [6]:
import datetime
import sys
import time
import github
import keyring
import requests
import pandas as pd

In [7]:
def rate_limit_to_string(rl):
    rl_reset = datetime.datetime.fromtimestamp(rl.raw_data['core']['reset'])
    return "Rate limit: %d, remaining %d, reset %s" % (rl.raw_data['core']['limit'], rl.raw_data['core']['remaining'], rl_reset.strftime('%c'))


def extract_pull_data(pull):
    """
    Given an API pull request object extract the relevant information and put it in a dict.
    """
    ret_val = {}
    ret_val["state"] = pull.state # open or closed
    ret_val["opened_on"] = pull.created_at
    if pull.state != 'open':
        if pull.merged:
            closed = pull.merged_at
            ret_val["time_to_close"] = (closed - pull.created_at).total_seconds()
            ret_val["closed_at"] = closed
    else:
        ret_val["closed"] = None
        ret_val["time_to_close"] = None
    ret_val["commits"] = pull.commits
    ret_val["number"] = pull.number
    ret_val["title"] = pull.title
    ret_val["url"] = pull.url
    return ret_val

def extract_issue_data(issue):
    """
    Given a GitHub API issue data object cherry pick the data we want and drop it in a dictionary. 
    """
    ret_val = {}
    ret_val["closed_at"] = issue.closed_at
    ret_val["created_at"] = issue.created_at
    ret_val["labels"] = issue.labels
    ret_val["id"] = issue.id
    ret_val["number"] = issue.number
    ret_val["state"] = issue.state
    ret_val["title"] = issue.title
    ret_val["url"] = issue.url
    ret_val["last_mod"] = issue.last_modified
    ret_val["closed_at"] = None
    ret_val["comments"] = issue.comments
    if issue.state != 'open':
         ret_val["turn_round"] =  (issue.closed_at - issue.created_at).total_seconds()
    return ret_val

def extract_commit_stats(commit):
    """
    Given a github API commit object extract the relevant data and put it in a dictionary. 
    """
    ret_val = {}
    try:        
        ret_val["last_modified"] =  datetime.datetime.strptime(commit.last_modified,"%a, %d %b %Y %H:%M:%S %Z")    
        ret_val["author"] =  commit.author.login
        ret_val["sha"] =  commit.sha
        ret_val["changes"] =  commit.stats.total
        ret_val["commiter"] =  commit.committer.login
    except:
        return ret_val
    return ret_val


In [8]:
def pulldown_repo_stats(repository,key,date_cutoff):
    """
    Given a Github repo name and a Github API key, and a date cutoff will pull down all of the data 
    PR, Issue, and Commit data from the current time until the date_cuttoff and 
    return that data as a pandas dataframe
    """
    gh = github.Github(key)
    repo = gh.get_repo(repository)
    print('Fetching pull requests')
    time_to_close_pull_sum = 0
    num_closed_prs = 0
    pulls = repo.get_pulls(state='all')
    raw_list = []
        
    for i_pull, pull in enumerate(pulls):
        try:
            if pull.created_at.date() > date_cutoff:
                dp = extract_pull_data(pull)
                raw_list.append(dp)
            else: # this should be most recent first so we exceed creation time
                # bail.
                break 
        except requests.exceptions.ConnectionError:
            print('Failed reading data for pull request, continuing')
            
        if (i_pull % 100) == 0 :
            print(rate_limit_to_string(gh.get_rate_limit()))

    pr_df = pd.DataFrame(data=raw_list)

    print('Fetching issues')
    issue_list = []
    
    issues = repo.get_issues(state='all')
    for i_issue, issue in enumerate(issues):
        try:
            if issue.created_at.date() > date_cutoff:                      
                issue_data = extract_issue_data(issue)
                issue_list.append(issue_data)
            else: # Dates should be squential so if we get an old one stop processing
                break
        except requests.exceptions.ConnectionError:
            print('Failed reading data for issue, continuing')

        if (i_issue % 100) == 0:
            print(rate_limit_to_string(gh.get_rate_limit()))

    issue_df = pd.DataFrame(data=issue_list)            

    print('Fetching Commits')
    commit_list = []
    
    commits = repo.get_commits()
    for i_commit, commit in enumerate(commits):
        try:

            mod_date = datetime.datetime.strptime(commit.last_modified,"%a, %d %b %Y %H:%M:%S %Z")
            if mod_date.date()> date_cutoff:                      
                commit_data = extract_commit_stats(commit)
                commit_list.append(commit_data)
            else: # Dates should be squential so if we get an old one stop processing
                break
        except requests.exceptions.ConnectionError:
            print('Failed reading data for commit, continuing')

        if (i_commit % 100) == 0:
            print(rate_limit_to_string(gh.get_rate_limit()))

    commit_df = pd.DataFrame(data=commit_list)            

    return commit_df, issue_df, pr_df 


In [9]:
# Get github key
key = keyring.get_password('github-token', 'may-read-repositories')
if key is None:
    print('Failed to get GitHub API key')
else:
    print("Got GitHub Key")
# set the cutoff six months in the past
date_cutoff = datetime.date.today()-datetime.timedelta(days = 30*6)


Got GitHub Key


In [10]:
fast_commit, fast_issue, fast_pr = pulldown_repo_stats("eProsima/Fast-DDS",key,date_cutoff)    
fast_commit.to_csv("./data/git_metrics/fast_commit.csv")
fast_issue.to_csv("./data/git_metrics/fast_issue.csv")
fast_pr.to_csv("./data/git_metrics/fast_pr.csv")
fast_pr.head()

Fetching pull requests
Rate limit: 5000, remaining 4892, reset Tue Aug 10 18:54:59 2021
Rate limit: 5000, remaining 4789, reset Tue Aug 10 18:54:59 2021
Rate limit: 5000, remaining 4686, reset Tue Aug 10 18:54:59 2021
Fetching issues
Rate limit: 5000, remaining 4625, reset Tue Aug 10 18:54:59 2021
Rate limit: 5000, remaining 4622, reset Tue Aug 10 18:54:59 2021
Rate limit: 5000, remaining 4619, reset Tue Aug 10 18:54:59 2021
Rate limit: 5000, remaining 4615, reset Tue Aug 10 18:54:59 2021
Fetching Commits
Rate limit: 5000, remaining 4611, reset Tue Aug 10 18:54:59 2021
Rate limit: 5000, remaining 4508, reset Tue Aug 10 18:54:59 2021


Unnamed: 0,state,opened_on,closed,time_to_close,commits,number,title,url,closed_at
0,open,2021-08-10 18:58:27,,,8,2138,Propagate servers list updates to PDPServer an...,https://api.github.com/repos/eProsima/Fast-DDS...,NaT
1,closed,2021-08-10 08:11:05,,5271.0,2,2136,[12357] Fix doxygen documentation,https://api.github.com/repos/eProsima/Fast-DDS...,2021-08-10 09:38:56
2,closed,2021-08-06 11:11:01,,337444.0,4,2131,WireProtocolConfigQos update through DomainPar...,https://api.github.com/repos/eProsima/Fast-DDS...,2021-08-10 08:55:05
3,closed,2021-08-06 10:47:52,,358799.0,6,2130,[12196] Add UserDataQoS blackbox test,https://api.github.com/repos/eProsima/Fast-DDS...,2021-08-10 14:27:51
4,open,2021-08-06 08:09:11,,,7,2128,Waitset implementation,https://api.github.com/repos/eProsima/Fast-DDS...,NaT


In [11]:
fast_rmw_commit, fast_rmw_issue, fast_rmw_pr = pulldown_repo_stats("ros2/rmw_fastrtps",key,date_cutoff)    
fast_rmw_commit.to_csv("./data/git_metrics/fast_rmw_commit.csv")
fast_rmw_issue.to_csv("./data/git_metrics/fast_rmw_issue.csv")
fast_rmw_pr.to_csv("./data/git_metrics/fast_rmw_pr.csv")
fast_rmw_pr.head()

Fetching pull requests
Rate limit: 5000, remaining 4454, reset Tue Aug 10 18:54:59 2021
Fetching issues
Rate limit: 5000, remaining 4425, reset Tue Aug 10 18:54:59 2021
Fetching Commits
Rate limit: 5000, remaining 4422, reset Tue Aug 10 18:54:59 2021


Unnamed: 0,state,opened_on,time_to_close,closed_at,commits,number,title,url,closed
0,closed,2021-07-27 05:30:02,124272.0,2021-07-28 16:01:14,2,550,Fix type size alignment,https://api.github.com/repos/ros2/rmw_fastrtps...,
1,closed,2021-07-12 20:18:28,80283.0,2021-07-13 18:36:31,1,549,[galactic backport] Support for SubscriptionOp...,https://api.github.com/repos/ros2/rmw_fastrtps...,
2,closed,2021-07-12 20:15:46,80404.0,2021-07-13 18:35:50,1,548,"Revert ""Support for SubscriptionOptions::ignor...",https://api.github.com/repos/ros2/rmw_fastrtps...,
3,closed,2021-07-09 08:39:40,2698768.0,2021-08-09 14:19:08,5,547,[Galactic] Loan messages implementation,https://api.github.com/repos/ros2/rmw_fastrtps...,
4,closed,2021-07-06 18:43:16,153188.0,2021-07-08 13:16:24,1,546,Pass the CRL down to Fast-DDS if available.,https://api.github.com/repos/ros2/rmw_fastrtps...,


In [12]:
cyclone_commit, cyclone_issue, cyclone_pr = pulldown_repo_stats("eclipse-cyclonedds/cyclonedds",key,date_cutoff)    
cyclone_commit.to_csv("./data/git_metrics/cyclone_commit.csv")
cyclone_issue.to_csv("./data/git_metrics/cyclone_issue.csv")
cyclone_pr.to_csv("./data/git_metrics/cyclone_pr.csv")
cyclone_pr.head()

Fetching pull requests
Rate limit: 5000, remaining 4389, reset Tue Aug 10 18:54:59 2021
Rate limit: 5000, remaining 4286, reset Tue Aug 10 18:54:59 2021
Fetching issues
Rate limit: 5000, remaining 4227, reset Tue Aug 10 18:54:59 2021
Rate limit: 5000, remaining 4224, reset Tue Aug 10 18:54:59 2021
Rate limit: 5000, remaining 4221, reset Tue Aug 10 18:54:59 2021
Fetching Commits
Rate limit: 5000, remaining 4218, reset Tue Aug 10 18:54:59 2021
Rate limit: 5000, remaining 4115, reset Tue Aug 10 18:54:59 2021
Rate limit: 5000, remaining 4012, reset Tue Aug 10 18:54:59 2021
Rate limit: 5000, remaining 3908, reset Tue Aug 10 18:54:59 2021


Unnamed: 0,state,opened_on,time_to_close,closed_at,commits,number,title,url,closed
0,closed,2021-08-08 15:46:21,53047.0,2021-08-09 06:30:28,1,906,Make sure to use buffer with truncated name fo...,https://api.github.com/repos/eclipse-cyclonedd...,
1,open,2021-08-05 13:06:49,,NaT,2,904,Implementation of the XTypes type system,https://api.github.com/repos/eclipse-cyclonedd...,
2,open,2021-08-05 10:24:31,,NaT,1,903,XCDR2 support for idlc and cdrstream serializer,https://api.github.com/repos/eclipse-cyclonedd...,
3,open,2021-08-04 11:17:07,,NaT,1,902,Possibility of testing byteswapped input in no...,https://api.github.com/repos/eclipse-cyclonedd...,
4,open,2021-08-04 11:13:06,,NaT,1,901,OpenDDS compatibility,https://api.github.com/repos/eclipse-cyclonedd...,


In [13]:
cyclone_rmw_commit, cyclone_rmw_issue, cyclone_rmw_pr = pulldown_repo_stats("ros2/rmw_cyclonedds",key,date_cutoff)    
cyclone_rmw_commit.to_csv("./data/git_metrics/cyclone_rmw_commit.csv")
cyclone_rmw_issue.to_csv("./data/git_metrics/cyclone_rmw_issue.csv")
cyclone_rmw_pr.to_csv("./data/git_metrics/cyclone_rmw_pr.csv")
cyclone_rmw_pr.head()

Fetching pull requests
Rate limit: 5000, remaining 3813, reset Tue Aug 10 18:54:59 2021
Fetching issues
Rate limit: 5000, remaining 3782, reset Tue Aug 10 18:54:59 2021
Fetching Commits
Rate limit: 5000, remaining 3779, reset Tue Aug 10 18:54:59 2021


Unnamed: 0,state,opened_on,closed,time_to_close,commits,number,title,url,closed_at
0,open,2021-08-02 23:26:19,,,2,329,"Add pub/sub init, publish and take instrumenta...",https://api.github.com/repos/ros2/rmw_cycloned...,NaT
1,open,2021-07-29 12:34:33,,,1,327,add -latomic in CMakelist.txt,https://api.github.com/repos/ros2/rmw_cycloned...,NaT
2,closed,2021-07-06 18:43:03,,153171.0,1,325,Pass the CRL down to CycloneDDS if it exists.,https://api.github.com/repos/ros2/rmw_cycloned...,2021-07-08 13:15:54
3,closed,2021-06-28 23:22:35,,,1,324,Add publishing instrumentation using tracetools,https://api.github.com/repos/ros2/rmw_cycloned...,NaT
4,closed,2021-06-24 18:52:19,,325225.0,1,323,Use the new rmw_dds_common::get_security_files...,https://api.github.com/repos/ros2/rmw_cycloned...,2021-06-28 13:12:44


In [14]:
cyclone_rmw_pr.head()

Unnamed: 0,state,opened_on,closed,time_to_close,commits,number,title,url,closed_at
0,open,2021-08-02 23:26:19,,,2,329,"Add pub/sub init, publish and take instrumenta...",https://api.github.com/repos/ros2/rmw_cycloned...,NaT
1,open,2021-07-29 12:34:33,,,1,327,add -latomic in CMakelist.txt,https://api.github.com/repos/ros2/rmw_cycloned...,NaT
2,closed,2021-07-06 18:43:03,,153171.0,1,325,Pass the CRL down to CycloneDDS if it exists.,https://api.github.com/repos/ros2/rmw_cycloned...,2021-07-08 13:15:54
3,closed,2021-06-28 23:22:35,,,1,324,Add publishing instrumentation using tracetools,https://api.github.com/repos/ros2/rmw_cycloned...,NaT
4,closed,2021-06-24 18:52:19,,325225.0,1,323,Use the new rmw_dds_common::get_security_files...,https://api.github.com/repos/ros2/rmw_cycloned...,2021-06-28 13:12:44
