# Github Metrics Collection
This notebook is used to collect github metrics about each RMW. 

In [41]:
import datetime
import sys
import time
import github
import keyring
import requests
import pandas as pd

In [42]:
def rate_limit_to_string(rl):
    rl_reset = datetime.datetime.fromtimestamp(rl.raw_data['core']['reset'])
    return "Rate limit: %d, remaining %d, reset %s" % (rl.raw_data['core']['limit'], rl.raw_data['core']['remaining'], rl_reset.strftime('%c'))


def extract_pull_data(pull):
    """
    Given an API pull request object extract the relevant information and put it in a dict.
    """
    ret_val = {}
    ret_val["state"] = pull.state # open or closed
    ret_val["opened_on"] = pull.created_at
    if pull.state != 'open':
        if pull.merged:
            closed = pull.merged_at
            ret_val["time_to_close"] = (closed - pull.created_at).total_seconds()
            ret_val["closed_at"] = closed
    else:
        ret_val["closed"] = None
        ret_val["time_to_close"] = None
    ret_val["commits"] = pull.commits
    ret_val["number"] = pull.number
    ret_val["title"] = pull.title
    ret_val["url"] = pull.url
    return ret_val

def extract_issue_data(issue):
    """
    Given a GitHub API issue data object cherry pick the data we want and drop it in a dictionary. 
    """
    ret_val = {}
    ret_val["closed_at"] = issue.closed_at
    ret_val["created_at"] = issue.created_at
    ret_val["labels"] = issue.labels
    ret_val["id"] = issue.id
    ret_val["number"] = issue.number
    ret_val["state"] = issue.state
    ret_val["title"] = issue.title
    ret_val["url"] = issue.url
    ret_val["last_mod"] = issue.last_modified
    ret_val["closed_at"] = None
    ret_val["comments"] = issue.comments
    if issue.state != 'open':
         ret_val["turn_round"] =  (issue.closed_at - issue.created_at).total_seconds()
    return ret_val

def extract_commit_stats(commit):
    """
    Given a github API commit object extract the relevant data and put it in a dictionary. 
    """
    ret_val = {}
    try:        
        ret_val["last_modified"] =  datetime.datetime.strptime(commit.last_modified,"%a, %d %b %Y %H:%M:%S %Z")    
        ret_val["author"] =  commit.author.login
        ret_val["sha"] =  commit.sha
        ret_val["changes"] =  commit.stats.total
        ret_val["commiter"] =  commit.committer.login
    except:
        return ret_val
    return ret_val


In [43]:
def pulldown_repo_stats(repository,key,date_cutoff):
    """
    Given a Github repo name and a Github API key, and a date cutoff will pull down all of the data 
    PR, Issue, and Commit data from the current time until the date_cuttoff and 
    return that data as a pandas dataframe
    """
    gh = github.Github(key)
    repo = gh.get_repo(repository)
    print('Fetching pull requests')
    time_to_close_pull_sum = 0
    num_closed_prs = 0
    pulls = repo.get_pulls(state='all')
    raw_list = []
        
    for i_pull, pull in enumerate(pulls):
        try:
            if pull.created_at.date() > date_cutoff:
                dp = extract_pull_data(pull)
                raw_list.append(dp)
            else: # this should be most recent first so we exceed creation time
                # bail.
                break 
        except requests.exceptions.ConnectionError:
            print('Failed reading data for pull request, continuing')
            
        if (i_pull % 100) == 0 :
            print(rate_limit_to_string(gh.get_rate_limit()))

    pr_df = pd.DataFrame(data=raw_list)

    print('Fetching issues')
    issue_list = []
    
    issues = repo.get_issues(state='all')
    for i_issue, issue in enumerate(issues):
        try:
            if issue.created_at.date() > date_cutoff:                      
                issue_data = extract_issue_data(issue)
                issue_list.append(issue_data)
            else: # Dates should be squential so if we get an old one stop processing
                break
        except requests.exceptions.ConnectionError:
            print('Failed reading data for issue, continuing')

        if (i_issue % 100) == 0:
            print(rate_limit_to_string(gh.get_rate_limit()))

    issue_df = pd.DataFrame(data=issue_list)            

    print('Fetching Commits')
    commit_list = []
    
    commits = repo.get_commits()
    for i_commit, commit in enumerate(commits):
        try:

            mod_date = datetime.datetime.strptime(commit.last_modified,"%a, %d %b %Y %H:%M:%S %Z")
            if mod_date.date()> date_cutoff:                      
                commit_data = extract_commit_stats(commit)
                commit_list.append(commit_data)
            else: # Dates should be squential so if we get an old one stop processing
                break
        except requests.exceptions.ConnectionError:
            print('Failed reading data for commit, continuing')

        if (i_commit % 100) == 0:
            print(rate_limit_to_string(gh.get_rate_limit()))

    commit_df = pd.DataFrame(data=commit_list)            

    return commit_df, issue_df, pr_df 


In [44]:
# Get github key
key = keyring.get_password('github-token', 'may-read-repositories')
if key is None:
    print('Failed to get GitHub API key')
else:
    print("Got GitHub Key")
# set the cutoff six months in the past
date_cutoff = datetime.date.today()-datetime.timedelta(days = 30*6)


Got GitHub Key


In [34]:
fast_commit, fast_issue, fast_pr = pulldown_repo_stats("eProsima/Fast-DDS",key,date_cutoff)    
fast_commit.to_csv("./data/git_metrics/fast_commit.csv")
fast_issue.to_csv("./data/git_metrics/fast_issue.csv")
fast_pr.to_csv("./data/git_metrics/fast_pr.csv")
fast_pr.head()

Fetching pull requests
Rate limit: 5000, remaining 4735, reset Sat Oct 17 15:33:57 2020
Rate limit: 5000, remaining 4632, reset Sat Oct 17 15:33:57 2020
Rate limit: 5000, remaining 4529, reset Sat Oct 17 15:33:57 2020
Fetching issues
Rate limit: 5000, remaining 4471, reset Sat Oct 17 15:33:57 2020
Rate limit: 5000, remaining 4468, reset Sat Oct 17 15:33:57 2020
Rate limit: 5000, remaining 4465, reset Sat Oct 17 15:33:57 2020
Rate limit: 5000, remaining 4461, reset Sat Oct 17 15:33:57 2020
Fetching Commits
Rate limit: 5000, remaining 4458, reset Sat Oct 17 15:33:57 2020
Rate limit: 5000, remaining 4355, reset Sat Oct 17 15:33:57 2020


Unnamed: 0,state,opened_on,closed,time_to_close,commits,number,title,url,closed_at
0,open,2020-10-17 08:13:08,,,1,1500,Support ack messages with garbage 1's [9633],https://api.github.com/repos/eProsima/Fast-DDS...,NaT
1,closed,2020-10-16 14:59:11,,18243.0,4,1499,[Discovery Server] Send new updates,https://api.github.com/repos/eProsima/Fast-DDS...,2020-10-16 20:03:14
2,open,2020-10-16 11:51:04,,,12,1498,Sharing payloads on intra-process [9631],https://api.github.com/repos/eProsima/Fast-DDS...,NaT
3,open,2020-10-16 10:46:56,,,4,1497,Fix transient local retransmission after parti...,https://api.github.com/repos/eProsima/Fast-DDS...,NaT
4,open,2020-10-16 09:04:34,,,6,1496,Keep pending security messages directed for ea...,https://api.github.com/repos/eProsima/Fast-DDS...,NaT


In [46]:
fast_rmw_commit, fast_rmw_issue, fast_rmw_pr = pulldown_repo_stats("ros2/rmw_fastrtps",key,date_cutoff)    
fast_rmw_commit.to_csv("./data/git_metrics/fast_rmw_commit.csv")
fast_rmw_issue.to_csv("./data/git_metrics/fast_rmw_issue.csv")
fast_rmw_pr.to_csv("./data/git_metrics/fast_rmw_pr.csv")
fast_rmw_pr.head()

Fetching pull requests
Rate limit: 5000, remaining 4785, reset Mon Oct 19 17:23:41 2020
Fetching issues
Rate limit: 5000, remaining 4704, reset Mon Oct 19 17:23:41 2020
Fetching Commits
Rate limit: 5000, remaining 4699, reset Mon Oct 19 17:23:41 2020
Rate limit: 5000, remaining 4596, reset Mon Oct 19 17:23:41 2020


Unnamed: 0,state,opened_on,closed,time_to_close,commits,number,title,url,closed_at
0,open,2020-10-13 12:47:39,,,4,466,Update README to clarify the use of environmen...,https://api.github.com/repos/ros2/rmw_fastrtps...,NaT
1,closed,2020-10-09 07:19:39,,541126.0,28,462,[Backport Foxy] Included improvements to incre...,https://api.github.com/repos/ros2/rmw_fastrtps...,2020-10-15 13:38:25
2,closed,2020-10-06 23:43:00,,60895.0,1,459,Update the package.xml files with the latest O...,https://api.github.com/repos/ros2/rmw_fastrtps...,2020-10-07 16:37:55
3,closed,2020-10-01 19:02:15,,6291.0,1,458,Ensure rmw_destroy_node() completes despite ru...,https://api.github.com/repos/ros2/rmw_fastrtps...,2020-10-01 20:47:06
4,closed,2020-09-30 22:08:53,,6296.0,3,457,Handle too large QoS queue depths.,https://api.github.com/repos/ros2/rmw_fastrtps...,2020-09-30 23:53:49


In [35]:
cyclone_commit, cyclone_issue, cyclone_pr = pulldown_repo_stats("eclipse-cyclonedds/cyclonedds",key,date_cutoff)    
cyclone_commit.to_csv("./data/git_metrics/cyclone_commit.csv")
cyclone_issue.to_csv("./data/git_metrics/cyclone_issue.csv")
cyclone_pr.to_csv("./data/git_metrics/cyclone_pr.csv")
cyclone_pr.head()

Fetching pull requests
Rate limit: 5000, remaining 4270, reset Sat Oct 17 15:33:57 2020
Fetching issues
Rate limit: 5000, remaining 4168, reset Sat Oct 17 15:33:57 2020
Rate limit: 5000, remaining 4165, reset Sat Oct 17 15:33:57 2020
Fetching Commits
Rate limit: 5000, remaining 4162, reset Sat Oct 17 15:33:57 2020
Rate limit: 5000, remaining 4059, reset Sat Oct 17 15:33:57 2020
Rate limit: 5000, remaining 3956, reset Sat Oct 17 15:33:57 2020


Unnamed: 0,state,opened_on,closed,time_to_close,commits,number,title,url,closed_at
0,open,2020-10-16 13:37:45,,,4,621,Adding the Cyclone Roadmap,https://api.github.com/repos/eclipse-cyclonedd...,NaT
1,closed,2020-10-15 11:58:50,,75993.0,1,620,Fix detection of reopened modules,https://api.github.com/repos/eclipse-cyclonedd...,2020-10-16 09:05:23
2,closed,2020-10-13 13:28:45,,165706.0,2,618,Idl const fixes,https://api.github.com/repos/eclipse-cyclonedd...,2020-10-15 11:30:31
3,open,2020-10-08 08:30:57,,,2,616,Remove thread pool implementation,https://api.github.com/repos/eclipse-cyclonedd...,NaT
4,open,2020-10-07 15:13:07,,,33,615,Topic Discovery,https://api.github.com/repos/eclipse-cyclonedd...,NaT


In [40]:
cyclone_rmw_commit, cyclone_rmw_issue, cyclone_rmw_pr = pulldown_repo_stats("ros2/rmw_cyclonedds",key,date_cutoff)    
cyclone_rmw_commit.to_csv("./data/git_metrics/cyclone_rmw_commit.csv")
cyclone_rmw_issue.to_csv("./data/git_metrics/cyclone_rmw_issue.csv")
cyclone_rmw_pr.to_csv("./data/git_metrics/cyclone_rmw_pr.csv")
cyclone_rmw_pr.head()

Fetching pull requests
Rate limit: 5000, remaining 3744, reset Sat Oct 17 15:33:57 2020
Fetching issues
Rate limit: 5000, remaining 3668, reset Sat Oct 17 15:33:57 2020
Fetching Commits
Rate limit: 5000, remaining 3663, reset Sat Oct 17 15:33:57 2020
Rate limit: 5000, remaining 3561, reset Sat Oct 17 15:33:57 2020


Unnamed: 0,state,opened_on,time_to_close,closed_at,commits,number,title,url,closed
0,closed,2020-10-09 07:22:16,541385.0,2020-10-15 13:45:21,25,255,[Backport Foxy] Included improvements to incre...,https://api.github.com/repos/ros2/rmw_cycloned...,
1,closed,2020-10-07 13:32:40,25536.0,2020-10-07 20:38:16,1,254,Update maintainers,https://api.github.com/repos/ros2/rmw_cycloned...,
2,open,2020-10-07 08:30:29,,NaT,1,253,Fix format string incorrect use of %ld/%d inst...,https://api.github.com/repos/ros2/rmw_cycloned...,
3,closed,2020-10-01 23:08:28,409249.0,2020-10-06 16:49:17,1,252,[foxy] rmw_destroy_node must remove node from ...,https://api.github.com/repos/ros2/rmw_cycloned...,
4,closed,2020-09-28 15:08:45,62369.0,2020-09-29 08:28:14,1,250,Return RMW_RET_UNSUPPORTED in rmw_get_serializ...,https://api.github.com/repos/ros2/rmw_cycloned...,


In [27]:
cyclone_rmw_pr.head()