In [12]:
import os
import re

import pandas as pd
import requests
from github import Github
from tqdm import tqdm

In [13]:
# Authenticate with a personal access token read from the GITHUB_TOKEN
# environment variable so the secret never has to be pasted into (or committed
# with) the notebook. When the variable is unset this falls back to the
# previous empty-string value.
g = Github(os.environ.get("GITHUB_TOKEN", ""))
org = g.get_organization("Rdatatable")
repo = org.get_repo("data.table")

In [14]:
# Check the API quota before starting the long extraction loops.
# PyGithub reports this as (remaining requests, request limit).
g.rate_limiting

(646, 5000)

### Issues Extraction

##### Required Filters
Issues Active Metric
- By actor (submitter, commenter, closer)
- By groups of actors (employer, gender, for each of the actors)
- By ratio of active issues over total number of issues during that period
- By start and finish date of the period during which issues are considered
- By total number of active issues during the period

Issues Age Metric
- Module or working group
- Tags/labels on issue

Issues New Metric
- By actor (submitter, commenter, closer)
- By groups of actors (employer, gender, for each of the actors)

Issues Resolution Duration Metric
- By open time
- By closed time
- By actor (submitter, commenter, closer)
- By groups of actors (employer, gender, for each of the actors)

Issues Closed Metric
- By actor (submitter, commenter, closer)
- By groups of actors (employer, gender, for each of the actors)

Response Time Metric

In [15]:
# Accumulators filled by the extraction loop: one dict per issue and one dict
# per issue comment.
extract_issues_list, extract_issues_comments_list = [], []

# state="all" asks for issues regardless of whether they are open or closed.
issues = repo.get_issues(state="all")

In [16]:
for issue in tqdm(issues, desc="Processing issues", unit="issue"):
    # Entries that carry pull-request data are skipped here; only plain issues
    # are recorded by this loop.
    if issue.pull_request:
        continue

    issue_record = {
        "id": issue.id,
        "title": issue.title,
        "created_at": issue.created_at,
        "closed_at": issue.closed_at,
        "created_by": issue.user.url,
        # NOTE(review): closed_by and labels are stored as PyGithub objects
        # (not URLs/strings like created_by) and are only stringified later.
        "closed_by": issue.closed_by,
        "labels": issue.labels,
    }
    extract_issues_list.append(issue_record)

    # One row per comment, linked back to its issue through "issue_id".
    for comment in issue.get_comments():
        comment_record = {
            "issue_id": issue.id,
            "id": comment.id,
            "created_by": comment.user,
            "created_at": comment.created_at,
            "body": comment.body,
        }
        extract_issues_comments_list.append(comment_record)

Processing issues: 819issue [05:13,  4.50issue/s]Request GET /repos/Rdatatable/data.table/issues/5785/comments failed with 403: Forbidden
Setting next backoff to 498.732561s
Processing issues: 4711issue [49:35,  1.24issue/s]Request GET /repos/Rdatatable/data.table/issues/1867 failed with 403: Forbidden
Setting next backoff to 1438.553514s
Processing issues: 6578issue [1:41:25,  1.08issue/s] 


In [26]:
# Build one frame per record list and stringify every value so the frames hold
# plain text only (timestamps, user objects and label lists included).
issues_df = pd.DataFrame(extract_issues_list).astype(str)
issues_comments_df = pd.DataFrame(extract_issues_comments_list).astype(str)

In [27]:
def clean_text_columns(df):
    """Strip every character outside printable ASCII from the text columns of ``df``.

    Each cell of every ``object``/``string`` column is converted with ``str()``
    and all characters outside the range ``\\x20``-``\\x7E`` are removed. This
    drops control characters, but also newlines, tabs and any non-ASCII text
    (accents, emoji, ...).

    Missing scalar values (``None``, ``NaN``, ``pd.NA``, ``NaT``) are left
    untouched. List-like or other non-scalar cells are stringified and cleaned
    like any other value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment or chaining.
    """
    # Compile once instead of re-resolving the pattern for every cell.
    non_printable = re.compile(r'[^\x20-\x7E]')

    def _clean(value):
        # pd.isna() returns an array for list-like cells, which cannot be used
        # as a condition; only scalars can be "missing" in the intended sense.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return value
        return non_printable.sub('', str(value))

    for col in df.select_dtypes(include=['object', 'string']):
        df[col] = df[col].apply(_clean)
    return df

In [28]:
# Remove characters outside printable ASCII from both issue frames before export.
issues_df = issues_df.pipe(clean_text_columns)
issues_comments_df = issues_comments_df.pipe(clean_text_columns)

In [29]:
# Write the issue data to Excel workbooks under datasets/ using openpyxl.
issues_df.to_excel(excel_writer='datasets/issues.xlsx', engine='openpyxl')
issues_comments_df.to_excel(excel_writer='datasets/issues_comments.xlsx', engine='openpyxl')

### Pull Requests Extraction
##### Requested Filters:

Change Request Closure Ratio
- Date Range (Created and Closed)
- Automated Responses
- Labels
- Type of Change Request
- Type of Close

Change Request Reviews
- Number of contributors doing reviews
- Automated Responses
- Type of Change Request
- Date Range (Created and Closed)

Change Request Commits
- Lines of code added per commit
- Lines of code removed per commit
- Change in code over the period of time
- Number of files changed per commit
- Contributors per commit

Change Request
- Date Range (Created and Closed)
- Actors
- Group of Actors
- Status

Change Requests Duration
- Actors
- Groups of Actors
- Period

Change Requests Accepted
- Actors
- Groups of Actors
- Period

Change Requests Rejected
- Actors
- Groups of Actors
- Period

In [4]:
# Accumulators filled by the extraction loop: one dict per pull request and one
# dict per pull-request comment.
extract_pulls_list, extract_pulls_comments_list = [], []

# state="all" asks for pull requests regardless of whether they are open or closed.
pulls = repo.get_pulls(state="all")

In [5]:
for pull in tqdm(pulls, desc="Processing Pull Requests", unit="PR"):
    pull_record = {
        "id": pull.id,
        "title": pull.title,
        "created_at": pull.created_at,
        "closed_at": pull.closed_at,
        "created_by": pull.user.url,
        # Stored as PyGithub label objects; stringified later with the frame.
        "labels": pull.labels,
        "state": pull.state,
        "number_commits": pull.commits,
        "number_files_modified": pull.changed_files,
        "mergeable_status": pull.mergeable_state,
    }
    extract_pulls_list.append(pull_record)

    # One row per comment, linked back to its pull request via "pull_request_id".
    # NOTE(review): in PyGithub, get_comments() on a pull request appears to
    # return review (diff) comments only; conversation comments come from
    # get_issue_comments(). Confirm which set is wanted.
    for comment in pull.get_comments():
        comment_record = {
            "pull_request_id": pull.id,
            "id": comment.id,
            "created_by": comment.user,
            "created_at": comment.created_at,
            "body": comment.body,
        }
        extract_pulls_comments_list.append(comment_record)


In [6]:
# Build one frame per record list and stringify every value so the frames hold
# plain text only.
pull_df = pd.DataFrame(extract_pulls_list).astype(str)
pull_comments_df = pd.DataFrame(extract_pulls_comments_list).astype(str)

In [7]:
# Write the pull-request data to Excel workbooks under datasets/ using openpyxl.
pull_df.to_excel(excel_writer='datasets/pull_requests.xlsx', engine='openpyxl')
pull_comments_df.to_excel(excel_writer='datasets/pull_request_comments.xlsx', engine='openpyxl')

### Milestone Extraction

In [10]:
# Read the token from the environment so it never lands in the notebook source.
token = os.environ.get('GITHUB_TOKEN', '')
repo_owner = 'Rdatatable'
repo_name = 'data.table'

# The endpoint is paginated (30 items per page by default). Ask for the largest
# page size and follow the "next" links below so no milestone is dropped.
url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/milestones?state=all&per_page=100'

headers = {'Accept': 'application/vnd.github.v3+json'}
if token:
    # Only send credentials when a token is actually configured.
    headers['Authorization'] = f'token {token}'

milestones = []
next_url = url
while next_url:
    response = requests.get(next_url, headers=headers, timeout=30)
    # Fail loudly on HTTP errors (bad credentials, rate limit, ...) instead of
    # trying to parse an error payload as a list of milestones.
    response.raise_for_status()
    milestones.extend(response.json())
    # requests parses the Link header; there is no "next" entry on the last page.
    next_url = response.links.get('next', {}).get('url')

# One row per milestone; column names are kept exactly as exported before.
df = pd.DataFrame([
    {
        "MilestoneId": milestone['id'],
        "MilestoneTitle": milestone['title'],
        "OpennedAt": milestone['created_at'],
        "ClosedAt": milestone['closed_at'],
        "OpennedIssues": milestone['open_issues'],
        "ClosedIssues": milestone['closed_issues'],
        "State": milestone["state"]
    } for milestone in milestones
])

In [11]:
# Write the milestone table to an Excel workbook under datasets/ using openpyxl.
df.to_excel(excel_writer='datasets/milestones.xlsx', engine='openpyxl')