In [24]:
import pandas as pd
import re
import os
from dotenv import load_dotenv
from github import Github
from tqdm import tqdm

In [None]:
# Load environment variables (including the GitHub API key read later) from the local secrets file.
load_dotenv('secrets.env')

In [25]:
# Target repository: GitHub organization and repository name.
org_name, repo_name = "angular", "angular.js"

In [26]:
# Read the token up front: os.getenv returns None when the variable is missing,
# and Github(None) would silently build an anonymous client with a far lower
# rate limit instead of failing.
api_key = os.getenv('API_KEY_2')
if not api_key:
    raise RuntimeError(
        "API_KEY_2 is not set; define it in secrets.env or in the environment "
        "before creating the GitHub client."
    )

# per_page=100 is the largest page size the GitHub API accepts, so paginated
# listings need fewer requests than with PyGithub's default of 30.
g = Github(api_key, per_page=100)
org = g.get_organization(org_name)
repo = org.get_repo(repo_name)

In [27]:
# Show the token's current API quota; per the PyGithub docs the tuple is (requests remaining, request limit).
g.rate_limiting

(4998, 5000)

### Pull Request Extraction

In [28]:
# Listing of the repository's pull requests, requested with state "all".
pulls = repo.get_pulls(state="all")

# Accumulators filled by the extraction loop: one dict per pull request
# and one dict per pull-request comment.
extract_pulls_list, extract_pulls_comments_list = [], []

In [29]:
# Start from empty accumulators on every run. The iteration below always begins
# at the first pull request, so re-executing this cell (e.g. after an
# interruption) would otherwise append duplicate rows to lists created in an
# earlier cell.
extract_pulls_list = []
extract_pulls_comments_list = []

# NOTE(review): commits / changed_files / mergeable_state appear to require an
# extra API request per pull request, and get_comments() adds at least one more,
# so a full run can exceed the hourly quota -- confirm and plan for it.
for pull in tqdm(pulls, desc="Processing Pull Requests", unit="PR", total=pulls.totalCount):
    pull_author = pull.user
    extract_pulls_list.append(
        {
            "id": pull.id,
            "title": pull.title,
            "created_at": pull.created_at,
            "closed_at": pull.closed_at,
            # The author can be missing; keep None rather than crash on `.url`.
            "created_by": pull_author.url if pull_author is not None else None,
            # Store plain label names instead of PyGithub Label objects, whose
            # repr would otherwise end up in the exported file.
            "labels": [label.name for label in pull.labels],
            "state": pull.state,
            "number_commits": pull.commits,
            "number_files_modified": pull.changed_files,
            "mergeable_status": pull.mergeable_state,
        }
    )

    # NOTE(review): in PyGithub, get_comments() returns review (diff) comments;
    # conversation comments come from get_issue_comments() -- confirm which
    # set is wanted here.
    for comment in pull.get_comments():
        comment_author = comment.user
        extract_pulls_comments_list.append(
            {
                "pull_request_id": pull.id,
                "id": comment.id,
                # Same representation as the pull request's "created_by" (API
                # URL of the user) rather than a NamedUser object.
                "created_by": comment_author.url if comment_author is not None else None,
                "created_at": comment.created_at,
                "body": comment.body,
            }
        )


Processing Pull Requests:  11%|█         | 842/7995 [16:31<2:34:23,  1.30s/PR]

In [17]:
# Build one DataFrame per record list and cast every column to str in the same step.
pull_df = pd.DataFrame(extract_pulls_list).astype(str)
pull_comments_df = pd.DataFrame(extract_pulls_comments_list).astype(str)

In [18]:
def clean_text_columns(df, pattern=r'[^\x20-\x7E]'):
    """Return a copy of ``df`` with unwanted characters removed from text columns.

    Every column of dtype ``object`` or ``string`` is processed; each non-null
    value is converted with ``str()`` and all matches of ``pattern`` are
    deleted. Null values and columns of other dtypes are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; the cleaning is done on a copy.
    pattern : str, optional
        Regular expression whose matches are removed. The default keeps only
        printable ASCII (0x20-0x7E); note that this also drops newlines, tabs
        and every non-ASCII character.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned text columns.
    """
    # Compile once instead of re-parsing the pattern for every cell.
    unwanted = re.compile(pattern)

    def _strip(value):
        # Leave missing values untouched so they stay recognisable as nulls.
        return unwanted.sub('', str(value)) if pd.notnull(value) else value

    # Work on a copy so the caller's frame is not mutated as a side effect.
    cleaned = df.copy()
    for col in cleaned.select_dtypes(include=['object', 'string']):
        cleaned[col] = cleaned[col].apply(_strip)
    return cleaned

In [19]:
# Run both frames through the same text clean-up and rebind each name to its result.
pull_df, pull_comments_df = (
    clean_text_columns(frame) for frame in (pull_df, pull_comments_df)
)

In [23]:
# Make sure the output folder exists first: to_excel does not create missing
# directories and would fail on a fresh checkout.
output_dir = 'Files'
os.makedirs(output_dir, exist_ok=True)

# Shared "<org>_<repo>" prefix for both workbooks, lower-cased.
file_prefix = f'{output_dir}/{org_name.lower()}_{repo_name.lower()}'

# One workbook for the pull requests, one for their comments.
pull_df.to_excel(
    f'{file_prefix}_pull_requests.xlsx',
    engine='openpyxl'
)

pull_comments_df.to_excel(
    f'{file_prefix}_pull_request_comments.xlsx',
    engine='openpyxl'
)