In [27]:
import pandas as pd
import re
import os
from dotenv import load_dotenv
from github import Github
from tqdm import tqdm

In [None]:
# Load environment variables from the local secrets.env file
# (presumably where the API_KEY_1 token read later via os.getenv is defined — confirm).
load_dotenv(dotenv_path='secrets.env')

In [28]:
# Target GitHub organization and repository; both are also used to build
# the names of the exported Excel files.
org_name = "angular"
repo_name = "angular.js"

In [29]:
# Fail fast when the token is missing: os.getenv returns None in that case and
# the client would otherwise be created without credentials, which only shows
# up later as unexpected rate-limit errors in the middle of the extraction.
if not os.getenv('API_KEY_1'):
    raise RuntimeError(
        "Environment variable API_KEY_1 is not set; "
        "define it (e.g. in secrets.env) before creating the GitHub client."
    )

# Authenticated GitHub client, then the organization and repository handles
# used by the extraction cells.
g = Github(os.getenv('API_KEY_1'))
org = g.get_organization(org_name)
repo = org.get_repo(repo_name)

In [30]:
# Display the current API quota for this token
# (presumably a (remaining, limit) pair — confirm against the PyGithub docs).
g.rate_limiting

(4998, 5000)

### Issues Extraction

In [31]:
# Listing of every issue in the repository, regardless of state.
issues = repo.get_issues(state="all")

# Accumulators filled by the extraction loop:
# one dict per issue and one dict per issue comment.
extract_issues_list, extract_issues_comments_list = [], []

In [32]:
# Start from empty accumulators so that re-running this cell (e.g. after an
# interrupted run) does not append duplicate records.
extract_issues_list = []
extract_issues_comments_list = []

for issue in tqdm(issues, desc="Processing issues", unit="issue", total=issues.totalCount):
    # The listing also contains pull requests; keep real issues only.
    if issue.pull_request:
        continue

    closed_by = issue.closed_by
    extract_issues_list.append(
        {
            "id": issue.id,
            "title": issue.title,
            "created_at": issue.created_at,
            "closed_at": issue.closed_at,
            # Users are stored as their API URL everywhere (issues and
            # comments alike) instead of mixing URLs with raw PyGithub objects.
            "created_by": issue.user.url if issue.user else None,
            "closed_by": closed_by.url if closed_by else None,
            # Plain label names rather than Label objects, so the exported
            # cell reads as a list of names.
            "labels": [label.name for label in issue.labels],
        }
    )

    # issue.comments is the comment count already present on the issue;
    # skipping the listing call for zero-comment issues saves one API
    # request per such issue.
    if not issue.comments:
        continue

    for comment in issue.get_comments():
        extract_issues_comments_list.append({
            "issue_id": issue.id,
            "id": comment.id,
            "created_by": comment.user.url if comment.user else None,
            "created_at": comment.created_at,
            "body": comment.body,
        })

Processing issues:   6%|▌         | 1000/17032 [07:51<2:59:55,  1.49issue/s]

In [16]:
# One table per record list. Every column is cast to str, so missing values
# end up as their textual form (e.g. "None") rather than as nulls.
issues_df = pd.DataFrame(extract_issues_list).astype(str)
issues_comments_df = pd.DataFrame(extract_issues_comments_list).astype(str)

In [17]:
def clean_text_columns(df, pattern=r'[^\x20-\x7E]', replacement=''):
    """Return a copy of ``df`` with unwanted characters removed from its text columns.

    Only columns of dtype ``object`` or ``string`` are processed. Every
    non-missing cell in those columns is converted with ``str()`` and each
    match of ``pattern`` is replaced by ``replacement``. Missing scalar values
    (None, NaN, NaT, pd.NA) are left as they are.

    Note: the default pattern keeps printable ASCII only, so it also removes
    newlines, tabs and every non-ASCII character. Pass a narrower ``pattern``
    (for example only control characters) to preserve that text.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a cleaned copy is returned.
    pattern : str, optional
        Regular expression matching the characters to remove.
    replacement : str, optional
        Text substituted for each match (default: empty string).

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned text columns; other columns are unchanged.
    """
    # Compile once instead of once per cell.
    unwanted = re.compile(pattern)

    def _scrub(value):
        # Only scalars can be "missing". pd.isna on a list/array returns an
        # array whose truth value is ambiguous, so containers are always
        # stringified rather than tested for nullness.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return value
        return unwanted.sub(replacement, str(value))

    # Work on a copy so the caller's frame is never mutated as a side effect.
    cleaned = df.copy()
    for col in cleaned.select_dtypes(include=['object', 'string']).columns:
        cleaned[col] = cleaned[col].apply(_scrub)
    return cleaned

In [18]:
# Run the text cleaner over both frames; by default it keeps printable ASCII only.
issues_df = issues_df.pipe(clean_text_columns)
issues_comments_df = issues_comments_df.pipe(clean_text_columns)

In [25]:
# Make sure the output directory exists so the export also works on a fresh
# checkout, where Files/ has not been created yet.
output_dir = 'Files'
os.makedirs(output_dir, exist_ok=True)

# Shared filename prefix: Files/<org>_<repo>, lower-cased.
file_prefix = f'{output_dir}/{org_name.lower()}_{repo_name.lower()}'

# Write one workbook per table.
issues_df.to_excel(
    f'{file_prefix}_issues.xlsx',
    engine='openpyxl'
)

issues_comments_df.to_excel(
    f'{file_prefix}_issues_comments.xlsx',
    engine='openpyxl'
)