In [58]:
import os

# Prefer the GITHUB_TOKEN environment variable so a real token never has to be
# typed into (and saved with) the notebook. The '<>' placeholder is only used
# when the variable is unset or empty.
ACCESS_TOKEN = os.environ.get('GITHUB_TOKEN') or '<>'  # Replace with your token


In [61]:
import requests
import time
import json
import os
import re

# --- Configuration ---------------------------------------------------------

# Repository whose issues are scraped.
REPO_OWNER = "pytorch"
REPO_NAME = "pytorch"

# REST API root, and the page size sent as `per_page` on every paginated call
# (issues and comments alike).
GITHUB_API_URL = "https://api.github.com"
ISSUES_PER_PAGE = 30

# Local JSON files: the resume marker and the accumulated Q&A records.
CHECKPOINT_FILE = "checkpoint.json"
SAVE_FILE = "issues_data.json"

# Headers attached to every request. ACCESS_TOKEN is defined in an earlier cell.
HEADERS = {
    "Accept": "application/vnd.github+json",            # response media type
    "Authorization": f"Bearer {ACCESS_TOKEN}",          # token authentication
    "X-GitHub-Api-Version": "2022-11-28"                # pinned API version
}

# Ensure the data_scripts directory exists.
# exist_ok=True makes this one idempotent call: there is no gap between an
# existence check and the creation in which another process could create the
# directory and make makedirs() fail. (If the path exists but is a regular
# file, this now raises FileExistsError instead of silently doing nothing.)
os.makedirs('data_scripts', exist_ok=True)

# Function to check rate limits
def check_rate_limit():
    """Query the ``/rate_limit`` endpoint for the core REST quota.

    Returns:
        tuple: ``(remaining, reset_time)`` read from ``resources.core`` of the
        response. ``(None, None)`` is returned when the quota is unknown: the
        request failed or timed out, the status was not 200, or the payload has
        no ``resources.core`` section. Callers subtract ``time.time()`` from
        ``reset_time`` to decide how long to sleep.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would freeze the whole scrape inside a helper call.
    request_timeout = 30  # seconds
    try:
        response = requests.get(
            f"{GITHUB_API_URL}/rate_limit",
            headers=HEADERS,
            timeout=request_timeout,
        )
    except requests.exceptions.RequestException as err:
        # A failed quota probe is reported as "unknown" rather than raised, the
        # same result callers already handle for a non-200 response.
        print(f"Could not check the rate limit: {err}")
        return None, None

    if response.status_code != 200:
        return None, None

    rate_limit = response.json()
    if 'resources' in rate_limit and 'core' in rate_limit['resources']:
        core = rate_limit['resources']['core']
        return core['remaining'], core['reset']
    return None, None

# Function to fetch issues from GitHub
def fetch_issues(page):
    """Fetch one page of issues (open and closed) for the configured repository.

    Issues are requested oldest-first (``sort=created``, ``direction=asc``).
    GitHub's default is newest-first, under which issue numbers decrease along
    a page and page contents shift whenever a new issue is opened. The resume
    logic in this file skips issues whose number is ``<=`` the last processed
    one, which is only correct when numbers increase.

    Args:
        page: 1-based page index to request.

    Returns:
        The decoded JSON body of the response (a list of issue objects).

    Raises:
        requests.exceptions.HTTPError: If the response status is an error.
    """
    # NOTE(review): per GitHub's documentation this endpoint also returns pull
    # requests (items carrying a "pull_request" key) -- confirm whether those
    # should be filtered out by the caller.
    url = f"{GITHUB_API_URL}/repos/{REPO_OWNER}/{REPO_NAME}/issues"
    params = {
        "state": "all",
        "sort": "created",
        "direction": "asc",
        "per_page": ISSUES_PER_PAGE,
        "page": page
    }
    response = requests.get(url, headers=HEADERS, params=params)
    response.raise_for_status()
    return response.json()

# Function to fetch comments for a specific issue with error handling
def fetch_comments(issue_number, page=1, max_retries=5):
    """Collect the comments of one issue, following pagination.

    Args:
        issue_number: Number of the issue whose comments are requested.
        page: First comment page to request (defaults to 1).
        max_retries: How many consecutive times a 403 that is *not* explained
            by an exhausted core quota is retried before giving up.

    Returns:
        list: The comment objects gathered. The list may be partial: on 401, on
        any other non-403 HTTP error, or once ``max_retries`` is used up, the
        error is printed and whatever was collected so far is returned.

    Only ``requests.exceptions.HTTPError`` is handled here; other request
    failures (e.g. connection errors) propagate to the caller.
    """
    url = f"{GITHUB_API_URL}/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}/comments"
    comments = []
    # GitHub caps per_page at 100, so a "full" page never exceeds that size.
    page_size = min(ISSUES_PER_PAGE, 100)
    forbidden_retries = 0
    while True:
        params = {"per_page": ISSUES_PER_PAGE, "page": page}
        try:
            response = requests.get(url, headers=HEADERS, params=params)
            response.raise_for_status()
        except requests.exceptions.HTTPError as err:
            if response.status_code == 403:
                remaining, reset_time = check_rate_limit()
                if remaining == 0:
                    # Clamp at zero: time.sleep() rejects negative durations,
                    # which can occur when the reset time has already passed.
                    sleep_time = max(0, reset_time - time.time() + 10)
                    print(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.")
                    time.sleep(sleep_time)
                    continue
                if forbidden_retries >= max_retries:
                    # A 403 that never clears (e.g. a truly forbidden resource)
                    # must not spin forever; give up like any other error.
                    print(f"An error occurred: {err}")
                    break
                forbidden_retries += 1
                print(f"HTTPError 403 Forbidden: Retrying after delay...")
                time.sleep(60)  # wait before retrying
                continue
            elif response.status_code == 401:
                print(f"HTTPError 401 Unauthorized: Please check your access token.")
                break
            else:
                print(f"An error occurred: {err}")
                break
        forbidden_retries = 0  # the budget applies to consecutive failures only
        page_comments = response.json()
        comments.extend(page_comments)
        # A page shorter than the requested size (including an empty one) is the
        # last page. Stopping here avoids one extra request per issue just to
        # receive an empty list, which matters under the hourly quota.
        if len(page_comments) < page_size:
            break
        page += 1
    return comments

# Function to identify useful comments
def is_useful_comment(comment):
    """Tell whether a comment's text mentions a fix, workaround or code change.

    Args:
        comment: Mapping for one comment; only its ``'body'`` entry is read.

    Returns:
        bool: True when the body contains any of the keyword patterns below as
        a whole word (case-insensitive). A missing, ``None`` or empty body is
        treated as not useful instead of raising.
    """
    # `or ''` covers both an absent key and an explicit None, either of which
    # would otherwise make re.search() raise.
    body = comment.get('body') or ''
    useful_patterns = [
        r'\bsolution\b', r'\bworkaround\b', r'\bresolved\b',
        r'\bfixed\b', r'\bimplemented\b', r'\btemporary fix\b',
        r'\bpatch\b', r'\bPR\b', r'\bpull request\b', r'\bcommit\b'
    ]
    return any(
        re.search(pattern, body, re.IGNORECASE) for pattern in useful_patterns
    )

# Function to format issue data in Q&A style
def format_issue_qa(issue):
    """Turn one issue into a question/answers record.

    The issue's comments are fetched with ``fetch_comments`` and only those
    accepted by ``is_useful_comment`` are kept as answers.

    Args:
        issue: Issue object as returned by the issues endpoint. The keys
            ``number``, ``title``, ``body``, ``user``, ``created_at`` and
            ``state`` are read.

    Returns:
        dict: ``{"issue_number", "question": {...}, "answers": [...]}``. When an
        issue or comment has no author (``user`` is null), its ``login`` and
        ``html_url`` are recorded as ``None`` instead of raising.
    """
    question = {
        "issue_number": issue["number"],
        "question": {
            "title": issue["title"],
            "body": issue["body"],
            "user": _user_summary(issue["user"]),
            "created_at": issue["created_at"],
            "state": issue["state"]
        },
        "answers": []
    }
    for comment in fetch_comments(issue['number']):
        if is_useful_comment(comment):
            question["answers"].append({
                "body": comment["body"],
                "user": _user_summary(comment["user"]),
                "created_at": comment["created_at"]
            })
    return question


def _user_summary(user):
    """Reduce a user object to its login and profile URL, tolerating a null user."""
    if not user:
        return {"login": None, "html_url": None}
    return {"login": user["login"], "html_url": user["html_url"]}

# Function to save data incrementally
def save_data(data):
    """Append ``data`` to the JSON list stored in ``SAVE_FILE``.

    The existing list (if the file exists) is loaded, extended with ``data``
    and written back. The new content is first written to a sibling temporary
    file and then moved over ``SAVE_FILE`` with ``os.replace``, so a failure or
    interruption while dumping leaves the previously saved data intact rather
    than a truncated file.

    Args:
        data: List of JSON-serialisable records to append.
    """
    if os.path.exists(SAVE_FILE):
        with open(SAVE_FILE, 'r') as file:
            existing_data = json.load(file)
        existing_data.extend(data)
    else:
        existing_data = data

    tmp_path = f"{SAVE_FILE}.tmp"
    try:
        with open(tmp_path, 'w') as file:
            json.dump(existing_data, file, indent=4)
        os.replace(tmp_path, SAVE_FILE)
    finally:
        # After a successful replace the temp file is gone; this only cleans up
        # the partial file left behind by a failed dump.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Function to save checkpoint
def save_checkpoint(page, issue_number):
    """Record the resume position in ``CHECKPOINT_FILE``.

    The checkpoint is written to a sibling temporary file and then moved into
    place with ``os.replace``, so an interruption during the write cannot leave
    a half-written checkpoint that would fail to parse on the next start.

    Args:
        page: Page index to store under ``"page"``.
        issue_number: Issue number to store under ``"issue_number"``.
    """
    checkpoint = {
        "page": page,
        "issue_number": issue_number
    }
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    try:
        with open(tmp_path, 'w') as file:
            json.dump(checkpoint, file)
        os.replace(tmp_path, CHECKPOINT_FILE)
    finally:
        # Only present if the dump failed before the replace happened.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Function to load checkpoint
def load_checkpoint():
    """Read the resume position written by ``save_checkpoint``.

    Returns:
        tuple: ``(page, issue_number)`` from ``CHECKPOINT_FILE``, or
        ``(1, None)`` when that file does not exist.
    """
    # No checkpoint yet: start from the first page with nothing to skip.
    if not os.path.exists(CHECKPOINT_FILE):
        return 1, None

    with open(CHECKPOINT_FILE, 'r') as handle:
        state = json.load(handle)
    return state["page"], state["issue_number"]

# Function to load the last processed issue number from existing data
def load_last_issue_number():
    """Return the ``issue_number`` of the last record stored in ``SAVE_FILE``.

    Returns:
        The ``'issue_number'`` of the final element of the saved list, or
        ``None`` when the file is missing or the list is empty.
    """
    if not os.path.exists(SAVE_FILE):
        return None

    with open(SAVE_FILE, 'r') as handle:
        records = json.load(handle)

    if not records:
        return None
    return records[-1]['issue_number']

# Main function to fetch and process issues
def main():
    """Scrape issues page by page, saving records and a resume checkpoint.

    Starts from the page stored in the checkpoint (or page 1). If the checkpoint
    holds no issue number, the number of the last saved record is used instead.
    On the first page processed, issues whose number is ``<=`` that value are
    skipped; the value is cleared once a page has been completed.

    For every page the formatted issues are collected in a per-page buffer,
    appended to the data file, and only then is the checkpoint advanced to the
    last issue that was actually saved. An interruption in the middle of a page
    therefore re-processes that page on the next run instead of skipping
    issues that never reached the data file.

    The loop ends when a page comes back empty, when fetching a page raises an
    HTTP error, or when formatting an issue raises an HTTP error.
    """
    # NOTE(review): the `<=` skip resumes correctly only if fetch_issues yields
    # issues in ascending number order -- confirm the ordering it requests.
    page, last_issue_number = load_checkpoint()
    if last_issue_number is None:
        last_issue_number = load_last_issue_number()

    while True:
        remaining, reset_time = check_rate_limit()
        if remaining is not None and reset_time is not None:
            if remaining < 10:
                # Clamp at zero: time.sleep() rejects negative durations.
                sleep_time = max(0, reset_time - time.time() + 10)
                print(f"Rate limit exceeded. Sleeping for {sleep_time} seconds.")
                time.sleep(sleep_time)

        try:
            issues = fetch_issues(page)
        except requests.exceptions.HTTPError as err:
            print(f"An error occurred while fetching issues: {err}")
            save_checkpoint(page, last_issue_number)
            break

        if not issues:
            break

        # Fresh buffer per page: save_data() appends, so re-sending earlier
        # pages would duplicate them in the data file.
        page_issues = []
        last_saved_number = None
        failed = False
        for issue in issues:
            if last_issue_number and issue["number"] <= last_issue_number:
                continue
            try:
                page_issues.append(format_issue_qa(issue))
                last_saved_number = issue["number"]
            except requests.exceptions.HTTPError as err:
                print(f"An error occurred while fetching comments for issue {issue['number']}: {err}")
                failed = True
                break

        if page_issues:
            # Data first, checkpoint second: the checkpoint must never point
            # past what is already on disk.
            save_data(page_issues)
            save_checkpoint(page, last_saved_number)

        if failed:
            # Stop here so the failed issue and the rest of its page are retried
            # on the next run instead of being skipped.
            break

        page += 1
        last_issue_number = None  # Reset last_issue_number after processing a full page
        time.sleep(1)  # To avoid secondary rate limits

# Entry point: start the scrape only when this code runs as the main module
# (e.g. executed as a notebook cell or a script), not when it is imported.
if __name__ == "__main__":
    main()

Rate limit exceeded. Sleeping for 587.3035609722137 seconds.
Rate limit exceeded. Sleeping for 507.8897547721863 seconds.


In [None]:
# TODO: add logging
# TODO: turn this script into a class
# TODO: add hf code here
# TODO: add docs


<Response [401]>