## Fetch the first GitHub issue from the Hugging Face datasets repository

In [8]:
import requests

# Ask for exactly one issue (page 1, one item per page) from the public issues endpoint.
url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Surface HTTP failures (e.g. a rate-limit response) as the HTTP error itself; otherwise
# the error payload is a dict and indexing it with [0] below fails with an opaque KeyError.
response.raise_for_status()
issue = response.json()[0]
issue

{'url': 'https://api.github.com/repos/huggingface/datasets/issues/7418',
 'repository_url': 'https://api.github.com/repos/huggingface/datasets',
 'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/7418/labels{/name}',
 'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/7418/comments',
 'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/7418/events',
 'html_url': 'https://github.com/huggingface/datasets/issues/7418',
 'id': 2868701471,
 'node_id': 'I_kwDODunzps6q_Okf',
 'number': 7418,
 'title': 'pyarrow.lib.arrowinvalid: cannot mix list and non-list, non-null values with map function',
 'user': {'login': 'alexxchen',
  'id': 15705569,
  'node_id': 'MDQ6VXNlcjE1NzA1NTY5',
  'avatar_url': 'https://avatars.githubusercontent.com/u/15705569?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/alexxchen',
  'html_url': 'https://github.com/alexxchen',
  'followers_url': 'https://api.github.com/users/alexxchen/follow

## Display the full JSON response for debugging

In [194]:
# Debugging aid: displays whatever `response` currently holds in the kernel.
# NOTE(review): the saved output is the comments payload for issue 2792 and the
# execution count is 194, so this cell was last run after the comments request made
# further down; on a top-to-bottom run it would show the issues response instead.
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/comments/897594128',
  'html_url': 'https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128',
  'issue_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',
  'id': 897594128,
  'node_id': 'IC_kwDODunzps41gDMQ',
  'user': {'login': 'bhavitvyamalik',
   'id': 19718818,
   'node_id': 'MDQ6VXNlcjE5NzE4ODE4',
   'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/bhavitvyamalik',
   'html_url': 'https://github.com/bhavitvyamalik',
   'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',
   'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',
   'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/

## Authenticate with GitHub API and print user details if successful

In [9]:
import dotenv
import os 

# Load environment variables through python-dotenv so the token is never hard-coded here.
dotenv.load_dotenv()

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    # os.getenv returns None for a missing variable, which would later be formatted
    # into the Authorization header as the literal text "token None". Warn right away
    # (without aborting the notebook) so the cause is obvious.
    print("Warning: GITHUB_TOKEN is not set; authenticated GitHub requests are expected to fail.")

In [11]:
# Token-based Authorization header; the functions defined later in this notebook
# read this module-level `headers` for their own requests.
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

response = requests.get("https://api.github.com/user", headers=headers)

# Anything other than HTTP 200 is reported with its status code and raw body;
# on 200 the decoded profile payload is printed.
if response.status_code != 200:
    print("Failed:", response.status_code, response.text)
else:
    print(response.json())

{'login': 'mohamedkaram400', 'id': 194250759, 'node_id': 'U_kgDOC5QIBw', 'avatar_url': 'https://avatars.githubusercontent.com/u/194250759?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/mohamedkaram400', 'html_url': 'https://github.com/mohamedkaram400', 'followers_url': 'https://api.github.com/users/mohamedkaram400/followers', 'following_url': 'https://api.github.com/users/mohamedkaram400/following{/other_user}', 'gists_url': 'https://api.github.com/users/mohamedkaram400/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/mohamedkaram400/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/mohamedkaram400/subscriptions', 'organizations_url': 'https://api.github.com/users/mohamedkaram400/orgs', 'repos_url': 'https://api.github.com/users/mohamedkaram400/repos', 'events_url': 'https://api.github.com/users/mohamedkaram400/events{/privacy}', 'received_events_url': 'https://api.github.com/users/mohamedkaram400/received_events', 'type': 'User', 

## Function to fetch GitHub issues from a repository and store them in a JSON file

In [183]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm

def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    """Download a repository's issues from the GitHub REST API into a JSON Lines file.

    Pages through ``/repos/{owner}/{repo}/issues`` with ``state=all`` and 100 items
    per page, then writes every collected record to
    ``{issues_path}/{repo}-issues.jsonl`` (one JSON object per line).

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name; also used as the prefix of the output file name.
        num_issues: Upper bound used to compute how many pages are requested
            (``ceil(num_issues / 100)``). Fetching stops earlier once a page comes
            back empty.
        rate_limit: Once more than this many issues are buffered, they are flushed
            and the function sleeps for an hour before continuing.
        issues_path: Output directory (``Path`` or ``str``); created, including
            missing parents, if it does not exist.

    Returns:
        None. The result is the file written to disk plus progress messages on stdout.

    Notes:
        Uses the module-level ``requests``, ``headers``, ``tqdm`` and ``pd`` names.
        A non-200 response stops the download; whatever was fetched so far is still
        written out.
    """
    issues_path = Path(issues_path)
    issues_path.mkdir(parents=True, exist_ok=True)
    output_file = f"{issues_path}/{repo}-issues.jsonl"

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    # GitHub's `page` parameter is 1-based (its default is 1). Starting the range at 0
    # requests a page before the first one, which appears to return the first page
    # again and therefore duplicates its issues.
    for page in tqdm(range(1, num_pages + 1)):
        query = f"issues?page={page}&per_page={per_page}&state=all"
        response = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)

        if response.status_code != 200:
            # An error payload is a dict; extending the batch with it would append its
            # keys as bogus "issues", so stop and keep what has been collected.
            print(f"Request for page {page} failed: {response.status_code} {response.text}")
            break

        page_issues = response.json()
        if not page_issues:
            # An empty page means we ran past the last issue; no need to keep asking.
            break
        batch.extend(page_issues)

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print("Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(output_file, orient="records", lines=True)
    print(f"Downloaded all the issues for {repo}! Dataset stored at {output_file}")

## Run the function to fetch and save GitHub issues

In [184]:
# Kept commented out, presumably so that re-running the notebook does not trigger
# the full download again (confirm). Uncomment to (re)fetch the issues.
# fetch_issues()

## Load GitHub issues dataset and print the 'closed_at' column

In [None]:

# Read the JSON Lines dump (one issue per line) into a DataFrame.
# NOTE(review): fetch_issues() writes "<repo>-issues.jsonl", i.e.
# "datasets-issues.jsonl" with its defaults, so "github-issues.jsonl" presumably
# comes from a manual rename or a separate download; confirm before a fresh run.
df = pd.read_json("github-issues.jsonl", lines=True)
print(df['closed_at'])

0                            NaT
1      2025-02-20 14:12:23+00:00
2                            NaT
3                            NaT
4      2025-02-19 13:40:32+00:00
                  ...           
7343   2020-04-29 09:23:05+00:00
7344   2020-05-04 06:11:57+00:00
7345   2020-05-04 06:12:27+00:00
7346   2020-05-11 18:55:22+00:00
7347   2020-04-14 12:01:40+00:00
Name: closed_at, Length: 7348, dtype: datetime64[ns, UTC]


## Display the column names of the dataset

In [201]:
# Column labels of the DataFrame loaded from the JSON Lines file.
df.columns

Index(['url', 'repository_url', 'labels_url', 'comments_url', 'events_url',
       'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels',
       'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments',
       'created_at', 'updated_at', 'closed_at', 'author_association',
       'sub_issues_summary', 'active_lock_reason', 'body', 'closed_by',
       'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason',
       'draft', 'pull_request'],
      dtype='object')

## Load the GitHub issues dataset file using the Hugging Face datasets library

In [None]:
from datasets import load_dataset

# Load the same JSON Lines file with the `datasets` "json" loader, selecting the "train" split.
# NOTE(review): the saved output (3019 rows, already listing `is_pull_request`) does not
# match the 7348-row frame printed by pandas for the same file name; it looks stale, so
# re-run this cell to refresh it.
issues_dataset = load_dataset("json", data_files="github-issues.jsonl", split="train")
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

## Print out the URL and pull request entries

In [None]:
# Draw three rows with a fixed shuffle seed and show each row's web URL next to its
# `pull_request` field (None in the output for a plain issue).
sample = issues_dataset.shuffle(seed=666).select(range(3))

for row in sample:
    url, pr = row["html_url"], row["pull_request"]
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

>> URL: https://github.com/huggingface/datasets/pull/850
>> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/850', 'html_url': 'https://github.com/huggingface/datasets/pull/850', 'diff_url': 'https://github.com/huggingface/datasets/pull/850.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/850.patch'}

>> URL: https://github.com/huggingface/datasets/issues/2773
>> Pull request: None

>> URL: https://github.com/huggingface/datasets/pull/783
>> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/783', 'html_url': 'https://github.com/huggingface/datasets/pull/783', 'diff_url': 'https://github.com/huggingface/datasets/pull/783.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/783.patch'}



## Add a column to distinguish between issues and pull requests

In [None]:
def _has_pull_request(example):
    """Return the `is_pull_request` flag: True when `pull_request` is populated."""
    return {"is_pull_request": example["pull_request"] is not None}


issues_dataset = issues_dataset.map(_has_pull_request)

Map:   0%|          | 0/3019 [00:00<?, ? examples/s]

## Fetch comments for a specific issue and display the response

In [190]:
# Request the comment thread of a single item through the issues endpoint.
# Number 2792 is a pull request (the `html_url` values in the output point at
# /pull/2792), and its comments are still served under /issues/2792/comments.
issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/comments/897594128',
  'html_url': 'https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128',
  'issue_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',
  'id': 897594128,
  'node_id': 'IC_kwDODunzps41gDMQ',
  'user': {'login': 'bhavitvyamalik',
   'id': 19718818,
   'node_id': 'MDQ6VXNlcjE5NzE4ODE4',
   'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/bhavitvyamalik',
   'html_url': 'https://github.com/bhavitvyamalik',
   'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',
   'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',
   'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/

## Function to get comments for a given GitHub issue

In [204]:
import requests

def get_comments(issue_number, owner="huggingface", repo="datasets"):
    """Return the bodies of all comments on a GitHub issue or pull request.

    Requests ``/repos/{owner}/{repo}/issues/{issue_number}/comments`` page by page
    (100 comments per page) until a page shorter than the page size is received, so
    threads longer than a single page are returned in full.

    Args:
        issue_number: Number of the issue or pull request.
        owner: Account or organisation that owns the repository.
        repo: Repository name.

    Returns:
        A list with one entry per comment: its ``body`` text, or the placeholder
        ``"No comment body"`` when a record has no ``body`` key. An empty list is
        returned if any request fails or the payload is not a list; the problem is
        printed rather than raised.

    Notes:
        Uses the module-level ``requests`` and ``headers`` names.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/comments"
    per_page = 100  # largest page size, to keep the number of requests low
    bodies = []
    page = 1

    while True:
        response = requests.get(
            url, headers=headers, params={"per_page": per_page, "page": page}
        )

        # Check if the response is valid
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            try:
                details = response.json()
            except ValueError:
                # Error bodies are not guaranteed to be JSON (e.g. a gateway error
                # page); fall back to the raw text instead of raising here.
                details = response.text
            print("Response JSON:", details)  # Print for debugging
            return []

        data = response.json()

        # Ensure data is a list before iterating
        if not isinstance(data, list):
            print("Unexpected response format:", data)
            return []

        bodies.extend(r.get("body", "No comment body") for r in data)

        # A short (or empty) page is the last one.
        if len(data) < per_page:
            return bodies
        page += 1

# Smoke test on number 2792, the same item whose comments were requested directly
# in the previous section, so the two outputs can be compared.
get_comments(2792)


["@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = 'gooaq'\r\n\r\n    def test_load_dataset(self, dataset_name):\r\n        configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n>       self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n    self.parent.assertTrue(len(dataset[split]) > 0)\r\nE   AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?",
 'Thanks for the help, @albertvillanova! All tests are passing now.']

## Add a new column with comments for each issue

In [205]:
# Calls get_comments once per row, i.e. one GitHub API request per issue.
# Rows whose request fails receive an empty list: get_comments prints the error and
# returns [] (see the 404 messages in the output).
# NOTE(review): the dataset already has a "comments" column, so the returned key
# presumably replaces it with the list of comment bodies; confirm against the
# `Dataset.map` documentation.
issues_with_comments_dataset = issues_dataset.map(
    lambda x: {"comments": get_comments(int(x["number"]))}
)

Map:   0%|          | 0/3019 [00:00<?, ? examples/s]

Error: Received status code 404
Response JSON: {'message': 'Not Found', 'documentation_url': 'https://docs.github.com/rest/issues/comments#list-issue-comments', 'status': '404'}
Error: Received status code 404
Response JSON: {'message': 'Not Found', 'documentation_url': 'https://docs.github.com/rest/issues/comments#list-issue-comments', 'status': '404'}
Error: Received status code 404
Response JSON: {'message': 'Not Found', 'documentation_url': 'https://docs.github.com/rest/issues/comments#list-issue-comments', 'status': '404'}
Error: Received status code 404
Response JSON: {'message': 'Not Found', 'documentation_url': 'https://docs.github.com/rest/issues/comments#list-issue-comments', 'status': '404'}
Error: Received status code 404
Response JSON: {'message': 'Not Found', 'documentation_url': 'https://docs.github.com/rest/issues/comments#list-issue-comments', 'status': '404'}
Error: Received status code 404
Response JSON: {'message': 'Not Found', 'documentation_url': 'https://docs.git

## Log in to the Hugging Face Hub (required before pushing the dataset)

In [210]:
from huggingface_hub import notebook_login

# Interactive login: renders a widget in the notebook (see the VBox output) where the
# Hugging Face access token is entered.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Push the dataset with comments to the Hugging Face Hub

In [212]:
# Upload the dataset to a Hub repository named "github-issues"; the CommitInfo in the
# output shows it resolved to the repo id `mohamedkaram88/github-issues`.
issues_with_comments_dataset.push_to_hub("github-issues")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mohamedkaram88/github-issues/commit/e2342ecf5c3739b5a7f26e9156d05e22cba16b6d', commit_message='Upload dataset', commit_description='', oid='e2342ecf5c3739b5a7f26e9156d05e22cba16b6d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mohamedkaram88/github-issues', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mohamedkaram88/github-issues'), pr_revision=None, pr_num=None)

## Load the dataset from the Hugging Face Hub

In [213]:
# Round-trip check: load the "train" split back from the Hub by its full repo id and
# display its summary.
remote_dataset = load_dataset("mohamedkaram88/github-issues", split="train")
remote_dataset

README.md:   0%|          | 0.00/5.49k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.03M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3019 [00:00<?, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})