In [3]:
%pip install requests

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests

#https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1

url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

In [9]:
# response.status_code
# response.json()

In [3]:
import os
from dotenv import load_dotenv

load_dotenv()

GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [4]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=1_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [5]:
fetch_issues()

  0%|          | 0/10 [00:00<?, ?it/s]

Downloaded all the issues for datasets! Dataset stored at ./datasets-issues.jsonl


In [6]:
from datasets import load_dataset

issues_dataset = load_dataset("json", data_files="datasets-issues.jsonl", split="train")
issues_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'type', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'sub_issues_summary'],
    num_rows: 1000
})

In [7]:
sample = issues_dataset.shuffle(seed=666).select(range(3))

for url, pr in zip(sample['html_url'], sample['pull_request']):
    print(f"-> URL: {url}")
    print(f"-> Pull Request: {pr}\n")

-> URL: https://github.com/huggingface/datasets/issues/7679
-> Pull Request: None

-> URL: https://github.com/huggingface/datasets/pull/7710
-> Pull Request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/7710', 'html_url': 'https://github.com/huggingface/datasets/pull/7710', 'diff_url': 'https://github.com/huggingface/datasets/pull/7710.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/7710.patch', 'merged_at': datetime.datetime(2025, 7, 31, 10, 12, 52)}

-> URL: https://github.com/huggingface/datasets/pull/6934
-> Pull Request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/6934', 'html_url': 'https://github.com/huggingface/datasets/pull/6934', 'diff_url': 'https://github.com/huggingface/datasets/pull/6934.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/6934.patch', 'merged_at': datetime.datetime(2024, 5, 30, 10, 45, 37)}



In [8]:
issues_dataset = issues_dataset.map(
    lambda x: {'is_pull_request': False if x['pull_request'] is None else True}
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
# issues closed
closed_issues = issues_dataset.filter(
    lambda x: (x['state'] == 'closed') and (x['pull_request'] is None)
)

# PR closed
closed_prs = issues_dataset.filter(
    lambda x: (x['state'] == 'closed') and (x['pull_request'] is not None)
)

# to_pandas
closed_issues.set_format("pandas")
df_issues = closed_issues[:]
closed_prs.set_format("pandas")
df_prs = closed_prs[:]

# converts timestamps into datetime
df_issues['created_at'] = pd.to_datetime(df_issues['created_at'])
df_issues['closed_at'] = pd.to_datetime(df_issues['closed_at'])
df_prs['created_at'] = pd.to_datetime(df_prs['created_at'])
df_prs['closed_at'] = pd.to_datetime(df_prs['closed_at'])

# calculate the duration time of resolution
df_issues['resolution_time'] = (df_issues['closed_at'] - df_issues['created_at']).dt.total_seconds() / 3600 / 24
df_prs['resolution_time'] = (df_prs['closed_at'] - df_prs['created_at']).dt.total_seconds() / 3600 / 24

# time mean resolution (issue)
temps_moyen_issues = df_issues['resolution_time'].mean()
print(f"Temps moyen pour résoudre un problème (issue) : {temps_moyen_issues:.2f} jours")

# time mean close a PR
temps_moyen_prs = df_prs['resolution_time'].mean()
print(f"Temps moyen pour fermer une pull request : {temps_moyen_prs:.2f} jours")

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Temps moyen pour résoudre un problème (issue) : 18.57 jours
Temps moyen pour fermer une pull request : 8.96 jours


In [10]:
issues_dataset.reset_format()
# L’API REST GitHub fournit un point de terminaison Comments 
# qui renvoie tous les commentaires associés à un numéro de problème

issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/comments/897594128',
  'html_url': 'https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128',
  'issue_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',
  'id': 897594128,
  'node_id': 'IC_kwDODunzps41gDMQ',
  'user': {'login': 'bhavitvyamalik',
   'id': 19718818,
   'node_id': 'MDQ6VXNlcjE5NzE4ODE4',
   'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/bhavitvyamalik',
   'html_url': 'https://github.com/bhavitvyamalik',
   'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',
   'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',
   'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/

In [11]:
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    return [r["body"] for r in response.json()]


# Test our function works as expected
get_comments(2792)


["@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = 'gooaq'\r\n\r\n    def test_load_dataset(self, dataset_name):\r\n        configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n>       self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n    self.parent.assertTrue(len(dataset[split]) > 0)\r\nE   AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?",
 'Thanks for the help, @albertvillanova! All tests are passing now.']

In [12]:
issues_dataset_with_commments = issues_dataset.map(
    lambda x: {"comments": get_comments(x["number"])}
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
issues_dataset_with_commments.to_json("issues-datasets-with-comments.jsonl")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

6916113

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
issues_dataset_with_commments.push_to_hub("github-issues")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  38%|###8      |  602kB / 1.57MB            

CommitInfo(commit_url='https://huggingface.co/datasets/nathbns/github-issues/commit/39137d90052ceb7cc1dec0e2862b2f51deede075', commit_message='Upload dataset', commit_description='', oid='39137d90052ceb7cc1dec0e2862b2f51deede075', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/nathbns/github-issues', endpoint='https://huggingface.co', repo_type='dataset', repo_id='nathbns/github-issues'), pr_revision=None, pr_num=None)

In [19]:
remote_dataset = load_dataset("lewtun/github-issues", split="train")
remote_dataset

README.md: 0.00B [00:00, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


datasets-issues-with-comments.jsonl:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3019 [00:00<?, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})