In [1]:
from github import Github, Auth
from dotenv import dotenv_values
from tqdm import tqdm
import pandas as pd

config = dotenv_values(".env")

g = Github(auth=Auth.Token(config['GITHUB_ACCESS_TOKEN']))

In [2]:
label_map = {
	"facebook/react": {
		"Type: Bug": "bug",
		"Type: Feature Request": "feature",
		"Type: Question": "question",
	},
	"tensorflow/tensorflow": {
		"type:bug": "bug",
		"type:feature": "feature",
		"type:support": "question",
	},
	"microsoft/vscode": {
		"bug": "bug",
		"feature-request": "feature",
		"*question": "question",
	},
	"bitcoin/bitcoin": {
		"Bug": "bug",
		"Feature": "feature",
		"Questions and Help": "question",
	},
	"opencv/opencv": {
		"bug": "bug",
		"feature": "feature",
		"question (invalid tracker)": "question",
	},
}

repo_full_names = list(label_map.keys())


In [3]:
def issues_by_repo(repo_full_name: str, n_per_label: int):
	repo_labels = set(label_map[repo_full_name].keys())
	repo = g.get_repo(repo_full_name)
	for label in repo_labels:
		i = 0
		for issue in repo.get_issues(state="closed", labels=[label], sort="created", direction="desc"):
			issue_labels = set([label.name for label in issue.labels])
			matching_labels = repo_labels.intersection(issue_labels)
			if len(matching_labels) != 1:
				continue
			if issue.title is None or issue.body is None:
				continue
			if issue.closed_at > pd.Timestamp(2023, 10, 1):
				continue
			if i == n_per_label:
				break
			standard_label = label_map[repo_full_name][matching_labels.pop()]
			i += 1
			yield [repo_full_name, issue.created_at, standard_label, issue.title, issue.body]

In [4]:
issues = [issue for repo in repo_full_names for issue in tqdm(issues_by_repo(repo, 200), desc=repo)]

issues = pd.DataFrame(data=issues, columns=["repo", "created_at", "label", "title", "body"])

issues

facebook/react: 0it [00:00, ?it/s]

facebook/react: 600it [00:13, 43.21it/s]
tensorflow/tensorflow: 600it [00:35, 16.93it/s]
microsoft/vscode: 600it [00:29, 20.18it/s]
bitcoin/bitcoin: 600it [00:20, 28.94it/s]
opencv/opencv: 600it [00:21, 27.30it/s]


Unnamed: 0,repo,created_at,label,title,body
0,facebook/react,2023-08-26 06:33:37,bug,"[DevTools Bug] Cannot add node ""1"" because a n...",### Website or app\n\nPrivate repo cannot give...
1,facebook/react,2023-08-02 02:26:00,bug,Bug: [18.3.0-canary] renderToString hoists som...,<!--\r\n Please provide a clear and concise d...
2,facebook/react,2023-07-28 05:16:12,bug,[DevTools Bug]: Devtools extension build faili...,### Website or app\n\nN/A\n\n### Repro steps\n...
3,facebook/react,2023-07-17 22:43:05,bug,[DevTools Bug]: Chrome extension gets disconne...,### Website or app\r\n\r\nhttps://react.dev/\r...
4,facebook/react,2023-07-13 21:58:31,bug,[DevTools Bug]: Deprecated __REACT_DEVTOOLS_GL...,### Website or app\n\nhttps://github.com/open-...
...,...,...,...,...,...
2995,opencv/opencv,2022-01-14 22:05:58,feature,Use modern OpenVINO package interface,"* new cmake options: `WITH_OPENVINO`, `OPENCV_..."
2996,opencv/opencv,2022-01-14 15:37:53,feature,Add general broadcasting layer,Performance details(broadcasting 1x1 to 16x204...
2997,opencv/opencv,2022-01-12 09:14:41,feature,TiffEncoder write support more depth type,**Merge with extra**: https://github.com/openc...
2998,opencv/opencv,2022-01-11 16:30:53,feature,Adapt remote inference to operate with NV12 blobs,### Pull Request Readiness Checklist\r\n\r\nSe...


In [9]:
issues.groupby(["repo", "label"])[["title"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,title
repo,label,Unnamed: 2_level_1
bitcoin/bitcoin,bug,200
bitcoin/bitcoin,feature,200
bitcoin/bitcoin,question,200
facebook/react,bug,200
facebook/react,feature,200
facebook/react,question,200
microsoft/vscode,bug,200
microsoft/vscode,feature,200
microsoft/vscode,question,200
opencv/opencv,bug,200


In [15]:
assert len(issues) == len(repo_full_names) * 3 * 200

issues_train = issues[::2]
issues_test = issues[1::2]

display(issues_train.groupby(["repo", "label"])[["title"]].count())
display(issues_test.groupby(["repo", "label"])[["title"]].count())

Unnamed: 0_level_0,Unnamed: 1_level_0,title
repo,label,Unnamed: 2_level_1
bitcoin/bitcoin,bug,100
bitcoin/bitcoin,feature,100
bitcoin/bitcoin,question,100
facebook/react,bug,100
facebook/react,feature,100
facebook/react,question,100
microsoft/vscode,bug,100
microsoft/vscode,feature,100
microsoft/vscode,question,100
opencv/opencv,bug,100


Unnamed: 0_level_0,Unnamed: 1_level_0,title
repo,label,Unnamed: 2_level_1
bitcoin/bitcoin,bug,100
bitcoin/bitcoin,feature,100
bitcoin/bitcoin,question,100
facebook/react,bug,100
facebook/react,feature,100
facebook/react,question,100
microsoft/vscode,bug,100
microsoft/vscode,feature,100
microsoft/vscode,question,100
opencv/opencv,bug,100


In [14]:
issues_train.to_csv("issues_train.csv", index=False, escapechar="\\")
issues_test.to_csv("issues_test.csv", index=False, escapechar="\\")