# Dataset

In [1]:
from google.cloud import bigquery
import google.auth
import csv
import time
from tqdm.notebook import tqdm
import sys
import sklearn.model_selection
import pandas as pd
import hashlib
import random

# Raise the csv module's per-field size limit; the default of 131072 is too small here
# (_csv.Error: field larger than field limit (131072)).
# sys.maxsize does not fit into a C long on every platform (e.g. 64-bit Windows), in
# which case csv.field_size_limit raises OverflowError -- halve until it is accepted.
field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(field_size_limit)
        break
    except OverflowError:
        field_size_limit //= 2

# Output paths: the full dataset and the train / evaluation files derived from it.
dataset_file = 'nlbse23-issue-classification.csv'
train_file = 'nlbse23-issue-classification-train.csv'
eval_file = 'nlbse23-issue-classification-eval.csv'


In [2]:

# BigQuery client acting on behalf of the "nlbse-issue-classification" project.
# No credentials are passed explicitly, so authentication is presumably resolved through
# Application Default Credentials -- see:
# https://cloud.google.com/docs/authentication/application-default-credentials
client = bigquery.Client(project="nlbse-issue-classification")

# SQL that builds the dataset from GitHub Archive events. Step by step:
#   label_synonyms      - maps raw label spellings to one of four canonical labels
#                         (bug, feature, question, documentation).
#   close_events        - payloads of IssuesEvent rows with action 'closed' from the
#                         `githubarchive.day.2022*` tables in the selected _TABLE_SUFFIX
#                         range (swap the commented-out lines for a larger range).
#   nested_labels       - lower-cased label names of each issue, as an array.
#   synonymized_labels  - the distinct canonical labels matched by those names.
#   filtered_labels     - keeps only issues that map to exactly one canonical label.
#   concatenated_labels - turns the one-element label array into a plain string.
# The final SELECT extracts id, labels, title, body and author_association.
#
# NOTE(review): the body filter compares against the string 'null'. If
#   JSON_EXTRACT_SCALAR yields SQL NULL for a null/missing body, such rows are dropped
#   only because `NULL != 'null'` is not true; `IS NOT NULL` would state the intent
#   directly. Confirm before changing, since it may alter the resulting dataset.
# NOTE(review): rows are close *events*, so an issue closed more than once within the
#   range would appear more than once -- confirm whether de-duplication by id is needed.
# NOTE(review): there is no ORDER BY, so the row order of the result is not fixed by
#   the query itself.
query = """
WITH
  -- label synonyms from Izadi et al.
  label_synonyms AS (
    SELECT 'bug' AS label, synonym FROM UNNEST(['bug', 'type: bug', 'kind/bug', 'crash', 'defect', 'type-defect', 'type:bug', 'browser bug', 'fix', 'fixed', 'bugfix', 'bug fix', 'resolution: fixed', 'troubleshooting', 'type/bug', 'bug report']) AS synonym
    UNION ALL
    SELECT 'feature', * FROM UNNEST(['feature', 'feature request', 'enhancement', 'improvement', 'type: feature', 'type:feature', 'new feature', 'kind/feature', 'kind/enhancement'])
    UNION ALL
    SELECT 'question', * FROM UNNEST(['question', 'faq', 'type: question', 'type:question'])
    UNION ALL
    SELECT 'documentation', * FROM UNNEST(['documentation', 'docs', 'doc', 'type: documentation', 'needs documentation', 'area/documentation', 'type: docs', 'type:docs', 'needs docs', 'wiki', 'kind/documentation', 'kind/docs'])
  ),
  close_events AS (
    SELECT payload
    FROM `githubarchive.day.2022*`
    WHERE
      _TABLE_SUFFIX BETWEEN '0101' AND '0101' -- FREE n=3k
      -- _TABLE_SUFFIX BETWEEN '0101' AND '0131' -- $ n=179k
      -- _TABLE_SUFFIX BETWEEN '0101' AND '0631' -- $$ n=
      AND type = 'IssuesEvent'
      AND JSON_EXTRACT_SCALAR(payload, '$.action') = 'closed'
      AND JSON_EXTRACT_SCALAR(payload, '$.issue.body') != 'null'
  ),
  nested_labels AS (
    SELECT
      ARRAY(
        SELECT LOWER(JSON_EXTRACT_SCALAR(label_payload, '$.name'))
        FROM UNNEST(JSON_EXTRACT_ARRAY(payload, '$.issue.labels')) AS label_payload
      ) AS labels,
      payload
    FROM close_events
  ),
  synonymized_labels AS (
    SELECT
      ARRAY(
        SELECT DISTINCT label_synonyms.label -- bug, feature, question, documentation
        FROM UNNEST(nested_labels.labels) AS label, label_synonyms
        WHERE label = label_synonyms.synonym
        ORDER BY label_synonyms.label
      ) AS labels,
      payload
    FROM nested_labels
  ),
  filtered_labels AS (
    SELECT labels, payload
    FROM synonymized_labels
    WHERE ARRAY_LENGTH(labels) = 1 -- exactly 1 label
  ),
  concatenated_labels AS (
    SELECT ARRAY_TO_STRING(labels, ',') AS labels, payload
    FROM filtered_labels
  )
SELECT
  JSON_EXTRACT_SCALAR(payload, '$.issue.id') AS id,
  labels,
  JSON_EXTRACT_SCALAR(payload, '$.issue.title') AS title,
  JSON_EXTRACT_SCALAR(payload, '$.issue.body') AS body,
  JSON_EXTRACT_SCALAR(payload, '$.issue.author_association') AS author_association
FROM concatenated_labels
"""

# Run the query and stream its rows into the dataset CSV.
query_job = client.query(query)
rows = query_job.result()  # waits for the job to complete and returns a row iterator

fieldnames = ["id", "labels", "title", "body", "author_association"]

# Write UTF-8 explicitly: issue text contains arbitrary Unicode, the platform default
# encoding may not be able to represent it, and pd.read_csv reads UTF-8 by default.
with open(dataset_file, "w", newline='', encoding="utf-8") as f:
  writer = csv.DictWriter(f, fieldnames=fieldnames)
  writer.writeheader()
  # Give tqdm the row count when the iterator exposes it, so the bar shows real
  # progress; tqdm falls back to a plain counter when the total is None.
  for row in tqdm(rows, desc="BigQuery", total=getattr(rows, "total_rows", None), smoothing=0):
    writer.writerow({
      **row,
      # Strip NUL characters (presumably because they break csv parsing later on --
      # TODO confirm). `or ""` keeps a NULL title/body from raising AttributeError.
      "title": (row["title"] or "").replace("\0", ""),
      "body": (row["body"] or "").replace("\0", ""),
    })



BigQuery: 0it [00:00, ?it/s]

In [3]:
def count_csv_rows(filename):
    """Return the number of data rows in the CSV file at `filename`.

    The file is parsed with csv.DictReader, so the header line is not counted and a
    quoted field that spans several lines still counts as a single row. The file is
    read as UTF-8 rather than with the platform's default encoding.
    """
    with open(filename, "r", newline='', encoding="utf-8") as f:
        return sum(1 for _ in csv.DictReader(f))

# Count the rows once; the split cell reuses total_rows for its progress bar and checks.
total_rows = count_csv_rows(dataset_file)

# Preview only the first five data rows instead of loading the whole file.
dataset_preview = pd.read_csv(dataset_file, nrows=5)
display(dataset_preview)
print("total rows", total_rows)

Unnamed: 0,id,labels,title,body,author_association
0,1079869334,question,Impacted jars by Apache Log4j Tool : Zero Day ...,"Hi Team,\r\nWe are using following jar provide...",NONE
1,1089576774,bug,Simulator environment screenAPI does not handl...,"In VSCode, the following code:\r\n```lua\r\nre...",NONE
2,1091674064,bug,"Keyword ""goto"" gets minified => Lua Error",The keyword `goto` used in Lua gets minified (...,NONE
3,1091670941,bug,Mouse Sensitivity Sliders go backward slightly...,"As the title says, when switching between the ...",NONE
4,1091367555,feature,Some mods (Like JKE and Saber Battle X) crash ...,I know this is likely just due to missing stuf...,NONE


total rows 3039


In [4]:
# Tally how many issues carry each of the four canonical labels.
label_count = {"bug": 0, "feature": 0, "question": 0, "documentation": 0}
with open(dataset_file, "r", newline='', encoding="utf-8") as f:
    for row in csv.DictReader(f):
        # Any label outside the four above raises KeyError here.
        label_count[row["labels"]] += 1
print(label_count)

# Relative frequency of each label; skipped for an empty dataset to avoid dividing by zero.
total = sum(label_count.values())
if total:
    print({k: round(v / total, 3) for k, v in label_count.items()})
else:
    print("no rows in", dataset_file)

{'bug': 1419, 'feature': 1386, 'question': 168, 'documentation': 66}
{'bug': 0.467, 'feature': 0.456, 'question': 0.055, 'documentation': 0.022}


# Training and Evaluation Split

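We split the dataset into a training set (about 90% of the rows) and an evaluation set (about 10%). Each row is assigned at random using a fixed seed, so the split is reproducible for the same dataset file.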

In [5]:
# Derive the seed from a fixed phrase: its SHA-256 digest read as one big-endian
# integer. A dedicated Random instance keeps the split independent of the global RNG.
seed_digest = hashlib.sha256("nlbse2023".encode("utf8")).digest()
seed = int.from_bytes(seed_digest, "big")
r = random.Random(seed)

# Stream the dataset once and send each row to the training file with probability 0.9,
# otherwise to the evaluation file.
# NOTE(review): the draws are consumed in row order and the split is not stratified by
#   label, so the result is reproducible only for an identical dataset file.
fieldnames = ["id", "labels", "title", "body", "author_association"]
with open(dataset_file, "r", newline='', encoding="utf-8") as f, \
        open(train_file, "w", newline='', encoding="utf-8") as f_train, \
        open(eval_file, "w", newline='', encoding="utf-8") as f_eval:
    writer_train = csv.DictWriter(f_train, fieldnames=fieldnames)
    writer_train.writeheader()
    writer_eval = csv.DictWriter(f_eval, fieldnames=fieldnames)
    writer_eval.writeheader()
    # Draw once per row actually read rather than zipping against a generator sized by
    # total_rows: a stale total_rows would otherwise silently drop rows. total_rows is
    # used for the progress bar only. The sequence of draws is the same as before.
    for row in tqdm(csv.DictReader(f), desc="Splitting", total=total_rows, smoothing=0):
        is_train = r.random() < 0.9  # 90% train, 10% eval
        if is_train:
            writer_train.writerow(row)
        else:
            writer_eval.writerow(row)

# Row counts of both splits; the checks that follow compare them with total_rows.
train_rows = count_csv_rows(train_file)
eval_rows = count_csv_rows(eval_file)

# Show a five-row preview and the row count of each split, training set first.
for split_file, count_caption, split_rows in (
    (train_file, "train rows", train_rows),
    (eval_file, "eval rows", eval_rows),
):
    display(pd.read_csv(split_file, nrows=5))
    print(count_caption, split_rows)

# Sanity checks on the split. Every dataset row must land in exactly one file, and each
# file's share of the rows must be within epsilon of its 90% / 10% target.
epsilon = 1e-2

assert train_rows + eval_rows == total_rows

train_share = train_rows / total_rows
assert abs(train_share - 0.9) < epsilon

eval_share = eval_rows / total_rows
assert abs(eval_share - 0.1) < epsilon

Splitting:   0%|          | 0/3039 [00:00<?, ?it/s]

Unnamed: 0,id,labels,title,body,author_association
0,1079869334,question,Impacted jars by Apache Log4j Tool : Zero Day ...,"Hi Team,\r\nWe are using following jar provide...",NONE
1,1089576774,bug,Simulator environment screenAPI does not handl...,"In VSCode, the following code:\r\n```lua\r\nre...",NONE
2,1091674064,bug,"Keyword ""goto"" gets minified => Lua Error",The keyword `goto` used in Lua gets minified (...,NONE
3,1091670941,bug,Mouse Sensitivity Sliders go backward slightly...,"As the title says, when switching between the ...",NONE
4,1091367555,feature,Some mods (Like JKE and Saber Battle X) crash ...,I know this is likely just due to missing stuf...,NONE


train rows 2733


Unnamed: 0,id,labels,title,body,author_association
0,1085424730,bug,按照教程加载cmd出现错误,按照您在bilibili的教程，完全遵循您的教程出现py不能解码，请问如何解决\r\n![1...,NONE
1,1090590579,bug,[BUG] npm ERR when installing RTL,### Describe the bug\r\nAfter cloning the Ride...,NONE
2,957341957,bug,Incomplete setblock command causes sponge-owne...,**I am currently running**\r\n<!-- If you don'...,NONE
3,1034796720,bug,The plugin crashes due to invalid syntax error...,- [x] I've read the [guidelines for Contributi...,NONE
4,1052710504,feature,[SUGGESTION] can you please also add marker li...,\r\n**Describe the feature you'd like**\r\nCan...,NONE


eval rows 306
