# Dataset

In [1]:
from google.cloud import bigquery
import google.auth
import csv
import time
from tqdm.notebook import tqdm
import sys
import sklearn.model_selection
import pandas as pd
import hashlib
import random
import os

csv.field_size_limit(sys.maxsize) # to avoid error: _csv.Error: field larger than field limit (131072)

# CSV artifacts of this notebook, listed in the order the cells below write them.
dataset_all_file = 'nlbse23-issue-classification-all.csv'  # raw BigQuery export
dataset_file = 'nlbse23-issue-classification.csv'  # rows of the export detected as English
train_file = 'nlbse23-issue-classification-train.csv'  # training part of the split
eval_file = 'nlbse23-issue-classification-eval.csv'  # evaluation part of the split


In [2]:

# Credentials are resolved through Application Default Credentials, see
# https://cloud.google.com/docs/authentication/application-default-credentials
client = bigquery.Client(project="nlbse-issue-classification")

# The query below:
#   1. reads "closed" IssuesEvent payloads with a non-null issue body from the
#      githubarchive daily tables of 2022 (table suffix 0101 through 0930),
#   2. lower-cases every label name attached to the issue,
#   3. maps label synonyms onto the four classes bug / feature / question /
#      documentation,
#   4. keeps only issues that end up with exactly one of those classes,
#   5. returns id, labels, title, body and author_association per issue.
# The commented-out _TABLE_SUFFIX ranges are cheaper variants of the scan.
query = """
WITH
  label_synonyms AS (
    SELECT 'bug' AS label, synonym FROM UNNEST(['bug', 'type: bug', 'kind/bug', 'crash', 'defect', 'type-defect', 'type:bug', 'browser bug', 'fix', 'fixed', 'bugfix', 'bug fix', 'resolution: fixed', 'troubleshooting', 'type/bug', 'bug report']) AS synonym
    UNION ALL
    SELECT 'feature', * FROM UNNEST(['feature', 'feature request', 'enhancement', 'improvement', 'type: feature', 'type:feature', 'new feature', 'kind/feature', 'kind/enhancement'])
    UNION ALL
    SELECT 'question', * FROM UNNEST(['question', 'faq', 'type: question', 'type:question'])
    UNION ALL
    SELECT 'documentation', * FROM UNNEST(['documentation', 'docs', 'doc', 'type: documentation', 'needs documentation', 'area/documentation', 'type: docs', 'type:docs', 'needs docs', 'wiki', 'kind/documentation', 'kind/docs'])
  ),
  close_events AS (
    SELECT payload
    FROM `githubarchive.day.2022*`
    WHERE
      -- _TABLE_SUFFIX BETWEEN '0101' AND '0101' -- FREE
      -- _TABLE_SUFFIX BETWEEN '0101' AND '0131' -- $
      _TABLE_SUFFIX BETWEEN '0101' AND '0930' -- $$$ $$$ $$$
      AND type = 'IssuesEvent'
      AND JSON_EXTRACT_SCALAR(payload, '$.action') = 'closed'
      AND JSON_EXTRACT_SCALAR(payload, '$.issue.body') != 'null'
  ),
  nested_labels AS (
    SELECT
      ARRAY(
        SELECT LOWER(JSON_EXTRACT_SCALAR(label_payload, '$.name'))
        FROM UNNEST(JSON_EXTRACT_ARRAY(payload, '$.issue.labels')) AS label_payload
      ) AS labels,
      payload
    FROM close_events
  ),
  synonymized_labels AS (
    SELECT
      ARRAY(
        SELECT DISTINCT label_synonyms.label -- bug, feature, question, documentation
        FROM UNNEST(nested_labels.labels) AS label, label_synonyms
        WHERE label = label_synonyms.synonym
        ORDER BY label_synonyms.label
      ) AS labels,
      payload
    FROM nested_labels
  ),
  filtered_labels AS (
    SELECT labels, payload
    FROM synonymized_labels
    WHERE ARRAY_LENGTH(labels) = 1 -- exactly 1 label
  ),
  concatenated_labels AS (
    SELECT ARRAY_TO_STRING(labels, ',') AS labels, payload
    FROM filtered_labels
  )
SELECT
  JSON_EXTRACT_SCALAR(payload, '$.issue.id') AS id,
  labels,
  JSON_EXTRACT_SCALAR(payload, '$.issue.title') AS title,
  JSON_EXTRACT_SCALAR(payload, '$.issue.body') AS body,
  JSON_EXTRACT_SCALAR(payload, '$.issue.author_association') AS author_association
FROM concatenated_labels
"""

# Submit the query and stream its result rows into the raw dataset CSV.
query_job = client.query(query)
rows = query_job.result()

fieldnames = ["id", "labels", "title", "body", "author_association"]
with open(dataset_all_file, "w", newline='', encoding='utf-8') as f:
  writer = csv.DictWriter(f, fieldnames=fieldnames)
  writer.writeheader()
  for row in tqdm(rows, desc="BigQuery", smoothing=0):
    record = {**row}
    # Drop NUL characters from the free-text columns
    # (presumably because they break CSV parsing further down - confirm).
    for text_column in ("title", "body"):
      record[text_column] = row[text_column].replace("\0", "")
    writer.writerow(record)



BigQuery: 0it [00:00, ?it/s]

In [3]:
def count_csv_rows(filename):
	"""Return the number of data rows (header excluded) in a UTF-8 CSV file.

	Rows are counted as parsed by ``csv.DictReader``, so a quoted field that
	spans several physical lines still counts as a single row.
	"""
	row_total = 0
	with open(filename, "r", newline='', encoding='utf-8') as csv_file:
		for _record in csv.DictReader(csv_file):
			row_total += 1
	return row_total

def dataset_stats(filename):
	"""Print a CSV file's name and row count, then display its first five rows."""
	print(filename)
	row_total = count_csv_rows(filename)
	print("total rows", row_total)
	# Only the first five rows are parsed by pandas; the full file is never loaded here.
	preview = pd.read_csv(filename, nrows=5)
	display(preview)

# Size and preview of the raw BigQuery export.
dataset_stats(dataset_all_file)

nlbse23-issue-classification-all.csv
total rows 1555561


Unnamed: 0,id,labels,title,body,author_association
0,1199051804,documentation,setting a logging Handler name,BPO | [43058](https://bugs.python.org/issue430...,MANNEQUIN
1,1199074324,documentation,Improve documentation for typing._GenericAlias,BPO | [46589](https://bugs.python.org/issue465...,MANNEQUIN
2,1199022454,documentation,Description of '\w' behavior is vague in `re` ...,BPO | [38566](https://bugs.python.org/issue385...,MANNEQUIN
3,1199028356,documentation,add docstrings to functions in pdb module,BPO | [39278](https://bugs.python.org/issue392...,MANNEQUIN
4,1199055394,documentation,Documentation needs to declare CalledProcessEr...,BPO | [43635](https://bugs.python.org/issue436...,MANNEQUIN


In [4]:
def label_stats(filename):
	"""Print per-label row counts and their relative shares for a CSV file.

	Args:
		filename: Path to a UTF-8 CSV file with a ``labels`` column whose value
			is one of ``bug``, ``feature``, ``question`` or ``documentation``.

	Raises:
		KeyError: If a row carries a label outside those four classes.
	"""
	label_count = {"bug": 0, "feature": 0, "question": 0, "documentation": 0}
	with open(filename, "r", newline='', encoding='utf-8') as f:
		for row in tqdm(csv.DictReader(f), desc="Counting labels", total=count_csv_rows(filename)):
			label_count[row["labels"]] += 1
	print(filename)
	print("label counts", label_count)
	total = sum(label_count.values())
	# A file without data rows gives total == 0; report 0.0 shares instead of
	# raising ZeroDivisionError.
	print("label distribution", {k: (round(v / total, 3) if total else 0.0) for k, v in label_count.items()})

# Label balance of the raw export, before language filtering.
label_stats(dataset_all_file)

Counting labels:   0%|          | 0/1555561 [00:00<?, ?it/s]

nlbse23-issue-classification-all.csv
label counts {'bug': 801069, 'feature': 589064, 'question': 91373, 'documentation': 74055}
label distribution {'bug': 0.515, 'feature': 0.379, 'question': 0.059, 'documentation': 0.048}


# Language

In [5]:
import fasttext
import gensim

# https://fasttext.cc/docs/en/language-identification.html
# Download the pretrained fastText language-identification model (lid.176.bin)
# if it is not already present in the working directory.
if not os.path.isfile("lid.176.bin"):
  !curl "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin" -o "lid.176.bin"

# Module-level model used by identify_language().
model = fasttext.load_model("lid.176.bin")

def identify_language(text):
  """Return the language code of the model's top prediction for `text`.

  Runs of whitespace are collapsed first, then the first predicted label is
  returned without its leading 9 characters (the fastText `__label__` prefix).
  """
  collapsed = gensim.parsing.strip_multiple_whitespaces(text)
  prediction = model.predict(collapsed)
  predicted_labels = prediction[0]
  top_label = predicted_labels[0]
  return top_label[len("__label__"):]



In [6]:
def language_stats(filename):
	"""Print how many rows of a CSV file are detected as each language.

	The language of a row is detected on its title and body joined by a single
	space. Counts and their relative shares are printed after the file name.
	"""
	language_count = {}
	with open(filename, "r", newline='', encoding='utf-8') as f:
		progress = tqdm(csv.DictReader(f), desc="Language stats", total=count_csv_rows(filename))
		for row in progress:
			lang = identify_language(f"{row['title']} {row['body']}")
			# Languages are added in order of first appearance.
			language_count[lang] = language_count.get(lang, 0) + 1
	print(filename)
	print("language counts", language_count)
	total = sum(language_count.values())
	shares = {}
	for lang, count in language_count.items():
		shares[lang] = round(count / total, 3)
	print("language distribution", shares)

# Language breakdown of the raw export.
language_stats(dataset_all_file)

Language stats:   0%|          | 0/1555561 [00:00<?, ?it/s]

nlbse23-issue-classification-all.csv
language counts {'en': 1418201, 'zh': 30229, 'id': 1842, 'de': 8177, 'pt': 8759, 'ru': 10913, 'gl': 12, 'nl': 2233, 'it': 2269, 'ko': 29715, 'es': 13338, 'fr': 8013, 'fa': 287, 'ja': 13697, 'uk': 1305, 'hr': 92, 'tt': 11, 'ms': 92, 'pl': 821, 'hu': 451, 'el': 90, 'sv': 386, 'cs': 806, 'tr': 544, 'sah': 9, 'ro': 58, 'ar': 84, 'no': 519, 'is': 15, 'vi': 728, 'sr': 247, 'da': 260, 'sh': 38, 'wuu': 4, 'ca': 149, 'he': 67, 'la': 37, 'th': 253, 'kn': 23, 'fi': 211, 'lv': 3, 'ceb': 46, 'hy': 8, 'sl': 58, 'mk': 23, 'bg': 19, 'et': 38, 'ml': 9, 'ka': 4, 'lt': 35, 'sk': 86, 'eo': 9, 'ba': 12, 'mn': 14, 'be': 12, 'eu': 15, 'tl': 20, 'bs': 20, 'war': 2, 'ta': 6, 'oc': 14, 'az': 7, 'hi': 20, 'uz': 13, 'ur': 6, 'sq': 7, 'te': 5, 'qu': 1, 'af': 3, 'ku': 1, 'pms': 2, 'my': 2, 'cv': 1, 'lb': 2, 'mr': 4, 'io': 1, 'ast': 1, 'gu': 2, 'si': 3, 'nds': 2, 'bn': 8, 'jv': 2, 'als': 1, 'ht': 1, 'gv': 1, 'tg': 3, 'sco': 1, 'fy': 1, 'cy': 3, 'arz': 3, 'nn': 6, 'km': 1, 'pa': 1

In [7]:
def filter_en(filename_csv_in, filename_csv_out):
	"""Copy the rows detected as English from one CSV file to another.

	The output keeps the input's header. A row is kept when the language
	detected on "<title> <body>" is "en".
	"""
	def is_english(row):
		return identify_language(f"{row['title']} {row['body']}") == "en"

	with open(filename_csv_in, "r", newline='', encoding='utf-8') as f_in:
		with open(filename_csv_out, "w", newline='', encoding='utf-8') as f_out:
			reader = csv.DictReader(f_in)
			writer = csv.DictWriter(f_out, fieldnames=reader.fieldnames)
			writer.writeheader()
			progress = tqdm(reader, desc="Filtering EN", total=count_csv_rows(filename_csv_in))
			writer.writerows(row for row in progress if is_english(row))

# Write the English-only dataset from the raw export.
filter_en(dataset_all_file, dataset_file)

Filtering EN:   0%|          | 0/1555561 [00:00<?, ?it/s]

In [8]:

# Label balance after keeping only English rows.
label_stats(dataset_file)

Counting labels:   0%|          | 0/1418201 [00:00<?, ?it/s]

nlbse23-issue-classification.csv
label counts {'bug': 745732, 'feature': 525013, 'question': 84538, 'documentation': 62918}
label distribution {'bug': 0.526, 'feature': 0.37, 'question': 0.06, 'documentation': 0.044}


# Training and Evaluation Split

We split the dataset into training and evaluation sets.

In [9]:
def split_dataset(filename_in, filename_out_train, filename_out_test, train_percentage):
	"""Randomly split a CSV dataset into a training file and a test file.

	Each row is assigned independently: it goes to the training file with
	probability `train_percentage`, otherwise to the test file. The resulting
	sizes are therefore approximate, and the split is not stratified by label.
	The random generator is seeded from the SHA-256 digest of "nlbse2023", so
	the same input and `train_percentage` always give the same split.

	Args:
		filename_in: Path of the UTF-8 CSV file to split.
		filename_out_train: Path of the training CSV to write.
		filename_out_test: Path of the test CSV to write.
		train_percentage: Fraction between 0 and 1 (e.g. 0.9) of rows that
			should go to the training file.
	"""
	seed = int(hashlib.sha256("nlbse2023".encode('utf8')).hexdigest(), 16)
	r = random.Random(seed)

	# Use UTF-8 explicitly, like every other reader and writer in this notebook,
	# so the result does not depend on the platform's default encoding.
	with open(filename_in, "r", newline='', encoding='utf-8') as f, \
			open(filename_out_train, "w", newline='', encoding='utf-8') as f_train, \
			open(filename_out_test, "w", newline='', encoding='utf-8') as f_test:
		fieldnames = ["id", "labels", "title", "body", "author_association"]
		writer_train = csv.DictWriter(f_train, fieldnames=fieldnames)
		writer_train.writeheader()
		writer_test = csv.DictWriter(f_test, fieldnames=fieldnames)
		writer_test.writeheader()
		total = count_csv_rows(filename_in)
		for row in tqdm(csv.DictReader(f), desc="Splitting", total=total, smoothing=0):
			# Exactly one draw per row, in file order, compared against the
			# requested fraction (previously a hard-coded 0.9).
			if r.random() < train_percentage:
				writer_train.writerow(row)
			else:
				writer_test.writerow(row)

# Split the English-only dataset, passing 0.9 as the training fraction.
split_dataset(dataset_file, train_file, eval_file, train_percentage=0.9)

# Row count and preview of each resulting file.
dataset_stats(train_file)
dataset_stats(eval_file)

Splitting:   0%|          | 0/1418201 [00:00<?, ?it/s]

nlbse23-issue-classification-train.csv
total rows 1275881


Unnamed: 0,id,labels,title,body,author_association
0,1199051804,documentation,setting a logging Handler name,BPO | [43058](https://bugs.python.org/issue430...,MANNEQUIN
1,1199074324,documentation,Improve documentation for typing._GenericAlias,BPO | [46589](https://bugs.python.org/issue465...,MANNEQUIN
2,1199022454,documentation,Description of '\w' behavior is vague in `re` ...,BPO | [38566](https://bugs.python.org/issue385...,MANNEQUIN
3,1199028356,documentation,add docstrings to functions in pdb module,BPO | [39278](https://bugs.python.org/issue392...,MANNEQUIN
4,1199055394,documentation,Documentation needs to declare CalledProcessEr...,BPO | [43635](https://bugs.python.org/issue436...,MANNEQUIN


nlbse23-issue-classification-eval.csv
total rows 142320


Unnamed: 0,id,labels,title,body,author_association
0,1199053386,documentation,A possible misleading expression in the Virtua...,BPO | [43319](https://bugs.python.org/issue433...,MANNEQUIN
1,1255069635,bug,[BUG] a valid `gameName` in the `create a new ...,**Describe the bug**\r\nIn the `create a new p...,NONE
2,1089772715,feature,How to check if a certain entity still exists?,During a bug in my own code I noticed that the...,NONE
3,1000928729,feature,chose the timezone in dbeaver option,"Dbeaver 21.2.0\r\n\r\nFor all version DBeaver,...",NONE
4,1300011093,bug,[Issue]: Multiple Versions of a Movie not work...,### Please describe your bug\n\nThe doc at htt...,NONE
