In [1]:
import os
import time
import csv
import numpy as np
import pandas as pd
import fasttext
import gensim
import sklearn.metrics
import re
import unicodedata
from tqdm import tqdm
import sys
import hashlib
import random

# Names of the dataset CSV files; the download cell also derives the archive URLs from them.
train_file = 'nlbse23-issue-classification-train.csv'
test_file = 'nlbse23-issue-classification-test.csv'

# Raise the csv module's per-field size cap as far as the platform allows,
# to avoid error: _csv.Error: field larger than field limit (131072).
# sys.maxsize does not fit in a C long on some platforms (e.g. 64-bit Windows), where
# csv.field_size_limit raises OverflowError, so halve the value until it is accepted.
_csv_field_limit = sys.maxsize
while True:
	try:
		csv.field_size_limit(_csv_field_limit)
		break
	except OverflowError:
		_csv_field_limit //= 2

def count_csv_rows(filename):
	"""Return the number of data records in a UTF-8 CSV file.

	The file is parsed with csv.DictReader, so the header line is not counted
	and a quoted field that spans several physical lines counts as one record.
	"""
	row_total = 0
	with open(filename, "r", newline='', encoding='utf-8') as handle:
		for _record in csv.DictReader(handle):
			row_total += 1
	return row_total

def print_csv_preview(filename):
	"""Print the file name and its total record count, then display the first five rows.

	The count comes from a full pass over the file (count_csv_rows); the preview
	itself only reads five rows through pandas.
	"""
	print(filename)
	row_total = count_csv_rows(filename)
	print("total rows", row_total)
	head_rows = pd.read_csv(filename, nrows=5)
	display(head_rows)
	

In [3]:
# download the training set if it does not exist
if not os.path.isfile(train_file):
  !curl "https://tickettagger.blob.core.windows.net/datasets/{train_file}.tar.gz" | tar -xz

print_csv_preview(train_file)

if not os.path.isfile(test_file):
  !curl "https://tickettagger.blob.core.windows.net/datasets/{test_file}.tar.gz" | tar -xz

print_csv_preview(test_file)


nlbse23-issue-classification-train.csv
total rows 1275881


Unnamed: 0,id,labels,title,body,author_association
0,1199051804,documentation,setting a logging Handler name,BPO | [43058](https://bugs.python.org/issue430...,MANNEQUIN
1,1199074324,documentation,Improve documentation for typing._GenericAlias,BPO | [46589](https://bugs.python.org/issue465...,MANNEQUIN
2,1199022454,documentation,Description of '\w' behavior is vague in `re` ...,BPO | [38566](https://bugs.python.org/issue385...,MANNEQUIN
3,1199028356,documentation,add docstrings to functions in pdb module,BPO | [39278](https://bugs.python.org/issue392...,MANNEQUIN
4,1199055394,documentation,Documentation needs to declare CalledProcessEr...,BPO | [43635](https://bugs.python.org/issue436...,MANNEQUIN


nlbse23-issue-classification-test.csv
total rows 142320


Unnamed: 0,id,labels,title,body,author_association
0,1199053386,documentation,A possible misleading expression in the Virtua...,BPO | [43319](https://bugs.python.org/issue433...,MANNEQUIN
1,1255069635,bug,[BUG] a valid `gameName` in the `create a new ...,**Describe the bug**\r\nIn the `create a new p...,NONE
2,1089772715,feature,How to check if a certain entity still exists?,During a bug in my own code I noticed that the...,NONE
3,1000928729,feature,chose the timezone in dbeaver option,"Dbeaver 21.2.0\r\n\r\nFor all version DBeaver,...",NONE
4,1300011093,bug,[Issue]: Multiple Versions of a Movie not work...,### Please describe your bug\n\nThe doc at htt...,NONE


In [2]:

def preprocess(text):
  """Normalize one text field before it is written to, or scored by, fastText.

  Steps currently enabled:
    1. Coerce the value to ``str``; ``None`` (which csv.DictReader yields for a
       field missing from a short row) becomes an empty string instead of the
       literal token "None".
    2. Remove every occurrence of fastText's ``__label__`` marker so issue text
       cannot smuggle a label into the training file.
    3. Collapse each run of whitespace (tabs and line breaks included) into a
       single space, which keeps every document on one line.

  Args:
    text: the raw field value; any object is accepted and stringified.

  Returns:
    The cleaned text as a ``str``.
  """
  text = "" if text is None else str(text)

  # escape fasttext special sequences
  # A single replace() is not enough: deleting one marker can splice a new one
  # together (e.g. "__la__label__bel__" -> "__label__"), so repeat until none is left.
  while "__label__" in text:
    text = text.replace("__label__", "")

  # Optional normalization steps, currently disabled:
  # lowercase
  # text = text.lower()

  # remove html tags
  # text = gensim.parsing.preprocessing.strip_tags(text)

  # remove punctuation
  # text = gensim.parsing.preprocessing.strip_punctuation(text)

  # remove numerics
  # text = gensim.parsing.preprocessing.strip_numeric(text)

  # remove consecutive whitespace characters and convert tabs to spaces
  # (same substitution gensim's strip_multiple_whitespaces applies, done with the stdlib)
  text = re.sub(r"\s+", " ", text)

  # text = gensim.parsing.preprocessing.strip_short(text, minsize=3)

  # text = gensim.parsing.preprocessing.remove_stopwords(text)

  # text = gensim.parsing.preprocessing.stem_text(text)

  return text

def preprocess_row(row):
  """Build the document text for one issue record.

  The record's "title" and "body" fields are each passed through preprocess()
  and joined with a single space, title first.
  """
  cleaned_fields = [preprocess(row[field]) for field in ("title", "body")]
  return " ".join(cleaned_fields)

In [3]:

def transform_to_fasttext_format(i_path, o_path):
	"""Convert an issue CSV into a fastText supervised-training text file.

	Each CSV record becomes one output line of the form
	``__label__<labels> <preprocessed title and body>``.

	Args:
		i_path: path of the UTF-8 CSV to read; the "labels", "title" and "body" columns are used.
		o_path: path of the text file to write (opened in "w" mode, so it is overwritten).
	"""
	with open(i_path, "r", newline='', encoding="utf-8") as src:
		with open(o_path, "w", encoding='utf-8') as dst:
			records = csv.DictReader(src)
			# Extra pass over the input so the progress bar knows its total.
			n_records = count_csv_rows(i_path)
			progress = tqdm(records, desc="Transform to fastText format", total=n_records)
			for record in progress:
				label = record['labels']
				document = preprocess_row(record)
				dst.write(f"__label__{label} {document}\n")

transform_to_fasttext_format(train_file, "issues.train")

!wc -l "issues.train"
!head -n 2 "issues.train"

Transform to fastText format:  60%|██████    | 766081/1275881 [00:59<00:28, 17695.60it/s]

In [16]:
# Parameter reference: https://fasttext.cc/docs/en/python-module.html#train_supervised-parameters
# Only the input path is passed, so every other training parameter stays at the library default.
model = fasttext.train_supervised("issues.train")

# Persist the trained model to disk.
model.save_model("issues.bin")

# Optional (disabled): quantize the model and save it under a separate file name.
# model.quantize()
# model.save_model(f"issues.ftz")

Read 170M words
Number of words:  11143356
Number of labels: 4
Progress: 100.0% words/sec/thread: 1689349 lr:  0.000000 avg.loss:  0.462312 ETA:   0h 0m 0s100.0% words/sec/thread: 1689354 lr: -0.000001 avg.loss:  0.462312 ETA:   0h 0m 0s


In [17]:
# Evaluate the trained model on the test CSV: per-class and micro-averaged
# precision / recall / F1.
y_true = []
y_pred = []

# The predicted label is stripped of its leading "__label__" marker (9 characters)
# so it can be compared with the raw label stored in the CSV.
label_prefix_len = len("__label__")

with open(test_file, newline='', encoding='utf-8') as test_csv:
  for issue in tqdm(csv.DictReader(test_csv), desc="Benchmarking Inference Performance"):
    # predict() returns (labels, probabilities); take the first (top) label.
    top_label = model.predict(preprocess_row(issue))[0][0]
    y_true.append(issue["labels"])
    y_pred.append(top_label[label_prefix_len:])

for label in ["bug", "feature", "question", "documentation"]:
  # One call returns precision, recall and F1 (plus support); with average=None and a
  # single requested label each result is a one-element array.
  class_scores = sklearn.metrics.precision_recall_fscore_support(
    y_true, y_pred, average=None, labels=[label]
  )
  P_c, R_c, F1_c = (per_class[0] for per_class in class_scores[:3])
  print(f"=*= {label} =*=")
  print(f"precision:\t{P_c:.4f}")
  print(f"recall:\t\t{R_c:.4f}")
  print(f"F1 score:\t{F1_c:.4f}")
  print()


# Micro averaging pools the decisions of all classes before computing the scores.
P, R, F1 = sklearn.metrics.precision_recall_fscore_support(y_true, y_pred, average='micro')[:3]

print("=*= micro averages =*=")
print(f"precision:\t{P:.4f}")
print(f"recall:\t\t{R:.4f}")
print(f"F1 score:\t{F1:.4f}")

Benchmarking Inference Performance: 0it [00:00, ?it/s]

=*= bug =*=
precision:	0.8771
recall:		0.9173
F1 score:	0.8967

=*= feature =*=
precision:	0.8415
recall:		0.8621
F1 score:	0.8517

=*= question =*=
precision:	0.6702
recall:		0.4555
F1 score:	0.5424

=*= documentation =*=
precision:	0.7363
recall:		0.5011
F1 score:	0.5964

=*= global =*=
precision:	0.8510
recall:		0.8510
F1 score:	0.8510
