In [2]:
!nvidia-smi

Mon Nov 21 16:05:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| 46%   50C    P5    29W / 170W |      0MiB / 12288MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
import os
import time
import numpy as np
import pandas as pd
import gensim
import sklearn.metrics
import re
import unicodedata
from tqdm import tqdm
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import wandb
import csv
from functools import partial
import itertools
import random
import sys
import hashlib
import time

train_file = 'nlbse23-issue-classification-train.csv'
test_file = 'nlbse23-issue-classification-test.csv'

csv.field_size_limit(sys.maxsize) # to avoid error: _csv.Error: field larger than field limit (131072)

def count_tokens(text):
	return len(text.split())

def count_csv_rows(csv_file):
	with open(csv_file, 'r', newline='', encoding='utf-8') as f:
		return sum(1 for _ in csv.DictReader(f))

def print_csv_preview(filename, sep=None):
	print(filename)
	print("total rows", count_csv_rows(filename))
	display(pd.read_csv(filename, nrows=5, sep=sep))

def sample_csv(file, n_sample):
	n_population = count_csv_rows(file)
	skiprows = random.sample(range(1, n_population), n_population - n_sample)
	return pd.read_csv(file, skiprows=skiprows)


### Download dataset

In [None]:
# download the training set if it does not exist
if not os.path.isfile(train_file):
  !curl "https://tickettagger.blob.core.windows.net/datasets/{train_file}.tar.gz" | tar -xz

print_csv_preview(train_file)

if not os.path.isfile(test_file):
  !curl "https://tickettagger.blob.core.windows.net/datasets/{test_file}.tar.gz" | tar -xz

print_csv_preview(test_file)

nlbse23-issue-classification-train.csv
total rows 1275881


Unnamed: 0,id,labels,title,body,author_association
0,1199051804,documentation,setting a logging Handler name,BPO | [43058](https://bugs.python.org/issue430...,MANNEQUIN
1,1199074324,documentation,Improve documentation for typing._GenericAlias,BPO | [46589](https://bugs.python.org/issue465...,MANNEQUIN
2,1199022454,documentation,Description of '\w' behavior is vague in `re` ...,BPO | [38566](https://bugs.python.org/issue385...,MANNEQUIN
3,1199028356,documentation,add docstrings to functions in pdb module,BPO | [39278](https://bugs.python.org/issue392...,MANNEQUIN
4,1199055394,documentation,Documentation needs to declare CalledProcessEr...,BPO | [43635](https://bugs.python.org/issue436...,MANNEQUIN


nlbse23-issue-classification-test.csv
total rows 142320


Unnamed: 0,id,labels,title,body,author_association
0,1199053386,documentation,A possible misleading expression in the Virtua...,BPO | [43319](https://bugs.python.org/issue433...,MANNEQUIN
1,1255069635,bug,[BUG] a valid `gameName` in the `create a new ...,**Describe the bug**\r\nIn the `create a new p...,NONE
2,1089772715,feature,How to check if a certain entity still exists?,During a bug in my own code I noticed that the...,NONE
3,1000928729,feature,chose the timezone in dbeaver option,"Dbeaver 21.2.0\r\n\r\nFor all version DBeaver,...",NONE
4,1300011093,bug,[Issue]: Multiple Versions of a Movie not work...,### Please describe your bug\n\nThe doc at htt...,NONE


### Preprocess

In [None]:
function_sig_regex = re.compile(r'[a-zA-Z][a-zA-Z0-9_.]*\([a-zA-Z0-9_, ]*\)')
issue_id_regex = re.compile(r'#[0-9]+')
non_ascii_char_regex = re.compile(r'[^\x00-\x7f]')
punctuations = '!\'"`$%&\()*,/:;<=>[\\]^{|}~+#@-_'
punctuations_trans = str.maketrans(punctuations, " " * len(punctuations))

def preprocess(text, max_tokens=None):
  text = str(text)

  # replace function signatures
  text = function_sig_regex.sub(" FUNCTION ", text)

  # replace issue ids
  text = issue_id_regex.sub(" ISSUE ", text)
  
  # remove html tags
  # text = gensim.parsing.preprocessing.strip_tags(text)
  
  # remove punctuation
  text = text.translate(punctuations_trans)
  
  # remove numerics
  # text = gensim.parsing.preprocessing.strip_numeric(text)
  
  # remove non-ascii characters
  text = non_ascii_char_regex.sub("", text)
  
  text = unicodedata.normalize('NFD', text)
  
  # remove consecutive whitespace characters and convert tabs to spaces
  text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
  
  # limit the number of tokens
  if max_tokens is not None:
    text = " ".join(text.split()[:max_tokens])
  
  return text

In [None]:
sample_df = sample_csv(train_file, 50_000)

q=[.5, .75, .8, .85, .9, .95, .99, .999]

display("title token count quantiles", sample_df["title"].apply(preprocess).apply(count_tokens).quantile(q=q))
display("body token count quantiles", sample_df["body"].apply(preprocess).apply(count_tokens).quantile(q=q))

'title token count quantiles'

0.500     7.0
0.750    10.0
0.800    10.0
0.850    11.0
0.900    12.0
0.950    15.0
0.990    20.0
0.999    30.0
Name: title, dtype: float64

'body token count quantiles'

0.500      72.000
0.750     152.000
0.800     181.000
0.850     223.000
0.900     296.000
0.950     464.000
0.990    1414.030
0.999    6433.146
Name: body, dtype: float64

In [None]:
# transform dataset into simpletransformers format
# https://simpletransformers.ai/docs/classification-data-formats/#multi-class-classification

def preprocess_row(row):
  doc = "TITLE " + preprocess(row["title"], max_tokens=20) # 99% of titles fit
  doc += " BODY " + preprocess(row["body"], max_tokens=511-count_tokens(doc))

  assert count_tokens(doc) <= 512

  return doc

def transform_to_simpletransformers_format(i_path, o_path):
	label_map = {"bug": 0, "feature": 1, "question": 2, "documentation": 3 }

	with open(i_path, "r") as i_f, open(o_path, "w") as o_f:
		reader = csv.DictReader(i_f)
		writer = csv.DictWriter(o_f, fieldnames=["text", "labels"], delimiter="\t")
		writer.writeheader()
		total = count_csv_rows(i_path)
		for row in tqdm(reader, desc="Transform to simpletransformers format", total=total):
			text = preprocess_row(row)
			labels = label_map[row["labels"]]
			writer.writerow({"text": text, "labels": labels})

transform_to_simpletransformers_format(train_file, "train.csv")
transform_to_simpletransformers_format(test_file, "test.csv")

print_csv_preview("train.csv", sep='\t')
print_csv_preview("test.csv", sep='\t')

Transform to simpletransformers format:   0%|          | 0/1275881 [00:00<?, ?it/s]

Transform to simpletransformers format:   0%|          | 0/142320 [00:00<?, ?it/s]

train.csv
total rows 1275881


Unnamed: 0,text,labels
0,TITLE setting a logging Handler name BODY BPO ...,3
1,TITLE Improve documentation for typing. Generi...,3
2,TITLE Description of w behavior is vague in re...,3
3,TITLE add docstrings to functions in pdb modul...,3
4,TITLE Documentation needs to declare CalledPro...,3


test.csv
total rows 142320


Unnamed: 0,text,labels
0,TITLE A possible misleading expression in the ...,3
1,TITLE BUG a valid gameName in the create a new...,0
2,TITLE How to check if a certain entity still e...,1
3,TITLE chose the timezone in dbeaver option BOD...,1
4,TITLE Issue Multiple Versions of a Movie not w...,0


### Training Loop

In [3]:

def model_args():
  timestamp = str(int(time.time()))

  # https://simpletransformers.ai/docs/usage/#configuring-a-simple-transformers-model
  args = ClassificationArgs()

  args.max_seq_length = 224
  args.learning_rate = 1e-4 # 4e-5
  args.num_train_epochs = 8
  args.train_batch_size = 64
  args.eval_batch_size = 64
  args.gradient_accumulation_steps = 4

  # custom evaluation metric
  # https://github.com/ThilinaRajapakse/simpletransformers/discussions/911
  args.use_early_stopping = False
  args.early_stopping_metric = "f1_micro"
  args.early_stopping_metric_minimize = False

  # evaluate at end of each epoch
  args.evaluate_during_training = True
  args.evaluate_during_training_steps = int(1e20) # never
  # args.evaluate_during_training_steps = 1.5  * 60 * 60 // args.gradient_accumulation_steps # target 1h

  # https://simpletransformers.ai/docs/classification-specifics/#lazy-loading-data
  args.lazy_loading = True

  args.save_steps = -1
  args.logging_steps = max(1, 1.6 * 30 // args.gradient_accumulation_steps) # target 30s
  args.manual_seed = 0

  args.output_dir = f"outputs/{timestamp}"
  args.best_model_dir = f"{args.output_dir}/best_model"

  # https://docs.wandb.ai/guides/integrations/other/simpletransformers
  # https://simpletransformers.ai/docs/tips-and-tricks/#visualization-support
  args.wandb_project = "NLBSE'23 Issue Report Classification"
  args.wandb_kwargs = {"entity": "nlbse", "notes": f"timestamp:{timestamp}"}

  return args

metrics = {
  "precision_bug": partial(sklearn.metrics.precision_score, average=None, labels=[0]),
  "recall_bug": partial(sklearn.metrics.recall_score, average=None, labels=[0]),
  "f1_bug": partial(sklearn.metrics.f1_score, average=None, labels=[0]),

  "precision_feature": partial(sklearn.metrics.precision_score, average=None, labels=[1]),
  "recall_feature": partial(sklearn.metrics.recall_score, average=None, labels=[1]),
  "f1_feature": partial(sklearn.metrics.f1_score, average=None, labels=[1]),

  "precision_question": partial(sklearn.metrics.precision_score, average=None, labels=[2]),
  "recall_question": partial(sklearn.metrics.recall_score, average=None, labels=[2]),
  "f1_question": partial(sklearn.metrics.f1_score, average=None, labels=[2]),

  "precision_documentation": partial(sklearn.metrics.precision_score, average=None, labels=[3]),
  "recall_documentation": partial(sklearn.metrics.recall_score, average=None, labels=[3]),
  "f1_documentation": partial(sklearn.metrics.f1_score, average=None, labels=[3]),

  "precision_micro": partial(sklearn.metrics.precision_score, average='micro'),
  "recall_micro": partial(sklearn.metrics.recall_score, average='micro'),
  "f1_micro": partial(sklearn.metrics.f1_score, average='micro'),
}

In [4]:
model = ClassificationModel(
  'roberta', 
  'roberta-base', 
  args=model_args(), 
  num_labels=4
)

model.train_model(train_df="train.csv", eval_df="test.csv", **metrics)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrafaelkallis[0m ([33mnlbse[0m). Use [1m`wandb login --relogin`[0m to force relogin


Running Epoch 0 of 8:   0%|          | 0/19936 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB (GPU 0; 11.77 GiB total capacity; 10.36 GiB already allocated; 72.25 MiB free; 10.76 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [17]:
model = model = ClassificationModel(
	"roberta",
	"outputs/<timestamp>/best_model",
  args=model_args(), 
  num_labels=4,
)

results, model_outputs, wrong_pred = model.eval_model(eval_df="test.csv", **metrics)
results



# def batch(items, n):
#   b = list()
#   for i, item in enumerate(items):
#     b.append(item)
#     if i % n == n-1:
#       yield b
#       b = list()
#   if len(b) > 0:
#     yield b


# # confusion matrix
# y_true = []
# y_pred = []

# wandb.init(project="NLBSE 2023 Template")

# with open("test.csv", newline='', encoding='utf-8') as f:
#   label_map = {0: "bug", 1: "feature", 2: "question", 3: "documentation"}
#   total = count_csv_rows("test.csv")
#   reader = csv.DictReader(f)
#   for row_batch in batch(tqdm(reader, desc="Benchmarking Inference Performance", total=total), n=32):
#     # https://simpletransformers.ai/docs/classification-models/#making-predictions-with-a-classification-model
#     preds, model_outputs = model.predict([row["text"] for row in row_batch])

#     y_true.extend([label_map[int(row["labels"])] for row in row_batch])
#     y_pred.extend([label_map[pred] for pred in preds])
#     if len(y_true) >= 10_000:
#       break

# report = sklearn.metrics.classification_report(y_true, y_pred, output_dict=True)

# for label in ["bug", "feature", "question", "documentation"]:
#   # P = sklearn.metrics.precision_score(y_true, y_pred, average=None, labels=[label])[0]
#   # R = sklearn.metrics.recall_score(y_true, y_pred, average=None, labels=[label])[0]
#   # F1 = sklearn.metrics.f1_score(y_true, y_pred, average=None, labels=[label])[0]
#   P = report[label]["precision"]
#   R = report[label]["recall"]
#   F1 = report[label]["f1-score"]
#   support = report[label]["support"]
#   print(f"=*= {label} =*=")
#   print(f"precision:\t{P:.4f}")
#   print(f"recall:\t\t{R:.4f}")
#   print(f"f1 score:\t{F1:.4f}")
#   print(f"support:\t{support}")
#   print()

#   wandb.log({ f"precision_{label}": P, f"recall_{label}": R, f"f1_{label}": F1, f"support_{label}": support }) 

# # P = sklearn.metrics.precision_score(y_true, y_pred, average='micro')
# # R = sklearn.metrics.recall_score(y_true, y_pred, average='micro')
# # F1 = sklearn.metrics.f1_score(y_true, y_pred, average='micro')
# P, R, F1, support = sklearn.metrics.precision_recall_fscore_support(y_true, y_pred, average='micro')

# print("=*= micro averages =*=")
# print(f"precision:\t{P:.4f}")
# print(f"recall:\t\t{R:.4f}")
# print(f"F1 score:\t{F1:.4f}")
# print(f"precision:\t{support}")

# wandb.log({ "precision_micro": P, "recall_micro": R, "f1_micro": F1, "support_micro": support })

# wandb.join()

Running Evaluation:   0%|          | 0/2224 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
[34m[1mwandb[0m: Currently logged in as: [33mrafaelkallis[0m ([33mnlbse[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666944669947649, max=1.0)…



{'mcc': 0.8080528003315748,
 'precision_bug': array([0.91259603]),
 'recall_bug': array([0.93561199]),
 'f1_bug': array([0.9239607]),
 'precision_feature': array([0.88941939]),
 'recall_feature': array([0.89972915]),
 'f1_feature': array([0.89454456]),
 'precision_question': array([0.72115668]),
 'recall_question': array([0.57573616]),
 'f1_question': array([0.64029342]),
 'precision_documentation': array([0.78247347]),
 'recall_documentation': array([0.68410109]),
 'f1_documentation': array([0.72998805]),
 'precision_micro': 0.8897835862844294,
 'recall_micro': 0.8897835862844294,
 'f1_micro': 0.8897835862844294,
 'eval_loss': 0.3331970460109895}