In [1]:
!nvidia-smi

Thu Nov 17 21:17:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
|  0%   54C    P8    14W / 170W |      0MiB / 12288MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import gensim
import sklearn.metrics
import re
import unicodedata
from tqdm.notebook import tqdm
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import wandb
import csv
from functools import partial
import itertools
import random
import sys
import hashlib

train_file = 'nlbse23-issue-classification-train.csv'
test_file = 'nlbse23-issue-classification-test.csv'

csv.field_size_limit(sys.maxsize) # to avoid error: _csv.Error: field larger than field limit (131072)

def count_tokens(text):
	return len(text.split())

def count_csv_rows(csv_file):
	with open(csv_file, 'r', newline='', encoding='utf-8') as f:
		return sum(1 for _ in csv.DictReader(f))

def print_csv_preview(filename):
	print(filename)
	print("total rows", count_csv_rows(filename))
	display(pd.read_csv(filename, nrows=5))

In [11]:
# download the training set if it does not exist
if not os.path.isfile(train_file):
  !curl "https://tickettagger.blob.core.windows.net/datasets/{train_file}.tar.gz" | tar -xz

print_csv_preview(train_file)

if not os.path.isfile(test_file):
  !curl "https://tickettagger.blob.core.windows.net/datasets/{test_file}.tar.gz" | tar -xz

print_csv_preview(test_file)

nlbse23-issue-classification-train.csv
total rows 1275881


Unnamed: 0,id,labels,title,body,author_association
0,1199051804,documentation,setting a logging Handler name,BPO | [43058](https://bugs.python.org/issue430...,MANNEQUIN
1,1199074324,documentation,Improve documentation for typing._GenericAlias,BPO | [46589](https://bugs.python.org/issue465...,MANNEQUIN
2,1199022454,documentation,Description of '\w' behavior is vague in `re` ...,BPO | [38566](https://bugs.python.org/issue385...,MANNEQUIN
3,1199028356,documentation,add docstrings to functions in pdb module,BPO | [39278](https://bugs.python.org/issue392...,MANNEQUIN
4,1199055394,documentation,Documentation needs to declare CalledProcessEr...,BPO | [43635](https://bugs.python.org/issue436...,MANNEQUIN


nlbse23-issue-classification-test.csv
total rows 142320


Unnamed: 0,id,labels,title,body,author_association
0,1199053386,documentation,A possible misleading expression in the Virtua...,BPO | [43319](https://bugs.python.org/issue433...,MANNEQUIN
1,1255069635,bug,[BUG] a valid `gameName` in the `create a new ...,**Describe the bug**\r\nIn the `create a new p...,NONE
2,1089772715,feature,How to check if a certain entity still exists?,During a bug in my own code I noticed that the...,NONE
3,1000928729,feature,chose the timezone in dbeaver option,"Dbeaver 21.2.0\r\n\r\nFor all version DBeaver,...",NONE
4,1300011093,bug,[Issue]: Multiple Versions of a Movie not work...,### Please describe your bug\n\nThe doc at htt...,NONE


In [12]:
function_sig_regex = re.compile(r'[a-zA-Z][a-zA-Z0-9_.]*\([a-zA-Z0-9_, ]*\)')
issue_id_regex = re.compile(r'#[0-9]+')
non_ascii_char_regex = re.compile(r'[^\x00-\x7f]')
punctuations = '!"$%&\()*,/:;<=>[\\]^`{|}~+#@-`'
punctuations_trans = str.maketrans(punctuations, " " * len(punctuations))

def preprocess(text, max_tokens=None):
  text = str(text)

  # replace function signatures
  text = function_sig_regex.sub(" function ", text)

  # replace issue ids
  text = issue_id_regex.sub(" issue ", text)
  
  # remove html tags
  # text = gensim.parsing.preprocessing.strip_tags(text)
  
  # remove punctuation
  text = text.translate(punctuations_trans)
  
  # remove numerics
  # text = gensim.parsing.preprocessing.strip_numeric(text)
  
  # remove non-ascii characters
  text = non_ascii_char_regex.sub("", text)
  
  text = unicodedata.normalize('NFD', text)
  
  # remove consecutive whitespace characters and convert tabs to spaces
  text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
  
  # limit the number of tokens
  if max_tokens is not None:
    text = " ".join(text.split()[:max_tokens])
  
  return text

def preprocess_row(row):
  doc = "title " + preprocess(row["title"], max_tokens=20)
  doc += " body " + preprocess(row["body"], max_tokens=511-count_tokens(doc))

  assert count_tokens(doc) <= 512

  return doc

In [13]:
# transform dataset into simpletransformers format
# https://simpletransformers.ai/docs/classification-data-formats/#multi-class-classification

def transform_to_simpletransformers_format(i_path, o_path):
	label_map = {"bug": 0, "feature": 1, "question": 2, "documentation": 3 }

	with open(i_path, "r") as i_f, open(o_path, "w") as o_f:
		reader = csv.DictReader(i_f)
		writer = csv.DictWriter(o_f, fieldnames=["text", "labels"])
		writer.writeheader()
		total = count_csv_rows(i_path)
		for row in tqdm(reader, desc="Transform to simpletransformers format", total=total):
			text = preprocess_row(row)
			labels = label_map[row["labels"]]
			writer.writerow({"text": text, "labels": labels})

transform_to_simpletransformers_format(train_file, "train.csv")
transform_to_simpletransformers_format(test_file, "test.csv")

print_csv_preview("train.csv")
print_csv_preview("test.csv")

Transform to simpletransformers format:   0%|          | 0/1275881 [00:00<?, ?it/s]

Transform to simpletransformers format:   0%|          | 0/142320 [00:00<?, ?it/s]

train.csv
total rows 1275881


Unnamed: 0,text,labels
0,title setting a logging Handler name body BPO ...,3
1,title Improve documentation for typing._Generi...,3
2,title Description of ' w' behavior is vague in...,3
3,title add docstrings to functions in pdb modul...,3
4,title Documentation needs to declare CalledPro...,3


test.csv
total rows 142320


Unnamed: 0,text,labels
0,title A possible misleading expression in the ...,3
1,title BUG a valid gameName in the create a new...,0
2,title How to check if a certain entity still e...,1
3,title chose the timezone in dbeaver option bod...,1
4,title Issue Multiple Versions of a Movie not w...,0


In [4]:

def model_args():
  args = ClassificationArgs()

  # https://simpletransformers.ai/docs/classification-specifics/#lazy-loading-data
  args.lazy_loading = True
  args.lazy_delimiter = ','

  # args.learning_rate = 1e-4 # 4e-5
  args.num_train_epochs = 1 # 1

  args.max_seq_length = 128
  batch_size = 128

  args.train_batch_size = batch_size
  args.eval_batch_size = batch_size

  args.save_steps = -1
  args.save_model_every_epoch = False

  args.manual_seed = 0
  args.evaluate_during_training = True
  args.overwrite_output_dir = True

  # https://simpletransformers.ai/docs/tips-and-tricks/#visualization-support
  args.wandb_project = "NLBSE 2023 Template"

  return args

metrics = {
  "p_micro": partial(sklearn.metrics.precision_score, average='micro'),
  "r_micro": partial(sklearn.metrics.recall_score, average='micro'),
  "f1_micro": partial(sklearn.metrics.f1_score, average='micro'),
}

In [5]:

model = ClassificationModel(
  'roberta', 
  'roberta-base', 
  args=model_args(), 
  num_labels=4
)

model.train_model(train_df="train.csv", eval_df="test.csv", **metrics)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrafaelkallis[0m. Use [1m`wandb login --relogin`[0m to force relogin


Running Epoch 0 of 1:   0%|          | 0/9968 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(9968,
 defaultdict(list,
             {'global_step': [2000, 4000, 6000, 8000, 9968],
              'train_loss': [0.366774320602417,
               0.3608933687210083,
               0.43930262327194214,
               0.43272626399993896,
               0.35443010926246643],
              'mcc': [0.7710794519038561,
               0.7857530116167598,
               0.7906818308836988,
               0.7971106092194528,
               0.7994973755405717],
              'p_micro': [0.8695053400786958,
               0.8772484541877459,
               0.8794477234401349,
               0.883558178752108,
               0.8849845418774592],
              'r_micro': [0.8695053400786958,
               0.8772484541877459,
               0.8794477234401349,
               0.883558178752108,
               0.8849845418774592],
              'f1_micro': [0.8695053400786958,
               0.8772484541877459,
               0.879447723440135,
               0.8835581787521081,
               

In [6]:
# model = model = ClassificationModel(
# 	"roberta",
# 	"outputs/best_model",
#   args=model_args(), 
#   num_labels=4,
# )

results, model_outputs, wrong_pred = model.eval_model(eval_df="test.csv", **metrics)
results

Running Evaluation:   0%|          | 0/1112 [00:00<?, ?it/s]



VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training loss,▆█▅▄▃▅▄▂▄▄▃▂▂▅▄▅▃▃▃▅▃▆▆▄▃▃▄▃▃▄▂▃▃▄▃▁▄▃▂▂
eval_loss,█▄▃▂▁
f1_micro,▁▅▅▇█
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▂▄███▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
mcc,▁▅▆▇█
p_micro,▁▅▅▇█
r_micro,▁▅▅▇█
train_loss,▂▂█▇▁

0,1
Training loss,0.21984
eval_loss,0.34338
f1_micro,0.88498
global_step,9968.0
lr,0.0
mcc,0.7995
p_micro,0.88498
r_micro,0.88498
train_loss,0.35443


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669389432839427, max=1.0…



{'mcc': 0.7994973755405717,
 'p_micro': 0.8849845418774592,
 'r_micro': 0.8849845418774592,
 'f1_micro': 0.8849845418774593,
 'eval_loss': 0.3433843389582291}