In [1]:
# Print the GPU, driver/CUDA versions and current GPU memory usage visible to this kernel.
!nvidia-smi

Thu Nov 17 10:09:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
|  0%   55C    P8    15W / 170W |      0MiB / 12288MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import gensim
import sklearn.metrics
import re
import unicodedata
from tqdm.notebook import tqdm
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import wandb
import csv
from functools import partial
import itertools
import random
import sys
import hashlib

# Training and evaluation CSVs; the paths are relative, so they are resolved
# against the kernel's working directory.
train_file = 'nlbse23-issue-classification-train.csv'
eval_file = 'nlbse23-issue-classification-eval.csv'

# Raise the csv module's per-field size limit. The call returns the previous
# limit, which the notebook echoes as this cell's output.
# NOTE(review): sys.maxsize may not fit a C long on every platform (the call
# would then raise OverflowError) — confirm the target platform.
csv.field_size_limit(sys.maxsize) # to avoid error: _csv.Error: field larger than field limit (131072)

2147483647

In [None]:
def count_tokens(text):
	"""Return how many whitespace-separated tokens ``text`` contains."""
	tokens = text.split()
	return len(tokens)

def count_csv_rows(csv_file):
	"""Count the data rows (header excluded) of a CSV file.

	Args:
		csv_file: Path of the CSV file to read.

	Returns:
		The number of records that follow the header line. A quoted field
		that spans several lines still counts as a single record.
	"""
	with open(csv_file, 'r', newline='') as f:
		# Parse the opened handle. Passing the path string to DictReader makes
		# it iterate over the characters of the path instead of the file.
		return sum(1 for _ in csv.DictReader(f))

def print_csv_preview(filename):
	"""Print the file name and the value of ``count_csv_rows`` for it, then display its first five rows."""
	print(filename)

	row_total = count_csv_rows(filename)
	print("total rows", row_total)

	# Only the first five data rows are parsed for the preview.
	head_df = pd.read_csv(filename, nrows=5)
	display(head_df)

In [3]:
# Download and unpack the training-set archive unless its CSV is already present
# in the working directory, then preview it.
# NOTE(review): these file names differ from `train_file` / `eval_file` used by the
# later cells — confirm which dataset this notebook is meant to work on.
if not os.path.isfile("github-labels-top3-803k-train.csv"):
  !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-train.tar.gz" | tar -xz

print_csv_preview("github-labels-top3-803k-train.csv")

# Same procedure for the test-set archive.
if not os.path.isfile("github-labels-top3-803k-test.csv"):
  !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-test.tar.gz" | tar -xz

print_csv_preview("github-labels-top3-803k-test.csv")


Unnamed: 0.1,Unnamed: 0,issue_url,issue_label,issue_created_at,issue_author_association,repository_url,issue_title,issue_body
0,0,https://api.github.com/repos/eamodio/vscode-gi...,bug,2021-01-02T18:07:30Z,NONE,https://api.github.com/repos/eamodio/vscode-gi...,Welcome screen on every editor window is very ...,I just discovered Gitlens and find the functio...
1,1,https://api.github.com/repos/binwiederhier/pco...,bug,2020-12-31T18:19:31Z,OWNER,https://api.github.com/repos/binwiederhier/pcopy,"""pcopy invite"" and ""pcopy paste abc:"" does not...",
2,2,https://api.github.com/repos/binwiederhier/pco...,bug,2021-01-03T04:33:36Z,OWNER,https://api.github.com/repos/binwiederhier/pcopy,"UI: Modal overlay is half transparent, shouldn...",
3,3,https://api.github.com/repos/Sothatsit/RoyalUr...,enhancement,2020-12-25T00:46:00Z,OWNER,https://api.github.com/repos/Sothatsit/RoyalUr...,Make the loading screen scale with browser win...,Currently the loading wheel is a fixed size in...
4,4,https://api.github.com/repos/Malivil/TTT-Custo...,bug,2021-01-02T21:36:57Z,OWNER,https://api.github.com/repos/Malivil/TTT-Custo...,Spectator - Investigate a way to strip weapons...,To bring magneto stick floating


Unnamed: 0.1,Unnamed: 0,issue_url,issue_label,issue_created_at,issue_author_association,repository_url,issue_title,issue_body
0,6,https://api.github.com/repos/tlnagy/TIFF.jl/is...,enhancement,2020-04-07T09:08:50Z,NONE,https://api.github.com/repos/tlnagy/TIFF.jl,"ERROR: KeyError: key (TIFF.SAMPLEFORMAT_INT, 0...",One more error might need to be caught.\r\n`4D...
1,19,https://api.github.com/repos/tisboyo/Twitch_Bo...,enhancement,2020-11-27T07:17:21Z,OWNER,https://api.github.com/repos/tisboyo/Twitch_Bot,Add database backup to dropbox,
2,25,https://api.github.com/repos/DrWhoCares/imgdan...,enhancement,2021-01-02T19:35:34Z,OWNER,https://api.github.com/repos/DrWhoCares/imgdanke,Add a button/method to open the Source or Outp...,Could also add a method to open up path to eac...
3,30,https://api.github.com/repos/DrWhoCares/imgdan...,bug,2021-01-02T20:55:34Z,OWNER,https://api.github.com/repos/DrWhoCares/imgdanke,Processes are being started twice,At some point I refactored a few things and en...
4,54,https://api.github.com/repos/Bean-1/AOT/issues/3,bug,2020-12-29T15:34:35Z,OWNER,https://api.github.com/repos/Bean-1/AOT,Cannot add hp to wall,


In [3]:
# Call-like snippets such as ``foo.bar(a, b)``: an identifier (dots allowed)
# directly followed by a parenthesised list of simple argument names.
function_sig_regex = re.compile(r'[a-zA-Z][a-zA-Z0-9_.]*\([a-zA-Z0-9_, ]*\)')
# Issue references such as ``#123``.
issue_id_regex = re.compile(r'#[0-9]+')
# Any character outside the 7-bit ASCII range.
non_ascii_char_regex = re.compile(r'[^\x00-\x7f]')
# Characters that are blanked out. The backslash in front of "(" is escaped
# explicitly: a bare backslash-parenthesis is an invalid escape sequence that
# Python warns about, while the resulting string value is the same.
punctuations = '!"$%&\\()*,/:;<=>[\\]^`{|}~+#@-`'
# Maps every listed character to one space, so translating keeps the text length.
punctuations_trans = str.maketrans(punctuations, " " * len(punctuations))

def preprocess(text, max_tokens=None):
  """Turn raw issue text into a plain-ASCII, whitespace-collapsed string.

  Processing order:
    1. A missing value (``None`` or a float NaN) yields ``""``; anything else
       is converted with ``str``.
    2. Function-call-like snippets (``function_sig_regex``) become " function ".
    3. Issue references (``issue_id_regex``) become " issue ".
    4. The text is NFD-decomposed, the characters listed in ``punctuations``
       are replaced by spaces and the remaining non-ASCII characters are
       removed.
    5. Whitespace runs are collapsed with gensim's
       ``strip_multiple_whitespaces``.
    6. If ``max_tokens`` is given, only the first ``max_tokens``
       whitespace-separated tokens are kept, joined by single spaces.

  Currently disabled steps: lowercasing and gensim's strip_tags,
  strip_punctuation, strip_numeric, strip_short, remove_stopwords and
  stem_text.

  Args:
    text: The raw text; non-string values are converted with ``str``.
    max_tokens: Optional upper bound on the number of tokens to keep.

  Returns:
    The cleaned text.
  """
  # A missing value must not end up as the literal word "None" or "nan".
  if text is None or (isinstance(text, float) and text != text):
    return ""

  text = str(text)

  # replace function signatures
  text = function_sig_regex.sub(" function ", text)

  # replace issue ids
  text = issue_id_regex.sub(" issue ", text)

  # Decompose before anything is stripped: an accented letter then keeps its
  # ASCII base letter ("é" -> "e") once the non-ASCII combining mark is
  # removed. Normalising after the non-ASCII removal has nothing left to do.
  text = unicodedata.normalize('NFD', text)

  # remove punctuation
  text = text.translate(punctuations_trans)

  # remove non-ascii characters
  text = non_ascii_char_regex.sub("", text)

  # remove consecutive whitespace characters and convert tabs to spaces
  text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)

  # limit the number of tokens
  if max_tokens is not None:
    text = " ".join(text.split()[:max_tokens])

  return text

In [3]:
# Quantile levels reported for the token-count distributions.
q = [0.5, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99, 0.999]

def sample_csv(file_path, n_sample):
	"""Load a random sample of ``n_sample`` data rows from a CSV file.

	The file is scanned once to count its records, then re-read with pandas
	while skipping a random selection of rows, so the full file is never held
	in memory.

	Args:
		file_path: Path of the CSV file (first record is the header).
		n_sample: Number of data rows to keep. If the file holds no more
			rows than that, all of them are returned.

	Returns:
		A ``pandas.DataFrame`` with the sampled rows in file order.

	Raises:
		ValueError: if ``n_sample`` is negative.
	"""
	# Close the handle deterministically instead of leaking it.
	with open(file_path, newline='') as f:
		n_population = sum(1 for _ in tqdm(csv.reader(f), desc="Sampling CSV")) - 1

	# Nothing has to be dropped when fewer rows exist than were requested.
	n_skip = max(n_population - n_sample, 0)

	# Row 0 is the header; data rows are 1..n_population inclusive, so the
	# last row must be a skip candidate too or it would always be sampled.
	skiprows = random.sample(range(1, n_population + 1), n_skip)
	return pd.read_csv(file_path, skiprows=skiprows)

sample_df = sample_csv(train_file, 10_000)

# Token counts of the preprocessed titles, summarised at the levels in `q`.
title_token_counts = sample_df["title"].apply(preprocess).apply(count_tokens)
display("title token frequency quantiles", title_token_counts.quantile(q=q))

# Token counts of the preprocessed bodies, summarised at the levels in `q`.
body_token_counts = sample_df["body"].apply(preprocess).apply(count_tokens)
display("body token frequency quantiles", body_token_counts.quantile(q=q))

Sampling CSV: 0it [00:00, ?it/s]

'title token frequency quantiles'

0.500     7.0
0.750    10.0
0.800    10.0
0.850    11.0
0.900    13.0
0.950    15.0
0.990    20.0
0.999    29.0
Name: title, dtype: float64

'body token frequency quantiles'

0.500      69.500
0.750     145.000
0.800     171.000
0.850     207.000
0.900     270.000
0.950     413.150
0.990    1212.130
0.999    6090.088
Name: body, dtype: float64

In [5]:
def preprocess_row(row):
  """Build the model input document for one dataset row.

  The document is " title " followed by at most 20 preprocessed title tokens,
  then " body " followed by as many preprocessed body tokens as still fit so
  that the whole document has at most 512 whitespace-separated tokens.

  Args:
    row: Mapping that provides the "title" and "body" fields.

  Returns:
    The assembled document string.
  """
  # An author-association prefix ("author " + row["author_association"].lower())
  # is currently not part of the document.
  title_part = " title " + preprocess(row["title"], max_tokens=20)

  # One slot of the budget is taken by the "body" marker itself.
  body_budget = 511 - count_tokens(title_part)
  doc = title_part + " body " + preprocess(row["body"], max_tokens=body_budget)

  assert count_tokens(doc) <= 512

  return doc

In [6]:
# transform dataset into simpletransformers format
# https://simpletransformers.ai/docs/classification-data-formats/#multi-class-classification

def transform_to_simpletransformers_format(i_path, o_path):
	"""Convert an issue CSV into a two-column ``text,labels`` CSV.

	Every input row is turned into one document by ``preprocess_row`` and its
	textual label is mapped to an integer class id.

	Args:
		i_path: Path of the input CSV. It must provide a "labels" column in
			addition to the fields read by ``preprocess_row``.
		o_path: Path of the CSV to write; an existing file is overwritten.

	Raises:
		KeyError: if a row carries a label that is not in the label map.
	"""
	label_map = {"bug": 0, "feature": 1, "question": 2, "documentation": 3 }

	# The csv module requires files opened with newline='': otherwise line
	# breaks inside quoted fields are altered on read, and on platforms that
	# translate "\n" every written row ends in "\r\r\n".
	with open(i_path, "r", newline="") as i_f, open(o_path, "w", newline="") as o_f:
		reader = csv.DictReader(i_f)
		writer = csv.DictWriter(o_f, fieldnames=["text", "labels"])
		writer.writeheader()
		for row in tqdm(reader, desc="Transform to simpletransformers format"):
			text = preprocess_row(row)
			labels = label_map[row["labels"]]
			writer.writerow({"text": text, "labels": labels})

# Convert both splits; the output names are the paths later handed to `train_model`.
transform_to_simpletransformers_format(i_path=train_file, o_path="train.csv")
transform_to_simpletransformers_format(i_path=eval_file, o_path="test.csv")

Transform to simpletransformers format: 0it [00:00, ?it/s]

Transform to simpletransformers format: 0it [00:00, ?it/s]

In [2]:
%%wandb

# Fine-tune `roberta-base` as a 4-class classifier on train.csv and evaluate it
# on test.csv during training; runs are logged to the wandb project set below.
# (Keep `%%wandb` as the very first line: a cell magic must open the cell.)
model_args = ClassificationArgs()

# Lazy loading, see:
# https://simpletransformers.ai/docs/classification-specifics/#lazy-loading-data
# The delimiter matches the comma-separated files written by
# transform_to_simpletransformers_format.
model_args.lazy_loading = True
model_args.lazy_delimiter = ','

# model_args.learning_rate = 1e-4 # 4e-5
model_args.num_train_epochs = 4 # 1

# Sequence-length / batch-size pairs; only the 128 / 128 pair is active.
# NOTE(review): preprocess_row builds documents of up to 512 whitespace tokens
# while max_seq_length is 128 — presumably the tokenizer truncates the rest
# (the training log warns about truncation); confirm this is intended.
# model_args.max_seq_length = 512
# batch_size = 32

# model_args.max_seq_length = 200
# batch_size = 64

model_args.max_seq_length = 128
batch_size = 128

# model_args.max_seq_length = 64
# batch_size = 256

# The same batch size is used for training and evaluation.
model_args.train_batch_size = batch_size
model_args.eval_batch_size = batch_size

# NOTE(review): presumably turns off intermediate checkpoints — confirm against
# the simpletransformers documentation.
model_args.save_steps = -1
model_args.save_model_every_epoch = False

# miscellaneous
model_args.manual_seed = 2023
model_args.evaluate_during_training = True
model_args.overwrite_output_dir = True
model_args.wandb_project = "NLBSE 2023 Template"

# Extra metrics forwarded to train_model as keyword arguments; each one is an
# sklearn scoring function with micro averaging pre-bound.
metrics = {
  "p_micro": partial(sklearn.metrics.precision_score, average='micro'),
  "r_micro": partial(sklearn.metrics.recall_score, average='micro'),
  "f1_micro": partial(sklearn.metrics.f1_score, average='micro'),
}

# num_labels equals the number of entries in the label map used when the
# training files were written (bug, feature, question, documentation).
model = ClassificationModel(
  'roberta', 
  'roberta-base', 
  args=model_args, 
  num_labels=4
)

# File paths (not DataFrames) are passed for both the training and the evaluation data.
model.train_model(train_df="train.csv", eval_df="test.csv", **metrics)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrafaelkallis[0m. Use [1m`wandb login --relogin`[0m to force relogin


Running Epoch 0 of 4:   0%|          | 0/9968 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_li