In [1]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import gensim
import sklearn.metrics
import re
import unicodedata
from tqdm.notebook import tqdm
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import wandb
import csv
from functools import partial
import logging

tqdm.pandas()

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [10]:
# download the training set if it does not exist
if not os.path.isfile("github-labels-top3-803k-train.csv"):
  !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-train.tar.gz" | tar -xz

train_df = pd.read_csv("github-labels-top3-803k-train.csv")
display(train_df.head(5))

if not os.path.isfile("github-labels-top3-803k-test.csv"):
  !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-test.tar.gz" | tar -xz

test_df = pd.read_csv("github-labels-top3-803k-test.csv")
display(test_df.head(5))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.security.selinux'
100  236M  100  236M    0     0  2767k      0  0:01:27  0:01:27 --:--:-- 2676k   0     0  2770k      0  0:01:27  0:00:27  0:01:00 2864k180M    0     0  2761k      0  0:01:27  0:01:06  0:00:21 2721k


Unnamed: 0.1,Unnamed: 0,issue_url,issue_label,issue_created_at,issue_author_association,repository_url,issue_title,issue_body
0,0,https://api.github.com/repos/eamodio/vscode-gi...,bug,2021-01-02T18:07:30Z,NONE,https://api.github.com/repos/eamodio/vscode-gi...,Welcome screen on every editor window is very ...,I just discovered Gitlens and find the functio...
1,1,https://api.github.com/repos/binwiederhier/pco...,bug,2020-12-31T18:19:31Z,OWNER,https://api.github.com/repos/binwiederhier/pcopy,"""pcopy invite"" and ""pcopy paste abc:"" does not...",
2,2,https://api.github.com/repos/binwiederhier/pco...,bug,2021-01-03T04:33:36Z,OWNER,https://api.github.com/repos/binwiederhier/pcopy,"UI: Modal overlay is half transparent, shouldn...",
3,3,https://api.github.com/repos/Sothatsit/RoyalUr...,enhancement,2020-12-25T00:46:00Z,OWNER,https://api.github.com/repos/Sothatsit/RoyalUr...,Make the loading screen scale with browser win...,Currently the loading wheel is a fixed size in...
4,4,https://api.github.com/repos/Malivil/TTT-Custo...,bug,2021-01-02T21:36:57Z,OWNER,https://api.github.com/repos/Malivil/TTT-Custo...,Spectator - Investigate a way to strip weapons...,To bring magneto stick floating


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.security.selinux'
100 27.2M  100 27.2M    0     0  2154k      0  0:00:12  0:00:12 --:--:-- 2137k


Unnamed: 0.1,Unnamed: 0,issue_url,issue_label,issue_created_at,issue_author_association,repository_url,issue_title,issue_body
0,6,https://api.github.com/repos/tlnagy/TIFF.jl/is...,enhancement,2020-04-07T09:08:50Z,NONE,https://api.github.com/repos/tlnagy/TIFF.jl,"ERROR: KeyError: key (TIFF.SAMPLEFORMAT_INT, 0...",One more error might need to be caught.\r\n`4D...
1,19,https://api.github.com/repos/tisboyo/Twitch_Bo...,enhancement,2020-11-27T07:17:21Z,OWNER,https://api.github.com/repos/tisboyo/Twitch_Bot,Add database backup to dropbox,
2,25,https://api.github.com/repos/DrWhoCares/imgdan...,enhancement,2021-01-02T19:35:34Z,OWNER,https://api.github.com/repos/DrWhoCares/imgdanke,Add a button/method to open the Source or Outp...,Could also add a method to open up path to eac...
3,30,https://api.github.com/repos/DrWhoCares/imgdan...,bug,2021-01-02T20:55:34Z,OWNER,https://api.github.com/repos/DrWhoCares/imgdanke,Processes are being started twice,At some point I refactored a few things and en...
4,54,https://api.github.com/repos/Bean-1/AOT/issues/3,bug,2020-12-29T15:34:35Z,OWNER,https://api.github.com/repos/Bean-1/AOT,Cannot add hp to wall,


In [2]:
function_sig_regex = re.compile(r'[a-zA-Z][a-zA-Z0-9_.]*\([a-zA-Z0-9_, ]*\)')
issue_id_regex = re.compile(r'#[0-9]+')
non_ascii_char_regex = re.compile(r'[^\x00-\x7f]')

def count_tokens(text):
	return text.count(" ") + 1

def preprocess(text, max_tokens=None):
  text = str(text)

  # lowercase
  text = text.lower()

  # replace function signatures
  text = function_sig_regex.sub(" function ", text)

  # replace issue ids
  text = issue_id_regex.sub(" issue ", text)
  
  # remove html tags
  text = gensim.parsing.preprocessing.strip_tags(text)
  
  # remove punctuation
  text = gensim.parsing.preprocessing.strip_punctuation(text)
  
  # remove numerics
  text = gensim.parsing.preprocessing.strip_numeric(text)
  
  # remove non-ascii characters
  text = non_ascii_char_regex.sub("", text)
  
  text = unicodedata.normalize('NFD', text)
  
  # remove consecutive whitespace characters and convert tabs to spaces
  text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
  
  text = gensim.parsing.preprocessing.strip_short(text, minsize=3)
  
  text = gensim.parsing.preprocessing.remove_stopwords(text)
  
  # text = gensim.parsing.preprocessing.stem_text(text)
  
  # limit the number of tokens
  if max_tokens is not None:
    text = " ".join(text.split()[:max_tokens])
  
  return text

In [12]:
q=[.5, .75, .8, .85, .9, .95, .99, .999]

display(
	"title token frequency quantiles", 
	train_df["issue_title"].sample(10_000).progress_apply(preprocess).apply(count_tokens).quantile(q=q)
)
display(
	"body token frequency quantiles", 
	train_df["issue_body"].sample(10_000).progress_apply(preprocess).apply(count_tokens).quantile(q=q)
)

  0%|          | 0/10000 [00:00<?, ?it/s]

'title token frequency quantiles'

0.500     4.000
0.750     6.000
0.800     6.000
0.850     7.000
0.900     8.000
0.950     9.000
0.990    11.000
0.999    17.001
Name: issue_title, dtype: float64

  0%|          | 0/10000 [00:00<?, ?it/s]

'body token frequency quantiles'

0.500      25.000
0.750      60.000
0.800      74.000
0.850      95.000
0.900     131.000
0.950     226.000
0.990     701.120
0.999    2831.763
Name: issue_body, dtype: float64

In [16]:
def preprocess_row(row):
  doc = "author " + row["issue_author_association"].lower()
  doc += " title " + preprocess(row["issue_title"], max_tokens=18) # 99.9% of titles have <= 18 tokens
  doc += " body " + preprocess(row["issue_body"], max_tokens=511-count_tokens(doc))

  assert count_tokens(doc) <= 512

  return doc

In [17]:
# transform dataset into simpletransformers format
# https://simpletransformers.ai/docs/classification-data-formats/#multi-class-classification

# train_df["text"] = train_df.progress_apply(preprocess_row, axis=1)
# train_df["labels"] = pd.Categorical(train_df["issue_label"]).codes
# display(train_df[["text", "labels"]].head(5))
# train_df[["text", "labels"]].to_csv("train.csv", index=False)

# test_df["text"] = test_df.progress_apply(preprocess_row, axis=1)
# test_df["labels"] = pd.Categorical(test_df["issue_label"]).codes
# display(test_df[["text", "labels"]].head(5))
# test_df[["text", "labels"]].to_csv("test.csv", index=False)

def transform_to_simpletransformers_format(i_path, o_path):
	label_map = {"bug": 0, "enhancement": 1, "question": 2}
	n_lines = sum(1 for _ in open(i_path, "r"))

	with open(i_path, "r") as i_f, open(o_path, "w") as o_f:
		reader = csv.DictReader(i_f)
		writer = csv.DictWriter(o_f, fieldnames=["text", "labels"])
		writer.writeheader()
		for row in tqdm(reader, desc="Transform to simpletransformers format", total=n_lines):
			text = preprocess_row(row)
			labels = label_map[row["issue_label"]]
			writer.writerow({"text": text, "labels": labels})

transform_to_simpletransformers_format("github-labels-top3-803k-train.csv", "train.csv")
transform_to_simpletransformers_format("github-labels-top3-803k-test.csv", "test.csv")

Transform to simpletransformers format:   0%|          | 0/13102655 [00:00<?, ?it/s]

Transform to simpletransformers format:   0%|          | 0/1454863 [00:00<?, ?it/s]

In [3]:
model_args = ClassificationArgs()

# https://simpletransformers.ai/docs/classification-specifics/#lazy-loading-data
model_args.lazy_loading = True
model_args.lazy_delimiter = ','

# 
# https://github.com/ThilinaRajapakse/simpletransformers/issues/225
# model_args.use_multiprocessing = False
# model_args.use_multiprocessing_for_evaluation = False
# model_args.multiprocessing_chunksize = 1
# model_args.dataloader_num_workers = 1

# model_args.learning_rate = 3e-5 # 4e-5
model_args.num_train_epochs = 4 # 1

# ~20 for title, ~100 for body
model_args.max_seq_length = 128

# batch_size = 32 # 5.2GB VRAM
# batch_size = 64 # 7.3GB VRAM
# batch_size = 96 # 9.4GB VRAM
batch_size = 128 # 12GB VRAM
model_args.train_batch_size = batch_size
model_args.eval_batch_size = batch_size

model_args.save_steps = -1
# model_args.save_model_every_epoch = False

# miscallenous
model_args.manual_seed = 0
model_args.evaluate_during_training = True
model_args.overwrite_output_dir = True
model_args.wandb_project = "NLBSE 2023 Template"

model = ClassificationModel(
  'roberta', 
  'roberta-base', 
  args=model_args, 
  num_labels=3
)

metrics = {
  "p_micro": partial(sklearn.metrics.precision_score, average='micro'),
  "r_micro": partial(sklearn.metrics.recall_score, average='micro'),
  "f1_micro": partial(sklearn.metrics.f1_score, average='micro'),
}

# model.train_model(train_df=train_df, eval_df=test_df, **metrics)
model.train_model(train_df="train.csv", eval_df="test.csv", **metrics)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
INFO:simpletransformers.classification.classification_model: Initializing WandB run for training.
[34m[1mwandb[0m: Currently logged in as: [33mrafaelkallis[0m. Use [1m`wandb login --relogin`[0m to force relogin


Running Epoch 0 of 4:   0%|          | 0/5648 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
