In [None]:
!pip install transformers[torch]
!pip install shap
!pip install -q transformers datasets

In [None]:
from datasets import load_dataset, Dataset
import torch
from transformers import AutoTokenizer, pipeline, TextClassificationPipeline, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import shap
import pandas as pd
from sklearn.model_selection import train_test_split
from itertools import combinations
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
from tqdm import tqdm
from sklearn.metrics import classification_report
import re
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
# --- Run configuration -------------------------------------------------------
# Seed used for the train/validation split.
random_state = 42

# Optimisation hyper-parameters shared by both fine-tuning runs.
learning_rate = 2e-5
batch_size = 4
epochs = 1

# Name of the metric (as returned by compute_metrics) used for model selection.
metric_name = "f1"

# --- Checkpoints and their tokenizers ----------------------------------------
model_name = "bert-base-uncased"
model_name2 = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer2 = AutoTokenizer.from_pretrained(model_name2)
# Reuse the end-of-sequence token as the padding token for the GPT-2 tokenizer.
tokenizer2.pad_token = tokenizer2.eos_token

# --- Hardware ----------------------------------------------------------------
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the labelled dataset from the working directory. Later cells read its
# "content" column and one column per entry of `labels`.
df = pd.read_csv(filepath_or_buffer="final-data.csv")

In [None]:
# Hold out 20% of the rows for validation; the split is seeded by `random_state`.
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    random_state=random_state,
)

In [None]:
# Target columns, in the order used for the columns of the label matrix.
labels = [
    "bug_reports",
    "features_request",
    "experience",
]

# Index -> label name, derived from the list position of each label.
id2label = dict(enumerate(labels))

# Label name -> index (the inverse of id2label).
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
def _tokenize_with_labels(examples, tok):
  """Tokenize a batch of texts and attach its multi-label target matrix.

  Shared implementation behind `preprocess_data` and `preprocess_data2`,
  which differ only in the tokenizer they use.

  Args:
    examples: batch mapping column name -> list of values. Must contain the
      "content" column and one column per entry of the global `labels`.
    tok: tokenizer callable used to encode the texts.

  Returns:
    The tokenizer output with an added "labels" entry: a list of
    `len(labels)` floats per example.

  Raises:
    KeyError: if "content" or one of the label columns is missing.
  """
  # take a batch of texts
  texts = examples["content"]
  # encode them; every sequence is padded/truncated to exactly 512 tokens
  encoding = tok(texts, padding="max_length", truncation=True, max_length=512)
  # label matrix of shape (batch_size, num_labels), one column per label
  labels_matrix = np.zeros((len(texts), len(labels)))
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = examples[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding


#Bert
def preprocess_data(examples):
  """Encode a batch with the BERT tokenizer (`tokenizer`) and attach labels."""
  return _tokenize_with_labels(examples, tokenizer)


#Gpt-2
def preprocess_data2(examples):
  """Encode a batch with the GPT-2 tokenizer (`tokenizer2`) and attach labels."""
  return _tokenize_with_labels(examples, tokenizer2)

In [None]:
# Convert the pandas splits into Hugging Face Datasets.
# The names are rebound in place, so guard on the type: re-running this cell
# after the conversion has already happened is then a no-op instead of passing
# a Dataset back into Dataset.from_pandas.
if isinstance(df_train, pd.DataFrame):
    df_train = Dataset.from_pandas(df_train)
if isinstance(df_val, pd.DataFrame):
    df_val = Dataset.from_pandas(df_val)

In [None]:
# Tokenize both splits once per model. The original columns are dropped so only
# the fields returned by the preprocessing function remain.

#Bert
encoded_dataset_train, encoded_dataset_val = (
    split.map(preprocess_data, batched=True, remove_columns=split.column_names)
    for split in (df_train, df_val)
)

#Gpt-2
encoded_dataset_train2, encoded_dataset_val2 = (
    split.map(preprocess_data2, batched=True, remove_columns=split.column_names)
    for split in (df_train, df_val)
)

In [None]:
# Switch every encoded split (BERT first, then GPT-2) to the "torch" format.
for _encoded_split in (
    encoded_dataset_train,
    encoded_dataset_val,
    encoded_dataset_train2,
    encoded_dataset_val2,
):
    _encoded_split.set_format("torch")

In [None]:
# Head configuration shared by both classifiers: one output per entry of
# `labels`, trained as a multi-label problem.
_classifier_kwargs = dict(
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# No seed has been set before this point in the notebook, so any weights that
# from_pretrained has to initialise randomly (the new classification head)
# would differ on every run. Seed torch right before each model is built so the
# starting point is reproducible and independent of cell execution order.
torch.manual_seed(random_state)
model = AutoModelForSequenceClassification.from_pretrained(model_name, **_classifier_kwargs)

torch.manual_seed(random_state)
model2 = AutoModelForSequenceClassification.from_pretrained(model_name2, **_classifier_kwargs)

# Mirror the tokenizer setup: the GPT-2 model pads with its end-of-sequence token.
model2.config.pad_token_id = model2.config.eos_token_id

In [None]:
# Settings shared by both fine-tuning runs. Evaluation and checkpointing both
# happen once per epoch.
_shared_training_kwargs = dict(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    metric_for_best_model=metric_name,
    # Without this flag the best checkpoint (by `metric_name`) is never
    # reloaded, so evaluate()/save_model() would always use the weights of the
    # last epoch.
    load_best_model_at_end=True,
    # Tie the trainer's seed to the notebook-wide seed.
    seed=random_state,
)

# BERT run: checkpoints go to /Model/bert, at most two are kept.
args = TrainingArguments(
    "/Model/bert",
    save_total_limit=2,
    **_shared_training_kwargs,
)

# GPT-2 run: checkpoints go to /Model/gpt2, at most one is kept.
args2 = TrainingArguments(
    "/Model/gpt2",
    save_total_limit=1,
    **_shared_training_kwargs,
)





In [None]:

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, micro-averaged ROC AUC and accuracy.

    Args:
        predictions: raw model outputs (logits) of shape
            (batch_size, num_labels).
        labels: ground-truth 0/1 indicator matrix with the same shape.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with the keys "f1", "roc_auc" and "accuracy".
    """
    # Sigmoid turns each logit into an independent per-label probability.
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    # Hard 0/1 decisions, needed by F1 and accuracy.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    f1_ = f1_score(y_true=y_true, y_pred=y_pred, average='micro', zero_division=0)
    # ROC AUC is a ranking metric: score it on the continuous probabilities.
    # Feeding it the thresholded 0/1 predictions collapses the curve to a
    # single operating point and throws away the ranking information.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    return {"f1": f1_,
            "roc_auc": roc_auc,
            "accuracy": accuracy}

def compute_metrics(p: EvalPrediction):
    """Adapter handing an EvalPrediction to `multi_label_metrics`.

    When `p.predictions` is a tuple only its first element is used as the
    predictions; `p.label_ids` supplies the reference labels.
    """
    raw_outputs = p.predictions
    if isinstance(raw_outputs, tuple):
        raw_outputs = raw_outputs[0]
    return multi_label_metrics(predictions=raw_outputs, labels=p.label_ids)

In [None]:
# Trainer for the BERT classifier: BERT-tokenized splits, the BERT training
# arguments and the shared multi-label metric function.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune BERT, evaluate on the validation split, then persist the model
# (presumably into the output directory configured in `args`).
t = trainer.train()
e = trainer.evaluate()
trainer.save_model()

# Log and save the metrics of each phase: training first, then evaluation.
for _phase, _phase_metrics in (("train", t.metrics), ("eval", e)):
    trainer.log_metrics(_phase, _phase_metrics)
    trainer.save_metrics(_phase, _phase_metrics)

In [None]:
# Trainer for the GPT-2 classifier: GPT-2-tokenized splits, the GPT-2 training
# arguments and the shared multi-label metric function.
trainer2 = Trainer(
    model=model2,
    args=args2,
    train_dataset=encoded_dataset_train2,
    eval_dataset=encoded_dataset_val2,
    tokenizer=tokenizer2,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune GPT-2, evaluate on the validation split, then persist the model
# (presumably into the output directory configured in `args2`).
t2 = trainer2.train()
e2 = trainer2.evaluate()
trainer2.save_model()

# Log and save the metrics of each phase: training first, then evaluation.
for _phase, _phase_metrics in (("train", t2.metrics), ("eval", e2)):
    trainer2.log_metrics(_phase, _phase_metrics)
    trainer2.save_metrics(_phase, _phase_metrics)