In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

df = pd.read_csv('final_balanced_classification.csv')

# Map your categories to integers
label_dict = {
    'Functional Testing': 0,
    'Non-functional Testing': 1,
    'Structural Testing': 2,
    'Testing related to Changes': 3,
    'Unclassified': 4 
}

# Apply mapping
df['label'] = df['test_debt_type'].apply(lambda x: label_dict[x])

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(df['labels'], df['label'], test_size=0.2)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5) # Replace NUM_LABELS with the actual number of labels in your dataset

# Tokenize the text
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)


KeyError: 'labels'

In [2]:
class TestDebtDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TestDebtDataset(train_encodings, list(train_labels))
val_dataset = TestDebtDataset(val_encodings, list(val_labels))


In [3]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",  # or "steps" and then you need to specify eval_steps
    # eval_steps = 100 if you choose "steps" for evaluation_strategy
)


In [4]:
from datasets import load_metric
import numpy as np

# Load the metrics
accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

  accuracy_metric = load_metric("accuracy")


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Assuming df is your DataFrame after loading and processing it
# And assuming the rest of the preparation is done correctly as per previous steps

class TestDebtDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Make sure to reinitialize your datasets and model with the correct imports

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset= val_dataset,
    compute_metrics=compute_metrics  # Pass the compute_metrics function
)

In [6]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0008,0.007301,0.998646,0.998708,0.998646,0.99866
2,0.0112,0.004765,0.999459,0.99946,0.999459,0.999458
3,0.0001,0.003881,0.999459,0.99946,0.999459,0.999458


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=2772, training_loss=0.05696490597354511, metrics={'train_runtime': 362.4186, 'train_samples_per_second': 122.278, 'train_steps_per_second': 7.649, 'total_flos': 3803276135624184.0, 'train_loss': 0.05696490597354511, 'epoch': 3.0})

In [8]:
results= trainer.evaluate()
print(results)

{'eval_loss': 0.003880843985825777, 'eval_accuracy': 0.9994585814834868, 'eval_precision': 0.9994597189511852, 'eval_recall': 0.9994585814834868, 'eval_f1': 0.9994578108339207, 'eval_runtime': 8.5063, 'eval_samples_per_second': 434.267, 'eval_steps_per_second': 6.818, 'epoch': 3.0}


In [9]:
model.save_pretrained('./my_test_debt_model')
tokenizer.save_pretrained('./my_test_debt_model')

# To use the model for predictions
from transformers import pipeline

classifier = pipeline('text-classification', model='./my_test_debt_model', tokenizer='./my_test_debt_model')
classifier("add tests for systemdrawingcommonthis issue tracks porting some set of tests from monos test suite covering the portions of systemdrawing that we support on net core monos test cases are in this folder httpsgithubcommonomonotreemastermcsclasssystemdrawingtest we most likely want to convert the most useful tests from all of the sections here with the exception of systemdrawingdesign which we arent going to support right now on net core it is mainly related to designer winforms support monos tests use nunit so we will need to convert them to xunit when copying them additionally ive identified that there will need to be some functional changes made to the tests themselves as they do not pass against the net framework implementation we consider the net framework implementation to be the compatibility baseline so we should change the tests to accomodate it rather than the other way around the test failures seemed mainly related to very small subtle differences in things like floatingpoint precision color values offsets etc we should do the following when we have both windows and unix implementations up and running ensure that all tests have enough leniency to accomodate floatingpoint differences and other minor inconsistencies where unavoidable ensure that all minor inconsistencies are acceptable or otherwise very difficult problematic to fix ensure that all minor inconsistencies are listed somewhere so that we can mention it both in the tests and in the user documentation hughbe qmfrederik mareksafar current status code coverage note that there is a large amount of internal and debugonly code which distorts these numbers when the coverage is generally high we can clean out a lot of dead code and then get more accurate data date branch coverage line coverage 6262017 33 25 7112017 49 54 7172017 55 60 822017 58 66 8252017 62 71 9212017 646 753 namespaces and coverage as of 9212017 systemdrawing 763 systemdrawingdrawing2d 938 systemdrawingimaging 861 systemdrawingprinting 628 systemdrawingtext 96")


[{'label': 'LABEL_1', 'score': 0.9996904134750366}]

In [10]:
classifier("'text analytics abstractivesummarizebatchconveniencewithstatisticstest failing in nightly runshttpsdevazurecomazuresdkinternalbuildresultsbuildid2211068viewresults")

[{'label': 'LABEL_1', 'score': 0.9998538494110107}]

In [11]:
classifier("Assess the application's ability to take the lowest possible time to perform key functions.")

[{'label': 'LABEL_0', 'score': 0.9996416568756104}]