In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizerFast
import torch
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import classification_report
import json
import os
import codecs
from transformers import BertTokenizerFast, BertForSequenceClassification, pipeline
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
# To force CPU execution, assign torch.device("cpu") unconditionally instead.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
def load_data(data_directory=None):
    """Load and concatenate every JSON file found in the data directory.

    Each file is expected to contain a JSON list; the lists are concatenated
    in sorted filename order so the resulting record order (and therefore any
    seeded train/test split built on it) is the same on every machine.

    Args:
        data_directory: Directory to read from. Defaults to ``./data``
            relative to the current working directory.

    Returns:
        A single list holding the records of all files.

    Raises:
        FileNotFoundError: If the data directory does not exist.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    if data_directory is None:
        data_directory = os.path.join(os.curdir, "data")

    raw_data = []
    # os.listdir returns entries in arbitrary order, so sort for reproducibility.
    for filename in sorted(os.listdir(data_directory)):
        path = os.path.join(data_directory, filename)
        # Skip hidden entries and sub-directories (e.g. .ipynb_checkpoints,
        # .DS_Store) that tooling drops next to the data files.
        if filename.startswith(".") or not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as fin:
            raw_data += json.load(fin)
    return raw_data

# Read every record from the data directory
raw_data = load_data()

# Reset the 'label' field of each record to an empty string
for record in raw_data:
    record['label'] = ''


In [4]:
all_labels = set()

# Flag each record: True when its 'classified' field is exactly 'BUG', False otherwise
for issue in raw_data:
    issue['is_classified_bug'] = issue['classified'] == 'BUG'

# Turn the list of records into a DataFrame and preview the first rows
df = pd.json_normalize(raw_data)

df.head()

Unnamed: 0,key,summary,description,label,classified,type,is_classified_bug
0,HTTPCLIENT-1177,HttpClient treats URI fragments in redirect UR...,HttpClient treats URI fragments in redirect UR...,,BUG,BUG,True
1,HTTPCLIENT-271,PostMethod#setParameter,[HttpClient2.0-rc1]\n\n-------- code fragment ...,,BUG,BUG,True
2,HTTPCLIENT-511,Preemptive Authorization parameter initializat...,Preemptive authorization is defeated by an inc...,,BUG,BUG,True
3,HTTPCLIENT-312,Update license terms,Copyright 1999-2003 The Apache Software Founda...,,DOCUMENTATION,BUG,False
4,HTTPCLIENT-668,make sure no static loggers are used,"Review all loggers used in the component, make...",,TASK,IMPROVEMENT,False


In [5]:
# Preview the first rows of the derived boolean target column
df.loc[:, 'is_classified_bug'].head()

0     True
1     True
2     True
3    False
4    False
Name: is_classified_bug, dtype: bool

In [6]:
# Encode the classification labels as integer class ids.
# The encoded values are kept in their own Series instead of overwriting
# df['is_classified_bug']: re-running this cell then always fits the encoder on
# the original booleans, so label_encoder.classes_ stays [False, True].
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(df['is_classified_bug']),
    index=df.index,
    name='is_classified_bug',
)  # Labels: encoded flag for "issue is classified as BUG"

# Extract features: combine description and summary into a single text feature.
# Missing values are replaced with '' first, because joining a None/NaN
# description or summary with a string would raise a TypeError.
X = df['description'].fillna('') + ' ' + df['summary'].fillna('')

# Split into training and testing sets (80/20, fixed seed for reproducibility)
train_texts, test_texts, train_labels, test_labels = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Load the fast BERT tokenizer published with the bert-tiny checkpoint
tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-tiny')

# Both splits are tokenized with the same settings: truncation on, padding on,
# and a maximum length of 128
tokenize_options = {'truncation': True, 'padding': True, 'max_length': 128}
train_encodings = tokenizer(train_texts.tolist(), **tokenize_options)
test_encodings = tokenizer(test_texts.tolist(), **tokenize_options)
print(label_encoder.classes_)



[False  True]


In [9]:
# Create a PyTorch dataset
class BugDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with classification labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per sample.
        labels: Per-sample labels. May be a pandas Series, a NumPy array, or
            any plain sequence such as a list or tuple.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Store labels as a plain list so __getitem__ indexes by position.
        # A Series coming out of a shuffled split keeps its original index,
        # so label-based lookup with a positional idx would be wrong.
        # Objects without .tolist() (lists, tuples) are copied with list().
        self.labels = labels.tolist() if hasattr(labels, "tolist") else list(labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

# Wrap each split's encodings and labels in a BugDataset (train first, then test)
train_dataset, test_dataset = (
    BugDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)

In [10]:
# Load the bert-tiny checkpoint with a sequence-classification head that has
# one output per class known to the label encoder
checkpoint_name = 'prajjwal1/bert-tiny'
num_classes = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=num_classes)

# Training hyper-parameters, gathered in one mapping before building TrainingArguments.
# NOTE(review): the recorded run trains for 100 epochs and ends with a training
# loss of ~0.05 but an eval loss of ~2.16, which looks like overfitting;
# consider fewer epochs or early stopping.
training_config = {
    'output_dir': './results',
    'num_train_epochs': 100,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 64,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    'logging_steps': 500,
}
training_args = TrainingArguments(**training_config)

# Build the Trainer; the test split doubles as the evaluation dataset
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Run fine-tuning; the returned training summary is displayed as the cell result
train_output = trainer.train()
train_output

Step,Training Loss
500,0.625
1000,0.415
1500,0.3072
2000,0.2315
2500,0.1854
3000,0.1486
3500,0.1131
4000,0.0916
4500,0.0681
5000,0.0534


TrainOutput(global_step=28000, training_loss=0.05066505065560341, metrics={'train_runtime': 642.2922, 'train_samples_per_second': 696.256, 'train_steps_per_second': 43.594, 'total_flos': 142040451072000.0, 'train_loss': 0.05066505065560341, 'epoch': 100.0})

In [12]:
# Evaluate on the dataset passed to the Trainer as eval_dataset and show the metrics
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 2.156304121017456,
 'eval_runtime': 0.5712,
 'eval_samples_per_second': 1959.147,
 'eval_steps_per_second': 31.514,
 'epoch': 100.0}

In [13]:
# Prepare model for evaluation
model.eval()

# Run batched inference through the Trainer instead of feeding one sample at a
# time. The Trainer handles device placement for both model and inputs, so this
# no longer depends on the notebook-level `device` matching the device the
# model was actually trained on.
prediction_output = trainer.predict(test_dataset)

# The predicted class is the index of the largest logit; a softmax is not
# needed because it does not change which logit is largest.
predictions = np.argmax(prediction_output.predictions, axis=1)
actuals = prediction_output.label_ids

# Transform labels back to original encoding
predicted_labels = label_encoder.inverse_transform(predictions)
actual_labels = label_encoder.inverse_transform(actuals)

# Generate classification report
report = classification_report(actual_labels, predicted_labels)
print(report)

              precision    recall  f1-score   support

       False       0.84      0.84      0.84       736
        True       0.69      0.68      0.69       383

    accuracy                           0.79      1119
   macro avg       0.76      0.76      0.76      1119
weighted avg       0.78      0.79      0.79      1119



In [14]:
# Destination of the saved model. The original location stays the default;
# set the BUGBERT_MODEL_PATH environment variable to save somewhere else
# without editing the notebook.
model_path = os.environ.get(
    "BUGBERT_MODEL_PATH",
    "D:\\AI and ML Masters\\BugBert\\BugBertModel\\model.pth",
)

# Create the target directory up front so a missing folder does not make the
# save fail after the long training run. dirname is '' for a bare file name.
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# NOTE(review): torch.save(model, ...) pickles the entire model object, so
# loading it requires the same class definitions to be importable. Consider
# model.save_pretrained(...) if consumers of this file can be migrated.
torch.save(model, model_path)

In [15]:
# Display the classes the label encoder was fitted on
label_encoder.classes_

array([False,  True])