In [1]:
import pandas as pd
import torch
import os
import numpy as np
import datasets
import transformers
from GPUtil import showUtilization as gpu_usage
from numba import cuda
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset, Dataset, DatasetDict

In [2]:
# !watch -n 0.5 nvidia-smi

In [3]:
print(f'PyTorch version: {torch.__version__}')  # 1.9.1+cu111
print(f'CUDA version: {torch.version.cuda}')  # 11.1
print(f'cuDNN version: {torch.backends.cudnn.version()}')  # 8005
print(f'Current device: {torch.cuda.current_device()}')  # 0
print(f'Is cuda available: {torch.cuda.is_available()}')  # TRUE

PyTorch version: 2.0.1
CUDA version: 11.8
cuDNN version: 8700
Current device: 0
Is cuda available: True


In [4]:
print(f'Transformers version: {transformers.__version__}')
print(f'Datasets version: {datasets.__version__}')

Transformers version: 4.37.2
Datasets version: 2.14.5


In [5]:
# Prevent a warning related to the tokenization process in the transformers library. 
os.environ["TOKENIZERS_PARALLELISM"] = "False"
# Makes CUDA operations synchronous
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [6]:
# Find the GPU with the least memory usage.
!nvidia-smi

Thu Mar  7 11:20:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:01:00.0 Off |                  N/A |
| 40%   65C    P2             168W / 250W |    697MiB / 11264MiB |     96%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:23:00.0 Off |  

In [7]:
def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    # free unreferenced tensors from the GPU memory.
    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache() 

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 96% |  6% |
|  1 |  0% |  6% |
|  2 |  0% |  6% |
|  3 |  0% |  0% |
|  4 |  0% |  6% |
|  5 |  0% |  0% |
|  6 |  0% |  0% |
|  7 |  0% |  6% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 78% |  6% |
|  1 |  0% |  6% |
|  2 |  0% |  6% |
|  3 |  0% |  1% |
|  4 |  0% |  6% |
|  5 |  0% |  0% |
|  6 |  0% |  0% |
|  7 |  0% |  6% |


In [8]:
data = pd.read_csv("clean_test_or_not_test.csv" , index_col = 0)
data

Unnamed: 0,text_clean,label
0,cover anvil service response in unit testshttp...,0
1,service apprepositorycontroller should be cach...,0
2,many caches persist between test casesan examp...,0
3,improve friendliness of behat text pattern mat...,0
4,run e2e tests for prs is your feature request ...,0
...,...,...
437,need to refractor create addresses functionali...,1
438,decouple asset lambda with messagebus 1246 124...,1
439,remove helm chart from werft publish release j...,1
440,add the ability to create new pages or section...,1


In [9]:
data_test_set = pd.read_csv("testset_test_or_not_test_clean.csv" , index_col = 0)
data_test_set

Unnamed: 0,text_clean,label
0,lintingas a developer i need to follow python ...,1
1,allow use of components within wrapper for uni...,0
2,migrate componentschannelnotificationsmodalcom...,1
3,leverage godot 40 tilemap layersmost of our ti...,1
4,review testing code and remove domain filein h...,1
5,investigate usage of production groupvars aws ...,1
6,create unit tests for resource list and involv...,0
7,too much output from waitformarkersnoticed on ...,0
8,orders stuck at pending payment when using rev...,1
9,improve load time of dashboard home and cluste...,0


In [10]:
# Smaller and faster than bert.
base_model_id = "distilbert-base-uncased"

epochs = 5 #Number of full cyles through the training set.
num_labels = 2 
learning_rate = 5e-5 # Rate the model updates based on the data its trained on.
train_batch_size = 16 # Number of training examples in one iteration.
eval_batch_size = 32 # Number evalutaion examples in on iteratoion.
save_strategy = "no" # Should the model be saved automatically during training.
save_steps = 500 # How often to save the model during training. No effect since no over.
logging_steps = 100
model_dir = "./model" #Where to save model

# Use early stopping to prevent overfitting
#load_best_model_at_end=True
#metric_for_best_model="eval_loss"
#greater_is_better=False

In [11]:
# Split dataframe into three parts: training, validation and testing.
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=42):
    np.random.seed(seed)
    # Shuffle index of dataframe
    perm = np.random.permutation(df.index)
    
    df_length = len(df.index)
    
    # Number of row in training set
    train_end = int(train_percent * df_length)
    # Number of rows in validate set
    validate_end = int(validate_percent * df_length) + train_end
    
    # From start to train end
    train = df.iloc[perm[:train_end]]
    # From train_end to validate_end
    validate = df.iloc[perm[train_end:validate_end]]
    # From validate to the last row in dataframe.
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [12]:
# Drops rows with missing values
data.dropna(inplace=True)

In [13]:
# Resets the index after dropping rows
data.reset_index(inplace=True)
data

Unnamed: 0,index,text_clean,label
0,0,cover anvil service response in unit testshttp...,0
1,1,service apprepositorycontroller should be cach...,0
2,2,many caches persist between test casesan examp...,0
3,3,improve friendliness of behat text pattern mat...,0
4,4,run e2e tests for prs is your feature request ...,0
...,...,...,...
437,437,need to refractor create addresses functionali...,1
438,438,decouple asset lambda with messagebus 1246 124...,1
439,439,remove helm chart from werft publish release j...,1
440,440,add the ability to create new pages or section...,1


In [14]:
#Drops the index col, better for managint the data.
data.drop(columns= ["index"], inplace = True)

In [15]:
data


Unnamed: 0,text_clean,label
0,cover anvil service response in unit testshttp...,0
1,service apprepositorycontroller should be cach...,0
2,many caches persist between test casesan examp...,0
3,improve friendliness of behat text pattern mat...,0
4,run e2e tests for prs is your feature request ...,0
...,...,...
437,need to refractor create addresses functionali...,1
438,decouple asset lambda with messagebus 1246 124...,1
439,remove helm chart from werft publish release j...,1
440,add the ability to create new pages or section...,1


In [16]:
# 80% trainig, 10% validate, 10% test. Seed 42.
# Test 80-10-10 and 70-15-15
train , validate , test = train_validate_test_split(data)


In [17]:
train.set_index("label" , inplace = True)
validate.set_index("label" , inplace = True)
test.set_index("label" , inplace = True)

In [18]:
test

Unnamed: 0_level_0,text_clean
label,Unnamed: 1_level_1
1,create a team on ibm cloudas a maintainer of t...
1,remove support for nodejs 14 until end of apr...
0,flaky test debug a cell from a python file1 in...
1,fix all cstyle casting warningscurrently there...
0,add unit tests for cluster delete with namedes...
0,projectutilswaituntilnobuilderrorsproject time...
0,ranchercomponents new versionrelates to ranche...
1,decouple asset lambda with messagebus 1246 124...
0,flaky e2e test detected communitycarecypresss...
0,test carousel and slideshow widgetsthis issue ...


In [19]:
# Convert from Pandas DataFrame to Hugging Face datasets
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
testds = Dataset.from_pandas(test)

separate_test_set = Dataset.from_pandas(data_test_set)
ds = DatasetDict()

ds["test"] = testds
ds["train"] = tds
ds["validate"] = vds
ds["separate_test_set"] = separate_test_set

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 45
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 353
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 44
    })
    separate_test_set: Dataset({
        features: ['text_clean', 'label', '__index_level_0__'],
        num_rows: 49
    })
})

In [20]:
train_dataset = ds["train"]
valid_dataset = ds["validate"]
test_ds = ds["test"]
separate_test_set_dataset = ds["separate_test_set"]

In [21]:
ds["train"][0]

{'text_clean': 'use jsonspecscgroupparsingjson input in unit testsin httpsgithubcomelasticapmagentdotnetpull1833 jsonspec input files were synced into the apmagentdotnet repository however there are no tests yet which make use of these data the goal of this issue is to add new unit tests which have the json test definitions in cgroupparsingjsonhttpsgithubcomelasticapmagentdotnetblobmaintestelasticapmteststestresourcesjsonspecscgroupparsingjson as their input',
 'label': 1}

In [22]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(base_model_id, num_labels=num_labels)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tokanization

In [24]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [25]:
#Tokenize the dataset to the correct input for the transformer model.
def tokenize(batch):
    return tokenizer(batch["text_clean"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))
test_dataset = test_ds.map(tokenize, batched=True, batch_size=len(test_ds))
separate_test_set_dataset = separate_test_set_dataset.map(tokenize, batched=True, batch_size=len(separate_test_set_dataset))

Map:   0%|          | 0/353 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

In [26]:
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy=save_strategy,
    save_steps=save_steps,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    logging_steps=logging_steps,
)

In [27]:
 trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [28]:
trainer.train() 

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.509341,0.818182,[0.78947368 0.84 ],[0.83333333 0.80769231],[0.75 0.875]
2,No log,0.429102,0.863636,[0.83333333 0.88461538],[0.9375 0.82142857],[0.75 0.95833333]
3,No log,0.451753,0.840909,[0.8 0.86792453],[0.93333333 0.79310345],[0.7 0.95833333]
4,No log,0.489809,0.795455,[0.76923077 0.81632653],[0.78947368 0.8 ],[0.75 0.83333333]
5,0.414600,0.505073,0.772727,[0.75 0.79166667],[0.75 0.79166667],[0.75 0.79166667]


TrainOutput(global_step=115, training_loss=0.38179699234340503, metrics={'train_runtime': 41.9604, 'train_samples_per_second': 42.063, 'train_steps_per_second': 2.741, 'total_flos': 233804958627840.0, 'train_loss': 0.38179699234340503, 'epoch': 5.0})

In [29]:
eval_result = trainer.evaluate(eval_dataset=valid_dataset)

In [30]:
for key, value in sorted(eval_result.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.7727272727272727

eval_f1 = [0.75       0.79166667]

eval_loss = 0.5050725936889648

eval_precision = [0.75       0.79166667]

eval_recall = [0.75       0.79166667]

eval_runtime = 0.3279

eval_samples_per_second = 134.199

eval_steps_per_second = 6.1



## Training loss decreases, valdiation loss increases = Overfitting

In [31]:
# Evaluate test data set
test_results = trainer.evaluate(eval_dataset=test_dataset)

In [32]:
for key, value in sorted(test_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.8666666666666667

eval_f1 = [0.86956522 0.86363636]

eval_loss = 0.39644381403923035

eval_precision = [0.86956522 0.86363636]

eval_recall = [0.86956522 0.86363636]

eval_runtime = 0.3289

eval_samples_per_second = 136.807

eval_steps_per_second = 6.08



# Test set

In [33]:
separate_test_set_results = trainer.evaluate(eval_dataset=separate_test_set_dataset)

In [34]:
for key, value in sorted(separate_test_set_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.8163265306122449

eval_f1 = [0.83636364 0.79069767]

eval_loss = 0.5959644913673401

eval_precision = [0.82142857 0.80952381]

eval_recall = [0.85185185 0.77272727]

eval_runtime = 0.3527

eval_samples_per_second = 138.922

eval_steps_per_second = 5.67



In [35]:
trainer.save_model(model_dir + "_local") 

In [36]:
from transformers import pipeline
    
classifier = pipeline("text-classification", model="./model_local")

In [37]:
classifier.model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [38]:
classifier("this contain bugs regarding testing")

[{'label': 'LABEL_0', 'score': 0.9490121603012085}]

In [39]:
classifier("this contain bugs regarding automtion not testing")

[{'label': 'LABEL_0', 'score': 0.9519818425178528}]

In [40]:
classifier("this bug has super high impact on the project")

[{'label': 'LABEL_1', 'score': 0.5614379048347473}]

In [41]:
df = pd.read_csv('External_clean_test_debt_or_not_test.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,text_clean,Type
0,0,done next time ill bring a lawyer with me sinc...,1
1,1,please include a small change in hbaseserver m...,0
2,2,yeah lets return 0 for the primitive types pat...,0
3,3,merged to master,1
4,4,we were missing property in parent pom and cam...,1


In [42]:
results = []
for index, row in df.iterrows():
    # Apply the classifier to the clean_text column
    prediction = classifier(row['text_clean'])
    predicted_label = prediction[0]['label']
    prediction_confidence = prediction[0]['score']
    
    # Append the result as a tuple
    results.append((row['text_clean'], row['Type'], predicted_label, prediction_confidence))

In [43]:
results_df = pd.DataFrame(results, columns=['clean_text', 'actual_type', 'predicted_type', 'prediction_accuracy'])

# Display the first few rows of the results DataFrame
results_df.head()

Unnamed: 0,clean_text,actual_type,predicted_type,prediction_accuracy
0,done next time ill bring a lawyer with me sinc...,1,LABEL_1,0.727563
1,please include a small change in hbaseserver m...,0,LABEL_0,0.957581
2,yeah lets return 0 for the primitive types pat...,0,LABEL_0,0.944256
3,merged to master,1,LABEL_1,0.676842
4,we were missing property in parent pom and cam...,1,LABEL_1,0.896116


In [44]:
results_df['predicted_type'] = results_df['predicted_type'].apply(lambda x: int(x.split('_')[-1]))
y_true = results_df['actual_type']
y_pred = results_df['predicted_type']

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

# Calculate precision, recall, and F1 score
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.8819320214669052
Precision: 0.8333333333333334
Recall: 0.9302325581395349
F1 Score: 0.8791208791208791


### Delete the dataset with large memory

In [44]:
del valid_dataset

In [45]:
del model

In [46]:
# Free cache
torch.cuda.empty_cache()

In [47]:
!nvidia-smi

Thu Feb 22 13:06:23 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  | 00000000:41:00.0 Off |                  N/A |
| 30%   29C    P8              26W / 350W |      6MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        On  | 00000000:61:00.0 Off |  