In [1]:
import pandas as pd
import torch
import os
import numpy as np
import datasets
import transformers
from GPUtil import showUtilization as gpu_usage
from numba import cuda
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset, Dataset, DatasetDict

2024-04-28 00:41:45.624854: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-28 00:41:45.680268: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# !watch -n 0.5 nvidia-smi

In [3]:
print(f'PyTorch version: {torch.__version__}')  # 1.9.1+cu111
print(f'CUDA version: {torch.version.cuda}')  # 11.1
print(f'cuDNN version: {torch.backends.cudnn.version()}')  # 8005
print(f'Current device: {torch.cuda.current_device()}')  # 0
print(f'Is cuda available: {torch.cuda.is_available()}')  # TRUE

PyTorch version: 2.0.1
CUDA version: 11.8
cuDNN version: 8700
Current device: 0
Is cuda available: True


In [4]:
print(f'Transformers version: {transformers.__version__}')
print(f'Datasets version: {datasets.__version__}')

Transformers version: 4.37.2
Datasets version: 2.14.5


In [5]:
# Prevent a warning related to the tokenization process in the transformers library. 
os.environ["TOKENIZERS_PARALLELISM"] = "False"
# Makes CUDA operations synchronous
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [6]:
# Find the GPU with the least memory usage.
!nvidia-smi

Sun Apr 28 00:41:57 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  | 00000000:41:00.0 Off |                  N/A |
| 30%   30C    P8              35W / 350W |   7984MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        On  | 00000000:61:00.0 Off |  

In [7]:
def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    # free unreferenced tensors from the GPU memory.
    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache() 

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% | 32% |
|  1 |  0% |  0% |
|  2 |  0% |  0% |
|  3 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  0% | 32% |
|  1 |  5% |  1% |
|  2 |  0% |  0% |
|  3 |  0% |  0% |


In [62]:
data = pd.read_csv("clean_test_debt_or_not_test_external.csv" , index_col = 0)
data

Unnamed: 0,text_clean,label
0,simple sftptofile integrations with charset co...,0
1,the official docs are now on github as mention...,0
2,migrate java transport client to the new high ...,0
3,make typereference inline anonymous classes co...,0
4,title updated please go ahead,0
...,...,...
550,we dont need the 0rename stuff because distcp ...,1
551,the test hasnt been flaky for some time remove...,1
552,fair enough however that means we are leaving ...,1
553,flakey test,1


In [9]:
data_test_set = pd.read_csv("testset_test_or_not_test_clean.csv" , index_col = 0)
data_test_set

Unnamed: 0,text_clean,label
0,xml related tests are now ignored for composit...,0
1,sound like a plan and you can assign me to it ...,1
2,i have been able to successfully speed up the ...,0
3,testscannersfuzzing currently tests compressed...,0
4,1 lgtm i would also convert the test to junitv...,0
...,...,...
57,yes i added a profile to run only flaky tests ...,0
58,one problem with those is that they are not a ...,0
59,thanks for detailed description ill have a loo...,1
60,the flakeyfinder fingered the above commit as ...,0


In [10]:
# Smaller and faster than bert.
base_model_id = "distilbert-base-uncased"

epochs = 5 #Number of full cyles through the training set.
num_labels = 2 
learning_rate = 5e-5 # Rate the model updates based on the data its trained on.
train_batch_size = 16 # Number of training examples in one iteration.
eval_batch_size = 32 # Number evalutaion examples in on iteratoion.
save_strategy = "no" # Should the model be saved automatically during training.
save_steps = 500 # How often to save the model during training. No effect since no over.
logging_steps = 20
model_dir = "./model" #Where to save model

# Use early stopping to prevent overfitting
#load_best_model_at_end=True
#metric_for_best_model="eval_loss"
#greater_is_better=False

In [11]:
# Split dataframe into three parts: training, validation and testing.
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=42):
    np.random.seed(seed)
    # Shuffle index of dataframe
    perm = np.random.permutation(df.index)
    
    df_length = len(df.index)
    
    # Number of row in training set
    train_end = int(train_percent * df_length)
    # Number of rows in validate set
    validate_end = int(validate_percent * df_length) + train_end
    
    # From start to train end
    train = df.iloc[perm[:train_end]]
    # From train_end to validate_end
    validate = df.iloc[perm[train_end:validate_end]]
    # From validate to the last row in dataframe.
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [12]:
# Drops rows with missing values
data.dropna(inplace=True)

In [13]:
# Resets the index after dropping rows
data.reset_index(inplace=True)
data

Unnamed: 0,index,text_clean,label
0,0,thrift1735 integrates python tutorials into re...,0
1,1,new implementation of zlib compressed transpor...,0
2,2,posted a new patch which is not as radical the...,0
3,3,1 would be good to have a test also,0
4,4,we have authorization tests that are specific ...,0
...,...,...,...
553,553,salesforce adds fields to even after an api ha...,1
554,554,fixed in there were already and but all of the...,1
555,555,intermittent time outs,1
556,556,thanks for the pr,1


In [14]:
#Drops the index col, better for managint the data.
data.drop(columns= ["index"], inplace = True)

In [15]:
data


Unnamed: 0,text_clean,label
0,thrift1735 integrates python tutorials into re...,0
1,new implementation of zlib compressed transpor...,0
2,posted a new patch which is not as radical the...,0
3,1 would be good to have a test also,0
4,we have authorization tests that are specific ...,0
...,...,...
553,salesforce adds fields to even after an api ha...,1
554,fixed in there were already and but all of the...,1
555,intermittent time outs,1
556,thanks for the pr,1


In [16]:
# 80% trainig, 10% validate, 10% test. Seed 42.
# Test 80-10-10 and 70-15-15
train , validate , test = train_validate_test_split(data)


In [17]:
train.set_index("label" , inplace = True)
validate.set_index("label" , inplace = True)
test.set_index("label" , inplace = True)

In [18]:
test

Unnamed: 0_level_0,text_clean
label,Unnamed: 1_level_1
0,1 code looks good is there a way to enhance an...
1,will be fixed in a patch for other issues
0,jake i unfortunately have no clue how to gener...
0,1 ravis proposal i would also add that fullyde...
0,roger to avoid some confusion the attached pat...
1,the package scanning is deprecated and you sho...
0,running more frequently seems to show that it ...
1,we have fine grained details on each endpoint ...
1,raw and child endpoint issue
1,migrate java transport client to the new high ...


In [19]:
# Convert from Pandas DataFrame to Hugging Face datasets
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
testds = Dataset.from_pandas(test)

separate_test_set = Dataset.from_pandas(data_test_set)
ds = DatasetDict()

ds["test"] = testds
ds["train"] = tds
ds["validate"] = vds
ds["separate_test_set"] = separate_test_set

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 57
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 446
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 55
    })
    separate_test_set: Dataset({
        features: ['text_clean', 'label', '__index_level_0__'],
        num_rows: 62
    })
})

In [20]:
train_dataset = ds["train"]
valid_dataset = ds["validate"]
test_ds = ds["test"]
separate_test_set_dataset = ds["separate_test_set"]

In [21]:
ds["train"][0]

{'text_clean': 'this jira applies to trunk as well would you please provide a patch for trunk on the patch for the key names id remove authentication as these properties can be used for different things than authentication testcase is missing',
 'label': 0}

In [22]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(base_model_id, num_labels=num_labels)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tokanization

In [24]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [25]:
#Tokenize the dataset to the correct input for the transformer model.
def tokenize(batch):
    return tokenizer(batch["text_clean"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))
test_dataset = test_ds.map(tokenize, batched=True, batch_size=len(test_ds))
separate_test_set_dataset = separate_test_set_dataset.map(tokenize, batched=True, batch_size=len(separate_test_set_dataset))

Map:   0%|          | 0/446 [00:00<?, ? examples/s]

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

In [28]:
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy=save_strategy,
    save_steps=save_steps,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    logging_steps=logging_steps,
)

In [29]:
 trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [30]:
trainer.train() 

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5816,0.205585,0.927273,[0.92857143 0.92592593],[0.96296296 0.89285714],[0.89655172 0.96153846]
2,0.2027,0.098219,0.981818,[0.98305085 0.98039216],[0.96666667 1. ],[1. 0.96153846]
3,0.1133,0.066145,0.963636,[0.96551724 0.96153846],[0.96551724 0.96153846],[0.96551724 0.96153846]
4,0.0871,0.063598,0.981818,[0.98305085 0.98039216],[0.96666667 1. ],[1. 0.96153846]
5,0.0535,0.053812,0.981818,[0.98305085 0.98039216],[0.96666667 1. ],[1. 0.96153846]


Trainer is attempting to log a value of "[0.92857143 0.92592593]" of type <class 'numpy.ndarray'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.96296296 0.89285714]" of type <class 'numpy.ndarray'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.89655172 0.96153846]" of type <class 'numpy.ndarray'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.98305085 0.98039216]" of type <class 'numpy.ndarray'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.96666667 1.        ]" of type <class 'numpy.n

TrainOutput(global_step=140, training_loss=0.19131566030638558, metrics={'train_runtime': 29.1163, 'train_samples_per_second': 76.589, 'train_steps_per_second': 4.808, 'total_flos': 295402299002880.0, 'train_loss': 0.19131566030638558, 'epoch': 5.0})

In [31]:
eval_result = trainer.evaluate(eval_dataset=valid_dataset)

Trainer is attempting to log a value of "[0.98305085 0.98039216]" of type <class 'numpy.ndarray'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.96666667 1.        ]" of type <class 'numpy.ndarray'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[1.         0.96153846]" of type <class 'numpy.ndarray'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


In [32]:
for key, value in sorted(eval_result.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.9818181818181818

eval_f1 = [0.98305085 0.98039216]

eval_loss = 0.05381222069263458

eval_precision = [0.96666667 1.        ]

eval_recall = [1.         0.96153846]

eval_runtime = 0.2353

eval_samples_per_second = 233.759

eval_steps_per_second = 8.5



## Training loss decreases, valdiation loss increases = Overfitting

In [33]:
# Evaluate test data set
test_results = trainer.evaluate(eval_dataset=test_dataset)

Trainer is attempting to log a value of "[1. 1.]" of type <class 'numpy.ndarray'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[1. 1.]" of type <class 'numpy.ndarray'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[1. 1.]" of type <class 'numpy.ndarray'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


In [34]:
for key, value in sorted(test_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 1.0

eval_f1 = [1. 1.]

eval_loss = 0.010014262050390244

eval_precision = [1. 1.]

eval_recall = [1. 1.]

eval_runtime = 0.2426

eval_samples_per_second = 234.91

eval_steps_per_second = 8.242



# Test set

In [35]:
separate_test_set_results = trainer.evaluate(eval_dataset=separate_test_set_dataset)

Trainer is attempting to log a value of "[0.97222222 0.96153846]" of type <class 'numpy.ndarray'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.94594595 1.        ]" of type <class 'numpy.ndarray'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[1.         0.92592593]" of type <class 'numpy.ndarray'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


In [36]:
for key, value in sorted(separate_test_set_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.967741935483871

eval_f1 = [0.97222222 0.96153846]

eval_loss = 0.0909014567732811

eval_precision = [0.94594595 1.        ]

eval_recall = [1.         0.92592593]

eval_runtime = 0.268

eval_samples_per_second = 231.33

eval_steps_per_second = 7.462



In [37]:
trainer.save_model(model_dir + "_local") 

In [48]:
from transformers import pipeline
    
classifier = pipeline("text-classification", model="./model_local", truncation=True)


In [49]:
classifier.model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [50]:
classifier("this contain bugs regarding testing")

[{'label': 'LABEL_0', 'score': 0.9676269888877869}]

In [51]:
classifier("this contain bugs regarding automtion not testing")

[{'label': 'LABEL_0', 'score': 0.9849045276641846}]

In [52]:
classifier("this bug has super high impact on the project")

[{'label': 'LABEL_1', 'score': 0.9777336716651917}]

In [55]:
import pandas as pd
df = pd.read_csv('clean_test_or_not_test_debt.csv',index_col = 0)
df.head()

Unnamed: 0,text_clean,label
0,make sure integration tests are running with i...,1
1,cover anvil service response in unit testshttp...,1
2,service apprepositorycontroller should be cach...,1
3,many caches persist between test casesan examp...,1
4,improve friendliness of behat text pattern mat...,1


In [56]:
results = []
for index, row in df.iterrows():
    # Encode the text, truncate to the max length, and convert to the format expected by the classifier
    inputs = tokenizer.encode(row['text_clean'], return_tensors="pt", max_length=512, truncation=True)
    
    # Since the pipeline expects a string, we decode the tokens to get a truncated string representation
    truncated_text = tokenizer.decode(inputs[0])
    
    prediction = classifier(truncated_text)
    predicted_label = prediction[0]['label']
    prediction_confidence = prediction[0]['score']
    
    # Append the result as a tuple
    results.append((row['text_clean'], row['label'], predicted_label, prediction_confidence))

In [57]:
results_df = pd.DataFrame(results, columns=['clean_text', 'actual_type', 'predicted_type', 'prediction_accuracy'])
# Display the first few rows of the results DataFrame
results_df.head(-5)

Unnamed: 0,clean_text,actual_type,predicted_type,prediction_accuracy
0,make sure integration tests are running with i...,1,LABEL_0,0.987613
1,cover anvil service response in unit testshttp...,1,LABEL_0,0.961769
2,service apprepositorycontroller should be cach...,1,LABEL_1,0.994962
3,many caches persist between test casesan examp...,1,LABEL_0,0.959566
4,improve friendliness of behat text pattern mat...,1,LABEL_0,0.985052
...,...,...,...,...
480,investigate timeout circleci failed builds sum...,0,LABEL_0,0.982028
481,investigate moving off of direct googleprotobu...,0,LABEL_1,0.995170
482,replace ansiterm with consolewe are currently ...,0,LABEL_1,0.996914
483,update key naming strategies to be in line wit...,0,LABEL_1,0.997013


In [58]:
def convert_predicted_label_to_numeric(label_str):
    # Extract the numeric part from the label string and convert it to an integer
    return int(label_str.split('_')[-1])

# Apply the conversion function to the 'predicted_type' column
results_df['predicted_type_numeric'] = results_df['predicted_type'].apply(convert_predicted_label_to_numeric)
results_df.head(-5)

Unnamed: 0,clean_text,actual_type,predicted_type,prediction_accuracy,predicted_type_numeric
0,make sure integration tests are running with i...,1,LABEL_0,0.987613,0
1,cover anvil service response in unit testshttp...,1,LABEL_0,0.961769,0
2,service apprepositorycontroller should be cach...,1,LABEL_1,0.994962,1
3,many caches persist between test casesan examp...,1,LABEL_0,0.959566,0
4,improve friendliness of behat text pattern mat...,1,LABEL_0,0.985052,0
...,...,...,...,...,...
480,investigate timeout circleci failed builds sum...,0,LABEL_0,0.982028,0
481,investigate moving off of direct googleprotobu...,0,LABEL_1,0.995170,1
482,replace ansiterm with consolewe are currently ...,0,LABEL_1,0.996914,1
483,update key naming strategies to be in line wit...,0,LABEL_1,0.997013,1


In [59]:
from sklearn.metrics import accuracy_score, f1_score

# Assuming 'actual_type' is already in the correct numeric format
accuracy = accuracy_score(results_df['actual_type'], results_df['predicted_type_numeric'])
f1 = f1_score(results_df['actual_type'], results_df['predicted_type_numeric'], average='binary')  # Adjust as needed

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")


Accuracy: 0.2163265306122449
F1 Score: 0.16521739130434784


In [61]:
results_df.to_csv('TestNotTestRes_bert.csv', index=False)

### Delete the dataset with large memory

In [42]:
del valid_dataset

In [43]:
del model

In [44]:
# Free cache
torch.cuda.empty_cache()

In [45]:
!nvidia-smi

Wed Jan  3 13:44:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:01:00.0 Off |                  N/A |
| 22%   34C    P8              18W / 250W |   3949MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:24:00.0 Off |  