In [1]:
import pandas as pd
import torch
import os
import numpy as np
import datasets
import transformers
from GPUtil import showUtilization as gpu_usage
from numba import cuda
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset, Dataset, DatasetDict

In [2]:
# !watch -n 0.5 nvidia-smi

In [3]:
print(f'PyTorch version: {torch.__version__}')  # 1.9.1+cu111
print(f'CUDA version: {torch.version.cuda}')  # 11.1
print(f'cuDNN version: {torch.backends.cudnn.version()}')  # 8005
print(f'Current device: {torch.cuda.current_device()}')  # 0
print(f'Is cuda available: {torch.cuda.is_available()}')  # TRUE

PyTorch version: 2.0.1
CUDA version: 11.8
cuDNN version: 8700
Current device: 0
Is cuda available: True


In [4]:
print(f'Transformers version: {transformers.__version__}')
print(f'Datasets version: {datasets.__version__}')

Transformers version: 4.37.2
Datasets version: 2.14.5


In [5]:
# Prevent a warning related to the tokenization process in the transformers library. 
os.environ["TOKENIZERS_PARALLELISM"] = "False"
# Makes CUDA operations synchronous
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [6]:
# Find the GPU with the least memory usage.
!nvidia-smi

Sun Mar 10 17:57:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:01:00.0 Off |                  N/A |
| 36%   60C    P2             171W / 250W |    675MiB / 11264MiB |     52%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:23:00.0 Off |  

In [7]:
def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    # free unreferenced tensors from the GPU memory.
    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache() 

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 39% |  6% |
|  1 |  0% |  6% |
|  2 |  0% |  6% |
|  3 | 55% |  9% |
|  4 |  0% |  6% |
|  5 | 19% |  3% |
|  6 |  0% |  0% |
|  7 |  0% |  6% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 22% |  6% |
|  1 |  0% |  6% |
|  2 |  0% |  6% |
|  3 | 36% |  9% |
|  4 |  0% |  6% |
|  5 | 11% |  4% |
|  6 |  0% |  0% |
|  7 |  0% |  6% |


In [8]:
data = pd.read_csv("clean_test_or_not_test.csv" , index_col = 0)
data

Unnamed: 0,text_clean,label
0,add tests for systemdrawingcommonthis issue tr...,0
1,acctests for packetfabricportresource please k...,0
2,text analytics abstractivesummarizebatchconven...,0
3,see entity recordssee the story in the release...,0
4,editor crash when click mesh instance 3d with ...,0
...,...,...
72522,cdn is downin 987dabdhttpsgithubcomceccunstat...,1
72523,create user registration system using email an...,1
72524,group secure join shows success only after res...,1
72525,plugins not being sourced when using a differe...,1


In [9]:
data_test_set = pd.read_csv("testset_test_or_not_test_clean.csv" , index_col = 0)
data_test_set

Unnamed: 0,text_clean,label
0,xml related tests are now ignored for composit...,0
1,sound like a plan and you can assign me to it ...,1
2,i have been able to successfully speed up the ...,0
3,testscannersfuzzing currently tests compressed...,0
4,1 lgtm i would also convert the test to junitv...,0
...,...,...
57,yes i added a profile to run only flaky tests ...,0
58,one problem with those is that they are not a ...,0
59,thanks for detailed description ill have a loo...,1
60,the flakeyfinder fingered the above commit as ...,0


In [10]:
# Smaller and faster than bert.
base_model_id = "distilbert-base-uncased"

epochs = 5 #Number of full cyles through the training set.
num_labels = 2 
learning_rate = 5e-5 # Rate the model updates based on the data its trained on.
train_batch_size = 16 # Number of training examples in one iteration.
eval_batch_size = 32 # Number evalutaion examples in on iteratoion.
save_strategy = "no" # Should the model be saved automatically during training.
save_steps = 500 # How often to save the model during training. No effect since no over.
logging_steps = 100
model_dir = "./model" #Where to save model

# Use early stopping to prevent overfitting
#load_best_model_at_end=True
#metric_for_best_model="eval_loss"
#greater_is_better=False

In [11]:
# Split dataframe into three parts: training, validation and testing.
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=42):
    np.random.seed(seed)
    # Shuffle index of dataframe
    perm = np.random.permutation(df.index)
    
    df_length = len(df.index)
    
    # Number of row in training set
    train_end = int(train_percent * df_length)
    # Number of rows in validate set
    validate_end = int(validate_percent * df_length) + train_end
    
    # From start to train end
    train = df.iloc[perm[:train_end]]
    # From train_end to validate_end
    validate = df.iloc[perm[train_end:validate_end]]
    # From validate to the last row in dataframe.
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [12]:
# Drops rows with missing values
data.dropna(inplace=True)

In [13]:
# Resets the index after dropping rows
data.reset_index(inplace=True)
data

Unnamed: 0,index,text_clean,label
0,0,add tests for systemdrawingcommonthis issue tr...,0
1,1,acctests for packetfabricportresource please k...,0
2,2,text analytics abstractivesummarizebatchconven...,0
3,3,see entity recordssee the story in the release...,0
4,4,editor crash when click mesh instance 3d with ...,0
...,...,...,...
72522,72522,cdn is downin 987dabdhttpsgithubcomceccunstat...,1
72523,72523,create user registration system using email an...,1
72524,72524,group secure join shows success only after res...,1
72525,72525,plugins not being sourced when using a differe...,1


In [14]:
#Drops the index col, better for managint the data.
data.drop(columns= ["index"], inplace = True)

In [15]:
data


Unnamed: 0,text_clean,label
0,add tests for systemdrawingcommonthis issue tr...,0
1,acctests for packetfabricportresource please k...,0
2,text analytics abstractivesummarizebatchconven...,0
3,see entity recordssee the story in the release...,0
4,editor crash when click mesh instance 3d with ...,0
...,...,...
72522,cdn is downin 987dabdhttpsgithubcomceccunstat...,1
72523,create user registration system using email an...,1
72524,group secure join shows success only after res...,1
72525,plugins not being sourced when using a differe...,1


In [16]:
# 80% trainig, 10% validate, 10% test. Seed 42.
# Test 80-10-10 and 70-15-15
train , validate , test = train_validate_test_split(data)


In [17]:
train.set_index("label" , inplace = True)
validate.set_index("label" , inplace = True)
test.set_index("label" , inplace = True)

In [18]:
test

Unnamed: 0_level_0,text_clean
label,Unnamed: 1_level_1
1,cd supports the resident option sanity checks ...
1,allow zooming in to area on mapall drawing met...
0,umbrella flaky test service is referenced by t...
1,springcloudstarternetflixzuul not work with sp...
0,ui tests sidebar settings chats sidebar should...
...,...
1,tls listener support description implement tls...
0,project card convertedwhen a project card is c...
1,enable bucketlistdb by default in captivecore ...
0,ch 14 designing your test suite link to exampl...


In [19]:
# Convert from Pandas DataFrame to Hugging Face datasets
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
testds = Dataset.from_pandas(test)

separate_test_set = Dataset.from_pandas(data_test_set)
ds = DatasetDict()

ds["test"] = testds
ds["train"] = tds
ds["validate"] = vds
ds["separate_test_set"] = separate_test_set

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 7254
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 58021
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 7252
    })
    separate_test_set: Dataset({
        features: ['text_clean', 'label', '__index_level_0__'],
        num_rows: 62
    })
})

In [20]:
train_dataset = ds["train"]
valid_dataset = ds["validate"]
test_ds = ds["test"]
separate_test_set_dataset = ds["separate_test_set"]

In [21]:
ds["train"][0]

{'text_clean': 'docdb optionsparsertestdboptionsallfieldssettable test failuresjira link db4026httpsyugabyteatlassiannetbrowsedb4026 description reproducible with ybd fastdebug cxxtest optionstest gtestfilter optionsparsertestdboptionsallfieldssettable gcc11 n 2 p 1',
 'label': 0}

In [22]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(base_model_id, num_labels=num_labels)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tokanization

In [24]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [25]:
#Tokenize the dataset to the correct input for the transformer model.
def tokenize(batch):
    return tokenizer(batch["text_clean"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))
test_dataset = test_ds.map(tokenize, batched=True, batch_size=len(test_ds))
separate_test_set_dataset = separate_test_set_dataset.map(tokenize, batched=True, batch_size=len(separate_test_set_dataset))

Map:   0%|          | 0/58021 [00:00<?, ? examples/s]

Map:   0%|          | 0/7252 [00:00<?, ? examples/s]

Map:   0%|          | 0/7254 [00:00<?, ? examples/s]

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

In [26]:
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy=save_strategy,
    save_steps=save_steps,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    logging_steps=logging_steps,
)

In [27]:
 trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [28]:
trainer.train() 

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3108,0.305906,0.877827,[0.87024019 0.8845753 ],[0.92554517 0.83993073],[0.82117192 0.93423225]
2,0.2021,0.301252,0.910645,[0.90947192 0.91178873],[0.91949153 0.90220905],[0.89966833 0.92157402]
3,0.1193,0.398712,0.919608,[0.91801434 0.92114162],[0.93444031 0.90582602],[0.90215589 0.93698404]
4,0.0822,0.390972,0.923056,[0.9212754 0.92475728],[0.94092219 0.90666314],[0.90243228 0.94358833]
5,0.045,0.456542,0.924159,[0.92189719 0.92629322],[0.94801402 0.90282132],[0.89718076 0.95101816]


TrainOutput(global_step=18135, training_loss=0.17329566380127684, metrics={'train_runtime': 14070.5115, 'train_samples_per_second': 20.618, 'train_steps_per_second': 1.289, 'total_flos': 3.842945468709888e+16, 'train_loss': 0.17329566380127684, 'epoch': 5.0})

In [29]:
eval_result = trainer.evaluate(eval_dataset=valid_dataset)

In [30]:
for key, value in sorted(eval_result.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.9241588527302813

eval_f1 = [0.92189719 0.92629322]

eval_loss = 0.45654207468032837

eval_precision = [0.94801402 0.90282132]

eval_recall = [0.89718076 0.95101816]

eval_runtime = 60.105

eval_samples_per_second = 120.656

eval_steps_per_second = 3.777



## Training loss decreases, valdiation loss increases = Overfitting

In [31]:
# Evaluate test data set
test_results = trainer.evaluate(eval_dataset=test_dataset)

In [32]:
for key, value in sorted(test_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.9225255031706645

eval_f1 = [0.92278098 0.92226833]

eval_loss = 0.4748363196849823

eval_precision = [0.94885561 0.8974428 ]

eval_recall = [0.8981011 0.9485064]

eval_runtime = 61.7215

eval_samples_per_second = 117.528

eval_steps_per_second = 3.678



# Test set

In [33]:
separate_test_set_results = trainer.evaluate(eval_dataset=separate_test_set_dataset)

In [34]:
for key, value in sorted(separate_test_set_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.8709677419354839

eval_f1 = [0.87878788 0.86206897]

eval_loss = 0.4746240973472595

eval_precision = [0.93548387 0.80645161]

eval_recall = [0.82857143 0.92592593]

eval_runtime = 0.6526

eval_samples_per_second = 95.011

eval_steps_per_second = 3.065



In [35]:
trainer.save_model(model_dir + "_local") 

In [36]:
from transformers import pipeline
    
classifier = pipeline("text-classification", model="./model_local")

In [37]:
classifier.model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [38]:
classifier("this contain bugs regarding testing")

[{'label': 'LABEL_0', 'score': 0.9987213015556335}]

In [39]:
classifier("this contain bugs regarding automtion not testing")

[{'label': 'LABEL_0', 'score': 0.9985414743423462}]

In [40]:
classifier("this bug has super high impact on the project")

[{'label': 'LABEL_1', 'score': 0.9940484762191772}]

In [41]:
import pandas as pd
df = pd.read_csv('clean_test_or_not_test_debt.csv',index_col = 0)
df.head()

Unnamed: 0,text_clean,label
0,make sure integration tests are running with i...,0
1,cover anvil service response in unit testshttp...,0
2,service apprepositorycontroller should be cach...,0
3,many caches persist between test casesan examp...,0
4,improve friendliness of behat text pattern mat...,0


In [None]:
def classify_text(text):
    # Get prediction
    result = classifier(text)
    # Return the label of the highest scoring classification
    return result[0]['label']

# Apply the classification function to your text column (assuming it's named 'text_clean')
df['predicted_label'] = df['text_clean'].apply(classify_text)



Token indices sequence length is longer than the specified maximum sequence length for this model (12711 > 512). Running this sequence through the model will result in indexing errors


In [None]:
df.head(100)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Assuming your predicted labels are in a format like 'LABEL_0', 'LABEL_1', etc.
# Convert these to numeric by extracting the number part and converting it to an integer
df['predicted_label_numeric'] = df['predicted_label'].apply(lambda x: int(x.split('_')[1]))

# Now, your actual labels are assumed to be in the 'label' column and already numeric
actual_labels = df['label']

# Predicted labels are now in 'predicted_label_numeric'
predicted_labels = df['predicted_label_numeric']

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

# Calculate F1 score
# Assuming it's a binary classification, you could specify the average method if it's multi-class
f1 = f1_score(actual_labels, predicted_labels, average='binary' if df['label'].nunique() == 2 else 'weighted')

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")


### Delete the dataset with large memory

In [42]:
del valid_dataset

In [43]:
del model

In [44]:
# Free cache
torch.cuda.empty_cache()

In [45]:
!nvidia-smi

Wed Jan  3 13:44:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:01:00.0 Off |                  N/A |
| 22%   34C    P8              18W / 250W |   3949MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:24:00.0 Off |  