In [1]:
import pandas as pd
import torch
import os
import numpy as np
import datasets
import transformers
from GPUtil import showUtilization as gpu_usage
from numba import cuda
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset, Dataset, DatasetDict

In [2]:
# !watch -n 0.5 nvidia-smi

In [3]:
print(f'PyTorch version: {torch.__version__}')  # 1.9.1+cu111
print(f'CUDA version: {torch.version.cuda}')  # 11.1
print(f'cuDNN version: {torch.backends.cudnn.version()}')  # 8005
print(f'Current device: {torch.cuda.current_device()}')  # 0
print(f'Is cuda available: {torch.cuda.is_available()}')  # TRUE

PyTorch version: 2.0.1
CUDA version: 11.8
cuDNN version: 8700
Current device: 0
Is cuda available: True


In [4]:
print(f'Transformers version: {transformers.__version__}')
print(f'Datasets version: {datasets.__version__}')

Transformers version: 4.33.2
Datasets version: 2.14.5


In [5]:
# Prevent a warning related to the tokenization process in the transformers library. 
os.environ["TOKENIZERS_PARALLELISM"] = "False"
# Makes CUDA operations synchronous
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [6]:
# Find the GPU with the least memory usage.
!nvidia-smi

Wed Jan  3 11:41:53 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:01:00.0 Off |                  N/A |
| 22%   34C    P8              18W / 250W |   3949MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:24:00.0 Off |  

In [7]:
def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    # free unreferenced tensors from the GPU memory.
    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache() 

Initial GPU Usage
| ID | GPU  | MEM |
-------------------
|  0 |   0% | 35% |
|  1 |  99% | 63% |
|  2 |  99% | 63% |
|  3 | 100% | 63% |
|  4 | 100% | 63% |
|  5 |   0% | 36% |
|  6 |  34% | 11% |
|  7 |   0% |  0% |
GPU Usage after emptying the cache
| ID | GPU  | MEM |
-------------------
|  0 |   0% | 35% |
|  1 |  99% | 63% |
|  2 |  99% | 63% |
|  3 | 100% | 63% |
|  4 | 100% | 63% |
|  5 |   0% | 36% |
|  6 |  34% | 12% |
|  7 |   0% |  0% |


In [8]:
data = pd.read_csv("clean_test_or_not_test.csv" , index_col = 0)
data

Unnamed: 0,text_clean,label
0,add tests for systemdrawingcommonthis issue tr...,0
1,acctests for packetfabricportresource please k...,0
2,text analytics abstractivesummarizebatchconven...,0
3,see entity recordssee the story in the release...,0
4,editor crash when click mesh instance 3d with ...,0
...,...,...
72563,cdn is downin 987dabdhttpsgithubcomceccunstat...,1
72564,create user registration system using email an...,1
72565,group secure join shows success only after res...,1
72566,plugins not being sourced when using a differe...,1


In [9]:
data_test_set = pd.read_csv("testset_test_or_not_test_clean.csv" , index_col = 0)
data_test_set

Unnamed: 0,text_clean,label
0,badrecordmac in fips testbuild scan httpsgradl...,0
1,validate shutdown message processing even thou...,0
2,create new flutter project failed on android s...,1
3,upgrade from chromium 103 to chromium 104upgra...,0
4,roachtest followerreadssurvivalzonelocalityreg...,0
...,...,...
8058,autodesk v2 describe the bug autodesk publishe...,1
8059,what happened to sqlite we offer sqlite suppor...,0
8060,setup unittest testing frameworkwe need to uti...,0
8061,use clash for windows cannot be connected in l...,1


In [10]:
# Smaller and faster than bert.
base_model_id = "distilbert-base-uncased"

epochs = 5 #Number of full cyles through the training set.
num_labels = 2 
learning_rate = 5e-5 # Rate the model updates based on the data its trained on.
train_batch_size = 16 # Number of training examples in one iteration.
eval_batch_size = 32 # Number evalutaion examples in on iteratoion.
save_strategy = "no" # Should the model be saved automatically during training.
save_steps = 500 # How often to save the model during training. No effect since no over.
logging_steps = 100
model_dir = "./model" #Where to save model

# Use early stopping to prevent overfitting
#load_best_model_at_end=True
#metric_for_best_model="eval_loss"
#greater_is_better=False

In [11]:
# Split dataframe into three parts: training, validation and testing.
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=42):
    np.random.seed(seed)
    # Shuffle index of dataframe
    perm = np.random.permutation(df.index)
    
    df_length = len(df.index)
    
    # Number of row in training set
    train_end = int(train_percent * df_length)
    # Number of rows in validate set
    validate_end = int(validate_percent * df_length) + train_end
    
    # From start to train end
    train = df.iloc[perm[:train_end]]
    # From train_end to validate_end
    validate = df.iloc[perm[train_end:validate_end]]
    # From validate to the last row in dataframe.
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [12]:
# Drops rows with missing values
data.dropna(inplace=True)

In [13]:
# Resets the index after dropping rows
data.reset_index(inplace=True)
data

Unnamed: 0,index,text_clean,label
0,0,add tests for systemdrawingcommonthis issue tr...,0
1,1,acctests for packetfabricportresource please k...,0
2,2,text analytics abstractivesummarizebatchconven...,0
3,3,see entity recordssee the story in the release...,0
4,4,editor crash when click mesh instance 3d with ...,0
...,...,...,...
72563,72563,cdn is downin 987dabdhttpsgithubcomceccunstat...,1
72564,72564,create user registration system using email an...,1
72565,72565,group secure join shows success only after res...,1
72566,72566,plugins not being sourced when using a differe...,1


In [14]:
#Drops the index col, better for managint the data.
data.drop(columns= ["index"], inplace = True)

In [15]:
data


Unnamed: 0,text_clean,label
0,add tests for systemdrawingcommonthis issue tr...,0
1,acctests for packetfabricportresource please k...,0
2,text analytics abstractivesummarizebatchconven...,0
3,see entity recordssee the story in the release...,0
4,editor crash when click mesh instance 3d with ...,0
...,...,...
72563,cdn is downin 987dabdhttpsgithubcomceccunstat...,1
72564,create user registration system using email an...,1
72565,group secure join shows success only after res...,1
72566,plugins not being sourced when using a differe...,1


In [16]:
# 80% trainig, 10% validate, 10% test. Seed 42.
# Test 80-10-10 and 70-15-15
train , validate , test = train_validate_test_split(data)


In [17]:
train.set_index("label" , inplace = True)
validate.set_index("label" , inplace = True)
test.set_index("label" , inplace = True)

In [18]:
test

Unnamed: 0_level_0,text_clean
label,Unnamed: 1_level_1
1,check the bpmn pipelines to see if formsubmiss...
1,add eslint and enforce code style is your feat...
1,rosskopfyxyz is downin 8f48397httpsgithubcomg...
0,time taken by peer link going down when sonic ...
0,test upgrade 42x to 45 time improvement target...
...,...
1,create a release for rke 142my team is waiting...
0,battle error using moves on some togedemarub75...
1,node build fails with latest version bugdescri...
0,bug no compartilhamento de loot em alguns boss...


In [19]:
# Convert from Pandas DataFrame to Hugging Face datasets
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
testds = Dataset.from_pandas(test)

separate_test_set = Dataset.from_pandas(data_test_set)
ds = DatasetDict()

ds["test"] = testds
ds["train"] = tds
ds["validate"] = vds
ds["separate_test_set"] = separate_test_set

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 7258
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 58054
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 7256
    })
    separate_test_set: Dataset({
        features: ['text_clean', 'label', '__index_level_0__'],
        num_rows: 8063
    })
})

In [20]:
train_dataset = ds["train"]
valid_dataset = ds["validate"]
test_ds = ds["test"]
separate_test_set_dataset = ds["separate_test_set"]

In [21]:
ds["train"][0]

{'text_clean': 'decision tree regressor giving different result for scaled and unscaled datathe decision tree regressor outputs different tree for scaled and unscaled data this is true for the random forest regressor also as per my understanding scaling data should not change the results for decision tree or random forest regressor i am attaching the ipynb file of my analysis in the analysis i have generated a synthetic dataset where the input data is in different scales the decision tree is then made using the unscaled data minmax scaled data and arbitrarily scaled data the arbitrary scaling simply multiplies each input with a suitable value to bring all the inputs in the same order of magnitude it is seen that the tree formed by using the the minmax scaled and arbitrarily scaled data is same but the one formed by the unscaled data is different the same results are reflected when using random forest i am unable to understand this result therefore an explanation would be of help the tr

In [22]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(base_model_id, num_labels=num_labels)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tokanization

In [24]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [25]:
#Tokenize the dataset to the correct input for the transformer model.
def tokenize(batch):
    return tokenizer(batch["text_clean"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))
test_dataset = test_ds.map(tokenize, batched=True, batch_size=len(test_ds))
separate_test_set_dataset = separate_test_set_dataset.map(tokenize, batched=True, batch_size=len(separate_test_set_dataset))

Map:   0%|          | 0/58054 [00:00<?, ? examples/s]

Map:   0%|          | 0/7256 [00:00<?, ? examples/s]

Map:   0%|          | 0/7258 [00:00<?, ? examples/s]

Map:   0%|          | 0/8063 [00:00<?, ? examples/s]

In [26]:
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy=save_strategy,
    save_steps=save_steps,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    logging_steps=logging_steps,
)

In [27]:
 trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [28]:
trainer.train() 

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3042,0.30106,0.885474,[0.88398716 0.88692339],[0.894603 0.87678235],[0.87362031 0.89730176]
2,0.1926,0.282279,0.90339,[0.90052505 0.90609511],[0.92696465 0.88233759],[0.87555188 0.9311674 ]
3,0.1502,0.331702,0.916069,[0.91476557 0.91733406],[0.92814541 0.90468541],[0.901766 0.93034141]
4,0.0627,0.503231,0.916069,[0.91256281 0.91930568],[0.95121221 0.88607918],[0.87693157 0.95512115]
5,0.0463,0.486252,0.923098,[0.9212087 0.92489906],[0.94331984 0.90468668],[0.90011038 0.94603524]


TrainOutput(global_step=18145, training_loss=0.1741726277977614, metrics={'train_runtime': 6438.8339, 'train_samples_per_second': 45.081, 'train_steps_per_second': 2.818, 'total_flos': 3.845131180787712e+16, 'train_loss': 0.1741726277977614, 'epoch': 5.0})

In [29]:
eval_result = trainer.evaluate(eval_dataset=valid_dataset)

In [30]:
for key, value in sorted(eval_result.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.9230981256890849

eval_f1 = [0.9212087  0.92489906]

eval_loss = 0.48625195026397705

eval_precision = [0.94331984 0.90468668]

eval_recall = [0.90011038 0.94603524]

eval_runtime = 44.5186

eval_samples_per_second = 162.988

eval_steps_per_second = 5.099



## Training loss decreases, valdiation loss increases = Overfitting

In [31]:
# Evaluate test data set
test_results = trainer.evaluate(eval_dataset=test_dataset)

In [32]:
for key, value in sorted(test_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.919123725544227

eval_f1 = [0.91920165 0.91904565]

eval_loss = 0.521524965763092

eval_precision = [0.94911882 0.89090909]

eval_recall = [0.89111289 0.94901737]

eval_runtime = 44.7235

eval_samples_per_second = 162.286

eval_steps_per_second = 5.076



# Test set

In [33]:
separate_test_set_results = trainer.evaluate(eval_dataset=separate_test_set_dataset)

In [34]:
for key, value in sorted(separate_test_set_results.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.9200049609326554

eval_f1 = [0.91792849 0.92197895]

eval_loss = 0.5202078223228455

eval_precision = [0.94796321 0.89502114]

eval_recall = [0.88973853 0.95061112]

eval_runtime = 49.7773

eval_samples_per_second = 161.982

eval_steps_per_second = 5.063



In [35]:
trainer.save_model(model_dir + "_local") 

In [36]:
from transformers import pipeline
    
classifier = pipeline("text-classification", model="./model_local")

In [37]:
classifier.model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [39]:
classifier("this contain bugs regarding testing")

[{'label': 'LABEL_0', 'score': 0.9958046078681946}]

In [40]:
classifier("this contain bugs regarding automtion not testing")

[{'label': 'LABEL_0', 'score': 0.964190661907196}]

In [41]:
classifier("this bug has super high impact on the project")

[{'label': 'LABEL_0', 'score': 0.5383675694465637}]

### Delete the dataset with large memory

In [42]:
del valid_dataset

In [43]:
del model

In [44]:
# Free cache
torch.cuda.empty_cache()

In [45]:
!nvidia-smi

Wed Jan  3 13:44:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:01:00.0 Off |                  N/A |
| 22%   34C    P8              18W / 250W |   3949MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:24:00.0 Off |  