In [1]:
import pandas as pd

In [2]:
import torch

In [3]:
# !watch -n 0.5 nvidia-smi

In [4]:
print(torch.__version__)  # 1.9.1+cu111
print(torch.version.cuda)  # 11.1
print(torch.backends.cudnn.version())  # 8005
print(torch.cuda.current_device())  # 0
print(torch.cuda.is_available())  # TRUE

2.0.1
11.8
8700
0
True


In [5]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "False"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [6]:
!nvidia-smi

Wed Nov 22 17:01:36 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti      On | 00000000:01:00.0 Off |                  N/A |
| 37%   52C    P2               83W / 250W|   2521MiB / 11264MiB |     28%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti      On | 00000000:23:00.0 Off |  

In [7]:

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()   

Initial GPU Usage
| ID | GPU  | MEM |
-------------------
|  0 |  26% | 22% |
|  1 | 100% | 93% |
|  2 | 100% | 93% |
|  3 | 100% | 93% |
|  4 |   0% | 12% |
|  5 |  10% | 32% |
|  6 |  94% | 32% |
|  7 | 100% | 93% |
GPU Usage after emptying the cache
| ID | GPU  | MEM |
-------------------
|  0 |  26% | 22% |
|  1 | 100% | 93% |
|  2 | 100% | 93% |
|  3 | 100% | 93% |
|  4 |   0% | 13% |
|  5 |  10% | 32% |
|  6 |  95% | 32% |
|  7 | 100% | 93% |


In [8]:
data = pd.read_csv("TD_dataset_clean.csv" , index_col = 0)

In [9]:
data

Unnamed: 0,text_clean,label
0,look for min file instead,0
1,as an extension of 78,0
2,bountysourceplugin want to back this issue pla...,0
3,our grunt script is out of control its current...,0
4,jshint is dropping stylerelated support it see...,0
...,...,...
470600,你的功能请求是否与问题有关？ 希望可以为每日任务 困难图 增加次数选择功能。目前我的解决办法...,0
470601,rootlocalhost lsblk name majmin rm size ro typ...,0
470602,env gpu rtx1060 os ubuntu1804 docker cuda vers...,0
470603,env gpu 1080ti os ubuntu1804 cuda version114 t...,0


In [10]:
import datasets
import transformers

print(transformers.__version__)
print(datasets.__version__)

4.33.2
2.14.5


In [11]:
import datasets
from datasets import load_dataset, Dataset, DatasetDict

In [12]:
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

In [13]:
base_model_id = "distilbert-base-uncased"

epochs = 5
num_labels = 2 
learning_rate = 5e-5
train_batch_size = 16
eval_batch_size = 32
save_strategy = "no"
save_steps = 500
logging_steps = 100
model_dir = "./model"

In [14]:
import numpy as np

def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

## Load Datasets

In [15]:
data.dropna(inplace=True)

In [16]:
data.reset_index(inplace=True)

In [17]:
data.drop(columns= ["index"], inplace = True)

In [18]:
data

Unnamed: 0,text_clean,label
0,look for min file instead,0
1,as an extension of 78,0
2,bountysourceplugin want to back this issue pla...,0
3,our grunt script is out of control its current...,0
4,jshint is dropping stylerelated support it see...,0
...,...,...
470556,你的功能请求是否与问题有关？ 希望可以为每日任务 困难图 增加次数选择功能。目前我的解决办法...,0
470557,rootlocalhost lsblk name majmin rm size ro typ...,0
470558,env gpu rtx1060 os ubuntu1804 docker cuda vers...,0
470559,env gpu 1080ti os ubuntu1804 cuda version114 t...,0


In [19]:
train , validate , test = train_validate_test_split(data)

In [20]:

train.set_index("label" , inplace = True)
validate.set_index("label" , inplace = True)
test.set_index("label" , inplace = True)

In [21]:
test

Unnamed: 0_level_0,text_clean
label,Unnamed: 1_level_1
0,user story as a vha digital media web manager ...
0,imagehttpsuserimagesgithubusercontentcom899234...
0,required information operating system windows ...
0,without cleaning up erischains manually a ton ...
0,example imagehttpsuserimagesgithubusercontentc...
...,...
0,in the large table element of srcstylespagetem...
0,steps to reproduce düngerstreuer bredal k165 a...
0,is your feature request related to a problem p...
0,aktuell werden nur tests durchgeführt und dere...


In [22]:
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
testds = Dataset.from_pandas(test)

ds = DatasetDict()

ds["test"] = testds
ds["train"] = tds
ds["validate"] = vds

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 94113
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 282336
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 94112
    })
})

In [23]:
train_dataset = ds["train"]
valid_dataset = ds["validate"]

In [24]:
ds["train"][0]


{'text_clean': 'describe the bug when i execute a command in dm with bot it doesnt reply to reproduce 1 go to dm with the bot 2 execute a slash command 4 see error expected behavior should return reply with default styling environment please complete the following information commit 5a56f35e07ec3e5c98495ddf81cb2905da2eeebc branch main additional context this issue started after adding guild based embed configurations',
 'label': 0}

In [25]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(base_model_id, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# optim = torch.optim.Adam(model.parameters(), lr=5e-5)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenization

    Character tokenization

In [27]:
text = "Tokenizing text is a core task of NLP."
tokenized_text = list(text)
print(tokenized_text)


['T', 'o', 'k', 'e', 'n', 'i', 'z', 'i', 'n', 'g', ' ', 't', 'e', 'x', 't', ' ', 'i', 's', ' ', 'a', ' ', 'c', 'o', 'r', 'e', ' ', 't', 'a', 's', 'k', ' ', 'o', 'f', ' ', 'N', 'L', 'P', '.']


In [28]:
token2idx = {ch: idx for idx, ch in enumerate(sorted(set(tokenized_text)))}
print(token2idx)


{' ': 0, '.': 1, 'L': 2, 'N': 3, 'P': 4, 'T': 5, 'a': 6, 'c': 7, 'e': 8, 'f': 9, 'g': 10, 'i': 11, 'k': 12, 'n': 13, 'o': 14, 'r': 15, 's': 16, 't': 17, 'x': 18, 'z': 19}


In [29]:

input_ids = [token2idx[token] for token in tokenized_text]
print(input_ids)


[5, 14, 12, 8, 13, 11, 19, 11, 13, 10, 0, 17, 8, 18, 17, 0, 11, 16, 0, 6, 0, 7, 14, 15, 8, 0, 17, 6, 16, 12, 0, 14, 9, 0, 3, 2, 4, 1]


In [30]:

import torch
import torch.nn.functional as F

input_ids = torch.tensor(input_ids)
one_hot_encodings = F.one_hot(input_ids, num_classes=len(token2idx))
one_hot_encodings.shape


torch.Size([38, 20])

In [31]:
print(f"Token: {tokenized_text[0]}")
print(f"Tensor index: {input_ids[0]}")
print(f"One-hot: {one_hot_encodings[0]}")


Token: T
Tensor index: 5
One-hot: tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


    Word Tokenization

In [32]:

tokenized_text = text.split()
print(tokenized_text)


['Tokenizing', 'text', 'is', 'a', 'core', 'task', 'of', 'NLP.']


    Subword Tokenization

In [33]:
from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


In [34]:
encoded_text = tokenizer(text)
print(encoded_text)


{'input_ids': [101, 19204, 6026, 3793, 2003, 1037, 4563, 4708, 1997, 17953, 2361, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [35]:

tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)


['[CLS]', 'token', '##izing', 'text', 'is', 'a', 'core', 'task', 'of', 'nl', '##p', '.', '[SEP]']


In [36]:
print(tokenizer.convert_tokens_to_string(tokens))


[CLS] tokenizing text is a core task of nlp. [SEP]


In [37]:

tokenizer.vocab_size


30522

In [38]:

tokenizer.model_max_length


512

In [39]:
tokenizer.model_input_names


['input_ids', 'attention_mask']

In [40]:

#hide_input
tokens2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))
data = sorted(tokens2ids, key=lambda x: x[-1])
df = pd.DataFrame(data, columns=["Special Token", "Special Token ID"])
df.T


Unnamed: 0,0,1,2,3,4
Special Token,[PAD],[UNK],[CLS],[SEP],[MASK]
Special Token ID,0,100,101,102,103


    Tokenizing the whole dataset

In [41]:
def tokenize(batch):
    return tokenizer(batch["text_clean"], padding="max_length", truncation=True)


train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))

Map:   0%|          | 0/282336 [00:00<?, ? examples/s]

Map:   0%|          | 0/94112 [00:00<?, ? examples/s]

## Training a classifier

Models like DistilBERT are pretrained to predict masked words in a sequence of text. However, we can't use these language models directly for text classification; we need to modify them slightly. To understand what modifications are necessary

In [42]:
base_model_id = "distilbert-base-uncased"

epochs = 2
num_labels = 2 
learning_rate = 5e-5
train_batch_size = 16
eval_batch_size = 32
save_strategy = "no"
save_steps = 500
logging_steps = 100
model_dir = "./model"

In [43]:
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy=save_strategy,
    save_steps=save_steps,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    logging_steps=logging_steps,
)

In [44]:
 trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [45]:
torch.cuda.empty_cache()

In [46]:
trainer.train() 

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0393,0.034709,0.994464,[0.99722434 0. ],[0.99446404 0. ],[1. 0.]
2,0.0457,0.035852,0.994464,[0.99722434 0. ],[0.99446404 0. ],[1. 0.]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=35292, training_loss=0.034037989419522394, metrics={'train_runtime': 13224.0559, 'train_samples_per_second': 42.7, 'train_steps_per_second': 2.669, 'total_flos': 7.480063093388083e+16, 'train_loss': 0.034037989419522394, 'epoch': 2.0})

In [47]:
eval_result = trainer.evaluate(eval_dataset=valid_dataset)



  _warn_prf(average, modifier, msg_start, len(result))


In [48]:
for key, value in sorted(eval_result.items()):
    print(f"{key} = {value}\n")
    


epoch = 2.0

eval_accuracy = 0.9944640428425705

eval_f1 = [0.99722434 0.        ]

eval_loss = 0.03585215285420418

eval_precision = [0.99446404 0.        ]

eval_recall = [1. 0.]

eval_runtime = 604.2653

eval_samples_per_second = 155.746

eval_steps_per_second = 4.867



In [49]:
trainer.save_model(model_dir + "_local") 

In [50]:
from transformers import pipeline
    
classifier = pipeline("text-classification", model="./model_local")

In [51]:
classifier.model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [52]:
classifier("Woo hoo almost done")

[{'label': 'LABEL_0', 'score': 0.9976814985275269}]

In [53]:
del train_dataset

In [54]:
del valid_dataset

In [55]:
del model

In [56]:
import torch
torch.cuda.empty_cache()

In [57]:
!nvidia-smi

Wed Nov 22 21:00:06 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti      On | 00000000:01:00.0 Off |                  N/A |
| 43%   70C    P2              249W / 250W|   3311MiB / 11264MiB |     95%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti      On | 00000000:23:00.0 Off |  