In [1]:
import pandas as pd

In [2]:
import torch

In [3]:
# !watch -n 0.5 nvidia-smi

In [4]:
print(torch.__version__)  # 1.9.1+cu111
print(torch.version.cuda)  # 11.1
print(torch.backends.cudnn.version())  # 8005
print(torch.cuda.current_device())  # 0
print(torch.cuda.is_available())  # TRUE

1.11.0
11.3
8201
0
True


In [5]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "True"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [6]:
!nvidia-smi

Tue Aug  2 20:53:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:18:00.0 Off |                  N/A |
| 31%   41C    P2    51W / 250W |   1816MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:3B:00.0 Off |                  N/A |
| 28%   29C    P8    13W / 250W |      3MiB / 11019MiB |      0%      Default |
|       

In [7]:

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()   

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% | 16% |
|  1 |  0% |  0% |
|  2 |  0% |  0% |
|  3 | 58% | 51% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  0% | 16% |
|  1 |  0% |  1% |
|  2 |  0% |  0% |
|  3 | 64% | 51% |


In [8]:
data = pd.read_csv("TD_dataset_clean.csv" , index_col = 0)

In [9]:
data

Unnamed: 0,text_clean,label
0,look for min file instead,1
1,as an extension of 78,1
2,bountysourceplugin want to back this issue pla...,1
3,our grunt script is out of control its current...,1
4,jshint is dropping stylerelated support it see...,1
...,...,...
127686,ci is no more ok all i could see right now is ...,0
127687,agentwebfragment 打开其他网址没问题，打开httpsopenapialipa...,0
127688,this wouldnt quite be the same as an installpa...,0
127689,oh no a bug it happens thanks for reporting an...,0


In [10]:
import datasets
import transformers

print(transformers.__version__)
print(datasets.__version__)

4.21.0
2.4.0


In [11]:
import datasets
from datasets import load_dataset, Dataset, DatasetDict

In [12]:
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

In [13]:
base_model_id = "microsoft/deberta-v3-base"

epochs = 5
num_labels = 2 
learning_rate = 2e-5
train_batch_size = 16
eval_batch_size = 32
save_strategy = "no"
save_steps = 500
logging_steps = 100

model_dir = "./model1"

In [14]:
import numpy as np

def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [15]:
data.dropna(inplace=True)

In [16]:
data.reset_index(inplace=True)

In [17]:
data.drop(columns= ["index"], inplace = True)

In [18]:
data

Unnamed: 0,text_clean,label
0,look for min file instead,1
1,as an extension of 78,1
2,bountysourceplugin want to back this issue pla...,1
3,our grunt script is out of control its current...,1
4,jshint is dropping stylerelated support it see...,1
...,...,...
127671,ci is no more ok all i could see right now is ...,0
127672,agentwebfragment 打开其他网址没问题，打开httpsopenapialipa...,0
127673,this wouldnt quite be the same as an installpa...,0
127674,oh no a bug it happens thanks for reporting an...,0


In [19]:
train , validate , test = train_validate_test_split(data)

In [20]:

train.set_index("label" , inplace = True)
validate.set_index("label" , inplace = True)
test.set_index("label" , inplace = True)

In [21]:
test

Unnamed: 0_level_0,text_clean
label,Unnamed: 1_level_1
0,description of the problem when i try to perfo...
1,descriptive summary hyrax 302 makes rails 526 ...
0,the logout function is not revoking user auth ...
0,as a service provider i need the service to pe...
0,what would you like to be added the name of th...
...,...
0,there seems to be a few issues with where erro...
1,mobile scroll and mobile scrollto are the only...
0,acceptance test labels for 16 17 and 18 appear...
0,steps to reproduce 1 install the desktop app f...


In [22]:
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
testds = Dataset.from_pandas(test)

ds = DatasetDict()

ds["test"] = testds
ds["train"] = tds
ds["validate"] = vds

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 25536
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 76605
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 25535
    })
})

In [23]:
train_dataset = ds["train"]
valid_dataset = ds["validate"]

In [24]:
ds["train"][0]


{'text_clean': '问题描述 单元素的数组json序列化的时候，没有中括号；后面直接导致解析端用fastjon反序列化的时候，直接报格式错误 环境信息 请填写以下信息： os信息： eg：centos 842105 4core 310ghz 16 gb jdk信息： eg：hotspot jdk 1703 版本信息：eg：fastjson2 204 重现步骤 如何操作可以重现该问题： 一些框架在序列化的时候，如果是单元素的集合，直接就是省去了中括号后面直接导致解析端用fastjon2反序列化的时候，直接报格式错误 name zhangsan books 西游记 name zhangsan books 西游记红楼梦 1 使用 xxxxxx 方法 2 输入 数据 3 出现 错误 java 可在此输入示例代码 data public class student private string name private liststring books public class jsonmain public static void mainstring args try string str name zhangsan books 西游记 student json1 jsonobjectparseobjectstr studentclass systemoutprintlnjson1 json1 catch exception e eprintstacktrace finally string str2 name zhangsan books 西游记 红楼梦 student json2 jsonobjectparseobjectstr2 studentclass systemoutprintlnjson2 json2 期待的正确结果 对您期望发生的结果进行清晰简洁的描述。 希望fastjson能支持解析单元素数组场景下不含有中括号，也能解析成功 相关日志输出 请复制并粘贴任何相关的日志输出。 comalibabafastjson2jsonexception json format error at comalibabafastjson2readerfieldreaderliststrmethodreadfieldvaluefieldreaderliststrme

In [25]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(base_model_id, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# optim = torch.optim.Adam(model.parameters(), lr=5e-5)


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

In [28]:
 def tokenize(batch):
    return tokenizer(batch["text_clean"], padding="max_length",max_length=256, truncation=True)


train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [29]:
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy=save_strategy,
    save_steps=save_steps,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    logging_steps=logging_steps,
)

In [30]:
 trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [31]:
trainer.train() 

The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: text_clean. If text_clean are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 76605
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 23940


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.327,0.293458,0.880674,[0.89224458 0.86631861],[0.89214993 0.86643265],[0.89233925 0.8662046 ]
2,0.2295,0.345603,0.887057,[0.8954163 0.87724525],[0.91866954 0.85193452],[0.87331117 0.90410598]
3,0.1658,0.356749,0.88831,[0.90264218 0.86903012],[0.87227024 0.91173637],[0.93520549 0.83014564]
4,0.1255,0.453026,0.892148,[0.90211132 0.87992675],[0.90662285 0.87458832],[0.89764448 0.88533076]
5,0.0877,0.502955,0.8916,[0.90212164 0.87854322],[0.90193028 0.87877458],[0.90231308 0.87831198]


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: text_clean. If text_clean are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25535
  Batch size = 32
Trainer is attempting to log a value of "[0.89224458 0.86631861]" of type <class 'numpy.ndarray'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.89214993 0.86643265]" of type <class 'numpy.ndarray'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.89233925 0.8662046 ]" of type <class 'numpy.ndarray'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is in

TrainOutput(global_step=23940, training_loss=0.20204814871650192, metrics={'train_runtime': 23689.098, 'train_samples_per_second': 16.169, 'train_steps_per_second': 1.011, 'total_flos': 5.03899596589824e+16, 'train_loss': 0.20204814871650192, 'epoch': 5.0})

In [32]:
eval_result = trainer.evaluate(eval_dataset=valid_dataset)



The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: text_clean. If text_clean are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25535
  Batch size = 32


Trainer is attempting to log a value of "[0.90212164 0.87854322]" of type <class 'numpy.ndarray'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.90193028 0.87877458]" of type <class 'numpy.ndarray'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.90231308 0.87831198]" of type <class 'numpy.ndarray'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


In [35]:
for key, value in sorted(eval_result.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.8915997650283924

eval_f1 = [0.90212164 0.87854322]

eval_loss = 0.5029550194740295

eval_precision = [0.90193028 0.87877458]

eval_recall = [0.90231308 0.87831198]

eval_runtime = 645.1756

eval_samples_per_second = 39.578

eval_steps_per_second = 1.237



In [33]:
trainer.save_model(model_dir + "_local") 

Saving model checkpoint to ./model1_local
Configuration saved in ./model1_local/config.json
Model weights saved in ./model1_local/pytorch_model.bin
tokenizer config file saved in ./model1_local/tokenizer_config.json
Special tokens file saved in ./model1_local/special_tokens_map.json


In [39]:
tokenizer.save_pretrained(model_dir + "_local_tokenizer") 

tokenizer config file saved in ./model1_local_tokenizer/tokenizer_config.json
Special tokens file saved in ./model1_local_tokenizer/special_tokens_map.json


('./model1_local_tokenizer/tokenizer_config.json',
 './model1_local_tokenizer/special_tokens_map.json',
 './model1_local_tokenizer/spm.model',
 './model1_local_tokenizer/added_tokens.json',
 './model1_local_tokenizer/tokenizer.json')

In [37]:
 from transformers import pipeline
    
classifier = pipeline("text-classification", model="./model1_local")

loading configuration file ./model1_local/config.json
Model config DebertaV2Config {
  "_name_or_path": "./model1_local",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "float32",
  "transformers_version": "4.21.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

loading configuration file ./model1_local/confi

In [46]:
classifier("descriptive summary hyrax 302 makes rails 526 support official and rails 526 includes a security update for polymorphicpath which is a method used in all link generation across rails linkto urlfor etc all these methods no longer support string parameters which must be converted to symbols this is a problem in 526 and 5246 our current 525 is unaffected and upgrading may see problems on random pages we should check through the application for calls to polymorphicpath urlfor linkto and any other link generating methods you can think of all hyrax links should be fixed in 302 then we can safely update to hyrax 302 and rails 526 expected behavior all links on site call their generation method with symbols or objects not strings hyrax 302 rails 526 related work blocked 1578 from going to 302 accessibility concerns")

[{'label': 'LABEL_1', 'score': 0.9999406337738037}]

In [47]:
classifier("description of the problem when i try to perform file upload in sut via the browsers running in docker the file upload popup is not opening and neither accepts any files causing scripts to fail browser and version safari operating system ran on docker webdrivermanager version v503")

[{'label': 'LABEL_0', 'score': 0.9995662569999695}]

In [49]:
classifier("after user sing up the email must be confirmed prepare a goodlooking email template develop a be to send this template after user registration develop an endpoint to check confirmation token develop a fe page with confirmation status create a cron to check accounts which havent confirmed an email and remove them")

[{'label': 'LABEL_0', 'score': 0.9972280859947205}]

In [48]:
testds[5:9]

{'text_clean': ['current layer always reported as last layer eg 150150',
  'after user sing up the email must be confirmed prepare a goodlooking email template develop a be to send this template after user registration develop an endpoint to check confirmation token develop a fe page with confirmation status create a cron to check accounts which havent confirmed an email and remove them',
  'environment server prod version id or medalcreator url 40httpsmedalcreatorunisantechapiv1versions40 architecture clientserver app version 1012210001212 describe the bug missing mandatory id key for nodes 7648 answer null value null 7773 answer null value 7782 answer null value null 7786 answer null value null 7838 answer null value null 7955 answer null value null 7956 answer null value null 8110 answer null value null 8468 answer null value null 10082 answer null value 7837 answer null value null 7955 answer null value null 7956 answer null value null',
  'kibana version 830 elasticsearch version 

Dataset({
    features: ['text_clean', 'label'],
    num_rows: 25536
})

In [138]:
del model

In [37]:
import torch
torch.cuda.empty_cache()

In [38]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tue Aug  2 11:52:53 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| 22%   25C    P8    15W / 250W |      1MiB / 11019MiB |      0%      Default |
|                               |            