In [1]:
import pandas as pd

In [9]:
import torch

In [10]:
# !watch -n 0.5 nvidia-smi

In [11]:
print(torch.__version__)  # 1.9.1+cu111
print(torch.version.cuda)  # 11.1
print(torch.backends.cudnn.version())  # 8005
print(torch.cuda.current_device())  # 0
print(torch.cuda.is_available())  # TRUE

1.11.0
11.3
8201
0
True


In [12]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "True"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [13]:
!nvidia-smi

Tue Aug  2 23:47:09 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| 22%   25C    P8    15W / 250W |      1MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:23:00.0 Off |                  N/A |
| 22%   25C    P8     7W / 250W |   2093MiB / 11019MiB |      0%      Default |
|       

In [14]:

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()   

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |
|  1 |  0% | 19% |
|  2 |  1% | 15% |
|  3 | 86% | 61% |
|  4 |  0% |  0% |
|  5 | 46% | 25% |
|  6 | 34% | 67% |
|  7 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |
|  1 |  1% | 19% |
|  2 |  0% | 15% |
|  3 | 80% | 61% |
|  4 |  0% |  0% |
|  5 | 46% | 25% |
|  6 | 34% | 67% |
|  7 |  0% |  0% |


In [15]:
data = pd.read_csv("TD_dataset_clean.csv" , index_col = 0)

In [16]:
data

Unnamed: 0,text_clean,label
0,look for min file instead,1
1,as an extension of 78,1
2,bountysourceplugin want to back this issue pla...,1
3,our grunt script is out of control its current...,1
4,jshint is dropping stylerelated support it see...,1
...,...,...
127686,ci is no more ok all i could see right now is ...,0
127687,agentwebfragment 打开其他网址没问题，打开httpsopenapialipa...,0
127688,this wouldnt quite be the same as an installpa...,0
127689,oh no a bug it happens thanks for reporting an...,0


In [17]:
import datasets
import transformers

print(transformers.__version__)
print(datasets.__version__)

4.21.0
2.4.0


In [18]:
import datasets
from datasets import load_dataset, Dataset, DatasetDict

In [19]:
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

In [20]:
base_model_id = "xlm-roberta-base"

epochs = 5
num_labels = 2 
learning_rate = 2e-5
train_batch_size = 16
eval_batch_size = 32
save_strategy = "no"
save_steps = 500
logging_steps = 100

model_dir = "./model"

In [21]:
import numpy as np

def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [22]:
data.dropna(inplace=True)

In [23]:
data.reset_index(inplace=True)

In [24]:
data.drop(columns= ["index"], inplace = True)

In [25]:
data

Unnamed: 0,text_clean,label
0,look for min file instead,1
1,as an extension of 78,1
2,bountysourceplugin want to back this issue pla...,1
3,our grunt script is out of control its current...,1
4,jshint is dropping stylerelated support it see...,1
...,...,...
127671,ci is no more ok all i could see right now is ...,0
127672,agentwebfragment 打开其他网址没问题，打开httpsopenapialipa...,0
127673,this wouldnt quite be the same as an installpa...,0
127674,oh no a bug it happens thanks for reporting an...,0


In [26]:
train , validate , test = train_validate_test_split(data)

In [27]:

train.set_index("label" , inplace = True)
validate.set_index("label" , inplace = True)
test.set_index("label" , inplace = True)

In [28]:
test

Unnamed: 0_level_0,text_clean
label,Unnamed: 1_level_1
0,hi again 👋🏻 describe the problem using unknown...
1,as a developer i need a way to store the state...
1,currently graphene uses a custom scalar type t...
0,describe the bug extension creates a new html ...
0,here is for fxbtcjpy press any key to continue...
...,...
0,idgeneration script can take care of creating ...
0,from google failed to create containerd task f...
0,describe the bug although freetypedll is prese...
0,sql show sqlalchemygettablecomment tables like...


In [29]:
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
testds = Dataset.from_pandas(test)

ds = DatasetDict()

ds["test"] = testds
ds["train"] = tds
ds["validate"] = vds

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 25536
    })
    train: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 76605
    })
    validate: Dataset({
        features: ['text_clean', 'label'],
        num_rows: 25535
    })
})

In [30]:
train_dataset = ds["train"]
valid_dataset = ds["validate"]

In [31]:
ds["train"][0]


{'text_clean': 'as a user when certain events happen i should get an in app notification new message from supportermenstruater app news allow users to get text message andor email notifications as well',
 'label': 0}

In [32]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(base_model_id, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# optim = torch.optim.Adam(model.parameters(), lr=5e-5)


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [34]:
 def tokenize(batch):
    return tokenizer(batch["text_clean"], padding="max_length", max_length=256,truncation=True)


train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [35]:
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy=save_strategy,
    save_steps=save_steps,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    logging_steps=logging_steps,
)

In [36]:
 trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [37]:
trainer.train() 

The following columns in the training set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text_clean. If text_clean are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 76605
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 23940


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3529,0.336046,0.860192,[0.87721832 0.837683 ],[0.85896141 0.8619012 ],[0.89626818 0.81478861]
2,0.2725,0.325831,0.872763,[0.88835435 0.85210979],[0.86915008 0.87780174],[0.90842645 0.827879 ]
3,0.2274,0.369067,0.875622,[0.88978345 0.85728408],[0.87886474 0.87130069],[0.90097688 0.8437113 ]
4,0.1893,0.401042,0.872567,[0.8879245 0.85233255],[0.87065181 0.87520969],[0.90589641 0.83062091]
5,0.1611,0.525056,0.876013,[0.8904953 0.85711707],[0.87672819 0.87504607],[0.90470167 0.83990801]


The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text_clean. If text_clean are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25535
  Batch size = 32
Trainer is attempting to log a value of "[0.87721832 0.837683  ]" of type <class 'numpy.ndarray'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.85896141 0.8619012 ]" of type <class 'numpy.ndarray'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.89626818 0.81478861]" of type <class 'numpy.ndarray'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is 

TrainOutput(global_step=23940, training_loss=0.2569883647917108, metrics={'train_runtime': 14576.8097, 'train_samples_per_second': 26.276, 'train_steps_per_second': 1.642, 'total_flos': 5.0389055989632e+16, 'train_loss': 0.2569883647917108, 'epoch': 5.0})

In [38]:
eval_result = trainer.evaluate(eval_dataset=valid_dataset)



The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text_clean. If text_clean are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25535
  Batch size = 32


Trainer is attempting to log a value of "[0.8904953  0.85711707]" of type <class 'numpy.ndarray'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.87672819 0.87504607]" of type <class 'numpy.ndarray'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.90470167 0.83990801]" of type <class 'numpy.ndarray'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


In [39]:
trainer.save_model(model_dir + "_local_roberta")

Saving model checkpoint to ./model_local_roberta
Configuration saved in ./model_local_roberta/config.json
Model weights saved in ./model_local_roberta/pytorch_model.bin
tokenizer config file saved in ./model_local_roberta/tokenizer_config.json
Special tokens file saved in ./model_local_roberta/special_tokens_map.json


In [41]:
tokenizer.save_pretrained(model_dir + "_local_tokensizer_roberta")

tokenizer config file saved in ./model_local_tokensizer_roberta/tokenizer_config.json
Special tokens file saved in ./model_local_tokensizer_roberta/special_tokens_map.json


('./model_local_tokensizer_roberta/tokenizer_config.json',
 './model_local_tokensizer_roberta/special_tokens_map.json',
 './model_local_tokensizer_roberta/sentencepiece.bpe.model',
 './model_local_tokensizer_roberta/added_tokens.json',
 './model_local_tokensizer_roberta/tokenizer.json')

In [40]:
for key, value in sorted(eval_result.items()):
    print(f"{key} = {value}\n")

epoch = 5.0

eval_accuracy = 0.8760133150577638

eval_f1 = [0.8904953  0.85711707]

eval_loss = 0.5250561237335205

eval_precision = [0.87672819 0.87504607]

eval_recall = [0.90470167 0.83990801]

eval_runtime = 197.3403

eval_samples_per_second = 129.396

eval_steps_per_second = 4.044



In [None]:
 from transformers import pipeline
    
classifier = pipeline("text-classification", model="./model_local_roberta")

In [138]:
del model

In [37]:
import torch
torch.cuda.empty_cache()

In [38]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tue Aug  2 11:52:53 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| 22%   25C    P8    15W / 250W |      1MiB / 11019MiB |      0%      Default |
|                               |            