<a href="https://colab.research.google.com/github/olgasem10/Unsafe/blob/main/Ray_with_weight_decay.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ray[tune]

In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import numpy as np
import pandas as pd

### Данные (с добавлением спец токена)

In [None]:
# Mount Google Drive at /content/drive; the CSV files read later in this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training split and the validation split. The validation split is
# the one bound to `test_df`, i.e. it serves as the evaluation set below.
train_df = pd.read_csv('/content/drive/MyDrive/internship/data/train_randst0.csv')
test_df = pd.read_csv('/content/drive/MyDrive/internship/data/val_randst0.csv')
# Optional restriction to the text and target columns (currently disabled):
#train_df = train_df[["text", "unsafe"]]
#test_df = test_df[["text", "unsafe"]]

In [None]:
# Keep only confidently labelled training rows: `unsafe` score >= 0.8 or <= 0.2.
# `.copy()` detaches the filtered frame from the one it was sliced from, so the
# column assignments performed on `train_df` in later cells operate on an
# independent frame and do not raise SettingWithCopyWarning.
train_df = train_df.loc[(train_df['unsafe'] >= 0.8) | (train_df['unsafe'] <= 0.2)].copy()

In [None]:
# Category column name -> Russian word placed inside the '<...>' tag for that
# category. 'none' maps to an empty string.
translator = {
    'crime_real': 'криминал',
    'crime_web': 'веб-криминал',
    'drugs': 'наркотики',
    'gambling': 'азартные игры',
    'pornography': 'порнография',
    'prostitution': 'проституция',
    'slavery': 'рабство',
    'suicide': 'суицид',
    'terrorism': 'терроризм',
    'weapons': 'оружие',
    'body_shaming': 'уродства',
    'halth_shaming': 'инвалидность',
    'politics': 'политика',
    'racism': 'расизм',
    'religion': 'религия',
    'sex_minorities': 'гомосексуализм',
    'sexism': 'сексизм',
    'social': 'социальное',
    'none': '',
}

In [None]:
# Names of the per-category label columns, in the order they are processed.
col_names = [
    'crime_real', 'crime_web', 'drugs', 'gambling', 'pornography',
    'prostitution', 'slavery', 'suicide', 'terrorism', 'weapons',
    'body_shaming', 'halth_shaming', 'politics', 'racism', 'religion',
    'sex_minorities', 'sexism', 'social',
]

In [None]:
def binary(val, threshold=0.5):
    """Convert a numeric score into a hard 0/1 label.

    Scores strictly greater than ``threshold`` become 1, everything else
    becomes 0. With the default threshold this matches ``round(val)`` for
    every score in [0, 1] (including the tie 0.5 -> 0), but unlike ``round``
    the result is always 0 or 1, even for scores outside that range
    (e.g. 1.5 gives 1 rather than 2).

    Args:
        val: numeric score to binarize.
        threshold: cut-off above which the label is 1. Defaults to 0.5.

    Returns:
        int: 0 or 1.

    Raises:
        ValueError: if ``val`` is NaN (``round`` raised ValueError for NaN
            as well, so missing scores still fail loudly).
    """
    # NaN is the only value that is not equal to itself.
    if val != val:
        raise ValueError("cannot binarize NaN")
    return int(val > threshold)

In [None]:
# Turn every per-category column into hard labels, in both data frames.
for frame in (train_df, test_df):
    for name in col_names:
        frame[name] = frame[name].apply(binary)

In [None]:
# Turn the `unsafe` target column into hard labels, in both data frames.
for frame in (train_df, test_df):
    frame['unsafe'] = frame['unsafe'].apply(binary)

In [None]:
def add_special_tokens(dataframe, columns=None, mapping=None):
    """Prefix each text with a '<tag> ' marker for every category it belongs to.

    For each column in ``columns`` whose value equals 1 in a row, the tag
    ``'<' + mapping[column] + '> '`` is prepended to that row's ``text``.
    A row flagged in several categories receives all of its tags, in the
    order of ``columns`` (previously only the last matching tag survived,
    because every iteration started again from the untouched text).
    Rows without any flagged category are left exactly as they are.

    Args:
        dataframe: frame with a ``text`` column and one column per category.
            It is not modified.
        columns: category column names to inspect. Defaults to the
            module-level ``col_names``.
        mapping: dict from column name to the word placed inside the tag.
            Defaults to the module-level ``translator``.

    Returns:
        A copy of ``dataframe`` whose ``text`` column carries the tags.

    Raises:
        KeyError: if a column is missing from ``dataframe`` or ``mapping``.
    """
    if columns is None:
        columns = col_names
    if mapping is None:
        mapping = translator

    # Accumulate the tag prefix of every row; boolean masks are positional,
    # so this does not depend on the index labels being unique.
    prefix = pd.Series('', index=dataframe.index)
    for column in columns:
        flagged = (dataframe[column] == 1).to_numpy()
        prefix = prefix.mask(flagged, prefix + '<' + mapping[column] + '> ')

    new_dataframe = dataframe.copy()
    tagged = (prefix != '').to_numpy()
    if tagged.any():
        # Only tagged rows are rewritten (and cast to str), as before.
        tagged_text = prefix[tagged] + dataframe.loc[tagged, 'text'].astype(str)
        new_dataframe.loc[tagged, 'text'] = tagged_text.to_numpy()
    return new_dataframe

In [None]:
# Insert the category tags into the texts of both data frames.
train_df, test_df = (add_special_tokens(frame) for frame in (train_df, test_df))

In [None]:
# Shuffle the rows of both frames. A fixed seed keeps the resulting row order
# identical across kernel restarts, so reruns see the same data order.
SHUFFLE_SEED = 0
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED)
test_df = test_df.sample(frac=1, random_state=SHUFFLE_SEED)

In [None]:
# Plain Python lists of texts and targets for the dataset wrappers.
x_train, y_train = (train_df[column].tolist() for column in ('text', 'unsafe'))
x_test, y_test = (test_df[column].tolist() for column in ('text', 'unsafe'))

In [None]:
class UnsafeData(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its target.

    Args:
        texts: indexable collection of input texts.
        targets: indexable collection of targets, parallel to ``texts``.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding='max_length')``
            and returning a mapping of field name to values.
        max_len: length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        super().__init__()
        self.texts, self.targets = texts, targets
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of items, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the tokenizer fields plus ``labels`` as int64 tensors."""
        encoded = self.tokenizer(
            self.texts[index],
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
        )

        item = {}
        for field, values in encoded.items():
            item[field] = torch.tensor(values).long()
        item['labels'] = torch.tensor(self.targets[index]).long()
        return item

### Обучение

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction batch.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Result tuple is (precision, recall, f1, support).
    prf = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [None]:
# Identifier of the pretrained checkpoint; used to load both the tokenizer and
# the classification model.
model_name = 'DeepPavlov/rubert-base-cased-conversational'

In [None]:
# Tokenizer that belongs to the checkpoint named in `model_name`.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# Wrap both splits in UnsafeData; every text is padded/truncated to 60 tokens.
train_dataset, test_dataset = (
    UnsafeData(texts, targets, tokenizer, max_len=60)
    for texts, targets in ((x_train, y_train), (x_test, y_test))
)

In [None]:
# Show the number of items in the training and evaluation datasets.
len(train_dataset), len(test_dataset)

(15000, 2000)

In [None]:
# Inspect one encoded training item (index 10000) as a sanity check.
train_dataset[10000]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([  101,   160, 35408,   164, 11830,  6554,   132,   997,  1070, 63603,
           128,   322, 17144,  3869,   885,   883,  8801, 95818,  2933,   371,
          1199,   879,  1088,  2949, 10896, 35140,   344, 35401,   128,  3847,
          1928,   838,  2785, 10209, 32648,  3941,  1757,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'labels': tensor(1),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

In [None]:
def model_init():
    """Build a new sequence-classification model from the `model_name` checkpoint.

    Returns:
        A BertForSequenceClassification instance loaded with
        ``return_dict=True``.
    """
    model = BertForSequenceClassification.from_pretrained(
        model_name,
        return_dict=True,
    )
    return model

In [None]:
# Training configuration: outputs and logs go under /bert1, evaluation follows
# a step-based schedule, and at most two checkpoints are kept.
training_args = TrainingArguments(
    output_dir='/bert1',
    logging_dir='/bert1/logs',
    logging_steps=600,
    do_eval=True,
    evaluation_strategy='steps',
    save_steps=10000,
    save_total_limit=2,
    disable_tqdm=True,
)

# A model factory (`model_init`) is passed instead of a model instance;
# presumably so that every hyperparameter trial starts from a fresh model.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
### Metric that the hyperparameter search optimizes

def my_func(metrics):
    """Return the search objective: the 'eval_f1' entry of ``metrics``.

    Args:
        metrics: mapping of metric names to values; must contain 'eval_f1'.

    Returns:
        The value stored under 'eval_f1'.

    Raises:
        KeyError: if 'eval_f1' is absent.
    """
    objective = metrics['eval_f1']
    return objective

In [None]:
from ray import tune

In [None]:
## Scheduler; other schedulers: https://docs.ray.io/en/latest/tune/api_docs/schedulers.html

from ray.tune.schedulers import PopulationBasedTraining

# Population-based training scheduler that maximizes "eval_f1", measures time
# in "training_iteration" units and may mutate the four hyperparameters below.
# NOTE(review): the learning-rate range here (1e-6..1e-4) is narrower than the
# one sampled in my_space (1e-6..1e-3) — confirm this is intentional.
scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="eval_f1",
        mode="max",
        perturbation_interval=2,
        hyperparam_mutations={
            "weight_decay": tune.uniform(0.0, 0.1),
            "learning_rate": tune.loguniform(1e-6, 1e-4),
            "per_device_train_batch_size": tune.choice([16, 32, 64]),
            "num_train_epochs": tune.choice([1,2])

        })

In [None]:
### Search algorithm; other algorithms: https://docs.ray.io/en/master/tune/api_docs/suggestion.html

# Ray 2.x moved the search algorithms from `ray.tune.suggest` to
# `ray.tune.search`. The install cell does not pin a Ray version, so try the
# current location first and fall back to the legacy one.
try:
    from ray.tune.search.hyperopt import HyperOptSearch
except ImportError:
    from ray.tune.suggest.hyperopt import HyperOptSearch

# HyperOpt-based searcher that maximizes "eval_f1".
alg = HyperOptSearch(metric='eval_f1', mode='max')

In [None]:
### Stopper (alternatives: see the ray.tune.stopper module)

from ray.tune.stopper import TrialPlateauStopper

# Stops a trial based on its "eval_f1" history, with a grace period of 3.
# NOTE(review): exact plateau criteria (window size, tolerance) are left at the
# library defaults — confirm they suit runs this short.
stopper = TrialPlateauStopper(metric = 'eval_f1', mode = 'max', grace_period = 3)

In [None]:
### Function that defines the hyperparameter search space

def my_space(arg=None):
    """Build the hyperparameter search space.

    Args:
        arg: unused; accepted so the function can be called with one
            positional argument.

    Returns:
        dict mapping hyperparameter names to Ray Tune sampling definitions,
        in the order: learning_rate, num_train_epochs,
        per_device_train_batch_size, weight_decay.
    """
    space = {}
    space["learning_rate"] = tune.loguniform(1e-6, 1e-3)
    space["num_train_epochs"] = tune.choice([1, 2])
    # Disabled dimension: "seed": tune.uniform(1, 40)
    space["per_device_train_batch_size"] = tune.choice([16, 32, 64])
    space["weight_decay"] = tune.uniform(0.0, 0.1)
    return space

In [None]:
# Run the hyperparameter search on the Ray backend: 2 trials drawn from
# my_space, maximizing the objective returned by my_func ("eval_f1").
# NOTE(review): a search algorithm (`alg`) and a PopulationBasedTraining
# scheduler are passed together — confirm against the Ray Tune documentation
# that the installed Ray version supports this combination.
best_run = trainer.hyperparameter_search(
    hp_space = my_space,
    n_trials=2,
    compute_objective = my_func,
    direction="maximize",
    backend = 'ray',
    search_alg=alg,
    scheduler=scheduler,
    stop = stopper,
    keep_checkpoints_num=1
    
)



== Status ==
Memory usage on this node: 6.3/12.7 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 1/2 CPUs, 0/0 GPUs, 0.0/7.52 GiB heap, 0.0/2.59 GiB objects
Result logdir: /root/ray_results/_inner_2021-03-18_10-46-27
Number of trials: 1/2 (1 RUNNING)
+-----------------+----------+-------+-----------------+--------------------+-------------------------------+----------------+
| Trial name      | status   | loc   |   learning_rate |   num_train_epochs |   per_device_train_batch_size |   weight_decay |
|-----------------+----------+-------+-----------------+--------------------+-------------------------------+----------------|
| _inner_32774bb4 | RUNNING  |       |     8.94172e-05 |                  2 |                            16 |      0.0749374 |
+-----------------+----------+-------+-----------------+--------------------+-------------------------------+----------------+




[2m[36m(pid=1542)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(pid=1542)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(pid=1541)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.weight', 'classifier.bias']
[2m[36m(pid=1541)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(pid=1542)[0m Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
[2m[36m(pid=1542)[0m Using deprecated `--per_gpu_train_batch_size` argument which will be

KeyboardInterrupt: ignored