### Import Libraries and Set the Initial Variables

In [1]:
# Install the Hugging Face transformers package; BertModel and BertTokenizer are imported from it below.
!pip install transformers

# Mount Google Drive at /content/gdrive; PATH (defined below) points into this mount.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [2]:
from transformers import BertModel, BertTokenizer
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# Constants
# Folder on the mounted Drive holding the saved weights; concatenated with MODEL_FILE_NAME when loading.
PATH = r'/content/gdrive/MyDrive/Project_spring/fin/inference/'
# Passed as max_length to the tokenizer: inputs are padded/truncated to this many tokens.
MAX_LEN = 128
# Output labels, indexed by the 0/1 result of the threshold test (0 -> "normal", 1 -> "toxic").
CLASS_NAMES = ["normal", "toxic"]
# Use CUDA when available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Sigmoid applied to the classifier output before it is compared with THRESHOLD.
SM = torch.nn.Sigmoid()

# File name of the saved state dict that is fed to model.load_state_dict.
MODEL_FILE_NAME = r'model_rbt2.bin'
# Hugging Face model id used for both the BERT encoder and the tokenizer.
MODEL_NAME = 'cointegrated/rubert-tiny2'
# A sigmoid score strictly greater than this value is labelled "toxic".
THRESHOLD = 0.91

In [3]:
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by an MLP head that emits one raw score.

    The head ends in a single-unit linear layer with no activation; in this
    notebook the caller applies a sigmoid and a threshold to that score.
    """

    def __init__(self, model_name_):
        """Load the encoder and assemble the classification head.

        Args:
            model_name_: identifier handed to ``BertModel.from_pretrained``.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name_)

        # Head layout: three (Linear -> ReLU -> BatchNorm1d) stages followed by
        # a final 1-unit Linear. The position of each module inside
        # nn.Sequential determines its state-dict key, so the order is fixed.
        widths = (self.bert.config.hidden_size, 600, 600, 300)
        head_layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            head_layers.append(nn.Linear(fan_in, fan_out))
            head_layers.append(nn.ReLU())
            head_layers.append(nn.BatchNorm1d(fan_out))
        head_layers.append(nn.Linear(widths[-1], 1))
        self.out = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and the head.

        Args:
            input_ids: token ids, forwarded to the encoder as ``input_ids``.
            attention_mask: mask forwarded to the encoder as ``attention_mask``.

        Returns:
            The head's output for the encoder's second return value; the last
            dimension has size 1 (raw score, no sigmoid applied here).
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output, per the BertModel docs) feeds the head.
        _sequence_output, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        logits = self.out(pooled_output)
        return logits

In [4]:
def model_prediction(model, tokenizer, text_for_prediction):
  """Classify one text as "normal" or "toxic".

  Args:
    model: classifier called as ``model(input_ids, attention_mask)``; it is
      switched to eval mode here as a side effect.
    tokenizer: tokenizer exposing ``encode_plus``.
    text_for_prediction: the text to classify.

  Returns:
    The entry of CLASS_NAMES selected by whether the sigmoid of the model
    output is strictly greater than THRESHOLD.
  """
  # Pad/truncate to MAX_LEN and request PyTorch tensors plus an attention mask.
  tokenizer_options = dict(
      max_length=MAX_LEN,
      add_special_tokens=True,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      truncation=True,
      return_tensors='pt',
  )
  encoding = tokenizer.encode_plus(text_for_prediction, **tokenizer_options)

  ids_on_device = encoding['input_ids'].to(DEVICE)
  mask_on_device = encoding['attention_mask'].to(DEVICE)

  # Inference only: eval mode and no gradient tracking.
  model.eval()
  with torch.no_grad():
    logits = model(ids_on_device, mask_on_device)
    probabilities = SM(logits.flatten())
    label_index = (probabilities > THRESHOLD).int()

  # label_index is a one-element integer tensor, which Python accepts as a list index.
  return CLASS_NAMES[label_index]

In [5]:
# Build the classifier around the pretrained encoder, then overwrite its
# weights with the fine-tuned state dict stored on Drive.
model = SentimentClassifier(model_name_ = MODEL_NAME)
# map_location=DEVICE remaps the stored tensors onto the active device, so a
# checkpoint saved from a GPU session also loads on a CPU-only runtime.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load(PATH + MODEL_FILE_NAME, map_location=DEVICE))
model = model.to(DEVICE)

# The tokenizer comes from the same Hugging Face model id as the encoder.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

In [6]:
# Smoke test with a Russian insult (roughly "you're a blockhead"); the recorded output is 'toxic'.
model_prediction(model, tokenizer, "ты баран")

'toxic'

In [7]:
# Smoke test with a Russian compliment (roughly "the guy is awesome!"); the recorded output is 'normal'.
model_prediction(model, tokenizer, "пацан красавчик!")

'normal'