In [None]:
import re
import nltk
import numpy as np
import torch
import pprint
from detoxify import Detoxify
from transformers import pipeline as transformers_pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords

In [None]:
# Make sure the NLTK stop-word corpus is available locally, then collect the
# Russian stop words into a set for membership tests during text normalization.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('russian')}

In [None]:
# Feature-extraction pipeline. The model and the tokenizer are loaded from the
# SAME checkpoint: a tokenizer taken from a different checkpoint produces token
# ids that do not correspond to this model's embedding table.
# NOTE(review): `clf` is not referenced by any other cell visible in this notebook.
_CLF_CHECKPOINT = "nlptown/bert-base-multilingual-uncased-sentiment"
clf = transformers_pipeline(
  "feature-extraction",
  model=_CLF_CHECKPOINT,
  tokenizer=_CLF_CHECKPOINT,
)

In [None]:
class Text2EmbedingsSbert:
  """Sentence embedder: a Hugging Face encoder followed by masked mean pooling.

  Args:
    model_name: Checkpoint id handed to ``AutoModel.from_pretrained`` and
      ``AutoTokenizer.from_pretrained``. Defaults to
      ``"ai-forever/sbert_large_nlu_ru"``.
    max_length: Token limit used when tokenizing; longer inputs are truncated.
      Defaults to 24.
  """

  def __init__(self, model_name="ai-forever/sbert_large_nlu_ru", max_length=24):
    self.task = "feature-extraction"
    self.model_name = model_name
    self.max_length = max_length
    # Model and tokenizer are always loaded from the same checkpoint.
    self.model = AutoModel.from_pretrained(model_name)
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)

  def mean_pooling(self, model_output, attention_mask):
    """Average token embeddings along dimension 1, ignoring masked-out positions.

    Args:
      model_output: Indexable model output; element 0 is used as the token
        embeddings (presumably shaped (batch, tokens, hidden) - confirm
        against the loaded model).
      attention_mask: Mask with one entry per token; it is broadcast to the
        shape of the token embeddings and used as a 0/1 weight.

    Returns:
      Tensor holding the mask-weighted mean of the token embeddings.
    """
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp the denominator so a fully masked row cannot divide by zero.
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

  def embed(self, text):
    """Tokenize ``text`` (a string or a list of strings) and return pooled embeddings.

    Inputs are padded to a common length and truncated to ``self.max_length``
    tokens. The forward pass runs under ``torch.no_grad()``.
    """
    encoded_input = self.tokenizer(
      text,
      padding=True,
      truncation=True,
      max_length=self.max_length,
      return_tensors='pt',
    )
    with torch.no_grad():
      model_output = self.model(**encoded_input)
    return self.mean_pooling(model_output, encoded_input['attention_mask'])

In [None]:
# Quick demo: embed two Russian sentences in one batch; the resulting tensor
# is the cell's displayed output.
sentences = [
    'Привет! Как твои дела?',
    'А правда, что 42 твое любимое число?',
]
embeder = Text2EmbedingsSbert()
embeder.embed(sentences)

In [None]:
class ProfanityClassifier:
    """Binary toxic/neutral detector backed by a sequence-classification checkpoint."""

    def __init__(self):
        checkpoint = "SkolkovoInstitute/russian_toxicity_classifier"
        self.model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        # Class index -> human-readable label.
        self.label_map = {0: 'neutral', 1: 'toxic'}

    def detect(self, text):
        """Return True when ``text`` is judged toxic.

        The text is flagged when the highest-scoring class maps to 'toxic',
        or when the softmax probability at class index 1 exceeds 0.5.
        Expects a single text: ``.item()`` requires exactly one prediction.
        """
        batch = self.tokenizer(
            text, padding=True, truncation=True, max_length=512, return_tensors='pt'
        )
        with torch.no_grad():
            logits = self.model(**batch).logits
        class_probs = torch.softmax(logits, dim=-1)
        top_class = torch.argmax(class_probs, dim=-1).item()
        p_toxic = class_probs[0, 1].item()

        if self.label_map[top_class] == 'toxic':
            return True
        return p_toxic > 0.5

In [None]:
class TextClassificationPipeline:
    """Runs toxicity, profanity, sentiment and topic analysis on a single text.

    Toxicity and profanity come from pretrained models. Sentiment and topic
    are predicted by scikit-learn classifiers trained on sentence embeddings,
    so ``_fit_sentiment_model`` / ``_fit_topic_model`` must be called before
    ``classify_sentiment`` / ``classify_topic`` / ``process_text``.
    """

    def __init__(self):
        self.toxicity_model = Detoxify('multilingual')
        self.profanity_classifier = ProfanityClassifier()
        self.embedder = Text2EmbedingsSbert()
        self.topic_classifier = OneVsRestClassifier(LinearSVC())
        self.sentiment_classifier = LinearSVC()

    def _embed_text(self, text):
        """Normalize ``text`` and return its embedding as a squeezed NumPy array."""
        normalized = self.preprocess_text(text)
        return self.embedder.embed(normalized).cpu().numpy().squeeze()

    def _embed_corpus(self, texts):
        """Embed every text separately and stack the vectors into one array."""
        return np.array([self._embed_text(text) for text in texts])

    def _fit_topic_model(self, training_texts, training_labels):
        """Train the topic classifier on embeddings of ``training_texts``."""
        embeddings = self._embed_corpus(training_texts)

        # Train the classifier
        self.topic_classifier.fit(embeddings, training_labels)

    def _fit_sentiment_model(self, training_texts, training_labels):
        """Train the sentiment classifier on embeddings of ``training_texts``."""
        embeddings = self._embed_corpus(training_texts)
        self.sentiment_classifier.fit(embeddings, training_labels)

    def preprocess_text(self, text):
        """Lower-case ``text``, drop punctuation and remove stop words.

        Every character that is neither a word character nor whitespace is
        removed; the remaining tokens are filtered against the module-level
        ``stop_words`` set and re-joined with single spaces. Applying this
        function to its own output returns the same string.
        """
        text = re.sub(r'[^\w\s]', '', text.lower())
        tokens = text.split()
        tokens = [token for token in tokens if token not in stop_words]
        return ' '.join(tokens)

    def detect_profanity(self, text):
        """Return the profanity classifier's verdict for ``text``."""
        return self.profanity_classifier.detect(text)

    def classify_toxicity(self, text):
        """Return True when the toxicity model's 'toxicity' score exceeds 0.5."""
        results = self.toxicity_model.predict(text)
        return bool(results['toxicity'] > 0.5)

    def classify_sentiment(self, text):
        """Predict sentiment and return it as a ``(positive, negative)`` pair.

        "positive" -> (1.0, 0.0), "negative" -> (0.0, 1.0),
        "neutral" -> (0.5, 0.5); any other label yields (0.0, 0.0).
        """
        # scikit-learn estimators expect a 2-D (n_samples, n_features) array,
        # so the single embedding is reshaped into a one-row matrix.
        embedding = self._embed_text(text).reshape(1, -1)
        label = self.sentiment_classifier.predict(embedding)[0]
        positive = 1.0 if label == "positive" else 0.5 if label == "neutral" else 0.0
        negative = 1.0 if label == "negative" else 0.5 if label == "neutral" else 0.0
        return positive, negative

    def classify_topic(self, text):
        """Predict the topic label of ``text`` and return it as a string."""
        embedding = self._embed_text(text).reshape(1, -1)
        topic = str(self.topic_classifier.predict(embedding)[0])
        return topic

    def process_text(self, text):
        """Run every analysis on ``text`` and collect the results in a dict.

        All models receive the normalized text. The sentiment and topic
        methods normalize their input again, which is harmless because
        normalization is idempotent.
        """
        preprocessed_text = self.preprocess_text(text)
        result = {
            'not_normalized_text' : text,
            'normalized_text': preprocessed_text,
            'toxicity': self.classify_toxicity(preprocessed_text),
            'profanity': self.detect_profanity(preprocessed_text),
            'sentiment': self.classify_sentiment(preprocessed_text),
            'theme': self.classify_topic(preprocessed_text)
        }
        return result

In [None]:
# Shared pretty-printer for result dicts: 4-space nesting indent, lines wrapped at 50 columns.
printer = pprint.PrettyPrinter(indent=4, width=50)

def test():
  """Smoke-check the pipeline: fit both classifiers on toy data, print two results."""
  topic_examples = [
      ("Как настроить роутер?", "tech_support"),
      ("Почему интернет не работает?", "tech_support"),
      ("Верните деньги, это ужасно!", "complaint"),
      ("Хочу отменить заказ", "complaint"),
      ("Какие у вас тарифы?", "info_request"),
      ("Расскажите про акции", "info_request"),
  ]
  sentiment_examples = [
      ("Этот продукт ужасный, я очень недоволен!", "negative"),
      ("Отличный сервис, всё работает!", "positive"),
      ("Нормально, но могло быть лучше", "neutral"),
      ("Почему всё так плохо?", "negative"),
      ("Супер, очень доволен!", "positive"),
      ("Обычный продукт, ничего особенного", "neutral"),
  ]
  # Split the (text, label) pairs back into parallel lists for fitting.
  topic_texts, topic_labels = (list(column) for column in zip(*topic_examples))
  sentiment_texts, sentiment_labels = (list(column) for column in zip(*sentiment_examples))

  pipeline = TextClassificationPipeline()
  pipeline._fit_topic_model(topic_texts, topic_labels)
  pipeline._fit_sentiment_model(sentiment_texts, sentiment_labels)

  samples = (
      "Этот продукт плохой, я очень недоволен!",
      "Почему интернет не работает?",
  )
  for position, sample in enumerate(samples):
    if position:
      # Visual separator between consecutive results.
      print('/////////////')
    printer.pprint(pipeline.process_text(sample))

In [None]:
# Run the smoke check: builds the pipeline, fits both classifiers on the toy
# examples and pretty-prints the analysis of two sample texts.
test()