### Gdown files

In [None]:
# Each `gdown <id>` call below fetches one file by its Google Drive ID.
# The IDs are opaque, so the notes describe what each file is expected to be.

# dataset
!gdown 1wb6ayDuhhqOnFLjU4qWzeohiMnv7t8RK

# cleaned version of the dataset — TODO confirm (the original note read "clear dataset")
!gdown 1vzYpVcquBvzX5Ige3klpaACQFbjEP4Ak

# id2label and label2id
!gdown 1yBppNyzNCS5tinBvlTIyuMbBDmQhmKBF
!gdown 1GvsfK3vZIBbYViI-KFPCsW-mFw4RUjqK

# contractor dataset
!gdown 1j528C3llhpycw5mqSlUO8hATZR1hzoza

# contractor id2label and label2id
!gdown 1-0o2i16oGXe8gtiV_HnZXzLGiJtfBv9T
!gdown 1FwH6xxW0KXStqkn8nfaeYnsYlars8P_M

# topic2big_topic (presumably the 'topic2big_topic.json' file opened later — verify)
!gdown 1EJfpWAHRlgGE9DdPQNYu69hoUDmbahT0

### Install packages

In [None]:
!pip install uvicorn nest_asyncio fastapi pyngrok kaleido python-multipart pydantic natasha

### Prepare models

In [None]:
# Model archives, fetched by Google Drive ID.
# NOTE(review): presumably ruBert-base.zip and ruBert-tiny-contractor.zip, which
# the following cells unzip — confirm against the Drive IDs.
!gdown 1mEE7U-eyVP2uFSR0SJrRMGLBGT_kOorb
!gdown 1lcwRNuEeW60Q6gWH7ChrblOeGu81cLK4
# request rubert-tiny model used for validation
# (presumably request_ruBert-tiny.zip, unzipped in a later cell — verify)
!gdown 1AP16hHsuogecIikCISlUeuwA5P81ki9P

In [None]:
!unzip ruBert-base.zip

In [None]:
!unzip ruBert-tiny-contractor.zip

In [None]:
!unzip request_ruBert-tiny.zip

In [7]:
from transformers import pipeline


# Topic classifier: model weights are read from the "ruBert-base" path,
# the tokenizer is referenced by the "ai-forever/ruBert-base" identifier.
topic_classifier = pipeline(
    task="text-classification",
    model="ruBert-base",
    tokenizer="ai-forever/ruBert-base",
)

# Contractor classifier: model from the "ruBert-tiny-contractor" path,
# tokenizer referenced by the "cointegrated/rubert-tiny2" identifier.
contractors_classifier = pipeline(
    task="text-classification",
    model="ruBert-tiny-contractor",
    tokenizer="cointegrated/rubert-tiny2",
)

# Spell-correction pipeline, currently disabled:
# spell_pipeline = pipeline(model='UrukHan/t5-russian-spell',
#                           task='text2text-generation', device='cuda')

# Request-validity classifier; its labels are interpreted by validate_text().
valid_pipeline = pipeline(
    task="text-classification",
    model='request_ruBert-tiny',
    tokenizer='cointegrated/rubert-tiny2',
    batch_size=64,
)

config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

### Spell checker

In [13]:
# def remove_extra_symbols(text):
#     # Убираем лишние символы в начале предложения, если модель их добавила
#     # correct_text = correct_text[correct_text.index(input_text[0]):]
#     text = text.lstrip('.,[]«»')

#     # Если модель выдает несколько одинаковых знаков препинания подряд, оставляем один
#     text = re.sub(r'([^\w\s])\1+', r'\1', text)

#     return text

In [14]:
# def get_right_text(text: str):
#     print(text)
#     text_from_model = spell_pipeline(text)[0]['generated_text']
#     print(text_from_model)
#     return remove_extra_symbols(text_from_model)

### Get Contractors


In [8]:
def get_contractor(text, topic, big_topic, ners):
    """Predict the contractor label for a request.

    The classifier input is assembled as newline-separated fields: the text,
    the topic and the topic group, each terminated by ';', followed by the
    entity string when `ners` is non-empty.

    Args:
        text: Text of the request.
        topic: Topic label assigned to the text.
        big_topic: Topic group that `topic` belongs to.
        ners: Entity string appended as a last line; skipped when falsy.

    Returns:
        The "label" field of the first result from `contractors_classifier`.
    """
    fields = [f"{text};", f"{topic};", f"{big_topic};"]
    if ners:
        fields.append(f"{ners}")
    full_text = "\n".join(fields)
    # The assembled model input is echoed to stdout.
    print(full_text)
    predictions = contractors_classifier(full_text)
    return predictions[0]["label"]

### Validate text

In [9]:
def validate_text(text: str):
    """Return a Russian-language verdict on whether `text` is a valid request.

    The text is classified with `valid_pipeline`; the label 'LABEL_1' on the
    first result is treated as "valid", any other label as "invalid".
    """
    predicted_label = valid_pipeline(text)[0]['label']
    if predicted_label != 'LABEL_1':
        return "Это невалидное обращение. Классификация может содержать ошибки"
    return "Это валидное обращение"

### Get topics

In [10]:
def get_topic(text: str):
    """Return the "label" of the first result `topic_classifier` gives for `text`."""
    predictions = topic_classifier(text)
    first_prediction = predictions[0]
    return first_prediction["label"]

### Get big topics

In [11]:
import json

# Lookup table used by get_big_topic(); parsed once, when this cell runs.
big_topic_path = 'topic2big_topic.json'

with open(big_topic_path, encoding='UTF-8') as file:
    json_dict = json.loads(file.read())

def get_id_to_label():
    """Parse 'contractors_id2label.json' (UTF-8) from the working directory and return its content."""
    with open('contractors_id2label.json', encoding='UTF-8') as file:
        return json.load(file)

def get_big_topic(topic: str) -> str:
    """Look up the topic group for `topic` in the module-level `json_dict`.

    The lookup is a plain subscript, so a topic missing from the mapping
    raises (KeyError, given that `json_dict` is a dict).
    """
    big_topic = json_dict[topic]
    return big_topic

### Natasha

In [12]:
from natasha import Doc, NewsEmbedding, NewsNERTagger, MorphVocab, Segmenter

# Natasha components shared by get_ners(); built once when this cell runs.
segmenter = Segmenter()
morph_vocab = MorphVocab()

# The NER tagger is constructed on top of the news embedding.
emb = NewsEmbedding()
ner_tagger = NewsNERTagger(emb)


def get_ners(text):
    """Collect normalized location and organisation entities found in `text`.

    The text is segmented and NER-tagged with the module-level Natasha
    components, and every span is normalized with `morph_vocab`. Only spans
    of type 'LOC' or 'ORG' are kept; other types are ignored.

    Returns:
        A dict mapping entity type to the list of its normalized forms, in
        order of first occurrence and without duplicates. Types with no
        entities are absent from the dict.
    """
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)

    for entity in doc.spans:
        entity.normalize(morph_vocab)

    kept_types = ('LOC', 'ORG')
    grouped = {}
    for entity in doc.spans:
        if entity.type not in kept_types:
            continue
        known_forms = grouped.setdefault(entity.type, [])
        if entity.normal not in known_forms:
            known_forms.append(entity.normal)
    return grouped


def get_ru_ner_labels(text):
    """Translate an entity type code ('PER', 'LOC', 'ORG') into its Russian name.

    Any other value yields the "unrecognised entity" placeholder.
    """
    known_labels = (
        ('PER', 'Персона'),
        ('LOC', 'Локация'),
        ('ORG', 'Организация'),
    )
    for code, russian_name in known_labels:
        if text == code:
            return russian_name
    return 'Неопознанная сущность'


def get_ners_pretty(ners):
    """Render an entity dict as display text, one line per entity type.

    Each line has the form "<Russian type name>: <values joined by ', '>" and
    ends with a newline. An empty dict gives an empty string.

    Args:
        ners: Dict mapping an entity type code to a list of strings.
    """
    return ''.join(
        f'{get_ru_ner_labels(ner_type)}: {", ".join(names)}\n'
        for ner_type, names in ners.items()
    )


def correct_ners(ners):
    """Flatten an entity dict into a single "TYPE: value, TYPE: value" string.

    Every value of every type becomes its own "TYPE: value" pair, in dict
    order; pairs are separated by ', '. No entities gives an empty string.

    Args:
        ners: Dict mapping an entity type code to a list of values.
    """
    pairs = []
    for ner_type, names in ners.items():
        for name in names:
            pairs.append(f'{ner_type}: {name}')
    return ', '.join(pairs)

### API

In [13]:
# SECURITY: the ngrok authtoken is a credential and must never be stored in the
# notebook. It is read from the NGROK_AUTHTOKEN environment variable, with an
# interactive (non-echoing) prompt as a fallback when the variable is not set.
# Any token that was ever committed in this cell should be revoked and reissued.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [14]:
import os
import sys
import json
import uvicorn
import nest_asyncio
import time
import re

from fastapi import FastAPI, File, UploadFile, Request, Response, HTTPException, Form
from fastapi.templating import Jinja2Templates
from fastapi.responses import FileResponse

from pydantic import BaseModel
from pyngrok import ngrok
from PIL import Image

from typing import List, Dict


# Request body model for the POST /classificate endpoint.
# Documented with comments on purpose: no docstring is added to the model class.
class TextFromUser(BaseModel):
    text: str  # the text to classify


app = FastAPI()


@app.get('/')
async def main(request: Request):
    # Landing route: replies with a hint pointing clients at /classificate.
    # `request` is accepted but not used.
    hint = {"message": "Используйте эндпоинт /classificate для классификации обращения"}
    return hint


@app.post('/classificate')
async def get_classification(text_from_user: TextFromUser):
    # Runs the full classification chain for one request text and returns the
    # results as a dict with Russian keys: topic, topic group, named entities,
    # validity verdict and contractor.
    #
    # Spell correction is disabled; the raw text is used as-is:
    # text = get_right_text(text_from_user.text)
    text = text_from_user.text

    topic = get_topic(text)
    big_topic = get_big_topic(topic)

    # Entities are rendered twice: a flat string for the contractor model and
    # a human-readable listing for the response.
    entities = get_ners(text)
    entities_for_model = correct_ners(entities)
    entities_pretty = get_ners_pretty(entities)

    validation = validate_text(text)
    contractor = get_contractor(text, topic, big_topic, entities_for_model)

    return {
        "Тема": str(topic),
        "Группа тем": str(big_topic),
        "Ners": str(entities_pretty),
        "Валидации": str(validation),
        "Исполнитель": str(contractor),
    }


# Open an ngrok tunnel to local port 8000 under the given ngrok domain and
# print the public URL the tunnel reports.
ngrok_tunnel = ngrok.connect(8000,  domain="patient-buck-weekly.ngrok-free.app")
print('Public URL:', ngrok_tunnel.public_url)
# Applied before starting the server; presumably needed because the notebook
# kernel already runs an event loop — verify.
nest_asyncio.apply()
# Serve the FastAPI app on the same port the tunnel forwards to. This call
# keeps the cell running until the server is shut down.
uvicorn.run(app, port=8000)

INFO:     Started server process [651]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Public URL: https://patient-buck-weekly.ngrok-free.app
Когда водоотведение и водоснабжения будет передано в муниципальную собственность в п. Яйва? Почему не пересматриваются тарифы ЖКХ? Ведь компания «Жилкомсервис» не справляется со своими обязанностями.;
Завышение платы за коммунальные услуги;
ЖКХ;
ORG: Жилкомсервис
INFO:     188.64.15.38:0 - "POST /classificate HTTP/1.1" 200 OK
Когда водоотведение и водоснабжения будет передано в муниципальную собственность в п. Яйва? Почему не пересматриваются тарифы ЖКХ? Ведь компания «Жилкомсервис» не справляется со своими обязанностями.;
Завышение платы за коммунальные услуги;
ЖКХ;
ORG: Жилкомсервис
INFO:     188.64.15.38:0 - "POST /classificate HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [651]
