Необходимые импорты

In [115]:
from prettytable import PrettyTable
from typing import Dict, Iterable, List, Tuple

import torch

from allennlp.data import (
    DataLoader,
    DatasetReader,
    Instance,
    Vocabulary,
    TextFieldTensors,
)
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.fields import LabelField, TextField, Field
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WhitespaceTokenizer
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder, Seq2VecEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
from allennlp.nn import util
from allennlp.predictors import Predictor
from allennlp.training import Trainer, GradientDescentTrainer
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.training.optimizers import AdamOptimizer
from allennlp.training.util import evaluate

from allennlp.common.util import JsonDict

from tqdm import tqdm

import pandas as pd


Импорт класса для обработки текста

In [116]:
from text_preproccessor import TextPreproccessor

DataSet Reader

In [117]:
class ClassificationExcelReader(DatasetReader):
    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        max_tokens: int = None,
        **kwargs
    ):
        super(ClassificationExcelReader, self).__init__()

        self.tokenizer = tokenizer or WhitespaceTokenizer()
        self.token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()}
            
        self.max_tokens = max_tokens

        self.text_preprocessor = TextPreproccessor()

    def text_to_instance(self, text: str, label: str = None) -> Instance:
        tokens = self.tokenizer.tokenize(text)
        text_field = TextField(tokens, self.token_indexers)
        fields = {'text': text_field}
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)

    def __get_texts_from_excel_file(self, file_path: str) -> pd.DataFrame:
        return pd.read_excel(file_path).dropna()

    def _read(self, file_path: str) -> Iterable[Instance]:
        texts_df: pd.DataFrame = self.__get_texts_from_excel_file(file_path)
        
        for row in texts_df.itertuples(index=True):
            text = self.text_preprocessor.preproccess_text(row.description)
            label = row.name

            tokens = self.tokenizer.tokenize(text)
            if self.max_tokens:
                tokens = tokens[: self.max_tokens]

            text_field = TextField(tokens, self.token_indexers)
            label_field = LabelField(label)

            fields: Dict[str, Field] = {
                "text": text_field, 
                "label": label_field
            }

            yield Instance(fields)


Classifier

In [136]:
class AllenClassifier(Model):
    def __init__(self, vocab: Vocabulary, embedder: TextFieldEmbedder, encoder: Seq2VecEncoder):
        super().__init__(vocab)

        self.embedder = embedder
        self.encoder = encoder
        num_labels = vocab.get_vocab_size("labels")
        
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(encoder.get_output_dim(), num_labels),
            torch.nn.Linear(encoder.get_output_dim(), num_labels),
            torch.nn.Linear(encoder.get_output_dim(), num_labels)
        )

        self.accuracy = CategoricalAccuracy()

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

    def forward(self, text: TextFieldTensors, label: torch.Tensor = None) -> Dict[str, torch.Tensor]:

        embedded_text = self.embedder(text)

        mask = util.get_text_field_mask(text)
        encoded_text = self.encoder(embedded_text, mask)

        logits = self.classifier(encoded_text)

        probs = torch.nn.functional.softmax(logits, dim=-1)

        output = {'probs': probs}

        if label is not None:
            self.accuracy(logits, label)

            output['loss'] = torch.nn.functional.cross_entropy(logits, label)

        return output


Функции создания необходимых объектов для обучения

In [137]:
def build_vocab(instances: Iterable[Instance]) -> Vocabulary:
    return Vocabulary.from_instances(instances)


def train_test_split(instances: List[Instance], test_size=0.2) -> Tuple[List[Instance], List[Instance]]:
    test_count = int(len(instances) * test_size)

    return instances[:-test_count], instances[-test_count:]


def build_model(vocab: Vocabulary) -> Model:
    vocab_size = vocab.get_vocab_size("tokens")
    embedder = BasicTextFieldEmbedder(
        {"tokens": Embedding(embedding_dim=10, num_embeddings=vocab_size)}
    )
    encoder = BagOfEmbeddingsEncoder(embedding_dim=10)
    return AllenClassifier(vocab, embedder, encoder)


def build_data_loaders(train_data: List[Instance], test_data: List[Instance], batches_per_epoch=4) -> Tuple[DataLoader, DataLoader]:
    train_loader = SimpleDataLoader(train_data, batches_per_epoch, shuffle=True)
    test_loader = SimpleDataLoader(test_data, batches_per_epoch, shuffle=False)
    return train_loader, test_loader


def build_trainer(model: Model, train_loader: DataLoader, test_loader: DataLoader, num_epochs=40) -> Trainer:
    parameters = [(n, p) for n, p in model.named_parameters() if p.requires_grad]

    optimizer = AdamOptimizer(parameters)

    trainer = GradientDescentTrainer(
        model=model,
        data_loader=train_loader,
        validation_data_loader=test_loader,
        num_epochs=num_epochs,
        optimizer=optimizer,
    )

    return trainer


Функция обучения

In [138]:
def run_training_loop(train_data, test_data):
    
    vocab = build_vocab(train_data + test_data)

    model = build_model(vocab)

    train_loader, test_loader = build_data_loaders(train_data, test_data)
    train_loader.index_with(vocab)
    test_loader.index_with(vocab)

    trainer = build_trainer(model, train_loader, test_loader)

    print("Starting training")
    trainer.train()
    print("Finished training")

    return model


Чтение данных из файла и их подготовка к обучению

In [121]:
TEXTS_FILEPATH = 'texts.xlsx'

In [122]:
dataset_reader = ClassificationExcelReader()
data = list(dataset_reader.read(TEXTS_FILEPATH))

In [123]:
train_data, test_data = train_test_split(data)
print(len(train_data), len(test_data))

7572 1893


In [139]:
model = run_training_loop(train_data, test_data)

building vocab:   0%|          | 0/9465 [00:00<?, ?it/s]

You provided a validation dataset but patience was set to None, meaning that early stopping is disabled


Starting training


  0%|          | 0/1893 [00:00<?, ?it/s]



RuntimeError: mat1 and mat2 shapes cannot be multiplied (4x0 and 10x0)

In [125]:
model.get_metrics()


{'accuracy': 0.0}

In [126]:
class SentenceClassifierPredictor(Predictor):
    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence": sentence})

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict["sentence"]
        return self._dataset_reader.text_to_instance(sentence)


In [131]:
from pprint import pprint
vocab = model.vocab
predictor = SentenceClassifierPredictor(model, dataset_reader)

output = predictor.predict(
    "<p>Тротуары отсутствуют, <strong>Алексеевский городской округ, посёлок Ольминского, 13</strong></p>")

table_dict = {'label': [], 'prob': []}
for label_id, prob in enumerate(output["probs"]):
    table_dict['label'].append(vocab.get_token_from_index(label_id, "labels"))
    table_dict["prob"].append(prob)
    
table = pd.DataFrame(table_dict).sort_values(by = "prob", ascending = 0)


table

Unnamed: 0,label,prob
1,Ямы и выбоины на тротуарах,0.021635
2,Нарушено дорожное покрытие (ямы) на дорогах в ...,0.017310
7,Несвоевременный (некачественный) текущий ремон...,0.017073
9,Длительное неисполнение заявок управляющей ком...,0.016900
16,Неудовлетворительное содержание контейнерной п...,0.016519
...,...,...
125,Водоотведение,0.001162
160,Незавершенное благоустройство после сдачи стро...,0.001139
156,Некачественное предоставление услуг доступа в ...,0.001105
126,Неудовлетворительное качество энергоснабжения ...,0.000942


In [140]:
!pip install openai

Collecting openai
  Downloading openai-0.26.2.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m527.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting aiohttp
  Downloading aiohttp-3.8.3-cp310-cp310-macosx_10_9_x86_64.whl (358 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.3/358.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiosignal>=1.1.2
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.3-cp310-cp310-macosx_10_9_x86_64.whl 