### Нарушение согласования

Начнём с поиска ошибок согласования прилагательных.

Рассматриваем 3 типа склонения прилагательных:
* новЫЙ
* хорошИЙ
* большОЙ

### 1. Построим бандл, сбалансированный по длине предложения и по корпусу.

In [None]:
from tg.grammar_ru.common import Loc
from tg.grammar_ru.corpus import CorpusReader, CorpusBuilder, BucketCorpusBalancer
from tg.grammar_ru.corpus.corpus_reader import read_data
import os
from pathlib import Path
from dotenv import load_dotenv
from tg.grammar_ru.components.yandex_storage.s3_yandex_helpers import S3YandexHandler

from yo_fluq_ds import Queryable, Query, fluq

from typing import List, Union

import math
import pandas as pd
pd.set_option('display.max_rows', 500)


In [None]:
# Adjective endings for each of the three declension types handled here.
# Each set is also exposed as a sorted list so an ending has a stable position.

# Endings of the "новый" type.
# NOTE: the 'ою' ending was dropped.
NEW = {
    "ая", "ого", "ое", "ой",
    "ом", "ому", "ую", "ые",
    "ый", "ым", "ыми", "ых",
}

# Endings of the "хороший" type (example word: "легкий").
GOOD = {
    "ая", "его", "ее", "ей", "ем",
    "ему", "ие", "ий", "им", "ими",
    "их", "ую", "яя", "юю", "ого",
    "ое", "ой", "ому", "ом",
}

# Endings of the "большой" type (example word: "золотой").
# NOTE: the 'ою' ending was dropped.
BIG = {
    "ая", "ие", "им", "ими", "их",
    "ого", "ое", "ой", "ом", "ому",
    "ую", "ые", "ым", "ыми", "ых",
}

NEW_list = sorted(NEW)
GOOD_list = sorted(GOOD)
BIG_list = sorted(BIG)

In [None]:
# Read the index frame of the `adj_full` agreement bundle from the local data cache.
index = pd.read_parquet(Loc.data_cache_path / 'bundles/agreement/adj_full/index.parquet')

In [None]:
from tg.common.ml.batched_training import IndexedDataBundle
from tg.common import DataBundle

# Load the `adj_full` bundle from the data cache and pair it with the index frame.
db = DataBundle.load(Loc.data_cache_path/'bundles/agreement/adj_full')
idb = IndexedDataBundle(index, db)
# Bare expression: shown as the cell output when run in a notebook.
idb.bundle

##### Отправка бандла

In [None]:
from tg.grammar_ru.components import PlainContextBuilder

# Context builder handed to every assembly point made by create_assembly_point().
# NOTE(review): presumably the word itself (offset 0) is excluded from its context and
# contexts are split evenly between the two directions — confirm against PlainContextBuilder.
context_builder = PlainContextBuilder(
    include_zero_offset=False,
    left_to_right_contexts_proportion=0.5
)

In [None]:
from tg.common.ml.batched_training import context as btc
from tg.grammar_ru.components import CoreExtractor

def create_assembly_point(context_length=6):
    """Build the contextual assembly point named 'features'.

    The point uses the module-level ``context_builder`` and a ``CoreExtractor``
    joined on the ``another_word_id`` column; its reduction type is switched
    to ``Dim3Folded`` before it is returned.

    Args:
        context_length: forwarded to ``ContextualAssemblyPoint`` (default 6).

    Returns:
        The configured ``btc.ContextualAssemblyPoint``.
    """
    assembly_point = btc.ContextualAssemblyPoint(
        name='features',
        context_builder=context_builder,
        extractor=CoreExtractor(join_column='another_word_id'),
        context_length=context_length,
    )
    # The member is looked up through the current value of the attribute.
    folded = assembly_point.reduction_type.Dim3Folded
    assembly_point.reduction_type = folded
    return assembly_point

In [None]:
# Assembly point used by the network factory and by TrainingTask.initialize_task:
# context length 15, hidden size 50, LSTM as the dim-3 network.
ap = create_assembly_point(context_length=15)
ap.hidden_size = 50
ap.dim_3_network_factory.network_type = btc.Dim3NetworkType.LSTM
head_factory = ap.create_network_factory()
# Kept for manual debugging; needs a `batch` object that this cell does not define.
# head = head_factory(batch)

In [None]:
import torch


def _update_sizes_with_argument(argument_name, argument, sizes, modificator):
    if argument is None:
        return sizes
    elif isinstance(argument, torch.Tensor):
        return modificator(sizes, argument.shape[1])
    elif isinstance(argument, pd.DataFrame):
        return modificator(sizes, argument.shape[1])
    elif isinstance(argument, int):
        return modificator(sizes, argument)
    else:
        raise ValueError(
            f"Argument {argument_name} is supposed to be int, Tensor or none, but was `{argument}`")


class FullyConnectedNetwork(torch.nn.Module):
    """A stack of ``torch.nn.Linear`` layers applied one after another.

    Layer widths come from ``sizes``, optionally prefixed by a width derived
    from ``input`` and suffixed by a width derived from ``output`` (both
    resolved by ``_update_sizes_with_argument``).
    """

    def __init__(self,
                 sizes: List[int],
                 input: Union[None, torch.Tensor, int] = None,
                 output: Union[None, torch.Tensor, int] = None):
        super().__init__()
        widths = _update_sizes_with_argument(
            'input', input, sizes, lambda current, width: [width] + current)
        widths = _update_sizes_with_argument(
            'output', output, widths, lambda current, width: current + [width])
        # One Linear per consecutive pair of widths, in order.
        self.layers = torch.nn.ModuleList([
            torch.nn.Linear(fan_in, fan_out)
            for fan_in, fan_out in zip(widths, widths[1:])
        ])

    def forward(self, input):
        """Pass ``input`` through every layer in sequence and return the result."""
        activations = input
        # NOTE: no activation is applied between layers (a sigmoid was tried and disabled).
        for linear in self.layers:
            activations = linear(activations)
        return activations

In [None]:
import torch
from tg.common.ml.batched_training import factories as btf


class Network(torch.nn.Module):
    """A head network followed by a single-layer fully connected classifier tail."""

    def __init__(self, head, hidden_size, batch):
        """
        Args:
            head: module applied to the batch first.
            hidden_size: input width of the tail.
            batch: sample batch; the number of distinct values in
                ``batch.index_frame.label`` becomes the tail's output width.
        """
        super().__init__()
        self.head = head
        class_count = batch.index_frame.label.nunique()
        self.tail = FullyConnectedNetwork(
            sizes=[], input=hidden_size, output=class_count)

    def forward(self, batch):
        """Return the tail applied to the head's output for ``batch``."""
        head_output = self.head(batch)
        return self.tail(head_output)


class NetworkFactory:
    """Callable that builds a ``Network`` for a given batch from an assembly point."""

    def __init__(self, assembly_point):
        self.assembly_point = assembly_point

    def __call__(self, batch):
        """Create the head via the assembly point's network factory and wrap it in ``Network``."""
        point = self.assembly_point
        build_head = point.create_network_factory()
        head = build_head(batch)
        return Network(head, point.hidden_size, batch)


# Module-level factory bound to `ap`; TrainingTask.initialize_task passes it to setup_model.
network_factory = NetworkFactory(ap)


In [None]:
from tg.common.ml import batched_training as bt
from sklearn.metrics import roc_auc_score, f1_score

class MulticlassMetrics(bt.Metric):
    """One-vs-one ROC-AUC and weighted F1 computed from per-class probability columns.

    Reads the module-level ``NEW_list``, ``GOOD_list`` and ``idb``: labels are
    shifted by ``len(NEW_list) + len(GOOD_list)`` and the class count is the
    number of distinct labels in ``idb.index_frame``.
    NOTE(review): the offset presumably reflects labels indexing the NEW, GOOD
    and BIG endings consecutively — confirm against the index builder.
    """

    def get_names(self):
        """Return the metric names, in the order ``measure`` reports them."""
        return ['roc_auc', 'f1_weighted']

    def measure(self, df, _):
        """Compute both metrics for a prediction frame.

        Args:
            df: frame with a ``label`` column and one ``predicted_label_<k>``
                column for every ``k`` in ``start_idx .. start_idx + classes - 1``.
            _: unused.

        Returns:
            ``[roc_auc, f1_weighted]``.

        Raises:
            KeyError: if an expected probability column is missing from ``df``.
        """
        prefix = 'predicted_label_'
        start_idx = len(NEW_list) + len(GOOD_list)
        class_count = idb.index_frame.label.nunique()
        # Select all probability columns at once instead of copying cell by cell.
        proba_columns = [f'{prefix}{start_idx + j}' for j in range(class_count)]
        probas = df[proba_columns].to_numpy(dtype=float)

        target = (df.label - start_idx).tolist()
        preds = probas.argmax(axis=1).tolist()
        return [
            roc_auc_score(target, probas, multi_class='ovo'),
            f1_score(target, preds, average='weighted'),
        ]

In [None]:
from typing import *
from tg.common.ml import batched_training as bt
import pandas as pd
from tg.common.ml.batched_training.factories.conventions import Conventions
import torch
from yo_fluq_ds import Obj


class MulticlassPredictionInterpreter:
    """Turns raw network output into a frame of true labels and softmax probabilities."""

    def interpret(self, input, labels, output):
        """Return a copy of ``input["index"]`` extended with label and prediction columns.

        For every column ``c`` of ``labels`` (in order) two columns are added:
        ``true_<c>`` holding ``labels[c]`` and ``predicted_<c>`` holding the
        softmax (over dim 1) of ``output`` at that column's position.
        """
        frame = input["index"].copy()
        probabilities = torch.softmax(output, dim=1)
        for position, column in enumerate(labels.columns):
            frame["true_" + column] = labels[column]
            frame["predicted_" + column] = probabilities[:, position].tolist()
        return frame

In [None]:
from tg.common.ml.batched_training.factories import CtorAdapter, TorchModelHandler
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from tg.common.ml import batched_training as bt
from tg.common.ml import dft


def get_multilabel_extractor():
    """Build the label extractor for the training batcher.

    The extractor is named after ``btf.Conventions.LabelFrame``, works on the
    index frame, takes its ``label`` column and transforms it with the default
    data-frame transformer factory (``max_values_per_category=50``).
    """
    builder = bt.PlainExtractor.build(btf.Conventions.LabelFrame).index()
    label_transformer = dft.DataFrameTransformerFactory.default_factory(
        max_values_per_category=50)
    return builder.apply(take_columns=['label'], transformer=label_transformer)


class TrainingTask(btf.TorchTrainingTask):
    """Training task for the adjective-ending classifier.

    NOTE(review): ``initialize_task`` uses the module-level ``ap`` and
    ``network_factory``; ``self.features_ap`` is created but not read anywhere
    in the visible code.
    """

    def __init__(self):
        super().__init__()
        pool = bt.MetricPool()
        self.metric_pool = pool.add(MulticlassMetrics())
        self.features_ap = create_assembly_point()
        self.settings.mini_batch_size = None

    def initialize_task(self, idb):
        """Set up the batcher (feature + label extractors) and the model for ``idb``."""
        extractors = [ap.create_extractor(), get_multilabel_extractor()]
        self.setup_batcher(idb, extractors)
        self.setup_model(network_factory, ignore_consistancy_check=True)

    def setup_model(self, network_factory, ignore_consistancy_check=False):
        """Create the ``TorchModelHandler`` and attach the multiclass interpreter to it."""
        handler = TorchModelHandler(
            network_factory,
            self.optimizer_ctor,
            self.loss_ctor,
            ignore_consistancy_check,
        )
        self.model_handler = handler
        handler.multiclass_prediction_interpreter = MulticlassPredictionInterpreter()

In [None]:
# Keep only index rows with declension_type == 2.
# NOTE(review): type 2 presumably is the BIG ending set (the metric code offsets labels
# by len(NEW_list) + len(GOOD_list) and plots BIG_list) — confirm.
idb.index_frame = idb.index_frame[idb.index_frame.declension_type == 2]

In [None]:
# Train a fresh task: 40 epochs, Adam with lr=5e-3, cross-entropy loss.
task = TrainingTask()
task.settings.epoch_count = 40
task.optimizer_ctor = CtorAdapter("torch.optim:Adam", ('params',), lr=5e-3)
task.loss_ctor = CtorAdapter("torch.nn:CrossEntropyLoss")
result = task.run(idb)

In [None]:
# Run the same task again with continue_training enabled and epoch_count set to 100.
# NOTE(review): whether 100 is the total or the number of additional epochs depends on
# the training framework — confirm.
task.settings.continue_training = True
task.settings.epoch_count = 100
result = task.run(idb)

In [None]:
import joblib

# Persist the model and the batcher from the last run into the working directory.
joblib.dump(result['output']['model'], 'adjectives_decl_big_model.pkl')
joblib.dump(result['output']['batcher'], 'adjectives_decl_big_batcher.pkl')

In [None]:
import joblib

# Reload the artifacts written by the joblib.dump cell ('adjectives_decl_big_*').
# The rest of the notebook (declension_type == 2 filter, BIG label offset, BIG_list
# confusion-matrix labels) works with the "big" model, so the matching files are loaded.
# NOTE: joblib.load unpickles arbitrary objects — only load files you trust.
model = joblib.load('adjectives_decl_big_model.pkl')
batcher = joblib.load('adjectives_decl_big_batcher.pkl')

In [None]:
# Fresh task wired with the loaded model handler and batcher, so predict() can be
# called on it without running training in this session.
task = TrainingTask()
task.model_handler = model
task.batcher = batcher 

In [None]:
# Predictions for the whole indexed bundle; later cells filter them by the `split` column.
pred_db = task.predict(idb)

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, accuracy_score, balanced_accuracy_score

def score_db(db):
    """Score a prediction frame with weighted precision/recall/F-score.

    Reads the module-level ``NEW_list``, ``GOOD_list`` and ``idb``: labels are
    shifted by ``len(NEW_list) + len(GOOD_list)`` and the class count is the
    number of distinct labels in ``idb.index_frame``.

    Args:
        db: frame with a ``label`` column and one ``predicted_label_<k>``
            column for every ``k`` in ``start_idx .. start_idx + classes - 1``.

    Returns:
        The tuple returned by ``precision_recall_fscore_support`` with
        ``average='weighted'``.

    Raises:
        KeyError: if an expected probability column is missing from ``db``.
    """
    prefix = 'predicted_label_'
    start_idx = len(NEW_list) + len(GOOD_list)
    class_count = idb.index_frame.label.nunique()
    # Select all probability columns at once instead of copying cell by cell.
    proba_columns = [f'{prefix}{start_idx + j}' for j in range(class_count)]
    probas = db[proba_columns].to_numpy(dtype=float)

    target = (db.label - start_idx).tolist()
    preds = probas.argmax(axis=1).tolist()
    # accuracy_score(target, preds) is a simpler alternative metric.
    return precision_recall_fscore_support(target, preds, average='weighted')

In [None]:
# Score only the rows whose `split` is 'test'.
score_db(pred_db[pred_db.split == 'test'])

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def plot_confusion_matrix(db):
    """Plot the row-normalized confusion matrix of a prediction frame.

    Reads the module-level ``NEW_list``, ``GOOD_list``, ``BIG_list`` and
    ``idb``: labels are shifted by ``len(NEW_list) + len(GOOD_list)``, the
    class count is the number of distinct labels in ``idb.index_frame``, and
    ``BIG_list`` supplies the axis tick labels.

    Args:
        db: frame with a ``label`` column and one ``predicted_label_<k>``
            column for every ``k`` in ``start_idx .. start_idx + classes - 1``.

    Raises:
        KeyError: if an expected probability column is missing from ``db``.
    """
    prefix = 'predicted_label_'
    start_idx = len(NEW_list) + len(GOOD_list)
    class_count = idb.index_frame.label.nunique()
    # Select all probability columns at once instead of copying cell by cell.
    proba_columns = [f'{prefix}{start_idx + j}' for j in range(class_count)]
    probas = db[proba_columns].to_numpy(dtype=float)

    target = (db.label - start_idx).tolist()
    preds = probas.argmax(axis=1).tolist()
    # Fix the class list so the matrix keeps one row/column per class even when
    # some class is absent from this subset of the data.
    cm = confusion_matrix(
        target, preds, labels=list(range(class_count)), normalize='true').round(2)
    fig, ax = plt.subplots(figsize=(10, 10))
    ConfusionMatrixDisplay(cm, display_labels=BIG_list).plot(ax=ax)

In [None]:
# Confusion matrix for the rows whose `split` is 'test'.
plot_confusion_matrix(pred_db[pred_db.split == 'test'])

In [None]:
import tg.projects.agreement.declension_extractor as de
import importlib

# Re-import the module so edits made to it on disk take effect in the running session.
importlib.reload(de)

In [None]:
from tg.projects.agreement.declension_extractor import AdjAgreementIndexBuilder
from tg.grammar_ru.features import SnowballFeaturizer, PyMorphyFeaturizer
from tg.grammar_ru import Separator

# Sample sentence containing an agreement error, turned into a bundle with the
# PyMorphy and Snowball featurizers only.
test_text = 'Это моя новый машина.'
test_text_db = Separator.build_bundle(test_text, [
    PyMorphyFeaturizer(),
    SnowballFeaturizer(),
    # Featurizers below are disabled (and not imported in this cell):
    # SlovnetFeaturizer(),
    # SyntaxTreeFeaturizer(),
    # SyntaxStatsFeaturizer(),
])
# Bare expression: shown as the cell output when run in a notebook.
test_text_db


In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, accuracy_score, balanced_accuracy_score

def condense_predictions(pred_df):
    """Reduce per-class probability columns to the most probable label and its ending.

    Args:
        pred_df: frame containing ``predicted_label_<k>`` columns.

    Returns:
        A frame (same index as ``pred_df``) with ``label`` — the integer suffix
        of the highest-valued probability column — and ``ending``, obtained
        from ``AdjAgreementIndexBuilder.get_ending_from_index(2, label)``.
    """
    prefix = 'predicted_label_'
    is_probability = pred_df.columns.str.startswith(prefix)
    probability_columns = pred_df.columns[is_probability]
    winning_columns = pred_df[probability_columns].idxmax(axis=1)
    # The label id is whatever follows the last underscore of the column name.
    label_ids = winning_columns.map(lambda name: int(name.rpartition('_')[2]))
    condensed = label_ids.to_frame('label')
    index_builder = AdjAgreementIndexBuilder()
    condensed['ending'] = condensed.label.apply(
        lambda label_id: index_builder.get_ending_from_index(2, label_id))
    return condensed

In [None]:
def get_text_suggestions(text: str):
    """Predict adjective endings for free text with the module-level ``task``.

    The text is featurized (PyMorphy + Snowball), indexed by
    ``AdjAgreementIndexBuilder.build_index(..., 2)``, run through
    ``task.predict`` and condensed to one label/ending per index row.

    Args:
        text: the text to analyze.

    Returns:
        The frame produced by ``condense_predictions``.
    """
    featurizers = [
        PyMorphyFeaturizer(),
        SnowballFeaturizer(),
    ]
    text_db = Separator.build_bundle(text, featurizers)
    index_df = AdjAgreementIndexBuilder().build_index(text_db, 2)
    input_idb = IndexedDataBundle(index_frame=index_df, bundle=text_db)
    # True labels are unknown for free text; -1 is written as a placeholder.
    # NOTE(review): presumably needed because the label extractor reads `label` — confirm.
    input_idb.index_frame['label'] = -1
    predictions = task.predict(input_idb)
    return condense_predictions(predictions)

In [None]:
# Demo: a multi-sentence text with several deliberately wrong adjective endings.
get_text_suggestions('Какая дом у вас построили? У нас на большая улице построили очень дорогую дом. Кажется, что это строение живая, в каком-то смысле слова! Однако он очень молодую, ему всего один год.')

In [None]:
# Demo: the first adjective has a wrong ending, the second one is correct.
get_text_suggestions('большая человек! молодой человек!')

In [None]:
    get_text_suggestions('большой человек! молодой человек!')

### Результат

##### Бандл

* Сбалансировали по длине предложения и по корпусу (pub и books)
* Построили фичи
* Отобрали прилагательные с помощью pymorphy & mystem
* Определили типы склонения и окончания


##### Сеть

* Собрали и запустили в ноутбуке

##### В процессе

* Доставка
