imports

In [None]:
import ast
import seaborn as sns
from plotly.offline import download_plotlyjs, init_notebook_mode, plot
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from tg.grammar_ru.common import Loc
from tg.grammar_ru.corpus import CorpusReader, CorpusBuilder, BucketCorpusBalancer
from tg.grammar_ru.corpus.corpus_reader import read_data
import os
from pathlib import Path
from dotenv import load_dotenv
from tg.grammar_ru.components.yandex_storage.s3_yandex_helpers import S3YandexHandler
from tg.grammar_ru.components.yandex_delivery.training_logs import S3TrainingLogsLoader, TrainingLogsViewer

from yo_fluq_ds import Queryable, Query, fluq
import plotly.express as px
from tg.grammar_ru.common import Separator

from typing import List, Union
import numpy as np
import math
import pandas as pd
from sklearn.metrics import confusion_matrix

pd.set_option('display.max_rows', 500)
load_dotenv(Loc.root_path / 'environment.env')


def get_tasks(bucket, tasks_list_s3_path):
    """Download a task-list file from S3 and parse it as a Python literal.

    The file is stored under ``Loc.temp_path`` using the last '/'-separated
    component of ``tasks_list_s3_path`` as its name, then its text is parsed
    with ``ast.literal_eval``.

    Args:
        bucket: bucket name passed to ``S3YandexHandler.download_file``.
        tasks_list_s3_path: '/'-separated key of the task-list file.

    Returns:
        The literal stored in the file (the notebook indexes it as ``tasks[0]``).
    """
    file_name = tasks_list_s3_path.split('/')[-1]
    local_copy = Loc.temp_path / file_name
    S3YandexHandler.download_file(bucket, tasks_list_s3_path, local_copy)
    with open(local_copy, 'r') as task_file:
        raw_literal = task_file.read()
    return ast.literal_eval(raw_literal)


def plot_metrics(metrics, title=""):
    """Plot the 'accuracy_display' and 'accuracy_test' curves of a training run.

    Args:
        metrics: metrics object accepted by ``TrainingLogsViewer.get_metric_by_job``
            (in this notebook it comes from ``S3TrainingLogsLoader.load_metrics``).
        title: figure title; defaults to an empty title.
    """
    for metric_name in ('accuracy_display', 'accuracy_test'):
        plt.plot(TrainingLogsViewer.get_metric_by_job(metrics, metric_name),
                 label=metric_name)
    # The title argument used to be ignored (the title was hard-coded to '').
    plt.title(title)
    plt.legend()
    plt.show()


def plot_cm(cm):
    """Show a confusion matrix as an annotated Plotly heatmap.

    Args:
        cm: matrix to draw; its ``columns`` and ``index`` attributes supply the
            x and y axis labels (the notebook passes a pandas DataFrame).
    """
    heatmap = go.Heatmap(
        z=cm,
        text=cm,  # the cell values are also printed inside the cells
        x=cm.columns,
        y=cm.index,
        texttemplate="%{text}",
        colorscale='Blues',
    )
    figure = go.Figure(data=heatmap)
    figure.show()


In [None]:
# Project, dataset and S3 bucket names used by the cells below.
project_name = 'agreementproject'
dataset_name = 'agreement_adj_mid+_mystemless_0_declination'
bucket = 'agreementadjbucket'

In [None]:
def get_label(s):
    """Return the integer class number encoded in a column name.

    Example: 'predicted_label_7' -> 7.
    """
    return int(s.split('_label_')[1])

def get_true_and_pred(result_df):
    """Derive true and predicted class numbers from per-class score columns.

    Columns whose name contains 'predicted_label_' / 'true_label_' are treated
    as per-class scores; the class of a row is the number in the name of its
    highest-scoring column.

    Side effect: adds (or overwrites) the columns 'pred_label', 'true_label'
    and 'pred_score' (the highest predicted score of the row) in ``result_df``.

    Args:
        result_df: DataFrame with 'predicted_label_<n>' and 'true_label_<n>' columns.

    Returns:
        Tuple ``(y_true, y_pred)`` of integer Series aligned with ``result_df``.
    """
    # Match on the trailing underscore so that the derived 'true_label' column
    # written below is not picked up as a score column on a repeated call
    # (previously that raised IndexError inside get_label).
    pred_col_names = [c for c in result_df.columns if 'predicted_label_' in c]
    true_col_names = [c for c in result_df.columns if 'true_label_' in c]

    y_pred = result_df[pred_col_names].idxmax(axis="columns").apply(get_label)
    y_true = result_df[true_col_names].idxmax(axis="columns").apply(get_label)

    result_df['pred_label'] = y_pred
    result_df['true_label'] = y_true
    result_df['pred_score'] = result_df[pred_col_names].max(axis=1)

    return y_true, y_pred

In [None]:
def get_worst_words_sents(result_df, src, true_label: int, pred_label: int, worst_words_cnt: int):
    """Pick the most confident confusions of ``true_label`` with ``pred_label``.

    Rows of ``result_df`` with the given true/predicted label pair are ranked by
    their 'predicted_label_<pred_label>' score; rows scoring at least as high as
    the ``worst_words_cnt``-th best one are kept.

    Args:
        result_df: frame with 'true_label', 'pred_label', 'word_id' and
            'predicted_label_<n>' columns.
        src: frame with 'word_id', 'sentence_id' and 'word' columns.
        true_label: actual class number.
        pred_label: class number that was predicted instead.
        worst_words_cnt: maximum number of words to return.

    Returns:
        Tuple ``(worst_words, worst_sents_df)``: the selected words
        (columns 'word_id', 'sentence_id', 'word'; at most ``worst_words_cnt``
        rows, in ``src`` order) and all ``src`` rows of their sentences.
    """
    score_col = f'predicted_label_{pred_label}'
    is_confusion = (result_df.true_label == true_label) & (
        result_df.pred_label == pred_label)
    confused = result_df[is_confusion]

    top_scores = confused[score_col].sort_values(ascending=False).head(worst_words_cnt)
    cutoff = top_scores.min()
    most_confident = confused[confused[score_col] >= cutoff]

    is_worst_word = src.word_id.isin(most_confident.word_id)
    worst_words = src.loc[is_worst_word, ['word_id', 'sentence_id', 'word']][:worst_words_cnt]

    sentence_ids = worst_words['sentence_id'].unique()
    worst_sents_df = src[src.sentence_id.isin(sentence_ids)]
    # worst_sents_df.loc[worst_sents_df.index, 'pred_score'] = -1
    # worst_sents_df.loc[worst_sents_df[worst_sents_df.word_id.isin(worst_mistakes_scores.word_id)].index, "pred_score"] = one_inst_another.pred_score.values
    return worst_words, worst_sents_df

def get_best_words_sents(result_df, src, pred_label: int, words_cnt: int):
    """Find words the network answered correctly with the highest confidence.

    Only rows where both the true and the predicted label equal ``pred_label``
    are considered; they are ranked by their 'predicted_label_<pred_label>'
    score and rows scoring at least as high as the ``words_cnt``-th best one
    are kept.

    Args:
        result_df: frame with 'true_label', 'pred_label', 'word_id' and
            'predicted_label_<n>' columns.
        src: frame with 'word_id', 'sentence_id' and 'word' columns.
        pred_label: class number to inspect.
        words_cnt: maximum number of words to return.

    Returns:
        Tuple ``(best_words, best_sents_df)``: the selected words
        (columns 'word_id', 'sentence_id', 'word'; at most ``words_cnt`` rows,
        in ``src`` order) and all ``src`` rows of their sentences.
    """
    score_col = f'predicted_label_{pred_label}'
    # Require the prediction to match as well: filtering on true_label alone
    # let mispredicted rows through, contradicting the "answer is correct"
    # contract of this function.
    correct_df = result_df[(result_df.true_label == pred_label) &
                           (result_df.pred_label == pred_label)]
    thrsh = correct_df[score_col].sort_values(ascending=False).head(words_cnt).min()
    best_scores = correct_df[correct_df[score_col] >= thrsh]
    best_words = (src[src.word_id.isin(best_scores.word_id)])[
        ['word_id', 'sentence_id', 'word']][:words_cnt]
    best_sents = best_words.sentence_id.unique()
    best_sents_df = src[src.sentence_id.isin(best_sents)]
    return best_words, best_sents_df

### Filter bundle

In [None]:
# Endings of the set named `new`; superlative-form endings are not included here.
new = {
    'ая', 'ого', 'ое', 'ой', 'ом', 'ому', 'ою',
    'ую', 'ые', 'ый', 'ым', 'ыми', 'ых',
}

# Example words: полнейшей, наипрочнейшего, важнейшие,меньшим, милейший, наистраннейшее, новейших, малейшем, слабейшему, меньшими
good = {
    'ая', 'его', 'ее', 'ей', 'ем', 'ему', 'ие',
    'ий', 'им', 'ими', 'их', 'ую', 'яя', 'юю',
}

big = {
    'ая', 'ие', 'им', 'ими', 'их', 'ого',
    'ое', 'ой', 'ом', 'ому', 'ою', 'ую',
}

# Every ending gets a number: its position in the sorted union of the three sets.
POSSIBLE_ENDINGS = new | good | big
endings_nums = {ending: num for num, ending in enumerate(sorted(POSSIBLE_ENDINGS))}
num_by_ending = endings_nums  # alias of the same dict
ending_by_num = {num: ending for ending, num in endings_nums.items()}

# Numbers of the endings that belong to `new`.
new_declination_labels = {endings_nums[ending] for ending in new}


В датасете только слова 1-го типа склонения (по образцу «новый»). Возможны 13 окончаний.

In [None]:
# Restrict the ending -> number mapping to the endings listed in `new`, then display it.
new_num_by_ending = {e:num for e,num in num_by_ending.items() if e in new}
new_num_by_ending

Оставили только слова типа "Новый"

In [None]:
from tg.common import DataBundle
from tg.common.ml.batched_training import IndexedDataBundle
from tg.grammar_ru.components.plain_context_builder import PlainContextBuilder
# Locations of the filtered "0_declination" bundles (mid+ and full variants) under the data cache.
bundle_0_declination_path = Loc.data_cache_path/'bundles/agreement/mid+_mystemless_0_declination'
bundle_full_0_declination_path = Loc.data_cache_path/'bundles/agreement/full_mystemless_0_declination'


In [None]:
# db = DataBundle.load(Loc.data_cache_path/'bundles/agreement/full_mystemless')
# ids_0_type=set(db.src[db.src.declension_type==0].word_id)
# db['index'] = db.index[db.index.word_id.isin(ids_0_type) & db.index.label.isin(new_declination_labels)]
# db = db.copy()
# db.save(Loc.data_cache_path/'bundles/agreement/full_mystemless_0_declination')

In [None]:
# Load the unfiltered mid+ bundle.
# NOTE(review): the commented-out lines below look like the one-off filtering step
# (declension_type == 0 and labels from new_declination_labels) that produced the
# bundle saved at bundle_0_declination_path — confirm before deleting them.
db = DataBundle.load(Loc.data_cache_path/'bundles/agreement/mid+_mystemless')
# ids_0_type=set(db.src[db.src.declension_type==0].word_id)
# db['index'] = db.index[db.index.word_id.isin(ids_0_type) & db.index.label.isin(new_declination_labels)]
# db = db.copy()
# db.save(bundle_0_declination_path)
# # idb = IndexedDataBundle(db.index, db)
# # idb = IndexedDataBundle(db.index, db)

In [None]:
db.describe()

In [None]:
db.syntax_stats

In [None]:
db.src.info()

Проверим отфильтрованный бандл

In [None]:
# Drop the unfiltered bundle before loading the filtered one from bundle_0_declination_path.
del db
db = DataBundle.load(bundle_0_declination_path)


In [None]:
db

In [None]:
db.src[db.src.word_id.isin(db.index.word_id)].declension_type.unique()

In [None]:
# Все возможные окончания слов 0-го типа склонения. "Новый"
db.index.label.replace(ending_by_num).unique()

In [None]:
db.index.label.isin(new_declination_labels).all()

In [None]:
# word_ids = db.index.groupby('label').word_id.first()
# db.src[db.src.word_id.isin(word_ids)].word

In [None]:
# ls = db.src[db.src.word_id.isin(word_ids)].label
# ls[~ls.isin(new_num_by_ending.values())]#.replace(ending_by_num))




Отправка бандла

In [None]:
project_name = 'agreementproject'
dataset_name = 'agreement_adj_mid+_mystemless_0_declination'
bucket = 'agreementadjbucket'

In [None]:
from tg.grammar_ru.components.yandex_storage.s3_yandex_helpers import S3YandexHandler
# try:
#     S3YandexHandler.create_bucket(bucket)
# except:
#     pass 

In [None]:
# Upload the filtered bundle folder to the bucket under the project's datasets prefix.
s3path = f'datasphere/{project_name}/datasets/{dataset_name}'
S3YandexHandler.upload_folder(bucket, s3path, bundle_0_declination_path)

### EDA bundle

In [None]:
# pd.read_parquet(bundle_0_declination_path/'index.parquet')

In [None]:
import plotly.express as px
fig = px.histogram(db.index.label.replace(ending_by_num), histnorm=None)
fig.show()

In [None]:
fig = px.histogram(db.index.label.replace(ending_by_num), color=db.index.split)
fig.show()

In [None]:
# db = DataBundle.load(bundle_full_0_declination_path)
# fig = px.histogram(db.index.label.replace(ending_by_num), histnorm=None)
# fig.show()

In [None]:
# {ending_by_num[num]:occ_cnt for num, occ_cnt in dict(db.index.label.value_counts()).items()}

In [None]:
project_name = 'agreementproject'
dataset_name = 'agreement_adj_mid_1st_declination'
bucket = 'agreementadjbucket'

In [None]:
# Load the training metrics of the job and plot both accuracy curves.
# NOTE(review): get_tasks is defined with two required parameters
# (bucket, tasks_list_s3_path) but is called here with none, so this cell raises
# TypeError as written — the job-info path for this run needs to be filled in.
from tg.grammar_ru.components.yandex_delivery.training_logs import S3TrainingLogsLoader, TrainingLogsViewer
tasks = get_tasks()
loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)
plt.plot(TrainingLogsViewer.get_metric_by_job(
    metrics, 'accuracy_display'), label='accuracy_display')
plt.plot(TrainingLogsViewer.get_metric_by_job(
    metrics, 'accuracy_test'), label='accuracy_test')
plt.title('')
plt.legend()

In [None]:
# Read the per-word model outputs of the first task and derive the label series.
# `task_name` was never defined in this notebook; the other result-loading cells
# use the first entry of `tasks`, so do the same here.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

In [None]:
from sklearn.metrics import confusion_matrix

# Row-normalized confusion matrix (each "actual" row sums to ~1), shown as a heatmap.
# `sorted_nums` used to be read here before any cell defined it. By default
# confusion_matrix orders its rows/columns by the sorted labels that occur in
# y_true or y_pred, so build the axis labels from the same set.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred,
                       normalize='true'
                     ).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)

import plotly.graph_objects as go
fig = go.Figure(data=go.Heatmap(z=cm,
                                text=cm,
                                x=cm.columns,
                                y=cm.index,
                                texttemplate="%{text}",
                                colorscale='Blues'))
fig.show()

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix with raw counts (no normalization), shown as a heatmap.
# `sorted_nums` used to be read here before any cell defined it. By default
# confusion_matrix orders its rows/columns by the sorted labels that occur in
# y_true or y_pred, so build the axis labels from the same set.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred,
                    #    normalize='true'
                     ).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)

import plotly.graph_objects as go
fig = go.Figure(data=go.Heatmap(z=cm,
                                text=cm,
                                x=cm.columns,
                                y=cm.index,
                                texttemplate="%{text}",
                                colorscale='Blues'))
fig.show()

In [None]:
# Words and splits of the rows labelled 16 (the ending 'ою' in ending_by_num).
# NOTE(review): in the visible notebook `src` is only assigned in a later cell
# (src = db.src), so this cell fails on a fresh top-to-bottom run.
src[src.label==16][['word','split']]


##### without mystem


In [None]:
project_name = 'agreementproject'
dataset_name = 'agreement_adj_mid+_mystemless_1st_declination'
bucket = 'agreementadjbucket'

In [None]:
# Load the task list of the run, its metrics and the per-word outputs of the
# first task, then plot both accuracy curves.
from tg.grammar_ru.components.yandex_delivery.training_logs import S3TrainingLogsLoader, TrainingLogsViewer
tasks = get_tasks(bucket, 'datasphere/agreementproject/job_info/job_agreementproject_08:21:13.312924.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# NOTE(review): presumably the results archive was already unpacked into this folder — confirm.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
# Also adds the pred_label / true_label / pred_score columns to result_df.
y_true, y_pred = get_true_and_pred(result_df)


plt.plot(TrainingLogsViewer.get_metric_by_job(
    metrics, 'accuracy_display'), label='accuracy_display')
plt.plot(TrainingLogsViewer.get_metric_by_job(
    metrics, 'accuracy_test'), label='accuracy_test')
plt.title('')
plt.legend()


In [None]:
# Confusion matrix with raw counts for the current run, shown as a heatmap.
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix
# Axis labels are built from the classes present in y_true.
# NOTE(review): this assumes y_pred contains no class absent from y_true;
# otherwise the matrix would be larger than the label lists — confirm.
sorted_nums = sorted(list(y_true.unique()))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred,
                     #    normalize='true'
                     ).round(2),
    # Each label is rendered as the tuple (class number, ending).
    columns=[f'pred {n,ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {n,ending_by_num[n]}' for n in sorted_nums]
)

fig = go.Figure(data=go.Heatmap(z=cm,
                                text=cm,
                                x=cm.columns,
                                y=cm.index,
                                texttemplate="%{text}",
                                colorscale='Blues'))
fig.show()


Некоторые классы вообще не предсказаны.
Возможно, это баг в интерпретации ответов модели с последнего слоя. Посмотрим на предсказанные числа.

In [None]:
pred_col_names = [c for c in result_df.columns if 'predicted_label' in c ]
result_df[pred_col_names].sum(axis=1).hist()

В некоторых строчках все числа близки к нулю. То есть их нельзя воспринимать как вероятности.
Добавим в сеть softmax

Softmax+Relu

In [None]:
# tasks = get_tasks(bucket, 'datasphere/agreementproject/job_info/job_agreementproject_10:15:11.216535.txt')

# loader = S3TrainingLogsLoader(bucket, project_name)
# metrics = loader.load_metrics(tasks)

# unzipped_folder = (Loc.root_path /
#                    'temp'/'training_results' /
#                    f'{tasks[0]}.unzipped')
# result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
# y_true, y_pred = get_true_and_pred(result_df)

# plot_metrics(metrics)
# sorted_nums = sorted(list(y_true.unique()))
# cm = pd.DataFrame(
#     confusion_matrix(y_true, y_pred,
#                      #    normalize='true'
#                      ).round(2),
#     columns=[f'pred {n,ending_by_num[n]}' for n in sorted_nums],
#     index=[f'actual {n,ending_by_num[n]}' for n in sorted_nums]
# )

# fig = go.Figure(data=go.Heatmap(z=cm,
#                                 text=cm,
#                                 x=cm.columns,
#                                 y=cm.index,
#                                 texttemplate="%{text}",
#                                 colorscale='Blues'))
# fig.show()

Softmax

In [None]:
# Softmax run: load the task list, metrics and per-word outputs of the first
# task, then plot the accuracy curves and the raw-count confusion matrix.
tasks = get_tasks(bucket, 'datasphere/agreementproject/job_info/job_agreementproject_09:53:58.071673.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
# Also adds the pred_label / true_label / pred_score columns to result_df.
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics)
# Axis labels are built from the classes present in y_true.
sorted_nums = sorted(list(y_true.unique()))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred,
                     #    normalize='true'
                     ).round(2),
    # Each label is rendered as the tuple (class number, ending).
    columns=[f'pred {n,ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {n,ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)


In [None]:
pred_col_names = [c for c in result_df.columns if 'predicted_label' in c ]
true_col_names = [c for c in result_df.columns if 'true_label' in c ]

In [None]:
fig = px.histogram(result_df[pred_col_names].values.reshape(-1), histnorm=None)
fig.show()

In [None]:
pred_col_names = [c for c in result_df.columns if 'predicted_label' in c ]
# result_df[pred_col_names+["pred_label", "true_label","label"]]

Некоторые классы не предсказаны.
Посмотрим на эти случаи.

In [None]:
db = DataBundle.load(Loc.data_cache_path/'bundles/agreement/mid+_mystemless_1st_declination')


In [None]:
src = db.src
src.declension_type.value_counts()

In [None]:
db.src

In [None]:
num_by_ending

In [None]:
# Highlight (in red) the most confident cases where class 19 ('ый') was
# predicted for words whose true class is 15 ('ому'), inside their sentences.
from tg.grammar_ru.common import Separator

pred_label = 19
true_label = 15
worst_words, worst_sents_df = get_worst_words_sents(
    result_df, db.src, true_label=true_label, pred_label=pred_label, worst_words_cnt=50)
# result_df, db.src, true_label=14, pred_label=13, worst_words_cnt=50)
print(f"Predicted  {ending_by_num[pred_label]}  instead of  {ending_by_num[true_label]} " )
Separator.Viewer().tooltip("word_id").color('word_id',
                                            value_to_color={
                                                wid: 'red' for wid in worst_words.word_id}
                                            ).to_html_display(worst_sents_df)


In [None]:
# result_df[(result_df.true_label==21) & (result_df.pred_label==20)]
result_df[(result_df.true_label==14) & (result_df.pred_label==13)]

In [None]:
best_words, best_sents_df = get_best_words_sents(
    result_df, db.src, pred_label=0, words_cnt=5)

Separator.Viewer().tooltip("word_id").color('word_id',
                                            value_to_color={
                                                wid: 'green' for wid in best_words.word_id}
                                            ).to_html_display(best_sents_df)

In [None]:
result_df.pred_label.nunique()

In [None]:
# from sklearn.metrics import classification_report
# classification_report(y_true, y_pred)

#### Neatly filter bundle

In [None]:
project_name = 'agreementproject'
dataset_name = 'agreement_adj_mid+_mystemless_0_declination'
bucket = 'agreementadjbucket'

In [None]:
# Run on the filtered bundle: load the task list, metrics and per-word outputs
# of the first task, then plot the accuracy curves and the raw-count confusion matrix.
tasks = get_tasks(bucket, 'datasphere/agreementproject/job_info/job_agreementproject_07:13:16.756552.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
# Also adds the pred_label / true_label / pred_score columns to result_df.
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics)
# Axis labels are built from the classes present in y_true.
sorted_nums = sorted(list(y_true.unique()))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred,
                     #    normalize='true'
                     ).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)


Основные проблемы связаны с низкочастотными классами: ом, ому, ою, ым.

ОЮ встречается всего 4 раза в датасете.

ОМ, ОМУ предсказаны как ЫЙ - самый частотный класс.

ЫМ предсказано как ЫЙ, ЫМИ.


##### Probability

In [None]:
# worst_sents_df[:100]

In [None]:
result_df.pred_score.round(2).value_counts(normalize=True)

В 14% случаев ответ - random.

In [None]:
pred_col_names = [c for c in result_df.columns if 'predicted_label' in c ]
true_col_names = [c for c in result_df.columns if 'true_label' in c ]
result_df[pred_col_names].round(2)[:20]

В некоторых строчках распределение вероятности почти равномерное по всем классам.

In [None]:
# fig = px.histogram(result_df.pred_score, histnorm=None)
# fig.show()

##### Residual

In [None]:
# Highlight (in red) the most confident cases where class 19 ('ый') was
# predicted for words whose true class is 15 ('ому'), inside their sentences.
# NOTE(review): the last visible assignment of `db` loads the
# 'mid+_mystemless_1st_declination' bundle, while result_df here comes from the
# '0_declination' run — confirm that the word_ids of the two match.
from tg.grammar_ru.common import Separator

true_label = 15
pred_label = 19
worst_words, worst_sents_df = get_worst_words_sents(
    result_df, db.src, true_label=true_label, pred_label=pred_label, worst_words_cnt=40)
print(f"Predicted  {ending_by_num[pred_label]}  instead of  {ending_by_num[true_label]} " )
Separator.Viewer().tooltip("word_id").color('word_id',
                                            value_to_color={
                                                wid: 'red' for wid in worst_words.word_id}
                                            ).to_html_display(worst_sents_df)


In [None]:
best_words, best_sents_df = get_best_words_sents(
    result_df, db.src, pred_label=0, words_cnt=5)

Separator.Viewer().tooltip("word_id").color('word_id',
                                            value_to_color={
                                                wid: 'green' for wid in best_words.word_id}
                                            ).to_html_display(best_sents_df)

In [None]:
import plotly.express as px
fig = px.histogram(result_df.label.replace(ending_by_num), histnorm=None)
fig.show()

# Выводы

Возможно, вероятности распределяются равномерно, потому что loss — MSE.
Заменим на кросс-энтропию.

ОЮ нужно выкинуть из бандла.

В бандле был slovnet, его тоже нужно выкинуть, потому что он некорректно работает на предложениях с ошибками.