### Метрики энкодера на основе resnet*

### Визуализация в 3 ГК не только не дает количественных оценок точности энкодера, но и несет в себе в лучшем случае около 40% информации от выходного вектора длиной 1024.

### Необходимо ознакомиться с метриками и оценками модели энкодера. Используем:
* kMeans
* OneClass SVM
* Gaussian Mixture

### Конечная цель: оценка целесообразности применения энкодера в рамках *данной* задачи.

Что откуда качать:

* https://drive.google.com/file/d/1-oIPyg3uFT1n--MXyR4Uzx95YqR3NsNT/view?usp=sharing - дополнительные знаки - не референсные. Часть из них вырезана из видео, часть собрана вручную. Разместить в папке *data/additional_sign/*
* https://drive.google.com/file/d/1-rTwhmdUdcuPMYz8BiPQV3fiWCSJjE20/view?usp=sharing - *last_encoder_1024_98* - веса энкодера. Разместить в папке с ноутбуком.
* https://drive.google.com/file/d/1-K3ee1NbMmx_0T5uwMesStmKnZO_6mWi/view?usp=sharing - rtds с csv, содержащей инфу. Разместить в папке *data*: data/R_MERGED/.. и data/RTDS_DATASET.csv.
* https://drive.google.com/file/d/1-l3VvU-WtSoXbW_AaTFUreVD-tgXV8Q0/view?usp=sharing - стоковые знаки. Используются как референс, то есть объединяются с rtds с пометкой 'train'. Разместить так: data/STOCK_SIGNS.

In [None]:
# autoreload 
%load_ext autoreload
%autoreload 2

# core imports
import os
import sys
import random
from datetime import datetime
from pathlib import Path
from datetime import datetime

import albumentations as A
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
import seaborn as sns
import pandas as pd
import cv2
from tqdm.notebook import trange, tqdm
from sklearn.preprocessing import LabelEncoder

# Resolve the project root from the working directory of the process whose PID
# is stored in JPY_PARENT_PID (presumably the Jupyter server). This follows the
# /proc/<pid>/cwd symlink, so it only works where /proc is available.
PROJECT_ROOT = Path(os.readlink(f'/proc/{os.environ["JPY_PARENT_PID"]}/cwd'))
DATA_DIR = PROJECT_ROOT / 'SignDetectorAndClassifier' / 'data'

# Fix the state of the random number generators (NumPy, PyTorch, stdlib random)
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
%matplotlib inline
plt.rcParams["figure.figsize"] = (17, 10)
# This notebook only runs inference, so autograd is switched off globally.
torch.set_grad_enabled(False)

# Keep preferring the second GPU, but only when it really exists: on a
# single-GPU machine 'cuda:1' is an invalid device, so fall back to 'cuda:0',
# and to the CPU when CUDA is unavailable.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

# When False, the Dash apps defined below are built but not started.
PLOT = False

In [None]:
from maddrive_adas.utils.models import get_model_and_img_size
# Build the encoder and obtain its input image size from the JSON config,
# then move the model to the selected device.
encoder, img_size = get_model_and_img_size(DATA_DIR / 'encoder_config.json')
encoder = encoder.to(device)

from maddrive_adas.utils.checkpoint import load_checkpoint
# Presumably restores the trained weights from the 'last_encoder' checkpoint;
# the three extra return values are discarded here - TODO confirm what they are.
encoder, _, _, _ = load_checkpoint(encoder, filename=str(DATA_DIR / 'last_encoder'))

### Этап 1.1. Берем RTDS, из него берем *train* как *baseline*. Заменяем *valid* на *test*.

In [None]:
DATASET_PREFIX = DATA_DIR / 'ENCODER_DATASET'
RTDS_DF = pd.read_csv(DATASET_PREFIX / 'WIDE_DATASET_4_ENCODER.csv')
# Paths in the CSV are relative to the dataset folder; anchor them there.
RTDS_DF['filepath'] = RTDS_DF['filepath'].map(
    lambda rel_path: str(DATASET_PREFIX / rel_path)
)
# Keep a single row per image file.
RTDS_DF = RTDS_DF.drop_duplicates(subset=['filepath'])

# Uncomment to drop the additional (non-rtsd) signs:
# RTDS_DF = RTDS_DF[RTDS_DF['filepath'].str.contains('rtsd')]

# Sign classes this study is restricted to, grouped by leading section number.
TARGET_SIGNS = [
    '1.1', '1.6', '1.8', '1.22', '1.31', '1.33',
    '2.1', '2.2', '2.3', '2.4', '2.5',
    '3.1', '3.18', '3.20', '3.21', '3.22', '3.23', '3.24', '3.25', '3.27', '3.28', '3.31',
    '4.1.1', '4.3',
    '5.5', '5.6', '5.16', '5.19.1', '5.20',
    '6.3.2', '6.4',
    '7.3', '7.4',
]

_is_target_sign = RTDS_DF['sign'].isin(TARGET_SIGNS)
RTDS_DF = RTDS_DF[_is_target_sign]

In [None]:
RTDS_DF

In [None]:
RTDS_DF[RTDS_DF['filepath'].str.contains('rtsd')]

In [None]:
print(len(set(RTDS_DF['sign'])))
print(len(set(RTDS_DF['encoded'])))

### *train* как референс, *valid* - query для валидации.
### Этап 1.2. Формируем DataFrame отсутствующих знаков в RTDS.

In [None]:
le = LabelEncoder()

included_signs = sorted(set(RTDS_DF.sign))
print('Included signs in ENCODER_DATASET:', included_signs)
not_included_signs = sorted(set(TARGET_SIGNS) - set(RTDS_DF.sign))
print('Not included in ENCODER_DATASET:', not_included_signs)

print('Getting aditional sings...')

# Not used in the code below; kept so the variable remains available to later cells.
encode_offset = max(set(RTDS_DF['encoded'])) + 1
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the merged frame (and everything derived from it) reproducible.
files = sorted(os.listdir(DATA_DIR / 'additional_sign'))

# Sub-variants such as '3.25.x' / '3.24.x' are collapsed into their parent sign.
_collapsed_parents = ('3.25', '3.24')
_already_included = set(included_signs)

skipped_signs = []
row_list = []

for file in files:
    # The sign code is the part of the file name before the first underscore.
    sign = file.split('_')[0]

    parent = sign.rsplit('.', 1)[0]
    if parent in _collapsed_parents:
        sign = parent

    # Signs already present in the encoder dataset are not taken from here.
    if sign in _already_included:
        skipped_signs.append(sign)
        continue

    row_list.append({
        'filepath': str(DATA_DIR / 'additional_sign' / file),
        'sign': sign,
        'set': 'test',  # HANDLE ME
        'encoded': None,  # filled in by the LabelEncoder below
    })

print('Skipped signs:', skipped_signs)
additional_DF = pd.DataFrame(row_list, columns=RTDS_DF.columns)
# Fit the encoder on the union of sign names so both sources share one label space.
le.fit(list(set(additional_DF.sign).union(set(RTDS_DF.sign))))

print('Including part of additional_DF for:', sorted(set(additional_DF.sign)), 'sign.')
additional_DF = additional_DF[~additional_DF['sign'].isin(RTDS_DF['sign'])]

RTDS_DF = pd.concat([RTDS_DF, additional_DF], ignore_index=True)
# Re-encode every row so 'encoded' is consistent across the merged frame.
RTDS_DF['encoded'] = le.transform(RTDS_DF['sign'])

In [None]:
valid_subset = RTDS_DF[RTDS_DF['set'] == 'test']
# display(valid_subset)
sum(valid_subset['sign'] == '1.6')

In [None]:
print('So we got', len(set(RTDS_DF['sign'])), 'signs. Assume == 33')
# Lookup table: sign name -> integer label taken from the 'encoded' column.
LABEL_DICT = dict(zip(RTDS_DF.sign, RTDS_DF.encoded))

In [None]:
# Signs that have at least one 'train' row get their centroid from those rows.
_train_rows = RTDS_DF[RTDS_DF['set'] == 'train']
centroid_from_train_for_signs = sorted(set(_train_rows['sign']))
print('We will get centroids from TRAIN for', centroid_from_train_for_signs)
# Every other target sign has to be covered by the stock (reference) images.
centroid_from_stock_for_signs = sorted(
    set(TARGET_SIGNS).difference(centroid_from_train_for_signs)
)
print('We should get centroids from STOCK signs for', centroid_from_stock_for_signs)

### Этап 2. Формируем для отсутствующих~=**ДЛЯ ВСЕХ** знаков baseline из образцовых знаков с википедии.

In [None]:
STOCK_SIGNS_CSV_LOCATION = DATA_DIR / 'STOCK_SIGNS/STOCK_SIGNS.csv'
STOCK_SIGNS_DATAFRAME = pd.read_csv(STOCK_SIGNS_CSV_LOCATION)
STOCK_SIGNS_DATAFRAME = STOCK_SIGNS_DATAFRAME.rename(columns={'SIGN': 'sign'})


def _collapse_stock_sign(sign):
    """Map a stock sign name onto the class name used in this notebook.

    '5.19.2' is treated as '5.19.1'; sub-variants of 3.25, 3.18, 2.3 and 3.24
    (e.g. '2.3.1') are collapsed into their parent sign.
    """
    if sign == '5.19.2':
        return '5.19.1'
    parent = sign.rsplit('.', 1)[0]
    if parent in ('3.25', '3.18', '2.3', '3.24'):
        return parent
    return sign


STOCK_SIGNS_DATAFRAME['sign'] = STOCK_SIGNS_DATAFRAME['sign'].map(_collapse_stock_sign)

# Normalise Windows separators, then anchor the relative path in DATA_DIR.
STOCK_SIGNS_DATAFRAME['filepath'] = STOCK_SIGNS_DATAFRAME['filepath'].map(
    lambda rel_path: str(DATA_DIR / str(rel_path).replace('\\', '/'))
)
STOCK_SIGNS_DATAFRAME['encoded'] = [
    LABEL_DICT[sign_name] for sign_name in STOCK_SIGNS_DATAFRAME['sign']
]

# Stock images act as reference samples, hence the 'train' mark.
STOCK_SIGNS_DATAFRAME['set'] = 'train'

print('Leave only signs from', centroid_from_stock_for_signs)
_needs_stock_reference = STOCK_SIGNS_DATAFRAME['sign'].isin(centroid_from_stock_for_signs)
STOCK_SIGNS_DATAFRAME = STOCK_SIGNS_DATAFRAME[_needs_stock_reference]

display(STOCK_SIGNS_DATAFRAME)

RTDS_DF = pd.concat([RTDS_DF, STOCK_SIGNS_DATAFRAME], ignore_index=True)

### Baseline готов, тестовый датасет готов. Че хотим? Хотим получить какие-нибудь метрики.

In [None]:
from maddrive_adas.utils.transforms import get_minimal_and_augment_transforms
from utils.datasets import SignDataset

# Only the first (minimal) transform is used; the second one is discarded.
minimal_transform, _ = get_minimal_and_augment_transforms(img_size)

# Both datasets are built identically and differ only in the selected split.
_shared_dataset_kwargs = dict(
    transform=minimal_transform,
    hyp=None,
    alpha_color=144,
)

train_dataset = SignDataset(RTDS_DF, set_label='train', **_shared_dataset_kwargs)
valid_dataset = SignDataset(RTDS_DF, set_label='test', **_shared_dataset_kwargs)

In [None]:
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from maddrive_adas.utils.datasets import get_dataloader_from_dataset

@torch.no_grad()
def simpleGetAllEmbeddings(model, dataset, batch_size, dsc=''):
    """Embed every sample of ``dataset`` with ``model``.

    The notebook-level globals ``num_workers`` and ``device`` are read at call
    time, so both must be defined before this function is called.

    Args:
        model: callable applied to each batch moved to ``device``.
        dataset: dataset passed to ``get_dataloader_from_dataset``; every batch
            must unpack into ``(data, labels, info)``.
        batch_size: batch size of the (unshuffled) dataloader.
        dsc: suffix appended to the progress-bar description.

    Returns:
        Tuple ``(all_q, labels_ret, info_arr)``: the embeddings after
        ``torch.nn.functional.normalize``, the matching labels, and a list with
        one entry of per-sample additional info per embedding.

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    dataloader = get_dataloader_from_dataset(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        drop_last=False
    )

    pbar = tqdm(
        enumerate(dataloader),
        total=len(dataloader),
        position=0,
        leave=False,
        desc='Getting all embeddings...' + dsc)

    s = 0
    all_q = None
    labels_ret = None
    info_arr = []
    add_info_len = None

    for idx, (data, labels, info) in pbar:
        data = data.to(device)
        q = model(data)

        if labels.dim() == 1:
            labels = labels.unsqueeze(1)
        if all_q is None:
            # Output buffers are allocated lazily because the embedding width
            # and the label dtype are only known after the first batch.
            labels_ret = torch.zeros(
                len(dataloader.dataset),
                labels.size(1),
                device=device,
                dtype=labels.dtype,
            )
            all_q = torch.zeros(
                len(dataloader.dataset),
                q.size(1),
                device=device,
                dtype=q.dtype,
            )

        info = np.array(info)
        if add_info_len is None:
            # Number of info fields per sample, taken from the first batch.
            add_info_len = info.shape[0]

        # Transpose from (fields, batch) to one row of fields per sample.
        info_arr.extend(info.T.reshape((-1, add_info_len)))
        e = s + q.size(0)
        all_q[s:e] = q
        labels_ret[s:e] = labels
        s = e

    if all_q is None:
        raise ValueError('Cannot compute embeddings: the dataset produced no batches.')

    labels_ret = labels_ret.squeeze(1)
    all_q = torch.nn.functional.normalize(all_q)
    return all_q, labels_ret, info_arr

In [None]:
# batch_size is passed explicitly below; num_workers is read as a global
# inside simpleGetAllEmbeddings.
batch_size = 1800
num_workers = 16

# Put the encoder into evaluation mode before extracting embeddings.
encoder.eval()
train_embeddings, train_labels, train_additional_info = simpleGetAllEmbeddings(
    encoder, train_dataset, batch_size, ' for train'
)

test_embeddings, test_labels, test_additional_info = simpleGetAllEmbeddings(
    encoder, valid_dataset, batch_size, ' for valid'
)

# Show which encoded labels are present in each split.
print('Test labels:', test_labels.unique(), 'len:', len(test_labels.unique()))
print('Train labels:', train_labels.unique(), 'len:', len(train_labels.unique()))

In [None]:
print(test_embeddings.max())
print(test_embeddings.min())

In [None]:
print(len(set(RTDS_DF[RTDS_DF['set']=='train']['encoded'])))

In [None]:
# Sanity numbers per split: distinct signs, distinct encoded labels, row counts.
_rows_by_split = {
    split: RTDS_DF[RTDS_DF['set'] == split] for split in ('test', 'train')
}

for _column in ('sign', 'encoded'):
    for _split in ('test', 'train'):
        print(len(set(_rows_by_split[_split][_column])))
    if _column == 'sign':
        print()

# The row counts are printed twice, separated by an empty line.
for _repeat in range(2):
    print()
    print(len(set(_rows_by_split['test'].index)))
    print(len(_rows_by_split['train'].index))

### Выше все ок, тестовый набор содержит тест+валид, который сформирован ноутбуком *RTSD-R_MERGED.ipynb*. В валид попало много знаков пешеходного перехода, т.к. их количество значительно превосходило остальные.

## Get Centroids

In [None]:
# Reverse lookup: integer label -> sign name.
INVERSED_LABEL_DICT = {v: k for k, v in LABEL_DICT.items()}

labels_list = train_labels.cpu().numpy()
labels_set = list(set(labels_list))

embeddingsListForCentroids = train_embeddings.cpu().numpy()
centroid_location_dict_cpu = {}

p = tqdm(labels_set)
for label in p:
    p.set_description(
        f'Current label: {label} [{INVERSED_LABEL_DICT[label]}]'
    )
    mask = labels_list == label

    # Centroid = per-coordinate mean of all train embeddings with this label.
    # NumPy does the averaging (accumulated in float64) instead of a Python
    # loop over every coordinate; the result is stored as a list of floats.
    centroid_location_dict_cpu[label] = (
        embeddingsListForCentroids[mask].mean(axis=0, dtype=np.float64).tolist()
    )

# The same centroids as float32 tensors on the compute device.
centroid_location_dict_gpu = {
    key: torch.tensor(item, dtype=torch.float32, device=device)
    for key, item in centroid_location_dict_cpu.items()
}

print('Getting centroids done.')

In [None]:
def _get_nearest_centroids(
        embs, #np.array, 
        labels,
        centroid_coords_dict_gpu, #: dict[int, np.array],
    ): # -> list[tuple[float, str]]:
    """Find the nearest centroid (Euclidean distance) for every embedding.

    Args:
        embs: iterable of embedding tensors, one per sample.
        labels: true labels of ``embs``; accepted for interface compatibility
            but not needed to find the nearest centroid.
        centroid_coords_dict_gpu: mapping of encoded label -> centroid tensor;
            the tensors must live on the same device as ``embs``.

    Returns:
        One ``(distance, sign_name)`` tuple per embedding, where ``sign_name``
        is looked up in the global ``INVERSED_LABEL_DICT``.
    """
    # Use the dictionary that was passed in (not the notebook global) and fix
    # the position -> label order once, so it matches the stacked tensor rows.
    centroid_keys = list(centroid_coords_dict_gpu)
    centroid_locations = torch.stack(
        [centroid_coords_dict_gpu[key] for key in centroid_keys]
    )

    dist_sign_list = []
    for emb in tqdm(embs, total=len(embs)):
        dist = (emb - centroid_locations).pow(2).sum(-1).sqrt()
        nearest_idx = int(torch.argmin(dist))

        # `dist` is indexed by centroid position, not by label value, so the
        # reported distance must be read at the argmin position.
        predictedSign = INVERSED_LABEL_DICT[int(centroid_keys[nearest_idx])]
        dist_sign_list.append((float(dist[nearest_idx]), predictedSign))
    return dist_sign_list

# Nearest centroid (distance, sign) for every train and every test embedding.
nearest_centroid_list_for_train = _get_nearest_centroids(
    embs=train_embeddings,
    labels=train_labels,
    centroid_coords_dict_gpu=centroid_location_dict_gpu,
)

nearest_centroid_list_for_test = _get_nearest_centroids(
    embs=test_embeddings,
    labels=test_labels,
    centroid_coords_dict_gpu=centroid_location_dict_gpu,
)

### lets construct plot DataFrame

In [None]:
len(centroid_location_dict_cpu)

In [None]:
from sklearn.decomposition import PCA
dim3 = True
print('Constructing dataframe for plotting.')

coords = ['x', 'y'] + (['z'] if dim3 else [])
plot_df = pd.DataFrame(
    columns=[*coords, 'type', 'size', 'sign', 'filepath', 'color', 'marker', 'nearest_centroid']
)

reducer = PCA(
    n_components=3 if dim3 else 2,
    random_state=RANDOM_STATE
)

train_size = 2
train_type = 'train'
test_size = 2
test_type = 'test'
centroid_size = 10
centroid_type = 'centroid'

## FIT REDUCER
# The projection is fitted on the train embeddings only; centroids and test
# embeddings are projected with the same fitted reducer.
train_embeddings_ = reducer.fit_transform(train_embeddings.cpu().numpy())

from itertools import cycle
import plotly.express as px
palette = cycle(
        [*px.colors.qualitative.Dark24,
         *px.colors.qualitative.Alphabet,
         *px.colors.qualitative.Light24]
    )
# sign name -> colour; one entry per centroid.
colorDict = {}

listOfRows = []

## CENTROIDS
for k, v in tqdm(centroid_location_dict_cpu.items()):
    centroid_sign = INVERSED_LABEL_DICT[k]
    centroid_coords = reducer.transform(np.array(v).reshape(1, -1)).flatten()
    # Any image of the sign is used as the centroid's preview picture.
    path = RTDS_DF[RTDS_DF['sign'] == centroid_sign]['filepath'].values[0]
    colorDict[centroid_sign] = next(palette)
    row = pd.Series(
        [*centroid_coords, centroid_type, centroid_size, centroid_sign, path,
         colorDict[centroid_sign], 'diamond', (0.0, 'self')],
        index=plot_df.columns,
    )
    listOfRows.append(row)

## TRAIN
for idx, (fitted_coords, label, info) in tqdm(
        enumerate(
            zip(train_embeddings_,
                train_labels,
                train_additional_info)
        ),
        total=len(train_labels)
    ):
    train_sign = INVERSED_LABEL_DICT[int(label)]
    # Colour of the sample's own sign (previously the last centroid's colour
    # leaked in through the stale loop variable `k`).
    color = colorDict[train_sign]

    row = pd.Series(
        [*fitted_coords, train_type, train_size, train_sign, info[0],
         color, 'circle',
         nearest_centroid_list_for_train[idx]
        ],
        index=plot_df.columns,
    )
    listOfRows.append(row)

## TEST
test_embeddings_ = reducer.transform(test_embeddings.cpu().numpy())

for idx, (coord, label, info) in tqdm(
    enumerate(
        zip(test_embeddings_, test_labels, test_additional_info)),
    total=len(test_labels)
):
    # Same fix as for train rows. `.get` keeps a test label that has no
    # centroid (and therefore no colour) from raising; it gets None instead.
    color = colorDict.get(INVERSED_LABEL_DICT.get(int(label)))

    row = pd.Series(
        [*coord, test_type, test_size, info[1], info[0],
         color, 'circle',
         nearest_centroid_list_for_test[idx]
        ],
        index=plot_df.columns,
    )

    listOfRows.append(row)

plot_df = pd.concat([plot_df, pd.DataFrame(listOfRows)], axis=0)
# Rows were assembled from mixed-type Series, so numeric columns are cast back.
plot_df['x'] = plot_df['x'].astype(float)
plot_df['y'] = plot_df['y'].astype(float)
if 'z' in plot_df.columns:
    plot_df['z'] = plot_df['z'].astype(float)

plot_df['size'] = plot_df['size'].astype(int)

In [None]:
sum(reducer.explained_variance_ratio_)

In [None]:
import plotly.express as px
from itertools import cycle

PLOT_CENTROID_AND_TEST_ONLY = True
PLOT_LIMIT_FRAC = 1

# Optionally hide train points, then reverse the row order.
_rows_to_plot = (
    plot_df[plot_df['type'] != 'train'] if PLOT_CENTROID_AND_TEST_ONLY else plot_df
)
plot_df_ = _rows_to_plot[::-1]

# Sample the same fraction of every (sign, type) group to thin out the plot.
if PLOT_LIMIT_FRAC:
    _per_group = plot_df_.groupby(['sign', 'type'])
    plot_df_ = _per_group.sample(frac=PLOT_LIMIT_FRAC)

from jupyter_dash import JupyterDash
from dash import dcc, html, Input, Output, no_update
import base64

app = JupyterDash(__name__)

@app.callback(
    Output("graph-tooltip-5", "show"),
    Output("graph-tooltip-5", "bbox"),
    Output("graph-tooltip-5", "children"),
    Input("graph-5", "hoverData"),
)
def display_hover(hoverData):
    """Build the tooltip shown while hovering a point of the embedding plot.

    Returns the ``(show, bbox, children)`` triple for the tooltip outputs; the
    tooltip stays hidden when nothing is hovered or the image file is missing.
    """
    hidden = (False, no_update, no_update)
    if hoverData is None:
        return hidden

    point = hoverData["points"][0]
    bbox = point["bbox"]
    num = point["pointNumber"]
    # customdata entries are read by position: 0 = sign, 1 = image path,
    # 3 = (distance, sign) of the nearest centroid.
    data = point['customdata']
    sign = data[0]
    rel_img_path = data[1]
    nearest_centroid_info = data[3]

    try:
        with open(rel_img_path, 'rb') as image_file:
            image = image_file.read()
    except FileNotFoundError as exc_obj:
        print('[!] Exception', data, exc_obj)
        return hidden

    # The image is embedded directly into the page as a base64 data URI.
    encoded_image = base64.b64encode(image).decode('utf-8')
    b64sed_image = 'data:image/png;base64,' + encoded_image
    prgrph_str = (
        'Sign:' + sign
        + ', Nearest:' + str(nearest_centroid_info[1])
        + ', Dist:' + str(nearest_centroid_info[0])
    )

    preview = html.Img(
        src=b64sed_image,
        style={"width": "70px", 'display': 'block', 'margin': '0 auto'},
    )
    caption = html.P(prgrph_str, style={"fontSize": 14, 'text-align':'center'})
    path_line = html.P(rel_img_path, style={"fontSize": 10})

    children = [html.Div([preview, caption, path_line])]
    return True, bbox, children

# Keyword arguments shared by the 2-D and 3-D scatter plots.
# NOTE(review): the order of 'hover_data' presumably defines the positions of
# `customdata` that display_hover reads by index - confirm before reordering.
plot_args = {
    'x': 'x',
    'y': 'y',
    'color': 'sign',
    'size': 'size',
    'opacity': 0.2 if dim3 else 0.5,
    'symbol': 'type',
    'hover_name': 'sign',
    'hover_data': ['sign', 'filepath', 'type', 'nearest_centroid'],
    'animation_group': 'type',
    'color_discrete_sequence': [
        *px.colors.qualitative.Dark24, 
        *px.colors.qualitative.Alphabet, 
        *px.colors.qualitative.Light24
    ]
}

# Pick the plotting function matching the dimensionality of the projection.
if dim3:
    plotFcn = px.scatter_3d
    plot_args.update({'z': 'z'})
else:
    plotFcn = px.scatter
      
fig = plotFcn(
        plot_df_,
        **plot_args,

    )

# Built-in hover labels are disabled; the Dash tooltip callback replaces them.
fig.update_traces(
    hoverinfo="none", 
    hovertemplate=None,
    marker=dict(
        line=dict(
        width=0)
    )
)
    
fig.update_layout(
    width=950,
    height=950
)

## FIX Z-ORDER
# Centroid traces are restyled and moved to the end of the trace list.
# A trace counts as a centroid when its first id equals 'centroid'
# (presumably the ids come from animation_group='type' - TODO confirm).
if True:
    sampleData = list(fig.data)
    centroidsList = []
    # Iterate over a copy because matching traces are removed from sampleData.
    for t in list(sampleData[:]):
        if (t.ids[0] == 'centroid'):
            temp_t = t
            sampleData.remove(t)
            temp_t['marker']['opacity'] = 1
            # Label the centroid with the first customdata field of its first point.
            temp_t['text'] = temp_t['customdata'][0][0]
            temp_t['textposition'] = 'top center'
            temp_t['mode'] = 'markers+text'
            temp_t['marker']['line']['width'] = 40 if dim3 else 2
            temp_t['marker']['line']['color'] = 'rgb(0, 0, 0)'
            centroidsList.append(temp_t)
            
    fig.data = tuple(sampleData + centroidsList)
    
fig.update_layout(font=dict(size=18))

# The component ids must match the ones used in the display_hover callback.
app.layout = html.Div(
            className="container",
            children=[
                html.Div(html.H2("Visualization")),
                dcc.Graph(id="graph-5", figure=fig, clear_on_unhover=True),
                dcc.Tooltip(id="graph-tooltip-5", direction='bottom'),
            ],
        )
    
# The server is only started when the PLOT flag is enabled.
if __name__ == '__main__' and PLOT:
    app.run_server(mode='inline', debug=True, port=2002)

In [None]:
# Ad-hoc check: distances from the test embeddings of one sign to a centroid.
# NOTE(review): the embeddings are selected for '6.3.2' while the centroid is
# the one of '6.4' - confirm this cross-sign comparison is intentional.
sign_ = '6.3.2'
centroid = centroid_location_dict_gpu[LABEL_DICT['6.4']]
sign_embeddings = test_embeddings[test_labels == LABEL_DICT[sign_]] 
print(sign_embeddings)

# Euclidean distance of every selected embedding to the centroid.
distances = (sign_embeddings - centroid).pow(2).sum(-1).sqrt()
print(distances)

In [None]:
def getEulerDistance(a, b):
    """Return the Euclidean (L2) distance between coordinate sequences a and b.

    Coordinates are paired with ``zip``, so surplus trailing items of the
    longer sequence are ignored.
    """
    squared_total = 0
    for p, q in zip(a, b):
        squared_total = squared_total + (p - q) ** 2
    return squared_total ** .5
    
# Pairwise distances between centroids, keyed by sign name:
# distancesSign[sign_a][sign_b]. Each pair is computed only once here.
distancesSign = {}

centroidSignList = list(centroid_location_dict_cpu.keys())
for idx, ikey in enumerate(centroidSignList):
    sign_i = INVERSED_LABEL_DICT[ikey]
    # np.nan instead of np.NaN: the latter alias was removed in NumPy 2.0.
    distancesSign[sign_i] = {sign_i: np.nan}
    for jkey in centroidSignList[idx + 1:]:
        distancesSign[sign_i][INVERSED_LABEL_DICT[jkey]] = getEulerDistance(
            centroid_location_dict_cpu[ikey],
            centroid_location_dict_cpu[jkey]
        )

distancesSign = pd.DataFrame.from_dict(distancesSign)
# Only one triangle is filled so far; copying column i into row i, in order,
# mirrors it and makes the table symmetric.
for i in range(len(distancesSign)):
    distancesSign.iloc[i] = distancesSign.iloc[:, i]

In [None]:
# Heatmap of the centroid-to-centroid distance table.
figH = px.imshow(distancesSign)

app1 = JupyterDash(__name__)

# Prefix and extension used to build the path of a sign's reference image;
# the prefix is relative, so it depends on the current working directory.
PATH_PREFIX = '../data/STOCK_SIGNS/'
PATH_POSTFIX = '.png'

@app1.callback(
    Output("graph-tooltip-5", "show"),
    Output("graph-tooltip-5", "bbox"),
    Output("graph-tooltip-5", "children"),
    Input("graph-5", "hoverData"),
)
def display_hover(hoverData):
    """Build the tooltip for a cell of the centroid-distance heatmap.

    Shows the reference images of both signs of the hovered cell together with
    their names and the cell value. Returns the ``(show, bbox, children)``
    triple; the tooltip stays hidden when nothing is hovered or an image
    cannot be read.
    """
    if hoverData is None:
        return False, no_update, no_update

    hover_data = hoverData["points"][0]

    # Collapsed sign classes have no image under their own name, so one of
    # their sub-variants is shown instead. 3.25/3.24 use the same files as the
    # confusion-matrix tooltip of this notebook.
    stock_image_alias = {
        '2.3': '2.3.1',
        '3.18': '3.18.1',
        '3.25': '3.25.10',
        '3.24': '3.24.10',
    }
    for axis in ('x', 'y'):
        hover_data[axis] = stock_image_alias.get(hover_data[axis], hover_data[axis])

    x_img_path = PATH_PREFIX + hover_data['x'] + PATH_POSTFIX
    y_img_path = PATH_PREFIX + hover_data['y'] + PATH_POSTFIX

    try:
        with open(x_img_path, 'rb') as f:
            image1 = f.read()
        with open(y_img_path, 'rb') as f:
            image2 = f.read()
    except OSError:
        # Only file-access problems are expected here; anything else should
        # surface instead of being silently swallowed by a bare except.
        print(hoverData)
        return False, no_update, no_update

    img1 = 'data:image/png;base64,' + base64.b64encode(image1).decode('utf-8')
    img2 = 'data:image/png;base64,' + base64.b64encode(image2).decode('utf-8')

    children = [
        html.Div([
            html.Img(
                src=img1,
                style={"width": "70px",  'margin': '0 auto'},
            ),
            html.Img(
                src=img2,
                style={"width": "70px",  'margin': '0 auto'},
            ),
            html.P(hover_data['x'] + ':' + hover_data['y'], style={"fontSize": 14, 'text-align':'center'}),
            html.P(str(hover_data['z']), style={"fontSize": 14, 'text-align':'center'}),
        ]),
    ]
    return True, hover_data["bbox"], children

# Built-in hover labels are disabled; the Dash tooltip callback replaces them.
figH.update_traces(hoverinfo="none", hovertemplate=None)
    
figH.update_layout(
        width=600,
        height=600)

# The component ids must match the ones used in the app1 hover callback.
app1.layout = html.Div(
    className="container",
        children=[
            html.Div(html.H2("Расстояние между центроидами")),
            dcc.Graph(id="graph-5", figure=figH, clear_on_unhover=True),
            dcc.Tooltip(id="graph-tooltip-5", direction='bottom'),
            ],
    )

# The server is only started when the PLOT flag is enabled.
if __name__ == '__main__' and PLOT:
    app1.run_server(mode='inline', debug=True, port=2003)

## Confusion Matrix

* train_embeddings, train_labels, train_additional_info - в этих переменных вся инфа о тренировочных картинках (для тестовых - аналогичные переменные test_*);
* centroid_location_dict_cpu - словарь центроидов на CPU.
* centroid_location_dict_gpu - словарь центроидов на GPU.
* getEulerDistance - функция евклидова расстояния.

In [None]:
from copy import deepcopy
sign_set = sorted(set(RTDS_DF['sign']))
# cf_dict[real_sign][predicted_sign] -> number of samples, initialised to 0.
v = {v: 0 for v in sign_set}
cf_dict = {k: deepcopy(v) for k in sign_set}

# Switch to True to build the matrix on the train split instead of test.
CHECK_TRAIN = False
embs = train_embeddings if CHECK_TRAIN else test_embeddings
labels_per_embeddings = train_labels if CHECK_TRAIN else test_labels
labels_per_embeddings = labels_per_embeddings.cpu().tolist()

# Position in the stacked centroid tensor -> encoded label of that centroid.
centroid_index_to_key = {int(index): int(val) for index, val in enumerate(centroid_location_dict_gpu.keys())}
centroid_locations = torch.stack([centroid_location_dict_gpu[label] for _, label in centroid_index_to_key.items()])

for i, emb in tqdm(
        enumerate(embs),
        total=len(embs)
    ):

    # The predicted sign is the sign of the nearest centroid (Euclidean).
    dist = (emb - centroid_locations).pow(2).sum(-1).sqrt()
    key = centroid_index_to_key[int(torch.argmin(dist))]

    realSign = INVERSED_LABEL_DICT[labels_per_embeddings[i]]
    predictedSign = INVERSED_LABEL_DICT[key]

    cf_dict[realSign][predictedSign] += 1

# Layout: columns = real sign, rows = predicted sign.
cf_df = pd.DataFrame(
    columns=sign_set,
    index=sign_set
)

for real_sign, predicted_signs in cf_dict.items():
    for predicted_sign, val in predicted_signs.items():
        # Single .loc write instead of chained `cf_df[col][row] = val`, which
        # is not guaranteed to modify cf_df (and never does with Copy-on-Write).
        cf_df.loc[predicted_sign, real_sign] = val
        

In [None]:
cf_dict['2.2']

In [None]:
# Normalise every row by its total and blank out zero cells. The division is
# done frame-wise: a per-row `apply` returning a Series for some rows and a
# scalar for others does not reliably produce a DataFrame.
_cf_counts = cf_df.astype(float)
_row_totals = _cf_counts.sum(axis=1)
fig_cf = px.imshow(
    # Rows with a zero total become NaN instead of dividing by zero.
    _cf_counts.div(_row_totals.where(_row_totals != 0), axis=0).replace(0, np.nan),
    color_continuous_scale=px.colors.sequential.Cividis_r,
)

app2 = JupyterDash(__name__ + 'cf')

# Prefix and extension used to build the path of a sign's reference image;
# the prefix is relative, so it depends on the current working directory.
PATH_PREFIX = '../data/STOCK_SIGNS/'
PATH_POSTFIX = '.png'

@app2.callback(
    Output("graph-tooltip-5", "show"),
    Output("graph-tooltip-5", "bbox"),
    Output("graph-tooltip-5", "children"),
    Input("graph-5", "hoverData"),
)
def display_hover(hoverData):
    """Build the tooltip for a cell of the confusion-matrix heatmap.

    Shows the reference images of the real (x) and predicted (y) sign, their
    names, the normalised cell value and the absolute number of samples in the
    cell. Returns the ``(show, bbox, children)`` triple; the tooltip stays
    hidden for empty cells, when nothing is hovered, or when an image file is
    missing.
    """
    if hoverData is None:
        return False, no_update, no_update

    hover_data = hoverData["points"][0]

    if not hover_data['z']:
        return False, no_update, no_update

    # Absolute count of the hovered cell, read straight from
    # cf_dict[real][predicted]. It used to be rebuilt as
    # (total of the real sign) * z, although z is normalised by the total of
    # the predicted sign, which gave a wrong number. Must be read before the
    # names are replaced by image aliases below.
    cell_count = int(cf_dict[hover_data['x']][hover_data['y']])

    # Collapsed sign classes have no image under their own name, so one of
    # their sub-variants is shown instead.
    stock_image_alias = {
        '2.3': '2.3.1',
        '3.18': '3.18.1',
        '3.25': '3.25.10',
        '3.24': '3.24.10',
    }
    for axis in ('x', 'y'):
        hover_data[axis] = stock_image_alias.get(hover_data[axis], hover_data[axis])

    x_img_path = PATH_PREFIX + hover_data['x'] + PATH_POSTFIX
    y_img_path = PATH_PREFIX + hover_data['y'] + PATH_POSTFIX

    bbox = hover_data["bbox"]

    try:
        with open(x_img_path, 'rb') as f:
            image1 = f.read()
        with open(y_img_path, 'rb') as f:
            image2 = f.read()
    except FileNotFoundError:
        return False, no_update, no_update

    img1 = 'data:image/png;base64,' + base64.b64encode(image1).decode('utf-8')
    img2 = 'data:image/png;base64,' + base64.b64encode(image2).decode('utf-8')

    children = [
        html.Div([
            html.Img(
                src=img1,
                style={"width": "70px",  'margin': '0 auto'},
            ),
            html.Img(
                src=img2,
                style={"width": "70px",  'margin': '0 auto'},
            ),
            html.P(hover_data['x'] + ':' + hover_data['y'], style={"fontSize": 14, 'text-align':'center'}),
            html.P(
                str(hover_data['z'])
                + ': all:' + str(cell_count),
                style={"fontSize": 14, 'text-align':'center'}
            ),
        ]),
    ]
    return True, bbox, children

# Built-in hover labels are disabled; the Dash tooltip callback replaces them.
fig_cf.update_traces(
        hoverinfo="none", 
        hovertemplate=None)
    
fig_cf.update_layout(
    width=950,
    height=650,
    xaxis_title='Target Sign',
    yaxis_title='Predicted Sign'
)

# The component ids must match the ones used in the app2 hover callback.
app2.layout = html.Div(
    className="container",
    children=[
        html.Div(html.H2("Confusion matrix")),
            dcc.Graph(
                id="graph-5", 
                figure=fig_cf, 
                clear_on_unhover=True
            ),
        dcc.Tooltip(id="graph-tooltip-5", direction='bottom'),
    ],
)
    
# fig_cf.show()
# The server is only started when the PLOT flag is enabled.
if __name__ == '__main__' and PLOT:
    app2.run_server(mode='inline', debug=True, port=2004)

Precision/F1

Значения precision находятся в матрице ниже. В строках - актуальные значения, в столбцах - точность/вероятность предсказания соответствующего знака.

In [None]:
# Per-sign confusion counts derived from cf_df:
# column `sign` is summed for FN, row `sign` is summed for FP.
TPdict, FNdict, FPdict, TNdict = {}, {}, {}, {}

_all_samples = cf_df.fillna(0).values.sum()

for _sign in cf_df.index:
    true_positive = cf_df[_sign][_sign]
    false_negative = cf_df[_sign].sum() - true_positive
    false_positive = cf_df.loc[_sign].sum() - true_positive

    TPdict[_sign] = true_positive
    FNdict[_sign] = false_negative
    FPdict[_sign] = false_positive
    # Everything that is neither in this sign's row nor in its column.
    TNdict[_sign] = _all_samples - true_positive - false_negative - false_positive

In [None]:
sign = '3.22'
print(TPdict[sign])
print(FNdict[sign])
print(FPdict[sign])
print(TNdict[sign])

In [None]:
# Per-sign precision, recall, F1 and support computed from the TP/FP/FN counts.
PrecisionDict = {}
RecallDict = {}
F1Dict = {}
SupportDict = {}

for i in TPdict.keys():
    try:
        PrecisionDict[i] = TPdict[i] / (TPdict[i] + FPdict[i])
        RecallDict[i] = TPdict[i] / (TPdict[i] + FNdict[i])
        # Harmonic mean of precision and recall.
        F1Dict[i] = 2 / (1 / PrecisionDict[i] + 1 / RecallDict[i])
        SupportDict[i] = TPdict[i] + FNdict[i]
        # print(SupportDict[i]),
        # print(sum(RTDS_DF[RTDS_DF['set']=='test']['sign'] == i))
        # Cross-check: support must equal the number of test rows of this sign.
        assert SupportDict[i] == sum(RTDS_DF[RTDS_DF['set']=='test']['sign'] == i), \
            'For ' + str(i) + ' mismatch: ' + str(SupportDict[i]) + ' != ' + str(
            sum(RTDS_DF[RTDS_DF['set']=='test']['sign'] == i)
        )
    except (ZeroDivisionError):
        # Any zero denominator above (including a zero precision or recall in
        # the F1 formula) resets all three metrics of this sign to 0.
        PrecisionDict[i] = RecallDict[i] = F1Dict[i] = 0
        SupportDict[i] = sum(RTDS_DF[RTDS_DF['set']=='test']['sign'] == i)
        print('ZDE for', i)


In [None]:
columns = ['Precision', 'Recall', 'F1', 'Support']

# One row of [precision, recall, F1, support] per sign; the four dictionaries
# are filled for the same signs in the same order.
metrics = {
    sign_name: [precision, recall, f1_score, support]
    for (sign_name, precision), recall, f1_score, support in zip(
        PrecisionDict.items(),
        RecallDict.values(),
        F1Dict.values(),
        SupportDict.values(),
    )
}

metricsDf = pd.DataFrame.from_dict(metrics, orient='index')
metricsDf.columns = columns
# metricsDf

In [None]:
centroid_from_stock_for_signs

In [None]:
# Signs whose reference samples come from stock images rather than train data.
NOT_INCLUDED_IN_TRAIN = centroid_from_stock_for_signs # ['1.31', '1.6', '2.4', '3.22', '3.25', '3.31', '6.3.2']

# Split the metrics table into signs with and without train data.
_from_stock_mask = metricsDf.index.isin(NOT_INCLUDED_IN_TRAIN)
TRAINED = metricsDf.loc[~_from_stock_mask]
NOT_TRAINED = metricsDf.loc[_from_stock_mask]

for _metrics_part in (TRAINED, NOT_TRAINED):
    display(_metrics_part)

In [None]:
assert False, 'Next cells should be executed mannualy'

In [None]:
def save_model_config_for_package(
    model: nn.Module, 
    centroid_location_dict: dict,
    model_config: str,
    output_file_path: Path = Path('saved_model')
):
    """Bundle the model weights, centroids and config text into one file.

    Args:
        model: module whose ``state_dict()`` is stored under ``'model'``.
        centroid_location_dict: stored as-is under ``'centroid_location'``.
        model_config: config text stored under ``'model_config'``.
        output_file_path: destination passed to ``torch.save``.
    """
    print(f'Saving model for {model_config}')
    bundle = dict(
        model=model.state_dict(),
        centroid_location=centroid_location_dict,
        model_config=model_config,
    )
    torch.save(bundle, output_file_path)
    print('Saving success!')

# The encoder config is stored in the bundle as raw text, not as parsed JSON.
with open(DATA_DIR / 'encoder_config.json') as f:
    model_config = f.read()
    
# Re-key the centroids from encoded label to sign name.
centroid_location_dict = {
    INVERSED_LABEL_DICT[x]: coord for x, coord in centroid_location_dict_cpu.items()
}
# No output path is given, so the function's default path is used.
save_model_config_for_package(
    model=encoder,
    centroid_location_dict=centroid_location_dict,
    model_config=model_config,
)

In [None]:
# Export both metric tables to spreadsheets.
# NOTE(review): the xlsxwriter engine presumably produces .xlsx content while
# the files are named *.xls - confirm the extension is intended.
TRAINED.to_excel('trained.xls', engine='xlsxwriter')
NOT_TRAINED.to_excel('not_trained.xls', engine='xlsxwriter')

In [None]:
# metricsDf.to_excel('metrics3_24.xls', engine='xlsxwriter')