## Resulting Confusion Matrix

In [None]:
# Enable the IPython autoreload extension (mode 2) so that edited project
# modules are re-imported without restarting the kernel
%load_ext autoreload
%autoreload 2

# core imports
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import cv2
import torch

# Ask CUDA to launch kernels synchronously so that errors are reported at the failing call
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Locate the project root: prefer the working directory of the Jupyter parent
# process (resolved through /proc); otherwise fall back to two levels above
# the current working directory.
try:
    PROJECT_ROOT = Path(os.readlink(f'/proc/{os.environ["JPY_PARENT_PID"]}/cwd'))
except Exception:
    # Best-effort fallback (e.g. JPY_PARENT_PID is unset or /proc is unavailable).
    # `Exception` instead of a bare `except` so that KeyboardInterrupt and
    # SystemExit are not swallowed.
    PROJECT_ROOT = Path(os.getcwd()).parent.parent
DATA_DIR = PROJECT_ROOT / 'SignDetectorAndClassifier' / 'data'

# Fix the random number generator state for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
# The notebook only runs inference, so autograd is disabled globally
torch.set_grad_enabled(False)

PLOT = False

# Use the first GPU when available, otherwise the CPU.
# Kept as the last expression of the cell so the chosen device is displayed.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

In [None]:
from maddrive_adas.sign_det.classifier import EncoderBasedClassifier

# Build the classifier, pointing its config at `classifier_archive` in the project root
classifier_archive_path = PROJECT_ROOT / 'classifier_archive'
encoder = EncoderBasedClassifier(config_path=classifier_archive_path)

### Этап 1.1. Берем RTDS, из него берем *train* как *baseline*. Заменяем *valid* на *test*.

In [None]:
# Sign classes the evaluation is restricted to
TARGET_SIGNS = [
    '1.1', '1.6', '1.8', '1.22', '1.31', '1.33',
    '2.1', '2.2', '2.3', '2.4', '2.5',
    '3.1', '3.18', '3.20', '3.21', '3.22', '3.23', '3.24',
    '3.25', '3.27', '3.28', '3.31',
    '4.1.1', '4.3',
    '5.5', '5.6', '5.16',
    '5.19.1', '5.20',
    '6.3.2', '6.4',
    '7.3', '7.4',
]

DATASET_PREFIX = DATA_DIR / 'ENCODER_DATASET'

# To drop the additional signs, keep only the files whose path contains 'rtsd':
# RTDS_DF = RTDS_DF[RTDS_DF['filepath'].str.contains('rtsd')]

RTDS_DF = (
    pd.read_csv(DATASET_PREFIX / 'WIDE_DATASET_4_ENCODER.csv')
    # prefix every file path from the CSV with the dataset folder
    .assign(
        filepath=lambda frame: frame['filepath'].map(
            lambda rel_path: str(DATASET_PREFIX / rel_path)
        )
    )
    # one row per image file
    .drop_duplicates(subset=['filepath'])
    # keep only the target sign classes
    .loc[lambda frame: frame['sign'].isin(TARGET_SIGNS)]
)

### *train* как референс, *valid* - query для валидации.
### Этап 1.2. Формируем DataFrame отсутствующих знаков в RTDS.

In [None]:
# Collect images of signs that are absent from RTDS_DF out of the
# `additional_sign` folder and append them to RTDS_DF as 'test' rows.
included_signs = sorted(set(RTDS_DF.sign))
print('Included signs in ENCODER_DATASET:', included_signs)
not_included_signs = sorted(set(TARGET_SIGNS) - set(RTDS_DF.sign))
print('Not included in ENCODER_DATASET:', not_included_signs)

print('Getting aditional sings...')

# NOTE(review): not used anywhere in this cell; kept in case other code relies on it.
encode_offset = max(set(RTDS_DF['encoded'])) + 1

additional_sign_dir = DATA_DIR / 'additional_sign'
# Sub-numbered variants of these signs (e.g. '3.24.x') are collapsed into the parent sign
merged_signs = ('3.24', '3.25')
included_lookup = set(included_signs)

# sorted(): os.listdir returns entries in arbitrary order, sorting keeps runs reproducible
files = sorted(os.listdir(additional_sign_dir))

skipped_signs = []
row_list = []

for file in files:
    # The sign code is the part of the file name before the first underscore
    sign = file.split('_')[0]

    parent_sign = sign.rsplit('.', 1)[0]
    if parent_sign in merged_signs:
        sign = parent_sign

    # Signs that RTDS_DF already contains are not taken from the additional folder
    if sign in included_lookup:
        skipped_signs.append(sign)
        continue

    row_list.append({
        'filepath': str(additional_sign_dir / file),
        'sign': sign,
        'set': 'test',  # HANDLE ME
        'encoded': None,
    })

# Print each skipped sign once instead of once per skipped file
print('Skipped signs:', sorted(set(skipped_signs)))

# Every collected row already has a sign that is absent from RTDS_DF (see the
# skip above), so no further filtering against RTDS_DF is required.
additional_DF = pd.DataFrame(row_list, columns=RTDS_DF.columns)

print('Including part of additional_DF for:', sorted(set(additional_DF.sign)), 'sign.')

RTDS_DF = pd.concat([RTDS_DF, additional_DF], ignore_index=True)

In [None]:
from maddrive_adas.sign_det.detector import DetectedInstance

from tqdm.notebook import tqdm

# Evaluate on the rows marked as the 'test' split
test_dataset: pd.DataFrame = RTDS_DF[RTDS_DF['set'] == 'test']

labels_set = sorted(set(test_dataset['sign']))

# `labels[i]` is the ground-truth sign of `detected_instances[i]`
detected_instances = []
labels = []
for row in tqdm(test_dataset.itertuples(), total=len(test_dataset)):
    # Named access instead of positional row[1] / row[2], so the loop does not
    # depend on the column order of the CSV
    filepath = row.filepath
    label = row.sign

    img = cv2.imread(filepath)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None
        raise FileNotFoundError(f'Cannot read image: {filepath}')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    detected_instance = DetectedInstance(img)
    # NOTE(review): presumably a relative ROI spanning the whole image with
    # confidence 1.0 — confirm against DetectedInstance.add_rel_roi
    detected_instance.add_rel_roi([0., 0., 1., 1.], 1.)

    # Append both only after the image was loaded, so the two lists stay aligned
    labels.append(label)
    detected_instances.append(detected_instance)

In [None]:
def chunker(seq, size):
    """Lazily split `seq` into consecutive slices of at most `size` items.

    Args:
        seq: Any sliceable object that supports `len()`.
        size: Maximum length of each slice; passed to `range` as the step.

    Returns:
        A generator yielding `seq[start:start + size]` for every start offset;
        the final slice may be shorter than `size`.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

# Classify the prepared instances batch by batch and collect one prediction per instance
predicted = []

chunk_size = 200
# Ceil division: the final, partially filled batch also counts towards the
# progress-bar total (floor division undercounted it)
n_chunks = (len(detected_instances) + chunk_size - 1) // chunk_size

detected_instances_chuncker = chunker(detected_instances, chunk_size)
for part_of_detected_instances in tqdm(detected_instances_chuncker, total=n_chunks):
    result = encoder.classify_batch(part_of_detected_instances)
    # NOTE(review): x[1][0][0] is presumably the predicted sign of the first
    # (only) ROI of each instance — confirm against classify_batch's return format
    predicted.extend(x[1][0][0] for x in result)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

import matplotlib
# Select the backend before any figure is created; switching it after plotting
# can leave the already-drawn figure undisplayed
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

# Sorted class list: a deterministic row/column order shared by the matrix and
# its tick labels (a plain set has arbitrary order)
cm_labels = sorted(set(labels))

# normalize='true' divides each row by the number of samples of that true class;
# predictions outside `cm_labels` are not counted in the matrix
cf_ = confusion_matrix(labels, predicted, normalize='true', labels=cm_labels)

# display_labels puts the sign codes on the axes instead of 0..N-1 indices
cmd_ = ConfusionMatrixDisplay(cf_, display_labels=cm_labels)
cmd_.plot()

plt.show()

In [None]:
# NOTE(review): repeats the plt.show() call that ends the previous cell;
# looks redundant — confirm whether it is still needed
plt.show()

In [None]:

# Build the classification report as a dict, turn it into a table with one
# row per report entry, and save it to the file "report" in CSV format
cr_ = classification_report(y_true=labels, y_pred=predicted, output_dict=True)
df = pd.DataFrame(cr_).T
df.to_csv(path_or_buf="report")

In [None]:
# Load the file "report" as CSV for display, using its first column as the index
pd.read_csv("report", index_col=0)