# This notebook enables training and testing of Sherlock.
The procedure is:
- Load train, val, test datasets (should be preprocessed)
- Initialize the model, either by loading the pretrained model or by training one from scratch.
- Evaluate and analyse the model predictions.

In [None]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')

In [None]:
# ID under which the retrained model is trained, stored and looked up.
# Further down, predictions can also be made with the original model
# by using the ID "sherlock" instead.
model_id = 'retrained_sherlock'

In [None]:
from ast import literal_eval
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, classification_report

from sherlock.deploy.model import SherlockModel

## Load datasets for training, validation, testing

In [None]:
# Load the preprocessed training features and the raw training labels.
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('../data/data/processed/train.parquet')
y_train = pd.read_parquet('../data/data/raw/train_labels.parquet').values.flatten()

# Normalise the labels to lower case.
y_train = np.array([x.lower() for x in y_train])

# Report the elapsed time as an actual number of seconds; formatting the raw
# timedelta would print "H:MM:SS.ffffff" followed by the word "seconds".
elapsed = datetime.now() - start
print(f'Load data (train) process took {elapsed.total_seconds():.2f} seconds.')

In [None]:
# Sanity check: collapse the per-column dtypes into a set so that any
# column that is not float32 stands out immediately.
print('Distinct types for columns in the Dataframe (should be all float32):')
column_dtypes = set(X_train.dtypes)
print(column_dtypes)

In [None]:
# Load the preprocessed validation features and the raw validation labels.
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('../data/data/processed/validation.parquet')
y_validation = pd.read_parquet('../data/data/raw/validation_labels.parquet').values.flatten()

# Normalise the labels to lower case.
y_validation = np.array([x.lower() for x in y_validation])

# Report the elapsed time as an actual number of seconds; formatting the raw
# timedelta would print "H:MM:SS.ffffff" followed by the word "seconds".
elapsed = datetime.now() - start
print(f'Load data (validation) process took {elapsed.total_seconds():.2f} seconds.')

In [None]:
# Load the preprocessed test features and the raw test labels.
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('../data/data/processed/test.parquet')
y_test = pd.read_parquet('../data/data/raw/test_labels.parquet').values.flatten()

# Normalise the labels to lower case so they compare equal to the
# lower-cased predictions produced later on.
y_test = np.array([x.lower() for x in y_test])

# Take a single timestamp so the finish time and the duration agree, and
# report the duration as a number of seconds rather than a raw timedelta.
finished = datetime.now()
print(f'Finished at {finished}, took {(finished - start).total_seconds():.2f} seconds')

## Initialize the model
Two options:
- Load Sherlock model with pretrained weights
- Fit Sherlock model from scratch

### Option 1: load Sherlock with pretrained weights

In [None]:
# Option 1: build the model and initialise it, with weights, from the
# definition registered under the ID "sherlock".
start = datetime.now()
print(f'Started at {start}')

model = SherlockModel()
model.initialize_model_from_json(with_weights=True, model_id="sherlock")

print('Initialized model.')

# Take a single timestamp so the finish time and the duration agree, and
# report the duration as a number of seconds rather than a raw timedelta.
finished = datetime.now()
print(f'Finished at {finished}, took {(finished - start).total_seconds():.2f} seconds')

### Option 2: fit Sherlock from scratch (and save for later use)

In [None]:
# ID used for the model trained from scratch in the cells below.
model_id = "retrained_sherlock"

In [None]:
# Option 2: train a fresh model on the training split, using the validation
# split during fitting.
start = datetime.now()
print(f'Started at {start}')

model = SherlockModel()
# Model will be stored with ID `model_id`
model.fit(X_train, y_train, X_validation, y_validation, model_id=model_id)

print('Trained and saved new model.')

# Take a single timestamp so the finish time and the duration agree, and
# report the duration as a number of seconds rather than a raw timedelta.
finished = datetime.now()
print(f'Finished at {finished}, took {(finished - start).total_seconds():.2f} seconds')

In [None]:
# Store the weights of the current model under `model_id`.
# NOTE(review): the storage location is decided inside SherlockModel — confirm there.
model.store_weights(model_id=model_id)

In [None]:
# Preview the first ten rows of the training features.
X_train.head(10)

### Make predictions

In [None]:
# Predict a type for every test column, then lower-case the predictions so
# they are compared against the (lower-cased) test labels on equal terms.
raw_predictions = model.predict(X_test)
predicted_labels = np.array([label.lower() for label in raw_predictions])

In [None]:
# Summarise the prediction array before scoring it.
n_predictions = len(predicted_labels)
print(f'prediction count {n_predictions}, type = {type(predicted_labels)}')

# `size` bounds how many leading rows are scored; here it is the full test set.
size = len(y_test)

# Should be fully deterministic too.
# Left as the last expression so the notebook displays the weighted F1 score.
f1_score(y_test[:size], predicted_labels[:size], average="weighted")

In [None]:
# If using the original model, model_id should be replaced with "sherlock"
#model_id = "sherlock"
classes = np.load(f"../model_files/classes_{model_id}.npy", allow_pickle=True)

# Per-label metrics as a nested dict (label -> metric name -> value).
report = classification_report(y_test, predicted_labels, output_dict=True)

# Keep only the entries that carry per-class metrics for a known class; this
# drops report entries that are not dicts or whose key is not in `classes`.
class_scores = [
    (label, metrics)
    for label, metrics in report.items()
    if isinstance(metrics, dict) and 'f1-score' in metrics and label in classes
]

# Best-scoring types first.
class_scores.sort(key=lambda entry: entry[1]['f1-score'], reverse=True)

### Top 5 Types

In [None]:
# Header row, followed by the first five entries of `class_scores`.
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

for type_name, metrics in class_scores[:5]:
    # Names of eight or more characters get one tab instead of two so the
    # columns stay aligned with the header.
    tabs = '\t' if len(type_name) >= 8 else '\t\t'

    print(f"{type_name}{tabs}{metrics['f1-score']:.3f}\t\t{metrics['precision']:.3f}\t\t{metrics['recall']:.3f}\t\t{metrics['support']}")

### Bottom 5 Types

In [None]:
# Header row, followed by the last five entries of `class_scores`.
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

# A negative-index slice returns *up to* five trailing entries. The previous
# `[len(class_scores)-5:len(class_scores)]` produced a negative start index
# when fewer than five entries existed and silently dropped rows.
for key, value in class_scores[-5:]:
    # Names of eight or more characters get one tab instead of two so the
    # columns stay aligned with the header.
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

### All Scores

In [None]:
# Print the full text report for the test set, with metrics shown to three decimals.
print(classification_report(y_test, predicted_labels, digits=3))

## Review errors

In [None]:
# Collect the expected label of every test row that was predicted wrongly and
# summarise which expected types are missed most often.
size = len(y_test)
mismatches = list()

# Expected types whose individual errors are printed. This must be a real
# tuple (note the trailing comma): `k1 in ('address')` is a substring test
# against the string 'address' and would also match labels such as 'add'.
focus_types = ('address',)

for idx, k1 in enumerate(y_test[:size]):
    k2 = predicted_labels[idx]

    if k1 != k2:
        mismatches.append(k1)

        # zoom in to specific errors. Use the index in the next step
        if k1 in focus_types:
            print(f'[{idx}] expected "{k1}" but predicted "{k2}"')

f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Left as the last expression so the notebook displays the per-type counts.
data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

In [None]:
# Load the raw (unprocessed) test values so individual samples can be inspected.
test_samples = pd.read_parquet('../data/data/raw/test_values.parquet')

In [None]:
# Positional index of the test row to inspect.
idx = 1001

# Fetch the raw row and parse each stored value with literal_eval.
original = test_samples.iloc[idx]
parsed = original.apply(literal_eval)
converted = parsed.to_list()

predicted, actual = predicted_labels[idx], y_test[idx]
print(f'Predicted "{predicted}", actual label "{actual}". Actual values:\n{converted}')