# This notebook enables training and testing of Sherlock.
The procedure is:
- Load the train, validation, and test datasets (these should already be preprocessed).
- Initialize the model, either by loading the "pretrained" model or by training one from scratch.
- Evaluate and analyse the model predictions.

In [1]:
# Export PYTHONHASHSEED for this session (the cell output echoes the assigned value).
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it from
# inside an already-running kernel presumably does not change this kernel's own string
# hashing - confirm whether the seed must be set before the kernel is launched.
%env PYTHONHASHSEED=13
# Load the autoreload extension and enable mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

env: PYTHONHASHSEED=13


In [2]:
# Identifier under which the retrained model is referenced throughout this notebook.
# Further down, predictions can also be made with the original model by using
# the ID "sherlock" instead.
model_id = "retrained_sherlock"

In [3]:
from ast import literal_eval
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, classification_report

from sherlock.deploy.model import SherlockModel

## Load datasets for training, validation, testing

In [4]:
# Time how long it takes to load the training split.
start = datetime.now()
print(f'Started at {start}')

# Features are read from the processed parquet file.
X_train = pd.read_parquet(path='../data/data/processed/train.parquet')

# Labels are read from the raw parquet file, flattened to 1-D and lower-cased.
y_train = np.array([
    label.lower()
    for label in pd.read_parquet(path='../data/data/raw/train_labels.parquet').values.flatten()
])

print(f'Load data (train) process took {datetime.now() - start} seconds.')

Started at 2022-10-15 15:57:04.319767
Load data (train) process took 0:00:02.167139 seconds.


In [5]:
# Number of distinct (lower-cased) labels present in the training split.
np.unique(y_train).size

32

In [6]:
# Sanity check on the feature frame: collapse the per-column dtypes into a set
# so that any column deviating from float32 stands out immediately.
print(
    'Distinct types for columns in the Dataframe (should be all float32):',
    set(X_train.dtypes),
    sep='\n',
)

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [7]:
# Time how long it takes to load the validation split.
start = datetime.now()
print(f'Started at {start}')

# Features are read from the processed parquet file.
X_validation = pd.read_parquet(path='../data/data/processed/validation.parquet')

# Labels are read from the raw parquet file, flattened to 1-D and lower-cased.
y_validation = np.array([
    label.lower()
    for label in pd.read_parquet(path='../data/data/raw/val_labels.parquet').values.flatten()
])

print(f'Load data (validation) process took {datetime.now() - start} seconds.')

Started at 2022-10-15 15:57:08.896525
Load data (validation) process took 0:00:02.014184 seconds.


In [8]:
# Time how long it takes to load the test split.
start = datetime.now()
print(f'Started at {start}')

# Features are read from the processed parquet file.
X_test = pd.read_parquet(path='../data/data/processed/test.parquet')

# Labels are read from the raw parquet file, flattened to 1-D and lower-cased.
y_test = np.array([
    label.lower()
    for label in pd.read_parquet(path='../data/data/raw/test_labels.parquet').values.flatten()
])

print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2022-10-15 15:57:11.250738
Finished at 2022-10-15 15:57:12.798950, took 0:00:01.548226 seconds


## Initialize the model
Two options:
- Load Sherlock model with pretrained weights
- Fit Sherlock model from scratch

In [9]:
# ID of the model that is initialized (loaded, or trained and stored) below.
model_id = 'retrained_sherlock'

In [11]:
model = SherlockModel()
try:
    # Prefer the stored model (architecture + weights) for `model_id` when it can be loaded.
    model.initialize_model_from_json(with_weights=True, model_id=model_id)
except Exception as load_error:
    # `except Exception` instead of a bare `except:` lets KeyboardInterrupt and SystemExit
    # propagate, so interrupting the load never silently kicks off a training run.
    # The reason for falling back is printed rather than swallowed.
    print(f'Could not load model "{model_id}" ({load_error!r}); training from scratch.')

    start = datetime.now()
    print(f'Started at {start}')
    # Model will be stored with ID `model_id`
    model.fit(X_train, y_train, X_validation, y_validation, model_id=model_id)

    # Capture the end time once so the reported finish time and duration agree.
    finished = datetime.now()
    print('Trained and saved new model.')
    print(f'Finished at {finished}, took {finished - start} seconds')
    model.store_weights(model_id=model_id)

Started at 2022-10-15 15:57:41.526597


2022-10-15 15:57:41.941292: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-15 15:57:41.950972: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
  super(Adam, self).__init__(name, **kwargs)


Epoch 1/10


W1015 15:57:44.223948 46912496407488 ag_logging.py:142] AutoGraph could not transform <function Model.make_train_function.<locals>.train_function at 0x2aadc85123a0> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
  5/254 [..............................] - ETA: 9:20 - loss: 4.1815 - categorical_accuracy: 0.0320

KeyboardInterrupt: 

In [13]:
# Start from an empty placeholder; the prediction cell assigns the actual labels.
predicted_labels = list()

### Make predictions

In [14]:
# Predict a label for every row of the test features, then lower-case the
# predictions so they compare directly against the lower-cased `y_test`.
predicted_labels = np.array(
    [label.lower() for label in model.predict(X_test, model_id)]
)

W0926 03:15:24.883456 46912496407488 ag_logging.py:142] AutoGraph could not transform <function Model.make_predict_function.<locals>.predict_function at 0x2aadab5b2700> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


In [15]:
# Quick look at what the prediction step produced.
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

# Number of leading samples to score; currently the full test set.
size = len(y_test)

# Should be fully deterministic too.
# The bare expression is kept last so the notebook displays the weighted F1 score.
f1_score(
    y_true=y_test[:size],
    y_pred=predicted_labels[:size],
    average="weighted",
)

prediction count 65007, type = <class 'numpy.ndarray'>


0.9131604033035498

In [16]:
# If using the original model, model_id should be replaced with "sherlock"
# (i.e. set model_id = "sherlock" before running this cell).
# Note: allow_pickle=True unpickles objects stored in the file, so only load
# class files from a trusted source.
classes = np.load(f"../model_files/classes_{model_id}.npy", allow_pickle=True)

report = classification_report(y_test, predicted_labels, output_dict=True)

# Keep only the per-class entries of the report: the value must be a metrics dict
# containing an F1 score, and the key must be one of the model's known classes.
class_scores = [
    entry
    for entry in report.items()
    if isinstance(entry[1], dict) and 'f1-score' in entry[1] and entry[0] in classes
]

# Order the classes from highest to lowest F1 score.
class_scores.sort(key=lambda entry: entry[1]['f1-score'], reverse=True)

### Top 5 Types

In [17]:
# Header row of the tab-separated table.
print("\t\tf1-score\tprecision\trecall\t\tsupport")

# `class_scores` is sorted by descending F1 score, so the head holds the best types.
for key, value in class_scores[:5]:
    # Longer labels get one tab fewer, presumably to keep the columns aligned
    # under 8-character tab stops.
    tabs = '\t' if len(key) >= 8 else '\t\t'

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
industry	0.986		0.985		0.988		2958
birth date	0.976		0.975		0.977		479
currency	0.971		0.980		0.963		405
sex		0.970		0.983		0.958		2997
symbol		0.965		0.957		0.972		1752


### Bottom 5 Types

In [18]:
# Header row of the tab-separated table.
print("\t\tf1-score\tprecision\trecall\t\tsupport")

# `class_scores` is sorted by descending F1 score, so the tail holds the worst types.
# A plain `[-5:]` also works when fewer than five classes are present; the former
# `len(class_scores) - 5` start index went negative in that case and silently
# dropped the leading entries.
for key, value in class_scores[-5:]:
    # Longer labels get one tab fewer, presumably to keep the columns aligned
    # under 8-character tab stops.
    tabs = '\t' if len(key) >= 8 else '\t\t'

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
range		0.778		0.943		0.662		577
rank		0.755		0.682		0.847		2983
nationality	0.744		0.665		0.844		424
person		0.726		0.876		0.620		579
sales		0.408		0.781		0.276		322


### All Scores

In [19]:
# Plain-text report with precision, recall, F1 and support for every type,
# printed to three decimal places.
print(
    classification_report(
        y_true=y_test,
        y_pred=predicted_labels,
        digits=3,
    )
)

              precision    recall  f1-score   support

     address      0.956     0.966     0.961      3003
         age      0.900     0.959     0.929      3033
        area      0.950     0.818     0.879      1987
  birth date      0.975     0.977     0.976       479
 birth place      0.990     0.935     0.962       418
       brand      0.895     0.772     0.829       574
        city      0.892     0.920     0.906      2966
   continent      0.787     0.925     0.850       227
     country      0.933     0.923     0.928      3038
      county      0.938     0.966     0.952      2959
    currency      0.980     0.963     0.971       405
         day      0.945     0.869     0.905      3038
    duration      0.936     0.947     0.941      3000
    industry      0.985     0.988     0.986      2958
    language      0.862     0.964     0.910      1474
    location      0.924     0.845     0.882      2949
manufacturer      0.906     0.906     0.906       945
        name      0.881    

## Review errors

In [20]:
size = len(y_test)
mismatches = []

# Expected labels whose individual errors are printed for closer inspection.
# This must be a tuple (note the trailing comma): `('address')` is just the string
# 'address', which would turn the `in` check below into a substring test.
focus_labels = ('address',)

for idx, (expected, predicted) in enumerate(zip(y_test[:size], predicted_labels[:size])):
    if expected == predicted:
        continue

    # Record the expected label of every wrong prediction.
    mismatches.append(expected)

    # zoom in to specific errors. Use the index in the next step
    if expected in focus_labels:
        print(f'[{idx}] expected "{expected}" but predicted "{predicted}"')

f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Count the errors per expected label; the bare expression is kept last so the
# notebook displays the result.
data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

[1187] expected "address" but predicted "location"
[1201] expected "address" but predicted "city"
[1585] expected "address" but predicted "city"
[2256] expected "address" but predicted "person"
[2303] expected "address" but predicted "area"
[2393] expected "address" but predicted "location"
[3342] expected "address" but predicted "city"
[4640] expected "address" but predicted "city"
[4787] expected "address" but predicted "duration"
[5261] expected "address" but predicted "city"
[5729] expected "address" but predicted "location"
[6327] expected "address" but predicted "product"
[7404] expected "address" but predicted "city"
[9532] expected "address" but predicted "manufacturer"
[9920] expected "address" but predicted "language"
[10188] expected "address" but predicted "product"
[11837] expected "address" but predicted "rank"
[12499] expected "address" but predicted "name"
[12829] expected "address" but predicted "name"
[14477] expected "address" but predicted "language"
[15140] expecte

[('region', 588),
 ('location', 458),
 ('rank', 456),
 ('day', 397),
 ('area', 362),
 ('name', 284),
 ('city', 236),
 ('country', 234),
 ('sales', 233),
 ('person', 220),
 ('range', 195),
 ('type', 181),
 ('duration', 160),
 ('product', 156),
 ('brand', 131),
 ('state', 127),
 ('status', 126),
 ('sex', 126),
 ('year', 125),
 ('order', 124),
 ('age', 124),
 ('address', 103),
 ('county', 100),
 ('manufacturer', 89),
 ('nationality', 66),
 ('language', 53),
 ('symbol', 49),
 ('industry', 35),
 ('birth place', 27),
 ('continent', 17),
 ('currency', 15),
 ('birth date', 11)]

In [21]:
# Raw (unprocessed) test values, loaded so individual predictions can be inspected.
test_samples = pd.read_parquet(path='../data/data/raw/test_values.parquet')

In [54]:
# Position of the test sample to inspect (e.g. an index printed by the error review).
idx = 57

# Each cell of the raw row holds the string form of a Python literal;
# parse every cell back into the object it represents.
original = test_samples.iloc[idx]
converted = [literal_eval(raw_cell) for raw_cell in original]

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

Predicted "symbol", actual label "symbol". Actual values:
[['VASC', 'TNK', 'NAT', 'GPRO', 'MANH']]
