# This notebook enables training and testing of Sherlock.
The procedure is:
- Load train, val, test datasets (should be preprocessed)
- Initialize model using the "pretrained" model or by training one from scratch.
- Evaluate and analyse the model predictions.

In [1]:
%env PYTHONHASHSEED=13
%load_ext autoreload
%autoreload 2

env: PYTHONHASHSEED=13


In [2]:
# This will be the ID for the retrained model,
#further down predictions can also be made with the original model: "sherlock"
model_id = 'retrained_sherlock'

In [3]:
from ast import literal_eval
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, classification_report

from sherlock.deploy.model import SherlockModel

## Load datasets for training, validation, testing

In [4]:
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('../data/data/processed/train.parquet')
y_train = pd.read_parquet('../data/data/raw/train_labels.parquet').values.flatten()

y_train = np.array([x.lower() for x in y_train])

print(f'Load data (train) process took {datetime.now() - start} seconds.')

Started at 2022-10-15 16:21:04.731660
Load data (train) process took 0:00:01.655188 seconds.


In [5]:
len(np.unique(y_train))

32

In [6]:
print('Distinct types for columns in the Dataframe (should be all float32):')
print(set(X_train.dtypes))

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [7]:
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('../data/data/processed/validation.parquet')
y_validation = pd.read_parquet('../data/data/raw/val_labels.parquet').values.flatten()

y_validation = np.array([x.lower() for x in y_validation])

print(f'Load data (validation) process took {datetime.now() - start} seconds.')

Started at 2022-10-15 16:21:06.697384
Load data (validation) process took 0:00:01.013941 seconds.


In [8]:
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('../data/data/processed/test.parquet')
y_test = pd.read_parquet('../data/data/raw/test_labels.parquet').values.flatten()

y_test = np.array([x.lower() for x in y_test])

print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2022-10-15 16:21:07.766820
Finished at 2022-10-15 16:21:09.039885, took 0:00:01.273078 seconds


## Initialize the model
Two options:
- Load Sherlock model with pretrained weights
- Fit Sherlock model from scratch

In [9]:
model_id = "retrained_sherlock"

In [10]:
model = SherlockModel()
try:
    model.initialize_model_from_json(with_weights=True, model_id=model_id);
except:
    start = datetime.now()
    print(f'Started at {start}')
    # Model will be stored with ID `model_id`
    model.fit(X_train, y_train, X_validation, y_validation, model_id=model_id)

    print('Trained and saved new model.')
    print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')
    model.store_weights(model_id=model_id)

Started at 2022-10-15 16:21:09.111457


2022-10-15 16:21:09.335083: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-15 16:21:09.338257: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
  super(Adam, self).__init__(name, **kwargs)


Epoch 1/10


W1015 16:21:10.187919 46912499975424 ag_logging.py:142] AutoGraph could not transform <function Model.make_train_function.<locals>.train_function at 0x2aab4b9a0940> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'

W1015 16:21:16.402033 46912499975424 ag_logging.py:142] AutoGraph could not transform <function Model.make_test_function.<locals>.test_function at 0x2aad5577ddc0> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Trained and saved new model.
Finished at 2022-10-15 16:22:17.242551, took 0:01:08.131111 seconds


In [11]:
predicted_labels = []

### Make prediction

In [12]:
predicted_labels = model.predict(X_test, model_id)
predicted_labels = np.array([x.lower() for x in predicted_labels])

W1015 16:22:19.045528 46912499975424 ag_logging.py:142] AutoGraph could not transform <function Model.make_predict_function.<locals>.predict_function at 0x2aad5577dd30> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


In [13]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

size=len(y_test)

# Should be fully deterministic too.
f1_score(y_test[:size], predicted_labels[:size], average="weighted")

prediction count 65007, type = <class 'numpy.ndarray'>


0.9129910873635969

In [14]:
# If using the original model, model_id should be replaced with "sherlock"
#model_id = "sherlock"
classes = np.load(f"../model_files/classes_{model_id}.npy", allow_pickle=True)

report = classification_report(y_test, predicted_labels, output_dict=True)

class_scores = list(filter(lambda x: isinstance(x, tuple) and isinstance(x[1], dict) and 'f1-score' in x[1] and x[0] in classes, list(report.items())))

class_scores = sorted(class_scores, key=lambda item: item[1]['f1-score'], reverse=True)

### Top 5 Types

In [15]:
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

for key, value in class_scores[0:5]:
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
industry	0.985		0.983		0.987		2958
birth date	0.979		0.975		0.983		479
sex		0.974		0.982		0.966		2997
currency	0.968		0.975		0.960		405
year		0.964		0.977		0.951		3015


### Bottom 5 Types

In [16]:
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

for key, value in class_scores[len(class_scores)-5:len(class_scores)]:
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
range		0.799		0.902		0.718		577
nationality	0.769		0.786		0.752		424
rank		0.755		0.661		0.881		2983
person		0.701		0.912		0.570		579
sales		0.341		0.795		0.217		322


### All Scores

In [17]:
print(classification_report(y_test, predicted_labels, digits=3))

              precision    recall  f1-score   support

     address      0.955     0.967     0.961      3003
         age      0.886     0.964     0.923      3033
        area      0.959     0.798     0.871      1987
  birth date      0.975     0.983     0.979       479
 birth place      0.995     0.933     0.963       418
       brand      0.875     0.768     0.818       574
        city      0.852     0.943     0.895      2966
   continent      0.794     0.934     0.858       227
     country      0.919     0.949     0.934      3038
      county      0.961     0.953     0.957      2959
    currency      0.975     0.960     0.968       405
         day      0.915     0.878     0.896      3038
    duration      0.951     0.943     0.947      3000
    industry      0.983     0.987     0.985      2958
    language      0.852     0.967     0.906      1474
    location      0.943     0.825     0.880      2949
manufacturer      0.928     0.892     0.910       945
        name      0.866    

## Review errors

In [18]:
size = len(y_test)
mismatches = list()

for idx, k1 in enumerate(y_test[:size]):
    k2 = predicted_labels[idx]

    if k1 != k2:
        mismatches.append(k1)
        
        # zoom in to specific errors. Use the index in the next step
        if k1 in ('address'):
            print(f'[{idx}] expected "{k1}" but predicted "{k2}"')
        
f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

[1201] expected "address" but predicted "city"
[1585] expected "address" but predicted "city"
[2256] expected "address" but predicted "name"
[2303] expected "address" but predicted "rank"
[3342] expected "address" but predicted "name"
[4640] expected "address" but predicted "city"
[4787] expected "address" but predicted "rank"
[5261] expected "address" but predicted "city"
[5729] expected "address" but predicted "location"
[7390] expected "address" but predicted "location"
[7404] expected "address" but predicted "city"
[9532] expected "address" but predicted "city"
[9920] expected "address" but predicted "language"
[10188] expected "address" but predicted "product"
[11837] expected "address" but predicted "rank"
[12829] expected "address" but predicted "name"
[14477] expected "address" but predicted "language"
[15140] expected "address" but predicted "city"
[15145] expected "address" but predicted "language"
[15481] expected "address" but predicted "city"
[15525] expected "address" but

[('region', 553),
 ('location', 515),
 ('area', 401),
 ('day', 372),
 ('rank', 355),
 ('name', 264),
 ('sales', 252),
 ('order', 251),
 ('person', 249),
 ('type', 224),
 ('duration', 172),
 ('city', 168),
 ('range', 163),
 ('product', 158),
 ('country', 155),
 ('year', 149),
 ('county', 140),
 ('status', 134),
 ('brand', 133),
 ('age', 108),
 ('nationality', 105),
 ('manufacturer', 102),
 ('sex', 102),
 ('address', 98),
 ('state', 90),
 ('language', 48),
 ('symbol', 45),
 ('industry', 38),
 ('birth place', 28),
 ('currency', 16),
 ('continent', 15),
 ('birth date', 8)]

In [19]:
test_samples = pd.read_parquet('../data/data/raw/test_values.parquet')

In [20]:
idx = 57


original = test_samples.iloc[idx]
converted = original.apply(literal_eval).to_list()

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

Predicted "symbol", actual label "symbol". Actual values:
[['VASC', 'TNK', 'NAT', 'GPRO', 'MANH']]
