# This notebook enables training and testing of Sherlock.
The procedure is:
- Load train, val, test datasets (should be preprocessed)
- Initialize model using the "pretrained" model or by training one from scratch.
- Evaluate and analyse the model predictions.

In [1]:
%env PYTHONHASHSEED=13
%load_ext autoreload
%autoreload 2

env: PYTHONHASHSEED=13


In [2]:
# This will be the ID for the retrained model,
#further down predictions can also be made with the original model: "sherlock"
model_id = 'sherlock'

In [3]:
from ast import literal_eval
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, classification_report

from sherlock.deploy.model import SherlockModel

## Load datasets for training, validation, testing

In [4]:
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('../data/data/processed/train.parquet')
y_train = pd.read_parquet('../data/data/raw/train_labels.parquet').values.flatten()

y_train = np.array([x.lower() for x in y_train])

print(f'Load data (train) process took {datetime.now() - start} seconds.')

Started at 2022-10-18 18:51:01.370050
Load data (train) process took 0:00:01.337385 seconds.


In [5]:
len(np.unique(y_train))

32

In [6]:
print('Distinct types for columns in the Dataframe (should be all float32):')
print(set(X_train.dtypes))

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [7]:
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('../data/data/processed/validation.parquet')
y_validation = pd.read_parquet('../data/data/raw/val_labels.parquet').values.flatten()

y_validation = np.array([x.lower() for x in y_validation])

print(f'Load data (validation) process took {datetime.now() - start} seconds.')

Started at 2022-10-18 18:51:03.103523
Load data (validation) process took 0:00:00.737195 seconds.


In [8]:
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('../data/data/processed/test.parquet')
y_test = pd.read_parquet('../data/data/raw/test_labels.parquet').values.flatten()

y_test = np.array([x.lower() for x in y_test])

print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2022-10-18 18:51:03.886702
Finished at 2022-10-18 18:51:05.324545, took 0:00:01.437855 seconds


## Initialize the model
Two options:
- Load Sherlock model with pretrained weights
- Fit Sherlock model from scratch

### Option 1: load Sherlock with pretrained weights

In [9]:
start = datetime.now()
print(f'Started at {start}')

model = SherlockModel();
model.initialize_model_from_json(with_weights=True, model_id="sherlock");

print('Initialized model.')
print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2022-10-18 18:51:05.418634


2022-10-18 18:51:05.536786: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-18 18:51:05.541187: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Initialized model.
Finished at 2022-10-18 18:51:05.997220, took 0:00:00.578597 seconds


  super(Adam, self).__init__(name, **kwargs)


### Option 2: fit Sherlock from scratch (and save for later use)

In [10]:
# model_id = "retrained_sherlock"

In [11]:
# start = datetime.now()
# print(f'Started at {start}')

# model = SherlockModel()
# # Model will be stored with ID `model_id`
# model.fit(X_train, y_train, X_validation, y_validation, model_id=model_id)

# print('Trained and saved new model.')
# print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

In [12]:
# model.store_weights(model_id=model_id)

In [13]:
# predicted_labels = []

### Make prediction

In [14]:
predicted_labels = model.predict(X_test, model_id)
predicted_labels = np.array([x.lower() for x in predicted_labels])

W1018 18:51:06.525854 46912499975424 ag_logging.py:142] AutoGraph could not transform <function Model.make_predict_function.<locals>.predict_function at 0x2aaaaf9ea670> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


In [15]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

size=len(y_test)

# Should be fully deterministic too.
f1_score(y_test[:size], predicted_labels[:size], average="weighted")

prediction count 65007, type = <class 'numpy.ndarray'>


0.9145668488511228

In [16]:
# If using the original model, model_id should be replaced with "sherlock"
#model_id = "sherlock"
classes = np.load(f"../model_files/classes_{model_id}.npy", allow_pickle=True)

report = classification_report(y_test, predicted_labels, output_dict=True)

class_scores = list(filter(lambda x: isinstance(x, tuple) and isinstance(x[1], dict) and 'f1-score' in x[1] and x[0] in classes, list(report.items())))

class_scores = sorted(class_scores, key=lambda item: item[1]['f1-score'], reverse=True)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Top 5 Types

In [17]:
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

for key, value in class_scores[0:5]:
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
industry	0.990		0.995		0.985		2958
currency	0.980		0.992		0.968		405
sex		0.966		0.987		0.945		2997
symbol		0.965		0.984		0.946		1752
language	0.960		0.966		0.953		1474


### Bottom 5 Types

In [18]:
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

for key, value in class_scores[len(class_scores)-5:len(class_scores)]:
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
result		0.000		0.000		0.000		0
service		0.000		0.000		0.000		0
species		0.000		0.000		0.000		0
team		0.000		0.000		0.000		0
weight		0.000		0.000		0.000		0


### All Scores

In [19]:
print(classification_report(y_test, predicted_labels, digits=3))

  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

       address      0.949     0.943     0.946      3003
     affiliate      0.000     0.000     0.000         0
   affiliation      0.000     0.000     0.000         0
           age      0.928     0.950     0.939      3033
         album      0.000     0.000     0.000         0
          area      0.924     0.821     0.869      1987
        artist      0.000     0.000     0.000         0
    birth date      0.989     0.969     0.979       479
   birth place      0.937     0.923     0.930       418
         brand      0.895     0.669     0.766       574
      capacity      0.000     0.000     0.000         0
      category      0.000     0.000     0.000         0
          city      0.899     0.904     0.901      2966
         class      0.000     0.000     0.000         0
classification      0.000     0.000     0.000         0
          club      0.000     0.000     0.000         0
          code      0.000     0.000     0.000  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Review errors

In [20]:
size = len(y_test)
mismatches = list()

for idx, k1 in enumerate(y_test[:size]):
    k2 = predicted_labels[idx]

    if k1 != k2:
        mismatches.append(k1)
        
        # zoom in to specific errors. Use the index in the next step
        if k1 in ('address'):
            print(f'[{idx}] expected "{k1}" but predicted "{k2}"')
        
f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

[478] expected "address" but predicted "name"
[700] expected "address" but predicted "language"
[1111] expected "address" but predicted "club"
[1201] expected "address" but predicted "city"
[1585] expected "address" but predicted "city"
[2393] expected "address" but predicted "location"
[2609] expected "address" but predicted "name"
[3056] expected "address" but predicted "location"
[4134] expected "address" but predicted "location"
[4428] expected "address" but predicted "location"
[4537] expected "address" but predicted "location"
[4640] expected "address" but predicted "city"
[4769] expected "address" but predicted "creator"
[4787] expected "address" but predicted "order"
[5066] expected "address" but predicted "area"
[5261] expected "address" but predicted "team"
[5685] expected "address" but predicted "city"
[5729] expected "address" but predicted "location"
[5770] expected "address" but predicted "artist"
[6796] expected "address" but predicted "description"
[7008] expected "addr

Total mismatches: 6970 (F1 score: 0.9145668488511228)


[('name', 726),
 ('rank', 665),
 ('region', 520),
 ('location', 513),
 ('type', 364),
 ('area', 356),
 ('day', 329),
 ('product', 321),
 ('city', 286),
 ('person', 220),
 ('status', 196),
 ('brand', 190),
 ('year', 189),
 ('manufacturer', 171),
 ('address', 170),
 ('sex', 166),
 ('order', 165),
 ('duration', 154),
 ('sales', 152),
 ('country', 152),
 ('age', 151),
 ('range', 139),
 ('nationality', 130),
 ('state', 128),
 ('county', 126),
 ('symbol', 94),
 ('language', 69),
 ('industry', 44),
 ('birth place', 32),
 ('continent', 24),
 ('birth date', 15),
 ('currency', 13)]

In [21]:
test_samples = pd.read_parquet('../data/data/raw/test_values.parquet')

In [22]:
idx = 57


original = test_samples.iloc[idx]
converted = original.apply(literal_eval).to_list()

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

Predicted "symbol", actual label "symbol". Actual values:
[['VASC', 'TNK', 'NAT', 'GPRO', 'MANH']]
