# This notebook enables training and testing of Sherlock.
The procedure is:
- Load train, val, test datasets (should be preprocessed)
- Initialize the model, either by loading the pretrained Sherlock weights or by training one from scratch.
- Evaluate and analyse the model predictions.

In [1]:
# Export PYTHONHASHSEED=13 into the kernel's environment.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it
# here presumably only affects processes launched from this kernel -- confirm
# whether the kernel itself has to be started with this variable already set.
%env PYTHONHASHSEED=13
# Enable the autoreload extension in mode 2, so that imported modules are
# reloaded before each cell is executed and source edits are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

env: PYTHONHASHSEED=13


In [2]:
# Identifier under which the retrained model is stored and looked up.
# Further down, predictions can also be made with the original model
# by using the ID "sherlock" instead.
model_id = "retrained_sherlock"

In [3]:
from ast import literal_eval
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, classification_report

from sherlock.deploy.model import SherlockModel

## Load datasets for training, validation, testing

In [4]:
# Load the processed training features and the matching raw labels,
# reporting how long the load took.
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('../data/data/processed/train.parquet')
y_train = pd.read_parquet('../data/data/raw/train_labels.parquet').values.flatten()

# Normalise labels to lower case (predictions are lower-cased the same way
# before they are scored).
y_train = np.array([x.lower() for x in y_train])

# Every feature row needs exactly one label; fail here rather than inside fit().
assert len(X_train) == len(y_train), (
    f'train features/labels length mismatch: {len(X_train)} != {len(y_train)}'
)

# The difference of two datetimes is a timedelta, which renders as
# h:mm:ss.ffffff -- so label it as such instead of as "seconds".
print(f'Load data (train) process took {datetime.now() - start} (h:mm:ss).')

Started at 2022-09-25 11:23:31.034071
Load data (train) process took 0:00:00.861180 seconds.


In [5]:
# Number of distinct semantic types present in the training labels.
np.unique(y_train).size

32

In [6]:
# Collect the distinct column dtypes of the training features so they can be
# eyeballed against the expected single float32 dtype.
distinct_dtypes = set(X_train.dtypes)

print('Distinct types for columns in the Dataframe (should be all float32):')
print(distinct_dtypes)

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [7]:
# Load the processed validation features and the matching raw labels,
# reporting how long the load took.
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('../data/data/processed/validation.parquet')
y_validation = pd.read_parquet('../data/data/raw/val_labels.parquet').values.flatten()

# Normalise labels to lower case (predictions are lower-cased the same way
# before they are scored).
y_validation = np.array([x.lower() for x in y_validation])

# Every feature row needs exactly one label; fail here rather than inside fit().
assert len(X_validation) == len(y_validation), (
    f'validation features/labels length mismatch: {len(X_validation)} != {len(y_validation)}'
)

# The difference of two datetimes is a timedelta, which renders as
# h:mm:ss.ffffff -- so label it as such instead of as "seconds".
print(f'Load data (validation) process took {datetime.now() - start} (h:mm:ss).')

Started at 2022-09-25 11:23:32.215566
Load data (validation) process took 0:00:00.540727 seconds.


In [8]:
# Load the processed test features and the matching raw labels,
# reporting when the load finished and how long it took.
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('../data/data/processed/test.parquet')
y_test = pd.read_parquet('../data/data/raw/test_labels.parquet').values.flatten()

# Normalise labels to lower case (predictions are lower-cased the same way
# before they are scored).
y_test = np.array([x.lower() for x in y_test])

# Every feature row needs exactly one label; fail here rather than at scoring time.
assert len(X_test) == len(y_test), (
    f'test features/labels length mismatch: {len(X_test)} != {len(y_test)}'
)

# Read the clock once so the reported finish time and the elapsed time agree.
# The elapsed value is a timedelta (h:mm:ss.ffffff), not a number of seconds.
finished = datetime.now()
print(f'Finished at {finished}, took {finished - start} (h:mm:ss)')

Started at 2022-09-25 11:23:32.869818
Finished at 2022-09-25 11:23:33.412601, took 0:00:00.542802 seconds


## Initialize the model
Two options:
- Load Sherlock model with pretrained weights
- Fit Sherlock model from scratch

### Option 1: load Sherlock with pretrained weights

In [9]:
# Uncomment the lines below to use the released pretrained model
# (model_id="sherlock") instead of training a new one with Option 2.
# start = datetime.now()
# print(f'Started at {start}')

# model = SherlockModel();
# model.initialize_model_from_json(with_weights=True, model_id="sherlock");

# print('Initialized model.')
# print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

### Option 2: fit Sherlock from scratch (and save for later use)

In [10]:
# Re-declares the ID assigned near the top of the notebook (same value);
# the model trained in the next cell is stored under this ID.
model_id = "retrained_sherlock"

In [11]:
# Train a fresh Sherlock model on the training split, using the validation
# split during fitting, and report how long training took.
start = datetime.now()
print(f'Started at {start}')

model = SherlockModel()
# Model will be stored with ID `model_id`
model.fit(X_train, y_train, X_validation, y_validation, model_id=model_id)

print('Trained and saved new model.')

# Read the clock once so the reported finish time and the elapsed time agree.
# The elapsed value is a timedelta (h:mm:ss.ffffff), not a number of seconds.
finished = datetime.now()
print(f'Finished at {finished}, took {finished - start} (h:mm:ss)')

Started at 2022-09-25 11:23:33.669550


2022-09-25 11:23:34.301137: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-25 11:23:34.309753: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
  super(Adam, self).__init__(name, **kwargs)


Epoch 1/10


W0925 11:23:35.088444 46912496407488 ag_logging.py:142] AutoGraph could not transform <function Model.make_train_function.<locals>.train_function at 0x2aabb39cd790> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'

W0925 11:37:12.239740 46912496407488 ag_logging.py:142] AutoGraph could not transform <function Model.make_test_function.<locals>.test_function at 0x2aadb0b3ea60> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Trained and saved new model.
Finished at 2022-09-25 14:34:08.089110, took 3:10:34.419588 seconds


In [12]:
# Store the trained model's weights under `model_id`.
# NOTE(review): the training cell already prints "Trained and saved new model." --
# confirm whether fit() stores the weights itself, which would make this call redundant.
model.store_weights(model_id=model_id)

In [13]:
# Start from an empty list of predictions; the prediction cell below replaces it.
predicted_labels = list()

### Make predictions

In [14]:
# Predict a semantic type for every test column with the model stored under `model_id`.
raw_predictions = model.predict(X_test, model_id)

# Lower-case the predictions so they are comparable with the lower-cased test labels.
predicted_labels = np.array([label.lower() for label in raw_predictions])

W0925 14:36:54.734713 46912496407488 ag_logging.py:142] AutoGraph could not transform <function Model.make_predict_function.<locals>.predict_function at 0x2aabb8b72af0> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


In [15]:
# Number of test samples to score (currently the whole test set).
size = len(y_test)

print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

# Should be fully deterministic too.
# Support-weighted F1 over the first `size` samples; left as the last
# expression so the notebook displays the score.
f1_score(
    y_test[:size],
    predicted_labels[:size],
    average="weighted",
)

prediction count 65007, type = <class 'numpy.ndarray'>


0.914404689647462

In [16]:
# To analyse the original model instead, switch model_id to "sherlock":
#model_id = "sherlock"

# Class names saved alongside the model with this ID.
# NOTE(review): allow_pickle=True unpickles the file's contents -- only load
# class files that come from a trusted source.
classes = np.load(f"../model_files/classes_{model_id}.npy", allow_pickle=True)

report = classification_report(y_test, predicted_labels, output_dict=True)

# Keep only the report entries that are per-class metric dicts for a known
# class; this drops the aggregate rows of the report.
per_class_rows = [
    (label, metrics)
    for label, metrics in report.items()
    if isinstance(metrics, dict) and 'f1-score' in metrics and label in classes
]

# Best-scoring classes first.
class_scores = sorted(
    per_class_rows,
    key=lambda row: row[1]['f1-score'],
    reverse=True,
)

### Top 5 Types

In [17]:
# Header row; the columns line up with the values printed per class below.
print("\t\tf1-score\tprecision\trecall\t\tsupport")

# class_scores is sorted by descending f1-score, so the first five are the best.
for key, value in class_scores[:5]:
    # Labels shorter than 8 characters need a second tab to reach the first column.
    tabs = '\t' if len(key) >= 8 else '\t\t'

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
industry	0.987		0.987		0.986		2958
birth date	0.979		0.975		0.983		479
sex		0.971		0.978		0.965		2997
symbol		0.968		0.960		0.975		1752
year		0.964		0.979		0.949		3015


### Bottom 5 Types

In [18]:
# Header row; the columns line up with the values printed per class below.
print("\t\tf1-score\tprecision\trecall\t\tsupport")

# class_scores is sorted by descending f1-score, so the last five are the worst.
# Slice with `[-5:]` rather than `[len(class_scores)-5:len(class_scores)]`:
# with fewer than five classes the computed start index goes negative, wraps
# around, and silently drops rows from the table.
for key, value in class_scores[-5:]:
    # Labels shorter than 8 characters need a second tab to reach the first column.
    tabs = '\t' if len(key) >= 8 else '\t\t'

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
range		0.803		0.906		0.721		577
nationality	0.769		0.838		0.710		424
rank		0.765		0.723		0.813		2983
person		0.705		0.908		0.577		579
sales		0.411		0.776		0.280		322


### All Scores

In [19]:
# Plain-text report for every class, with metrics rounded to three decimals.
full_report = classification_report(y_test, predicted_labels, digits=3)
print(full_report)

              precision    recall  f1-score   support

     address      0.954     0.965     0.959      3003
         age      0.882     0.967     0.922      3033
        area      0.927     0.824     0.872      1987
  birth date      0.975     0.983     0.979       479
 birth place      0.992     0.928     0.959       418
       brand      0.891     0.737     0.806       574
        city      0.881     0.935     0.907      2966
   continent      0.885     0.885     0.885       227
     country      0.912     0.961     0.936      3038
      county      0.955     0.955     0.955      2959
    currency      0.972     0.953     0.963       405
         day      0.930     0.888     0.909      3038
    duration      0.944     0.947     0.946      3000
    industry      0.987     0.986     0.987      2958
    language      0.870     0.966     0.916      1474
    location      0.936     0.835     0.883      2949
manufacturer      0.878     0.920     0.898       945
        name      0.864    

## Review errors

In [20]:
# Expected labels whose individual misclassifications are printed below
# (use the printed index in the next step to look at the raw values).
# This has to be a real tuple: `('address')` is just the string 'address',
# and `in` on a string is a substring test, so labels such as 'a' or 'dress'
# would match as well.
labels_to_inspect = ('address',)

size = len(y_test)
mismatches = list()

for idx, k1 in enumerate(y_test[:size]):
    k2 = predicted_labels[idx]

    if k1 != k2:
        # Record the expected label of every misclassified sample.
        mismatches.append(k1)

        # zoom in to specific errors. Use the index in the next step
        if k1 in labels_to_inspect:
            print(f'[{idx}] expected "{k1}" but predicted "{k2}"')

f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Misclassification count per expected label, most frequent first.
data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

[1187] expected "address" but predicted "day"
[1201] expected "address" but predicted "city"
[1585] expected "address" but predicted "city"
[2256] expected "address" but predicted "name"
[2303] expected "address" but predicted "area"
[2393] expected "address" but predicted "location"
[3218] expected "address" but predicted "location"
[4640] expected "address" but predicted "city"
[4787] expected "address" but predicted "duration"
[5261] expected "address" but predicted "city"
[5685] expected "address" but predicted "city"
[5729] expected "address" but predicted "location"
[6327] expected "address" but predicted "manufacturer"
[7404] expected "address" but predicted "city"
[9532] expected "address" but predicted "manufacturer"
[9920] expected "address" but predicted "language"
[10188] expected "address" but predicted "product"
[11837] expected "address" but predicted "day"
[12608] expected "address" but predicted "rank"
[12829] expected "address" but predicted "name"
[14477] expected "a

[('rank', 559),
 ('region', 502),
 ('location', 487),
 ('area', 350),
 ('day', 339),
 ('name', 269),
 ('person', 245),
 ('sales', 232),
 ('type', 194),
 ('city', 193),
 ('range', 161),
 ('duration', 158),
 ('year', 154),
 ('brand', 151),
 ('product', 144),
 ('county', 133),
 ('state', 132),
 ('order', 128),
 ('nationality', 123),
 ('country', 119),
 ('status', 117),
 ('address', 105),
 ('sex', 104),
 ('age', 99),
 ('manufacturer', 76),
 ('language', 50),
 ('symbol', 43),
 ('industry', 40),
 ('birth place', 30),
 ('continent', 26),
 ('currency', 19),
 ('birth date', 8)]

In [21]:
# Raw test-set values, loaded so that individual samples can be inspected by index.
test_samples = pd.read_parquet('../data/data/raw/test_values.parquet')

In [27]:
# Position of the test sample to inspect (e.g. an index printed by the error review above).
idx = 200

# The raw values are stored as string literals; parse each one back into a Python object.
original = test_samples.iloc[idx]
converted = [literal_eval(raw_value) for raw_value in original]

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

Predicted "duration", actual label "duration". Actual values:
[['5-15 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', '5-15 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', '5-15 minutes', 'less than 5 minutes', 'less than 5 minutes', '5-15 minutes', '5-15 minutes', '5-15 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', '5-15 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less than 5 minutes', '16-30 minutes', 'less than 5 minutes', '16-30 minutes', 'less than 5 minutes', '5-15 minutes', 'less than 5 minutes', '5-15 minutes', '5-15 minutes', 'less than 5 minutes', 'less than 5 minutes', 'less 