# This notebook enables training and testing of Sherlock.
The procedure is:
- Load train, val, test datasets (should be preprocessed)
- Initialize model using the "pretrained" model or by training one from scratch.
- Evaluate and analyse the model predictions.

In [1]:
# Export a fixed hash seed into the environment.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it
# from inside an already running kernel presumably only affects child processes —
# confirm whether the kernel itself needs to be launched with this variable set.
%env PYTHONHASHSEED=13
# Load the autoreload extension and reload imported modules before each cell runs,
# so edits to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

env: PYTHONHASHSEED=13


In [2]:
# ID under which the retrained model is referenced throughout this notebook.
# Further down, predictions can also be made with the original model by
# using the ID "sherlock" instead.
model_id = 'retrained_sherlock'

In [3]:
from ast import literal_eval
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, classification_report

from sherlock.deploy.model import SherlockModel

## Load datasets for training, validation, testing

In [4]:
# Time the loading of the training features and labels.
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('../data/data/processed/train.parquet')

# Read the raw labels, flatten them to 1-D and lower-case every label.
y_train = np.array(
    [
        label.lower()
        for label in pd.read_parquet('../data/data/raw/train_labels.parquet').values.flatten()
    ]
)

print(f'Load data (train) process took {datetime.now() - start} seconds.')

Started at 2022-11-14 10:42:52.409883
Load data (train) process took 0:00:01.976014 seconds.


In [5]:
# Number of distinct (lower-cased) labels present in the training set.
np.unique(y_train).size

39

In [6]:
# Show the set of column dtypes found in the training features.
print(
    'Distinct types for columns in the Dataframe (should be all float32):',
    set(X_train.dtypes),
    sep='\n',
)

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [7]:
# Time the loading of the validation features and labels.
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('../data/data/processed/validation.parquet')

# Read the raw labels, flatten them to 1-D and lower-case every label.
y_validation = np.array(
    [
        label.lower()
        for label in pd.read_parquet('../data/data/raw/val_labels.parquet').values.flatten()
    ]
)

print(f'Load data (validation) process took {datetime.now() - start} seconds.')

Started at 2022-11-14 10:42:54.583338
Load data (validation) process took 0:00:00.152510 seconds.


In [8]:
# Time the loading of the test features and labels.
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('../data/data/processed/test.parquet')
y_test = pd.read_parquet('../data/data/raw/test_labels.parquet').values.flatten()

# Lower-case every label so comparisons with predictions are case-insensitive.
y_test = np.array([x.lower() for x in y_test])

# Take the end timestamp once so the reported finish time and the reported
# duration refer to the same instant (two separate now() calls drift apart).
finished = datetime.now()
print(f'Finished at {finished}, took {finished - start} seconds')

Started at 2022-11-14 10:42:54.778632
Finished at 2022-11-14 10:42:55.163904, took 0:00:00.385281 seconds


## Initialize the model
Two options:
- Load Sherlock model with pretrained weights
- Fit Sherlock model from scratch

In [9]:
# Re-states the model ID already assigned in cell In [2] (same value).
model_id = "retrained_sherlock"

In [10]:
# Try to load a previously stored model (architecture + weights) for `model_id`;
# if that fails, train a new model from the loaded train/validation data.
model = SherlockModel()
try:
    model.initialize_model_from_json(with_weights=True, model_id=model_id)
except Exception as load_error:
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate, so interrupting the kernel does not trigger a retrain.
    # The reason for the fallback is printed instead of being silently dropped.
    print(f'Could not load model "{model_id}" ({load_error!r}); training from scratch.')

    start = datetime.now()
    print(f'Started at {start}')
    # Model will be stored with ID `model_id`
    model.fit(X_train, y_train, X_validation, y_validation, model_id=model_id)

    # Single end timestamp: finish time and duration refer to the same instant
    # and cover the training step only.
    finished = datetime.now()

    # Persist the weights before announcing success, so the message below is
    # only printed once saving has actually completed.
    model.store_weights(model_id=model_id)

    print('Trained and saved new model.')
    print(f'Finished at {finished}, took {finished - start} seconds')

Started at 2022-11-14 10:42:55.235803


2022-11-14 10:42:56.199924: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-14 10:42:56.224990: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
  super(Adam, self).__init__(name, **kwargs)


Epoch 1/10000


W1114 10:42:57.830766 46912499975424 ag_logging.py:142] AutoGraph could not transform <function Model.make_train_function.<locals>.train_function at 0x2aab559bf940> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'

W1114 10:43:04.295014 46912499975424 ag_logging.py:142] AutoGraph could not transform <function Model.make_test_function.<locals>.test_function at 0x2aab571bddc0> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Trained and saved new model.
Finished at 2022-11-14 10:45:50.698073, took 0:02:55.462283 seconds


In [11]:
# Start from an empty prediction list; it is filled by the prediction cell below.
predicted_labels = list()

### Make predictions

In [12]:
# Predict a label for every test column and lower-case the results so they
# can be compared directly with the lower-cased `y_test`.
predicted_labels = np.array(
    [label.lower() for label in model.predict(X_test, model_id)]
)

W1114 10:45:52.750897 46912499975424 ag_logging.py:142] AutoGraph could not transform <function Model.make_predict_function.<locals>.predict_function at 0x2aab56925790> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'


In [13]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

# Number of leading samples to score; here this is the whole test set.
size = len(y_test)

# The score is expected to be fully reproducible between runs.
f1_score(
    y_true=y_test[:size],
    y_pred=predicted_labels[:size],
    average="weighted",
)

prediction count 12524, type = <class 'numpy.ndarray'>


0.9064457021902538

In [14]:
# When evaluating the original model, set model_id to "sherlock" first:
#model_id = "sherlock"
# NOTE(review): allow_pickle=True unpickles the file's contents; only load
# class files that come from a trusted source.
classes = np.load(f"../model_files/classes_{model_id}.npy", allow_pickle=True)

report = classification_report(y_test, predicted_labels, output_dict=True)

# Keep only the report entries that are per-class metric dicts for a known
# class; summary rows are dropped because their key is not in `classes` or
# their value is not a dict.
class_scores = [
    (label, metrics)
    for label, metrics in report.items()
    if isinstance(metrics, dict) and 'f1-score' in metrics and label in classes
]

# Order from best to worst f1-score (stable sort, so ties keep report order).
class_scores.sort(key=lambda entry: entry[1]['f1-score'], reverse=True)

### Top 5 Types

In [15]:
# Header row of the tab-separated table.
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

# `class_scores` is sorted best-first, so the first five entries are the top types.
for label, metrics in class_scores[:5]:
    # Short labels get an extra tab so the first metric column lines up.
    padding = '\t' if len(label) >= 8 else '\t\t'

    columns = [
        f"{metrics['f1-score']:.3f}",
        f"{metrics['precision']:.3f}",
        f"{metrics['recall']:.3f}",
        f"{metrics['support']}",
    ]
    print(label + padding + '\t\t'.join(columns))

		f1-score	precision	recall		support
phone number	1.000		1.000		1.000		300
credit card account numbers	0.999		0.998		1.000		585
lei		0.995		0.991		1.000		110
birth date	0.993		1.000		0.986		72
currency	0.984		0.984		0.984		61


### Bottom 5 Types

In [16]:
# Header row of the tab-separated table.
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

# `class_scores` is sorted best-first, so the tail holds the weakest types.
# `[-5:]` is used instead of `[len(class_scores)-5:len(class_scores)]`: with
# fewer than five entries the explicit start index becomes negative and is
# interpreted relative to the end, which silently drops rows.
for key, value in class_scores[-5:]:
    # Short labels get an extra tab so the first metric column lines up.
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
person		0.649		0.766		0.563		87
fips code	0.543		0.759		0.423		97
naic		0.500		0.917		0.344		64
sales		0.466		0.680		0.354		48
mcc code	0.000		0.000		0.000		29


### All Scores

In [17]:
# Print the complete per-class report with three decimal places.
print(
    classification_report(
        y_true=y_test,
        y_pred=predicted_labels,
        digits=3,
    )
)

                             precision    recall  f1-score   support

                    address      0.947     0.956     0.951       450
                        age      0.882     0.967     0.922       455
                       area      0.882     0.779     0.827       298
                 birth date      1.000     0.986     0.993        72
                birth place      1.000     0.873     0.932        63
                      brand      0.727     0.744     0.736        86
                       city      0.869     0.928     0.898       445
                  continent      0.769     0.882     0.822        34
                    country      0.940     0.921     0.930       456
                     county      0.938     0.962     0.950       444
credit card account numbers      0.998     1.000     0.999       585
                   currency      0.984     0.984     0.984        61
                        day      0.880     0.886     0.883       456
                   duration      

## Review errors

In [18]:
# Collect the expected label of every misclassified test sample and print the
# individual errors for a chosen set of labels.
size = len(y_test)
mismatches = list()

# Expected labels whose individual errors should be printed. This must be a
# tuple (note the trailing comma): `('address')` is just the string 'address',
# and `k1 in 'address'` would be a substring test rather than a membership test.
labels_to_inspect = ('address',)

for idx, k1 in enumerate(y_test[:size]):
    k2 = predicted_labels[idx]

    if k1 != k2:
        mismatches.append(k1)

        # zoom in to specific errors. Use the index in the next step
        if k1 in labels_to_inspect:
            print(f'[{idx}] expected "{k1}" but predicted "{k2}"')

f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Count the mismatches per expected label.
data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

[33] expected "address" but predicted "name"
[47] expected "address" but predicted "location"
[86] expected "address" but predicted "city"
[100] expected "address" but predicted "day"
[101] expected "address" but predicted "area"
[103] expected "address" but predicted "location"
[121] expected "address" but predicted "range"
[135] expected "address" but predicted "county"
[161] expected "address" but predicted "region"
[175] expected "address" but predicted "name"
[181] expected "address" but predicted "type"
[226] expected "address" but predicted "area"
[270] expected "address" but predicted "name"
[277] expected "address" but predicted "language"
[293] expected "address" but predicted "name"
[316] expected "address" but predicted "location"
[368] expected "address" but predicted "year"
[375] expected "address" but predicted "duration"
[402] expected "address" but predicted "product"
[415] expected "address" but predicted "name"
Total mismatches: 1130 (F1 score: 0.9064457021902538)


[('rank', 111),
 ('location', 89),
 ('region', 70),
 ('area', 66),
 ('name', 59),
 ('fips code', 56),
 ('day', 52),
 ('product', 47),
 ('naic', 42),
 ('type', 40),
 ('person', 38),
 ('country', 36),
 ('status', 35),
 ('city', 32),
 ('sales', 31),
 ('duration', 30),
 ('range', 29),
 ('mcc code', 29),
 ('state', 27),
 ('year', 25),
 ('manufacturer', 23),
 ('brand', 22),
 ('address', 20),
 ('county', 17),
 ('order', 16),
 ('sex', 16),
 ('age', 15),
 ('zip code', 14),
 ('language', 11),
 ('birth place', 8),
 ('symbol', 7),
 ('nationality', 6),
 ('industry', 5),
 ('continent', 4),
 ('birth date', 1),
 ('currency', 1)]

In [19]:
# Load the raw test values so individual samples can be inspected by index below.
test_samples = pd.read_parquet('../data/data/raw/test_values.parquet')

In [20]:
# Position of the test sample to inspect (e.g. an index printed by the error review).
idx = 57

# Row of raw values for that sample; each cell holds a Python literal as text.
original = test_samples.iloc[idx]

# Parse every cell of the row back into a Python object.
converted = [literal_eval(cell) for cell in original]

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

Predicted "address", actual label "address". Actual values:
[['Gillette, WY', 'Gillette, WY', '1720 W Warlow Drive, Gillette, WY', '2675 Ledoux Avenue, Gillette, WY', '2501 Ledoux Avenue, Gillette, WY', '4500 Running W Drive, Gillette, WY', '4500 Running W Drive, Gillette, WY']]


In [21]:
# Display all (label, metrics) pairs, ordered from highest to lowest f1-score.
class_scores

[('phone number',
  {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 300}),
 ('credit card account numbers',
  {'precision': 0.9982935153583617,
   'recall': 1.0,
   'f1-score': 0.9991460290350128,
   'support': 585}),
 ('lei',
  {'precision': 0.990990990990991,
   'recall': 1.0,
   'f1-score': 0.9954751131221719,
   'support': 110}),
 ('birth date',
  {'precision': 1.0,
   'recall': 0.9861111111111112,
   'f1-score': 0.993006993006993,
   'support': 72}),
 ('currency',
  {'precision': 0.9836065573770492,
   'recall': 0.9836065573770492,
   'f1-score': 0.9836065573770492,
   'support': 61}),
 ('industry',
  {'precision': 0.9733924611973392,
   'recall': 0.9887387387387387,
   'f1-score': 0.9810055865921787,
   'support': 444}),
 ('sex',
  {'precision': 0.9774774774774775,
   'recall': 0.9644444444444444,
   'f1-score': 0.970917225950783,
   'support': 450}),
 ('symbol',
  {'precision': 0.9481481481481482,
   'recall': 0.973384030418251,
   'f1-score': 0.9606003752345217,
 