In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Identifier of the retrained model: passed as `nn_id` to train_sherlock / predict_sherlock
# and interpolated into the classes file name (`classes_{nn_model_id}.npy`) further down.
nn_model_id = 'retrained_sherlock'

In [3]:
import pandas as pd
import numpy as np

from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock
from datetime import datetime
from sklearn.metrics import f1_score, classification_report

### Load training and validation sets

In [4]:
# Load the training features and labels, reporting how long the load takes.
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('train.parquet')

# Flatten the label frame's values to a 1-D array, then lower-case every label.
train_labels = pd.read_parquet('../data/raw/train_labels.parquet').values.flatten()
y_train = np.array([label.lower() for label in train_labels])

# A timedelta renders as H:MM:SS.ffffff, so label it that way instead of as "seconds".
elapsed = datetime.now() - start
print(f'Load data (train) process took {elapsed} (h:mm:ss).')

Started at 2022-02-07 14:19:57.111717
Load data (train) process took 0:00:04.862089 seconds.


In [5]:
# Sanity check: show the set of distinct column dtypes found in the training features.
print('Distinct types for columns in the Dataframe (should be all float32):')
distinct_dtypes = set(X_train.dtypes)
print(distinct_dtypes)

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [6]:
# Load the validation features and labels, reporting how long the load takes.
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('validation.parquet')

# Flatten the label frame's values to a 1-D array, then lower-case every label.
validation_labels = pd.read_parquet('../data/raw/val_labels.parquet').values.flatten()
y_validation = np.array([label.lower() for label in validation_labels])

# A timedelta renders as H:MM:SS.ffffff, so label it that way instead of as "seconds".
elapsed = datetime.now() - start
print(f'Load data (validation) process took {elapsed} (h:mm:ss).')

Started at 2022-02-07 14:20:02.237587
Load data (validation) process took 0:00:01.877156 seconds.


### Re-train Sherlock

In [7]:
# Retrain Sherlock with the training and validation sets under the id `nn_model_id`.
start = datetime.now()
print(f'Started at {start}')

train_sherlock(X_train, y_train, X_validation, y_validation, nn_id=nn_model_id)

print('Trained and saved new model.')

# Take a single timestamp so the reported finish time and duration agree with each other.
# The duration is a timedelta (H:MM:SS.ffffff), so it is labelled as such, not as "seconds".
finished = datetime.now()
print(f'Finished at {finished}, took {finished - start} (h:mm:ss)')

Started at 2022-02-07 14:20:04.187351


W0207 14:20:04.670890 4557827584 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project-1/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0207 14:20:04.672062 4557827584 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project-1/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0207 14:20:04.675765 4557827584 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project-1/venv/lib/python3.7/s

Successfully loaded and compiled model, now fitting model on data.
Train on 412059 samples, validate on 137353 samples


2022-02-07 14:20:08.798394: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-02-07 14:20:08.845623: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f973e4dbe30 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-02-07 14:20:08.845638: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100


Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Retrained Sherlock.
Trained and saved new model.
Finished at 2022-02-07 15:17:45.203004, took 0:57:41.015671 seconds


### Load test set

In [8]:
# Load the test features and labels, reporting how long the load takes.
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('test.parquet')

# Flatten the label frame's values to a 1-D array, then lower-case every label.
test_labels = pd.read_parquet('../data/raw/test_labels.parquet').values.flatten()
y_test = np.array([label.lower() for label in test_labels])

# This cell only loads data; the previous message ("Trained and saved new model.")
# was a copy-paste leftover from the training cell.
print('Loaded test data.')

# Single timestamp keeps finish time and duration consistent; the duration is a
# timedelta (H:MM:SS.ffffff), so it is labelled as such rather than as "seconds".
finished = datetime.now()
print(f'Finished at {finished}, took {finished - start} (h:mm:ss)')

Started at 2022-02-07 15:17:45.266596
Trained and saved new model.
Finished at 2022-02-07 15:17:47.026177, took 0:00:01.759595 seconds


### Make prediction

In [9]:
# Run prediction on the test features, using the same model id that was passed to train_sherlock.
predicted_labels = predict_sherlock(X_test, nn_id=nn_model_id)

In [10]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

# Score over the first len(y_test) entries of both arrays.
size = len(y_test)
expected_labels = y_test[:size]
scored_predictions = predicted_labels[:size]

# Should be fully deterministic too.
# Left as the final expression so the notebook displays the weighted F1 score.
f1_score(expected_labels, scored_predictions, average="weighted")

prediction count 137353, type = <class 'numpy.ndarray'>


0.8940572197723697

In [11]:
# Best results
# 0.8956975031011328, adam optimiser, took 0:59:19.320310 seconds (retrained_sherlock8)

# 0.8871638630500646, SGD Nesterov optimiser,  took 1:37:09.679227 seconds (retrained_sherlock9)

# LAST
# 0.8941992181577104
# 0.8661276466591463
# 0.8765360861865846
# 0.8777825108666634
# 0.8709458033528152
# 0.8526528126158138
# 0.836800124917518
# 0.8590214693450329
# 0.8583626471084315

In [12]:
# Class names stored alongside the model with this id.
classes = np.load(f"../sherlock/deploy/classes_{nn_model_id}.npy", allow_pickle=True)

# Per-label metrics as a dict so they can be filtered and ranked below.
report = classification_report(y_test, predicted_labels, output_dict=True)

# Keep only entries that are metric dicts for a known class; this drops any
# report entry whose value is not a dict or whose key is not in `classes`.
class_scores = [
    (label, metrics)
    for label, metrics in report.items()
    if isinstance(metrics, dict) and 'f1-score' in metrics and label in classes
]

# Rank classes from best to worst f1-score.
class_scores.sort(key=lambda entry: entry[1]['f1-score'], reverse=True)

### Top 5 Types

In [13]:
# Header row, then one row per class for the first five entries of `class_scores`
# (sorted by descending f1-score when it was built).
print("\t\tf1-score\tprecision\trecall\t\tsupport")

for label, scores in class_scores[:5]:
    # Longer labels get one tab fewer so the columns line up (assumes 8-column tab stops).
    padding = '\t' if len(label) >= 8 else '\t\t'

    print(f"{label}{padding}{scores['f1-score']:.3f}\t\t{scores['precision']:.3f}\t\t{scores['recall']:.3f}\t\t{scores['support']}")

		f1-score	precision	recall		support
grades		0.992		0.991		0.993		1765
isbn		0.990		0.992		0.988		1430
jockey		0.986		0.981		0.990		2819
industry	0.982		0.979		0.985		2958
birth date	0.978		0.981		0.975		479


### Bottom 5 Types

In [14]:
# Header row, then one row per class for the last five entries of `class_scores`
# (sorted by descending f1-score when it was built, so these are the weakest classes).
print("\t\tf1-score\tprecision\trecall\t\tsupport")

# `[-5:]` also behaves correctly with fewer than five classes; the previous
# `[len(class_scores)-5:len(class_scores)]` produced a negative start index in that
# case and silently dropped rows (e.g. 3 entries -> `[-2:3]` -> only 2 rows).
for key, value in class_scores[-5:]:
    # Longer labels get one tab fewer so the columns line up (assumes 8-column tab stops).
    tabs = '\t' if len(key) >= 8 else '\t\t'

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
rank		0.715		0.691		0.741		2983
person		0.643		0.625		0.661		579
sales		0.562		0.689		0.475		322
director	0.562		0.584		0.542		225
ranking		0.469		0.823		0.328		439


### All Scores

In [15]:
# Print the full text classification report for the test set, with three decimal places (digits=3).
print(classification_report(y_test, predicted_labels, digits=3))

                precision    recall  f1-score   support

       address      0.940     0.941     0.941      3003
     affiliate      0.976     0.794     0.876       204
   affiliation      0.975     0.956     0.965      1768
           age      0.852     0.961     0.903      3033
         album      0.879     0.898     0.889      3035
          area      0.869     0.823     0.846      1987
        artist      0.813     0.874     0.842      3043
    birth date      0.981     0.975     0.978       479
   birth place      0.959     0.907     0.932       418
         brand      0.774     0.706     0.738       574
      capacity      0.840     0.713     0.771       362
      category      0.909     0.896     0.903      3087
          city      0.846     0.908     0.876      2966
         class      0.901     0.917     0.909      2971
classification      0.938     0.852     0.893       587
          club      0.969     0.960     0.965      2977
          code      0.920     0.908     0.914  

## Review errors

In [16]:
from collections import Counter

# Expected labels whose individual mismatches are printed; edit this to zoom in on
# other types. It must be a real tuple: the previous `k1 in ('address')` tested
# against the *string* 'address', i.e. a substring check that would also match
# labels such as 'add' or 'dress'.
labels_to_inspect = ('address',)

size = len(y_test)
mismatches = list()

for idx, k1 in enumerate(y_test[:size]):
    k2 = predicted_labels[idx]

    if k1 != k2:
        # Record the expected label of every wrong prediction.
        mismatches.append(k1)

        # zoom in to specific errors. Use the index in the next step
        if k1 in labels_to_inspect:
            print(f'[{idx}] expected "{k1}" but predicted "{k2}"')

f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Mismatch count per expected label, most frequent first (displayed as the cell output).
data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

[1116] expected "address" but predicted "name"
[2420] expected "address" but predicted "location"
[2616] expected "address" but predicted "location"
[3398] expected "address" but predicted "city"
[4354] expected "address" but predicted "location"
[5112] expected "address" but predicted "location"
[7119] expected "address" but predicted "city"
[8797] expected "address" but predicted "location"
[9574] expected "address" but predicted "location"
[9806] expected "address" but predicted "city"
[10035] expected "address" but predicted "creator"
[10067] expected "address" but predicted "family"
[10665] expected "address" but predicted "area"
[11055] expected "address" but predicted "county"
[11902] expected "address" but predicted "team"
[11993] expected "address" but predicted "location"
[12072] expected "address" but predicted "city"
[12639] expected "address" but predicted "location"
[14677] expected "address" but predicted "location"
[15232] expected "address" but predicted "city"
[15461]

[137027] expected "address" but predicted "location"
Total mismatches: 14493 (F1 score: 0.8940572197723697)


[('name', 891),
 ('rank', 772),
 ('region', 509),
 ('location', 496),
 ('position', 478),
 ('team', 409),
 ('notes', 398),
 ('artist', 384),
 ('description', 384),
 ('area', 351),
 ('type', 329),
 ('category', 321),
 ('gender', 312),
 ('product', 309),
 ('album', 309),
 ('company', 307),
 ('day', 306),
 ('ranking', 295),
 ('team name', 281),
 ('city', 272),
 ('code', 271),
 ('class', 246),
 ('status', 218),
 ('owner', 208),
 ('year', 201),
 ('weight', 197),
 ('person', 196),
 ('manufacturer', 195),
 ('country', 184),
 ('credit', 177),
 ('address', 177),
 ('service', 177),
 ('result', 176),
 ('brand', 169),
 ('sales', 169),
 ('duration', 165),
 ('sex', 160),
 ('origin', 155),
 ('range', 149),
 ('component', 143),
 ('order', 141),
 ('format', 131),
 ('state', 130),
 ('plays', 126),
 ('county', 119),
 ('club', 119),
 ('age', 118),
 ('nationality', 107),
 ('capacity', 104),
 ('director', 103),
 ('symbol', 91),
 ('command', 88),
 ('classification', 87),
 ('family', 83),
 ('publisher', 79),


In [17]:
# Load the raw test values so that individual samples can be looked up by row position below.
test_samples = pd.read_parquet('../data/raw/test_values.parquet')

In [18]:
from ast import literal_eval

# Row position of the sample to inspect (e.g. an index printed by the mismatch listing).
idx = 1001

# Take the row, parse each stored value with literal_eval, and collect the results in a list.
original = test_samples.iloc[idx]
parsed_values = original.apply(literal_eval)
converted = parsed_values.to_list()

predicted, actual = predicted_labels[idx], y_test[idx]
print(f'Predicted "{predicted}", actual label "{actual}". Actual values:\n{converted}')

Predicted "address", actual label "address". Actual values:
[['Cabot House', 'Cabot House', '5 Hill Rd.', '5 Hill Rd.', '9 Cabot Rd.', '9 Cabot Rd.', 'Cabot House', '22 Bank Rd.', '22 Bank Rd.', 'Cabot House', '31 Bank Rd.', '31 Bank Rd.', 'Bairds Hotel', '11 Cabot Rd.', '11 Cabot Rd.', '10 Hill Rd.', '10 Hill Rd.', '10 Hill Rd.', '10 Hill Rd.', '7A Church Rd.', '1 Cabot Rd.', '1 Cabot Rd.', '1 Cabot Rd.', '1 Cabot Rd.', '2 Coronation St.', '2 Coronation St.', '7A Church Rd.', '12 Hill Rd.', '12 Hill Rd.', '12 Hill Rd.', 'Cabot House', '19 Bank Rd.', '19 Bank Rd.', '19 Bank Rd.', '19 Bank Rd.', '19 Bank Rd.', '7A Church Rd.', '18 Mill Rd.', '17 Hill Rd.', '17 Hill Rd.', 'Cabot House', 'Cabot House', '25 Bank Rd.', '10 Coronation St.', '6 Cabot Rd.', '6 Cabot Rd.', '8 Hill Rd.', '8 Hill Rd.', '4 Mill Rd.', '4 Mill Rd.', '12 Sulva Rd.', '4 Haig Rd.', '13 Botwood Rd.', '13 Botwood Rd.', '8 Botwood Rd.', '8 Botwood Rd.', '16 Botwood Rd.', '16 Botwood Rd.', '16 Botwood Rd.', '16 Botwood Rd.