In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Identifier passed as `nn_id` to the training and prediction calls in this
# notebook, and used to build the path of the matching classes_<id>.npy file.
nn_model_id = "retrained_sherlock"

In [3]:
from ast import literal_eval
from collections import Counter
from datetime import datetime

import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, classification_report

from sherlock.deploy.predict_sherlock import predict_sherlock
from sherlock.deploy.train_sherlock import train_sherlock

### Load training and validation sets

In [5]:
# Time the load so the cost of reading the training split is visible.
start = datetime.now()
print(f'Started at {start}')

# Features and labels live in separate parquet files.
X_train = pd.read_parquet('../data/data/processed/train.parquet')
train_labels_raw = pd.read_parquet('../data/data/raw/train_labels.parquet').values.flatten()

# Normalise the labels to lower case.
y_train = np.array([label.lower() for label in train_labels_raw])

print(f'Load data (train) process took {datetime.now() - start} seconds.')

Started at 2022-02-09 20:49:56.748214
Load data (train) process took 0:00:07.648936 seconds.


In [6]:
# Sanity check: collect the set of column dtypes present in the training frame.
distinct_dtypes = set(X_train.dtypes)

print('Distinct types for columns in the Dataframe (should be all float32):')
print(distinct_dtypes)

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [8]:
# Time the load so the cost of reading the validation split is visible.
start = datetime.now()
print(f'Started at {start}')

# Features and labels live in separate parquet files.
X_validation = pd.read_parquet('../data/data/processed/validation.parquet')
validation_labels_raw = pd.read_parquet('../data/data/raw/val_labels.parquet').values.flatten()

# Normalise the labels to lower case, same as for the training labels.
y_validation = np.array([label.lower() for label in validation_labels_raw])

print(f'Load data (validation) process took {datetime.now() - start} seconds.')

Started at 2022-02-09 20:50:33.639660
Load data (validation) process took 0:00:01.923424 seconds.


### Re-train Sherlock

In [9]:
# Re-train the model on the training split, passing the validation split
# alongside it. This is the expensive step of the notebook: the recorded run
# took roughly 50 minutes.
start = datetime.now()
print(f'Started at {start}')

# NOTE(review): presumably train_sherlock persists the trained model under
# `nn_model_id` so the prediction step can load it -- confirm in
# sherlock.deploy.train_sherlock.
train_sherlock(X_train, y_train, X_validation, y_validation, nn_id=nn_model_id)

# Read the clock once so the reported finish time and the reported duration
# are derived from the same instant.
finished = datetime.now()

print('Trained and saved new model.')
print(f'Finished at {finished}, took {finished - start} seconds')

Started at 2022-02-09 20:50:38.331927


W0209 20:50:38.942614 4774411776 deprecation.py:506] From /Users/madelon/miniconda3/envs/sherlock-project/lib/python3.6/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0209 20:50:38.945662 4774411776 deprecation.py:506] From /Users/madelon/miniconda3/envs/sherlock-project/lib/python3.6/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0209 20:50:38.953925 4774411776 deprecation.py:506] From /Users/madelon/miniconda3/envs/sherlock-project/lib/python3.6/site-packages/tensorflow_core/python/

Successfully loaded and compiled model, now fitting model on data.
Train on 412059 samples, validate on 137353 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Retrained Sherlock.
Trained and saved new model.
Finished at 2022-02-09 21:40:30.886113, took 0:49:52.554223 seconds


### Load test set

In [21]:
# Load the test split: processed features plus the raw labels.
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('../data/data/processed/test.parquet')
y_test = pd.read_parquet('../data/data/raw/test_labels.parquet').values.flatten()

# Normalise the labels to lower case, same as for the train and validation labels.
y_test = np.array([x.lower() for x in y_test])

# Same timing message format as the train/validation load cells; this cell only
# loads data, it does not train or save a model.
print(f'Load data (test) process took {datetime.now() - start} seconds.')

Started at 2022-02-09 21:41:39.388852
Trained and saved new model.
Finished at 2022-02-09 21:41:40.845056, took 0:00:01.456216 seconds


### Make predictions

In [22]:
# Predict a label for every row of the test features.
# NOTE(review): presumably this loads the model stored under `nn_model_id` by
# the training step -- confirm in sherlock.deploy.predict_sherlock.
predicted_labels = predict_sherlock(
    X_test,
    nn_id=nn_model_id,
)

In [12]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

# Number of leading samples to score; currently the whole test set.
size = len(y_test)
expected = y_test[:size]
predicted = predicted_labels[:size]

# Should be fully deterministic too.
f1_score(expected, predicted, average="weighted")

prediction count 137353, type = <class 'numpy.ndarray'>


0.8933795716268179

In [23]:
# Best results
# (NOTE(review): presumably weighted F1 scores on the test set, as computed by
#  the f1_score cell in this notebook -- confirm.)
# 0.8956975031011328, adam optimiser, took 0:59:19.320310 seconds (retrained_sherlock8)

# 0.8871638630500646, SGD Nesterov optimiser, took 1:37:09.679227 seconds (retrained_sherlock9)

# LAST
# (NOTE(review): scores from later runs; the configuration of each run is not
#  recorded here.)
# 0.8941992181577104
# 0.8661276466591463
# 0.8765360861865846
# 0.8777825108666634
# 0.8709458033528152
# 0.8526528126158138
# 0.836800124917518
# 0.8590214693450329
# 0.8583626471084315

In [24]:
# Class labels stored alongside the model with this id.
# NOTE: allow_pickle=True makes np.load unpickle object arrays, so only load
# class files from a trusted source.
classes = np.load(f"../sherlock/deploy/classes_{nn_model_id}.npy", allow_pickle=True)

report = classification_report(y_test, predicted_labels, output_dict=True)

# Keep only the per-class entries: the value must be a metrics dict with an
# 'f1-score' and the key must be one of the loaded class labels. Any other
# entry of the report is dropped.
class_scores = [
    (label, metrics)
    for label, metrics in report.items()
    if isinstance(metrics, dict) and 'f1-score' in metrics and label in classes
]

# Best-scoring classes first.
class_scores.sort(key=lambda entry: entry[1]['f1-score'], reverse=True)

### Top 5 Types

In [25]:
# Header row for the tab-separated table.
print("\t\tf1-score\tprecision\trecall\t\tsupport")

# class_scores is sorted best-first, so the first five entries are the top five.
for key, value in class_scores[:5]:
    # Labels of 8+ characters get one tab, shorter ones two, so the columns
    # line up (assuming 8-column tab stops).
    tabs = '\t' if len(key) >= 8 else '\t\t'

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
isbn		0.992		0.996		0.988		1430
grades		0.992		0.988		0.995		1765
jockey		0.988		0.987		0.988		2819
industry	0.981		0.976		0.986		2958
birth date	0.974		0.983		0.965		479


### Bottom 5 Types

In [26]:
# Header row for the tab-separated table.
print("\t\tf1-score\tprecision\trecall\t\tsupport")

# class_scores is sorted best-first, so the last five entries are the bottom
# five. Use [-5:] rather than an explicit `len(...) - 5` start: with fewer than
# five entries that start index goes negative and wraps around, silently
# dropping rows, whereas [-5:] simply returns everything available.
for key, value in class_scores[-5:]:
    # Labels of 8+ characters get one tab, shorter ones two, so the columns
    # line up (assuming 8-column tab stops).
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
rank		0.704		0.644		0.775		2983
person		0.675		0.742		0.620		579
sales		0.575		0.623		0.534		322
director	0.559		0.588		0.533		225
ranking		0.444		0.884		0.296		439


### All Scores

In [27]:
# Full per-class report as text, with metrics shown to three decimals.
print(
    classification_report(
        y_test,
        predicted_labels,
        digits=3,
    )
)

                precision    recall  f1-score   support

       address      0.942     0.934     0.938      3003
     affiliate      0.954     0.819     0.881       204
   affiliation      0.984     0.953     0.968      1768
           age      0.886     0.940     0.912      3033
         album      0.885     0.887     0.886      3035
          area      0.864     0.824     0.844      1987
        artist      0.787     0.883     0.832      3043
    birth date      0.983     0.965     0.974       479
   birth place      0.979     0.897     0.936       418
         brand      0.824     0.669     0.738       574
      capacity      0.823     0.707     0.761       362
      category      0.918     0.887     0.902      3087
          city      0.837     0.906     0.870      2966
         class      0.897     0.916     0.906      2971
classification      0.956     0.857     0.904       587
          club      0.973     0.956     0.965      2977
          code      0.938     0.894     0.916  

## Review errors

In [28]:
# Expected labels whose individual mismatches are printed for closer inspection.
# This must be a real tuple: ('address') without the trailing comma is just the
# string 'address', which turns the `in` test below into a substring match.
labels_to_inspect = ('address',)

size = len(y_test)
mismatches = list()

# Collect the expected label of every wrong prediction.
for idx, k1 in enumerate(y_test[:size]):
    k2 = predicted_labels[idx]

    if k1 != k2:
        mismatches.append(k1)

        # zoom in to specific errors. Use the index in the next step
        if k1 in labels_to_inspect:
            print(f'[{idx}] expected "{k1}" but predicted "{k2}"')

f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Mismatch count per expected label, most frequent first.
data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

[1116] expected "address" but predicted "name"
[1578] expected "address" but predicted "language"
[2420] expected "address" but predicted "location"
[2588] expected "address" but predicted "location"
[2616] expected "address" but predicted "city"
[3398] expected "address" but predicted "city"
[4354] expected "address" but predicted "location"
[4380] expected "address" but predicted "county"
[4422] expected "address" but predicted "city"
[5112] expected "address" but predicted "location"
[5546] expected "address" but predicted "name"
[7119] expected "address" but predicted "location"
[8797] expected "address" but predicted "location"
[9354] expected "address" but predicted "location"
[9574] expected "address" but predicted "location"
[9806] expected "address" but predicted "city"
[10067] expected "address" but predicted "rank"
[11055] expected "address" but predicted "city"
[11902] expected "address" but predicted "location"
[11993] expected "address" but predicted "location"
[12072] ex

Total mismatches: 14652 (F1 score: 0.8933795716268179)


[('name', 737),
 ('rank', 670),
 ('region', 499),
 ('location', 494),
 ('description', 488),
 ('position', 455),
 ('team', 451),
 ('type', 404),
 ('artist', 357),
 ('notes', 355),
 ('area', 349),
 ('category', 349),
 ('album', 344),
 ('code', 312),
 ('ranking', 309),
 ('company', 307),
 ('product', 307),
 ('city', 278),
 ('day', 273),
 ('gender', 265),
 ('team name', 255),
 ('order', 250),
 ('class', 250),
 ('owner', 228),
 ('weight', 227),
 ('person', 220),
 ('status', 209),
 ('address', 198),
 ('brand', 190),
 ('manufacturer', 185),
 ('result', 183),
 ('age', 182),
 ('service', 180),
 ('year', 177),
 ('sex', 176),
 ('credit', 174),
 ('country', 168),
 ('range', 157),
 ('origin', 155),
 ('sales', 150),
 ('component', 148),
 ('duration', 145),
 ('plays', 140),
 ('state', 137),
 ('club', 130),
 ('county', 126),
 ('format', 124),
 ('capacity', 106),
 ('director', 105),
 ('publisher', 97),
 ('nationality', 90),
 ('depth', 85),
 ('command', 84),
 ('classification', 84),
 ('affiliation', 83

In [29]:
# Raw test values, used to look at the data behind an individual prediction.
test_samples = pd.read_parquet(
    '../data/data/raw/test_values.parquet',
)

In [30]:
# Row position of the test sample to inspect, e.g. one of the indices printed
# by the error review.
idx = 1001

original = test_samples.iloc[idx]

# Parse each stored value with literal_eval to turn it back into Python objects.
# NOTE(review): presumably the values are stored as string reprs of lists -- confirm.
parsed = original.apply(literal_eval)
converted = parsed.to_list()

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

Predicted "address", actual label "address". Actual values:
[['Cabot House', 'Cabot House', '5 Hill Rd.', '5 Hill Rd.', '9 Cabot Rd.', '9 Cabot Rd.', 'Cabot House', '22 Bank Rd.', '22 Bank Rd.', 'Cabot House', '31 Bank Rd.', '31 Bank Rd.', 'Bairds Hotel', '11 Cabot Rd.', '11 Cabot Rd.', '10 Hill Rd.', '10 Hill Rd.', '10 Hill Rd.', '10 Hill Rd.', '7A Church Rd.', '1 Cabot Rd.', '1 Cabot Rd.', '1 Cabot Rd.', '1 Cabot Rd.', '2 Coronation St.', '2 Coronation St.', '7A Church Rd.', '12 Hill Rd.', '12 Hill Rd.', '12 Hill Rd.', 'Cabot House', '19 Bank Rd.', '19 Bank Rd.', '19 Bank Rd.', '19 Bank Rd.', '19 Bank Rd.', '7A Church Rd.', '18 Mill Rd.', '17 Hill Rd.', '17 Hill Rd.', 'Cabot House', 'Cabot House', '25 Bank Rd.', '10 Coronation St.', '6 Cabot Rd.', '6 Cabot Rd.', '8 Hill Rd.', '8 Hill Rd.', '4 Mill Rd.', '4 Mill Rd.', '12 Sulva Rd.', '4 Haig Rd.', '13 Botwood Rd.', '13 Botwood Rd.', '8 Botwood Rd.', '8 Botwood Rd.', '16 Botwood Rd.', '16 Botwood Rd.', '16 Botwood Rd.', '16 Botwood Rd.