In [44]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
nn_model_id = 'retrained_sherlock10'

In [46]:
import pandas as pd
import numpy as np

from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock
from datetime import datetime
from sklearn.metrics import f1_score, classification_report

### Load training and validation sets

In [47]:
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('train.parquet')
y_train = pd.read_parquet('../data/raw/train_labels.parquet').values.flatten()

y_train = np.array([x.lower() for x in y_train])

print(f'Load data (train) process took {datetime.now() - start} seconds.')

Started at 2021-01-16 17:36:54.754033
Load data (train) process took 0:00:08.295982 seconds.


In [48]:
print('Distinct types for columns in the Dataframe (should be all float32):')
print(set(X_train.dtypes))

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [49]:
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('validation.parquet')
y_validation = pd.read_parquet('../data/raw/val_labels.parquet').values.flatten()

y_validation = np.array([x.lower() for x in y_validation])

print(f'Load data (validation) process took {datetime.now() - start} seconds.')

Started at 2021-01-16 17:37:03.236977
Load data (validation) process took 0:00:01.806952 seconds.


### Re-train Sherlock

In [50]:
start = datetime.now()
print(f'Started at {start}')

train_sherlock(X_train, y_train, X_validation, y_validation, nn_id=nn_model_id);

print('Trained and saved new model.')
print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2021-01-16 17:37:05.103117
Successfully loaded and compiled model, now fitting model on data.
Train on 412059 samples, validate on 137353 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Retrained Sherlock.
Trained and saved new model.
Finished at 2021-01-16 18:31:07.670998, took 0:54:02.567899 seconds


### Load test set

In [51]:
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('test.parquet')
y_test = pd.read_parquet('../data/raw/test_labels.parquet').values.flatten()

y_test = np.array([x.lower() for x in y_test])

print('Trained and saved new model.')
print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2021-01-16 18:31:07.731679
Trained and saved new model.
Finished at 2021-01-16 18:31:09.559229, took 0:00:01.827566 seconds


### Make prediction

In [52]:
predicted_labels = predict_sherlock(X_test, nn_id=nn_model_id)

In [53]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

size=len(y_test)

# Should be fully deterministic too.
f1_score(y_test[:size], predicted_labels[:size], average="weighted")

prediction count 137353, type = <class 'numpy.ndarray'>


0.8936155925238338

In [54]:
# Best results
# 0.8956975031011328, adam optimiser, took 0:59:19.320310 seconds (retrained_sherlock8)

# 0.8871638630500646, SGD Nesterov optimiser,  took 1:37:09.679227 seconds (retrained_sherlock9)

# LAST
# 0.8941992181577104
# 0.8661276466591463
# 0.8765360861865846
# 0.8777825108666634
# 0.8709458033528152
# 0.8526528126158138
# 0.836800124917518
# 0.8590214693450329
# 0.8583626471084315

In [55]:
classes = np.load(f"../sherlock/deploy/classes_{nn_model_id}.npy", allow_pickle=True)

report = classification_report(y_test, predicted_labels, output_dict=True)

class_scores = list(filter(lambda x: isinstance(x, tuple) and isinstance(x[1], dict) and 'f1-score' in x[1] and x[0] in classes, list(report.items())))

class_scores = sorted(class_scores, key=lambda item: item[1]['f1-score'], reverse=True)

### Top 5 Types

In [56]:
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

for key, value in class_scores[0:5]:
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
grades		0.992		0.991		0.993		1765
jockey		0.985		0.980		0.991		2819
industry	0.983		0.981		0.984		2958
isbn		0.982		0.972		0.992		1430
birth date	0.981		0.987		0.975		479


### Bottom 5 Types

In [57]:
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

for key, value in class_scores[len(class_scores)-5:len(class_scores)]:
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
rank		0.700		0.641		0.771		2983
person		0.666		0.702		0.634		579
director	0.552		0.606		0.507		225
sales		0.525		0.729		0.410		322
ranking		0.474		0.749		0.346		439


### All Scores

In [58]:
print(classification_report(y_test, predicted_labels, digits=3))

                precision    recall  f1-score   support

       address      0.936     0.942     0.939      3003
     affiliate      0.965     0.804     0.877       204
   affiliation      0.984     0.950     0.967      1768
           age      0.883     0.944     0.912      3033
         album      0.866     0.898     0.882      3035
          area      0.876     0.825     0.850      1987
        artist      0.809     0.872     0.839      3043
    birth date      0.987     0.975     0.981       479
   birth place      0.980     0.916     0.947       418
         brand      0.770     0.704     0.735       574
      capacity      0.819     0.699     0.754       362
      category      0.894     0.894     0.894      3087
          city      0.867     0.881     0.874      2966
         class      0.896     0.916     0.906      2971
classification      0.940     0.859     0.898       587
          club      0.964     0.963     0.964      2977
          code      0.914     0.909     0.912  

## Review errors

In [59]:
from collections import Counter

size = len(y_test)
mismatches = list()

for idx, k1 in enumerate(y_test[:size]):
    k2 = predicted_labels[idx]

    if k1 != k2:
        mismatches.append(k1)
    if k1 in ('address'):
        print(f'[{idx}] expected "{k1}" but predicted "{k2}"')
        
f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

[19] expected "address" but predicted "address"
[61] expected "address" but predicted "address"
[87] expected "address" but predicted "address"
[130] expected "address" but predicted "address"
[193] expected "address" but predicted "address"
[208] expected "address" but predicted "address"
[251] expected "address" but predicted "address"
[269] expected "address" but predicted "address"
[282] expected "address" but predicted "address"
[306] expected "address" but predicted "address"
[349] expected "address" but predicted "address"
[360] expected "address" but predicted "address"
[439] expected "address" but predicted "address"
[441] expected "address" but predicted "address"
[510] expected "address" but predicted "address"
[726] expected "address" but predicted "address"
[775] expected "address" but predicted "address"
[779] expected "address" but predicted "address"
[807] expected "address" but predicted "address"
[810] expected "address" but predicted "address"
[890] expected "address

[51958] expected "address" but predicted "address"
[52044] expected "address" but predicted "address"
[52090] expected "address" but predicted "address"
[52093] expected "address" but predicted "address"
[52120] expected "address" but predicted "address"
[52140] expected "address" but predicted "address"
[52165] expected "address" but predicted "address"
[52196] expected "address" but predicted "address"
[52252] expected "address" but predicted "address"
[52292] expected "address" but predicted "address"
[52297] expected "address" but predicted "address"
[52319] expected "address" but predicted "address"
[52332] expected "address" but predicted "address"
[52354] expected "address" but predicted "address"
[52361] expected "address" but predicted "address"
[52421] expected "address" but predicted "address"
[52436] expected "address" but predicted "address"
[52515] expected "address" but predicted "address"
[52547] expected "address" but predicted "address"
[52600] expected "address" but 

[60666] expected "address" but predicted "address"
[60695] expected "address" but predicted "address"
[60809] expected "address" but predicted "address"
[60875] expected "address" but predicted "address"
[60881] expected "address" but predicted "address"
[60922] expected "address" but predicted "address"
[60995] expected "address" but predicted "address"
[61006] expected "address" but predicted "address"
[61087] expected "address" but predicted "address"
[61150] expected "address" but predicted "address"
[61152] expected "address" but predicted "address"
[61203] expected "address" but predicted "address"
[61299] expected "address" but predicted "address"
[61320] expected "address" but predicted "address"
[61324] expected "address" but predicted "address"
[61418] expected "address" but predicted "address"
[61440] expected "address" but predicted "address"
[61466] expected "address" but predicted "address"
[61686] expected "address" but predicted "address"
[61713] expected "address" but 

[89464] expected "address" but predicted "address"
[89504] expected "address" but predicted "address"
[89543] expected "address" but predicted "address"
[89608] expected "address" but predicted "address"
[89631] expected "address" but predicted "address"
[89745] expected "address" but predicted "address"
[89794] expected "address" but predicted "address"
[89800] expected "address" but predicted "location"
[89818] expected "address" but predicted "address"
[89819] expected "address" but predicted "address"
[89830] expected "address" but predicted "address"
[89831] expected "address" but predicted "address"
[89846] expected "address" but predicted "address"
[89861] expected "address" but predicted "address"
[89897] expected "address" but predicted "address"
[89966] expected "address" but predicted "address"
[90039] expected "address" but predicted "address"
[90054] expected "address" but predicted "name"
[90110] expected "address" but predicted "address"
[90116] expected "address" but pr

[107289] expected "address" but predicted "address"
[107305] expected "address" but predicted "address"
[107349] expected "address" but predicted "address"
[107444] expected "address" but predicted "address"
[107467] expected "address" but predicted "address"
[107469] expected "address" but predicted "address"
[107493] expected "address" but predicted "address"
[107567] expected "address" but predicted "address"
[107614] expected "address" but predicted "address"
[107615] expected "address" but predicted "address"
[107701] expected "address" but predicted "address"
[107750] expected "address" but predicted "address"
[107805] expected "address" but predicted "address"
[107841] expected "address" but predicted "address"
[107902] expected "address" but predicted "address"
[107928] expected "address" but predicted "address"
[107960] expected "address" but predicted "address"
[107970] expected "address" but predicted "address"
[107972] expected "address" but predicted "address"
[107976] exp

Total mismatches: 14589 (F1 score: 0.8936155925238338)


[('name', 850),
 ('rank', 684),
 ('position', 498),
 ('region', 489),
 ('location', 483),
 ('team', 424),
 ('notes', 391),
 ('artist', 390),
 ('description', 384),
 ('type', 362),
 ('product', 359),
 ('city', 352),
 ('area', 348),
 ('category', 328),
 ('sex', 313),
 ('album', 309),
 ('team name', 305),
 ('day', 293),
 ('ranking', 287),
 ('company', 284),
 ('code', 268),
 ('class', 249),
 ('owner', 212),
 ('person', 212),
 ('weight', 203),
 ('year', 196),
 ('status', 194),
 ('gender', 191),
 ('sales', 190),
 ('credit', 180),
 ('result', 180),
 ('address', 174),
 ('service', 174),
 ('age', 171),
 ('brand', 170),
 ('manufacturer', 165),
 ('country', 164),
 ('duration', 154),
 ('component', 153),
 ('origin', 150),
 ('plays', 150),
 ('order', 146),
 ('range', 144),
 ('format', 133),
 ('state', 124),
 ('nationality', 117),
 ('director', 111),
 ('club', 109),
 ('capacity', 109),
 ('county', 97),
 ('command', 93),
 ('affiliation', 88),
 ('family', 83),
 ('classification', 83),
 ('symbol', 81),

In [60]:
test_samples = pd.read_parquet('../data/raw/test_values.parquet')

In [61]:
from ast import literal_eval

#433

idx = 1116
original = test_samples.iloc[idx]
converted = original.apply(literal_eval).to_list()

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

Predicted "name", actual label "address". Actual values:
[['Karam Plaza Bldg Near Ajman Lulu Hypermarket Near Masreq Bank', 'Shop No: 3 Sheikh Zayed Street Sheikh Zayed Street, Ajman', 'Shop No: 13 Abdulla Humaid Al Mazroei Bld. Near Emirates Gas Round About New Industrial Area', 'Opposite Ajman Korea Furniture Near Madina Super market Al Jurf Industrial Area, Ajman', 'Ground Floor Inside Al Manama Hypermarket Opp. LuLu Center, Ajman', 'Inside NK Mall Al Hamidiya Street Near Al Hamidiya Bridge', 'Ground Floor Next To Dibba Municipality Below New Fujaira Medical Centre Near to Dubai Islamic Bank', 'TSRV2-GL, Next to Carrefour, opposite Bench (Unit No.TW4), Fujairah City Centre, Fujairah', 'Ahmed Bin Mohd. Road Juxtapose HSBC Inside CHOITHRAM Super Market', '108 Shaikh Saeed Al Qasimi Street Next to NBAD Bank Fujairah -Kalba, Fujairah', 'Next to Omar Bin Al Khatab Mosque Khalid Qazmi Road', 'Near the Round about Opp. Mosque and Government Hospital Main Road Masafi', 'Shop No. G43, Ground