# Train and test Sherlock when ensembled with a RF classifier
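To boost Sherlock's performance, its predictions can be combined with those of a random forest (RF) classifier. In this notebook the RF is part of a soft-voting ensemble with an extra-trees classifier, and the ensemble's class probabilities are averaged with Sherlock's.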

The scripts below show the procedure for doing so.

In [16]:
model_id = 'sherlock'

In [2]:
# If you need fully deterministic results between runs, set the following environment value prior to launching jupyter.

# See comment in sherlock.features.paragraph_vectors.infer_paragraph_embeddings_features for more info.
%env PYTHONHASHSEED

'13'

In [3]:
%load_ext autoreload
%autoreload 2

In [29]:
import itertools

from ast import literal_eval
from collections import Counter
from datetime import datetime

import pandas as pd
import numpy as np

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score

from sherlock.deploy.model import SherlockModel

### Load training and validation sets

In [38]:
# Remember when loading began so the elapsed time can be reported at the end of the cell.
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('../data/data/processed/train.parquet')

# Flatten the label frame to 1-D and lower-case every label in a single pass.
y_train = np.array([
    label.lower()
    for label in pd.read_parquet('../data/data/raw/train_labels.parquet').values.flatten()
])

print(f'Load data (train) process took {datetime.now() - start} seconds.')

Started at 2022-02-21 14:44:58.387328
Load data (train) process took 0:00:07.072707 seconds.


In [8]:
print('Distinct types for columns in the Dataframe (should be all float32):')
print(set(X_train.dtypes))

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [9]:
# Remember when loading began so the elapsed time can be reported at the end of the cell.
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('../data/data/processed/validation.parquet')

# Flatten the label frame to 1-D and lower-case every label in a single pass.
y_validation = np.array([
    label.lower()
    for label in pd.read_parquet('../data/data/raw/val_labels.parquet').values.flatten()
])

print(f'Load data (validation) process took {datetime.now() - start} seconds.')

Started at 2022-02-21 14:16:45.455219
Load data (validation) process took 0:00:02.024156 seconds.


In [10]:
# Fold the validation rows into the training frame so the classifiers are fitted on both splits.
# NOTE: this cell overwrites X_train with a longer frame, so it is not idempotent — running it
# twice appends the validation rows twice. Reload the training data before re-running it.
X_train = pd.concat([X_train, X_validation], ignore_index=True)

In [39]:
# Append the validation labels to the training labels. Both arrays were already lower-cased
# when they were loaded, so a plain concatenation is enough.
y_train = np.concatenate([y_train, y_validation])

# This cell and the X_train concatenation both overwrite their input, so running either one
# a different number of times than the other silently misaligns features and labels.
assert len(X_train) == len(y_train), (
    f'X_train has {len(X_train)} rows but y_train has {len(y_train)} labels; '
    'reload the training data and run the concatenation cells exactly once each.'
)

### Train Voting Classifier using RFC and ETC

In [12]:
# n_estimators=300 gives a slightly better result (0.1%), but triples the fit time
# Soft voting averages the class probabilities predicted by the two tree ensembles.
voting_clf = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=13, n_jobs=-1)),
        ('et', ExtraTreesClassifier(n_estimators=100, random_state=13, n_jobs=-1))
    ],
    voting='soft'
)

start = datetime.now()
print(f'Started at {start}')

voting_clf.fit(X_train, y_train)

# Take a single timestamp so that "Finished at" and the elapsed time agree with each other;
# calling datetime.now() twice makes the reported duration drift from finished - started.
finished = datetime.now()
print(f'Finished at {finished}, took {finished - start} seconds')

Started at 2022-02-21 14:17:08.147857
Finished at 2022-02-21 14:38:09.947917, took 0:21:01.802720 seconds


In [13]:
# Expose the two fitted base estimators individually; `estimators_` keeps them in the order
# they were passed to the VotingClassifier: random forest first, extra-trees second.
rf_clf, et_clf = voting_clf.estimators_

### Load test set

In [14]:
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('../data/data/processed/test.parquet')
y_test = pd.read_parquet('../data/data/raw/test_labels.parquet').values.flatten()

# Lower-case the labels so they compare equal to the lower-cased class names used for decoding.
y_test = np.array([x.lower() for x in y_test])

# This cell only loads data; nothing is trained or saved here.
print('Loaded test set.')

# Take a single timestamp so that "Finished at" and the elapsed time agree with each other.
finished = datetime.now()
print(f'Finished at {finished}, took {finished - start} seconds')

Started at 2022-02-21 14:38:10.601540
Trained and saved new model.
Finished at 2022-02-21 14:38:12.493349, took 0:00:01.891821 seconds


In [52]:
np.unique(y_validation)

array(['address', 'affiliate', 'affiliation', 'age', 'album', 'area',
       'artist', 'birth date', 'birth place', 'brand', 'capacity',
       'category', 'city', 'class', 'classification', 'club', 'code',
       'collection', 'command', 'company', 'component', 'continent',
       'country', 'county', 'creator', 'credit', 'currency', 'day',
       'depth', 'description', 'director', 'duration', 'education',
       'elevation', 'family', 'file size', 'format', 'gender', 'genre',
       'grades', 'industry', 'isbn', 'jockey', 'language', 'location',
       'manufacturer', 'name', 'nationality', 'notes', 'operator',
       'order', 'organisation', 'origin', 'owner', 'person', 'plays',
       'position', 'product', 'publisher', 'range', 'rank', 'ranking',
       'region', 'religion', 'requirement', 'result', 'sales', 'service',
       'sex', 'species', 'state', 'status', 'symbol', 'team', 'team name',
       'type', 'weight', 'year'], dtype='<U14')

### Make predictions

In [72]:
# Read the class names stored for this model id and lower-case them in one pass.
# allow_pickle=True lets np.load unpickle object arrays, which can execute arbitrary code:
# only point this at model files you trust.
classes = np.array([
    name.lower()
    for name in np.load(f"../model_files/classes_{model_id}.npy", allow_pickle=True)
])

In [73]:
# predicted_labels() decodes an argmax column index as classes[index], so the order of
# `classes` must match the column order of every probability matrix decoded below.
# scikit-learn estimators order predict_proba columns by their sorted `classes_`;
# presumably the Sherlock model uses the same sorted order — TODO confirm.
assert (classes == sorted(classes)).all()

In [65]:
def predicted_labels(y_pred_proba, classes):
    """Map each row of class probabilities to the label of its most probable class.

    Parameters
    ----------
    y_pred_proba : array-like of shape (n_samples, n_classes)
        Per-sample class probabilities; an ndarray or a list of equally long 1-D rows.
    classes : array-like of shape (n_classes,)
        Label belonging to each probability column, in column order.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        ``classes[argmax(row)]`` for every row of ``y_pred_proba``.

    Raises
    ------
    ValueError
        If ``y_pred_proba`` is not 2-D, or its column count differs from ``len(classes)``.
        Decoding such input would silently assign wrong labels.
    """
    proba = np.asarray(y_pred_proba)
    class_labels = np.asarray(classes)

    if proba.ndim != 2 or proba.shape[1] != class_labels.shape[0]:
        raise ValueError(
            f'expected probabilities of shape (n_samples, {class_labels.shape[0]}), '
            f'got {proba.shape}'
        )

    # Index the label array directly; no encoder object is needed for this lookup.
    return class_labels[np.argmax(proba, axis=1)]


def prediction_summary(y_test, predicted_labels):
    """Print how many predictions were made, their container type and the weighted F1 score.

    Parameters
    ----------
    y_test : sequence
        True labels.
    predicted_labels : sequence
        Predicted labels, positionally aligned with ``y_test``.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

    # Both sequences are cut to len(y_test) before scoring, so surplus predictions are ignored.
    n_expected = len(y_test)
    weighted_f1 = f1_score(y_test[:n_expected], predicted_labels[:n_expected], average="weighted")

    print(f'f1 score {weighted_f1}')

### Predict: RFC

In [58]:
predicted_rfc_proba = rf_clf.predict_proba(X_test)

In [74]:
prediction_summary(y_test, predicted_labels(predicted_rfc_proba, classes))

prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.8912755744265719


### Predict: ETC

In [75]:
predicted_etc_proba = et_clf.predict_proba(X_test)

In [76]:
prediction_summary(y_test, predicted_labels(predicted_etc_proba, classes))

prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.8883526561931331


### Predict: Voting Classifier (RFC + ETC)

In [77]:
predicted_voting_proba = voting_clf.predict_proba(X_test)

In [78]:
prediction_summary(y_test, predicted_labels(predicted_voting_proba, classes))

prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.8940645473980389


### Predict: Sherlock NN

In [79]:
# Load the Sherlock network for the same model id whose class-name file was loaded above,
# so the weights and the labels used for decoding cannot drift apart if model_id changes.
model = SherlockModel()
model.initialize_model_from_json(with_weights=True, model_id=model_id)
predicted_sherlock_proba = model.predict_proba(X_test)

In [80]:
prediction_summary(y_test, predicted_labels(predicted_sherlock_proba, classes))

prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.8951410029373902


### Predict: Combined

In [81]:
# Average the Sherlock NN and voting-classifier probabilities element-wise, giving both
# models equal weight. A single vectorised expression replaces the per-row Python loop;
# the result is a 2-D array with one row per test sample and one column per class.
combined = (np.asarray(predicted_sherlock_proba) + np.asarray(predicted_voting_proba)) / 2

In [82]:
labels = predicted_labels(combined, classes)

prediction_summary(y_test, labels)

prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.905491661885665


In [83]:
report = classification_report(y_test, labels, output_dict=True)

# Keep only the per-class entries of the report (scalar entries and any metrics dicts whose
# key is not a known class are dropped), ordered from the highest f1-score to the lowest.
class_scores = sorted(
    (
        (name, metrics)
        for name, metrics in report.items()
        if isinstance(metrics, dict) and 'f1-score' in metrics and name in classes
    ),
    key=lambda entry: entry[1]['f1-score'],
    reverse=True,
)

### Top 5 Types

In [84]:
def score_table(class_scores):
    """Print a tab-aligned table of f1-score, precision, recall and support per class.

    Parameters
    ----------
    class_scores : iterable of (str, dict)
        Pairs of class name and a metrics mapping with the keys ``'f1-score'``,
        ``'precision'``, ``'recall'`` and ``'support'``.

    Returns
    -------
    None
        The table is written to stdout only.
    """
    print("\t\tf1-score\tprecision\trecall\t\tsupport")

    for label, metrics in class_scores:
        # Labels shorter than one tab stop (8 characters) need a second tab to reach
        # the first metric column.
        padding = '\t' if len(label) >= 8 else '\t\t'

        cells = [
            f"{label}{padding}{metrics['f1-score']:.3f}",
            f"{metrics['precision']:.3f}",
            f"{metrics['recall']:.3f}",
            f"{metrics['support']}",
        ]
        print('\t\t'.join(cells))

In [85]:
score_table(class_scores[0:5])

		f1-score	precision	recall		support
grades		0.995		0.994		0.995		1765
isbn		0.990		0.992		0.989		1430
industry	0.986		0.985		0.988		2958
jockey		0.985		0.984		0.987		2819
currency	0.979		0.985		0.973		405


### Bottom 5 Types

In [86]:
# A negative slice yields the last five entries, or all of them when fewer than five exist.
# Computing the start as len(class_scores) - 5 goes negative for short lists, which wraps
# around and drops entries.
score_table(class_scores[-5:])

		f1-score	precision	recall		support
rank		0.738		0.678		0.810		2983
person		0.695		0.767		0.636		579
sales		0.615		0.667		0.571		322
director	0.604		0.661		0.556		225
ranking		0.569		0.823		0.435		439


### All Scores (by class)

In [87]:
print(classification_report(y_test, labels, digits=3))

                precision    recall  f1-score   support

       address      0.929     0.951     0.940      3003
     affiliate      0.949     0.819     0.879       204
   affiliation      0.975     0.958     0.966      1768
           age      0.891     0.955     0.922      3033
         album      0.894     0.895     0.894      3035
          area      0.892     0.836     0.863      1987
        artist      0.810     0.884     0.846      3043
    birth date      0.983     0.969     0.976       479
   birth place      0.939     0.919     0.929       418
         brand      0.849     0.695     0.764       574
      capacity      0.851     0.771     0.809       362
      category      0.927     0.898     0.912      3087
          city      0.870     0.910     0.890      2966
         class      0.921     0.923     0.922      2971
classification      0.946     0.874     0.909       587
          club      0.975     0.957     0.966      2977
          code      0.921     0.925     0.923  

## Review errors

In [88]:
size = len(y_test)

# Collect the true label of every misclassified test sample (one entry per error).
mismatches = [
    actual
    for position, actual in enumerate(y_test[:size])
    if actual != labels[position]
]

f1 = f1_score(y_test[:size], labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Errors per true label, most frequent first; left as the last expression so the cell shows it.
data = Counter(mismatches)
data.most_common()

Total mismatches: 12994 (F1 score: 0.905491661885665)


[('name', 697),
 ('rank', 566),
 ('location', 479),
 ('position', 460),
 ('region', 427),
 ('team', 388),
 ('artist', 353),
 ('notes', 352),
 ('description', 352),
 ('type', 342),
 ('area', 326),
 ('album', 320),
 ('category', 316),
 ('company', 310),
 ('product', 301),
 ('city', 266),
 ('day', 261),
 ('team name', 259),
 ('ranking', 248),
 ('class', 229),
 ('code', 222),
 ('person', 211),
 ('owner', 210),
 ('sex', 185),
 ('order', 180),
 ('status', 178),
 ('brand', 175),
 ('gender', 169),
 ('manufacturer', 163),
 ('year', 163),
 ('credit', 158),
 ('service', 158),
 ('result', 154),
 ('weight', 149),
 ('address', 146),
 ('sales', 138),
 ('duration', 136),
 ('age', 136),
 ('origin', 136),
 ('component', 133),
 ('country', 130),
 ('club', 129),
 ('plays', 128),
 ('nationality', 124),
 ('format', 119),
 ('range', 117),
 ('state', 115),
 ('county', 108),
 ('director', 100),
 ('command', 86),
 ('publisher', 85),
 ('capacity', 83),
 ('language', 78),
 ('affiliation', 75),
 ('classification',

In [89]:
test_samples = pd.read_parquet('../data/data/raw/test_values.parquet')

In [90]:
# Row position in the test set of the example to inspect; change it to look at another sample.
idx = 541
original = test_samples.iloc[idx]
# Each cell of the row is parsed with literal_eval — presumably the raw values are stored as
# Python-literal strings; TODO confirm against the parquet schema.
converted = original.apply(literal_eval).to_list()

print(f'Predicted "{labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

Predicted "age", actual label "position". Actual values:
[[2, 4]]


In [91]:
print(f'Completed at {datetime.now()}')

Completed at 2022-02-21 14:53:39.160195
