In [1]:
nn_model_id = 'retrained_sherlock8'

In [2]:
import pandas as pd
import numpy as np
import itertools

from datetime import datetime
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report

### Load training and validation sets

In [3]:
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('train.parquet')
y_train = pd.read_parquet('../data/raw/train_labels.parquet').values.flatten()

print(f'Load data (train) process took {datetime.now() - start} seconds.')

Started at 2021-12-20 19:11:25.021252
Load data (train) process took 0:00:09.589445 seconds.


In [4]:
print('Distinct types for columns in the Dataframe (should be all float32):')
print(set(X_train.dtypes))

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [5]:
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('validation.parquet')
y_validation = pd.read_parquet('../data/raw/val_labels.parquet').values.flatten()

y_validation = np.array([x.lower() for x in y_validation])

print(f'Load data (validation) process took {datetime.now() - start} seconds.')

Started at 2021-12-20 19:11:34.635005
Load data (validation) process took 0:00:02.410613 seconds.


In [6]:
X_train = pd.concat([X_train, X_validation], ignore_index=True)

In [7]:
y_train = np.array([x.lower() for x in itertools.chain(y_train, y_validation)])

### Train Random Forest

In [8]:
# n_estimators=300 gives a slightly better result (0.1%), but triples the fit time
n_estimators=100

rnd_clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
et_clf = ExtraTreesClassifier(n_estimators=n_estimators, n_jobs=-1)

start = datetime.now()
print(f'Started at {start}')

voting_clf = VotingClassifier(estimators=[('rd', rnd_clf), ('et', et_clf)], voting='soft')

voting_clf.fit(X_train, y_train)

print('Trained and saved new model.')
print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2021-12-20 19:11:41.404964
Trained and saved new model.
Finished at 2021-12-20 19:28:13.317631, took 0:16:31.914820 seconds


### Load test set

In [9]:
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('test.parquet')
y_test = pd.read_parquet('../data/raw/test_labels.parquet').values.flatten()

y_test = np.array([x.lower() for x in y_test])

print('Trained and saved new model.')
print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2021-12-20 19:28:13.510667
Trained and saved new model.
Finished at 2021-12-20 19:28:15.894303, took 0:00:02.383650 seconds


### Make prediction

In [10]:
predicted_prob = voting_clf.predict_proba(X_test)

In [11]:
classes = np.load(
        f"../sherlock/deploy/classes_{nn_model_id}.npy",
        allow_pickle=True
    )

In [12]:
assert (classes == sorted(classes)).all()

In [13]:
import pandas as pd
import numpy as np

from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock_proba, _transform_predictions_to_classes
from datetime import datetime
from sklearn.metrics import f1_score, classification_report

predict_sherlock_proba = predict_sherlock_proba(X_test, nn_id=nn_model_id)

W1220 19:28:47.962548 4420566528 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1220 19:28:47.963889 4420566528 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1220 19:28:47.967644 4420566528 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project/venv/lib/python3.7/site-pa

In [14]:
combined = []

for i in range(len(y_test)):
    nn_probs = predict_sherlock_proba[i]
    voting_probs = predicted_prob[i]
    
    x = nn_probs + voting_probs
    x = x / 2

    combined.append(x)


In [15]:
predicted_labels = _transform_predictions_to_classes(combined, nn_id=nn_model_id)

In [16]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

size=len(y_test)

# Should be fully deterministic too.
f1_score(y_test[:size], predicted_labels[:size], average="weighted")

prediction count 137353, type = <class 'numpy.ndarray'>


0.9057198185099099

In [17]:
# 0.8914784495960828 RandomForestClassifier
# 0.8888581433158012 ExtraTreesClassifier
# 0.8939142078215914 VotingClassifier

# 0.9051046508515834  NN + votingclassifier
# 0.9037255679859006 RFC(100) + NN
# 0.904082859543776 RFC(300) + NN

# 0.905148377678918 NN + VotingClassifier(RFC100 + ETC100)
# 0.9057069845572598 NN (retrained_sherlock8) + VotingClassifier(RFC300 + ETC300)

# 0.9057005326978261 NN (retrained_sherlock8) + VotingClassifier(RFC100 + ETC100)

In [18]:
report = classification_report(y_test, predicted_labels, output_dict=True)

class_scores = list(filter(lambda x: isinstance(x, tuple) and isinstance(x[1], dict) and 'f1-score' in x[1] and x[0] in classes, list(report.items())))

class_scores = sorted(class_scores, key=lambda item: item[1]['f1-score'], reverse=True)

### Top 5 Types

In [19]:
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

for key, value in class_scores[0:5]:
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
grades		0.993		0.990		0.995		1765
isbn		0.992		0.995		0.989		1430
jockey		0.988		0.987		0.989		2819
industry	0.984		0.979		0.990		2958
birth date	0.978		0.979		0.977		479


### Bottom 5 Types

In [20]:
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

for key, value in class_scores[len(class_scores)-5:len(class_scores)]:
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
brand		0.748		0.783		0.716		574
person		0.687		0.757		0.629		579
sales		0.644		0.722		0.581		322
director	0.615		0.709		0.542		225
ranking		0.596		0.823		0.467		439


### All Scores

In [21]:
print(classification_report(y_test, predicted_labels, digits=3))

                precision    recall  f1-score   support

       address      0.939     0.953     0.946      3003
     affiliate      0.988     0.809     0.889       204
   affiliation      0.983     0.954     0.968      1768
           age      0.901     0.952     0.926      3033
         album      0.885     0.906     0.895      3035
          area      0.894     0.840     0.866      1987
        artist      0.815     0.883     0.848      3043
    birth date      0.979     0.977     0.978       479
   birth place      0.982     0.902     0.940       418
         brand      0.783     0.716     0.748       574
      capacity      0.821     0.773     0.797       362
      category      0.915     0.903     0.909      3087
          city      0.837     0.919     0.876      2966
         class      0.920     0.925     0.923      2971
classification      0.946     0.869     0.906       587
          club      0.976     0.960     0.968      2977
          code      0.934     0.922     0.928  

## Review errors

In [22]:
from collections import Counter

size = len(y_test)
mismatches = list()

for idx, k1 in enumerate(y_test[:size]):
    k2 = predicted_labels[idx]

    if k1 != k2:
        mismatches.append(k1)
#        if k1 in ('brand'):
#        print(f'[{idx}] expected "{k1}" but predicted "{k2}"')
        
f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

Total mismatches: 12946 (F1 score: 0.9057198185099099)


[('name', 697),
 ('rank', 607),
 ('location', 461),
 ('position', 445),
 ('team', 418),
 ('region', 392),
 ('description', 360),
 ('artist', 357),
 ('notes', 357),
 ('type', 320),
 ('product', 318),
 ('area', 318),
 ('category', 299),
 ('company', 288),
 ('album', 286),
 ('day', 272),
 ('team name', 258),
 ('city', 240),
 ('ranking', 234),
 ('code', 230),
 ('class', 223),
 ('person', 215),
 ('owner', 211),
 ('sex', 190),
 ('order', 183),
 ('status', 180),
 ('country', 179),
 ('manufacturer', 171),
 ('gender', 169),
 ('year', 167),
 ('result', 164),
 ('brand', 163),
 ('service', 160),
 ('weight', 152),
 ('origin', 147),
 ('age', 145),
 ('credit', 145),
 ('address', 142),
 ('duration', 137),
 ('sales', 135),
 ('state', 131),
 ('component', 127),
 ('format', 122),
 ('club', 119),
 ('plays', 118),
 ('range', 117),
 ('county', 114),
 ('nationality', 104),
 ('director', 103),
 ('affiliation', 82),
 ('capacity', 82),
 ('publisher', 80),
 ('language', 78),
 ('classification', 77),
 ('command',

In [23]:
test_samples = pd.read_parquet('../data/raw/test_values.parquet')

In [24]:
from ast import literal_eval

idx = 541
original = test_samples.iloc[idx]
converted = original.apply(literal_eval).to_list()

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

Predicted "age", actual label "position". Actual values:
[[2, 4]]
