In [1]:
# Identifier of the neural-network model variant: it selects the classes file
# loaded further down and is passed to predict_sherlock_proba as nn_id.
nn_model_id = "retrained_sherlock"

In [2]:
# For fully deterministic results between runs, set the PYTHONHASHSEED environment variable before launching Jupyter
# (the magic below only displays its current value).
# See the comment in sherlock.features.paragraph_vectors.infer_paragraph_embeddings_features for more info.
%env PYTHONHASHSEED

'13'

In [3]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before executing each cell.
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
import numpy as np
import itertools

from datetime import datetime
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 
from sklearn.metrics import f1_score, classification_report

### Load training and validation sets

In [5]:
# Load the training features and their labels, reporting how long the read took.
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('train.parquet')
# Flatten the label frame into a 1-D array.
y_train = pd.read_parquet('../data/raw/train_labels.parquet').to_numpy().flatten()

elapsed = datetime.now() - start
print(f'Load data (train) process took {elapsed} seconds.')

Started at 2022-02-07 15:46:02.658556
Load data (train) process took 0:00:04.837128 seconds.


In [6]:
# Sanity check: show the set of distinct column dtypes in the training frame.
print('Distinct types for columns in the Dataframe (should be all float32):')
distinct_dtypes = set(X_train.dtypes)
print(distinct_dtypes)

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [7]:
# Load the validation features and labels, reporting how long the read took.
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('validation.parquet')

# Flatten the label frame into a 1-D array, then lower-case every label.
y_validation = pd.read_parquet('../data/raw/val_labels.parquet').to_numpy().flatten()
y_validation = np.array([label.lower() for label in y_validation])

elapsed = datetime.now() - start
print(f'Load data (validation) process took {elapsed} seconds.')

Started at 2022-02-07 15:46:07.682626
Load data (validation) process took 0:00:01.803970 seconds.


In [8]:
# Fold the validation rows into the training frame, renumbering the index.
# NOTE: this rebinds X_train, so re-running this cell appends the validation rows again.
X_train = pd.concat((X_train, X_validation), ignore_index=True)

In [9]:
# Append the validation labels to the training labels, lower-casing every label.
# NOTE: this rebinds y_train, so re-running this cell appends the validation labels again.
y_train = np.array([
    label.lower()
    for label in itertools.chain(y_train, y_validation)
])

### Train Voting Classifier using RFC and ETC

In [10]:
# Soft-voting ensemble of a random forest ('rf') and an extra-trees classifier
# ('et'); both members use the same seed and n_jobs=-1.
# n_estimators=300 gives a slightly better result (0.1%), but triples the fit time
voting_clf = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=13, n_jobs=-1)),
        ('et', ExtraTreesClassifier(n_estimators=100, random_state=13, n_jobs=-1)),
    ],
    voting='soft',
)

start = datetime.now()
print(f'Started at {start}')

voting_clf.fit(X_train, y_train)

# Read the clock once so the reported finish time and the duration agree
# (previously two separate datetime.now() calls were used in one message).
end = datetime.now()
print(f'Finished at {end}, took {end - start} seconds')

Started at 2022-02-07 15:46:11.682256
Finished at 2022-02-07 16:01:59.050932, took 0:15:47.368743 seconds


In [11]:
# Make individual (trained) estimators available: the first fitted member is
# bound to rf_clf and the second to et_clf.
rf_clf, et_clf = voting_clf.estimators_[:2]

### Load test set

In [12]:
# Load the test features and labels, reporting how long the read took.
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('test.parquet')
y_test = pd.read_parquet('../data/raw/test_labels.parquet').values.flatten()

# Lower-case the labels so they compare equal to the lower-cased training labels.
y_test = np.array([x.lower() for x in y_test])

# This cell only loads data; the previous message claimed a model had been
# trained and saved, which does not happen here.
print('Loaded test set.')

# Read the clock once so the reported finish time and the duration agree.
end = datetime.now()
print(f'Finished at {end}, took {end - start} seconds')

Started at 2022-02-07 16:01:59.120891
Trained and saved new model.
Finished at 2022-02-07 16:02:00.975332, took 0:00:01.854455 seconds


### Make predictions

In [13]:
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock_proba, _transform_predictions_to_classes
from sklearn.metrics import f1_score, classification_report

In [14]:
# Class labels stored alongside the model identified by nn_model_id.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute code - only load this file from a trusted source.
classes_path = f"../sherlock/deploy/classes_{nn_model_id}.npy"
classes = np.load(classes_path, allow_pickle=True)

In [15]:
# The class list must already be in sorted order; presumably this is what keeps
# it aligned with the classifiers' probability columns - TODO confirm.
assert np.all(classes == sorted(classes))

In [16]:
from sklearn.preprocessing import LabelEncoder

def predicted_labels(y_pred_proba, classes):
    """Map each row of class scores to the label of its highest-scoring class.

    Parameters
    ----------
    y_pred_proba : array-like
        One row of per-class scores per sample; anything accepted by
        ``np.argmax(..., axis=1)`` (an ndarray or a list of equal-length rows).
    classes : sequence
        Labels, where position ``k`` is the label for score column ``k``.

    Returns
    -------
    numpy.ndarray
        The label at the arg-max column of every row.
    """
    y_pred_int = np.argmax(y_pred_proba, axis=1)

    # Direct index lookup: equivalent to the previous LabelEncoder round-trip
    # without assigning an encoder's fitted state by hand, and it also accepts
    # a plain list of classes.
    return np.asarray(classes)[y_pred_int]


def prediction_summary(y_test, predicted_labels):
    """Print the prediction count, the predictions' type and the weighted F1 score.

    Both sequences are sliced to ``len(y_test)`` entries before scoring, so any
    predictions beyond that length are ignored.
    """
    n_true = len(y_test)
    print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

    weighted_f1 = f1_score(y_test[:n_true], predicted_labels[:n_true], average="weighted")
    print(f'f1 score {weighted_f1}')

### Predict: RFC

In [17]:
# Class probabilities from the random-forest member of the ensemble alone.
predicted_rfc_proba = rf_clf.predict_proba(X_test)

In [18]:
# Turn the random-forest probabilities into labels and print their summary.
prediction_summary(y_test, predicted_labels(predicted_rfc_proba, classes))

prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.8909529156786774


### Predict: ETC

In [19]:
# Class probabilities from the extra-trees member of the ensemble alone.
predicted_etc_proba = et_clf.predict_proba(X_test)

In [20]:
# Turn the extra-trees probabilities into labels and print their summary.
prediction_summary(y_test, predicted_labels(predicted_etc_proba, classes))

prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.8884613184751746


### Predict: Voting Classifier (RFC + ETC)

In [21]:
# Class probabilities from the soft-voting ensemble (both members together).
predicted_voting_proba = voting_clf.predict_proba(X_test)

In [22]:
# Turn the ensemble probabilities into labels and print their summary.
prediction_summary(y_test, predicted_labels(predicted_voting_proba, classes))

prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.8933550546229518


### Predict: Sherlock NN

In [23]:
# Class probabilities from the Sherlock neural network selected by nn_model_id.
# NOTE(review): assumes the columns follow the same order as `classes` - confirm.
predicted_sherlock_proba = predict_sherlock_proba(X_test, nn_id=nn_model_id)

W0207 16:02:52.102345 4654583296 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project-1/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0207 16:02:52.104378 4654583296 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project-1/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0207 16:02:52.108886 4654583296 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project-1/venv/lib/python3.7/s

In [24]:
# Turn the neural-network probabilities into labels and print their summary.
prediction_summary(y_test, predicted_labels(predicted_sherlock_proba, classes))

prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.8940572197723697


### Predict: Combined

In [25]:
# For every test row, average the neural-network probabilities with the
# voting-classifier probabilities (equal weights).
combined = [
    (predicted_sherlock_proba[row] + predicted_voting_proba[row]) / 2
    for row in range(len(y_test))
]
    

In [26]:
# Labels picked from the averaged probabilities, followed by their summary.
labels = predicted_labels(combined, classes)

prediction_summary(y_test, labels)


prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.9047220789717997


In [27]:
# Full classification report as a dict keyed by report row name.
report = classification_report(y_test, labels, output_dict=True)

# Keep only the (name, metrics) entries whose metrics are a dict containing an
# f1-score and whose name is one of the known classes.
class_scores = [
    entry
    for entry in report.items()
    if isinstance(entry, tuple)
    and isinstance(entry[1], dict)
    and 'f1-score' in entry[1]
    and entry[0] in classes
]

# Highest f1-score first.
class_scores.sort(key=lambda entry: entry[1]['f1-score'], reverse=True)

### Top 5 Types

In [28]:
def score_table(class_scores):
    """Print a tab-separated table of f1-score, precision, recall and support.

    ``class_scores`` is an iterable of ``(label, metrics)`` pairs, where
    ``metrics`` is a mapping with the keys 'f1-score', 'precision', 'recall'
    and 'support'. The three scores are printed with three decimals.
    """
    print("\t\tf1-score\tprecision\trecall\t\tsupport")

    for label, metrics in class_scores:
        # Labels of 8+ characters get one tab after them, shorter ones get two.
        padding = '\t' if len(label) >= 8 else '\t\t'
        scores = f"{metrics['f1-score']:.3f}\t\t{metrics['precision']:.3f}\t\t{metrics['recall']:.3f}"

        print(f"{label}{padding}{scores}\t\t{metrics['support']}")

In [29]:
# class_scores is sorted best-first, so the leading five rows are the top types.
score_table(class_scores[:5])

		f1-score	precision	recall		support
grades		0.994		0.994		0.994		1765
isbn		0.991		0.993		0.989		1430
jockey		0.986		0.980		0.991		2819
industry	0.983		0.979		0.988		2958
birth date	0.978		0.981		0.975		479


### Bottom 5 Types

In [30]:
# class_scores is sorted best-first, so the trailing five rows are the weakest
# types. A negative slice also behaves correctly when fewer than five classes
# exist (the previous len()-5 start went negative and dropped rows).
score_table(class_scores[-5:])

		f1-score	precision	recall		support
rank		0.751		0.710		0.796		2983
person		0.690		0.702		0.679		579
sales		0.633		0.747		0.550		322
director	0.598		0.648		0.556		225
ranking		0.594		0.855		0.456		439


### All Scores (by class)

In [31]:
# Text classification report for the combined predictions, with three decimals.
print(classification_report(y_test, labels, digits=3))

                precision    recall  f1-score   support

       address      0.926     0.947     0.937      3003
     affiliate      0.976     0.814     0.888       204
   affiliation      0.978     0.958     0.968      1768
           age      0.882     0.963     0.921      3033
         album      0.883     0.901     0.892      3035
          area      0.888     0.840     0.863      1987
        artist      0.807     0.886     0.845      3043
    birth date      0.981     0.975     0.978       479
   birth place      0.974     0.904     0.938       418
         brand      0.795     0.723     0.757       574
      capacity      0.879     0.746     0.807       362
      category      0.913     0.901     0.907      3087
          city      0.857     0.912     0.883      2966
         class      0.906     0.926     0.916      2971
classification      0.955     0.867     0.909       587
          club      0.972     0.961     0.967      2977
          code      0.921     0.925     0.923  

## Review errors

In [32]:
from collections import Counter

size = len(y_test)

# True label of every test sample whose predicted label differs from it.
mismatches = [
    expected
    for position, expected in enumerate(y_test[:size])
    if expected != labels[position]
]

f1 = f1_score(y_test[:size], labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Misclassification count per true label, most frequent first (shown as the cell output).
data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

Total mismatches: 13058 (F1 score: 0.9047220789717997)


[('name', 830),
 ('rank', 608),
 ('location', 460),
 ('position', 441),
 ('region', 412),
 ('team', 399),
 ('notes', 388),
 ('artist', 348),
 ('description', 347),
 ('area', 318),
 ('type', 307),
 ('category', 307),
 ('album', 299),
 ('company', 291),
 ('product', 283),
 ('team name', 262),
 ('city', 261),
 ('day', 258),
 ('ranking', 239),
 ('code', 222),
 ('class', 220),
 ('owner', 205),
 ('order', 200),
 ('manufacturer', 191),
 ('status', 190),
 ('person', 186),
 ('sex', 181),
 ('year', 177),
 ('gender', 171),
 ('credit', 167),
 ('service', 167),
 ('brand', 159),
 ('address', 158),
 ('result', 151),
 ('country', 147),
 ('origin', 146),
 ('weight', 145),
 ('sales', 145),
 ('duration', 140),
 ('component', 129),
 ('format', 126),
 ('plays', 123),
 ('state', 121),
 ('club', 116),
 ('range', 115),
 ('nationality', 111),
 ('age', 111),
 ('county', 110),
 ('director', 100),
 ('capacity', 92),
 ('language', 81),
 ('classification', 78),
 ('affiliation', 75),
 ('command', 73),
 ('family', 72

In [33]:
# Raw test values, used below to look at the data behind an individual prediction.
# NOTE(review): assumes rows are in the same order as y_test / labels - confirm.
test_samples = pd.read_parquet('../data/raw/test_values.parquet')

In [34]:
from ast import literal_eval

# Inspect one test row: evaluate each of its cells with literal_eval and show
# the result next to the predicted and actual labels.
idx = 541
original = test_samples.iloc[idx]
converted = original.map(literal_eval).tolist()

message = f'Predicted "{labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}'
print(message)

Predicted "age", actual label "position". Actual values:
[[2, 4]]


In [35]:
# Timestamp of when this final cell ran.
print(f'Completed at {datetime.now()}')

Completed at 2022-02-07 16:03:14.044571
