In [1]:
nn_model_id = 'retrained_sherlock'

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import itertools

from datetime import datetime
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 
from sklearn.metrics import f1_score, classification_report

### Load training and validation sets

In [4]:
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('train.parquet')
y_train = pd.read_parquet('../data/raw/train_labels.parquet').values.flatten()

print(f'Load data (train) process took {datetime.now() - start} seconds.')

Started at 2022-02-02 08:57:12.713750
Load data (train) process took 0:00:03.294787 seconds.


In [5]:
print('Distinct types for columns in the Dataframe (should be all float32):')
print(set(X_train.dtypes))

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [6]:
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('validation.parquet')
y_validation = pd.read_parquet('../data/raw/val_labels.parquet').values.flatten()

y_validation = np.array([x.lower() for x in y_validation])

print(f'Load data (validation) process took {datetime.now() - start} seconds.')

Started at 2022-02-02 08:57:16.192381
Load data (validation) process took 0:00:01.407590 seconds.


In [7]:
X_train = pd.concat([X_train, X_validation], ignore_index=True)

In [8]:
y_train = np.array([x.lower() for x in itertools.chain(y_train, y_validation)])

### Train Random Forest

In [9]:
# n_estimators=300 gives a slightly better result (0.1%), but triples the fit time
n_estimators=100

rnd_clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
et_clf = ExtraTreesClassifier(n_estimators=n_estimators, n_jobs=-1)

start = datetime.now()
print(f'Started at {start}')

voting_clf = VotingClassifier(estimators=[('rd', rnd_clf), ('et', et_clf)], voting='soft')

voting_clf.fit(X_train, y_train)

print('Trained and saved new model.')
print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2022-02-02 08:57:19.883704
Trained and saved new model.
Finished at 2022-02-02 09:13:27.313795, took 0:16:07.430228 seconds


### Load test set

In [10]:
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('test.parquet')
y_test = pd.read_parquet('../data/raw/test_labels.parquet').values.flatten()

y_test = np.array([x.lower() for x in y_test])

print('Trained and saved new model.')
print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2022-02-02 09:13:27.393033
Trained and saved new model.
Finished at 2022-02-02 09:13:29.390839, took 0:00:01.997820 seconds


### Make prediction

In [11]:
predicted_prob = voting_clf.predict_proba(X_test)

In [12]:
classes = np.load(
        f"../sherlock/deploy/classes_{nn_model_id}.npy",
        allow_pickle=True
    )

In [13]:
assert (classes == sorted(classes)).all()

In [14]:
import pandas as pd
import numpy as np

from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock_proba, _transform_predictions_to_classes
from datetime import datetime
from sklearn.metrics import f1_score, classification_report

predict_sherlock_proba = predict_sherlock_proba(X_test, nn_id=nn_model_id)

W0202 09:13:55.804179 4680068608 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project-1/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0202 09:13:55.805443 4680068608 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project-1/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0202 09:13:55.809165 4680068608 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project-1/venv/lib/python3.7/s

Feature cols: ['n_[0]-agg-any', 'n_[0]-agg-all', 'n_[0]-agg-mean', 'n_[0]-agg-var', 'n_[0]-agg-min', 'n_[0]-agg-max', 'n_[0]-agg-median', 'n_[0]-agg-sum', 'n_[0]-agg-kurtosis', 'n_[0]-agg-skewness', 'n_[1]-agg-any', 'n_[1]-agg-all', 'n_[1]-agg-mean', 'n_[1]-agg-var', 'n_[1]-agg-min', 'n_[1]-agg-max', 'n_[1]-agg-median', 'n_[1]-agg-sum', 'n_[1]-agg-kurtosis', 'n_[1]-agg-skewness', 'n_[2]-agg-any', 'n_[2]-agg-all', 'n_[2]-agg-mean', 'n_[2]-agg-var', 'n_[2]-agg-min', 'n_[2]-agg-max', 'n_[2]-agg-median', 'n_[2]-agg-sum', 'n_[2]-agg-kurtosis', 'n_[2]-agg-skewness', 'n_[3]-agg-any', 'n_[3]-agg-all', 'n_[3]-agg-mean', 'n_[3]-agg-var', 'n_[3]-agg-min', 'n_[3]-agg-max', 'n_[3]-agg-median', 'n_[3]-agg-sum', 'n_[3]-agg-kurtosis', 'n_[3]-agg-skewness', 'n_[4]-agg-any', 'n_[4]-agg-all', 'n_[4]-agg-mean', 'n_[4]-agg-var', 'n_[4]-agg-min', 'n_[4]-agg-max', 'n_[4]-agg-median', 'n_[4]-agg-sum', 'n_[4]-agg-kurtosis', 'n_[4]-agg-skewness', 'n_[5]-agg-any', 'n_[5]-agg-all', 'n_[5]-agg-mean', 'n_[5]-agg-va

In [15]:
combined = []

for i in range(len(y_test)):
    nn_probs = predict_sherlock_proba[i]
    voting_probs = predicted_prob[i]
    
    x = nn_probs + voting_probs
    x = x / 2

    combined.append(x)


In [16]:
predicted_labels = _transform_predictions_to_classes(combined, nn_id=nn_model_id)

In [17]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

size=len(y_test)

# Should be fully deterministic too.
f1_score(y_test[:size], predicted_labels[:size], average="weighted")

prediction count 137353, type = <class 'numpy.ndarray'>


0.9043127091005166

In [18]:
# 0.8914784495960828 RandomForestClassifier
# 0.8888581433158012 ExtraTreesClassifier
# 0.8939142078215914 VotingClassifier

# 0.9051046508515834  NN + votingclassifier
# 0.9037255679859006 RFC(100) + NN
# 0.904082859543776 RFC(300) + NN

# 0.905148377678918 NN + VotingClassifier(RFC100 + ETC100)
# 0.9057069845572598 NN (retrained_sherlock8) + VotingClassifier(RFC300 + ETC300)

# 0.9057005326978261 NN (retrained_sherlock8) + VotingClassifier(RFC100 + ETC100)

In [19]:
report = classification_report(y_test, predicted_labels, output_dict=True)

class_scores = list(filter(lambda x: isinstance(x, tuple) and isinstance(x[1], dict) and 'f1-score' in x[1] and x[0] in classes, list(report.items())))

class_scores = sorted(class_scores, key=lambda item: item[1]['f1-score'], reverse=True)

### Top 5 Types

In [20]:
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

for key, value in class_scores[0:5]:
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
grades		0.994		0.991		0.997		1765
isbn		0.990		0.990		0.990		1430
jockey		0.988		0.987		0.990		2819
industry	0.984		0.979		0.989		2958
currency	0.978		0.987		0.968		405


### Bottom 5 Types

In [21]:
print(f"\t\tf1-score\tprecision\trecall\t\tsupport")

for key, value in class_scores[len(class_scores)-5:len(class_scores)]:
    if len(key) >= 8:
        tabs = '\t' * 1
    else:
        tabs = '\t' * 2

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

		f1-score	precision	recall		support
rank		0.746		0.708		0.787		2983
person		0.690		0.736		0.649		579
sales		0.622		0.750		0.531		322
director	0.612		0.719		0.533		225
ranking		0.584		0.822		0.453		439


### All Scores

In [22]:
print(classification_report(y_test, predicted_labels, digits=3))

                precision    recall  f1-score   support

       address      0.930     0.951     0.940      3003
     affiliate      0.976     0.814     0.888       204
   affiliation      0.982     0.952     0.967      1768
           age      0.884     0.958     0.919      3033
         album      0.893     0.886     0.890      3035
          area      0.897     0.840     0.868      1987
        artist      0.803     0.887     0.843      3043
    birth date      0.971     0.975     0.973       479
   birth place      0.984     0.911     0.947       418
         brand      0.821     0.718     0.766       574
      capacity      0.839     0.765     0.801       362
      category      0.925     0.898     0.911      3087
          city      0.855     0.908     0.880      2966
         class      0.908     0.927     0.917      2971
classification      0.943     0.869     0.904       587
          club      0.978     0.954     0.965      2977
          code      0.931     0.925     0.928  

## Review errors

In [23]:
from collections import Counter

size = len(y_test)
mismatches = list()

for idx, k1 in enumerate(y_test[:size]):
    k2 = predicted_labels[idx]

    if k1 != k2:
        mismatches.append(k1)
#        if k1 in ('brand'):
#        print(f'[{idx}] expected "{k1}" but predicted "{k2}"')
        
f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

Total mismatches: 13132 (F1 score: 0.9043127091005166)


[('name', 738),
 ('rank', 634),
 ('location', 497),
 ('position', 422),
 ('team', 391),
 ('description', 383),
 ('region', 382),
 ('notes', 377),
 ('album', 345),
 ('artist', 343),
 ('type', 324),
 ('area', 317),
 ('category', 316),
 ('company', 312),
 ('product', 304),
 ('city', 273),
 ('day', 258),
 ('team name', 247),
 ('ranking', 240),
 ('code', 221),
 ('class', 216),
 ('owner', 206),
 ('person', 203),
 ('sex', 196),
 ('order', 184),
 ('year', 180),
 ('status', 176),
 ('manufacturer', 173),
 ('country', 171),
 ('result', 168),
 ('service', 163),
 ('brand', 162),
 ('gender', 162),
 ('sales', 151),
 ('weight', 148),
 ('credit', 148),
 ('address', 148),
 ('duration', 144),
 ('club', 138),
 ('origin', 137),
 ('plays', 131),
 ('component', 131),
 ('state', 129),
 ('age', 127),
 ('format', 127),
 ('range', 119),
 ('county', 114),
 ('director', 105),
 ('nationality', 103),
 ('capacity', 85),
 ('affiliation', 84),
 ('language', 80),
 ('classification', 77),
 ('publisher', 72),
 ('family', 

In [24]:
test_samples = pd.read_parquet('../data/raw/test_values.parquet')

In [25]:
from ast import literal_eval

idx = 541
original = test_samples.iloc[idx]
converted = original.apply(literal_eval).to_list()

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

Predicted "age", actual label "position". Actual values:
[[2, 4]]
