In [1]:
%load_ext autoreload
%autoreload 2

# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data, 
retrain Sherlock and generate predictions.

In [2]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf
from ast import literal_eval

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

from pympler import muppy, summary
from datetime import datetime

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download +/- 800 MB of data into the `features/` directory.

In [3]:
# Fetch the raw values, preprocessed files and labels into data/ (roughly 3.6GB).
helpers.download_data()
# Fetch the supporting files needed for feature extraction into features/ (roughly 800 MB).
prepare_feature_extraction()

Downloading the raw and preprocessed data into ../data/data.zip.
Data was downloaded.
Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [4]:
# Set to True to print pympler memory summaries in the `if report_memory:` cells (can be slow).
report_memory = False

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [5]:
import time

# Timestamp captured once so that all three feature files of this run share a suffix.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Destination CSVs for the extracted features, one per data split.
X_test_filename, X_train_filename, X_validation_filename = (
    f'../data/work/test_{timestr}.csv',
    f'../data/work/train_{timestr}.csv',
    f'../data/work/validation_{timestr}.csv',
)

In [6]:
import csv
from datetime import datetime
from functional import pseq
from sherlock.functional import as_py_str, to_literal, randomise_sample, as_str_series, dropna, extract_features, normalise_string_whitespace
from pyarrow.parquet import ParquetFile


def load_parquet_values(path):
    """Load the ``values`` column of a parquet file.

    All row groups of the file are read, so a file written with more than one
    row group is returned in full instead of being cut off after the first group.

    :param path: location of the parquet file to read.
    :return: the ``values`` column of the table that was read.
    """
    pf = ParquetFile(source = path)
    # Only the column that is actually used is materialised.
    table = pf.read(columns=['values'])

    return table['values']

def extract_features_to_csv(output_path, parquet_values, verify_keys=False, processes=6, partition_size=10):
    """Extract features for every sample and stream them, row by row, into a CSV file.

    The header row is taken from the feature names of the first sample; every
    sample (including the first) then contributes one row of feature values.

    :param output_path: path of the CSV file to (over)write.
    :param parquet_values: iterable of raw samples, as returned by ``load_parquet_values``.
    :param verify_keys: when True, compare each sample's feature names against the
        first sample's and print any differences (diagnostic only; rows are still written).
    :param processes: ``processes`` argument handed to ``pseq``.
    :param partition_size: ``partition_size`` argument handed to ``pseq``.
    """
    # Local import keeps this diagnostic helper out of the notebook-level namespace.
    from itertools import zip_longest

    first_keys = None
    first_keys_str = None
    i = 0

    start = datetime.now()

    print(f'Starting {output_path} at {start}')

    # newline='' hands line-ending control to the csv module, as its documentation
    # requires; without it, extra blank rows appear on platforms that translate '\n'.
    with open(output_path, "w", newline='') as outfile:
        csvwriter = csv.writer(outfile)

        # Comparable performance with using pool.imap directly, but the code is *much* cleaner
        feature_rows = pseq(parquet_values, processes=processes, partition_size=partition_size)\
            .map(as_py_str)\
            .map(to_literal)\
            .map(randomise_sample)\
            .map(normalise_string_whitespace)\
            .map(as_str_series)\
            .map(dropna)\
            .map(extract_features)

        for features in feature_rows:
            i = i+1

            keys = list(features.keys())

            if first_keys is None:
                # The first sample defines the CSV header.
                first_keys = keys
                first_keys_str = ','.join(keys)

                print(f'Exporting {len(first_keys)} column features')

                csvwriter.writerow(first_keys)
            elif verify_keys:
                keys_str = ','.join(keys)
                if first_keys_str != keys_str:
                    print(f'keys are NOT equal. k1 len={len(first_keys)}, k2 len={len(keys)}')

                    # zip_longest pads the shorter side with None, so key sets of
                    # different lengths are reported instead of raising IndexError.
                    for k1, k2 in zip_longest(first_keys, keys):
                        if k1 != k2:
                            print(f'{k1} != {k2}')

            csvwriter.writerow(list(features.values()))

    print(f'Finished. Processed {i} rows in {datetime.now() - start}')

## EXTRACT FEATURES TO CSV (NEW METHOD)

### PREPARATION

In [7]:
# ensure embedding initialisation is outside of timing for extract_features
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model
from sherlock.features.preprocessing import prepare_feature_extraction

# Make sure the supporting files are present before they are loaded.
prepare_feature_extraction()
# Load the word embeddings and the paragraph-vector model (400 dimensions) up front.
initialise_word_embeddings()
initialise_pretrained_model(400)

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:05.790520 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:01.180743 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)


### TEST SET

In [8]:
# Load the raw test-set values from parquet.
values = load_parquet_values("../data/raw/test_values.parquet")

# Extract the features of every sample and write them to the timestamped test CSV.
extract_features_to_csv(X_test_filename, values)

# Drop the reference so the loaded values can be reclaimed.
values = None

Starting ../data/work/test_20201226-184943.csv at 2020-12-26 18:51:24.146382
Exporting 1578 column features
Finished. Processed 137353 rows in 0:28:58.703536


In [9]:
print(f'Finished at {datetime.now()}')

Finished at 2020-12-26 19:20:22.960766


### TRAIN SET

In [10]:
# Load the raw training-set values from parquet.
values = load_parquet_values("../data/raw/train_values.parquet")

# Extract the features of every sample and write them to the timestamped train CSV.
extract_features_to_csv(X_train_filename, values)

# Drop the reference so the loaded values can be reclaimed.
values = None

Starting ../data/work/train_20201226-184943.csv at 2020-12-26 19:20:25.240876
Exporting 1578 column features
Finished. Processed 412059 rows in 1:18:38.726698


In [11]:
print(f'Finished at {datetime.now()}')

Finished at 2020-12-26 20:39:04.195718


### VALIDATION SET

In [12]:
# Load the raw validation-set values from parquet.
values = load_parquet_values("../data/raw/val_values.parquet")

# Extract the features of every sample and write them to the timestamped validation CSV.
extract_features_to_csv(X_validation_filename, values)

# Drop the reference so the loaded values can be reclaimed.
values = None

Starting ../data/work/validation_20201226-184943.csv at 2020-12-26 20:39:05.082241
Exporting 1578 column features
Finished. Processed 137353 rows in 0:26:10.045277


In [13]:
print(f'Finished at {datetime.now()}')

Finished at 2020-12-26 21:05:15.229655


## EXTRACT FEATURES TO CSV (**OLD** METHOD) 

### TEST SET (OLD METHOD)

In [15]:
if report_memory:
    # Summarise in a single expression so that no notebook global keeps the list
    # of every live object alive; a retained list would prevent data released in
    # later cells from being garbage collected and would skew the next report.
    # Prints out a summary of the large objects
    summary.print_(summary.summarize(muppy.get_objects()))

In [7]:
test_samples = pd.read_parquet('../data/raw/test_values.parquet')
test_labels = pd.read_parquet('../data/raw/test_labels.parquet')

In [8]:
# Turn the string representations in the "values" column into lists of strings and
# collect the matching semantic types from the "type" column of the labels.
test_samples_converted, y_test = convert_string_lists_to_lists(test_samples, test_labels, "values", "type")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [9]:
# free memory
test_samples = None
test_labels = None

In [10]:
test_samples_converted.head()

20368     [Central Missouri, unattached, unattached, Kan...
664102    [95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, ...
366813    [Katie Crews, Christian Hiraldo, Alex Estrada,...
530567    [Christian, Non-Christian, Unreported, Jewish,...
176253    [AAF-McQuay Canada Inc., AAF-McQuay Canada Inc...
Name: values, dtype: object

In [11]:
# output "head" 
y_test[:5]

['affiliation', 'weight', 'jockey', 'religion', 'company']

### Given that feature extraction can take a long time, we only take the first 100 samples.

In [12]:
y_test_subset = y_test[:100]

In [13]:
# ensure embedding initialisation is outside of timing for extract_features
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model

initialise_word_embeddings()
initialise_pretrained_model(400)

Initialising word embeddings
Initialise Word Embeddings process took 0:00:05.994311 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:01.203899 seconds.


In [14]:
if report_memory:
    # Summarise in a single expression so that no notebook global keeps the list
    # of every live object alive; a retained list would prevent data released in
    # later cells from being garbage collected and would skew the next report.
    # Prints out a summary of the large objects
    summary.print_(summary.summarize(muppy.get_objects()))

In [29]:
# The name `extract_features` is rebound by the `sherlock.functional` import to the
# single-argument mapper used inside `extract_features_to_csv`. Import the
# file-writing variant under its own name so this call does not depend on the
# order in which cells were executed.
# NOTE(review): assumes preprocessing.extract_features accepts (output_filename, data),
# as the recorded output of this cell suggests - confirm.
from sherlock.features.preprocessing import extract_features as extract_features_to_file

start = datetime.now()

# Only the first 100 samples are processed here.
extract_features_to_file(X_test_filename, test_samples_converted.head(n=100))

print(f'Extract Features (test) process took {datetime.now() - start} seconds.')

Extracting Features:   7%|▋         | 7/100 [00:00<00:02, 31.56it/s]

Exporting 1578 column features


Extracting Features: 100%|██████████| 100/100 [00:02<00:00, 40.24it/s]

Extract Features (test) process took 0:00:02.489302 seconds.





In [None]:
test_samples_converted = None

In [None]:
import gc

gc.collect()

In [None]:
if report_memory:
    # Summarise in a single expression so that no notebook global keeps the list
    # of every live object alive; a retained list would prevent data released in
    # earlier cells from being garbage collected and would skew this report.
    # Prints out a summary of the large objects
    summary.print_(summary.summarize(muppy.get_objects()))

In [None]:
# over all, without memory management
# Extract Features (test) process took 3:40:25.799880 seconds.

In [None]:
# Baseline
# Extract Features (test) process took 0:11:04.137081 seconds.

# Iterations
# Extract Features (test) process took 0:00:56.671353 seconds. (cache word embeddings)
# Extract Features (test) process took 0:00:13.523261 seconds. (cache Doc2Vec)

### TRAIN SET (OLD METHOD)

In [None]:
train_samples = pd.read_parquet('../data/raw/train_values.parquet')
train_labels = pd.read_parquet('../data/raw/train_labels.parquet')

In [None]:
# Turn the string representations in the "values" column into lists of strings and
# collect the matching semantic types from the "type" column of the labels.
train_samples_converted, y_train = convert_string_lists_to_lists(train_samples, train_labels, "values", "type")

In [None]:
# free memory
train_samples = None
train_labels = None

In [None]:
y_train_subset = y_train[:100]

In [None]:
# The name `extract_features` is rebound by the `sherlock.functional` import to the
# single-argument mapper used inside `extract_features_to_csv`. Import the
# file-writing variant under its own name so this call does not depend on the
# order in which cells were executed.
# NOTE(review): assumes preprocessing.extract_features accepts (output_filename, data) - confirm.
from sherlock.features.preprocessing import extract_features as extract_features_to_file

start = datetime.now()

extract_features_to_file(X_train_filename, train_samples_converted)

print(f'Extract Features (train) process took {datetime.now() - start} seconds.')

In [None]:
train_samples_converted = None

### VALIDATION SET (OLD METHOD)

In [None]:
validation_samples = pd.read_parquet('../data/raw/val_values.parquet')
validation_labels = pd.read_parquet('../data/raw/val_labels.parquet')

In [None]:
# Turn the string representations in the "values" column into lists of strings and
# collect the matching semantic types from the "type" column of the labels.
validation_samples_converted, y_validation = convert_string_lists_to_lists(validation_samples, validation_labels, "values", "type")

In [None]:
# free memory
validation_samples = None
validation_labels = None

In [None]:
y_validation_subset = y_validation[:100]

In [None]:
# The name `extract_features` is rebound by the `sherlock.functional` import to the
# single-argument mapper used inside `extract_features_to_csv`. Import the
# file-writing variant under its own name so this call does not depend on the
# order in which cells were executed.
# NOTE(review): assumes preprocessing.extract_features accepts (output_filename, data) - confirm.
from sherlock.features.preprocessing import extract_features as extract_features_to_file

start = datetime.now()

extract_features_to_file(X_validation_filename, validation_samples_converted)

print(f'Extract Features (validation) process took {datetime.now() - start} seconds.')

In [None]:
validation_samples_converted = None

### Read Locally Processed Features

In [16]:
# Pin the timestamp of an earlier extraction run; this replaces any value of
# `timestr` (and the three filenames) set earlier in the session.
timestr = '20201224-105345'

# Feature CSVs of that run, one per data split.
X_test_filename, X_train_filename, X_validation_filename = (
    f'../data/work/test_{timestr}.csv',
    f'../data/work/train_{timestr}.csv',
    f'../data/work/validation_{timestr}.csv',
)

In [14]:
start = datetime.now()

# Read the extracted test features from the CSV named by X_test_filename.
X_test = pd.read_csv(X_test_filename)

print(f'Load Features (test) process took {datetime.now() - start} seconds.')

Load Features (test) process took 0:00:37.855500 seconds.


In [15]:
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-0.001,0.001006,-0.000732,0.000139,-0.001023,0.000647,0.000841,0.000608,0.000641,9.2e-05
1,True,False,0.210526,0.166205,0,1,0.0,4,0.016667,1.420094,...,-0.000429,0.000148,-0.000498,-0.001104,0.00038,-0.000146,0.00072,-0.001103,-0.000719,-0.001237
2,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-0.001026,-6.6e-05,-0.000258,-0.000787,0.001085,-0.000515,-8.6e-05,-0.000965,-0.00107,-0.000936
3,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,0.001046,3.1e-05,-0.00072,0.000739,-0.000891,-0.000133,-0.001018,-0.00063,-0.000442,-0.000617
4,True,False,0.02008,0.035741,0,2,0.0,5,96.521561,9.784149,...,0.000337,-0.000999,-0.000844,0.000755,-0.001139,0.000474,-0.000506,0.000785,0.000571,-0.000209


In [16]:
start = datetime.now()

# Read the extracted training features from the CSV named by X_train_filename.
X_train = pd.read_csv(X_train_filename)

print(f'Load Features (train) process took {datetime.now() - start} seconds.')

Load Features (train) process took 0:02:02.434049 seconds.


In [17]:
X_train.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,0.00059,-0.000621,-0.000125,-0.000316,0.00056,-0.000397,0.000358,0.000512,0.000716,-0.000111
1,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-0.000542,-9.4e-05,0.001015,0.000624,-0.000116,0.000525,1.4e-05,-0.000882,0.000631,-0.00076
2,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,0.001092,-0.000314,-0.000646,6e-06,0.000335,0.001145,0.000908,-0.000703,0.000594,-0.000548
3,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-0.000389,-0.001117,0.001101,0.001177,-0.000984,0.000332,-0.000916,0.000818,0.000485,-0.000822
4,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-4.8e-05,-0.000403,-0.000994,0.000605,-0.000721,0.000525,0.000824,-0.00122,-0.000475,-0.000648


In [18]:
start = datetime.now()

# Read the extracted validation features from the CSV named by X_validation_filename.
X_validation = pd.read_csv(X_validation_filename)

print(f'Load Features (validation) process took {datetime.now() - start} seconds.')

Load Features (validation) process took 0:00:38.160077 seconds.


In [19]:
X_validation.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-0.000717,-0.00013,0.001186,-0.000725,0.000864,-0.000655,0.001048,-6.6e-05,0.000996,0.000543
1,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,0.000436,-1e-06,0.001134,0.001093,0.000516,0.000571,0.000288,-0.00119,0.001071,0.000506
2,True,False,0.25,0.1875,0,1,0.0,10,-0.666667,1.154701,...,-0.000832,0.000847,0.001088,0.000945,0.000846,0.000262,0.000769,-0.000741,0.000132,-0.000985
3,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-0.000233,-0.000331,-0.000233,0.000242,-0.000856,-0.001208,-0.001075,-0.000803,-0.000702,-0.000797
4,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,0.000578,0.00062,0.00095,0.000361,-0.000398,0.001148,-0.00072,-0.000684,0.000582,-0.000153


## Impute NaN values with feature means

In [20]:
start = datetime.now()

# Per-column means of the training features, kept as a one-row DataFrame.
# NOTE: the timing printed below covers the mean computation as well as the transpose.
train_columns_means = pd.DataFrame(X_train.mean()).transpose()

print(f'Transpose process took {datetime.now() - start} seconds.')

Transpose process took 0:12:28.458519 seconds.


In [21]:
start = datetime.now()

# Fill the NaNs of all three splits in place, always using the training-set column means.
X_train.fillna(train_columns_means.iloc[0], inplace=True)
X_validation.fillna(train_columns_means.iloc[0], inplace=True)
X_test.fillna(train_columns_means.iloc[0], inplace=True)

# The means are no longer needed; drop the reference.
train_columns_means=None

print(f'FillNA process took {datetime.now() - start} seconds.')

FillNA process took 0:00:27.155404 seconds.


In [22]:
# index=False: the row index carries no information, and writing it would add a
# spurious 'Unnamed: 0' column when these files are read back with pd.read_csv.
X_train.to_csv('train.csv', index=False)
X_validation.to_csv('validation.csv', index=False)
X_test.to_csv('test.csv', index=False)

In [23]:
X_test = None

In [24]:
import gc

gc.collect()

40

## Restore the saved (CSV) session, ready for training

In [25]:
X_train = pd.read_csv('train.csv')
X_validation = pd.read_csv('validation.csv')

In [26]:
# Load the label files and flatten them into 1-D arrays of semantic types.
y_train = pd.read_parquet('../data/raw/train_labels.parquet').values.flatten()
y_validation = pd.read_parquet('../data/raw/val_labels.parquet').values.flatten()

In [27]:
y_train

array(['area', 'collection', 'team Name', ..., 'description', 'depth',
       'product'], dtype=object)

## Retrain sherlock
The model can be retrained using the code below. The model is currently restricted to being trained on 78 classes; the code of the model architecture will soon be added so that this can be adjusted.

In [28]:
print(f'Started at {datetime.now()}')

# Retrain on the imputed training features, validating on the validation split.
# The same nn_id is passed to predict_sherlock further down to use this model.
# The trailing ';' suppresses the cell's output of the call's return value.
train_sherlock(X_train, y_train, X_validation, y_validation, nn_id='retrained_sherlock3');

print('Trained and saved new model.')
print(f'Finished at {datetime.now()}')


Started at 2020-12-26 21:45:05.876629


W1226 21:45:06.117853 4443131328 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1226 21:45:06.122068 4443131328 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1226 21:45:06.129063 4443131328 deprecation.py:506] From /Users/lowecg/source/private-github/sherlock-project/venv/lib/python3.7/site-pa

Successfully loaded and compiled model, now fitting model on data.
Train on 412059 samples, validate on 137353 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100


Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Retrained Sherlock.
Trained and saved new model.
Finished at 2020-12-27 01:39:01.580431


## Generate predictions with a model
If you want to use the pretrained Sherlock model, set `nn_id` to "sherlock".

If you want to use another model, you can use the identifier corresponding to that model.

**Note**: There is a bug somewhere in the refactored code which affects the model predictions; this should be fixed soon.

In [29]:
X_test = pd.read_csv('test.csv')

In [30]:
y_test = pd.read_parquet('../data/raw/test_labels.parquet').values.flatten()

In [34]:
# Predict a semantic type for every test sample with the model identified by 'retrained_sherlock3'.
predicted_labels = predict_sherlock(X_test, nn_id='retrained_sherlock3')

In [35]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

prediction count 137353, type = <class 'numpy.ndarray'>


In [36]:
# Number of leading samples to score; len(y_test) means the whole test set.
size=len(y_test)

# Should be fully deterministic too.
# Weighted-average F1 of the predictions against the true labels.
f1_score(y_test[:size], predicted_labels[:size], average="weighted")

0.8550958902652783

In [27]:
pd.Series(predicted_labels).nunique()

78

In [29]:
pd.Series(y_test).nunique()

78

In [40]:
predicted_labels[:25]

array(['affiliation', 'weight', 'jockey', 'religion', 'company', 'grades',
       'area', 'component', 'company', 'brand', 'weight', 'genre',
       'album', 'origin', 'description', 'status', 'credit', 'team Name',
       'artist', 'address', 'age', 'album', 'club', 'description',
       'family'], dtype=object)

In [41]:
y_test[:25]

array(['affiliation', 'weight', 'jockey', 'religion', 'company', 'grades',
       'area', 'component', 'company', 'manufacturer', 'weight', 'genre',
       'album', 'origin', 'description', 'status', 'credit', 'team Name',
       'artist', 'address', 'age', 'album', 'club', 'description',
       'family'], dtype=object)

In [72]:
set(y_test)

{'address',
 'affiliate',
 'affiliation',
 'age',
 'album',
 'area',
 'artist',
 'birth Date',
 'birth Place',
 'brand',
 'capacity',
 'category',
 'city',
 'class',
 'classification',
 'club',
 'code',
 'collection',
 'command',
 'company',
 'component',
 'continent',
 'country',
 'county',
 'creator',
 'credit',
 'currency',
 'day',
 'depth',
 'description',
 'director',
 'duration',
 'education',
 'elevation',
 'family',
 'file Size',
 'format',
 'gender',
 'genre',
 'grades',
 'industry',
 'isbn',
 'jockey',
 'language',
 'location',
 'manufacturer',
 'name',
 'nationality',
 'notes',
 'operator',
 'order',
 'organisation',
 'origin',
 'owner',
 'person',
 'plays',
 'position',
 'product',
 'publisher',
 'range',
 'rank',
 'ranking',
 'region',
 'religion',
 'requirement',
 'result',
 'sales',
 'service',
 'sex',
 'species',
 'state',
 'status',
 'symbol',
 'team',
 'team Name',
 'type',
 'weight',
 'year'}

In [42]:
from collections import Counter

size = len(y_test)

# True label of every sample whose prediction disagrees with it, in sample order.
mismatches = [
    actual
    for position, actual in enumerate(y_test[:size])
    if actual != predicted_labels[position]
]

f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Tally the mismatches per true label, most frequently mispredicted first.
data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

Total mismatches: 22468 (F1 score: 0.836800124917518)


[('rank', 1277),
 ('name', 1083),
 ('position', 861),
 ('class', 840),
 ('plays', 773),
 ('description', 679),
 ('region', 667),
 ('location', 653),
 ('day', 567),
 ('artist', 541),
 ('type', 482),
 ('category', 475),
 ('team', 470),
 ('area', 464),
 ('team Name', 462),
 ('city', 459),
 ('notes', 456),
 ('album', 442),
 ('company', 406),
 ('owner', 400),
 ('code', 399),
 ('sex', 394),
 ('order', 386),
 ('product', 380),
 ('manufacturer', 376),
 ('age', 367),
 ('ranking', 348),
 ('status', 313),
 ('person', 288),
 ('credit', 277),
 ('country', 264),
 ('brand', 250),
 ('service', 245),
 ('county', 243),
 ('result', 240),
 ('year', 239),
 ('state', 232),
 ('component', 231),
 ('weight', 231),
 ('gender', 227),
 ('sales', 222),
 ('duration', 221),
 ('address', 219),
 ('club', 202),
 ('format', 196),
 ('origin', 185),
 ('range', 179),
 ('nationality', 154),
 ('director', 153),
 ('capacity', 149),
 ('family', 145),
 ('command', 138),
 ('publisher', 133),
 ('jockey', 133),
 ('classification',

In [15]:
test_samples = pd.read_parquet('../data/raw/test_values.parquet')

In [78]:
# Position of the single test sample to inspect.
idx = 123758
# Parse the stored string representation of that row back into Python literals.
converted = test_samples.iloc[idx].apply(literal_eval).to_list()

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

Predicted "jockey", actual label "name". Actual values:
[['Alexander Steen', 'Patrik Berglund', 'Barret Jackman', 'Vladimir Sobotka', 'Chris Stewart', 'Kevin Shattenkirk', 'David Perron', 'Alex Pietrangelo', 'Andy McDonald']]


## Generate predictions with preprocessed data using Sherlock

Requires the data to be downloaded from Google Drive (see first step in notebook).

In [None]:
# Load the preprocessed test features and labels shipped with the download.
# NOTE(review): these paths use ../data/data/processed while the raw files are read
# from ../data/raw - confirm the directory layout of the unpacked download.
X_test_preprocessed = pd.read_parquet("../data/data/processed/X_test.parquet")
# The label index is reset to a plain 0..n-1 range.
y_test_preprocessed = pd.read_parquet("../data/data/processed/y_test.parquet").reset_index(drop=True)

In [None]:
X_test_preprocessed.head()

In [None]:
#X_test_preprocessed.to_csv('test_preprocessed.csv')

In [None]:
X_test.head()

In [None]:
y_test_preprocessed.head()

In [None]:
# Predict with the model identified by 'sherlock' on the first 25 preprocessed samples only.
predicted_labels = predict_sherlock(X_test_preprocessed.head(n=25), 'sherlock')

In [None]:
# Weighted-average F1 over the same 25 samples that were predicted.
f1_score(y_test_preprocessed.head(n=25), predicted_labels, average='weighted')

In [None]:
predicted_labels[:10]

In [None]:
y_test_preprocessed.head(n=10)

In [None]:
from itertools import zip_longest

# Compare the column names (and their order) of the preprocessed test features
# with those of the locally extracted test features.
first_keys = X_test_preprocessed.columns
first_keys_str = ','.join(first_keys)

keys = ','.join(X_test.columns)
if first_keys_str == keys:
    print('Keys are equal')
else:
    key_list = list(X_test.columns)

    print(f'keys are NOT equal. k1 len={len(first_keys)}, k2 len={len(key_list)}')

    # zip_longest pads the shorter side with None, so frames with a different
    # number of columns are reported in full instead of raising IndexError
    # (or silently ignoring surplus columns of X_test).
    for k1, k2 in zip_longest(first_keys, key_list):
        if k1 == k2:
            print(f'{k1} == {k2}')
        else:
            print(f'{k1} != {k2}')