In [1]:
%load_ext autoreload
%autoreload 2

# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data, 
retrain Sherlock and generate predictions.

In [2]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf
from ast import literal_eval

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

from pympler import muppy, summary
from datetime import datetime

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download +/- 800 MB of data into the `features/` directory.

In [3]:
helpers.download_data()
prepare_feature_extraction()

Downloading the raw and preprocessed data into ../data/data.zip.
Data was downloaded.
Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [4]:
report_memory = False

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [5]:
import time

# Stamp this run once so that the three feature files it produces share a suffix.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Destination CSV files for the extracted features, one per data split.
X_train_filename_csv = f'../data/work/train_{timestr}.csv'
X_validation_filename_csv = f'../data/work/validation_{timestr}.csv'
X_test_filename_csv = f'../data/work/test_{timestr}.csv'

In [6]:
import io
import csv
from datetime import datetime
from functional import pseq
from sherlock.functional import as_py_str, to_literal, randomise_sample, as_str_series, dropna, extract_features, normalise_string_whitespace, keys_on_first
from pyarrow.parquet import ParquetFile


def keys_to_csv(keys):
    """Render `keys` as a single CSV record and return it as a string.

    Non-numeric values are quoted (csv.QUOTE_NONNUMERIC). The returned text
    includes the csv writer's default line terminator at the end.
    """
    buffer = io.StringIO()
    try:
        csv.writer(buffer, quoting=csv.QUOTE_NONNUMERIC).writerow(keys)
        return buffer.getvalue()
    finally:
        # Release the in-memory buffer whether or not writing succeeded.
        buffer.close()
    

def load_parquet_values(path):
    """Load the 'values' column from the Parquet file at `path`.

    The column is read across every row group in the file, so files written
    with more than one row group are loaded completely rather than being
    truncated to the first group.

    Args:
        path: location of the Parquet file to read.

    Returns:
        The 'values' column of the table read from `path`.
    """
    pf = ParquetFile(source=path)

    # read() covers all row groups; only the single column used downstream is loaded.
    table = pf.read(columns=['values'])

    return table['values']


def extract_features_to_csv(output_path, parquet_values):
    """Run the feature-extraction pipeline over `parquet_values` and stream the result to a CSV file.

    The values are pushed through a parallel `pseq` pipeline (6 processes,
    partitions of 10). Each pipeline result is a `(keys, values_str)` pair:
    the keys of the first result are written once as the header record, and
    every `values_str` is written as one line of the file.

    Args:
        output_path: path of the CSV file to create (overwritten if it exists).
        parquet_values: iterable of raw column values fed into the pipeline.

    Prints the start time, the number of exported feature columns, and the
    number of processed rows with the elapsed time.
    """
    # Key verification is incompatible with keys_on_first, so it stays disabled.
    verify_keys = False
    first_keys = None
    first_keys_str = None
    i = 0

    start = datetime.now()

    print(f'Starting {output_path} at {start}')

    with open(output_path, "w") as outfile:
        # Comparable performance with using pool.imap directly, but the code is *much* cleaner
        # to-do: make keys_on_first a partial, and pass in the verify_keys
        pipeline = (
            pseq(parquet_values, processes=6, partition_size=10)
            .map(as_py_str)
            .map(to_literal)
            .map(randomise_sample)
            .map(normalise_string_whitespace)
            .map(as_str_series)
            .map(dropna)
            .map(extract_features)
            .map(keys_on_first)
        )

        for keys, values_str in pipeline:
            i += 1

            if first_keys is None:
                first_keys = keys
                # keys_to_csv ends the record with the csv line terminator; strip it so
                # the header is followed by exactly one newline, like every data row.
                first_keys_str = keys_to_csv(keys).rstrip('\r\n')

                print(f'Exporting {len(first_keys)} column features')

                outfile.write(first_keys_str)
                outfile.write('\n')
            elif verify_keys:  # incompatible with keys_on_first
                # Format the keys the same way as the header so the comparison is like-for-like.
                keys_str = keys_to_csv(keys).rstrip('\r\n')
                if first_keys_str != keys_str:
                    key_list = list(keys)

                    print(f'keys are NOT equal. k1 len={len(first_keys)}, k2 len={len(keys)}')

                    # zip stops at the shorter list, so a length mismatch (already
                    # reported above) cannot raise IndexError here.
                    for k1, k2 in zip(first_keys, key_list):
                        if k1 != k2:
                            print(f'{k1} != {k2}')

            outfile.write(values_str)
            outfile.write('\n')

    print(f'Finished. Processed {i} rows in {datetime.now() - start}')

## EXTRACT FEATURES TO CSV (NEW METHOD)

### PREPARATION

In [7]:
# ensure embedding initialisation is outside of timing for extract_features
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model
from sherlock.features.preprocessing import prepare_feature_extraction

prepare_feature_extraction()
initialise_word_embeddings()
initialise_pretrained_model(400)

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:05.617424 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:00.345168 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)


### TEST SET

In [8]:
# Load the raw test column values from the Parquet file.
values = load_parquet_values("../data/raw/test_values.parquet")

# Extract the features and write them to the test CSV for this run.
extract_features_to_csv(X_test_filename_csv, values)

# Drop the reference to the raw values once extraction has finished.
values = None

Starting ../data/work/test_20201228-095113.csv at 2020-12-28 09:51:20.027644
Exporting 1578 column features
Finished. Processed 137353 rows in 0:17:38.860040


In [9]:
print(f'Finished at {datetime.now()}')

Finished at 2020-12-28 10:08:59.044344


### TRAIN SET

In [10]:
# Load the raw training column values from the Parquet file.
values = load_parquet_values("../data/raw/train_values.parquet")

# Extract the features and write them to the training CSV for this run.
extract_features_to_csv(X_train_filename_csv, values)

# Drop the reference to the raw values once extraction has finished.
values = None

Starting ../data/work/train_20201228-095113.csv at 2020-12-28 10:09:01.788163
Exporting 1578 column features
Finished. Processed 412059 rows in 0:50:56.331647


In [11]:
print(f'Finished at {datetime.now()}')

Finished at 2020-12-28 10:59:58.393430


### VALIDATION SET

In [12]:
# Load the raw validation column values from the Parquet file.
values = load_parquet_values("../data/raw/val_values.parquet")

# Extract the features and write them to the validation CSV for this run.
extract_features_to_csv(X_validation_filename_csv, values)

# Drop the reference to the raw values once extraction has finished.
values = None

Starting ../data/work/validation_20201228-095113.csv at 2020-12-28 10:59:59.267311
Exporting 1578 column features
Finished. Processed 137353 rows in 0:16:53.218644


In [13]:
print(f'Finished at {datetime.now()}')

Finished at 2020-12-28 11:16:52.589097


## EXTRACT FEATURES TO CSV (**OLD** METHOD) 

### TEST SET (OLD METHOD)

In [None]:
if report_memory:
    all_objects = muppy.get_objects()
    sum1 = summary.summarize(all_objects)
    # Prints out a summary of the large objects
    summary.print_(sum1)

In [None]:
test_samples = pd.read_parquet('../data/raw/test_values.parquet')
test_labels = pd.read_parquet('../data/raw/test_labels.parquet')

In [None]:
test_samples_converted, y_test = convert_string_lists_to_lists(test_samples, test_labels, "values", "type")

In [None]:
# free memory
test_samples = None
test_labels = None

In [None]:
test_samples_converted.head()

In [None]:
# output "head" 
y_test[:5]

### Given that feature extraction can take a long time, we only take the first 100 samples.

In [None]:
y_test_subset = y_test[:100]

In [None]:
# ensure embedding initialisation is outside of timing for extract_features
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model

initialise_word_embeddings()
initialise_pretrained_model(400)

In [None]:
if report_memory:
    all_objects = muppy.get_objects()
    sum1 = summary.summarize(all_objects)
    # Prints out a summary of the large objects
    summary.print_(sum1)

In [None]:
start = datetime.now()

#X_test = extract_features(test_samples_converted.head(n=100))
# `X_test_filename` is not defined anywhere in this notebook; use the test CSV path
# defined alongside the other output filenames. This is the same file the new method
# writes, so run only one of the two methods per timestamp.
# NOTE(review): `extract_features` is also imported from sherlock.functional in the
# feature-extraction helpers cell, which rebinds the name - confirm which implementation
# this old-method call expects.
extract_features(X_test_filename_csv, test_samples_converted.head(n=100))

print(f'Extract Features (test) process took {datetime.now() - start} seconds.')

In [None]:
test_samples_converted = None

In [None]:
import gc

gc.collect()

In [None]:
if report_memory:
    all_objects = muppy.get_objects()
    sum1 = summary.summarize(all_objects)
    # Prints out a summary of the large objects
    summary.print_(sum1)

In [None]:
# over all, without memory management
# Extract Features (test) process took 3:40:25.799880 seconds.

In [None]:
# Baseline
# Extract Features (test) process took 0:11:04.137081 seconds.

# Iterations
# Extract Features (test) process took 0:00:56.671353 seconds. (cache word embeddings)
# Extract Features (test) process took 0:00:13.523261 seconds. (cache Doc2Vec)

### TRAIN SET (OLD METHOD)

In [None]:
train_samples = pd.read_parquet('../data/raw/train_values.parquet')
train_labels = pd.read_parquet('../data/raw/train_labels.parquet')

In [None]:
train_samples_converted, y_train = convert_string_lists_to_lists(train_samples, train_labels, "values", "type")

In [None]:
# free memory
train_samples = None
train_labels = None

In [None]:
y_train_subset = y_train[:100]

In [None]:
start = datetime.now()

# `X_train_filename` is not defined anywhere in this notebook; use the training CSV path
# defined alongside the other output filenames. This is the same file the new method
# writes, so run only one of the two methods per timestamp.
# NOTE(review): `extract_features` is also imported from sherlock.functional in the
# feature-extraction helpers cell, which rebinds the name - confirm which implementation
# this old-method call expects.
extract_features(X_train_filename_csv, train_samples_converted)

print(f'Extract Features (train) process took {datetime.now() - start} seconds.')

In [None]:
train_samples_converted = None

### VALIDATION SET (OLD METHOD)

In [None]:
validation_samples = pd.read_parquet('../data/raw/val_values.parquet')
validation_labels = pd.read_parquet('../data/raw/val_labels.parquet')

In [None]:
validation_samples_converted, y_validation = convert_string_lists_to_lists(validation_samples, validation_labels, "values", "type")

In [None]:
# free memory
validation_samples = None
validation_labels = None

In [None]:
y_validation_subset = y_validation[:100]

In [None]:
start = datetime.now()

# `X_validation_filename` is not defined anywhere in this notebook; use the validation CSV
# path defined alongside the other output filenames. This is the same file the new method
# writes, so run only one of the two methods per timestamp.
# NOTE(review): `extract_features` is also imported from sherlock.functional in the
# feature-extraction helpers cell, which rebinds the name - confirm which implementation
# this old-method call expects.
extract_features(X_validation_filename_csv, validation_samples_converted)

print(f'Extract Features (validation) process took {datetime.now() - start} seconds.')

In [None]:
validation_samples_converted = None

### Read Locally Processed Features

In [None]:
# Replaces the run timestamp with a fixed value, so the three filenames below are
# rebuilt to point at files stamped 20201224-105345 instead of the current run.
# NOTE(review): presumably meant to be run only when reusing the output of that
# earlier extraction - confirm before executing as part of a full run.
timestr = '20201224-105345'

X_test_filename_csv = f'../data/work/test_{timestr}.csv'
X_train_filename_csv = f'../data/work/train_{timestr}.csv'
X_validation_filename_csv = f'../data/work/validation_{timestr}.csv'

In [14]:
start = datetime.now()

X_test = pd.read_csv(X_test_filename_csv, dtype=np.float32)

print(f'Load Features (test) process took {datetime.now() - start} seconds.')

Load Features (test) process took 0:00:30.782784 seconds.


In [15]:
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.000858,0.00076,-0.000477,-0.000894,0.000838,-0.001224,0.000353,-0.000655,0.000566,0.000537
1,1.0,0.0,0.368421,0.33795,0.0,2.0,0.0,7.0,0.742677,1.32687,...,0.000528,-0.000353,-0.000504,0.000945,-0.000456,-0.000986,-0.000151,0.000882,-0.00031,-0.000562
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.000518,-4.6e-05,-0.001005,-0.000698,0.00038,-0.000641,-0.000597,0.001205,-0.000855,-0.000297
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.000597,0.000691,-0.000288,-0.000656,0.00105,0.00039,-0.000236,-0.000667,0.000136,0.000689
4,1.0,0.0,0.02008,0.035741,0.0,2.0,0.0,5.0,96.521599,9.78415,...,-0.000113,-0.000873,0.000573,0.001029,0.00066,0.000528,-0.000259,-0.00065,0.001239,-0.000386


In [16]:
start = datetime.now()

X_train = pd.read_csv(X_train_filename_csv, dtype=np.float32)

print(f'Load Features (train) process took {datetime.now() - start} seconds.')

Load Features (train) process took 0:01:34.038627 seconds.


In [17]:
X_train.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.000187,2.9e-05,0.001129,-0.00015,0.000268,0.000933,0.000559,-0.000327,0.000676,-0.00116
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.000521,0.001054,0.001103,-0.001192,0.000949,0.000982,-0.00071,-0.000395,0.000786,0.000155
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.000786,4.8e-05,-0.00054,0.001154,0.000818,0.000298,-0.000822,-0.001179,0.000924,0.00099
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.000516,0.000566,-0.001198,7e-06,-0.000356,-0.000701,-0.000855,-0.000915,0.000859,0.001127
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.00107,0.000864,0.000288,-0.000666,0.000863,-0.001102,-0.000942,-0.000307,-0.000346,0.000427


In [18]:
start = datetime.now()

X_validation = pd.read_csv(X_validation_filename_csv, dtype=np.float32)

print(f'Load Features (validation) process took {datetime.now() - start} seconds.')

Load Features (validation) process took 0:00:31.175299 seconds.


In [19]:
X_validation.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-4.1e-05,-0.000604,4.7e-05,-0.001002,-0.000816,-0.000517,-0.000162,0.000447,-0.000991,-0.001119
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.000346,0.000752,0.000917,0.000994,-0.001099,0.000208,0.000226,0.000293,-0.001147,0.000102
2,1.0,0.0,0.25,0.1875,0.0,1.0,0.0,10.0,-0.666667,1.1547,...,-0.000214,0.000341,-0.000271,0.000169,-0.000431,-0.000201,-0.000945,-0.001021,0.000314,0.000636
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,3.9e-05,-0.000419,-0.000654,-3.8e-05,-0.000995,-0.000175,-0.000871,0.000118,0.00052,0.000551
4,1.0,0.0,0.09434,0.311855,0.0,4.0,0.0,5.0,42.275002,6.53059,...,-0.000664,-0.000761,-0.0006,-0.000969,-0.000591,0.00063,0.000505,0.000988,0.000451,-0.000105


## Impute NaN values with feature means

In [20]:
start = datetime.now()

# Column means of the training features, held as a one-row DataFrame.
train_columns_means = pd.DataFrame(X_train.mean()).transpose()

# The timed span covers the mean computation as well as the transpose.
print(f'Transpose process took {datetime.now() - start} seconds.')

Transpose process took 0:00:19.554785 seconds.


In [21]:
start = datetime.now()

# Fill NaNs in all three splits, in place, with the per-column means of the
# training set (row 0 of the one-row means frame).
X_train.fillna(train_columns_means.iloc[0], inplace=True)
X_validation.fillna(train_columns_means.iloc[0], inplace=True)
X_test.fillna(train_columns_means.iloc[0], inplace=True)

# Drop the reference to the means frame.
train_columns_means=None

print(f'FillNA process took {datetime.now() - start} seconds.')

FillNA process took 0:00:01.880830 seconds.


In [22]:
start = datetime.now()

X_train.to_parquet('train.parquet', engine='pyarrow', compression='snappy')
X_validation.to_parquet('validation.parquet', engine='pyarrow', compression='snappy')
X_test.to_parquet('test.parquet', engine='pyarrow', compression='snappy')

print(f'Save parquet process took {datetime.now() - start} seconds.')

In [23]:
X_test = None

In [24]:
import gc

gc.collect()

40

In [25]:
print(f'Finished at {datetime.now()}')

Finished at 2020-12-28 11:20:47.922207


## Restore saved features ready for training

In [None]:
# The imputed feature frames are saved by this notebook as Parquet files
# (train.parquet / validation.parquet); train.csv and validation.csv are never written.
X_train = pd.read_parquet('train.parquet')
X_validation = pd.read_parquet('validation.parquet')

In [None]:
y_train = pd.read_parquet('../data/raw/train_labels.parquet').values.flatten()
y_validation = pd.read_parquet('../data/raw/val_labels.parquet').values.flatten()

In [None]:
y_train

## Retrain sherlock
The model can be retrained using the code below. The model is currently restricted to training on 78 classes; the code of the model architecture will soon be added so that this can be adjusted.

In [None]:
print(f'Started at {datetime.now()}')

train_sherlock(X_train, y_train, X_validation, y_validation, nn_id='retrained_sherlock3');

print('Trained and saved new model.')
print(f'Finished at {datetime.now()}')


## Generate predictions with a model
If you want to use the pretrained Sherlock model, set `nn_id` to "sherlock".

If you want to use another model, you can use the identifier corresponding to that model.

**Note**: There is a bug somewhere in the refactored code which affects the model predictions; this should be fixed soon.

In [None]:
# The imputed test features are saved by this notebook as test.parquet; test.csv is never written.
X_test = pd.read_parquet('test.parquet')

In [None]:
y_test = pd.read_parquet('../data/raw/test_labels.parquet').values.flatten()

In [None]:
predicted_labels = predict_sherlock(X_test, nn_id='retrained_sherlock3')

In [None]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

In [None]:
size=len(y_test)

# Should be fully deterministic too.
f1_score(y_test[:size], predicted_labels[:size], average="weighted")

In [None]:
pd.Series(predicted_labels).nunique()

In [None]:
pd.Series(y_test).nunique()

In [None]:
predicted_labels[:25]

In [None]:
y_test[:25]

In [None]:
set(y_test)

In [None]:
from collections import Counter

size = len(y_test)

# True label of every sample whose predicted label disagrees with it.
mismatches = [
    expected
    for idx, expected in enumerate(y_test[:size])
    if expected != predicted_labels[idx]
]

f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Misclassified true labels with their counts, most frequent first.
data = Counter(mismatches)
data.most_common()

In [None]:
test_samples = pd.read_parquet('../data/raw/test_values.parquet')

In [None]:
idx = 123758
converted = test_samples.iloc[idx].apply(literal_eval).to_list()

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

## Generate predictions with preprocessed data using Sherlock

Requires the data to be downloaded from Google Drive (see first step in notebook).

In [None]:
X_test_preprocessed = pd.read_parquet("../data/data/processed/X_test.parquet")
y_test_preprocessed = pd.read_parquet("../data/data/processed/y_test.parquet").reset_index(drop=True)

In [None]:
X_test_preprocessed.head()

In [None]:
#X_test_preprocessed.to_csv('test_preprocessed.csv')

In [None]:
X_test.head()

In [None]:
y_test_preprocessed.head()

In [None]:
predicted_labels = predict_sherlock(X_test_preprocessed.head(n=25), 'sherlock')

In [None]:
f1_score(y_test_preprocessed.head(n=25), predicted_labels, average='weighted')

In [None]:
predicted_labels[:10]

In [None]:
y_test_preprocessed.head(n=10)

In [None]:
from itertools import zip_longest

# Compare the column names of the preprocessed features with those of the
# locally extracted features, position by position.
first_keys = X_test_preprocessed.columns
first_keys_str = ','.join(first_keys)

keys = ','.join(X_test.columns)
if first_keys_str == keys:
    print('Keys are equal')
else:
    key_list = list(X_test.columns)

    print(f'keys are NOT equal. k1 len={len(first_keys)}, k2 len={len(key_list)}')

    # zip_longest walks both lists to the end, so a frame with fewer columns no
    # longer raises IndexError and surplus columns on either side are reported.
    for k1, k2 in zip_longest(first_keys, key_list, fillvalue='<missing>'):
        if k1 == k2:
            print(f'{k1} == {k2}')
        else:
            print(f'{k1} != {k2}')