In [1]:
%load_ext autoreload
%autoreload 2

# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data, 
retrain Sherlock and generate predictions.

In [2]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf
from ast import literal_eval

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

from pympler import muppy, summary
from datetime import datetime

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download +/- 800 MB of data into the `features/` directory.

In [3]:
# Download the raw and preprocessed data (the log below reports ../data/data.zip as the target).
helpers.download_data()
# Make sure the word/paragraph embedding files needed for feature extraction are present.
prepare_feature_extraction()

Downloading the raw and preprocessed data into ../data/data.zip.
Data was downloaded.
Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [4]:
# Set to True to print pympler object summaries in the `if report_memory:` cells further down (slow).
report_memory = False

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [5]:
import time 

# Timestamp with one-second resolution; it tags every output file written by this run.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Destination CSVs for the extracted features, one per data split.
X_test_filename_csv = f'../data/work/test_{timestr}.csv'
X_train_filename_csv = f'../data/work/train_{timestr}.csv'
X_validation_filename_csv = f'../data/work/validation_{timestr}.csv'

In [6]:
import io
import csv
from datetime import datetime
from functional import pseq
from sherlock.functional import as_py_str, to_literal, randomise_sample, as_str_series, dropna, extract_features, normalise_string_whitespace, keys_on_first
from pyarrow.parquet import ParquetFile


def keys_to_csv(keys):
    """Serialise ``keys`` as a single CSV row and return it as a string.

    Non-numeric entries are quoted (``csv.QUOTE_NONNUMERIC``). The returned
    string includes the csv writer's default row terminator (``\\r\\n``).
    """
    buffer = io.StringIO()
    try:
        csv.writer(buffer, quoting=csv.QUOTE_NONNUMERIC).writerow(keys)
        return buffer.getvalue()
    finally:
        buffer.close()
    

def load_parquet_values(path):
    """Load the ``values`` column of a parquet file.

    All row groups are read, so files stored in more than one row group are
    loaded in full, and only the ``values`` column is materialised.

    Args:
        path: Location of the parquet file.

    Returns:
        The ``values`` column of the pyarrow table read from ``path``.
    """
    pf = ParquetFile(source = path)
    table = pf.read(columns=['values'])

    return table['values']


def extract_features_to_csv(output_path, parquet_values):
    """Run the feature-extraction pipeline over ``parquet_values`` and stream the result to a CSV file.

    Each element produced by the pipeline is unpacked as ``(keys, values_str)``.
    The keys of the first element are written once as the header row; every
    ``values_str`` is then written on its own line. The start time, the number
    of exported columns and the total elapsed time are printed.

    Args:
        output_path: Path of the CSV file to create (an existing file is overwritten).
        parquet_values: Iterable of raw column samples, e.g. the result of
            ``load_parquet_values``.
    """
    # incompatible with keys_on_first
    verify_keys = False
    first_keys = None
    first_keys_str = None
    i = 0

    start = datetime.now()

    print(f'Starting {output_path} at {start}')

    with open(output_path, "w") as outfile:
        # Comparable performance with using pool.imap directly, but the code is *much* cleaner
        for keys, values_str in pseq(parquet_values, processes=6, partition_size=10)\
            .map(as_py_str)\
            .map(to_literal)\
            .map(randomise_sample)\
            .map(normalise_string_whitespace)\
            .map(as_str_series)\
            .map(dropna)\
            .map(extract_features)\
            .map(keys_on_first): # to-do: make this function a partial, and pass in the verify_keys
                i += 1

                if first_keys is None:
                    first_keys = keys
                    # keys_to_csv ends the row with the csv module's '\r\n' terminator;
                    # strip it so the header is followed by exactly one '\n' (no stray
                    # '\r' and no blank line after the header).
                    first_keys_str = keys_to_csv(keys).rstrip('\r\n')

                    print(f'Exporting {len(first_keys)} column features')

                    outfile.write(first_keys_str)
                    outfile.write('\n')
                elif verify_keys and keys is not None: # incompatible with keys_on_first
                    # NOTE(review): keys_on_first presumably supplies keys only for the
                    # first row - confirm before enabling verify_keys.
                    key_list = list(keys)

                    # Serialise with the same helper as the header so the two strings are comparable.
                    if keys_to_csv(key_list).rstrip('\r\n') != first_keys_str:
                        print(f'keys are NOT equal. k1 len={len(first_keys)}, k2 len={len(key_list)}')

                        # zip stops at the shorter sequence, so differing lengths cannot raise IndexError.
                        for k1, k2 in zip(first_keys, key_list):
                            if k1 != k2:
                                print(f'{k1} != {k2}')

                outfile.write(values_str)
                outfile.write('\n')

    print(f'Finished. Processed {i} rows in {datetime.now() - start}')

### PREPARATION

In [7]:
# ensure embedding initialisation is outside of timing for extract_features
# (run the one-off setup here so its cost is not counted in the extraction timings below)
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import prepare_feature_extraction

prepare_feature_extraction()
initialise_word_embeddings()
# 400 matches the "Doc2Vec Model, 400 dim" reported in the log output of this cell.
initialise_pretrained_model(400)
initialise_nltk()

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:06.248213 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:03.551956 seconds. (filename = ../sherlock/features/par_vec_retrained_400.pkl)
Initialised NLTK, process took 0:00:00.154631 seconds.


[nltk_data] Downloading package punkt to /Users/lowecg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lowecg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## EXTRACT FEATURES TO CSV (NEW METHOD)

### TEST SET

In [8]:
# Load the raw "values" column of the test split.
values = load_parquet_values("../data/raw/test_values.parquet")

# Extract features and stream them to this run's test CSV.
extract_features_to_csv(X_test_filename_csv, values)

# Drop the reference so the raw values can be garbage-collected.
values = None

Starting ../data/work/test_20201231-090535.csv at 2020-12-31 09:05:46.013873
Exporting 1578 column features
Finished. Processed 137353 rows in 0:09:48.036084


In [9]:
# Finished. Processed 137353 rows in 0:14:53.196073

In [10]:
print(f'Finished at {datetime.now()}')

Finished at 2020-12-31 09:15:34.284739


### TRAIN SET

In [11]:
# Load the raw "values" column of the training split.
values = load_parquet_values("../data/raw/train_values.parquet")

# Extract features and stream them to this run's training CSV.
extract_features_to_csv(X_train_filename_csv, values)

# Drop the reference so the raw values can be garbage-collected.
values = None

Starting ../data/work/train_20201231-090535.csv at 2020-12-31 09:15:36.099159
Exporting 1578 column features
Finished. Processed 412059 rows in 0:27:55.469915


In [12]:
print(f'Finished at {datetime.now()}')

Finished at 2020-12-31 09:43:31.821122


### VALIDATION SET

In [13]:
# Load the raw "values" column of the validation split.
values = load_parquet_values("../data/raw/val_values.parquet")

# Extract features and stream them to this run's validation CSV.
extract_features_to_csv(X_validation_filename_csv, values)

# Drop the reference so the raw values can be garbage-collected.
values = None

Starting ../data/work/validation_20201231-090535.csv at 2020-12-31 09:43:32.695194
Exporting 1578 column features
Finished. Processed 137353 rows in 0:09:26.351942


In [14]:
print(f'Finished at {datetime.now()}')

Finished at 2020-12-31 09:52:59.162628


## EXTRACT FEATURES TO CSV (**OLD** METHOD) 

### TEST SET (OLD METHOD)

In [8]:
# Optional memory report; only runs when report_memory is True.
if report_memory:
    all_objects = muppy.get_objects()
    sum1 = summary.summarize(all_objects)
    # Prints out a summary of the large objects
    summary.print_(sum1)

In [8]:
# Old method: load the raw test values and their labels as pandas objects.
test_samples = pd.read_parquet('../data/raw/test_values.parquet')
test_labels = pd.read_parquet('../data/raw/test_labels.parquet')

In [9]:
# Convert the string representations in the "values" column into lists and take the labels
# from the "type" column (the outputs below show a Series of lists and a list of label strings).
test_samples_converted, y_test = convert_string_lists_to_lists(test_samples, test_labels, "values", "type")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [10]:
# free memory
test_samples = None
test_labels = None

In [11]:
test_samples_converted.head()

20368     [Central Missouri, unattached, unattached, Kan...
664102    [95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, ...
366813    [Katie Crews, Christian Hiraldo, Alex Estrada,...
530567    [Christian, Non-Christian, Unreported, Jewish,...
176253    [AAF-McQuay Canada Inc., AAF-McQuay Canada Inc...
Name: values, dtype: object

In [12]:
# output "head" 
y_test[:5]

['affiliation', 'weight', 'jockey', 'religion', 'company']

### Given that feature extraction can take a long time, we only take the first 100 samples.

In [13]:
y_test_subset = y_test[:100]

In [14]:
# Optional memory report; only runs when report_memory is True.
if report_memory:
    all_objects = muppy.get_objects()
    sum1 = summary.summarize(all_objects)
    # Prints out a summary of the large objects
    summary.print_(sum1)

In [15]:
%load_ext line_profiler

In [24]:
# Re-import the preprocessing version of extract_features: an earlier cell imported a
# same-named function from sherlock.functional, and the most recent import wins.
from sherlock.features.preprocessing import extract_features

start = datetime.now()

#extract_features(X_test_filename, test_samples_converted.head(n=100))

# Line-profile sherlock.features.preprocessing while extracting features for the first
# 100 converted test samples; 'deleteme.csv' is passed as the output file name.
%lprun -m sherlock.features.preprocessing X_test=extract_features('deleteme.csv', test_samples_converted.head(n=100)) 

print(f'Extract Features (test) process took {datetime.now() - start} seconds.')

Extracting Features:  11%|█         | 11/100 [00:00<00:01, 47.45it/s]

Exporting 1578 column features


Extracting Features: 100%|██████████| 100/100 [00:01<00:00, 59.84it/s]

Extract Features (test) process took 0:00:01.718025 seconds.





In [26]:
test_samples_converted = None

In [22]:
from sherlock.features.helpers import generate_chars_col
generate_chars_col()

In [None]:
import gc

gc.collect()

In [None]:
# Optional memory report; only runs when report_memory is True.
if report_memory:
    all_objects = muppy.get_objects()
    sum1 = summary.summarize(all_objects)
    # Prints out a summary of the large objects
    summary.print_(sum1)

In [None]:
# over all, without memory management
# Extract Features (test) process took 3:40:25.799880 seconds.

In [None]:
# Baseline
# Extract Features (test) process took 0:11:04.137081 seconds.

# Iterations
# Extract Features (test) process took 0:00:56.671353 seconds. (cache word embeddings)
# Extract Features (test) process took 0:00:13.523261 seconds. (cache Doc2Vec)
# Extract Features (test) process took 0:00:03.674007 seconds. (loads of tweaks, use np.array for stats)
# Extract Features (test) process took 0:00:03.262298 seconds. (manually compute counts)
# Extract Features (test) process took 0:00:02.853031 seconds. (replace series.str.count in BoW, use series.tolist() in paragraph vectors )
# Extract Features (test) process took 0:00:01.718025 seconds. (compute mean, variance, skew and kurtosis together)

### TRAIN SET (OLD METHOD)

In [None]:
train_samples = pd.read_parquet('../data/raw/train_values.parquet')
train_labels = pd.read_parquet('../data/raw/train_labels.parquet')

In [None]:
train_samples_converted, y_train = convert_string_lists_to_lists(train_samples, train_labels, "values", "type")

In [None]:
# free memory
train_samples = None
train_labels = None

In [None]:
y_train_subset = y_train[:100]

In [None]:
# Time the old-method feature extraction over the full training set.
start = datetime.now()

# The output path variable defined in this notebook is X_train_filename_csv;
# X_train_filename is never assigned and raised NameError.
# NOTE(review): this call expects the (filename, samples) extract_features imported
# from sherlock.features.preprocessing in the profiling cell - confirm it ran first.
extract_features(X_train_filename_csv, train_samples_converted)

print(f'Extract Features (train) process took {datetime.now() - start} seconds.')

In [None]:
train_samples_converted = None

### VALIDATION SET (OLD METHOD)

In [None]:
validation_samples = pd.read_parquet('../data/raw/val_values.parquet')
validation_labels = pd.read_parquet('../data/raw/val_labels.parquet')

In [None]:
validation_samples_converted, y_validation = convert_string_lists_to_lists(validation_samples, validation_labels, "values", "type")

In [None]:
# free memory
validation_samples = None
validation_labels = None

In [None]:
y_validation_subset = y_validation[:100]

In [None]:
# Time the old-method feature extraction over the full validation set.
start = datetime.now()

# The output path variable defined in this notebook is X_validation_filename_csv;
# X_validation_filename is never assigned and raised NameError.
# NOTE(review): this call expects the (filename, samples) extract_features imported
# from sherlock.features.preprocessing in the profiling cell - confirm it ran first.
extract_features(X_validation_filename_csv, validation_samples_converted)

print(f'Extract Features (validation) process took {datetime.now() - start} seconds.')

In [None]:
validation_samples_converted = None

### Read Locally Processed Features

In [None]:
# NOTE(review): this hard-coded timestamp replaces the timestr generated earlier in the
# notebook, so the load cells below read the CSVs of that specific earlier run instead of
# the files written by the extraction cells above. Skip this cell to load fresh output.
timestr = '20201224-105345'

X_test_filename_csv = f'../data/work/test_{timestr}.csv'
X_train_filename_csv = f'../data/work/train_{timestr}.csv'
X_validation_filename_csv = f'../data/work/validation_{timestr}.csv'

In [15]:
start = datetime.now()

# Load the extracted test features, forcing every column to float32.
X_test = pd.read_csv(X_test_filename_csv, dtype=np.float32)

print(f'Load Features (test) process took {datetime.now() - start} seconds.')

Load Features (test) process took 0:00:31.353612 seconds.


In [16]:
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.161198,-0.079423,-0.045488,-0.120119,-0.014794,-0.02721,0.139047,-0.215492,0.136643,0.050392
1,1.0,0.0,0.368421,0.33795,0.0,2.0,0.0,7.0,0.742677,1.32687,...,0.198473,-0.212392,0.019466,-0.125395,0.088254,-0.104947,0.00837,-0.016458,-0.079247,0.120809
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.217175,-0.06189,-0.187662,-0.19951,-0.070486,-0.081841,-0.127853,-0.042899,-0.036182,0.1253
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.012347,-0.124472,-0.051345,0.066079,-0.095552,0.046814,-0.01114,-0.002393,0.112862,0.001077
4,1.0,0.0,0.02008,0.035741,0.0,2.0,0.0,5.0,96.521599,9.78415,...,0.560375,-0.543664,0.637601,-0.172634,1.10543,-0.085147,-0.301327,-0.182403,-0.740886,-0.178526


In [17]:
start = datetime.now()

# Load the extracted training features, forcing every column to float32.
X_train = pd.read_csv(X_train_filename_csv, dtype=np.float32)

print(f'Load Features (train) process took {datetime.now() - start} seconds.')

Load Features (train) process took 0:01:35.503106 seconds.


In [18]:
X_train.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.09597,-0.015877,-0.052533,-0.073722,0.069357,0.009126,0.048459,-0.00697,0.053385,0.032584
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.113287,-0.149856,-0.003312,-0.03853,-0.027172,-0.0554,0.116889,-0.021296,0.007829,0.173933
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.047728,-0.791269,-0.098732,0.124096,-0.01156,0.023012,0.560486,-0.324866,-0.141458,0.047281
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.166476,-0.265747,0.02619,0.084216,0.027764,-0.048622,0.061961,-0.048417,-0.082588,0.083388
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.337626,-0.199005,-0.046064,0.100829,-0.038889,-0.014319,0.213116,-0.101944,-0.005389,0.217992


In [19]:
start = datetime.now()

# Load the extracted validation features, forcing every column to float32.
X_validation = pd.read_csv(X_validation_filename_csv, dtype=np.float32)

print(f'Load Features (validation) process took {datetime.now() - start} seconds.')

Load Features (validation) process took 0:00:31.131727 seconds.


In [20]:
X_validation.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.131482,-0.070942,-0.031589,-0.117757,0.032934,-0.050159,0.178825,-0.154065,0.173346,0.198612
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.088263,-0.066631,-0.013783,-0.13704,0.0718,-0.087343,0.192935,-0.240598,0.146652,0.217533
2,1.0,0.0,0.25,0.1875,0.0,1.0,0.0,10.0,-0.666667,1.1547,...,0.097332,-0.031429,0.073273,0.213653,0.014368,-0.010966,0.470403,-0.295013,0.133404,-0.09526
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.176577,-0.047124,-0.11991,-0.14094,-0.134155,0.023145,0.005568,-0.019759,-0.038645,0.072193
4,1.0,0.0,0.09434,0.311855,0.0,4.0,0.0,5.0,42.275002,6.53059,...,-0.319834,-0.068512,0.034314,0.037491,0.13151,-0.283767,-0.22307,-0.086019,0.152444,0.314189


## Impute NaN values with feature means

In [21]:
start = datetime.now()

# Column-wise means of the training features, reshaped from a Series into a
# one-row DataFrame (the imputation cell reads the row back with .iloc[0]).
train_columns_means = pd.DataFrame(X_train.mean()).transpose()

print(f'Transpose process took {datetime.now() - start} seconds.')

Transpose process took 0:00:19.864713 seconds.


In [22]:
start = datetime.now()

# Fill NaNs in every split with the *training* column means, so validation and test
# are imputed with statistics computed from the training data only.
# inplace=True mutates the loaded frames directly.
X_train.fillna(train_columns_means.iloc[0], inplace=True)
X_validation.fillna(train_columns_means.iloc[0], inplace=True)
X_test.fillna(train_columns_means.iloc[0], inplace=True)

# Drop the reference to the means. Re-running this cell on its own will then fail
# until the cell that computes train_columns_means has been run again.
train_columns_means=None

print(f'FillNA process took {datetime.now() - start} seconds.')

FillNA process took 0:00:01.983939 seconds.


In [23]:
start = datetime.now()

# Persist the imputed feature frames as snappy-compressed parquet files
# (relative paths, so they land in the current working directory).
X_train.to_parquet('train.parquet', engine='pyarrow', compression='snappy')
X_validation.to_parquet('validation.parquet', engine='pyarrow', compression='snappy')
X_test.to_parquet('test.parquet', engine='pyarrow', compression='snappy')

print(f'Save parquet process took {datetime.now() - start} seconds.')

Save parquet process took 0:00:58.220595 seconds.


In [24]:
X_test = None

In [25]:
import gc

gc.collect()

40

In [26]:
print(f'Finished at {datetime.now()}')

Finished at 2020-12-30 17:52:40.703314


## Restore saved features ready for training

In [None]:
# Reload the imputed features that the save cell wrote as 'train.parquet' and
# 'validation.parquet'. This notebook never writes 'train.csv' / 'validation.csv',
# and parquet also preserves the float32 dtypes.
X_train = pd.read_parquet('train.parquet')
X_validation = pd.read_parquet('validation.parquet')

In [None]:
y_train = pd.read_parquet('../data/raw/train_labels.parquet').values.flatten()
y_validation = pd.read_parquet('../data/raw/val_labels.parquet').values.flatten()

In [None]:
y_train

## Retrain sherlock
The model can be retrained using the code below. The model is currently restricted to training on 78 classes; the model architecture code that would allow adjusting this will be added soon.

In [None]:
print(f'Started at {datetime.now()}')

train_sherlock(X_train, y_train, X_validation, y_validation, nn_id='retrained_sherlock3');

print('Trained and saved new model.')
print(f'Finished at {datetime.now()}')


## Generate predictions with a model
If you want to use the pretrained Sherlock model, set `nn_id` to "sherlock".

If you want to use another model, you can use the identifier corresponding to that model.

**Note**: There is a bug somewhere in the refactored code which affects the model predictions; this should be fixed soon.

In [None]:
# Reload the imputed test features that the save cell wrote as 'test.parquet'.
# This notebook never writes 'test.csv'.
X_test = pd.read_parquet('test.parquet')

In [None]:
y_test = pd.read_parquet('../data/raw/test_labels.parquet').values.flatten()

In [None]:
predicted_labels = predict_sherlock(X_test, nn_id='retrained_sherlock3')

In [None]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

In [None]:
size=len(y_test)

# Should be fully deterministic too.
f1_score(y_test[:size], predicted_labels[:size], average="weighted")

In [None]:
pd.Series(predicted_labels).nunique()

In [None]:
pd.Series(y_test).nunique()

In [None]:
predicted_labels[:25]

In [None]:
y_test[:25]

In [None]:
set(y_test)

In [None]:
from collections import Counter

# Collect the true label of every test sample whose prediction differs from it.
size = len(y_test)
mismatches = [
    expected
    for position, expected in enumerate(y_test[:size])
    if expected != predicted_labels[position]
]

f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Tally the mismatches per true label; most_common() lists every label with its count.
data = Counter(mismatches)
data.most_common()

In [None]:
test_samples = pd.read_parquet('../data/raw/test_values.parquet')

In [None]:
# Inspect one test sample by row position: parse its string-encoded values with
# literal_eval and print them next to the predicted and the actual label.
idx = 123758  # NOTE(review): hand-picked row position, presumably a known mismatch - confirm
converted = test_samples.iloc[idx].apply(literal_eval).to_list()

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

## Generate predictions with preprocessed data using Sherlock

Requires the data to be downloaded from Google Drive (see first step in notebook).

In [None]:
X_test_preprocessed = pd.read_parquet("../data/data/processed/X_test.parquet")
y_test_preprocessed = pd.read_parquet("../data/data/processed/y_test.parquet").reset_index(drop=True)

In [None]:
X_test_preprocessed.head()

In [None]:
#X_test_preprocessed.to_csv('test_preprocessed.csv')

In [None]:
X_test.head()

In [None]:
y_test_preprocessed.head()

In [None]:
predicted_labels = predict_sherlock(X_test_preprocessed.head(n=25), 'sherlock')

In [None]:
f1_score(y_test_preprocessed.head(n=25), predicted_labels, average='weighted')

In [None]:
predicted_labels[:10]

In [None]:
y_test_preprocessed.head(n=10)

In [None]:
# Compare the column names of the downloaded preprocessed features with those of
# the locally extracted features, position by position.
first_keys = X_test_preprocessed.columns
first_keys_str = ','.join(first_keys)

keys = ','.join(X_test.columns)
if first_keys_str == keys:
    print('Keys are equal')
else:
    key_list = list(X_test.columns)

    print(f'keys are NOT equal. k1 len={len(first_keys)}, k2 len={len(key_list)}')

    # The two frames may have different numbers of columns. Walk the longer length
    # and show a placeholder where one side has run out, so a shorter X_test cannot
    # raise IndexError and extra X_test columns are reported too.
    missing = '<missing>'
    reference_keys = list(first_keys)

    for idx in range(max(len(reference_keys), len(key_list))):
        k1 = reference_keys[idx] if idx < len(reference_keys) else missing
        k2 = key_list[idx] if idx < len(key_list) else missing

        if k1 == k2:
            print(f'{k1} == {k2}')
        else:
            print(f'{k1} != {k2}')