In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the sherlock package are picked up.
%load_ext autoreload
%autoreload 2

# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data, 
retrain Sherlock and generate predictions.

In [None]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf
from ast import literal_eval

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

from pympler import muppy, summary
from datetime import datetime

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download approximately 800 MB of data into the `features/` directory.

In [None]:
# Fetch the dataset files first, then the supporting files used by feature extraction.
helpers.download_data()
prepare_feature_extraction()

## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [None]:
# Toggle for the pympler memory summaries guarded by `if report_memory:` in later cells.
report_memory = False

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [None]:
import time

# Timestamp of this run (local time); it is embedded in every output file name below.
timestr = time.strftime("%Y%m%d-%H%M%S", time.localtime())

# One CSV path per data split for the extracted features.
X_test_filename, X_train_filename, X_validation_filename = (
    f'../data/work/test_{timestr}.csv',
    f'../data/work/train_{timestr}.csv',
    f'../data/work/validation_{timestr}.csv',
)

### TEST SET

In [None]:
if report_memory:
    # Print a summary of the large objects currently alive.
    # The snapshot is consumed in a single expression on purpose: binding the list
    # returned by muppy.get_objects() to a notebook-global name would keep a
    # reference to every tracked object, so frames released in later cells could
    # not actually be freed and subsequent reports would be inflated.
    summary.print_(summary.summarize(muppy.get_objects()))

In [None]:
# Load the raw test values and their labels from the downloaded parquet files.
test_samples = pd.read_parquet('../data/raw/test_values.parquet')
test_labels = pd.read_parquet('../data/raw/test_labels.parquet')

In [None]:
# Convert the string-encoded lists into real lists; "values" and "type" are the
# column names passed for the samples and the labels respectively.
test_samples_converted, y_test = convert_string_lists_to_lists(test_samples, test_labels, "values", "type")

In [None]:
# free memory: drop the notebook's references to the raw test frames
test_samples = test_labels = None

In [None]:
# Preview the first rows of the converted test samples.
test_samples_converted.head()

In [None]:
# Show the first five test labels (the equivalent of .head() for y_test).
y_test[:5]

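### Given that feature extraction can take a long time, we keep the labels of only the first 100 samples for the quick evaluation further below (the extraction cell itself processes the full set).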

In [None]:
# Labels of the first 100 test samples; the evaluation cells further down read this.
y_test_subset = y_test[:100]

In [None]:
# ensure embedding initialisation is outside of timing for extract_features
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model

# Run both initialisers now, before the timed extract_features call in a later cell.
initialise_word_embeddings()
# NOTE(review): 400 is presumably the paragraph-vector dimensionality -- confirm
# against initialise_pretrained_model's signature.
initialise_pretrained_model(400)

In [None]:
if report_memory:
    # Print a summary of the large objects currently alive.
    # The snapshot is consumed in a single expression on purpose: binding the list
    # returned by muppy.get_objects() to a notebook-global name would keep a
    # reference to every tracked object, so frames released in later cells could
    # not actually be freed and subsequent reports would be inflated.
    summary.print_(summary.summarize(muppy.get_objects()))

In [None]:
start = datetime.now()

# Features for the complete converted test set are written to X_test_filename.
extract_features(X_test_filename, test_samples_converted)

# `elapsed` is a timedelta, so it renders as H:MM:SS.ffffff ahead of the "seconds" label.
elapsed = datetime.now() - start
print(f'Extract Features (test) process took {elapsed} seconds.')

In [None]:
# Drop the notebook's reference to the converted test samples once extraction has run.
test_samples_converted = None

In [None]:
import gc

# Force a garbage-collection pass; the cell output is gc.collect()'s return value.
gc.collect()

In [None]:
if report_memory:
    # Print a summary of the large objects currently alive.
    # The snapshot is consumed in a single expression on purpose: binding the list
    # returned by muppy.get_objects() to a notebook-global name would keep a
    # reference to every tracked object, so the frames released in earlier cells
    # could not actually be freed and this report would be inflated.
    summary.print_(summary.summarize(muppy.get_objects()))

In [None]:
# Overall, without memory management
# Extract Features (test) process took 3:40:25.799880 seconds.

In [None]:
# Baseline
# Extract Features (test) process took 0:11:04.137081 seconds.

# Iterations
# Extract Features (test) process took 0:00:56.671353 seconds. (cache word embeddings)
# Extract Features (test) process took 0:00:13.523261 seconds. (cache Doc2Vec)

### TRAIN SET

In [None]:
# Load the raw training values and their labels from the downloaded parquet files.
train_samples = pd.read_parquet('../data/raw/train_values.parquet')
train_labels = pd.read_parquet('../data/raw/train_labels.parquet')

In [None]:
# Convert the string-encoded lists into real lists; "values" and "type" are the
# column names passed for the samples and the labels respectively.
train_samples_converted, y_train = convert_string_lists_to_lists(train_samples, train_labels, "values", "type")

In [None]:
# free memory: drop the notebook's references to the raw training frames
train_samples = train_labels = None

In [None]:
# Labels of the first 100 training samples.
y_train_subset = y_train[:100]

In [None]:
start = datetime.now()

# Features for the complete converted training set are written to X_train_filename.
extract_features(X_train_filename, train_samples_converted)

# `elapsed` is a timedelta, so it renders as H:MM:SS.ffffff ahead of the "seconds" label.
elapsed = datetime.now() - start
print(f'Extract Features (train) process took {elapsed} seconds.')

In [None]:
# Drop the notebook's reference to the converted training samples once extraction has run.
train_samples_converted = None

### VALIDATION SET

In [None]:
# Load the raw validation values and their labels from the downloaded parquet files.
validation_samples = pd.read_parquet('../data/raw/val_values.parquet')
validation_labels = pd.read_parquet('../data/raw/val_labels.parquet')

In [None]:
# Convert the string-encoded lists into real lists; "values" and "type" are the
# column names passed for the samples and the labels respectively.
validation_samples_converted, y_validation = convert_string_lists_to_lists(validation_samples, validation_labels, "values", "type")

In [None]:
# free memory: drop the notebook's references to the raw validation frames
validation_samples = validation_labels = None

In [None]:
# Labels of the first 100 validation samples.
y_validation_subset = y_validation[:100]

In [None]:
start = datetime.now()

# Features for the complete converted validation set are written to X_validation_filename.
extract_features(X_validation_filename, validation_samples_converted)

# `elapsed` is a timedelta, so it renders as H:MM:SS.ffffff ahead of the "seconds" label.
elapsed = datetime.now() - start
print(f'Extract Features (validation) process took {elapsed} seconds.')

In [None]:
# Drop the notebook's reference to the converted validation samples once extraction has run.
validation_samples_converted = None

### Read Locally Processed Features

In [None]:
# Timestamp of an earlier extraction run whose CSV files should be loaded.
# NOTE(review): this replaces the timestamp generated in the "Extract features"
# section, so files written by the extraction cells in this session are NOT the
# ones loaded below unless this value is updated to match.
timestr = '20201221-084342'

X_test_filename, X_train_filename, X_validation_filename = (
    f'../data/work/test_{timestr}.csv',
    f'../data/work/train_{timestr}.csv',
    f'../data/work/validation_{timestr}.csv',
)

In [None]:
start = datetime.now()

# Read the extracted test features back from the CSV file.
X_test = pd.read_csv(X_test_filename)

# `elapsed` is a timedelta, so it renders as H:MM:SS.ffffff ahead of the "seconds" label.
elapsed = datetime.now() - start
print(f'Load Features (test) process took {elapsed} seconds.')

In [None]:
# Preview the first rows of the loaded test features.
X_test.head()

In [None]:
start = datetime.now()

# Read the extracted training features back from the CSV file.
X_train = pd.read_csv(X_train_filename)

# `elapsed` is a timedelta, so it renders as H:MM:SS.ffffff ahead of the "seconds" label.
elapsed = datetime.now() - start
print(f'Load Features (train) process took {elapsed} seconds.')

In [None]:
# Preview the first rows of the loaded training features.
X_train.head()

In [None]:
start = datetime.now()

# Read the extracted validation features back from the CSV file.
X_validation = pd.read_csv(X_validation_filename)

# `elapsed` is a timedelta, so it renders as H:MM:SS.ffffff ahead of the "seconds" label.
elapsed = datetime.now() - start
print(f'Load Features (validation) process took {elapsed} seconds.')

In [None]:
# Preview the first rows of the loaded validation features.
X_validation.head()

## Impute NaN values with feature means

In [None]:
start = datetime.now()

# Single-row frame (row label 0) holding the mean of every training column;
# the imputation cell reads that row back with .iloc[0].
train_columns_means = X_train.mean().to_frame().T

elapsed = datetime.now() - start
print(f'Transpose process took {elapsed} seconds.')

In [None]:
start = datetime.now()

# Every split is imputed in place with the *training* column means, so the
# validation and test sets are filled with the same values as the training set.
for feature_frame in (X_train, X_validation, X_test):
    feature_frame.fillna(train_columns_means.iloc[0], inplace=True)

# The means are no longer needed once all three frames are filled.
train_columns_means = None

elapsed = datetime.now() - start
print(f'FillNA process took {elapsed} seconds.')

In [None]:
# Write each imputed frame to a CSV file in the current working directory
# (to_csv defaults apply, so the row index is written as the first column).
for imputed_frame, csv_name in (
    (X_train, 'train.csv'),
    (X_validation, 'validation.csv'),
    (X_test, 'test.csv'),
):
    imputed_frame.to_csv(csv_name)

## Reload labels for training and evaluation

In [None]:
# Reload the label arrays directly from the raw parquet files so that training and
# evaluation also work when the feature-extraction section was skipped.
y_train = pd.read_parquet('../data/raw/train_labels.parquet').values.flatten()
y_validation = pd.read_parquet('../data/raw/val_labels.parquet').values.flatten()

# The evaluation cells below read `y_test_subset`; without reloading the test labels
# here it only exists if the (skippable) feature-extraction cells were run, and the
# F1 cell fails with a NameError otherwise.
y_test = pd.read_parquet('../data/raw/test_labels.parquet').values.flatten()
y_test_subset = y_test[:100]

## Retrain sherlock
The model can be retrained using the code below. The model is currently restricted to training on 78 classes; the code of the model architecture will soon be added so that this can be adjusted.

In [None]:
print('')

# Retrain on the imputed training features, with the validation split passed
# alongside; the model is stored under the id 'retrained_sherlock'.
train_sherlock(
    X_train,
    y_train,
    X_validation,
    y_validation,
    nn_id='retrained_sherlock',
)
print('Trained and saved new model.')

## Generate predictions with a model
If you want to use the pretrained Sherlock model, set `nn_id` to "sherlock".

If you want to use another model, you can use the identifier corresponding to that model.

**Note**: There is a bug somewhere in the refactored code which affects the model predictions; this should be fixed soon.

In [None]:
# Predict labels for the first 10 rows of the test features with the retrained model.
predicted_labels = predict_sherlock(X_test.head(n=10), nn_id='retrained_sherlock')

In [None]:
# Sanity check: how many predictions came back and which container type holds them.
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

In [None]:
# Weighted F1 of the first 10 predictions against the first 10 test labels.
# Should be fully deterministic too.
f1_score(y_test_subset[:10], predicted_labels[:10], average="weighted")

In [None]:
# Number of distinct labels among the predictions.
pd.Series(predicted_labels).nunique()

In [None]:
# Number of distinct labels in the 100-label test subset, for comparison.
pd.Series(y_test_subset).nunique()

In [None]:
# Display the predicted labels themselves.
predicted_labels

In [None]:
# Display the first 10 true labels next to the predictions shown in the previous cell.
y_test_subset[:10]

## Generate predictions with preprocessed data using Sherlock

Requires the data to be downloaded from Google Drive (see first step in notebook).

In [None]:
# Load the downloaded, already-preprocessed test features and labels.
# NOTE(review): the path contains "data/data" -- presumably where the download
# unpacks its archive; confirm against helpers.download_data().
X_test_preprocessed = pd.read_parquet("../data/data/processed/X_test.parquet")
# reset_index(drop=True) replaces the stored label index with a fresh 0..n-1 range.
y_test_preprocessed = pd.read_parquet("../data/data/processed/y_test.parquet").reset_index(drop=True)

In [None]:
# Preview the first rows of the preprocessed test features.
X_test_preprocessed.head()

In [None]:
#X_test_preprocessed.to_csv('test_preprocessed.csv')

In [None]:
# Preview the locally extracted test features for comparison with the preprocessed ones.
X_test.head()

In [None]:
# Preview the first rows of the preprocessed test labels.
y_test_preprocessed.head()

In [None]:
# Predict labels for the first 25 preprocessed rows with the model id 'sherlock'.
# This rebinds `predicted_labels`, replacing the retrained model's predictions.
predicted_labels = predict_sherlock(X_test_preprocessed.head(n=25), 'sherlock')

In [None]:
# Weighted F1 of the 25 predictions against the matching first 25 preprocessed labels.
f1_score(y_test_preprocessed.head(n=25), predicted_labels, average='weighted')

In [None]:
# Display the first 10 predictions.
predicted_labels[:10]

In [None]:
# Display the first 10 true labels of the preprocessed test set for comparison.
y_test_preprocessed.head(n=10)

In [None]:
# Compare the column names (and their order) of the downloaded preprocessed features
# with those of the locally extracted features.
first_keys = X_test_preprocessed.columns
first_keys_str = ','.join(first_keys)

keys = ','.join(X_test.columns)
if first_keys_str == keys:
    print('Keys are equal')
else:
    key_list = list(X_test.columns)

    print(f'keys are NOT equal. k1 len={len(first_keys)}, k2 len={len(key_list)}')

    # Walk the longer of the two column lists. Indexing key_list by the position in
    # first_keys alone raises IndexError when X_test has fewer columns, and hides
    # X_test's extra columns when it has more; a placeholder marks the missing side.
    for idx in range(max(len(first_keys), len(key_list))):
        k1 = first_keys[idx] if idx < len(first_keys) else '<missing>'
        k2 = key_list[idx] if idx < len(key_list) else '<missing>'

        if k1 == k2:
            print(f'{k1} == {k2}')
        else:
            print(f'{k1} != {k2}')