In [7]:
# Load IPython's autoreload extension; mode 2 re-imports edited modules before each cell runs,
# so changes to imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# If you need fully deterministic results between runs, set the following environment value prior to launching jupyter.
# See comment in sherlock.features.paragraph_vectors.infer_paragraph_embeddings_features for more info.
# Written without "=value", %env only displays the variable's current setting; it does not set it.
%env PYTHONHASHSEED

'13'

In [9]:
from datetime import datetime

# Record the wall-clock start of the run; later cells print their own timestamps for comparison.
print(f'Started at {datetime.now()}.')

Started at 2022-02-04 11:22:49.967026.


# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data, 
retrain Sherlock and generate predictions.

In [10]:
import sys

import numpy as np
import pandas as pd

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download roughly 800 MB of data into the `sherlock/features/` directory.

In [11]:
# Fetch the raw and preprocessed data first, then the embedding files that feature extraction relies on.
helpers.download_data()
prepare_feature_extraction()

Downloading the raw and preprocessed data into ../data/data.zip.
Data was downloaded.
Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


In [12]:
import os

# Fail fast: the retrained paragraph-vector model is produced by the separate "train_doc2vec"
# notebook and must already exist on disk before the rest of this notebook is run.
if not os.path.exists('../sherlock/features/par_vec_retrained_400.pkl'):
    raise SystemExit('Retrained paragraph vectors do not exist, please run "train_doc2vec" notebook before continuing')

## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [13]:
# Switch for the (slow) memory-usage reporting named in the heading above.
# NOTE(review): no cell visible in this part of the notebook reads this flag — confirm it is still needed.
report_memory = False

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [14]:
import time

# One timestamp is shared by all three output files so the files of a single run are easy to match up.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Features will be output to the following files
X_test_filename_csv, X_train_filename_csv, X_validation_filename_csv = (
    f'../data/work/test_{timestr}.csv',
    f'../data/work/train_{timestr}.csv',
    f'../data/work/validation_{timestr}.csv',
)

### PREPARATION

In [15]:
# ensure embedding initialisation is outside of timing for extract_features
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import prepare_feature_extraction

# prepare_feature_extraction was already imported and called in earlier cells; it is repeated here,
# presumably so this cell can be run on its own (its output only reports that the files are present).
prepare_feature_extraction()
initialise_word_embeddings()
# 400 is the paragraph-vector dimension ("400 dim" in the output below).
initialise_pretrained_model(400)
initialise_nltk()

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:06.319829 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:03.591479 seconds. (filename = ../sherlock/features/par_vec_retrained_400.pkl)
Initialised NLTK, process took 0:00:00.195532 seconds.


[nltk_data] Downloading package punkt to /Users/lowecg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lowecg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
from pyarrow.parquet import ParquetFile

def load_parquet_values(path):
    """Return the ``values`` column from the first row group of a parquet file.

    Only row group 0 is read; any further row groups in the file are ignored.

    Args:
        path: Location of the parquet file, passed to ``ParquetFile`` as ``source``.

    Returns:
        The ``values`` column of that row group, as produced by pyarrow.
    """
    first_row_group = ParquetFile(source=path).read_row_group(0)
    return first_row_group['values']

In [17]:
# Load the line_profiler IPython extension (provides %lprun); none of the cells shown here invoke it.
%load_ext line_profiler

In [18]:
import multiprocessing as mp

# Python 3.8 changed the default process start method to 'spawn' (on macOS), which causes
# "name not defined" type errors in worker processes.
# https://bugs.python.org/issue39931
# Workaround: force the 'fork' start method (this only needs to be called once).
# The call is currently disabled; uncomment it if those errors appear.
#mp.set_start_method('fork', force=True)

## EXTRACT FEATURES TO CSV

### TEST SET

In [19]:
from sherlock.functional import extract_features_to_csv

# Load the raw test-set values from parquet.
values = load_parquet_values("../data/raw/test_values.parquet")

# Extract the features and write them to the timestamped test CSV.
extract_features_to_csv(X_test_filename_csv, values)

# Drop the reference so the raw values can be garbage-collected before the next split is loaded.
values = None

Starting ../data/work/test_20220204-112250.csv at 2022-02-04 11:23:01.349174. Rows=137353, using 8 CPU cores
Exporting 1588 column features
Finished. Processed 137353 rows in 0:04:48.065071, key_count=8


In [20]:
# Timestamp marking the end of the test-set feature extraction.
print(f'Finished at {datetime.now()}')

Finished at 2022-02-04 11:27:49.507300


### TRAIN SET

In [21]:
# Load the raw training-set values from parquet.
values = load_parquet_values("../data/raw/train_values.parquet")

# Extract the features and write them to the timestamped train CSV.
extract_features_to_csv(X_train_filename_csv, values)

# Drop the reference so the raw values can be garbage-collected before the next split is loaded.
values = None

Starting ../data/work/train_20220204-112250.csv at 2022-02-04 11:27:51.300174. Rows=412059, using 8 CPU cores
Exporting 1588 column features
Finished. Processed 412059 rows in 0:14:15.962255, key_count=8


In [22]:
# Timestamp marking the end of the training-set feature extraction.
print(f'Finished at {datetime.now()}')

Finished at 2022-02-04 11:42:07.393216


### VALIDATION SET

In [23]:
# Load the raw validation-set values from parquet.
values = load_parquet_values("../data/raw/val_values.parquet")

# Extract the features and write them to the timestamped validation CSV.
extract_features_to_csv(X_validation_filename_csv, values)

# Drop the reference so the raw values can be garbage-collected.
values = None

Starting ../data/work/validation_20220204-112250.csv at 2022-02-04 11:42:08.220458. Rows=137353, using 8 CPU cores
Exporting 1588 column features
Finished. Processed 137353 rows in 0:04:45.193637, key_count=8


In [24]:
# Timestamp marking the end of the validation-set feature extraction.
print(f'Finished at {datetime.now()}')

Finished at 2022-02-04 11:46:53.498882


### Read Locally Processed Features

In [25]:
# Time how long it takes to load the test features back from CSV.
start = datetime.now()

# Read the CSV written by the test-set extraction, parsing every column as float32.
X_test = pd.read_csv(X_test_filename_csv, dtype=np.float32)

print(f'Load Features (test) process took {datetime.now() - start} seconds.')

Load Features (test) process took 0:00:33.212736 seconds.


In [26]:
# Preview the first rows of the test features as a quick sanity check.
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.143217,-0.009815,-0.046129,0.238079,-0.119581,-0.031083,-0.028402,-0.071125,0.111424,-0.005298
1,1.0,0.0,0.368421,0.33795,0.0,2.0,0.0,7.0,0.742677,1.326868,...,-0.10412,0.076913,-0.164679,0.04999,-0.068911,-0.12456,0.010527,-0.031023,0.165831,-0.258051
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.049593,0.195894,-0.174179,-0.018907,-0.092391,0.055675,-0.049304,-0.088299,-0.041625,-0.128652
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.138319,9.8e-05,-0.147726,6.8e-05,-0.159474,-0.068264,-0.07924,-0.007175,-0.029696,0.086017
4,1.0,0.0,0.02008,0.035741,0.0,2.0,0.0,5.0,96.521561,9.784149,...,0.380081,0.068306,-0.51782,0.771269,-0.179867,-0.630215,-0.223715,-0.721744,-0.538348,-0.066933


In [27]:
# Time how long it takes to load the training features back from CSV.
start = datetime.now()

# Read the CSV written by the training-set extraction, parsing every column as float32.
X_train = pd.read_csv(X_train_filename_csv, dtype=np.float32)

print(f'Load Features (train) process took {datetime.now() - start} seconds.')

Load Features (train) process took 0:01:41.275746 seconds.


In [28]:
# Preview the first rows of the training features as a quick sanity check.
X_train.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.028492,0.088204,-0.060888,0.026088,-0.070441,0.014158,0.020463,-0.035948,-0.004609,-0.105932
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.131441,0.03762,-0.096918,0.161002,-0.105732,0.026181,-0.092335,-0.042593,0.155727,-0.06688
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.285964,0.068456,-0.188468,0.199399,0.072216,-0.351159,-0.069735,-0.192109,0.324884,-0.345518
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.16066,0.032871,-0.070633,0.04613,0.058185,0.090536,-0.179968,-0.063661,0.171529,-0.109546
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.10543,-0.105382,-0.194499,0.32239,-0.179725,-0.109367,-0.178109,0.130783,0.126567,-0.312613


In [29]:
# Time how long it takes to load the validation features back from CSV.
start = datetime.now()

# Read the CSV written by the validation-set extraction, parsing every column as float32.
X_validation = pd.read_csv(X_validation_filename_csv, dtype=np.float32)

print(f'Load Features (validation) process took {datetime.now() - start} seconds.')

Load Features (validation) process took 0:00:32.738429 seconds.


In [30]:
# Preview the first rows of the validation features as a quick sanity check.
X_validation.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.104845,0.368129,-0.323955,0.319534,-0.19903,-0.059769,-0.118122,-0.18837,0.431995,-0.114784
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.240649,0.021433,-0.140748,0.210096,-0.04915,-0.106274,0.024451,-0.021593,0.281272,-0.148775
2,1.0,0.0,0.25,0.1875,0.0,1.0,0.0,10.0,-0.666667,1.154701,...,-0.241652,0.17535,0.236145,-0.095223,0.026109,-0.037214,0.115044,0.127849,-0.004681,-0.059551
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.063456,0.043425,-0.238892,0.075612,-0.100516,0.027665,0.00064,-0.025805,0.094761,-0.023998
4,1.0,0.0,0.09434,0.311855,0.0,4.0,0.0,5.0,42.275021,6.530585,...,-0.230932,0.270783,0.256644,-0.070366,-0.193472,0.116177,0.265872,0.069686,-0.077824,-0.673256


## Impute NaN values with feature means

In [31]:
start = datetime.now()

# Per-column means of the TRAINING features only, so the same fill values can be applied
# to every split. pandas' mean() skips NaN entries by default.
# The result is kept as a one-row DataFrame because the imputation cell reads row 0 via .iloc[0].
train_columns_means = pd.DataFrame(X_train.mean()).transpose()

# The timed work is the column-mean computation (the transpose itself is negligible),
# so the label names that step.
print(f'Compute feature means process took {datetime.now() - start} seconds.')

Transpose process took 0:00:20.809540 seconds.


In [32]:
start = datetime.now()

# Row 0 of the means frame is a Series keyed by column name, so every NaN is replaced
# with the training-set mean of its own column. All three splits use the same training means.
for features in (X_train, X_validation, X_test):
    features.fillna(train_columns_means.iloc[0], inplace=True)

# Release the means frame now that imputation is finished.
train_columns_means=None

print(f'FillNA process took {datetime.now() - start} seconds.')

FillNA process took 0:00:02.438055 seconds.


In [33]:
start = datetime.now()

# Persist each imputed split to a snappy-compressed parquet file in the working directory.
for features, parquet_path in (
    (X_train, 'train.parquet'),
    (X_validation, 'validation.parquet'),
    (X_test, 'test.parquet'),
):
    features.to_parquet(parquet_path, engine='pyarrow', compression='snappy')

print(f'Save parquet process took {datetime.now() - start} seconds.')

Save parquet process took 0:00:58.805568 seconds.


In [34]:
# Timestamp marking the end of the whole notebook run.
print(f'Completed at {datetime.now()}.')

Completed at 2022-02-04 11:51:03.415880.
