In [1]:
# Pick up edits to project modules automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data, 
retrain Sherlock and generate predictions.

In [2]:
import sys

import numpy as np
import pandas as pd

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction

from datetime import datetime

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download roughly 800 MB of data into the `features/` directory.

In [3]:
# Fetch the raw values, preprocessed files and labels (about 3.6GB, into data/).
helpers.download_data()
# Fetch the supporting files for word and paragraph embeddings (about 800 MB, into features/).
prepare_feature_extraction()

Downloading the raw and preprocessed data into ../data/data.zip.
Data was downloaded.
Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [4]:
# Switch for the (slow) memory-usage reporting; no cell in the visible part of this notebook reads it.
report_memory = False

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [5]:
import time

# One timestamp is taken per run and shared by all three output files, so the
# train/validation/test CSVs produced by the same run carry the same suffix.
timestr = time.strftime("%Y%m%d-%H%M%S")

X_train_filename_csv, X_validation_filename_csv, X_test_filename_csv = (
    f'../data/work/train_{timestr}.csv',
    f'../data/work/validation_{timestr}.csv',
    f'../data/work/test_{timestr}.csv',
)

### PREPARATION

In [6]:
# Initialise the embedding resources up front so that their start-up cost is
# outside of the timing reported for extract_features.
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
# NOTE(review): prepare_feature_extraction is already imported in the notebook's import cell
# and already called in the download cell; presumably repeated here so this cell also
# works when the download cell is skipped - confirm.
from sherlock.features.preprocessing import prepare_feature_extraction

prepare_feature_extraction()
initialise_word_embeddings()
# 400 is the dimension of the paragraph-vector (Doc2Vec) model, as echoed in the cell output.
initialise_pretrained_model(400)
initialise_nltk()

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:06.450996 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:03.576208 seconds. (filename = ../sherlock/features/par_vec_retrained_400.pkl)
Initialised NLTK, process took 0:00:00.143610 seconds.


[nltk_data] Downloading package punkt to /Users/lowecg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lowecg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from pyarrow.parquet import ParquetFile

def load_parquet_values(path):
    """Load the complete ``values`` column from a Parquet file.

    All row groups in the file are read. Reading only row group 0 would
    silently drop the remaining rows of any file written with more than
    one row group.

    Parameters
    ----------
    path : str or path-like
        Location of the Parquet file to read.

    Returns
    -------
    The ``values`` column of the table read from ``path``.
    """
    parquet_file = ParquetFile(source=path)

    # read() spans every row group; limiting it to the single column that is
    # returned avoids loading columns that would be discarded anyway.
    table = parquet_file.read(columns=['values'])

    return table['values']

In [8]:
# Loads the line_profiler IPython extension; none of the visible cells invoke its magics.
%load_ext line_profiler

In [9]:
import multiprocessing as mp

# Python 3.8 changed the default process start method on macOS to 'spawn', which causes
# "name not defined" type errors here. Force 'fork' mode for now (this only needs to be
# called once; force=True permits overriding a start method that has already been set).
# NOTE(review): 'fork' is only available on POSIX systems - confirm this notebook is not
# expected to run on Windows.
# https://bugs.python.org/issue39931
mp.set_start_method('fork', force=True)

## EXTRACT FEATURES TO CSV

### TEST SET

In [10]:
from sherlock.functional import extract_features_to_csv

# Load the raw test values and write their extracted features to the timestamped test CSV.
values = load_parquet_values("../data/raw/test_values.parquet")

extract_features_to_csv(X_test_filename_csv, values)

# Drop the reference so the raw values are no longer kept alive by this name.
values = None

Starting ../data/work/test_20211220-170337.csv at 2021-12-20 17:03:48.795801. Rows=137353
Exporting 1578 column features
Finished. Processed 137353 rows in 0:04:44.983828, key_count=8


In [12]:
# Log the wall-clock time at which this point of the run was reached.
print(f'Finished at {datetime.now()}')

Finished at 2021-12-20 17:08:33.941412


### TRAIN SET

In [13]:
# Load the raw training values and write their extracted features to the timestamped train CSV.
values = load_parquet_values("../data/raw/train_values.parquet")

extract_features_to_csv(X_train_filename_csv, values)

# Drop the reference so the raw values are no longer kept alive by this name.
values = None

Starting ../data/work/train_20211220-170337.csv at 2021-12-20 17:08:36.978012. Rows=412059
Exporting 1578 column features
Finished. Processed 412059 rows in 0:15:44.823902, key_count=8


In [14]:
# Log the wall-clock time at which this point of the run was reached.
print(f'Finished at {datetime.now()}')

Finished at 2021-12-20 17:24:22.344563


### VALIDATION SET

In [15]:
# Load the raw validation values and write their extracted features to the timestamped validation CSV.
values = load_parquet_values("../data/raw/val_values.parquet")

extract_features_to_csv(X_validation_filename_csv, values)

# Drop the reference so the raw values are no longer kept alive by this name.
values = None

Starting ../data/work/validation_20211220-170337.csv at 2021-12-20 17:24:23.327323. Rows=137353
Exporting 1578 column features
Finished. Processed 137353 rows in 0:04:51.049531, key_count=8


In [16]:
# Log the wall-clock time at which this point of the run was reached.
print(f'Finished at {datetime.now()}')

Finished at 2021-12-20 17:29:14.523994


### Read Locally Processed Features

In [17]:
# Time how long it takes to read the extracted test features back from CSV.
start = datetime.now()

# Every column is parsed as float32.
X_test = pd.read_csv(X_test_filename_csv, dtype=np.float32)

# The reported figure is a timedelta (H:MM:SS.ffffff), despite the "seconds" wording.
print(f'Load Features (test) process took {datetime.now() - start} seconds.')

Load Features (test) process took 0:00:28.250045 seconds.


In [18]:
# Preview the first five rows of the loaded test features.
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.020721,-0.005287,-0.090256,-0.006676,0.02635,0.127593,0.044672,0.04689,0.283769,-0.172169
1,1.0,0.0,0.368421,0.33795,0.0,2.0,0.0,7.0,0.742677,1.326868,...,-0.057171,0.12043,0.054928,-0.001405,0.129986,0.040218,0.051391,-0.096935,0.382584,-0.084803
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.099364,-0.018977,0.095784,-0.115252,0.023738,0.03544,-0.057802,0.138073,0.283577,-0.016174
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.133863,-0.010696,0.106732,0.035569,0.075957,-0.059119,0.028897,0.02728,0.098025,-0.06459
4,1.0,0.0,0.02008,0.035741,0.0,2.0,0.0,5.0,96.521561,9.784149,...,-0.334377,-0.091279,-0.375093,-0.848622,-0.562868,0.552307,0.343993,0.099884,0.830301,0.208447


In [19]:
# Time how long it takes to read the extracted training features back from CSV.
start = datetime.now()

# Every column is parsed as float32.
X_train = pd.read_csv(X_train_filename_csv, dtype=np.float32)

# The reported figure is a timedelta (H:MM:SS.ffffff), despite the "seconds" wording.
print(f'Load Features (train) process took {datetime.now() - start} seconds.')

Load Features (train) process took 0:01:40.898461 seconds.


In [20]:
# Preview the first five rows of the loaded training features.
X_train.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.008905,-0.06324,0.010034,0.010097,-0.044576,-0.002031,0.080549,-0.029606,0.133408,-0.095668
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.099759,0.081206,-0.067417,0.128702,0.109926,-0.032693,0.084184,-0.007141,0.345851,-0.074017
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.025684,0.215332,-0.166567,0.037947,0.494069,0.222317,0.408799,-0.433496,0.204239,-0.338079
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.036678,-0.047749,0.150105,0.057095,-0.141576,-0.052409,0.106889,-0.078422,0.430574,-0.154241
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.228273,0.103057,0.137383,-0.165722,0.191401,0.090281,0.030404,-0.044517,0.427801,-0.063056


In [21]:
# Time how long it takes to read the extracted validation features back from CSV.
start = datetime.now()

# Every column is parsed as float32.
X_validation = pd.read_csv(X_validation_filename_csv, dtype=np.float32)

# The reported figure is a timedelta (H:MM:SS.ffffff), despite the "seconds" wording.
print(f'Load Features (validation) process took {datetime.now() - start} seconds.')

Load Features (validation) process took 0:00:34.900212 seconds.


In [22]:
# Preview the first five rows of the loaded validation features.
X_validation.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.310341,0.217952,0.297807,-0.056163,-0.038965,0.395672,0.329095,-0.214337,0.359123,-0.046176
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.11555,0.338066,0.063486,0.01277,-0.000853,0.089392,0.316622,-0.254519,0.422006,-0.266197
2,1.0,0.0,0.25,0.1875,0.0,1.0,0.0,10.0,-0.666667,1.154701,...,0.169015,0.080453,-0.369928,-0.070172,0.031067,0.081292,0.00461,0.032759,0.053653,0.037508
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.026083,0.089946,-0.045484,-0.047944,0.044044,0.087058,0.116548,0.064247,0.18855,0.058881
4,1.0,0.0,0.09434,0.311855,0.0,4.0,0.0,5.0,42.275021,6.530585,...,0.035291,0.682512,0.060669,0.177247,-0.000958,-0.162122,0.561552,-0.234432,0.69089,-0.148726


## Impute NaN values with feature means

In [23]:
start = datetime.now()

# Per-column means of the training features, kept as a one-row frame so the
# imputation step can take them out again with .iloc[0]. The timing printed
# below covers the mean computation as well as the transpose.
train_columns_means = X_train.mean().to_frame().T

print(f'Transpose process took {datetime.now() - start} seconds.')

Transpose process took 0:00:10.492384 seconds.


In [24]:
start = datetime.now()

# The training-set column means are the fill values for every split, so the
# validation and test sets are imputed with training statistics as well.
fill_values = train_columns_means.iloc[0]

X_train.fillna(fill_values, inplace=True)
X_validation.fillna(fill_values, inplace=True)
X_test.fillna(fill_values, inplace=True)

# Drop the reference to the means frame now that imputation is complete.
train_columns_means = None

print(f'FillNA process took {datetime.now() - start} seconds.')

FillNA process took 0:00:08.983115 seconds.


In [25]:
start = datetime.now()

# All three splits are written with the same writer settings, into the
# current working directory.
parquet_options = dict(engine='pyarrow', compression='snappy')

X_train.to_parquet('train.parquet', **parquet_options)
X_validation.to_parquet('validation.parquet', **parquet_options)
X_test.to_parquet('test.parquet', **parquet_options)

print(f'Save parquet process took {datetime.now() - start} seconds.')

Save parquet process took 0:01:02.403100 seconds.
