In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up live instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime

# Record the wall-clock time at which this run began.
print(f'Started at {datetime.now()}.')

Started at 2022-01-30 09:02:52.173110.


# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data,
retrain Sherlock and generate predictions.

In [3]:
import sys

import numpy as np
import pandas as pd

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction

## Download data
This will download the raw values and preprocessed files, the corresponding labels, and a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download roughly 800 MB of data into the `sherlock/features/` directory.

In [4]:
# Fetch the raw and preprocessed dataset archive.
helpers.download_data()
# Make sure the word- and paragraph-embedding files needed for feature extraction are present.
prepare_feature_extraction()

Downloading the raw and preprocessed data into ../data/data.zip.
Data was downloaded.
Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [5]:
# Toggle for memory-usage reporting; off by default (the section heading notes it can be slow).
# NOTE(review): not referenced by any cell visible in this part of the notebook — confirm it is still used.
report_memory = False

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [6]:
import os
import time

# One timestamp shared by all three outputs, so the files written by a single
# run are easy to match up and do not overwrite those of an earlier run.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Create the output directory up front. Nothing else in this notebook creates
# it, and a missing directory would presumably only surface later, when the
# extraction step tries to write its CSV.
os.makedirs('../data/work', exist_ok=True)

# Features will be output to the following files
X_test_filename_csv = f'../data/work/test_{timestr}.csv'
X_train_filename_csv = f'../data/work/train_{timestr}.csv'
X_validation_filename_csv = f'../data/work/validation_{timestr}.csv'

### PREPARATION

In [7]:
# ensure embedding initialisation is outside of timing for extract_features
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import prepare_feature_extraction

# Same call (and import) as in the download cell.
# NOTE(review): presumably repeated so this cell also works when the download cell is skipped — confirm.
prepare_feature_extraction()
initialise_word_embeddings()
# 400 is the dimensionality of the pretrained paragraph-vector (Doc2Vec) model to load.
initialise_pretrained_model(400)
initialise_nltk()

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:06.358760 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:01.051196 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)
Initialised NLTK, process took 0:00:00.147576 seconds.


[nltk_data] Downloading package punkt to /Users/lowecg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lowecg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from pyarrow.parquet import ParquetFile

def load_parquet_values(path):
    """Return the ``values`` column of the Parquet file at ``path``.

    All row groups of the file are read, so no rows are dropped when the file
    was written with more than one row group. Only the ``values`` column is
    loaded; other columns are skipped.

    Args:
        path: Location of the Parquet file, passed to ``ParquetFile`` as its
            ``source``.

    Returns:
        The ``values`` column of the table returned by ``ParquetFile.read``.
    """
    pf = ParquetFile(source=path)
    # read() covers every row group; read_row_group(0) would silently ignore
    # any rows stored after the first group.
    table = pf.read(columns=['values'])

    return table['values']

In [9]:
# Load the line_profiler IPython extension (the source of the %lprun magic).
# NOTE(review): no profiling magic is used in the cells visible here — confirm this is still needed.
%load_ext line_profiler

In [10]:
import multiprocessing as mp

# Since Python 3.8 the default process start method on macOS is 'spawn', which can cause
# "name not defined" type errors in worker processes. Forcing 'fork' mode is kept below,
# disabled, as a workaround (it would only need to be called once).
# https://bugs.python.org/issue39931
#mp.set_start_method('fork', force=True)

## EXTRACT FEATURES TO CSV

### TEST SET

In [11]:
from sherlock.functional import extract_features_to_csv

# Load the raw column values of the test split.
values = load_parquet_values("../data/raw/test_values.parquet")

# Extract the features and write them to the timestamped test CSV.
extract_features_to_csv(X_test_filename_csv, values)

# Drop the reference so the raw values can be reclaimed before the next split is loaded.
values = None

Starting ../data/work/test_20220130-090254.csv at 2022-01-30 09:03:02.899829. Rows=137353
Exporting 1578 column features
Finished. Processed 137353 rows in 0:03:54.590404, key_count=8


In [12]:
# Wall-clock time at which the test-set extraction finished.
print(f'Finished at {datetime.now()}')

Finished at 2022-01-30 09:06:57.583740


### TRAIN SET

In [13]:
# Load the raw column values of the training split.
values = load_parquet_values("../data/raw/train_values.parquet")

# Extract the features and write them to the timestamped training CSV.
extract_features_to_csv(X_train_filename_csv, values)

# Drop the reference so the raw values can be reclaimed before the next split is loaded.
values = None

Starting ../data/work/train_20220130-090254.csv at 2022-01-30 09:07:00.241181. Rows=412059
Exporting 1578 column features
Finished. Processed 412059 rows in 0:12:39.666735, key_count=8


In [14]:
# Wall-clock time at which the training-set extraction finished.
print(f'Finished at {datetime.now()}')

Finished at 2022-01-30 09:19:40.781901


### VALIDATION SET

In [15]:
# Load the raw column values of the validation split.
values = load_parquet_values("../data/raw/val_values.parquet")

# Extract the features and write them to the timestamped validation CSV.
extract_features_to_csv(X_validation_filename_csv, values)

# Drop the reference so the raw values can be reclaimed once extraction is done.
values = None

Starting ../data/work/validation_20220130-090254.csv at 2022-01-30 09:19:41.728664. Rows=137353, using 8 CPU cores
Initialising word embeddings
Initialising word embeddings
Initialising word embeddings
Initialising word embeddings
Initialising word embeddingsInitialising word embeddings

Initialising word embeddings
Initialising word embeddings
Initialise Word Embeddings process took 0:00:19.283013 seconds.Initialise Word Embeddings process took 0:00:19.290279 seconds.Initialise Word Embeddings process took 0:00:19.291230 seconds.Initialise Word Embeddings process took 0:00:19.268282 seconds.Initialise Word Embeddings process took 0:00:19.241705 seconds.Initialise Word Embeddings process took 0:00:19.269592 seconds.Initialise Word Embeddings process took 0:00:19.293087 seconds.Initialise Word Embeddings process took 0:00:19.255624 seconds.







Exporting 1578 column features
Finished. Processed 137353 rows in 0:04:26.826968, key_count=8


In [16]:
# Wall-clock time at which the validation-set extraction finished.
print(f'Finished at {datetime.now()}')

Finished at 2022-01-30 09:24:08.675100


### Read Locally Processed Features

In [17]:
start = datetime.now()

# Read the extracted test features back from CSV, forcing every column to float32.
X_test = pd.read_csv(X_test_filename_csv, dtype=np.float32)

# The elapsed time is a timedelta, so it prints as H:MM:SS.ffffff despite the "seconds" label.
print(f'Load Features (test) process took {datetime.now() - start} seconds.')

Load Features (test) process took 0:00:34.761827 seconds.


In [18]:
# Preview the first rows as a quick sanity check of the loaded test features.
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-8e-06,0.000972,-0.000163,-0.000907,0.000407,0.000227,-0.000434,0.000569,0.000734,0.001
1,1.0,0.0,0.368421,0.33795,0.0,2.0,0.0,7.0,0.742677,1.326868,...,-0.00109,-0.000332,0.000407,0.000102,-0.000114,0.000829,-0.001065,3.3e-05,0.001056,-0.001102
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.000669,0.001152,-0.000633,0.00041,-0.000489,0.000496,0.000961,-0.000419,-0.000261,0.000149
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.00069,0.000504,0.000237,-0.000593,0.000719,0.000886,-0.000106,-0.001007,0.000401,-5.1e-05
4,1.0,0.0,0.02008,0.035741,0.0,2.0,0.0,5.0,96.521561,9.784149,...,0.001186,-0.000316,-0.000943,0.000607,-0.001078,-0.000669,-0.000229,-0.000283,2e-06,0.001216


In [19]:
start = datetime.now()

# Read the extracted training features back from CSV, forcing every column to float32.
X_train = pd.read_csv(X_train_filename_csv, dtype=np.float32)

# The elapsed time is a timedelta, so it prints as H:MM:SS.ffffff despite the "seconds" label.
print(f'Load Features (train) process took {datetime.now() - start} seconds.')

Load Features (train) process took 0:01:46.130862 seconds.


In [20]:
# Preview the first rows as a quick sanity check of the loaded training features.
X_train.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.000632,-0.000901,-0.001165,-0.000413,0.000427,-0.000244,-0.00084,-0.000621,-0.000216,-0.000598
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.000979,0.000524,0.000165,0.001033,-0.000939,0.00069,-0.000223,0.000146,0.000846,-0.000964
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.001158,-0.000385,0.000819,-0.001139,-0.001242,-0.000333,0.000697,0.001104,0.001136,-0.000214
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.000184,0.000855,0.000822,-0.000864,3.1e-05,-0.000946,-0.001179,0.000989,-0.000107,-0.00034
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.000479,-0.001033,-0.000156,0.001054,-6.6e-05,0.000817,0.00099,0.000947,-0.000669,0.000318


In [21]:
start = datetime.now()

# Read the extracted validation features back from CSV, forcing every column to float32.
X_validation = pd.read_csv(X_validation_filename_csv, dtype=np.float32)

# The elapsed time is a timedelta, so it prints as H:MM:SS.ffffff despite the "seconds" label.
print(f'Load Features (validation) process took {datetime.now() - start} seconds.')

Load Features (validation) process took 0:00:39.465941 seconds.


In [22]:
# Preview the first rows as a quick sanity check of the loaded validation features.
X_validation.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.0008,0.000692,0.000664,-0.000847,0.000956,-0.000499,-0.000184,0.00086,0.000452,-0.000905
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.000738,0.000705,0.00045,-0.00027,-0.000692,-0.001078,-0.000561,0.000757,0.000698,0.000612
2,1.0,0.0,0.25,0.1875,0.0,1.0,0.0,10.0,-0.666667,1.154701,...,0.000488,-6.8e-05,-0.000752,-0.000879,0.001165,0.000404,-0.000787,0.000243,2.8e-05,0.000662
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.000375,0.000406,-0.000992,0.000271,0.001193,-0.000678,-0.001157,-0.001117,0.001207,-0.000427
4,1.0,0.0,0.09434,0.311855,0.0,4.0,0.0,5.0,42.275021,6.530585,...,-0.00041,0.00061,0.000327,0.000133,0.000907,-0.000825,0.000177,-0.000825,0.001041,-2.6e-05


## Impute NaN values with feature means

In [23]:
start = datetime.now()

# Per-column means of the training features, reshaped into a one-row DataFrame
# with one column per feature; the following cell takes its first row as the fill values.
train_columns_means = pd.DataFrame(X_train.mean()).transpose()

# NOTE: despite the "Transpose" label, the interval reported here also covers the mean computation.
print(f'Transpose process took {datetime.now() - start} seconds.')

Transpose process took 0:00:25.392337 seconds.


In [24]:
start = datetime.now()

# Replace NaNs in every split with the *training* column means, so the
# validation and test sets are imputed with the same values as the training set.
# The frames are filled in place rather than copied.
X_train.fillna(train_columns_means.iloc[0], inplace=True)
X_validation.fillna(train_columns_means.iloc[0], inplace=True)
X_test.fillna(train_columns_means.iloc[0], inplace=True)

# train_columns_means is deliberately left defined: it is a single row, and
# clearing it here would make this cell fail with an AttributeError if it were re-run.

print(f'FillNA process took {datetime.now() - start} seconds.')

FillNA process took 0:00:02.394418 seconds.


In [25]:
start = datetime.now()

# Persist the imputed feature sets as snappy-compressed Parquet files via the pyarrow engine.
# The file names are relative, so the files are written to the kernel's current working directory.
X_train.to_parquet('train.parquet', engine='pyarrow', compression='snappy')
X_validation.to_parquet('validation.parquet', engine='pyarrow', compression='snappy')
X_test.to_parquet('test.parquet', engine='pyarrow', compression='snappy')

print(f'Save parquet process took {datetime.now() - start} seconds.')

Save parquet process took 0:01:04.018231 seconds.


In [26]:
# Wall-clock time at which the whole notebook run completed.
print(f'Completed at {datetime.now()}.')

Completed at 2022-01-30 09:28:41.561572.
