# Preprocess data and extract features.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [28]:
# Display the current value of the PYTHONHASHSEED environment variable.
# If you need fully deterministic results between runs, set this environment variable prior to launching jupyter.
# Instructions can be found in HOW-TO-ENVIRONMENT.md.
# See comment in sherlock.features.paragraph_vectors.infer_paragraph_embeddings_features for more info.
# NOTE(review): `%env NAME` is expected to raise a UsageError when NAME is unset, which would
# interrupt a "Run All" even though setting the seed is presented as optional — confirm.
%env PYTHONHASHSEED

'13'

In [3]:
from datetime import datetime
import os
import sys
import time

import numpy as np
import pandas as pd

from sherlock import helpers
from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (
    extract_features,
    convert_string_lists_to_lists,
    prepare_feature_extraction,
    load_parquet_values,
)
from sherlock.features.word_embeddings import initialise_word_embeddings

In [4]:
# Record the wall-clock time at which this notebook run started.
print(f'Started at {datetime.now()}.')

Started at 2022-02-21 12:55:47.936774.


## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data for preprocessing and model training into the `data/` directory.
- `prepare_feature_extraction()` will download approximately 800 MB of data into the `sherlock/features/` directory.

In [5]:
# Fetch the raw data first, then the embedding files needed for feature extraction.
# Both helpers print their progress (see the cell output).
helpers.download_data()
prepare_feature_extraction()

Downloading the raw data into ../data/.
Data was downloaded.
Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.


In [6]:
# Stop the notebook early when the trained paragraph-vector weights are missing,
# pointing the user at the notebook that produces them.
paragraph_vectors_path = '../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy'
paragraph_vectors_present = os.path.exists(paragraph_vectors_path)

if not paragraph_vectors_present:
    raise SystemExit(
        """
        Trained paragraph vectors do not exist,
        please run the '01-train-paragraph-vector-features' notebook before continuing
        """
    )

## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [7]:
# Switch for the (slow) memory-usage reporting named in the section heading.
# NOTE(review): no cell visible in this notebook reads this flag — confirm whether it is still needed.
report_memory = False

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [8]:
# Timestamp (YYYYMMDD-HHMMSS) that makes the CSV names of this run unique.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Features will be output to the following files (one CSV per data split).
X_test_filename_csv, X_train_filename_csv, X_validation_filename_csv = (
    f'../data/data/processed/{split}_{timestr}.csv'
    for split in ('test', 'train', 'validation')
)

### PREPARATION

In [9]:
# ensure embedding initialisation is outside of timing for extract_features
prepare_feature_extraction()  # repeated call; the output reports that all files are already present
initialise_word_embeddings()
initialise_pretrained_model(400)  # 400-dim Doc2Vec model (par_vec_trained_400.pkl, per the output)
initialise_nltk()  # output shows the punkt and stopwords packages being checked

Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:05.196631 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:03.018509 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)
Initialised NLTK, process took 0:00:00.184036 seconds.


[nltk_data] Downloading package punkt to /Users/madelon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/madelon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Load the line_profiler extension (provides the %lprun magic).
# NOTE(review): no visible cell uses %lprun — this may be a leftover from a profiling session.
%load_ext line_profiler

In [11]:
# Python 3.8 changed the default process start method to 'spawn' (on macOS), which causes
# "name not defined" type errors. Uncomment the call below to force 'fork' mode
# (this only needs to be called once).
# https://bugs.python.org/issue39931
# NOTE(review): `mp` is not imported in this notebook's import cell; uncommenting the call
# also requires `import multiprocessing as mp`.
#mp.set_start_method('fork', force=True)

## EXTRACT FEATURES TO CSV

### TEST SET

In [12]:
# Load the raw test-split values and write their extracted features to the test CSV.
values = load_parquet_values("../data/data/raw/test_values.parquet")

extract_features_to_csv(X_test_filename_csv, values)

# Drop the reference so the raw values can be garbage-collected before the next split.
values = None

Starting ../data/data/processed/test_20220221-125552.csv at 2022-02-21 12:56:03.739007. Rows=137353, using 8 CPU cores
Exporting 1588 column features
Finished. Processed 137353 rows in 0:04:41.140014, key_count=8


In [13]:
# Timestamp marking the end of test-set feature extraction.
print(f'Finished at {datetime.now()}')

Finished at 2022-02-21 13:00:45.006991


### TRAIN SET

In [14]:
# Load the raw train-split values and write their extracted features to the train CSV.
values = load_parquet_values("../data/data/raw/train_values.parquet")

extract_features_to_csv(X_train_filename_csv, values)

# Drop the reference so the raw values can be garbage-collected before the next split.
values = None

Starting ../data/data/processed/train_20220221-125552.csv at 2022-02-21 13:00:46.766509. Rows=412059, using 8 CPU cores
Exporting 1588 column features
Finished. Processed 412059 rows in 0:13:46.382904, key_count=8


In [15]:
# Timestamp marking the end of train-set feature extraction.
print(f'Finished at {datetime.now()}')

Finished at 2022-02-21 13:14:33.436746


### VALIDATION SET

In [16]:
# Load the raw validation-split values and write their extracted features to the validation CSV.
values = load_parquet_values("../data/data/raw/val_values.parquet")

extract_features_to_csv(X_validation_filename_csv, values)

# Drop the reference so the raw values can be garbage-collected.
values = None

Starting ../data/data/processed/validation_20220221-125552.csv at 2022-02-21 13:14:34.282570. Rows=137353, using 8 CPU cores
Exporting 1588 column features
Finished. Processed 137353 rows in 0:04:20.682436, key_count=8


In [17]:
# Timestamp marking the end of validation-set feature extraction.
print(f'Finished at {datetime.now()}')

Finished at 2022-02-21 13:18:55.069282


### Read Locally Processed Features

In [18]:
# Read the test features back from the CSV written by this run, parsing every
# column as float32, and report how long the load took.
start = datetime.now()

X_test = pd.read_csv(X_test_filename_csv, dtype=np.float32)

print(f'Load Features (test) process took {datetime.now() - start} seconds.')

Load Features (test) process took 0:00:30.504465 seconds.


In [19]:
# Preview the first rows of the test features as a sanity check.
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.12415,-0.160029,-0.140256,0.109107,-0.044414,-0.124615,-0.033864,0.111789,-0.014209,-0.160459
1,1.0,0.0,0.368421,0.33795,0.0,2.0,0.0,7.0,0.742677,1.326868,...,-0.060843,-0.009223,-0.062848,0.080908,-0.240975,-0.03926,-0.050986,-0.068109,-0.063214,-0.200062
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.254407,-0.140454,-0.077885,0.270047,-0.254806,-0.015952,-0.358635,-0.090083,0.120246,0.006599
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.096018,0.018162,-0.066431,-0.039642,-0.104596,-0.08756,-0.108518,0.023666,0.01781,-0.047467
4,1.0,0.0,0.02008,0.035741,0.0,2.0,0.0,5.0,96.521561,9.784149,...,-0.785815,-0.388175,-1.25939,1.08041,-0.44547,-0.532897,0.137333,-1.17396,1.22239,-1.42321


In [20]:
# Read the train features back from the CSV written by this run, parsing every
# column as float32, and report how long the load took.
start = datetime.now()

X_train = pd.read_csv(X_train_filename_csv, dtype=np.float32)

print(f'Load Features (train) process took {datetime.now() - start} seconds.')

Load Features (train) process took 0:01:39.241069 seconds.


In [21]:
# Preview the first rows of the train features as a sanity check.
X_train.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.047276,-0.050462,-0.131864,0.068065,-0.074033,0.016362,0.055275,-0.063609,-0.014739,-0.076884
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.16921,-0.09124,-0.008702,0.158941,-0.102764,-0.001776,0.035398,0.005877,-0.033673,-0.266011
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.086971,0.152903,-0.141693,0.364022,-0.708731,0.064241,-0.183656,-0.001072,0.235746,-0.530483
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.019576,-0.020715,0.08036,0.170148,-0.339541,-0.025463,0.062125,-0.004156,-0.044532,-0.101253
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.264917,-0.144005,-0.086391,0.02788,-0.394154,-0.057364,-0.211917,0.06974,0.095384,-0.356454


In [22]:
# Read the validation features back from the CSV written by this run, parsing
# every column as float32, and report how long the load took.
start = datetime.now()

X_validation = pd.read_csv(X_validation_filename_csv, dtype=np.float32)

print(f'Load Features (validation) process took {datetime.now() - start} seconds.')

Load Features (validation) process took 0:00:30.066667 seconds.


In [23]:
# Preview the first rows of the validation features as a sanity check.
X_validation.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.105711,0.181456,-0.29405,0.163736,-0.496933,0.06932,-0.040903,-0.056309,0.334068,-0.391248
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.258922,-0.001249,-0.231181,0.288208,-0.18153,0.052967,-0.037158,-0.016913,0.032424,-0.311296
2,1.0,0.0,0.25,0.1875,0.0,1.0,0.0,10.0,-0.666667,1.154701,...,-0.265864,-0.388315,-0.133144,-0.209464,-0.042673,-0.084304,-0.112698,0.147051,0.22468,-0.373928
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.198478,0.041071,-0.265879,0.261911,-0.264352,-0.115126,-0.271292,-0.061009,0.032532,-0.105325
4,1.0,0.0,0.09434,0.311855,0.0,4.0,0.0,5.0,42.275021,6.530585,...,-0.19917,-0.4866,0.026245,-0.288035,-0.301814,-0.007751,0.102981,0.405819,0.632662,-0.274999


## Impute NaN values with training-set feature means

In [24]:
# Per-column means of the training features, kept as a one-row DataFrame
# (the imputation cell reads that row back with `.iloc[0]`).
start = datetime.now()

train_columns_means = X_train.mean().to_frame().T

print(f'Transpose process took {datetime.now() - start} seconds.')

Transpose process took 0:00:12.315641 seconds.


In [25]:
start = datetime.now()

# Fill NaNs in every split, in place, with the column means computed from the
# training split only, so validation/test statistics never feed the imputation.
for features in (X_train, X_validation, X_test):
    features.fillna(train_columns_means.iloc[0], inplace=True)

# The means are no longer needed once all three splits are filled.
train_columns_means = None

print(f'FillNA process took {datetime.now() - start} seconds.')

FillNA process took 0:00:04.335100 seconds.


In [26]:
start = datetime.now()

# Persist each imputed split as a snappy-compressed parquet file. Unlike the
# CSV files, these paths carry no timestamp.
for features, parquet_path in (
    (X_train, '../data/data/processed/train.parquet'),
    (X_validation, '../data/data/processed/validation.parquet'),
    (X_test, '../data/data/processed/test.parquet'),
):
    features.to_parquet(parquet_path, engine='pyarrow', compression='snappy')

print(f'Save parquet process took {datetime.now() - start} seconds.')

Save parquet process took 0:00:58.970663 seconds.


In [27]:
# Record the wall-clock time at which this notebook run completed.
print(f'Completed at {datetime.now()}.')

Completed at 2022-02-21 13:22:51.962526.
