# Preprocess data and extract features.

In [11]:
# Enable autoreload (mode 2) so edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Add the parent directory to the import path so the `sherlock` package
# (imported further down) resolves when running from this notebooks directory.
sys.path.append('..')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# If you need fully deterministic results between runs, set the following environment value prior to launching jupyter.
# Instructions can be found in HOW-TO-ENVIRONMENT.md.
# See comment in sherlock.features.paragraph_vectors.infer_paragraph_embeddings_features for more info.
#
# Report the current value without failing: `%env PYTHONHASHSEED` raises a
# UsageError when the variable is unset, which leaves an erroring cell in the
# notebook even though setting the variable is optional.
import os

python_hash_seed = os.environ.get('PYTHONHASHSEED')
if python_hash_seed is None:
    print('PYTHONHASHSEED is not set.')
else:
    print(f'PYTHONHASHSEED={python_hash_seed}')

UsageError: Environment does not have key: PYTHONHASHSEED


In [13]:
from datetime import datetime
import os
import sys
import time

import numpy as np
import pandas as pd

from sherlock import helpers
from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (
    extract_features,
    convert_string_lists_to_lists,
    prepare_feature_extraction,
    load_parquet_values,
)
from sherlock.features.word_embeddings import initialise_word_embeddings

In [14]:
# Log the wall-clock time at which this notebook run began.
print(f'Started at {datetime.now()}.')

Started at 2022-12-02 07:51:17.072950.


## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data for preprocessing and model training into the `data/` directory.
- `prepare_feature_extraction()` will download approximately 800 MB of data into the `sherlock/features/` directory.

In [15]:
# Fetch the raw data set first, then the embedding files needed for feature
# extraction. Both calls log the locations they write to (see this cell's output).
helpers.download_data()
prepare_feature_extraction()

Downloading the raw data into ../data/data/.
Data was downloaded.
Preparing feature extraction by downloading 4 files:
        
 /home/fac/mmior/apps/sherlock-project/notebooks/../sherlock/features/glove.6B.50d.txt, 
 /home/fac/mmior/apps/sherlock-project/notebooks/../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 /home/fac/mmior/apps/sherlock-project/notebooks/../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 /home/fac/mmior/apps/sherlock-project/notebooks/../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
Downloading GloVe word embedding vectors.


Downloading...
From: https://drive.google.com/uc?id=1kayd5oNRQm8-NCvA8pIrtezbQ-B1_Vmk
To: /home/fac/mmior/apps/sherlock-project/sherlock/features/glove.6B.50d.txt
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 171M/171M [00:04<00:00, 41.7MB/s]


GloVe word embedding vectors were downloaded.
All files for extracting word and paragraph embeddings are present.


In [16]:
# Stop early, with a pointer to the training notebook, when the trained
# paragraph-vector document vectors are not on disk.
paragraph_vectors_path = '../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy'
paragraph_vectors_present = os.path.exists(paragraph_vectors_path)

if not paragraph_vectors_present:
    raise SystemExit(
        """
        Trained paragraph vectors do not exist,
        please run the '01-train-paragraph-vector-features' notebook before continuing
        """
    )

## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [17]:
# Switch for the (slow) memory-usage report named in the heading above.
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
report_memory = False

## Extract features
It is important that the string representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [18]:
# Timestamp shared by all three output files so one run's CSVs can be told apart
# from those of other runs.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Features will be output to the following files (one CSV per data split)
X_test_filename_csv, X_train_filename_csv, X_validation_filename_csv = [
    f'../data/data/processed/{split}_{timestr}.csv'
    for split in ('test', 'train', 'validation')
]

### PREPARATION

In [19]:
# ensure embedding initialisation is outside of timing for extract_features
# NOTE: prepare_feature_extraction() was already called in the download cell; in the
# recorded output this second call only reports that all files are present.
prepare_feature_extraction()
initialise_word_embeddings()
# 400 is logged as the Doc2Vec dimensionality ("400 dim", par_vec_trained_400.pkl).
initialise_pretrained_model(400)
initialise_nltk()

Preparing feature extraction by downloading 4 files:
        
 /home/fac/mmior/apps/sherlock-project/notebooks/../sherlock/features/glove.6B.50d.txt, 
 /home/fac/mmior/apps/sherlock-project/notebooks/../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 /home/fac/mmior/apps/sherlock-project/notebooks/../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 /home/fac/mmior/apps/sherlock-project/notebooks/../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:09.275200 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:13.343718 seconds. (filename = /home/fac/mmior/apps/sherlock-project/notebooks/../sherlock/features/par_vec_trained_400.pkl)


[nltk_data] Downloading package punkt to /home/fac/mmior/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Initialised NLTK, process took 0:00:01.399770 seconds.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fac/mmior/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [20]:
# Load the line_profiler IPython extension (provides the %lprun magic).
# NOTE(review): no profiling magic is used in the visible cells — confirm it is still needed.
%load_ext line_profiler

In [21]:
# Python 3.8 changed the default multiprocessing start method to 'spawn' on macOS, which causes
# "name not defined" type errors in worker processes. Forcing 'fork' mode works around this
# (it only needs to be called once).
# https://bugs.python.org/issue39931
# NOTE(review): left disabled. Re-enabling it also needs `import multiprocessing as mp`,
# which is not among the imports visible in this notebook.
#mp.set_start_method('fork', force=True)

## EXTRACT FEATURES TO CSV

### TEST SET

In [22]:
# Load the raw test-split values from parquet.
values = load_parquet_values("../data/data/raw/test_values.parquet")

# Extract features and export them to the timestamped test CSV.
extract_features_to_csv(X_test_filename_csv, values)

# Drop the reference so the raw values can be reclaimed before the next split is loaded.
values = None

Starting ../data/data/processed/test_20221202-080152.csv at 2022-12-02 08:02:24.615030. Rows=137353, using 88 CPU cores
Exporting 8513 column features
Finished. Processed 137353 rows in 0:04:02.758122, key_count=88


In [23]:
# Log when the test-set extraction finished.
print(f'Finished at {datetime.now()}')

Finished at 2022-12-02 08:06:27.918576


### TRAIN SET

In [24]:
# Load the raw train-split values from parquet.
values = load_parquet_values("../data/data/raw/train_values.parquet")

# Extract features and export them to the timestamped train CSV.
extract_features_to_csv(X_train_filename_csv, values)

# Drop the reference so the raw values can be reclaimed before the next split is loaded.
values = None

Starting ../data/data/processed/train_20221202-080152.csv at 2022-12-02 08:06:35.322951. Rows=412059, using 88 CPU cores
Exporting 8513 column features
Finished. Processed 412059 rows in 0:11:32.947522, key_count=88


In [25]:
# Log when the train-set extraction finished.
print(f'Finished at {datetime.now()}')

Finished at 2022-12-02 08:18:08.813737


### VALIDATION SET

In [26]:
# Load the raw validation-split values from parquet.
values = load_parquet_values("../data/data/raw/validation_values.parquet")

# Extract features and export them to the timestamped validation CSV.
extract_features_to_csv(X_validation_filename_csv, values)

# Drop the reference so the raw values can be reclaimed once extraction is done.
values = None

Starting ../data/data/processed/validation_20221202-080152.csv at 2022-12-02 08:18:11.553710. Rows=137353, using 88 CPU cores
Exporting 8513 column features
Finished. Processed 137353 rows in 0:04:00.689663, key_count=88


In [27]:
# Log when the validation-set extraction finished.
print(f'Finished at {datetime.now()}')

Finished at 2022-12-02 08:22:12.812786


### Read Locally Processed Features

In [28]:
start = datetime.now()

# Read the test features written above, parsing every column as float32.
X_test = pd.read_csv(X_test_filename_csv, dtype=np.float32)

# The elapsed value is a timedelta (H:MM:SS.ffffff), despite the trailing "seconds".
print(f'Load Features (test) process took {datetime.now() - start} seconds.')

Load Features (test) process took 0:06:43.824128 seconds.


In [29]:
# Preview the first rows of the test features as a quick sanity check.
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.124796,-0.16053,-0.13863,0.108593,-0.043521,-0.12308,-0.036061,0.11187,-0.014046,-0.159628
1,1.0,0.0,0.368421,0.33795,0.0,2.0,0.0,7.0,0.742677,1.326868,...,-0.061843,-0.008442,-0.06211,0.081168,-0.241365,-0.039798,-0.049697,-0.067737,-0.063742,-0.201572
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.253718,-0.139565,-0.078871,0.270388,-0.255809,-0.014809,-0.358749,-0.090113,0.119655,0.006575
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.095994,0.018118,-0.067045,-0.037899,-0.104353,-0.088382,-0.107567,0.024607,0.01836,-0.046713
4,1.0,0.0,0.02008,0.035741,0.0,2.0,0.0,5.0,96.521561,9.784149,...,-0.785971,-0.38805,-1.25946,1.08032,-0.445537,-0.53324,0.137255,-1.17363,1.22247,-1.42291


In [30]:
start = datetime.now()

# Read the train features written above, parsing every column as float32.
X_train = pd.read_csv(X_train_filename_csv, dtype=np.float32)

# The elapsed value is a timedelta (H:MM:SS.ffffff), despite the trailing "seconds".
print(f'Load Features (train) process took {datetime.now() - start} seconds.')

Load Features (train) process took 0:20:34.007006 seconds.


In [31]:
# Preview the first rows of the train features as a quick sanity check.
X_train.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.048421,-0.048724,-0.131172,0.069714,-0.074817,0.01597,0.056011,-0.063332,-0.013391,-0.077518
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.169238,-0.091963,-0.010228,0.159541,-0.104421,-0.001664,0.035712,0.007041,-0.033864,-0.265061
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.085378,0.151967,-0.14226,0.363973,-0.709269,0.065355,-0.185121,-0.001956,0.235202,-0.529358
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.019204,-0.020901,0.081247,0.169977,-0.339903,-0.026066,0.060319,-0.005206,-0.044322,-0.102105
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.265524,-0.144648,-0.085883,0.027499,-0.395339,-0.056772,-0.21164,0.071728,0.094151,-0.357334


In [32]:
start = datetime.now()

# Read the validation features written above, parsing every column as float32.
X_validation = pd.read_csv(X_validation_filename_csv, dtype=np.float32)

# The elapsed value is a timedelta (H:MM:SS.ffffff), despite the trailing "seconds".
print(f'Load Features (validation) process took {datetime.now() - start} seconds.')

Load Features (validation) process took 0:06:55.186530 seconds.


In [33]:
# Preview the first rows of the validation features as a quick sanity check.
X_validation.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.104503,0.181016,-0.295239,0.163888,-0.495647,0.069102,-0.041661,-0.057777,0.333117,-0.391152
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.259309,-0.000359,-0.230081,0.287751,-0.181445,0.054224,-0.037713,-0.016012,0.0312,-0.312492
2,1.0,0.0,0.25,0.1875,0.0,1.0,0.0,10.0,-0.666667,1.154701,...,-0.265935,-0.387846,-0.13252,-0.209341,-0.043506,-0.08467,-0.112361,0.147159,0.224366,-0.374295
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.198402,0.042583,-0.265598,0.261661,-0.263526,-0.114657,-0.269635,-0.06176,0.030509,-0.105565
4,1.0,0.0,0.09434,0.311855,0.0,4.0,0.0,5.0,42.275021,6.530585,...,-0.199155,-0.486536,0.026853,-0.288409,-0.302479,-0.006825,0.103142,0.406542,0.631675,-0.27437


## Impute NaN values with feature means

In [34]:
start = datetime.now()

# Per-column means of the training features, kept as a one-row frame
# (row label 0) so the imputation step can select it with `.iloc[0]`.
train_feature_means = X_train.mean()
train_columns_means = train_feature_means.to_frame().T
del train_feature_means

print(f'Transpose process took {datetime.now() - start} seconds.')

Transpose process took 0:00:22.876602 seconds.


In [35]:
start = datetime.now()

# Impute every split with the *training* column means, in place, so that the
# validation and test sets are filled with the same values as the training set.
fill_values = train_columns_means.iloc[0]
for split_features in (X_train, X_validation, X_test):
    split_features.fillna(fill_values, inplace=True)
del fill_values, split_features

# The means are no longer needed once all three frames are filled.
train_columns_means = None

print(f'FillNA process took {datetime.now() - start} seconds.')

FillNA process took 0:00:30.808658 seconds.


In [36]:
start = datetime.now()

# Persist the imputed features for each split as snappy-compressed parquet,
# written in the order train, validation, test.
parquet_outputs = (
    (X_train, '../data/data/processed/train.parquet'),
    (X_validation, '../data/data/processed/validation.parquet'),
    (X_test, '../data/data/processed/test.parquet'),
)
for split_features, parquet_path in parquet_outputs:
    split_features.to_parquet(parquet_path, engine='pyarrow', compression='snappy')
del parquet_outputs, split_features, parquet_path

print(f'Save parquet process took {datetime.now() - start} seconds.')

Save parquet process took 0:03:45.915150 seconds.


In [37]:
# Log the wall-clock time at which the whole notebook run completed.
print(f'Completed at {datetime.now()}.')

Completed at 2022-12-02 09:01:35.237229.
