# Preprocess data and extract features.

In [31]:
# Export PYTHONHASHSEED=13 into this kernel's environment (the recorded output echoes the assignment).
# NOTE(review): the comment in the following cell says the variable must be set *before* launching
# Jupyter for fully deterministic results, so setting it here may not be sufficient on its own - confirm.
%env PYTHONHASHSEED=13
# Enable the autoreload extension; mode 2 reloads imported modules before each cell is executed.
%load_ext autoreload
%autoreload 2

env: PYTHONHASHSEED=13
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
# If you need fully deterministic results between runs, set the PYTHONHASHSEED environment variable prior to
# launching Jupyter.
# Instructions can be found in HOW-TO-ENVIRONMENT.md.
# See the comment in sherlock.features.paragraph_vectors.infer_paragraph_embeddings_features for more info.
# With no value given, %env only displays the current setting, so it is recorded in the cell output.
%env PYTHONHASHSEED

'13'

In [33]:
from datetime import datetime
import os
import sys
import time

import numpy as np
import pandas as pd

from sherlock import helpers
from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (
    extract_features,
    convert_string_lists_to_lists,
    prepare_feature_extraction,
    load_parquet_values,
)
from sherlock.features.word_embeddings import initialise_word_embeddings

In [34]:
# Record the wall-clock start time of the notebook run.
run_started_at = datetime.now()
print(f'Started at {run_started_at}.')

Started at 2022-09-25 10:53:33.243368.


## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data for preprocessing and model training into the `data/` directory.
- `prepare_feature_extraction()` will download approximately 800 MB of data into the `sherlock/features/` directory.

In [35]:
# Fetch the raw data; the recorded run reports downloading into ../data/data/.
helpers.download_data()
# Make sure the GloVe and paragraph-vector files used for feature extraction are present
# (the recorded run lists four files under ../sherlock/features/).
prepare_feature_extraction()

Downloading the raw data into ../data/data/.
Downloading data directory.


Downloading...
From: https://drive.google.com/u/0/uc?id=1RWB7djA5cJ9Nuw41SxKtEhwDyqqfcJst&export=download
To: /home/ritvikp/CMDA_capstone/sherlock-project/data/data.zip
100%|██████████| 113M/113M [00:01<00:00, 97.7MB/s] 


Data was downloaded.
Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.


In [36]:
# Stop the run early when the trained paragraph-vector weights are not on disk,
# pointing the user at the notebook that produces them.
par_vec_weights_path = '../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy'
par_vec_weights_missing = not os.path.exists(par_vec_weights_path)

if par_vec_weights_missing:
    raise SystemExit(
        """
        Trained paragraph vectors do not exist,
        please run the '01-train-paragraph-vector-features' notebook before continuing
        """
    )

## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [37]:
# Switch for the (slow) memory-usage reporting named in the heading above; disabled here.
# NOTE(review): no cell visible in this notebook reads this flag - confirm it is still needed.
report_memory = False

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [38]:
# A single timestamp (YYYYMMDD-HHMMSS, local time) is shared by all three output files of this run.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Destination CSV files for the extracted features of the test, train and validation splits.
X_test_filename_csv, X_train_filename_csv, X_validation_filename_csv = (
    f'../data/data/processed/test_{timestr}.csv',
    f'../data/data/processed/train_{timestr}.csv',
    f'../data/data/processed/validation_{timestr}.csv',
)

### PREPARATION

In [39]:
# Ensure embedding initialisation happens here, outside of the timing reported for extract_features.
# Re-check that the embedding files are present (this helper is also called in the download cell).
prepare_feature_extraction()
# Load the word embeddings up front (the recorded run logs "Initialise Word Embeddings").
initialise_word_embeddings()
# Load the pretrained 400-dimensional Doc2Vec model (the recorded run logs par_vec_trained_400.pkl).
initialise_pretrained_model(400)
# Initialise NLTK; the recorded run shows the `punkt` and `stopwords` packages being checked.
initialise_nltk()

Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:04.136077 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:06.034326 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)
Initialised NLTK, process took 0:00:00.119086 seconds.


[nltk_data] Downloading package punkt to /home/ritvikp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ritvikp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
# Load the line_profiler extension, which provides the %lprun magic.
# NOTE(review): no cell visible in this notebook uses %lprun - confirm this is still needed.
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [41]:
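# On macOS, Python 3.8 changed the default multiprocessing start method from 'fork' to 'spawn', which causes "name not defined" type errors in worker processes. If you hit these, uncomment the line below (after `import multiprocessing as mp`) to force 'fork' mode for now (this only needs to be called once).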
# https://bugs.python.org/issue39931
#mp.set_start_method('fork', force=True)

## EXTRACT FEATURES TO CSV

### TEST SET

In [42]:
# Load the raw test-set values from parquet.
values = load_parquet_values("../data/data/raw/test_values.parquet")

# Extract the features and write them to the timestamped test CSV
# (the recorded run logged 1588 feature columns for 65007 rows).
extract_features_to_csv(X_test_filename_csv, values)

# Drop the reference to the raw values so the memory can be reclaimed before the next split is loaded.
values = None

Starting ../data/data/processed/test_20220925-105337.csv at 2022-09-25 10:53:48.582271. Rows=65007, using 128 CPU cores
Exporting 1588 column features
Finished. Processed 65007 rows in 0:02:17.789998, key_count=128


In [43]:
# Log when the test-set extraction finished.
test_finished_at = datetime.now()
print(f'Finished at {test_finished_at}')

Finished at 2022-09-25 10:56:07.103148


### TRAIN SET

In [44]:
# Load the raw training-set values from parquet.
values = load_parquet_values("../data/data/raw/train_values.parquet")

# Extract the features and write them to the timestamped train CSV.
# NOTE(review): the recorded run logged 65007 rows here, the same count as for the test and
# validation splits - confirm the raw train file is the intended one.
extract_features_to_csv(X_train_filename_csv, values)

# Drop the reference to the raw values so the memory can be reclaimed before the next split is loaded.
values = None

Starting ../data/data/processed/train_20220925-105337.csv at 2022-09-25 10:56:07.758809. Rows=65007, using 128 CPU cores
Exporting 1588 column features
Finished. Processed 65007 rows in 0:02:10.736625, key_count=128


In [45]:
# Log when the training-set extraction finished.
train_finished_at = datetime.now()
print(f'Finished at {train_finished_at}')

Finished at 2022-09-25 10:58:19.261630


### VALIDATION SET

In [46]:
# Load the raw validation-set values from parquet.
values = load_parquet_values("../data/data/raw/val_values.parquet")

# Extract the features and write them to the timestamped validation CSV.
# NOTE(review): the recorded run logged 65007 rows here, the same count as for the test and
# train splits - confirm the raw validation file is the intended one.
extract_features_to_csv(X_validation_filename_csv, values)

# Drop the reference to the raw values so the memory can be reclaimed.
values = None

Starting ../data/data/processed/validation_20220925-105337.csv at 2022-09-25 10:58:19.814129. Rows=65007, using 128 CPU cores
Exporting 1588 column features
Finished. Processed 65007 rows in 0:02:17.367063, key_count=128


In [47]:
# Log when the validation-set extraction finished.
validation_finished_at = datetime.now()
print(f'Finished at {validation_finished_at}')

Finished at 2022-09-25 11:00:37.971592


### Read Locally Processed Features

In [48]:
# Read the test-set features back from the CSV written earlier, as float32, and report the elapsed time.
load_start = datetime.now()
X_test = pd.read_csv(X_test_filename_csv, dtype=np.float32)
load_elapsed = datetime.now() - load_start

print(f'Load Features (test) process took {load_elapsed} seconds.')

Load Features (test) process took 0:00:08.813651 seconds.


In [49]:
# Preview the first rows of the loaded test features as a quick sanity check.
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.073357,-0.056855,-0.006924,-0.011128,0.004722,0.037334,0.028883,-0.188482,-0.114415,-0.150102
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.183972,0.072769,-0.207743,0.088965,-0.166349,-0.168203,-0.064643,-0.030581,-0.036819,-0.100331
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.0799,-0.07753,-0.131517,0.264931,-0.034884,-0.098135,0.032695,-0.141846,0.154813,-0.333995
3,1.0,0.0,0.5,0.25,0.0,1.0,0.5,1.0,-2.0,0.0,...,-0.00801,-0.011173,-0.004786,0.059424,-0.066956,-0.006355,0.049467,-0.048386,-0.008806,-0.055399
4,1.0,0.0,0.05,0.0475,0.0,1.0,0.0,1.0,15.052631,4.129483,...,-0.101676,-0.052384,0.048183,0.102851,-0.191231,-0.031069,-0.04063,-0.102155,0.013049,-0.238846


In [50]:
start = datetime.now()

# Read the training-set features back from CSV as float32.
train_csv_path = X_train_filename_csv
X_train = pd.read_csv(train_csv_path, dtype=np.float32)

# `start` is set at the top of this cell; report the time elapsed since then.
train_load_elapsed = datetime.now() - start
print(f'Load Features (train) process took {train_load_elapsed} seconds.')

Load Features (train) process took 0:00:08.661167 seconds.


In [51]:
# Preview the first rows of the loaded training features.
# NOTE(review): the recorded preview is identical to the test-set preview - confirm the
# train and test inputs really differ.
X_train.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.073357,-0.056855,-0.006924,-0.011128,0.004722,0.037334,0.028883,-0.188482,-0.114415,-0.150102
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.183972,0.072769,-0.207743,0.088965,-0.166349,-0.168203,-0.064643,-0.030581,-0.036819,-0.100331
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.0799,-0.07753,-0.131517,0.264931,-0.034884,-0.098135,0.032695,-0.141846,0.154813,-0.333995
3,1.0,0.0,0.5,0.25,0.0,1.0,0.5,1.0,-2.0,0.0,...,-0.00801,-0.011173,-0.004786,0.059424,-0.066956,-0.006355,0.049467,-0.048386,-0.008806,-0.055399
4,1.0,0.0,0.05,0.0475,0.0,1.0,0.0,1.0,15.052631,4.129483,...,-0.101676,-0.052384,0.048183,0.102851,-0.191231,-0.031069,-0.04063,-0.102155,0.013049,-0.238846


In [52]:
start = datetime.now()

# Read the validation-set features back from CSV as float32.
validation_csv_path = X_validation_filename_csv
X_validation = pd.read_csv(validation_csv_path, dtype=np.float32)

# `start` is set at the top of this cell; report the time elapsed since then.
validation_load_elapsed = datetime.now() - start
print(f'Load Features (validation) process took {validation_load_elapsed} seconds.')

Load Features (validation) process took 0:00:08.254195 seconds.


In [53]:
# Preview the first rows of the loaded validation features.
# NOTE(review): the recorded preview is identical to the test-set preview - confirm the
# validation and test inputs really differ.
X_validation.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.073357,-0.056855,-0.006924,-0.011128,0.004722,0.037334,0.028883,-0.188482,-0.114415,-0.150102
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.183972,0.072769,-0.207743,0.088965,-0.166349,-0.168203,-0.064643,-0.030581,-0.036819,-0.100331
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.0799,-0.07753,-0.131517,0.264931,-0.034884,-0.098135,0.032695,-0.141846,0.154813,-0.333995
3,1.0,0.0,0.5,0.25,0.0,1.0,0.5,1.0,-2.0,0.0,...,-0.00801,-0.011173,-0.004786,0.059424,-0.066956,-0.006355,0.049467,-0.048386,-0.008806,-0.055399
4,1.0,0.0,0.05,0.0475,0.0,1.0,0.0,1.0,15.052631,4.129483,...,-0.101676,-0.052384,0.048183,0.102851,-0.191231,-0.031069,-0.04063,-0.102155,0.013049,-0.238846


## Impute NaN values with feature means

In [54]:
# Compute the per-column means of the training features and keep them as a one-row frame
# (one column per feature); these means are the imputation values used in the next cell.
means_start = datetime.now()
train_columns_means = X_train.mean().to_frame().T
means_elapsed = datetime.now() - means_start

print(f'Transpose process took {means_elapsed} seconds.')

Transpose process took 0:00:00.447923 seconds.


In [55]:
# Replace NaNs in every split with the *training* column means, so that validation and test
# are imputed with statistics derived from the training data only.
fill_start = datetime.now()

column_means = train_columns_means.iloc[0]
for split_features in (X_train, X_validation, X_test):
    split_features.fillna(column_means, inplace=True)

# The one-row frame of means is no longer needed under this name.
train_columns_means = None

fill_elapsed = datetime.now() - fill_start
print(f'FillNA process took {fill_elapsed} seconds.')

FillNA process took 0:00:00.777623 seconds.


In [56]:
# Persist the imputed feature frames as snappy-compressed parquet files (pyarrow engine),
# written in the order train, validation, test.
save_start = datetime.now()

parquet_outputs = (
    (X_train, '../data/data/processed/train.parquet'),
    (X_validation, '../data/data/processed/validation.parquet'),
    (X_test, '../data/data/processed/test.parquet'),
)
for split_features, parquet_path in parquet_outputs:
    split_features.to_parquet(parquet_path, engine='pyarrow', compression='snappy')

save_elapsed = datetime.now() - save_start
print(f'Save parquet process took {save_elapsed} seconds.')

Save parquet process took 0:00:10.932869 seconds.


In [57]:
# Record the wall-clock time at which the whole notebook finished.
run_completed_at = datetime.now()
print(f'Completed at {run_completed_at}.')

Completed at 2022-09-25 11:01:18.330495.
