# Preprocess data and extract features.

In [1]:
# Export PYTHONHASHSEED=13 into this kernel's process environment.
# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so this presumably only
# reaches processes launched after this point and not the already-running kernel - confirm.
%env PYTHONHASHSEED=13
# Enable autoreload in mode 2 so edits to imported modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

env: PYTHONHASHSEED=13


In [2]:
# For fully deterministic results between runs, PYTHONHASHSEED must be set in the environment
# *before* Jupyter is launched. Instructions can be found in HOW-TO-ENVIRONMENT.md.
# See the comment in sherlock.features.paragraph_vectors.infer_paragraph_embeddings_features for more info.
# Called without a value, %env shows the variable's current setting so it can be verified here.
%env PYTHONHASHSEED

'13'

In [3]:
from datetime import datetime
import os
import sys
import time

import numpy as np
import pandas as pd

from sherlock import helpers
from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (
    extract_features,
    convert_string_lists_to_lists,
    prepare_feature_extraction,
    load_parquet_values,
)
from sherlock.features.word_embeddings import initialise_word_embeddings

In [4]:
# Log the wall-clock start time; the last cell logs the completion time for comparison.
print(f'Started at {datetime.now()}.')

Started at 2022-11-14 10:38:12.384241.


## Download data
The following cell downloads the raw values, the preprocessed files, the corresponding labels, and a few other supporting files:
- `download_data()` will download 3.6 GB of data for preprocessing and model training into the `data/` directory.
- `prepare_feature_extraction()` will download approximately 800 MB of data into the `sherlock/features/` directory.

In [5]:
# Fetch the raw data archive first, then the embedding files required for feature extraction.
# Both calls print their progress to the cell output.
helpers.download_data()
prepare_feature_extraction()

Downloading the raw data into ../data/data/.
Downloading data directory.


Downloading...
From: https://drive.google.com/u/1/uc?id=1F-Bciiv9KD1-3-UpzIWQ_NUMNHcDTvuN&export=download
To: /home/ritvikp/CMDA_capstone/sherlock-project/data/data.zip
100%|██████████| 35.7M/35.7M [00:00<00:00, 67.7MB/s]


Data was downloaded.
Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.


In [6]:
# Stop early when the trained paragraph vectors are missing; the message names the
# notebook that has to be run to create them.
paragraph_vectors_path = '../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy'
paragraph_vectors_present = os.path.exists(paragraph_vectors_path)

if not paragraph_vectors_present:
    raise SystemExit(
        """
        Trained paragraph vectors do not exist,
        please run the '01-train-paragraph-vector-features' notebook before continuing
        """
    )

## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [7]:
# Switch for the optional (slow) memory report; disabled here.
# NOTE(review): no cell visible in this notebook reads this flag - confirm it is still needed.
report_memory = False

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [8]:
# Single timestamp (local time, second resolution) taken once so that the three
# output files written by this run share the same suffix.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Features will be output to the following files
X_test_filename_csv = f'../data/data/processed/test_{timestr}.csv'
X_train_filename_csv = f'../data/data/processed/train_{timestr}.csv'
X_validation_filename_csv = f'../data/data/processed/validation_{timestr}.csv'

### PREPARATION

In [9]:
# ensure embedding initialisation is outside of timing for extract_features
# NOTE(review): prepare_feature_extraction() was already called in the download cell;
# this second call looks redundant - confirm before removing.
prepare_feature_extraction()
initialise_word_embeddings()
# 400 selects the 400-dimensional paragraph-vector model (the par_vec_trained_400.pkl files).
initialise_pretrained_model(400)
initialise_nltk()

Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:04.852397 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:10.539313 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)
Initialised NLTK, process took 0:00:00.259892 seconds.


[nltk_data] Downloading package punkt to /home/ritvikp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ritvikp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Load the line_profiler IPython extension (provides the %lprun magic).
# NOTE(review): no cell visible in this notebook uses %lprun - confirm this is still required.
%load_ext line_profiler

In [11]:
# Python 3.8 changed the default multiprocessing start method on macOS from 'fork' to 'spawn',
# which causes "name not defined" type errors in worker processes.
# Uncomment the call below to force 'fork' mode (it only needs to be called once).
# NOTE(review): the call expects `multiprocessing` to be imported as `mp`, which the import cell does not do.
# https://bugs.python.org/issue39931
#mp.set_start_method('fork', force=True)

## EXTRACT FEATURES TO CSV

### TEST SET

In [13]:
# Load the raw test-set values from parquet.
values = load_parquet_values("../data/data/raw/test_values.parquet")

# Extract the features and write them to the timestamped test CSV.
extract_features_to_csv(X_test_filename_csv, values)

# Drop the reference so the raw values can be reclaimed before the next split is loaded.
values = None

Starting ../data/data/processed/test_20221114-103814.csv at 2022-11-14 10:40:03.320033. Rows=12524, using 128 CPU cores
Exporting 1588 column features
Finished. Processed 12524 rows in 0:00:09.300093, key_count=118


In [14]:
# Timestamp marking the end of test-set feature extraction.
print(f'Finished at {datetime.now()}')

Finished at 2022-11-14 10:40:12.648233


### TRAIN SET

In [15]:
# Load the raw training-set values from parquet.
values = load_parquet_values("../data/data/raw/train_values.parquet")

# Extract the features and write them to the timestamped train CSV.
extract_features_to_csv(X_train_filename_csv, values)

# Drop the reference so the raw values can be reclaimed before the next split is loaded.
values = None

Starting ../data/data/processed/train_20221114-103814.csv at 2022-11-14 10:40:13.341618. Rows=66784, using 128 CPU cores
Exporting 1588 column features
Finished. Processed 66784 rows in 0:00:30.262937, key_count=128


In [16]:
# Timestamp marking the end of training-set feature extraction.
print(f'Finished at {datetime.now()}')

Finished at 2022-11-14 10:40:44.046609


### VALIDATION SET

In [17]:
# Load the raw validation-set values from parquet.
values = load_parquet_values("../data/data/raw/val_values.parquet")

# Extract the features and write them to the timestamped validation CSV.
extract_features_to_csv(X_validation_filename_csv, values)

# Drop the reference so the raw values can be reclaimed once extraction is done.
values = None

Starting ../data/data/processed/validation_20221114-103814.csv at 2022-11-14 10:40:44.125011. Rows=4174, using 128 CPU cores
Exporting 1588 column features
Finished. Processed 4174 rows in 0:00:07.090978, key_count=41


In [18]:
# Timestamp marking the end of validation-set feature extraction.
print(f'Finished at {datetime.now()}')

Finished at 2022-11-14 10:40:51.492896


### Read Locally Processed Features

In [19]:
# Time how long it takes to read the freshly written test features back from CSV.
start = datetime.now()

# Every column is parsed as 32-bit float.
X_test = pd.read_csv(X_test_filename_csv, dtype=np.float32)

elapsed = datetime.now() - start
print(f'Load Features (test) process took {elapsed} seconds.')

Load Features (test) process took 0:00:01.462549 seconds.


In [20]:
# Preview the first five rows of the test features.
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,1.0,1.0,1.71429,1.06122,1.0,4.0,1.0,12.0,0.596154,1.37608,...,-0.281252,-0.141856,-0.139772,0.043446,-0.153707,-0.136036,0.176066,-0.015164,0.004617,-0.421544
1,1.0,0.0,0.3,0.21,0.0,1.0,0.0,3.0,-1.238095,0.872872,...,-0.12589,0.019456,-0.283072,0.168325,-0.213451,-0.095402,0.058518,0.086278,-0.002591,-0.268067
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.028812,-0.015692,0.369469,0.073525,-0.225517,0.034162,-0.145376,0.115689,-0.088321,-0.324119
3,1.0,0.0,0.6,0.64,0.0,2.0,0.0,6.0,-0.921875,0.84375,...,-0.107193,-0.26243,0.209832,-0.17667,-0.086504,0.093452,-0.013796,-0.042605,-0.112081,-0.31617
4,1.0,0.0,0.047619,0.045351,0.0,1.0,0.0,1.0,16.049999,4.248529,...,-0.426857,-0.108697,0.102459,0.794043,-0.57436,-0.152513,-0.056189,-0.10512,-0.058013,-0.572473


In [21]:
# Time how long it takes to read the freshly written training features back from CSV.
start = datetime.now()

# Every column is parsed as 32-bit float.
X_train = pd.read_csv(X_train_filename_csv, dtype=np.float32)

elapsed = datetime.now() - start
print(f'Load Features (train) process took {elapsed} seconds.')

Load Features (train) process took 0:00:07.938462 seconds.


In [22]:
# Preview the first five rows of the training features.
X_train.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,1.0,0.0,0.5,0.25,0.0,1.0,0.5,1.0,-2.0,0.0,...,-0.009931,-0.010923,-0.005544,0.059266,-0.066409,-0.005828,0.04984,-0.049171,-0.011011,-0.05369
1,1.0,0.0,0.2,0.16,0.0,1.0,0.0,5.0,0.25,1.5,...,-0.425365,-0.142438,0.244175,0.734183,-0.445191,-0.39158,-0.091895,-0.283094,-0.09077,-0.527099
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.080679,-0.007773,-0.014259,0.005337,-0.077856,-0.043839,-0.015134,0.012068,-0.068826,-0.131233
3,1.0,0.0,0.04,0.0384,0.0,1.0,0.0,1.0,20.041666,4.694855,...,-0.412441,-0.004193,-0.019288,0.759977,-0.64536,-0.268643,-0.377824,-0.24738,-0.096456,-0.905216
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.007783,-0.022211,-0.069327,0.086026,-0.132584,-0.098505,0.056782,-0.123175,-0.039349,-0.078163


In [23]:
# Time how long it takes to read the freshly written validation features back from CSV.
start = datetime.now()

# Every column is parsed as 32-bit float.
X_validation = pd.read_csv(X_validation_filename_csv, dtype=np.float32)

elapsed = datetime.now() - start
print(f'Load Features (validation) process took {elapsed} seconds.')

Load Features (validation) process took 0:00:00.539329 seconds.


In [24]:
# Preview the first five rows of the validation features.
X_validation.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,1.0,0.0,0.3,0.21,0.0,1.0,0.0,6.0,-1.2381,0.872872,...,0.199079,-0.110822,0.261912,-0.042236,-0.169648,0.174212,0.266578,0.366824,0.332375,-0.506593
1,1.0,0.0,0.5,0.25,0.0,1.0,0.5,1.0,-2.0,0.0,...,-0.012221,-0.01023,-0.039717,0.038382,-0.025192,-0.0135,0.022771,-0.071983,0.004519,-0.007065
2,1.0,0.0,0.084337,0.077225,0.0,1.0,0.0,7.0,6.949248,2.991529,...,-0.980171,0.531731,0.146292,-0.092693,-0.33557,0.155663,-0.06921,0.20188,-0.010261,-0.517029
3,1.0,1.0,1.333333,0.222222,1.0,2.0,1.0,12.0,-1.5,0.707107,...,-0.013562,-0.354612,0.229381,0.030445,-0.386223,-0.047079,-0.241422,-0.072854,0.192145,-0.363749
4,1.0,0.0,0.2,0.16,0.0,1.0,0.0,1.0,0.25,1.5,...,-0.072924,-0.142116,0.020479,0.037456,-0.14817,-0.192141,-0.190252,0.048854,0.029435,-0.19698


## Impute NaN values with feature means

In [25]:
start = datetime.now()

# Per-column means of the training features, laid out as a single-row frame
# (one column per feature); the imputation cell reads that row back with .iloc[0].
train_columns_means = pd.DataFrame(X_train.mean()).T

elapsed = datetime.now() - start
print(f'Transpose process took {elapsed} seconds.')

Transpose process took 0:00:00.606062 seconds.


In [26]:
start = datetime.now()

# The single row of training-set means, keyed by feature column. The same training
# means are used for all three splits so validation/test statistics never leak in.
fill_values = train_columns_means.iloc[0]

# Fill the NaNs of each split in place.
for features in (X_train, X_validation, X_test):
    features.fillna(fill_values, inplace=True)

# Release the means frame. Note that this makes the cell non-re-runnable on its own:
# the means cell has to be executed again first.
train_columns_means = None

elapsed = datetime.now() - start
print(f'FillNA process took {elapsed} seconds.')

FillNA process took 0:00:00.639300 seconds.


In [27]:
start = datetime.now()

# Persist the imputed splits (train, validation, test - in that order) as
# snappy-compressed parquet files written with the pyarrow engine.
parquet_targets = (
    (X_train, '../data/data/processed/train.parquet'),
    (X_validation, '../data/data/processed/validation.parquet'),
    (X_test, '../data/data/processed/test.parquet'),
)
for features, parquet_path in parquet_targets:
    features.to_parquet(parquet_path, engine='pyarrow', compression='snappy')

elapsed = datetime.now() - start
print(f'Save parquet process took {elapsed} seconds.')

Save parquet process took 0:00:04.230638 seconds.


In [28]:
# Log the wall-clock completion time (compare with the "Started at" line near the top).
print(f'Completed at {datetime.now()}.')

Completed at 2022-11-14 10:41:07.433073.
