In [1]:
%load_ext autoreload
%autoreload 2

# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data, 
retrain Sherlock and generate predictions.

In [2]:
import sys

import numpy as np
import pandas as pd

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction

from datetime import datetime

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download approximately 800 MB of data into the `sherlock/features/` directory.

In [3]:
helpers.download_data()
prepare_feature_extraction()

Downloading the raw and preprocessed data into ../data/data.zip.
Data was downloaded.
Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [4]:
report_memory = False

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [5]:
import time

# One timestamp per notebook run: all three work files share it, so the outputs of a
# single run are easy to identify and never overwrite those of an earlier run.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Destination CSVs for the extracted features of each split.
X_test_filename_csv, X_train_filename_csv, X_validation_filename_csv = (
    f'../data/work/test_{timestr}.csv',
    f'../data/work/train_{timestr}.csv',
    f'../data/work/validation_{timestr}.csv',
)

### PREPARATION

In [6]:
# ensure embedding initialisation is outside of timing for extract_features
# (all one-off set-up is done here so the extraction cells below time extraction only)
from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import prepare_feature_extraction

# Per its logged output, checks that the two embedding files are present (downloading them if not).
prepare_feature_extraction()
# Loads the word embeddings up front.
initialise_word_embeddings()
# 400 is the paragraph-vector dimensionality (logged as "Doc2Vec Model, 400 dim").
initialise_pretrained_model(400)
# Per the logged output, fetches/verifies the NLTK 'punkt' and 'stopwords' packages.
initialise_nltk()

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:05.799263 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:03.590517 seconds. (filename = ../sherlock/features/par_vec_retrained_400.pkl)
Initialised NLTK, process took 0:00:00.272521 seconds.


[nltk_data] Downloading package punkt to /Users/lowecg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lowecg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from pyarrow.parquet import ParquetFile

def load_parquet_values(path):
    """Load the ``values`` column of a Parquet file.

    Every row group in the file is read, and only the ``values`` column is
    materialised. Reading just row group 0 would silently drop rows from any
    file that was written with more than one row group; for single-row-group
    files the result is unchanged.

    Args:
        path: Location of the Parquet file; passed to ``ParquetFile`` as ``source``.

    Returns:
        The ``values`` column of the ``pyarrow.Table`` read from the file.
    """
    pf = ParquetFile(source=path)
    # Restrict the read to the one column that is returned to avoid decoding the rest.
    table = pf.read(columns=['values'])

    return table['values']

In [8]:
%load_ext line_profiler

In [9]:
import multiprocessing as mp

# Since Python 3.8 the default process start method on macOS is 'spawn' (previously 'fork'), which
# causes "name not defined" type errors in the worker processes. Force 'fork' mode for now.
# force=True allows the start method to be set even if one has already been chosen, so re-running
# this cell is safe. Note that 'fork' is only available on POSIX systems.
# https://bugs.python.org/issue39931
mp.set_start_method('fork', force=True)

## EXTRACT FEATURES TO CSV (NEW METHOD)

### TEST SET

In [10]:
# Disabled line-profiling recipe, kept for reference: it profiles extract_features_to_csv2 on the
# test values, writing to a throw-away 'deleteme.csv'. It needs the line_profiler extension that is
# loaded in an earlier cell.
# from sherlock.functional import extract_features_to_csv2
# from sherlock.global_state import reset_first

# reset_first()

# values = load_parquet_values("../data/raw/test_values.parquet")

# %lprun -m sherlock.features.preprocessing -m sherlock.functional -m sherlock.features extract_features_to_csv2('deleteme.csv', values, 100)

# values = None

# NOTE: the TRAIN SET and VALIDATION SET cells below rely on this import having been executed.
from sherlock.functional import extract_features_to_csv

# Raw column values of the test split.
values = load_parquet_values("../data/raw/test_values.parquet")

# Extract the features and write them to the timestamped test CSV.
extract_features_to_csv(X_test_filename_csv, values)

# Drop the reference so the raw values can be reclaimed before the next split is loaded.
values = None

Starting ../data/work/test_20210116-100958.csv at 2021-01-16 10:10:08.807939
Exporting 1578 column features
Finished. Processed 137353 rows in 0:04:49.730240, key_count=8


In [11]:
# Finished. Processed 137353 rows in 0:14:53.196073

In [12]:
print(f'Finished at {datetime.now()}')

Finished at 2021-01-16 10:14:58.658660


### TRAIN SET

In [13]:
# Raw column values of the training split.
values = load_parquet_values("../data/raw/train_values.parquet")

# Extract the features and write them to the timestamped train CSV.
# (extract_features_to_csv is imported in the TEST SET cell above, which must have been run.)
extract_features_to_csv(X_train_filename_csv, values)

# Drop the reference so the raw values can be reclaimed before the next split is loaded.
values = None

Starting ../data/work/train_20210116-100958.csv at 2021-01-16 10:15:01.231043
Exporting 1578 column features
Finished. Processed 412059 rows in 0:14:13.107407, key_count=8


In [14]:
print(f'Finished at {datetime.now()}')

Finished at 2021-01-16 10:29:14.492655


### VALIDATION SET

In [15]:
# Raw column values of the validation split.
values = load_parquet_values("../data/raw/val_values.parquet")

# Extract the features and write them to the timestamped validation CSV.
# (extract_features_to_csv is imported in the TEST SET cell above, which must have been run.)
extract_features_to_csv(X_validation_filename_csv, values)

# Drop the reference so the raw values can be reclaimed.
values = None

Starting ../data/work/validation_20210116-100958.csv at 2021-01-16 10:29:15.362161
Exporting 1578 column features
Finished. Processed 137353 rows in 0:04:42.650943, key_count=8


In [16]:
print(f'Finished at {datetime.now()}')

Finished at 2021-01-16 10:33:58.086008


### Read Locally Processed Features

In [17]:
start = datetime.now()

# Read back the test features written by the extraction step. dtype=np.float32 is applied to
# every column, so the CSV must be entirely numeric; it uses half the memory of float64.
X_test = pd.read_csv(X_test_filename_csv, dtype=np.float32)

# The value printed is a timedelta (h:mm:ss.ffffff), not a plain number of seconds.
print(f'Load Features (test) process took {datetime.now() - start} seconds.')

Load Features (test) process took 0:00:27.301587 seconds.


In [18]:
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.019735,-0.00537,-0.091668,-0.006344,0.025391,0.126787,0.044118,0.046884,0.283819,-0.171548
1,1.0,0.0,0.368421,0.33795,0.0,2.0,0.0,7.0,0.742677,1.326868,...,-0.057494,0.119669,0.054123,-0.000701,0.131447,0.040795,0.051894,-0.096756,0.383153,-0.083436
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.098697,-0.019443,0.094247,-0.115887,0.024181,0.037573,-0.059128,0.138309,0.283486,-0.017065
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.134933,-0.010362,0.107527,0.033554,0.076186,-0.060033,0.027952,0.027733,0.097994,-0.06379
4,1.0,0.0,0.02008,0.035741,0.0,2.0,0.0,5.0,96.521561,9.784149,...,-0.334515,-0.091502,-0.374657,-0.848212,-0.563223,0.551919,0.344227,0.100083,0.829832,0.208412


In [19]:
start = datetime.now()

# Read back the training features written by the extraction step. dtype=np.float32 is applied to
# every column, so the CSV must be entirely numeric; it uses half the memory of float64.
X_train = pd.read_csv(X_train_filename_csv, dtype=np.float32)

# The value printed is a timedelta (h:mm:ss.ffffff), not a plain number of seconds.
print(f'Load Features (train) process took {datetime.now() - start} seconds.')

Load Features (train) process took 0:01:24.170365 seconds.


In [20]:
X_train.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.009189,-0.063634,0.012111,0.00878,-0.044473,-0.001882,0.080658,-0.031486,0.134823,-0.094311
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.098068,0.07965,-0.066536,0.12793,0.109006,-0.03166,0.086386,-0.006723,0.346329,-0.07374
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.025405,0.213857,-0.167002,0.037465,0.493723,0.221739,0.409898,-0.434886,0.204949,-0.338935
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.038333,-0.048711,0.150525,0.056821,-0.141856,-0.051277,0.107752,-0.078185,0.430811,-0.156045
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.227349,0.102277,0.136173,-0.164556,0.19207,0.090296,0.029739,-0.045325,0.427401,-0.063003


In [21]:
start = datetime.now()

# Read back the validation features written by the extraction step. dtype=np.float32 is applied
# to every column, so the CSV must be entirely numeric; it uses half the memory of float64.
X_validation = pd.read_csv(X_validation_filename_csv, dtype=np.float32)

# The value printed is a timedelta (h:mm:ss.ffffff), not a plain number of seconds.
print(f'Load Features (validation) process took {datetime.now() - start} seconds.')

Load Features (validation) process took 0:00:27.088861 seconds.


In [22]:
X_validation.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,0.308491,0.217717,0.298681,-0.056398,-0.039623,0.395839,0.328203,-0.213725,0.359767,-0.045651
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.114152,0.337195,0.062903,0.014054,-0.001424,0.09023,0.317841,-0.256091,0.421869,-0.266941
2,1.0,0.0,0.25,0.1875,0.0,1.0,0.0,10.0,-0.666667,1.154701,...,0.169035,0.080378,-0.37094,-0.069732,0.029818,0.081608,0.005713,0.032234,0.053698,0.038257
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.025819,0.089296,-0.046308,-0.047762,0.044307,0.087926,0.116787,0.062737,0.188108,0.058473
4,1.0,0.0,0.09434,0.311855,0.0,4.0,0.0,5.0,42.275021,6.530585,...,0.034416,0.683127,0.061344,0.177629,-0.000436,-0.162283,0.56138,-0.234554,0.690598,-0.148711


## Impute NaN values with feature means

In [23]:
start = datetime.now()

# Per-feature means of the training split, held as a one-row frame (row label 0) whose
# columns are the feature names; the imputation cell below reads that row via .iloc[0].
train_columns_means = X_train.mean().to_frame().T

print(f'Transpose process took {datetime.now() - start} seconds.')

Transpose process took 0:00:03.999245 seconds.


In [24]:
start = datetime.now()

# Impute NaNs in all three splits with the TRAINING-set column means, so that no statistics
# computed from the validation or test data are used. The frames are modified in place.
X_train.fillna(train_columns_means.iloc[0], inplace=True)
X_validation.fillna(train_columns_means.iloc[0], inplace=True)
X_test.fillna(train_columns_means.iloc[0], inplace=True)

# Release the means frame. NOTE: after this the cell cannot be re-run on its own; the
# previous cell must be executed again first to recompute train_columns_means.
train_columns_means=None

print(f'FillNA process took {datetime.now() - start} seconds.')

FillNA process took 0:00:02.305554 seconds.


In [25]:
start = datetime.now()

# Persist the imputed feature frames as snappy-compressed Parquet. The paths are relative, so
# the files are written to the notebook's current working directory (not under ../data/).
X_train.to_parquet('train.parquet', engine='pyarrow', compression='snappy')
X_validation.to_parquet('validation.parquet', engine='pyarrow', compression='snappy')
X_test.to_parquet('test.parquet', engine='pyarrow', compression='snappy')

print(f'Save parquet process took {datetime.now() - start} seconds.')

Save parquet process took 0:00:58.368868 seconds.


## EXTRACT FEATURES TO CSV (**OLD** METHOD) 

### TEST SET (OLD METHOD)

In [None]:
from pympler import muppy, summary

In [None]:
# Optional memory report, gated by the `report_memory` flag set near the top of the notebook
# (described there as potentially slow).
if report_memory:
    all_objects = muppy.get_objects()
    sum1 = summary.summarize(all_objects)
    # Prints out a summary of the large objects
    summary.print_(sum1)

In [None]:
test_samples = pd.read_parquet('../data/raw/test_values.parquet')
test_labels = pd.read_parquet('../data/raw/test_labels.parquet')

In [None]:

test_samples_converted, y_test = convert_string_lists_to_lists(test_samples, test_labels, "values", "type")

In [None]:
# free memory
test_samples = None
test_labels = None

In [None]:
test_samples_converted.head()

In [None]:
# output "head" 
y_test[:5]

### Given that feature extraction can take a long time, we only take the first 100 samples.

In [None]:
y_test_subset = y_test[:100]

In [None]:
if report_memory:
    all_objects = muppy.get_objects()
    sum1 = summary.summarize(all_objects)
    # Prints out a summary of the large objects
    summary.print_(sum1)

In [None]:
# Old-method extraction, line-profiled on the first 100 test samples only.
# Requires the line_profiler extension (loaded earlier via `%load_ext line_profiler`).
from sherlock.features.preprocessing import extract_features

start = datetime.now()

# Un-profiled equivalent that writes to the real test work file instead of a throw-away CSV:
#extract_features(X_test_filename_csv, test_samples_converted.head(n=100))

# NOTE(review): the profiled statement assigns to X_test; if that assignment reaches the notebook
# namespace it replaces the X_test frame loaded in "Read Locally Processed Features" - confirm intent.
%lprun -m sherlock.features.preprocessing X_test=extract_features('deleteme.csv', test_samples_converted.head(n=100)) 

print(f'Extract Features (test) process took {datetime.now() - start} seconds.')

In [None]:
test_samples_converted = None

In [None]:
from sherlock.features.helpers import generate_chars_col
generate_chars_col()

In [None]:
import gc

gc.collect()

In [None]:
if report_memory:
    all_objects = muppy.get_objects()
    sum1 = summary.summarize(all_objects)
    # Prints out a summary of the large objects
    summary.print_(sum1)

In [None]:
# over all, without memory management
# Extract Features (test) process took 3:40:25.799880 seconds.

In [None]:
# Baseline
# Extract Features (test) process took 0:11:04.137081 seconds.

# Iterations
# Extract Features (test) process took 0:00:56.671353 seconds. (cache word embeddings)
# Extract Features (test) process took 0:00:13.523261 seconds. (cache Doc2Vec)
# Extract Features (test) process took 0:00:03.674007 seconds. (loads of tweaks, use np.array for stats)
# Extract Features (test) process took 0:00:03.262298 seconds. (manually compute counts)
# Extract Features (test) process took 0:00:02.853031 seconds. (replace series.str.count in BoW, use series.tolist() in paragraph vectors )
# Extract Features (test) process took 0:00:01.718025 seconds. (compute mean, variance, skew and kurtosis together)
# Extract Features (test) process took 0:00:01.437484 seconds. (compute sum, min and max together, nunique replaced with len(set(series))), use statistics.median not np.median)

### TRAIN SET (OLD METHOD)

In [None]:
train_samples = pd.read_parquet('../data/raw/train_values.parquet')
train_labels = pd.read_parquet('../data/raw/train_labels.parquet')

In [None]:
train_samples_converted, y_train = convert_string_lists_to_lists(train_samples, train_labels, "values", "type")

In [None]:
# free memory
train_samples = None
train_labels = None

In [None]:
y_train_subset = y_train[:100]

In [None]:
start = datetime.now()

# Old-method extraction for the training split. The output path is X_train_filename_csv,
# the timestamped work file defined near the top of the notebook; the previously used
# name X_train_filename is not defined anywhere and raised a NameError.
# NOTE: this is the same file the new-method TRAIN SET cell writes to.
extract_features(X_train_filename_csv, train_samples_converted)

print(f'Extract Features (train) process took {datetime.now() - start} seconds.')

In [None]:
train_samples_converted = None

### VALIDATION SET (OLD METHOD)

In [None]:
validation_samples = pd.read_parquet('../data/raw/val_values.parquet')
validation_labels = pd.read_parquet('../data/raw/val_labels.parquet')

In [None]:
validation_samples_converted, y_validation = convert_string_lists_to_lists(validation_samples, validation_labels, "values", "type")

In [None]:
# free memory
validation_samples = None
validation_labels = None

In [None]:
y_validation_subset = y_validation[:100]

In [None]:
start = datetime.now()

# Old-method extraction for the validation split. The output path is
# X_validation_filename_csv, the timestamped work file defined near the top of the
# notebook; the previously used name X_validation_filename is not defined anywhere and
# raised a NameError.
# NOTE: this is the same file the new-method VALIDATION SET cell writes to.
extract_features(X_validation_filename_csv, validation_samples_converted)

print(f'Extract Features (validation) process took {datetime.now() - start} seconds.')

In [None]:
validation_samples_converted = None