# Investigation of Methods of Storing Data in `numpy` Arrays on Disk

In [1]:
from sys import getsizeof

import tables
import numpy as np
from itertools import chain
from sklearn.linear_model import Perceptron
from sklearn.feature_extraction import (FeatureHasher,
                                        DictVectorizer)

from src import parse_non_nlp_features_string
from src.mongodb import connect_to_db
from src.experiments import (make_cursor,
                             get_data_point,
                             ExperimentalData)
from src.datasets import get_bin_ranges_helper

In [2]:
# Connect to reviews database
# NOTE: this expects a Mongo server listening on localhost:37017. The saved
# output of this cell shows a `ConnectionFailure` ("Connection refused"), so
# presumably the cells below that use `db` cannot run without that server.
db = connect_to_db(host='localhost', port=37017)

ERROR:src.mongodb:Unable to connect client to Mongo server at mongodb://localhost:37017.


ConnectionFailure: [Errno 111] Connection refused

In [None]:
# Experiment settings: reviews for "Dota_2", predicting "total_game_hours",
# with the number of bins set to 3 and the bin factor set to 6.0
game, label = 'Dota_2', 'total_game_hours'
nbins, bin_factor = 3, 6.0

# The bin ranges are hard-coded here; the commented-out call below is the
# database-backed alternative that was used previously
#bin_ranges = get_bin_ranges_helper(db, [game], label, nbins, bin_factor)
bin_ranges = [
    (0.0, 382.2),
    (382.3, 2675.5),
    (2675.6, 16435.0),
]
print("bin_ranges = {}".format(bin_ranges))

In [None]:
# Build the experimental data set for the chosen game and label, requesting a
# single fold of size 30 and no grid-search folds (both grid-search settings
# are 0).
# NOTE(review): the exact meaning of the fold arguments is defined by
# `ExperimentalData` in `src.experiments` -- confirm there.
data = ExperimentalData(db=db, prediction_label=label, games=[game],
                        folds=1, fold_size=30, grid_search_folds=0,
                        grid_search_fold_size=0, bin_ranges=bin_ranges)

In [None]:
# Now we have a set of IDs to work with, which point to samples in the
# dataset
data.training_set

In [None]:
data_ids = list(data.training_set[0])
data_ids[:10]

In [None]:
# For our features, we will use all of the NLP features + the review/reviewer
# attributes that are not directly related to the label ("total_game_hours")
non_nlp_feature_set_labels = parse_non_nlp_features_string("all", label)
print(non_nlp_feature_set_labels)

In [None]:
# Fetch the documents for the selected IDs and collect, for each sample, its
# label (`y`) and its features (`x`)
y_values, x_values = [], []
for doc in make_cursor(db, projection={'_id': 0}, id_strings=data_ids):
    sample = get_data_point(
        doc,
        prediction_label=label,
        non_nlp_features=non_nlp_feature_set_labels,
        bin_ranges=bin_ranges,
    )
    # A falsy result means there is nothing to collect for this document
    if sample:
        # If a key is absent, `.get` falls back to the sample itself
        y_values.append(sample.get('y', sample))
        x_values.append(sample.get('x', sample))
y = np.array(y_values)
X = np.array(x_values)

In [None]:
# Example labels
y[:10]

In [None]:
# Example features (sparse format)
list(X[0].items())[:10]

In [None]:
# Now we'll make two vectorizer objects (one `DictVectorizer` and one
# `FeatureHasher`), which will then be fit with `X`
dict_vec = DictVectorizer(sparse=True)
# The hasher is given a fixed output width of 2**18 (262,144) columns.
# NOTE(review): `non_negative` was deprecated and later removed from
# scikit-learn's `FeatureHasher`; on newer versions this call will presumably
# raise a `TypeError` -- confirm against the installed version.
feature_hasher_vec = FeatureHasher(n_features=2**18, non_negative=True)

In [None]:
dict_vec.fit(X)
feature_hasher_vec.fit(X)

In [None]:
# Now we must transform `X` with the vectorizers to get the sparse scipy arrays
X_dict_vectorized = dict_vec.transform(X)
X_feature_hasher_vectorized = feature_hasher_vec.transform(X)

In [None]:
X_dict_vectorized

In [None]:
type(X_dict_vectorized)

In [None]:
type(X_feature_hasher_vectorized)

In [None]:
X_dict_vectorized.indptr

In [None]:
X_feature_hasher_vectorized.indptr

## Use `pytables` to Store `scipy` Arrays to Disk

- `scipy` sparse arrays unfortunately cannot be stored with `pytables`; however, they can be converted to dense arrays and then stored

In [None]:
# Open new empty HDF5 files, one per vectorizer. Mode "w" creates each file
# (replacing any existing file of the same name). Both handles stay open until
# the explicit `close()` calls in a later cell.
X_dict_vectorized_dense_file = tables.open_file("X_dict_vectorized_dense.h5", mode="w")
X_feature_hasher_dense_file = tables.open_file("X_feature_hasher_dense.h5", mode="w")

In [None]:
# Get the root groups
root_dict_vectorized = X_dict_vectorized_dense_file.root
root_feature_hasher_vectorized = X_feature_hasher_dense_file.root

In [None]:
# Densify each sparse matrix and write it as an array node under the root
# group of its own HDF5 file
X_dict_vectorized_dense_hdf_array = X_dict_vectorized_dense_file.create_array(
    root_dict_vectorized,
    'X_dict_vectorized_dense',
    obj=X_dict_vectorized.todense(),
    title="X dict vectorized dense",
)
X_feature_hasher_dense_hdf_array = X_feature_hasher_dense_file.create_array(
    root_feature_hasher_vectorized,
    'X_feature_hasher_dense',
    obj=X_feature_hasher_vectorized.todense(),
    title="X feature hasher vectorized dense",
)

In [None]:
! ls -lh X*_dense.h5

- So, to store even a relatively small 30-sample dataset vectorized with `DictVectorizer` in a dense format, it can be 35 MB
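- Surprisingly, storing the same dataset vectorized with `FeatureHasher`, which is supposed to be memory-efficient, requires even more space (61 MB) in a dense format. This follows from densification: the hasher was created with `n_features=2**18`, so the dense array holds 30 × 262,144 values no matter how many of them are zero (roughly 60 MB, assuming 8-byte floats)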
- Let's see what we can do with the array that's saved

In [None]:
X_dict_vectorized_dense_hdf_array

In [None]:
X_feature_hasher_dense_hdf_array

- Can a model be trained with `pytables` data?

In [None]:
perc1 = Perceptron()

In [None]:
perc1.fit(X_dict_vectorized_dense_hdf_array, y)

In [None]:
perc1.predict(X_dict_vectorized_dense_hdf_array)

In [None]:
perc2 = Perceptron()
perc2.fit(X_feature_hasher_dense_hdf_array, y)

In [None]:
perc2.predict(X_feature_hasher_dense_hdf_array)

- Indeed, it seems that `pytables` data can be used as a drop-in replacement for non-sparse `numpy` arrays (or `todense`-converted sparse `scipy` arrays as generated via `DictVectorizer`/`FeatureHasher`)

In [None]:
X_dict_vectorized_dense_file.close()
X_feature_hasher_dense_file.close()

- To use `pytables` to increase memory efficiency, a possible algorithm would be the following:
    - Extract and vectorize data
    - Use `todense` to make the arrays dense
    - Save the data to an `hdf5` file with `pytables`
    - Remove the original data so that it gets garbage-collected
    - Use `pytables` arrays in place of data wherever needed
    - Remove the `hdf5` files after complete

- Note that it is possible to create enlargeable arrays with `pytables`, so an array file could be generated, saved, and closed, and then reopened, enlarged, and stored again.

- Now that the `hdf5` files have been created, arrays saved to them, and then closed, let's try to read in the data again and use it

In [None]:
# Reopen the two HDF5 files written earlier. No mode is given, so PyTables'
# default mode (read-only) applies.
X_dict_vectorized_dense_file = tables.open_file("X_dict_vectorized_dense.h5")
X_feature_hasher_dense_file = tables.open_file("X_feature_hasher_dense.h5")

In [None]:
X_dict_vectorized_dense_hdf_array = \
    X_dict_vectorized_dense_file.root.X_dict_vectorized_dense

In [None]:
X_dict_vectorized_dense_hdf_array

In [None]:
X_feature_hasher_dense_hdf_array = \
    X_feature_hasher_dense_file.root.X_feature_hasher_dense

In [None]:
X_feature_hasher_dense_hdf_array

In [None]:
perc1.fit(X_dict_vectorized_dense_hdf_array, y)

In [None]:
perc2.fit(X_feature_hasher_dense_hdf_array, y)

In [None]:
X_dict_vectorized_dense_file.close()
X_feature_hasher_dense_file.close()

### Using Compressed `pytables`
- `pytables` also exposes a `CArray` type that compresses the data
- A number of compression algorithms are provided, including `zlib`, `blosc`, and `lzo`
- Furthermore, memory can be optimized using `HDF5`'s ability to handle in-memory processing via the `H5FD_CORE` driver

In [None]:
# This time a single HDF5 file holds both arrays, each stored as a `CArray`
# compressed with `blosc` at compression level 5
X_compressed_file = tables.open_file("X_compressed.h5", mode="w")
filters = tables.Filters(complevel=5, complib='blosc')

# Create each on-disk array with the dtype and shape of the corresponding
# sparse matrix, then fill it with the densified values
X_dict_vectorized_CArray = X_compressed_file.create_carray(
    X_compressed_file.root,
    'X_dict_vectorized_CArray',
    atom=tables.Atom.from_dtype(X_dict_vectorized.dtype),
    shape=X_dict_vectorized.shape,
    filters=filters,
)
X_dict_vectorized_CArray[:] = X_dict_vectorized.todense()

X_feature_hasher_CArray = X_compressed_file.create_carray(
    X_compressed_file.root,
    'X_feature_hasher_CArray',
    atom=tables.Atom.from_dtype(X_feature_hasher_vectorized.dtype),
    shape=X_feature_hasher_vectorized.shape,
    filters=filters,
)
X_feature_hasher_CArray[:] = X_feature_hasher_vectorized.todense()

In [None]:
X_compressed_file.close()

- Now the size of `X_compressed.h5`, which contains both arrays, is only 8.6 MB!

In [None]:
# Let's read in the data from the table with the `H5FD_CORE` driver and train
# a model with it
X_compressed_file = tables.open_file("X_compressed.h5", mode='r', driver='H5FD_CORE')

In [None]:
# Look up both compressed arrays by node name under the file's root group
X_dict_vectorized_CArray = X_compressed_file.root.X_dict_vectorized_CArray
# Fixed: this previously read `root.X_dict_vectorized_CArray` as well, so the
# "feature hasher" variable silently aliased the `DictVectorizer` data
X_feature_hasher_CArray = X_compressed_file.root.X_feature_hasher_CArray

In [None]:
# Train one fresh model per vectorized representation
perc1 = Perceptron()
perc2 = Perceptron()
perc1.fit(X_dict_vectorized_CArray, y)
# Fixed: `perc2` was created but never fit; `perc1` was being refit on the
# second array instead
perc2.fit(X_feature_hasher_CArray, y)

In [None]:
X_compressed_file.close()