# Investigation of Methods of Storing Data in `numpy` Arrays on Disk

In [54]:
from sys import getsizeof

import tables
import numpy as np
from itertools import chain
from sklearn.linear_model import Perceptron
from sklearn.feature_extraction import (FeatureHasher,
                                        DictVectorizer)

from src import parse_non_nlp_features_string
from src.mongodb import connect_to_db
from src.experiments import (make_cursor,
                             get_data_point,
                             ExperimentalData)
from src.datasets import get_bin_ranges_helper

In [3]:
# Open a connection to the reviews MongoDB instance
MONGODB_HOST = 'localhost'
MONGODB_PORT = 37017
db = connect_to_db(host=MONGODB_HOST, port=MONGODB_PORT)

In [4]:
# Experiment configuration: reviews for "Dota_2", predicting the
# "total_game_hours" label with 3 bins and a bin factor of 6.0
game, label = 'Dota_2', 'total_game_hours'
nbins, bin_factor = 3, 6.0
# The ranges are hard-coded here; the commented-out call is presumably how
# they were originally derived from the database
#bin_ranges = get_bin_ranges_helper(db, [game], label, nbins, bin_factor)
bin_ranges = [
    (0.0, 382.2),
    (382.3, 2675.5),
    (2675.6, 16435.0),
]
print("bin_ranges = " + str(bin_ranges))

bin_ranges = [(0.0, 382.2), (382.3, 2675.5), (2675.6, 16435.0)]


In [5]:
# Build the experimental data set: a single training fold of 30 samples and
# no grid-search folds
data = ExperimentalData(
    db=db,
    prediction_label=label,
    games=[game],
    folds=1,
    fold_size=30,
    grid_search_folds=0,
    grid_search_fold_size=0,
    bin_ranges=bin_ranges)

In [6]:
# Now we have a set of IDs to work with, which point to samples in the
# dataset (`training_set` is a list holding one array of ID strings here,
# as the output below shows)
data.training_set

[array(['5690a60fe76db81bef5c4613', '5690a60fe76db81bef5c30ed',
        '5690a60fe76db81bef5c322d', '5690a60fe76db81bef5c49ce',
        '5690a60fe76db81bef5c28ce', '5690a60fe76db81bef5c44b0',
        '5690a60fe76db81bef5c4987', '5690a60fe76db81bef5c496c',
        '5690a60fe76db81bef5c3e3f', '5690a60fe76db81bef5c4b13',
        '5690a60fe76db81bef5c403e', '5690a60fe76db81bef5c3cec',
        '5690a60fe76db81bef5c2e06', '5690a60fe76db81bef5c4464',
        '5690a60fe76db81bef5c3fd2', '5690a60fe76db81bef5c3510',
        '5690a60fe76db81bef5c36bb', '5690a60fe76db81bef5c2f74',
        '5690a60fe76db81bef5c2f51', '5690a60fe76db81bef5c4479',
        '5690a60fe76db81bef5c4ad5', '5690a60fe76db81bef5c47ff',
        '5690a60fe76db81bef5c3f2a', '5690a60fe76db81bef5c3872',
        '5690a60fe76db81bef5c2a3f', '5690a60fe76db81bef5c4465',
        '5690a60fe76db81bef5c2750', '5690a60fe76db81bef5c46c6',
        '5690a60fe76db81bef5c2aba', '5690a60fe76db81bef5c3973'], 
       dtype='<U24')]

In [8]:
# Turn the single training-set array into a plain list of IDs and preview
# the first ten
training_ids = data.training_set[0]
data_ids = list(training_ids)
data_ids[:10]

['5690a60fe76db81bef5c4613',
 '5690a60fe76db81bef5c30ed',
 '5690a60fe76db81bef5c322d',
 '5690a60fe76db81bef5c49ce',
 '5690a60fe76db81bef5c28ce',
 '5690a60fe76db81bef5c44b0',
 '5690a60fe76db81bef5c4987',
 '5690a60fe76db81bef5c496c',
 '5690a60fe76db81bef5c3e3f',
 '5690a60fe76db81bef5c4b13']

In [9]:
# For our features, we will use all of the NLP features + the review/reviewer
# attributes that are not directly related to the label ("total_game_hours")
# ("all" presumably selects every available non-NLP attribute; the result
# prints as a set of attribute names)
non_nlp_feature_set_labels = parse_non_nlp_features_string("all", label)
print(non_nlp_feature_set_labels)

{'friend_player_level', 'num_screenshots', 'num_found_unhelpful', 'found_helpful_percentage', 'num_games_owned', 'num_comments', 'num_voted_helpfulness', 'num_achievements_possible', 'num_guides', 'num_found_helpful', 'num_badges', 'num_achievements_attained', 'num_groups', 'num_found_funny', 'num_achievements_percentage', 'num_workshop_items', 'num_reviews', 'num_friends'}


In [10]:
# Fetch every selected document and split it into its label ('y') and its
# feature dict ('x'); documents that yield an empty sample are skipped
# NOTE(review): when 'y'/'x' is missing, `.get` falls back to the whole
# sample object -- confirm that this default is intended
labels = []
feature_dicts = []
cursor = make_cursor(db, projection={'_id': 0}, id_strings=data_ids)
for doc in cursor:
    sample = get_data_point(doc, prediction_label=label,
                            non_nlp_features=non_nlp_feature_set_labels,
                            bin_ranges=bin_ranges)
    if sample:
        labels.append(sample.get('y', sample))
        feature_dicts.append(sample.get('x', sample))
y = np.array(labels)
X = np.array(feature_dicts)

In [11]:
# Example labels: the first ten entries of `y` (presumably bin indices,
# 1..3 for nbins = 3 -- confirm)
y[:10]

array([2, 3, 2, 1, 2, 1, 1, 2, 1, 2])

In [12]:
# Example features (sparse format): each sample is a dict mapping a feature
# name to its value; show ten entries of the first sample
list(X[0].items())[:10]

[('yea awk', 1),
 ('stupidity', 1),
 ('badge ,', 1),
 ('legislation that', 1),
 ('other features', 1),
 ('thana', 1),
 ('would like', 1),
 ('ranked matches', 1),
 ('in comments', 1),
 ('my virgin', 1)]

In [13]:
# Now we'll make the vectorizer objects (actually, two vectorizers, one a
# `DictVectorizer` and the other a `FeatureHasher`), to be fit with `X`
# The `FeatureHasher` uses 2**18 = 262144 hashed feature columns
dict_vec = DictVectorizer(sparse=True)
feature_hasher_vec = FeatureHasher(n_features=2**18, non_negative=True)

In [14]:
# Fit both vectorizers on the feature dicts in `X`
# NOTE(review): per the scikit-learn docs `FeatureHasher` is stateless, so
# its `fit` should be a no-op kept for API compatibility -- confirm
dict_vec.fit(X)
feature_hasher_vec.fit(X)

FeatureHasher(dtype=<class 'numpy.float64'>, input_type='dict',
       n_features=262144, non_negative=True)

In [15]:
# Now we must transform `X` with the vectorizers to get the sparse `scipy`
# matrices (both come back as CSR matrices, as checked below)
X_dict_vectorized = dict_vec.transform(X)
X_feature_hasher_vectorized = feature_hasher_vec.transform(X)

In [16]:
# Inspect the `DictVectorizer` output (shape, dtype, and sparse format)
X_dict_vectorized

<30x150447 sparse matrix of type '<class 'numpy.float64'>'
	with 4223615 stored elements in Compressed Sparse Row format>

In [17]:
# Check the concrete type produced by `DictVectorizer`
type(X_dict_vectorized)

scipy.sparse.csr.csr_matrix

In [18]:
# Check the concrete type produced by `FeatureHasher`
type(X_feature_hasher_vectorized)

scipy.sparse.csr.csr_matrix

In [22]:
# CSR row-pointer array of the `DictVectorizer` matrix; its last entry equals
# the number of stored elements reported above
X_dict_vectorized.indptr

array([      0,  140430,  280869,  421947,  562705,  706627,  847056,
        987834, 1128189, 1268661, 1409102, 1549611, 1689989, 1830817,
       1971240, 2111773, 2252133, 2392525, 2532868, 2673419, 2814388,
       2954836, 3096452, 3237277, 3378794, 3521468, 3662042, 3802450,
       3942804, 4083194, 4223615], dtype=int32)

In [52]:
# CSR row-pointer array of the `FeatureHasher` matrix, for comparison
X_feature_hasher_vectorized.indptr

array([      0,  108600,  217212,  326169,  434956,  545560,  654160,
        762970,  871527,  980153, 1088762, 1197416, 1305980, 1414819,
       1523405, 1632054, 1740618, 1849192, 1957736, 2066414, 2175328,
       2283926, 2393221, 2502043, 2611281, 2721157, 2829832, 2938418,
       3046972, 3155548, 3264138], dtype=int32)

## Use `pytables` to Store `scipy` Arrays to Disk

- `scipy` sparse arrays unfortunately cannot be stored with `pytables`; however, they can be converted to dense arrays and then stored

In [44]:
# Open new empty HDF5 files in write mode, one per vectorizer output
X_dict_vectorized_dense_file = tables.open_file("X_dict_vectorized_dense.h5", mode="w")
X_feature_hasher_dense_file = tables.open_file("X_feature_hasher_dense.h5", mode="w")

In [45]:
# Get the root group of each file; the arrays are attached beneath these
root_dict_vectorized = X_dict_vectorized_dense_file.root
root_feature_hasher_vectorized = X_feature_hasher_dense_file.root

In [46]:
# Densify each sparse matrix and write it as an Array node under the root
# group of its own HDF5 file
X_dict_vectorized_dense_hdf_array = X_dict_vectorized_dense_file.create_array(
    root_dict_vectorized,
    'X_dict_vectorized_dense',
    obj=X_dict_vectorized.todense(),
    title="X dict vectorized dense")
X_feature_hasher_dense_hdf_array = X_feature_hasher_dense_file.create_array(
    root_feature_hasher_vectorized,
    'X_feature_hasher_dense',
    obj=X_feature_hasher_vectorized.todense(),
    title="X feature hasher vectorized dense")

In [53]:
# List the HDF5 files in the working directory with human-readable sizes
! ls -lh *h5

-rw-rw-r-- 1 mmulholland mmulholland 35M Feb 29 22:24 X_dict_vectorized_dense.h5
-rw-rw-r-- 1 mmulholland mmulholland 61M Feb 29 22:24 X_feature_hasher_dense.h5


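- So, storing even a relatively small 30-sample dataset vectorized with `DictVectorizer` in a dense format takes 35 MB
- Surprisingly, storing the same dataset vectorized with `FeatureHasher`, which is supposed to be memory-efficient, requires even more disk space (61 MB) in a dense format. The reason is the shape: a dense `float64` array costs 8 bytes per cell regardless of sparsity, so 30 × 262,144 hashed columns (≈ 60 MiB) is larger than 30 × 150,447 dictionary columns (≈ 34 MiB)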
- Let's see what we can do with the array that's saved

In [50]:
# Inspect the metadata of the stored `DictVectorizer` array node
X_dict_vectorized_dense_hdf_array

/X_dict_vectorized_dense (Array(30, 150447)) 'X dict vectorized dense'
  atom := Float64Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

In [51]:
# Inspect the metadata of the stored `FeatureHasher` array node
X_feature_hasher_dense_hdf_array

/X_feature_hasher_dense (Array(30, 262144)) 'X feature hasher vectorized dense'
  atom := Float64Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

- Can a model be trained with `pytables` data?

In [56]:
# Perceptron with default hyperparameters, trained below on the
# `DictVectorizer` data
perc1 = Perceptron()

In [58]:
# Fit directly on the `pytables` Array node instead of a `numpy` array
perc1.fit(X_dict_vectorized_dense_hdf_array, y)

Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)

In [60]:
# Predict on the same samples the model was fit on (a smoke test, not a
# held-out evaluation)
perc1.predict(X_dict_vectorized_dense_hdf_array)

array([1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2])

In [59]:
# Second default Perceptron, fit on the `FeatureHasher` Array node
perc2 = Perceptron()
perc2.fit(X_feature_hasher_dense_hdf_array, y)

Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)

In [62]:
# Predict on the same samples the model was fit on (a smoke test, not a
# held-out evaluation)
perc2.predict(X_feature_hasher_dense_hdf_array)

array([2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 3, 2, 2, 1, 2, 2])

- Indeed, it seems that `pytables` data can be used as a drop-in replacement for non-sparse `numpy` arrays (or `todense`-converted sparse `scipy` arrays as generated via `DictVectorizer`/`FeatureHasher`)

In [63]:
# Close both HDF5 files
X_dict_vectorized_dense_file.close()
X_feature_hasher_dense_file.close()

- To use `pytables` to increase memory efficiency, a possible algorithm would be the following:
    - Extract and vectorize data
    - Use `todense` to make the arrays dense
    - Save the data to an `hdf5` file with `pytables`
    - Remove the original data so that it gets garbage-collected
    - Use `pytables` arrays in place of data wherever needed
    - Remove the `hdf5` files after complete

- Note that it is possible to create enlargeable arrays with `pytables`, so it's possible that an array file could be generated, saved, and closed, and then reopened, enlarged, and stored again.

- Now that the `hdf5` files have been created, arrays saved to them, and then closed, let's try to read in the data again and use it

In [65]:
# Reopen both files; no `mode` argument is passed, so the `open_file`
# default applies
X_dict_vectorized_dense_file = tables.open_file("X_dict_vectorized_dense.h5")
X_feature_hasher_dense_file = tables.open_file("X_feature_hasher_dense.h5")

In [73]:
# Retrieve the stored `DictVectorizer` array by node name from the root group
X_dict_vectorized_dense_hdf_array = \
    X_dict_vectorized_dense_file.root.X_dict_vectorized_dense

In [74]:
# Inspect the node read back from the reopened file
X_dict_vectorized_dense_hdf_array

/X_dict_vectorized_dense (Array(30, 150447)) 'X dict vectorized dense'
  atom := Float64Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

In [75]:
# Retrieve the stored `FeatureHasher` array by node name from the root group
X_feature_hasher_dense_hdf_array = \
    X_feature_hasher_dense_file.root.X_feature_hasher_dense

In [76]:
# Inspect the node read back from the reopened file
X_feature_hasher_dense_hdf_array

/X_feature_hasher_dense (Array(30, 262144)) 'X feature hasher vectorized dense'
  atom := Float64Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

In [78]:
# Refit the existing `perc1` on the array read back from the reopened file
perc1.fit(X_dict_vectorized_dense_hdf_array, y)

Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)

In [79]:
# Refit the existing `perc2` on the array read back from the reopened file
perc2.fit(X_feature_hasher_dense_hdf_array, y)

Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)

In [80]:
# Close both HDF5 files again
X_dict_vectorized_dense_file.close()
X_feature_hasher_dense_file.close()

### Using Compressed `pytables`
- `pytables` also exposes a `CArray` type that compresses the data
- A number of compression algorithms are provided, including `zlib`, `blosc`, and `lzo`
- Furthermore, memory can be optimized using `HDF5`'s ability to handle in-memory processing via the `H5FD_CORE` driver

In [92]:
def _store_compressed(h5_file, node_name, sparse_matrix, compression_filters):
    """Create a CArray under the file's root group and fill it.

    The node takes its atom type and shape from `sparse_matrix`, and its
    contents are the densified matrix. Returns the new CArray node.
    """
    carray = h5_file.create_carray(h5_file.root,
                                   node_name,
                                   tables.Atom.from_dtype(sparse_matrix.dtype),
                                   shape=sparse_matrix.shape,
                                   filters=compression_filters)
    carray[:] = sparse_matrix.todense()
    return carray


# This time a single file stores both arrays, compressed with `blosc` at
# compression level 5
X_compressed_file = tables.open_file("X_compressed.h5", mode="w")
filters = tables.Filters(complevel=5, complib='blosc')
X_dict_vectorized_CArray = _store_compressed(X_compressed_file,
                                             'X_dict_vectorized_CArray',
                                             X_dict_vectorized,
                                             filters)
X_feature_hasher_CArray = _store_compressed(X_compressed_file,
                                            'X_feature_hasher_CArray',
                                            X_feature_hasher_vectorized,
                                            filters)

In [93]:
# Close the compressed file before it is reopened for reading below
X_compressed_file.close()

- Now the size of `X_compressed.h5`, which contains both arrays, is only 8.6 MB!

In [97]:
# Let's read the data back from the file, opened read-only with the
# `H5FD_CORE` driver, and train a model with it
X_compressed_file = tables.open_file("X_compressed.h5", mode='r', driver='H5FD_CORE')

In [98]:
# Look up both compressed arrays by node name on the reopened file. Each
# name must point at its own node: the FeatureHasher array is stored under
# 'X_feature_hasher_CArray', not under the DictVectorizer node.
X_dict_vectorized_CArray = X_compressed_file.root.X_dict_vectorized_CArray
X_feature_hasher_CArray = X_compressed_file.root.X_feature_hasher_CArray

In [99]:
# Train one fresh Perceptron per compressed array: `perc1` on the
# DictVectorizer data and `perc2` on the FeatureHasher data. Each model is
# fit exactly once so neither overwrites the other's weights.
perc1 = Perceptron()
perc2 = Perceptron()
perc1.fit(X_dict_vectorized_CArray, y)
perc2.fit(X_feature_hasher_CArray, y)

Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)

In [100]:
# Close the compressed file now that training is done
X_compressed_file.close()