# Load Libraries

In [2]:
import h5py
import numpy as np
import pandas as pd

# Load Dataset

In [36]:
# Open the existing HDF5 file read-only; the handle is reused by the exploration cells below.
filename = "./data/large_data.hdf5"
dataset = h5py.File(filename, mode="r")


# Explore Dataset

In [37]:
dataset.keys()

<KeysViewHDF5 ['table']>

In [38]:
tab = dataset["table"]

In [39]:
tab.keys()

<KeysViewHDF5 ['columns']>

In [40]:
col = tab["columns"]

In [51]:
# Keep the keys view of the "columns" group. Despite the name, this holds the
# column *names* (f1..f4, target), not their values.
data_val = col.keys()

In [56]:
data_val

<KeysViewHDF5 ['f1', 'f2', 'f3', 'f4', 'target']>

In [57]:
list(data_val)

['f1', 'f2', 'f3', 'f4', 'target']

In [48]:
# Iterating the group directly yields its member names; print one per line.
for column_name in col:
    print(column_name)

f1
f2
f3
f4
target


In [64]:
col["f1"]

<HDF5 group "/table/columns/f1" (1 members)>

In [65]:
type(col["f1"])

h5py._hl.group.Group

In [69]:
col["f1"].keys()

<KeysViewHDF5 ['data']>

In [72]:
list(data_val)

['f1', 'f2', 'f3', 'f4', 'target']

In [78]:
col["f1"].keys()

<KeysViewHDF5 ['data']>

In [80]:
col["f1"]["data"]

<HDF5 dataset "data": shape (30000000,), type "<f8">

In [84]:
# Slicing with [:] reads the whole "data" dataset of f1 (30,000,000 float64 values)
# out of the file and into an in-memory NumPy array.
f1_val = col["f1"]["data"][:]

In [89]:
# Preview the variable name each column would get if it were unpacked individually.
for column_name in data_val:
    print(f"{column_name}_val")

f1_val
f2_val
f3_val
f4_val
target_val


In [91]:
# Read every column's full "data" array into memory, in the order given by data_val.
mylist = [col[name]["data"][:] for name in data_val]

In [96]:
mylist

[array([ 0.20393641, -1.19543194, -2.56027118, ..., -0.18381148,
        -0.71526345,  1.31346736]),
 array([-0.29119124,  0.06921412, -0.82784557, ...,  0.51125574,
         2.32002815, -1.02830094]),
 array([-0.61725063,  0.29186146,  0.40555111, ..., -1.24999891,
        -0.13065231, -0.42811914]),
 array([-1.46130047,  1.97251077, -0.13258803, ...,  0.56387117,
        -0.79473282, -1.87614419]),
 array([ 10.79665986,  11.15401848, -20.34268409, ...,   9.9548302 ,
         52.81310081, -49.97451745])]

In [97]:
# Now that we have seen our data, let's start working with the Vaex package and the HDF5 file format.

# Vaex Package

In [98]:
import vaex

In [102]:
# Reference: https://vaex.readthedocs.io/en/latest/guides/io.html

# Opening the file does not bring the entire dataset into memory; only the metadata is loaded.
df = vaex.open("./data/large_data.hdf5")

In [103]:
df.shape

(30000000, 5)

In [105]:
df.describe().T

Unnamed: 0,data_type,count,NA,mean,std,min,max
f1,float64,30000000,0,-0.0003258115779447,0.99999,-5.491886,5.500586
f2,float64,30000000,0,-0.0001060974338844,1.000074,-5.333572,5.449671
f3,float64,30000000,0,-1.3602340446046406e-05,0.999861,-5.312364,5.500811
f4,float64,30000000,0,-6.534419772419989e-05,1.000105,-5.237933,5.347441
target,float64,30000000,0,-0.00836397608058,31.311352,-169.434796,177.839871


In [106]:
# Vaex is fast because it goes to the HDF5 file and reads the metadata.

In [107]:
# Shuffle the rows before splitting. A fixed random_state makes the row order
# reproducible, so the split below sees the same input after every
# Restart & Run All instead of a different one each time.
# `df` is rebound on purpose because later cells inspect the shuffled frame;
# run this cell once per kernel session.
df = df.shuffle(random_state=42)
df_train, df_test = df.ml.train_test_split(test_size=0.2)



In [108]:
df_train.shape, df_test.shape

((24000000, 5), (6000000, 5))

In [109]:
from vaex.ml.sklearn import IncrementalPredictor
from sklearn.linear_model import SGDRegressor

# Input columns and the column to predict.
features = ["f1", "f2", "f3", "f4"]
target = "target"

# Seed the SGD estimator so that, given the same input order, repeated runs
# produce the same fitted model instead of a different one each time.
model = SGDRegressor(random_state=42)

# Wrap the estimator so it is fitted in batches of 500,000 rows rather than
# on one in-memory array.
vaex_model = IncrementalPredictor(
    features=features,
    target=target,
    model=model,
    batch_size=500_000,
)

# progress="widget" asks for a notebook progress bar while fitting.
vaex_model.fit(df=df_train, progress="widget")

HBox(children=(FloatProgress(value=0.0, max=1.0), Label(value='In progress...')))

In [111]:
# transform() returns the test frame with the model output added as a `prediction` column.
# NOTE(review): df_test is rebound to the transformed frame, so re-running this cell
# transforms an already-transformed frame — confirm that is harmless.
df_test = vaex_model.transform(df_test)
df_test.head()

#,f1,f2,f3,f4,target,prediction
0,1.41915,0.137176,1.52219,-0.562451,5.4802,3.37519
1,0.799657,2.02601,0.322278,-1.82846,48.2477,49.185
2,-1.03163,2.2024,-0.972632,-0.343627,43.7794,53.0593
3,0.818666,0.0447691,0.468079,-1.93543,-22.2781,1.40225
4,-0.302,-1.94349,-1.42975,1.86179,-57.0623,-47.3879
5,-1.86761,-1.79496,-0.415552,-1.68272,-58.0239,-43.1616
6,1.99184,1.64538,0.610103,-1.5712,33.8013,40.0033
7,1.37187,-1.2576,-1.0566,-0.622511,-24.269,-30.2555
8,-1.15243,0.646262,-0.990573,0.598763,-6.60637,15.3142
9,-1.81286,0.952314,-0.82758,-1.48375,32.3527,23.0908


In [112]:
from sklearn.metrics import r2_score, mean_absolute_error

# Pull the ground truth and the predictions out as arrays once, then score both metrics.
y_true = df_test["target"].values
y_pred = df_test["prediction"].values

print(r2_score(y_true, y_pred))
print(mean_absolute_error(y_true, y_pred))

0.5920069568303892
15.957512042934047


In [116]:
df.info(description=True)

column,type,unit,description,expression
f1,float64,,,
f2,float64,,,
f3,float64,,,
f4,float64,,,
target,float64,,,

#,f1,f2,f3,f4,target
0,1.419149077185487,0.1371756549486364,1.5221874429768134,-0.5624508212979337,5.480199993950429
1,0.7996566370269093,2.026006111649416,0.3222775507949805,-1.8284632262896627,48.24772882324523
2,-1.03163265018483,2.202399783225899,-0.9726322726708004,-0.3436267911465092,43.77938503226455
3,0.8186659055814459,0.0447691225882184,0.4680794597614,-1.93542936067231,-22.278119173889507
4,-0.3020004654796523,-1.943492371444181,-1.4297526811152097,1.8617866803930847,-57.06233011737409
...,...,...,...,...,...
29999995,0.1261115116615045,1.6754473663184832,-0.0601861186288427,-0.4582442252935049,45.77066780969077
29999996,-1.1563370971977232,1.1741822734726295,-0.2459905163366146,-0.6807595170945469,21.123397584888487
29999997,-0.932365462133778,2.07767862051482,-0.2659435175249801,-0.565129559274725,82.80325550066478
29999998,-0.3918029254513583,-0.9463686130523704,-0.9216856628471514,0.6702061169601413,-38.25973648516042


In [117]:
# Datatable
# Dask
# cuDF - using GPU
# Distributed platforms - Flink, Dask, Kafka, Apache Spark
# Getting data from SQL - https://www.youtube.com/watch?v=8Awk8CpfeGc
# Pandas using chunk size
# Pandas - 1 - 5 GB
# Pandas with Chunk size - 5 - 30 GB
# Dask - 30 - 200 GB
# Spark/PySpark - 1000 GB (1 TB) - 1000 TB (1 PB)