# Opening large files with Dask

## json

In [1]:
import dask.dataframe as dd

In [2]:
df_from_json = dd.read_json("data/yelp_academic_dataset_covid_features.json", orient="records", lines=True)

In [3]:
print(df_from_json.compute())

                   business_id highlights delivery or takeout Grubhub enabled  \
0       9kXRUIkwdDtnAPO6tVo51g      FALSE               FALSE           FALSE   
1       H6D5HOTfMjrZt7r1EObZ1g      FALSE               FALSE           FALSE   
2       FYddq7fUtzobZcw4jOJgVA      FALSE               FALSE           FALSE   
3       c75jLTjlgA9q3gImLEGT6w      FALSE               FALSE           FALSE   
4       YfzPiY50h_10Sjlg3mnNWQ      FALSE               FALSE           FALSE   
...                        ...        ...                 ...             ...   
209790  PAuSe4-JzDLq6YO6OKPeTg      FALSE               FALSE           FALSE   
209791  GQEe9qAe2alEA1zzRzUr0w      FALSE               FALSE           FALSE   
209792  T-PWmMvktw0nMNO4N5XLXA      FALSE               FALSE           FALSE   
209793  stVu6q5hDxQuppwoo6m0EQ      FALSE               FALSE           FALSE   
209794  UYig-qxG9zpzuMKns-1tDg      FALSE               FALSE           FALSE   

       Call To Action enabl

## Parquet

In [4]:
# Use engine = "fastparquet" or "pyarrow" (faster for some operations)
#df_parquet = read_parquet("", engine="fastparquet")
#print(df_parquet.compute())

## HDF5

In [5]:
import h5py
import numpy as np
import dask.array as da

In [6]:
# Create a large HDF5 dataset
FILE_PATH = "data/large_data.h5"

with h5py .File(FILE_PATH, 'w') as f:
    f.create_dataset("dataset", data=np.random.rand(10000,10000))

# Read it
h5_file = h5py.File(FILE_PATH, 'r')

# Convert the Numpy array in the dataset into a dask array
dask_array = da.from_array(h5_file["dataset"], chunks = (1000,1000)) # divide it into chunks

# Perfrom computations
mean_value = dask_array.mean()
mean_value.compute()

np.float64(0.499994250263206)