## Examples

### SETUP

In [1]:
import deeplake
import os
from dotenv import load_dotenv

# Read settings from the .env file, then pick up the two credentials from the environment
load_dotenv(override=True)
open_api_key = os.environ.get("OPENAI_API_KEY")
activeloop_token = os.environ.get("ACTIVELOOP_TOKEN")

# Explicit export of the credentials (left disabled)
# os.environ['OPENAI_API_KEY'] = open_api_key
# os.environ['ACTIVELOOP_TOKEN'] = activeloop_token

# User-defined constants: hub locations of the example datasets ...
DATASET_PATH_1 = "hub://pavelkloscz/twitter_algorithm_twml_2"
DATASET_PATH_2 = "hub://pavelkloscz/twitter_algorithm"
DATASET_PATH_3 = "hub://pavelkloscz/val2017-100"
DATASET_PATH_4 = "hub://pavelkloscz/ds-scifact"
# ... and the local directories they are copied into (size notes as recorded by the author)
LOCAL_PATH_1 = "./twitter_algorithm_twml_2"  # 361 kB (40 files, 25 directories)
LOCAL_PATH_2 = "./twitter_algorithm"  # 147 kB (40 files, 25 directories)
LOCAL_PATH_3 = "./val2017-100"  # 15.1 MB (71 files, 50 directories)
LOCAL_PATH_4 = "./ds-scifact"  # 38.1 MB (40 files, 25 directories)

# NIH Chest X-rays (over 112,000 chest X-ray images from more than 30,000 unique patients)
# NIH Chest X-Ray (DeepLake):
# - https://app.activeloop.ai/activeloop/nih-chest-xray-train
# NIH Chest X-Ray (Kaggle):
# - https://www.kaggle.com/datasets/nih-chest-xrays/data
DATASET_PATH_5 = "hub://activeloop/nih-chest-xray-train"
# NOTE(review): the original size note here ("38,1 MB (40 files, 25 directories)") was
# identical to LOCAL_PATH_4's and looks copy-pasted — confirm the real size.
LOCAL_PATH_5 = "./nih-chest-xray-train"

MODEL_GPT = "gpt-4o-mini"



### Download dataset from DeepLake hub and save it locally

In [2]:
# Source dataset on the DeepLake hub and the local directory it is copied into
dataset_path = DATASET_PATH_1
local_path = LOCAL_PATH_1

# Download the dataset from hub to local storage
deeplake.copy(
    src=dataset_path,
    dest=local_path,
    overwrite=True,  # Set to False if you want to avoid overwriting existing data
)

# Load the local dataset to verify
local_dataset = deeplake.load(local_path)
print(f"Dataset downloaded from {dataset_path} and saved to {local_path}")
# summary() prints the tensor table itself and returns None; wrapping it in
# print() added a stray "None" line to the cell output.
local_dataset.summary()

./twitter_algorithm_twml_2 loaded successfully.


Evaluating copy transform: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:04<00:00

./twitter_algorithm_twml_2 loaded successfully.

Dataset downloaded from hub://pavelkloscz/twitter_algorithm_twml_2 and saved to ./twitter_algorithm_twml_2
Dataset(path='./twitter_algorithm_twml_2', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (24, 1536)  float32   None   
    id        text      (24, 1)      str     None   
 metadata     json      (24, 1)      str     None   
   text       text      (24, 1)      str     None   
None





In [3]:
# Access specific tensors
texts = local_dataset.text.numpy()
embeddings = local_dataset.embedding.numpy()

In [4]:
# Sanity check: number of text samples and number of embeddings, one per line
print(len(texts), len(embeddings), sep="\n")

24
24


In [5]:
# Show the first text sample; per the saved output it prints as a one-element list
# holding the full source text, so the output is long.
print(texts[0])

['# pylint: disable=wildcard-import\r\n"""\r\nThis module contains the ``tf.layers.Layer`` subclasses implemented in twml.\r\nLayers are used to instantiate common subgraphs.\r\nTypically, these layers are used when defining a ``build_graph_fn``\r\nfor the ``twml.trainers.Trainer``.\r\n"""\r\n\r\nfrom .batch_prediction_tensor_writer import BatchPredictionTensorWriter  # noqa: F401\r\nfrom .batch_prediction_writer import BatchPredictionWriter  # noqa: F401\r\nfrom .data_record_tensor_writer import DataRecordTensorWriter  # noqa: F401\r\nfrom .full_dense import full_dense, FullDense  # noqa: F401\r\nfrom .full_sparse import full_sparse, FullSparse  # noqa: F401\r\nfrom .isotonic import Isotonic  # noqa: F401\r\nfrom .layer import Layer  # noqa: F401\r\nfrom .mdl import MDL  # noqa: F401\r\nfrom .partition import Partition  # noqa: F401\r\nfrom .percentile_discretizer import PercentileDiscretizer  # noqa: F401\r\nfrom .sequential import Sequential  # noqa: F401\r\nfrom .sparse_max_norm im

In [6]:
# Show the first embedding vector (the display is abbreviated with "..." in the saved output)
print(embeddings[0])

[-0.00466384  0.01501529 -0.02373155 ... -0.00217907 -0.01788753
 -0.03816387]


## FUNCTIONS

In [3]:
def copy_dataset(dataset_path, local_path, overwrite=True):
    """Copy a DeepLake dataset to local storage, reload it, and return the local copy.

    Args:
        dataset_path: Source location handed to ``deeplake.copy`` as ``src``
            (the notebook uses ``hub://...`` paths).
        local_path: Destination handed to ``deeplake.copy`` as ``dest``; the
            dataset is then re-opened from this path.
        overwrite: Forwarded to ``deeplake.copy``. Defaults to ``True``, which
            matches the previously hard-coded behaviour; set to ``False`` to
            avoid overwriting existing data.

    Returns:
        The dataset object returned by ``deeplake.load(local_path)``.
    """
    # Download the dataset from hub to local storage
    deeplake.copy(
        src=dataset_path,
        dest=local_path,
        overwrite=overwrite,
    )

    # Load the local dataset to verify
    local_dataset = deeplake.load(local_path)
    print(f"Dataset downloaded from {dataset_path} and saved to {local_path}")
    # summary() prints the tensor table itself and returns None; wrapping it in
    # print() added a stray "None" line to the output.
    local_dataset.summary()

    return local_dataset

## TRY-CHECK

In [8]:
# Copy the dataset at DATASET_PATH_2 into LOCAL_PATH_2 and keep the reloaded local copy
dataset_2 = copy_dataset(DATASET_PATH_2, LOCAL_PATH_2)

./twitter_algorithm loaded successfully.


Evaluating copy transform: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:05<00:00

./twitter_algorithm loaded successfully.

Dataset downloaded from hub://pavelkloscz/twitter_algorithm and saved to ./twitter_algorithm
Dataset(path='./twitter_algorithm', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (22, 1536)  float32   None   
    id        text      (22, 1)      str     None   
 metadata     json      (22, 1)      str     None   
   text       text      (22, 1)      str     None   
None





In [9]:
# Copy the dataset at DATASET_PATH_3 into LOCAL_PATH_3 and keep the reloaded local copy
dataset_3 = copy_dataset(DATASET_PATH_3, LOCAL_PATH_3)

./val2017-100 loaded successfully.


Evaluating copy transform: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:20<00:00


./val2017-100 loaded successfully.

Dataset downloaded from hub://pavelkloscz/val2017-100 and saved to ./val2017-100
Dataset(path='./val2017-100', tensors=['embedding', 'filename', 'id', 'image'])

  tensor      htype               shape               dtype  compression
  -------    -------             -------             -------  ------- 
 embedding  embedding           (100, 512)           float32   None   
 filename     text               (100, 1)              str     None   
    id        text               (100, 1)              str     None   
   image      image    (100, 240:640, 320:640, 1:3)   uint8    jpeg   
None




In [10]:
# Copy the dataset at DATASET_PATH_4 into LOCAL_PATH_4 and keep the reloaded local copy
dataset_4 = copy_dataset(DATASET_PATH_4, LOCAL_PATH_4)

./ds-scifact loaded successfully.


Evaluating copy transform: 100%|███████████████████████████████████████████████████████████████████████████████████████| 5183/5183 [00:23<00:00


./ds-scifact loaded successfully.

Dataset downloaded from hub://pavelkloscz/ds-scifact and saved to ./ds-scifact
Dataset(path='./ds-scifact', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype       shape       dtype  compression
  -------    -------     -------     -------  ------- 
 embedding  embedding  (5183, 1536)  float32   None   
    id        text      (5183, 1)      str     None   
 metadata     json      (5183, 1)      str     None   
   text       text      (5183, 1)      str     None   
None




## TRY-CHECK (macOS) "NIH Chest X-rays"

In [4]:
# Copy the NIH Chest X-ray training set at DATASET_PATH_5 into LOCAL_PATH_5.
# NOTE(review): the saved output shows the progress bar at 9% (8188/86524) and the
# 'images' tensor with shape (0,) while 'findings' has 86524 rows — the local copy
# may be incomplete; verify before relying on it.
dataset_5 = copy_dataset(DATASET_PATH_5, LOCAL_PATH_5)

./nih-chest-xray-train loaded successfully.


Evaluating copy transform: 9%|████████▎                                                                               | 8188/86524 [02:31<24:09

./nih-chest-xray-train loaded successfully.

Dataset downloaded from hub://activeloop/nih-chest-xray-train and saved to ./nih-chest-xray-train
Dataset(path='./nih-chest-xray-train', tensors=['findings', 'images', 'metadata/follow_up_num', 'metadata/orig_img_h', 'metadata/orig_img_pix_spacing_x', 'metadata/orig_img_pix_spacing_y', 'metadata/orig_img_w', 'metadata/patient_age', 'metadata/patient_gender', 'metadata/patient_id', 'metadata/view_position'])

             tensor                  htype        shape       dtype  compression
             -------                -------      -------     -------  ------- 
            findings              class_label  (86524, 1:9)  uint32    None   
             images                  image         (0,)       uint8     png   
     metadata/follow_up_num         generic        (0,)       int32    None   
       metadata/orig_img_h          generic        (0,)      uint32    None   
 metadata/orig_img_pix_spacing_x    generic        (0,)      float3


