In [None]:
# default_exp data.dataset
%load_ext autoreload
%autoreload 2

In [None]:
# export
# hide
from typing import List, Union
from pathlib import Path
from pymemri.data.itembase import Item
from pymemri.exporters.exporters import Query

In [None]:
# hide
from nbdev import show_doc

# Datasets

A dataset is a central item in the pod that organizes your project data and label annotations. To make `Dataset` items easy to use in your data science workflow, the `Dataset` class contains methods to convert the data to a popular data science format, or to save a dataset to disk.

In [None]:
# export
# hide
def filter_rows(dataset: dict, filter_val=None) -> dict:
    """
    Remove every row in which at least one column equals `filter_val`.

    Args:
        dataset (dict): Column-oriented data, mapping a column name to its values.
        filter_val (Any, optional): Value marking a cell as missing. Defaults to None.

    Returns:
        dict: New dict with the same keys, where each column keeps only the values
            whose row index was not flagged in any column.
    """
    # First pass: collect the row indices flagged by any column.
    dropped_rows = set()
    for values in dataset.values():
        for row, cell in enumerate(values):
            if cell == filter_val:
                dropped_rows.add(row)

    # Second pass: rebuild each column without the flagged rows.
    filtered = {}
    for name, values in dataset.items():
        filtered[name] = [cell for row, cell in enumerate(values) if row not in dropped_rows]
    return filtered

In [None]:
# export
class DatasetEntry(Item):
    """
    A single entry of a `Dataset`.

    Besides the edges inherited from `Item`, an entry declares a `data` edge
    and an `annotation` edge.
    """
    edges = Item.edges + ["data", "annotation"]

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Both edge lists start out empty on a freshly constructed entry.
        self.data: list = []
        self.annotation: list = []

class Dataset(Item):
    """
    The main Dataset class.

    A `Dataset` item declares the properties `name` and `queryStr` and an
    `entry` edge. It offers helpers to convert the data behind its entries to
    another format (`Dataset.to`) or to write it to a CSV file (`Dataset.save`).

    Both helpers need a pod client stored in `self._client`; they raise a
    `ValueError` while it is `None`.
    """
    properties = Item.properties + ["name", "queryStr"]
    edges = Item.edges + ["entry"]
    # NOTE(review): presumably signals the PodClient to attach itself as
    # `_client` when the item is retrieved -- confirm in pymemri.pod.client.
    requires_client_ref = True

    def __init__(self, name: str = None, queryStr: str = None, item: list = None, **kwargs):
        """
        Args:
            name (str, optional): Value of the `name` property.
            queryStr (str, optional): Value of the `queryStr` property.
            item (list, optional): Initial content of the `entry` edge list.
                A new empty list is used when omitted.
            **kwargs: Forwarded to `Item.__init__`.
        """
        super().__init__(**kwargs)
        self.queryStr = queryStr
        self.name = name
        self.entry: list = item if item is not None else []
        self.labellingTask: list = list()
        self._client = None

    def _get_items(self):
        """
        Return the entries of this dataset, loading edges from the pod if none are present.

        Returns:
            list: The content of `self.entry`.

        Raises:
            ValueError: If no pod client is attached to this dataset.
        """
        if self._client is None:
            raise ValueError("Dataset does not have associated PodClient.")
        if len(self.entry) == 0:
            # One request is enough. Every returned edge is attached under its
            # own name, not only the `entry` edges.
            for edge in self._client.get_edges(self.id):
                self.add_edge(edge["name"], edge["item"])

        return self.entry

    def _get_data(self, dtype: str, columns: List[str], filter_missing: bool = True):
        """
        Query `columns` for all entries and convert the result to `dtype`.

        The `id` column is always queried in addition to `columns`.

        Args:
            dtype (str): Target format, passed on to `Query.convert_dtype`.
            columns (List[str]): Column names to query.
            filter_missing (bool, optional): If true, rows containing `None` are
                removed before conversion. Defaults to True.

        Returns:
            Any: The query result converted by `Query.convert_dtype`.

        Raises:
            ValueError: If no pod client is attached to this dataset.
        """
        if self._client is None:
            raise ValueError("Dataset does not have associated PodClient.")
        items = self._get_items()

        query = Query("id", *columns)
        result = query.execute(self._client, items)
        if filter_missing:
            result = filter_rows(result, filter_val=None)
        return query.convert_dtype(result, dtype)

    def to(self, dtype: str, columns: List[str], filter_missing: bool = True):
        """
        Converts Dataset to a different format.
        
        Available formats:
        list: a 2-dimensional list, containing one dataset entry per row
        dict: a list of dicts, where each dict contains {column: value} for each column
        pd: a Pandas dataframe
        

        Args:
            dtype (str): Datatype of the returned dataset
            columns (List[str]): Column names of the dataset
            filter_missing (bool, optional): If true, all rows that contain `None` values are omitted. 
                Defaults to True.

        Returns:
            Any: Dataset formatted according to `dtype`
        """
        return self._get_data(dtype, columns, filter_missing)

    def save(self, path: Union[Path, str], columns: List[str], filter_missing: bool = True):
        """
        Save dataset to CSV.

        Args:
            path (Union[Path, str]): Destination handed to `to_csv`.
            columns (List[str]): Column names of the dataset
            filter_missing (bool, optional): If true, all rows that contain `None` values are omitted.
                Defaults to True.
        """
        result = self._get_data("pandas", columns, filter_missing)
        # The row index is not written to the file.
        result.to_csv(path, index=False)

In [None]:
# Render the documentation of the two public export methods.
show_doc(Dataset.to)

show_doc(Dataset.save)

<h4 id="Dataset.to" class="doc_header"><code>Dataset.to</code><a href="__main__.py#L47" class="source_link" style="float:right">[source]</a></h4>

> <code>Dataset.to</code>(**`dtype`**:`str`, **`columns`**:`List`\[`str`\], **`filter_missing`**:`bool`=*`True`*)

Converts Dataset to a different format.

Available formats:
list: a 2-dimensional list, containing one dataset entry per row
dict: a list of dicts, where each dict contains {column: value} for each column
pd: a Pandas dataframe


Args:
    dtype (str): Datatype of the returned dataset
    columns (List[str]): Column names of the dataset
    filter_missing (bool, optional): If true, all rows that contain `None` values are omitted. 
        Defaults to True.

Returns:
    Any: Dataset formatted according to `dtype`

<h4 id="Dataset.save" class="doc_header"><code>Dataset.save</code><a href="__main__.py#L68" class="source_link" style="float:right">[source]</a></h4>

> <code>Dataset.save</code>(**`path`**:`Union`\[`Path`, `str`\], **`columns`**:`List`\[`str`\], **`filter_missing`**:`bool`=*`True`*)

Save dataset to CSV.

In [None]:
# hide
from pymemri.pod.client import PodClient
from pymemri.data.schema import Account, Person, Message, Label
from pymemri.data.itembase import Edge
import random
import tempfile
import pandas as pd

In [None]:
# hide
# Fixture for the examples below: writes a dataset named "example-dataset"
# with `num_items` entries to the pod.
client = PodClient.from_local_keys()
# NOTE(review): Label items are created below but Label is not registered here --
# presumably it is already part of the pod schema; confirm.
client.add_to_schema(Account, Person, Message, Dataset, DatasetEntry)

dataset = Dataset(name="example-dataset")

num_items = 10
# NOTE(review): `messages` is filled in the loop but not read by any cell visible in this notebook.
messages = []
items = [dataset]
edges = []
for i in range(num_items):
    # Per entry: dataset -entry-> entry -data-> message -sender-> account -owner-> person,
    # plus entry -annotation-> label.
    entry = DatasetEntry()
    msg = Message(content=f"content_{i}", service="my_service")
    account = Account(handle=f"account_{i}")
    person = Person(firstName=f"firstname_{i}")
    label = Label(name=f"label_{i}")
    items.extend([entry, msg, account, person, label])
    edges.extend([
        Edge(dataset, entry, "entry"),
        Edge(entry, msg, "data"),
        Edge(msg, account, "sender"),
        Edge(entry, label, "annotation"),
        Edge(account, person, "owner")
    ])
    messages.append(msg)

# Create all items and edges in a single bulk request.
client.bulk_action(
    create_items=items,
    create_edges=edges
)

reading database_key from /home/eelco/.pymemri/pod_keys/keys.json
reading owner_key from /home/eelco/.pymemri/pod_keys/keys.json
BULK: Writing 101/101 items/edges
Completed Bulk action, written 101 items/edges


True

## Usage

To convert the data in the pod to a different format, `Dataset` implements the `Dataset.to` method. In the `columns` argument, you can define which features will be included in your dataset. A `column` is either a property of an entry in the dataset, or a property of an item connected to an entry in the dataset.

The Pod uses the following schema for Dataset items. Note that the `DatasetEntry` item is always included, and the actual data can be found by traversing the `entry.data` Edge.

<img src="images/dataset-diagram.png">

For example, if a dataset is a set of `Message` items and the message content has to be included as a column, `data.content` would be the column name. If the handle of the `sender` of a message has to be included, `data.sender.handle` would be a valid column name.

The following example retrieves an example dataset of `Message` items, and formats them to a Pandas dataframe:

In [None]:
# Connect to the pod and register the dataset item types.
client = PodClient.from_local_keys()
client.add_to_schema(Dataset, DatasetEntry)
# Retrieve the dataset by its name.
dataset = client.get_dataset("example-dataset")

# Each column is a path starting at a dataset entry: `data.*` follows the `data` edge,
# `annotation.*` follows the `annotation` edge.
columns = ["data.content", "data.sender.handle", "annotation.name"]
# The returned dataframe holds an `id` column followed by the requested columns.
dataframe = dataset.to("pd", columns=columns)
dataframe.head()

reading database_key from /home/eelco/.pymemri/pod_keys/keys.json
reading owner_key from /home/eelco/.pymemri/pod_keys/keys.json


Unnamed: 0,id,data.content,data.sender.handle,annotation.name
0,fdc8e7e0ccb247e8a3f91963f636708f,content_0,account_0,label_0
1,77bf661b853848f4b82c5c7f2716fb12,content_1,account_1,label_1
2,8cdfad5b1e3a4a42be7d08f524966db7,content_2,account_2,label_2
3,def16b5f018146278bd4bf41e2d27bb1,content_3,account_3,label_3
4,f5ad890e4ca842ca9b7104e0d4f3fd68,content_4,account_4,label_4


In [None]:
# hide
# Multi-hop column path: entry -data-> message -sender-> account -owner-> person.
columns = ["data.content", "data.sender.owner.firstName", "annotation.name"]
dataframe = dataset.to("pd", columns=columns)

assert isinstance(dataframe, pd.DataFrame)
# Compare as plain lists so a differing number of columns fails the assertion
# instead of raising inside the element-wise Index comparison.
assert list(dataframe.columns) == ["id"] + columns
assert len(dataframe) == num_items
dataframe.head()

Unnamed: 0,id,data.content,data.sender.owner.firstName,annotation.name
0,593917e6671d4fd3ace3aa4524895c07,content_0,firstname_0,label_0
1,423bd734474b4c049f53ce4277c05151,content_1,firstname_1,label_1
2,f63e6852899e4377aac2e1496e0e6e98,content_2,firstname_2,label_2
3,d1f36ef2501a49cc8ab14726758bd8b1,content_3,firstname_3,label_3
4,41e91222c973440abd7539a4c2560dcb,content_4,firstname_4,label_4


In [None]:
# hide
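# NOTE(review): disabled round-trip test for `Dataset.save`. The column names below
# lack the `data.` / `annotation.` prefixes used in the other cells of this notebook;
# update them before re-enabling.
# with tempfile.TemporaryFile(mode='w+') as f:
#     dataset.save(f, columns=["content", "sender.owner.firstName", "label.name"])
#     f.seek(0)
#     result = pd.read_csv(f)
    
# assert result.equals(dataframe)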

In [None]:
# hide
# Import only the name that is used instead of a wildcard import.
from nbdev.export import notebook2script
# Export the `# export` cells of the project's notebooks to Python modules.
notebook2script()

Converted Untitled.ipynb.
Converted basic.ipynb.
Converted cvu.utils.ipynb.
Converted data.dataset.ipynb.
Converted data.photo.ipynb.
Converted exporters.exporters.ipynb.
Converted index.ipynb.
Converted itembase.ipynb.
Converted plugin.authenticators.credentials.ipynb.
Converted plugin.authenticators.oauth.ipynb.
Converted plugin.listeners.ipynb.
Converted plugin.pluginbase.ipynb.
Converted plugin.states.ipynb.
Converted plugins.authenticators.password.ipynb.
Converted pod.api.ipynb.
Converted pod.client.ipynb.
Converted pod.db.ipynb.
Converted pod.utils.ipynb.
Converted template.config.ipynb.
Converted template.formatter.ipynb.
Converted test_schema.ipynb.
Converted test_utils.ipynb.
