In [None]:
# default_exp data.dataset
%load_ext autoreload
%autoreload 2

In [None]:
# export
# hide
from typing import List, Union
from pathlib import Path
from pymemri.data.itembase import Item
from pymemri.exporters.exporters import Query

In [None]:
# export
def filter_rows(dataset: dict, filter_val=None) -> dict:
    """
    Drop every row of a column-oriented dataset in which any column holds `filter_val`.

    Args:
        dataset: Mapping of column name to a sequence of cell values. Row `i`
            is made up of the i-th value of every column.
        filter_val: Marker for a missing cell. Defaults to None.

    Returns:
        A new dict with the same keys, where each column is a list without the
        rows that contained `filter_val` in at least one column. The input
        mapping is not modified.
    """
    if filter_val is None:
        # Identity check for the default marker: `val == None` would dispatch to
        # the cell's own `__eq__`, which raises for array-like cells and can
        # wrongly match objects that compare equal to everything.
        def is_missing(val):
            return val is None
    else:
        def is_missing(val):
            return val == filter_val

    # Row indices that are missing in at least one column.
    missing_idx = {
        i
        for column in dataset.values()
        for i, val in enumerate(column)
        if is_missing(val)
    }
    return {
        k: [item for i, item in enumerate(v) if i not in missing_idx]
        for k, v in dataset.items()
    }

In [None]:
# export
class Dataset(Item):
    """
    Temporary dataset schema, needs update when MVP2 is done.

    A Dataset is an `Item` with a `name`, a `queryStr` and a list of targets for
    its "item" edge. The linked items can be exported as tabular data with `to`
    or written to CSV with `save`.
    """
    properties = Item.properties + ["name", "queryStr"]
    edges = Item.edges + ["item"]

    def __init__(self, name: str = None, queryStr: str = None, item: list = None, **kwargs):
        """
        Args:
            name: Name of the dataset.
            queryStr: Query string stored on the dataset; this class only stores it.
            item: Targets of the "item" edge. A new empty list is used when omitted.
            **kwargs: Forwarded to `Item.__init__`.
        """
        super().__init__(**kwargs)
        self.queryStr = queryStr
        self.name = name
        # Build the list per instance so datasets never share one default list.
        self.item: list = item if item is not None else []

    def _get_items(self):
        """
        Return the items of this dataset, loading its edges through the client
        when `self.item` is still empty.

        Returns:
            The `self.item` list.

        Raises:
            ValueError: If the dataset has no associated PodClient.
        """
        if self._client is None:
            raise ValueError("Dataset does not have associated PodClient.")
        if not len(self.item):
            # Fetch the edges once and register each one under its own edge name.
            for edge in self._client.get_edges(self.id):
                self.add_edge(edge["name"], edge["item"])

        return self.item

    def _get_data(self, dtype: str, columns: List[str], filter_missing: bool = True):
        """
        Query the dataset items for `columns` and convert the result to `dtype`.

        Args:
            dtype: Target format handed to `Query.convert_dtype`
                (`save` uses "pandas").
            columns: Properties to query for every item; "id" is always
                queried in addition.
            filter_missing: When True, rows that contain None in any column are
                removed with `filter_rows` before conversion.

        Returns:
            Whatever `Query.convert_dtype` produces for `dtype`.

        Raises:
            ValueError: If the dataset has no associated PodClient.
        """
        if self._client is None:
            raise ValueError("Dataset does not have associated PodClient.")
        items = self._get_items()

        query = Query("id", *columns)
        result = query.execute(self._client, items)
        if filter_missing:
            result = filter_rows(result, filter_val=None)
        return query.convert_dtype(result, dtype)

    def to(self, dtype: str, columns: List[str], filter_missing: bool = True):
        """
        Export the dataset items as `dtype`.

        Takes the same arguments, returns the same value and raises the same
        errors as `_get_data`.
        """
        return self._get_data(dtype, columns, filter_missing)

    def save(self, path: Union[Path, str], columns: List[str], filter_missing: bool = True):
        """
        Write the dataset items to a CSV file without the index column.

        Args:
            path: Destination, passed unchanged to `DataFrame.to_csv`.
            columns: Properties to export for every item; "id" is always included.
            filter_missing: When True, rows that contain None in any column are
                dropped before writing.

        Raises:
            ValueError: If the dataset has no associated PodClient.
        """
        result = self._get_data("pandas", columns, filter_missing)
        result.to_csv(path, index=False)

In [None]:
# hide
from pymemri.pod.client import PodClient
from pymemri.data.schema import Account, Person, Message, Label
from pymemri.data.itembase import Edge
import random
import tempfile
import pandas as pd

In [None]:
# hide
# Test fixture: register the schema types and create an empty Dataset to fill.
client = PodClient()
client.add_to_schema(Account, Person, Message, Dataset)

dataset = Dataset()

num_items = 10
messages = []
# The dataset itself is the first item to create; the loop appends the rest.
items = [dataset]
edges = []
for i in range(num_items):   
    # One message per iteration, with its own account, person and label.
    msg = Message(content=f"content_{i}", service="my_service")
    account = Account(handle=f"account_{i}")
    person = Person(firstName=f"firstname_{i}")
    label = Label(name=f"label_{i}")
    items.extend([msg, account, person, label])
    # Edge names match the dotted columns queried later in this notebook:
    # "sender.owner.firstName" and "label.name".
    edges.extend([
        Edge(dataset, msg, "item"),
        Edge(msg, account, "sender"),
        Edge(msg, label, "label"),
        Edge(account, person, "owner")
    ])
    messages.append(msg)

# Write everything in one bulk request: 1 dataset + 40 items + 40 edges,
# which matches the 81 items/edges in the recorded output.
client.bulk_action(
    create_items=items,
    create_edges=edges
)

BULK: Writing 81/81 items/edges
Completed Bulk action, written 81 items/edges


True

In [None]:
# hide
# Export the dataset as a pandas DataFrame; the dotted column names follow the
# "sender" -> "owner" and "label" edges created for each message above.
dataframe = dataset.to("pd", columns=["content", "sender.owner.firstName", "label.name"])

assert isinstance(dataframe, pd.DataFrame)
# Only the last expression of a cell is displayed, so preview the frame once here.
dataframe.head()

Unnamed: 0,id,content,sender.owner.firstName,label.name
0,d49425e997814990911b0bffd0c10300,content_0,firstname_0,label_0
1,bad388bb62d74127bafaac3e293c767d,content_1,firstname_1,label_1
2,bdab8a9a6bdb4b15b8ef74f48450e099,content_2,firstname_2,label_2
3,ab4f0f71a2564e5dbb85ff1cb0afbcd1,content_3,firstname_3,label_3
4,6c96d870e2d74568b8a314cdb21eaf83,content_4,firstname_4,label_4


In [None]:
# hide
# Round-trip check: write the dataset to CSV and read it back.
# "w+" is required because the same handle is read after writing; a plain "w"
# handle is write-only. newline="" keeps line endings untranslated, as pandas
# recommends for file objects passed to `to_csv`.
with tempfile.TemporaryFile(mode="w+", newline="") as f:
    dataset.save(f, columns=["content", "sender.owner.firstName", "label.name"])
    f.seek(0)  # rewind to the start before reading the CSV back
    result = pd.read_csv(f)

assert result.equals(dataframe)

In [None]:
# hide
# nbdev export step; the recorded output lists every notebook it converted.
# NOTE(review): presumably this writes the `# export` cells to the module named
# by `# default_exp` - confirm against the nbdev version in use.
from nbdev.export import *
notebook2script()

Converted Untitled.ipynb.
Converted basic.ipynb.
Converted cvu.utils.ipynb.
Converted data.dataset.ipynb.
Converted data.photo.ipynb.
Converted exporters.exporters.ipynb.
Converted index.ipynb.
Converted itembase.ipynb.
Converted plugin.authenticators.credentials.ipynb.
Converted plugin.authenticators.oauth.ipynb.
Converted plugin.listeners.ipynb.
Converted plugin.pluginbase.ipynb.
Converted plugin.states.ipynb.
Converted plugins.authenticators.password.ipynb.
Converted pod.api.ipynb.
Converted pod.client.ipynb.
Converted pod.db.ipynb.
Converted pod.utils.ipynb.
Converted template.config.ipynb.
Converted template.formatter.ipynb.
Converted test_schema.ipynb.
Converted test_utils.ipynb.
