In [None]:
# default_exp exporters.dataset
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# export
from typing import Dict, List, Optional

from pymemri.pod.client import PodClient
from pymemri.data.itembase import Item

## Column utilities

In [None]:
# export
class DataColumn:
    """A single dataset column described by a dotted path.

    Every path segment except the last is stored as an edge name in `edges`;
    the last segment is stored as `property`. For example
    "sender.owner.firstName" gives edges ["sender", "owner"] and property
    "firstName". `name` is the label of the column and defaults to the
    definition itself.
    """

    def __init__(self, definition, name=None):
        self.definition = definition
        self.edges, self.property = self._parse(definition)
        # Fall back to the raw definition when no explicit name is supplied.
        if name is None:
            self.name = definition
        else:
            self.name = name

    @staticmethod
    def _parse(definition):
        """Split `definition` on dots and return `(edges, property)`."""
        *edge_names, prop_name = definition.split(".")
        return edge_names, prop_name
    
def parse_columns(columns: List[str], column_names: Optional[list] = None) -> "List[DataColumn]":
    """Turn column definitions into `DataColumn` objects.

    Args:
        columns (List[str]): Dotted column definitions, e.g. "sender.handle".
        column_names (list, optional): One name per entry in `columns`. When
            None, each column is named after its definition. Defaults to None.

    Returns:
        List[DataColumn]: One `DataColumn` per definition, in the given order.

    Raises:
        ValueError: If `column_names` is given but does not contain exactly
            one name per column.
    """
    if column_names is None:
        return [DataColumn(col) for col in columns]

    # Materialize both inputs so their lengths can be compared even when an
    # arbitrary iterable was passed in.
    columns = list(columns)
    column_names = list(column_names)

    # zip() stops at the shorter input, so a length mismatch would silently
    # drop columns (or names) instead of reporting the mistake.
    if len(column_names) != len(columns):
        raise ValueError(
            f"Expected {len(columns)} column names, got {len(column_names)}."
        )

    return [
        DataColumn(col, name=col_name)
        for col, col_name in zip(columns, column_names)
    ]


def get_column_value(client: PodClient, item: Item, column: DataColumn):
    """Resolve the value of `column` for `item` by walking the column's edges.

    For each edge name in `column.edges`, the first target of that edge is
    followed. When the edge list on the current item is empty, the item is
    fetched again through `client.get` (presumably because the local copy may
    not have its edges loaded -- confirm against PodClient) before giving up.

    Args:
        client (PodClient): Client used to re-fetch an item by id.
        item (Item): Item the column path starts from.
        column (DataColumn): Column providing `edges` and `property`.

    Returns:
        The attribute named `column.property` on the item reached at the end
        of the path, or None when an edge is unknown, is not a list, is still
        empty after re-fetching, or the property attribute does not exist.
    """
    current = item
    for edge_name in column.edges:
        # Check membership first: the attribute is only looked up for names
        # the item declares as edges.
        if edge_name not in current.edges:
            return None
        targets = getattr(current, edge_name)
        if not isinstance(targets, list):
            return None

        if len(targets) == 0:
            # Nothing available locally; retry with a fresh copy of the item.
            current = client.get(current.id)
            targets = getattr(current, edge_name)
            if len(targets) == 0:
                return None

        # Only the first target of an edge is followed.
        current = targets[0]

    return getattr(current, column.property, None)


def get_column_values(client, item, columns):
    """Return the value of every column in `columns` for `item`, in column order."""
    values = []
    for column in columns:
        values.append(get_column_value(client, item, column))
    return values

## Exporting datasets

In [None]:
# export
def export_dataset(
    client: PodClient,
    items: List[Item],
    columns: List[str],
    filter_incomplete: bool = True,
    column_names: Optional[list] = None,
) -> Dict[str, list]:
    """Exports a dataset from the pod to a dictionary of lists, with each list a field in the dataset.

    Example usage:
    Given a dataset of Messages, each with a sender edge and label edge:
    columns = ["content", "sender.handle", "label.name"]
    dataset = export_dataset(client, messages, columns)

    Args:
        client (PodClient): A PodClient to export data from
        items (List[Item]): A list of items, entries in the dataset
        columns (List[str]): Columns of the dataset, see example usage.
        filter_incomplete (bool, optional): If True, skip any item with missing values. Defaults to True.
        column_names (list, optional): Optional names for the columns. Column definitions are used for names if left blank. Defaults to None.

    Returns:
        Dict[str, list]: dict of `col_name`: `values` for each col in columns.
        Columns that share a name share a single list in the result.
    """

    data_columns = parse_columns(columns, column_names)

    dataset = {column.name: [] for column in data_columns}
    for item in items:
        values = get_column_values(client, item, data_columns)
        # Identity check instead of `None in values`: membership testing falls
        # back to `==`, which can misbehave or raise for values that override
        # `__eq__` (e.g. array-like objects).
        if filter_incomplete and any(value is None for value in values):
            continue
        for column, value in zip(data_columns, values):
            dataset[column.name].append(value)
    return dataset

# Tests

In [None]:
from pymemri.pod.client import PodClient
from pymemri.data.schema import Account, Person, Message, Label
from pymemri.data.itembase import Edge
import random

### Create dummy data for dataset

In [None]:
client = PodClient()
client.add_to_schema(Account, Person, Message)

# Dedicated, seeded generator so the same edges are removed on every run and
# the global `random` state is left untouched.
rng = random.Random(42)

messages = []
items = []
edges = []
for i in range(100):
    # One message with a sender account, the account's owner and a label.
    msg = Message(content=f"content_{i}")
    account = Account(handle=f"account_{i}")
    person = Person(firstName=f"person_{i}")
    label = Label(name=f"label_{i}")
    items.extend([msg, account, person, label])
    edges.extend([
        Edge(msg, account, "sender"),
        Edge(msg, label, "label"),
        Edge(account, person, "owner")
    ])
    messages.append(msg)

# Dataset is not perfect, remove some random edges
edges = rng.sample(edges, int(len(edges)*0.8))

client.bulk_action(
    create_items=items,
    create_edges=edges
)

BULK: Writing 640/640 items/edges
Completed Bulk action, written 640 items/edges


True

In [None]:
# Each definition is a dotted path: edge names first, the property to read last.
columns = [
    "content",
    "sender.handle",
    "sender.owner.firstName",
    "label.name",
]
column_names = ["content", "handle", "first_name", "label"]

# Export the same messages twice: once keeping items with missing values,
# once skipping them.
dataset = export_dataset(
    client, messages, columns, filter_incomplete=False, column_names=column_names
)
dataset_filtered = export_dataset(
    client, messages, columns, filter_incomplete=True, column_names=column_names
)

In [None]:
# Number of values exported for the "content" column.
len(dataset["content"])

4

In [None]:
# One exported list per requested column.
assert len(columns) == len(dataset)

# Without filtering, every column holds one value per message.
n_rows = len(dataset["content"])
assert {len(vals) for vals in dataset.values()} == {n_rows}
assert n_rows == 100

# With filtering, items that have a missing value are skipped.
assert 100 > len(dataset_filtered["content"])

## Convert to Pandas

In [None]:
import pandas as pd

In [None]:
# Each key of the exported dict becomes one DataFrame column.
df = pd.DataFrame.from_dict(dataset, orient="columns")
df.head(5)

Unnamed: 0,content,handle,first_name,label
0,content_0,account_0,,label_0
1,content_1,account_1,person_1,label_1
2,content_2,,,
3,content_3,account_3,person_3,label_3
4,content_4,account_4,person_4,label_4


In [None]:
# hide
from nbdev.export import *
notebook2script()

Converted Untitled.ipynb.
Converted basic.ipynb.
Converted cvu.utils.ipynb.
Converted data.photo.ipynb.
Converted exporters.dataset.ipynb.
Converted index.ipynb.
Converted itembase.ipynb.
Converted plugin.authenticators.credentials.ipynb.
Converted plugin.authenticators.oauth.ipynb.
Converted plugin.listeners.ipynb.
Converted plugin.pluginbase.ipynb.
Converted plugin.states.ipynb.
Converted plugins.authenticators.password.ipynb.
Converted pod.api.ipynb.
Converted pod.client.ipynb.
Converted pod.db.ipynb.
Converted pod.utils.ipynb.
Converted template.config.ipynb.
Converted template.formatter.ipynb.
Converted test_schema.ipynb.
Converted test_utils.ipynb.
