In [None]:
# default_exp exporters.query
%load_ext autoreload
%autoreload 2

In [None]:
# export
# hide
from typing import Dict, List, Optional, Iterable, Any
import pandas as pd

from pymemri.pod.client import PodClient
from pymemri.data.itembase import Item

## Query

In [None]:
# export
class Query:
    """
    Retrieve properties of pod items into a tabular format.

    A query is defined by a list of property paths. A path is a dot-separated
    string: the last component is the property to read, and any preceding
    components are edge names that are followed first, e.g.
    "sender.owner.firstName".
    """

    def __init__(self, *properties: str):
        """
        A Query implements functionality to retrieve data from the pod to a tabular format.

        Given a list of `properties`, the `execute` method queries the pod for a set of given items,
        and retrieves the properties for each item if it exists. Note that a property can be nested behind
        multiple edges, such as "sender.owner.firstName".
        """
        self.properties = list(properties)

    def traverse_edges(
        self, client: "PodClient", items: List["Item"], edges: List[str]
    ) -> List[Optional["Item"]]:
        """
        Follow `edges` in order, starting from every item in `items`.

        For each hop an item is replaced by the first target of the edge. The result
        has the same length and order as `items`; a position is `None` when the edge
        is not listed in the item's `edges`, or when no target could be obtained.
        The input sequence itself is not modified.
        """
        # Work on a private list so the caller's sequence is left untouched.
        items = list(items)

        for edge in edges:
            items_to_query = dict()
            for i, item in enumerate(items):
                # A previous hop already failed for this position; nothing to follow.
                if item is None:
                    continue
                # Replace item with target item. If the edge is empty, it has to be queried again.
                try:
                    if edge not in item.edges:
                        items[i] = None
                    else:
                        items[i] = getattr(item, edge)[0]
                except Exception:
                    items_to_query[i] = item
                    items[i] = None

            # TODO Pod can't currently get multiple items by ID, API call for each item is required for now.
            for i, item in items_to_query.items():
                # Best effort: any failure while fetching or reading the edge leaves None.
                try:
                    result = client.get(item.id)
                    items[i] = getattr(result, edge)[0]
                except Exception:
                    items[i] = None

        return items

    def get_property_values(
        self, client: "PodClient", prop: str, items: List["Item"]
    ) -> list:
        """
        Return the value of property path `prop` for every item in `items`.

        Values are returned in the order of `items`. A value is `None` when the
        target item could not be reached or does not have the property.
        """
        edges, prop_name = self.parse_property(prop)
        target_items = self.traverse_edges(client, items, edges)
        return [getattr(item, prop_name, None) for item in target_items]

    @staticmethod
    def parse_property(prop: str):
        """
        Split a property path into `(edges, property_name)`.

        "sender.owner.firstName" becomes (["sender", "owner"], "firstName");
        a path without dots has an empty edge list.
        """
        *edges, prop_name = prop.split(".")
        return edges, prop_name

    @staticmethod
    def convert_dtype(result, dtype):
        """
        Convert the column dictionary `result` to the container named by `dtype`.

        "dict" returns `result` unchanged; "pandas", "pd" and "df" return a
        `pandas.DataFrame`. Any other value raises `ValueError`.
        """
        if dtype == "dict":
            return result
        elif dtype in {"pandas", "pd", "df"}:
            return pd.DataFrame.from_dict(result)
        else:
            raise ValueError(f"Unknown dtype: {dtype}")

    def execute(
        self,
        client: "PodClient",
        items: List["Item"],
        dtype: str = "dict",
        include_ids: bool = True,
    ) -> Any:
        """
        Run the query for `items` and return one column of values per property.

        When `include_ids` is true and "id" is not already one of the query's
        properties, an "id" column is placed first. The columns are converted
        with `convert_dtype` according to `dtype`.
        """
        if include_ids and "id" not in self.properties:
            properties = ["id"] + self.properties
        else:
            properties = self.properties

        result = {
            prop: self.get_property_values(client, prop, items) for prop in properties
        }
        return self.convert_dtype(result, dtype)

# Tests

In [None]:
# hide
from pymemri.pod.client import PodClient
from pymemri.data.schema import Account, Person, Message, Label
from pymemri.data.itembase import Edge
import random

### Create dummy data for the dataset

In [None]:
# hide
# Build a small graph in the pod: every message gets a sender account and a label,
# and every account gets an owner.
client = PodClient()
client.add_to_schema(Account, Person, Message)

num_items = 100
messages = []
items = []
edges = []
for i in range(num_items):
    msg = Message(content=f"content_{i}")
    account = Account(handle=f"account_{i}")
    person = Person(firstName=f"person_{i}")
    label = Label(name=f"label_{i}")
    items.extend([msg, account, person, label])
    edges.extend([
        Edge(msg, account, "sender"),
        Edge(msg, label, "label"),
        Edge(account, person, "owner")
    ])
    messages.append(msg)

# Dataset is not perfect, drop some random edges.
# A dedicated seeded generator makes the same edges disappear on every run,
# without touching the global `random` state.
edges = random.Random(42).sample(edges, int(len(edges)*0.8))

client.bulk_action(
    create_items=items,
    create_edges=edges
)

BULK: Writing 640/640 items/edges
Completed Bulk action, written 640 items/edges


True

In [None]:
# hide
q = Query("content", "label.name", "sender.owner.firstName", "sender.handle", "wrong_property")
result = q.execute(client, messages)

# Every column holds exactly one value per queried message.
assert all(len(vals) == len(result["id"]) for vals in result.values())
assert len(result["content"]) == num_items
# Values must come back in the order of the queried messages.
assert result["content"] == [f"content_{i}" for i in range(num_items)]

# Each nested property is either missing (its edge was dropped) or exactly the value
# that was created for message i; a suffix check would also accept e.g. "label_11" at i == 1.
expected_prefix = {
    "label.name": "label",
    "sender.owner.firstName": "person",
    "sender.handle": "account",
}
for prop, prefix in expected_prefix.items():
    for i in range(num_items):
        assert result[prop][i] is None or result[prop][i] == f"{prefix}_{i}"
    # Most edges are kept in the dummy data, so an all-None column means traversal is broken.
    assert any(val is not None for val in result[prop])

assert all(val is None for val in result["wrong_property"])

In [None]:
# The same kind of query, returned as a pandas DataFrame instead of a dict of columns.
q = Query(
    "content",
    "label.name",
    "sender.owner.firstName",
    "sender.handle",
)
result = q.execute(client, messages, dtype="pandas")
result.head()

Unnamed: 0,id,content,label.name,sender.owner.firstName,sender.handle
0,8f5f01e76b4a43898e8a9f8190900ea0,content_0,,person_0,account_0
1,20c8b46b7cb34ae985a3700d19041c2e,content_1,,person_1,account_1
2,20ed894a6f134aadb2af058d35b1d97f,content_2,,,account_2
3,55fdc5383bc7451da1b319a72f0f7a7e,content_3,label_3,person_3,account_3
4,08e84094473c45dbb45de0ddb99a992a,content_4,label_4,person_4,account_4


In [None]:
# hide
# Export the `# export` cells of the project's notebooks to their Python modules.
# Only the one name that is used is imported, instead of a wildcard import.
from nbdev.export import notebook2script
notebook2script()

Converted basic.ipynb.
Converted cvu.utils.ipynb.
Converted data.photo.ipynb.
Converted exporters.query.ipynb.
Converted index.ipynb.
Converted itembase.ipynb.
Converted plugin.authenticators.credentials.ipynb.
Converted plugin.authenticators.oauth.ipynb.
Converted plugin.listeners.ipynb.
Converted plugin.pluginbase.ipynb.
Converted plugin.states.ipynb.
Converted plugins.authenticators.password.ipynb.
Converted pod.api.ipynb.
Converted pod.client.ipynb.
Converted pod.db.ipynb.
Converted pod.utils.ipynb.
Converted template.config.ipynb.
Converted template.formatter.ipynb.
Converted test_schema.ipynb.
Converted test_utils.ipynb.
