In [2]:
# default_exp exporters.exporters
%load_ext autoreload
%autoreload 2

In [3]:
# export
# hide
from typing import Dict, List, Optional, Iterable, Any
import pandas as pd
import json

from pymemri.data.itembase import Item

# Query

With the `Query` class you can query the Pod with a list of properties, and export the result to a tabular dataformat. These properties can include edges that point to different items.

In [4]:
# export
class Query:
    def __init__(self, *properties: List[str]):
        """
        A Query implements functionality to retrieve data from the pod to a tabular format.
        
        Given a list of `properties`, the `execute` method queries the pod for a set of given items, 
        and retrieves the properties for each item if it exists. Note that a properties can be nested behind
        multiple edges, such as "sender.owner.firstName".
        """
        self.properties = list(properties)

    def traverse_edges(self, client: "PodClient", items: List[Item], edges: List[str]) -> List[Item]:
        items = items.copy()

        for edge in edges:
            ids_to_query = list()
            query_item_idx = list()
            for i in range(len(items)):
                item = items[i]
                if item is None:
                    continue
                # Replace item with target item. If the edge is empty, it has to be queried again.
                try:
                    if edge not in item.edges:
                        items[i] = None
                    else:
                        items[i] = getattr(item, edge)[0]
                except Exception:
                    ids_to_query.append(item.id)
                    query_item_idx.append(i)
                    items[i] = None

            
            new_items = client.search({"ids": ids_to_query})
            for i, new_item in zip(query_item_idx, new_items):
                try:
                    items[i] = getattr(new_item, edge)[0]
                except Exception:
                    items[i] = None
        return items

    def get_property_values(
        self, client: "PodClient", prop: str, items: List[Item]
    ) -> list:
        edges, prop_name = self.parse_property(prop)
        target_items = self.traverse_edges(client, items, edges)

        result = [getattr(item, prop_name, None) for item in target_items]
        return result

    @staticmethod
    def parse_property(prop: str):
        prop = prop.split(".")
        edges = prop[:-1]
        prop = prop[-1]
        return edges, prop

    def convert_dtype(self, result, dtype):
        if dtype == "dict":
            return result
        elif dtype == "list":
            return [result[prop] for prop in self.properties]
        elif dtype in {"pandas", "pd", "df"}:
            return pd.DataFrame.from_dict(result)
        else:
            raise ValueError(f"Unknown dtype: {dtype}")

    def execute(self, client: "PodClient", items: List[Item], dtype="dict") -> Any:
        result = {
            prop: self.get_property_values(client, prop, items) for prop in self.properties
        }
        return self.convert_dtype(result, dtype)

## Tests

In [5]:
# hide
from pymemri.pod.client import PodClient
from pymemri.data.schema import Account, Person, Message, CategoricalLabel
from pymemri.data.itembase import Edge
import random

### Create dummy data for dataset-

In [13]:
# hide
client = PodClient()
client.add_to_schema(Account, Person, Message, CategoricalLabel)

num_items = 10
messages = []
items = []
edges = []
for i in range(num_items):   
    msg = Message(content=f"content_{i}", service="my_service")
    account = Account(handle=f"account_{i}")
    person = Person(firstName=f"firstname_{i}")
    label = CategoricalLabel(labelValue=f"label_{i}")
    items.extend([msg, account, person, label])
    edges.extend([
        Edge(msg, account, "sender"),
        Edge(msg, label, "label"),
        Edge(account, person, "owner")
    ])
    messages.append(msg)
    
# Dataset is not perfect, drop some random edges
edges = random.sample(edges, int(len(edges)*0.8))

client.bulk_action(
    create_items=items,
    create_edges=edges
)

BULK: Writing 64/64 items/edges
Completed Bulk action, written 64 items/edges


True

### Example usage
For example, if we have a list of Messages and we want to get message content,
the accompanying account handles and account owner names, we can query:

In [14]:
messages = client.search({"type": "Message", "service": "my_service"})
query = Query("content", "sender.handle", "sender.owner.firstName")
query.execute(client, messages, dtype="pandas")

Unnamed: 0,content,sender.handle,sender.owner.firstName
0,content_7,,
1,content_5,,
2,content_0,,
3,content_3,account_3,firstname_3
4,content_4,account_4,firstname_4
5,content_6,account_6,
6,content_9,account_9,firstname_9
7,content_1,account_1,firstname_1
8,content_2,account_2,firstname_2
9,content_8,account_8,firstname_8


In [15]:
q = Query("content", "label.name", "sender.owner.firstName", "sender.handle", "wrong_property")
result = q.execute(client, messages)
result

{'content': ['content_7',
  'content_5',
  'content_0',
  'content_3',
  'content_4',
  'content_6',
  'content_9',
  'content_1',
  'content_2',
  'content_8'],
 'label.name': [None, None, None, None, None, None, None, None, None, None],
 'sender.owner.firstName': [None,
  None,
  None,
  'firstname_3',
  'firstname_4',
  None,
  'firstname_9',
  'firstname_1',
  'firstname_2',
  'firstname_8'],
 'sender.handle': [None,
  None,
  None,
  'account_3',
  'account_4',
  'account_6',
  'account_9',
  'account_1',
  'account_2',
  'account_8'],
 'wrong_property': [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None]}

In [16]:
# hide
q = Query("data.content", "data.annotation.labelValue", "data.sender.owner.firstName", "data.sender.handle", "wrong_property")
result = q.execute(client, messages)

print("RES:", result)
assert all(len(vals) == len(result["data.content"]) for vals in result.values())
assert len(result["data.content"]) == num_items

# Check if columns all align
valid_props = ["data.annotation.labelValue", "data.sender.owner.firstName", "data.sender.handle"]
for i in range(num_items):
    row = [result[prop][i] for prop in valid_props]
    row_idx = [val[-1] for val in row if val is not None]
    assert len(set(row_idx)) <= 1

assert all(val is None for val in result["wrong_property"])

RES: {'data.content': [None, None, None, None, None, None, None, None, None, None], 'data.annotation.labelValue': [None, None, None, None, None, None, None, None, None, None], 'data.sender.owner.firstName': [None, None, None, None, None, None, None, None, None, None], 'data.sender.handle': [None, None, None, None, None, None, None, None, None, None], 'wrong_property': [None, None, None, None, None, None, None, None, None, None]}


KeyError: 'content'

In [None]:
q = Query("content", "label.name", "sender.owner.firstName", "sender.handle")
result = q.execute(client, messages, dtype="pandas")
result.head()

Unnamed: 0,content,label.name,sender.owner.firstName,sender.handle
0,content_0,label_0,firstname_0,account_0
1,content_1,label_1,firstname_1,account_1
2,content_2,,,account_2
3,content_3,,firstname_3,account_3
4,content_4,label_4,firstname_4,account_4


In [None]:
# hide
from nbdev.export import *
notebook2script()

Converted Untitled.ipynb.
Converted basic.ipynb.
Converted cvu.utils.ipynb.
Converted data.dataset.ipynb.
Converted data.photo.ipynb.
Converted exporters.exporters.ipynb.
Converted index.ipynb.
Converted itembase.ipynb.
Converted plugin.authenticators.credentials.ipynb.
Converted plugin.authenticators.oauth.ipynb.
Converted plugin.listeners.ipynb.
Converted plugin.pluginbase.ipynb.
Converted plugin.states.ipynb.
Converted plugins.authenticators.password.ipynb.
Converted pod.api.ipynb.
Converted pod.client.ipynb.
Converted pod.db.ipynb.
Converted pod.utils.ipynb.
Converted template.config.ipynb.
Converted template.formatter.ipynb.
Converted test_schema.ipynb.
Converted test_utils.ipynb.
