In [None]:
# default_exp exporters.query
%load_ext autoreload
%autoreload 2

In [None]:
# export
# hide
from typing import Dict, List, Optional, Iterable, Any
import pandas as pd
import json

from pymemri.pod.client import PodClient
from pymemri.data.itembase import Item

## Query

In [None]:
# export
class Query:
    """
    A Query implements functionality to retrieve data from the pod to a tabular format.

    Given a list of `properties`, the `execute` method takes a collection of items and
    retrieves each property for each item, if it exists. Note that a property can be nested
    behind multiple edges, such as "sender.owner.firstName".
    """

    def __init__(self, *properties: str):
        """
        Args:
            *properties: Property paths, one per positional argument. A path is either a
                plain property name ("content") or edge names followed by a property
                name, separated by dots ("sender.owner.firstName").
        """
        self.properties = list(properties)

    def traverse_edges(
        self, client: "PodClient", items: List["Item"], edges: List[str]
    ) -> List[Optional["Item"]]:
        """
        Follow `edges`, one hop at a time, starting from every item in `items`.

        Only the first target of each edge is followed. The input collection is not
        modified.

        Args:
            client: Used to fetch an item again (`client.get(item.id)`) when reading the
                edge from the local item raises, e.g. because the edge list is empty.
            items: Start items. `None` entries are allowed and stay `None`.
            edges: Edge names to follow, in order.

        Returns:
            A list with the same length and order as `items`, holding the item reached
            for each start item, or `None` where an edge could not be followed.
        """
        # Work on a copy; list() also accepts tuples and other iterables.
        items = list(items)

        for edge in edges:
            items_to_query = dict()
            for i, item in enumerate(items):
                # A previous hop already failed for this position: nothing to follow.
                if item is None:
                    continue
                # Replace item with target item. If the edge is empty, it has to be queried again.
                try:
                    if edge not in item.edges:
                        items[i] = None
                    else:
                        items[i] = getattr(item, edge)[0]
                except Exception:
                    items_to_query[i] = item
                    items[i] = None

            # TODO Pod can't currently get multiple items by ID, API call for each item is required for now.
            for i, item in items_to_query.items():
                try:
                    result = client.get(item.id)
                    items[i] = getattr(result, edge)[0]
                except Exception:
                    # Best effort: a failed lookup yields a missing value, not an error.
                    items[i] = None

        return items

    def get_property_values(
        self, client: "PodClient", prop: str, items: List["Item"]
    ) -> list:
        """
        Return the value of the property path `prop` for every item in `items`.

        The edges in the path are traversed first; the property is then read from the
        item that was reached. Positions where the target item or the property is
        missing hold `None`.
        """
        edges, prop_name = self.parse_property(prop)
        target_items = self.traverse_edges(client, items, edges)

        result = [getattr(item, prop_name, None) for item in target_items]
        return result

    @staticmethod
    def parse_property(prop: str):
        """
        Split a dotted property path into its edges and its final property name.

        Example: "sender.owner.firstName" -> (["sender", "owner"], "firstName").
        """
        *edges, prop_name = prop.split(".")
        return edges, prop_name

    def convert_dtype(self, result, dtype):
        """
        Convert a `{property: values}` mapping to the requested output format.

        Args:
            result: Mapping from each property in `self.properties` to its list of values.
            dtype: "dict" returns `result` unchanged; "list" returns the value lists in
                the order of `self.properties`; "pandas", "pd" or "df" returns a
                `pandas.DataFrame`.

        Raises:
            ValueError: If `dtype` is not one of the values above.
        """
        if dtype == "dict":
            return result
        elif dtype == "list":
            return [result[prop] for prop in self.properties]
        elif dtype in {"pandas", "pd", "df"}:
            return pd.DataFrame.from_dict(result)
        else:
            raise ValueError(f"Unknown dtype: {dtype}")

    def execute(self, client: "PodClient", items: List["Item"], dtype="dict") -> Any:
        """
        Retrieve all properties of this query for `items`.

        Args:
            client: Pod client used when an edge has to be fetched again.
            items: Items to retrieve the properties for.
            dtype: Output format, see `convert_dtype`.

        Returns:
            The retrieved values in the format selected by `dtype`; every column has one
            entry per item, with `None` for missing values.
        """
        # Materialize once: the items are walked once per property, which would
        # exhaust a generator after the first property.
        items = list(items)
        result = {
            prop: self.get_property_values(client, prop, items) for prop in self.properties
        }
        return self.convert_dtype(result, dtype)

# Tests

In [None]:
# hide
from pymemri.pod.client import PodClient
from pymemri.data.schema import Account, Person, Message, Label
from pymemri.data.itembase import Edge
import random

### Create dummy data for the dataset

In [None]:
# hide
client = PodClient()
client.add_to_schema(Account, Person, Message)

num_items = 100
messages = []
items = []
edges = []
# Item i gets the chain message -> sender account -> owner person, plus a label.
# The shared suffix `_{i}` lets the tests below match every value to its message.
for i in range(num_items):
    msg = Message(content=f"content_{i}")
    account = Account(handle=f"account_{i}")
    person = Person(firstName=f"person_{i}")
    label = Label(name=f"label_{i}")
    items.extend([msg, account, person, label])
    edges.extend([
        Edge(msg, account, "sender"),
        Edge(msg, label, "label"),
        Edge(account, person, "owner")
    ])
    messages.append(msg)

# Dataset is not perfect, drop some random edges (keep 80%).
# A dedicated, seeded generator makes the dropped edges, and therefore the outputs
# shown below, identical on every run without touching the global random state.
rng = random.Random(42)
edges = rng.sample(edges, int(len(edges) * 0.8))

client.bulk_action(
    create_items=items,
    create_edges=edges
)

Could no connect to backend
HTTPConnectionPool(host='localhost', port=3030): Max retries exceeded with url: /v4/2989062768046831120988064860003548235973730595841524394834470394/bulk (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fb05255b070>: Failed to establish a new connection: [Errno 111] Connection refused'))


AssertionError: 

In [None]:
# hide
q = Query("content", "label.name", "sender.owner.firstName", "sender.handle", "wrong_property")
result = q.execute(client, messages)

# Every column holds exactly one entry per input message.
assert all(len(vals) == num_items for vals in result.values())

# `content` is set on every message, so it must come back complete and in order.
assert result["content"] == [f"content_{i}" for i in range(num_items)]

# Nested properties are either missing (the edge was dropped) or belong to message i.
# Compare the full value: a suffix check would accept e.g. "label_15" for i == 5.
expected_prefix = {
    "label.name": "label",
    "sender.owner.firstName": "person",
    "sender.handle": "account",
}
for prop, prefix in expected_prefix.items():
    for i, value in enumerate(result[prop]):
        assert value is None or value == f"{prefix}_{i}"

# A property that does not exist yields None for every item instead of raising.
assert all(val is None for val in result["wrong_property"])

In [None]:
# Same query without the non-existent property, returned as a DataFrame.
q = Query(
    "content",
    "label.name",
    "sender.owner.firstName",
    "sender.handle",
)
result = q.execute(client, messages, dtype="pandas")
result.head()

Unnamed: 0,content,label.name,sender.owner.firstName,sender.handle
0,content_0,label_0,,account_0
1,content_1,label_1,,
2,content_2,,person_2,account_2
3,content_3,label_3,,
4,content_4,,person_4,account_4


In [None]:
class Dataset(Item):
    """Item type that adds one property, `query`, and one edge, `item`, to `Item`."""

    # Extend the declarations inherited from Item rather than replacing them.
    properties = Item.properties + ["query"]
    edges = Item.edges + ["item"]

    def __init__(self, query: str = None, item: list = None, **kwargs):
        super().__init__(**kwargs)
        self.query = query
        # Each instance gets its own list when no targets are passed in.
        self.item: list = [] if item is None else item

In [None]:
def filter_missing(dataset: dict) -> dict:
    """
    Remove every row that has a missing value in at least one column.

    `dataset` maps column names to value lists. A row is an index position shared by
    all columns; it is dropped from every column as soon as one column holds `None`
    at that position. The input is left untouched and a new dict is returned.
    """
    incomplete_rows = {
        row
        for values in dataset.values()
        for row, value in enumerate(values)
        if value is None
    }
    return {
        name: [value for row, value in enumerate(values) if row not in incomplete_rows]
        for name, values in dataset.items()
    }
        
    
def export_dataset(
    client: PodClient,
    dataset: Dataset,
    content_fields: List[str] = ["content"],
    label_field: str = "label.name",
    labelled_only: bool = True,
    dtype: str = "dict",
):
    """
    Export the items of `dataset` as columns: id, content fields and label.

    Args:
        client: Pod client passed on to the query.
        dataset: Dataset whose `item` list holds the items to export.
        content_fields: Property paths to export as content columns.
        label_field: Property path to export as the label column.
        labelled_only: If True, drop every row with a missing value. Note that this
            applies to all exported columns (id, content and label), not just the label.
        dtype: Output format: "dict", "list" or "pandas"/"pd"/"df".

    Returns:
        The exported columns in the format selected by `dtype`.

    Raises:
        ValueError: If `dtype` is unknown (raised by `Query.convert_dtype`).
    """
    items = dataset.item
    # `content_fields` is only unpacked here, never mutated, so the list default is safe.
    query = Query("id", *content_fields, label_field)
    result = query.execute(client, items)

    if labelled_only:
        result = filter_missing(result)

    return query.convert_dtype(result, dtype)

In [None]:
client.add_to_schema(Dataset)
search_query = json.dumps({"type": "Message"})

# Get every message again from the client by id, then attach each one to a new
# dataset through an "item" edge.
messages = [client.get(msg.id) for msg in messages]
dataset = Dataset(query=search_query)
edges = [Edge(dataset, msg, "item") for msg in messages]

client.bulk_action(create_items=[dataset], create_edges=edges)

BULK: Writing 101/101 items/edges
Completed Bulk action, written 101 items/edges


True

In [None]:
dataset = client.get(dataset.id)

exported = export_dataset(client, dataset, dtype="list")

# With the defaults, the export has three columns (id, content, label.name), and
# filtering incomplete rows leaves all of them with the same length.
assert len(exported) == 3
assert len({len(column) for column in exported}) == 1

# Show a short preview of each column instead of dumping the whole export.
[column[:5] for column in exported]

[['012b6c5ce79d4096a70ecdf902d91f4f',
  'c2a60f38907b44bc84fc5d79ed759307',
  '1dc25297d00748268e3b09bbe9f37061',
  'c2ce2a9871d4425a870666bd7559b844',
  'fe4dd1bd39e44d578d0d5a82d680310a',
  '26e6baebf2ef41a89eaddf1efa081eb6',
  '231f189bebe44dbcaa9f3f57c0b6c829',
  '08391ccff3b44cd481d8248d0799a524',
  '303b07a3f0e443efad3a750c824dff5e',
  'a09f56b96ac84da6b6e1aafddc78dcf9',
  'e9ba987eb4bd4293bdadb40a134b3a88',
  'a837d889eb744c0a8dfd6b0b54ea6184',
  '6f0cef21ff3b407583a8050f37d65c70',
  '19e2b5bf8a9e49c1bb70c2baf554d23b',
  '5e70376b19ef443f9e847f77b55f3e68',
  '8666b595f21d49b2840473b5732c8b9b',
  '540208882e4e46b8920eb7255531a8dd',
  '389bd50af4e24bcba007d3f1347e33fc',
  '5457e9ea2029467da5e9e8168fd40996',
  '4e5620db1872448781a9767a5e8ca3a2',
  'd73b26db8c1e48038c4c06c92c200cd1',
  '40fe8f9213a54b018305ca81cbb09166',
  '3d048c70c00842d8a76afc491facefae',
  'ed56247d2b4c43a0af4d0cdce355e936',
  'a10dd80947be4ca5ba8c4f22499d4f58',
  '06c07390d4674f51bebb00147f4a33d5',
  'bd0de0dc6

In [None]:
# hide
# Import only the name that is used; a star import hides where it comes from.
from nbdev.export import notebook2script
notebook2script()

Converted basic.ipynb.
Converted cvu.utils.ipynb.
Converted data.photo.ipynb.
Converted exporters.query.ipynb.
Converted index.ipynb.
Converted itembase.ipynb.
Converted plugin.authenticators.credentials.ipynb.
Converted plugin.authenticators.oauth.ipynb.
Converted plugin.listeners.ipynb.
Converted plugin.pluginbase.ipynb.
Converted plugin.states.ipynb.
Converted plugins.authenticators.password.ipynb.
Converted pod.api.ipynb.
Converted pod.client.ipynb.
Converted pod.db.ipynb.
Converted pod.utils.ipynb.
Converted template.config.ipynb.
Converted template.formatter.ipynb.
Converted test_schema.ipynb.
Converted test_utils.ipynb.
