# Introduction to yabadaba: Defining Records

The main effort associated with using the yabadaba package is defining your own Record classes.  This Notebook provides a simple demonstration by defining a FAQ Record class for representing frequently asked questions.

In [1]:
# Standard Python libraries
import datetime

import yabadaba
from yabadaba.record import Record
from yabadaba import load_query, recordmanager

# https://github.com/usnistgov/DataModelDict
from DataModelDict import DataModelDict as DM

import pandas as pd

from IPython.display import display, Markdown

# Report the yabadaba version this Notebook was run with
print(f'yabadaba version = {yabadaba.__version__}')

# Report the date on which the Notebook was executed
print(f'Notebook executed on {datetime.date.today()}')

yabadaba version = 0.2.0
Notebook executed on 2023-04-03


## 1. FAQ records

The FAQ records are incredibly simple data models that have a single root of 'faq' and two fields: the 'question' and the associated 'answer'.

In [2]:
# Example FAQ records given as JSON strings.  Each one has a single 'faq' root
# element containing a 'question' field and an 'answer' field.

# A FAQ about FAQ records
faqfaq = """{
    "faq": {
        "question": "What does a FAQ Record represent?",
        "answer": "A frequently asked question and the corresponding answer."
    }
}"""

# The woodchuck tongue twister
woodchuckfaq = """{
    "faq": {
        "question": "How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
        "answer": "A woodchuck would chuck as much wood as a woodchuck could chuck if a woodchuck could chuck wood."
    }
}"""

# The Fuzzywuzzy rhyme
fuzzyfaq = """{
    "faq": {
        "question": "Fuzzywuzzy was a bear. Fuzzywuzzy had no hair. So Fuzzywuzzy wasn't fuzzy, was he?",
        "answer": "Nope."
    }
}"""

## 2. Record class definition

Here is the full FAQ class definition.  Each part will be described in more detail in the following sections.

In [3]:
class FAQ(Record):
    """
    Class for representing FAQ (frequently asked question) records.

    Each record holds one question and its answer.  In the data model both
    values are stored as fields under the root element named by modelroot.
    """
    def __init__(self, model=None, name=None, **kwargs):
        """
        Initializes a FAQ Record object.

        Parameters
        ----------
        model : str, file-like object, DataModelDict, optional
            The contents of the record.
        name : str, optional
            The unique name to assign to the record.  If model is a file
            path, then the default record name is the file name without
            extension.
        **kwargs : any
            Any other keyword values (e.g. question, answer).  These are
            passed on to the super Record init.
        """
        # The class attributes are initialized before calling the super init
        # because the super init calls load_model or set_values, which assign
        # them.  Resetting them afterwards would discard those values.
        self.__question = None
        self.__answer = None
        super().__init__(model=model, name=name, **kwargs)

    @property
    def style(self):
        """str: The record style"""
        return 'FAQ'

    @property
    def modelroot(self):
        """str: The root element of the content"""
        return 'faq'
    
    @property
    def xsl_filename(self):
        """tuple: The module path and file name of the record's xsl html transformer"""
        return ('yabadaba.demo', 'FAQ.xsl')

    @property
    def xsd_filename(self):
        """tuple: The module path and file name of the record's xsd schema"""
        return ('yabadaba.demo', 'FAQ.xsd')

    @property
    def question(self):
        """str or None: The frequently asked question."""
        return self.__question

    @question.setter
    def question(self, value):
        # None is kept as-is to represent "not set"; anything else is stored as str
        if value is None:
            self.__question = None
        else:
            self.__question = str(value)

    @property
    def answer(self):
        """str or None: The answer to the frequently asked question."""
        return self.__answer

    @answer.setter
    def answer(self, value):
        # None is kept as-is to represent "not set"; anything else is stored as str
        if value is None:
            self.__answer = None
        else:
            self.__answer = str(value)

    def load_model(self, model, name=None):
        """
        Loads record contents from a given model and sets the question and
        answer attributes from the fields found under the model root.

        Parameters
        ----------
        model : str or DataModelDict
            The model contents of the record to load.
        name : str, optional
            The name to assign to the record.  Often inferred from other
            attributes if not given.
        """
        # The super method handles reading the content and setting self.model
        super().load_model(model, name=name)

        faq = self.model[self.modelroot]
        self.question = faq['question']
        self.answer = faq['answer']

    def set_values(self, name=None, question=None, answer=None):
        """
        Set multiple object attributes at the same time.  Only the values
        that are given (not None) are changed.  The model attribute is not
        modified by this method.

        Parameters
        ----------
        name : str, optional
            The name to assign to the record.  Often inferred from other
            attributes if not given.
        question : str, optional
            The frequently asked question.
        answer : str, optional
            The answer to the frequently asked question.
        """
        if question is not None:
            self.question = question
        if answer is not None:
            self.answer = answer
        if name is not None:
            self.name = name

    def build_model(self):
        """
        Generates and returns model content based on the values set to object.
        The generated content also replaces the record's current model.

        Returns
        -------
        DataModelDict
            The newly built model content.
        """
        # Use modelroot rather than a hard-coded key so that the root element
        # is defined in one place, consistent with load_model and queries
        root = self.modelroot

        model = DM()
        model[root] = DM()
        model[root]['question'] = self.question
        model[root]['answer'] = self.answer

        self._set_model(model)
        return model

    def metadata(self):
        """
        Generates a dict of simple metadata values associated with the record.
        Useful for quickly comparing records and for building pandas.DataFrames
        for multiple records of the same style.

        Returns
        -------
        dict
            The record's name, question and answer values.
        """
        meta = {}
        meta['name'] = self.name
        meta['question'] = self.question
        meta['answer'] = self.answer
        return meta

    @property
    def queries(self):
        """dict: Query objects and their associated parameter names."""
        # Keys are the keyword names accepted by the query methods.  For each
        # query, name is the metadata field and path is the model path to it.
        return {
            'question': load_query('str_contains',
                name='question', path=f'{self.modelroot}.question',
                description='Search the FAQ question field to see if it contains certain strings.'),
            'answer': load_query('str_contains',
                name='answer', path=f'{self.modelroot}.answer',
                description='Search the FAQ answer field to see if it contains certain strings.'),
        }

## 3. Class init

Related to Record behaviors, the init should
- Initialize values for any class attributes, and
- Pass model, name, and kwargs to the super Record init.

The super Record init initializes the name and model attributes, then calls load_model if model is given or set_values if not.

```python
    def __init__(self, model=None, name=None, **kwargs):
        """
        Initializes a Record object for a given style.
        
        Parameters
        ----------
        model : str, file-like object, DataModelDict
            The contents of the record.
        name : str, optional
            The unique name to assign to the record.  If model is a file
            path, then the default record name is the file name without
            extension.
        """
        self.__question = None
        self.__answer = None
        super().__init__(model=model, name=name, **kwargs)
```

In [4]:
# Records loaded from the JSON model strings defined above
records = [
    FAQ(name=label, model=content)
    for label, content in [
        ('faq', faqfaq),
        ('woodchuck', woodchuckfaq),
        ('fuzzy', fuzzyfaq),
    ]
]

# A record defined from parameter values instead of model content
records.append(
    FAQ(name='define', question='Can I define a FAQ using parameters?', answer="Yes, you can.")
)

__NOTE__: If model is not given when a Record is initialized, the associated model contents will _not_ be constructed until the build_model() method is called.  This makes it possible to create Record objects without needing to specify all values.  

In [5]:
# Show each record's model as JSON.  Accessing model raises an AttributeError
# for records whose model content has not been loaded or built yet.
for record in records:
    try:
        content = record.model.json()
    except AttributeError as e:
        print('AttributeError:', e)
    else:
        print(content)
    print()

{"faq": {"question": "What does a FAQ Record represent?", "answer": "A frequently asked question and the corresponding answer."}}

{"faq": {"question": "How much wood would a woodchuck chuck if a woodchuck could chuck wood?", "answer": "A woodchuck would chuck as much wood as a woodchuck could chuck if a woodchuck could chuck wood."}}

{"faq": {"question": "Fuzzywuzzy was a bear. Fuzzywuzzy had no hair. So Fuzzywuzzy wasn't fuzzy, was he?", "answer": "Nope."}}

AttributeError: model content has not been loaded or built



## 4. Class attributes

### 4.1.  Common Record attributes

These attributes are defined for all Record subclasses

- __style__ is the style name associated with the class.  It should correspond to the style given to the recordmanager.

- __modelroot__ is the name of the root element of the data model schema.

```python
    @property
    def style(self):
        """str: The record style"""
        return 'FAQ'

    @property
    def modelroot(self):
        """str: The root element of the content"""
        return 'faq'
```

In [6]:
# The common attributes have the same values for every FAQ record
print(f'style = {records[0].style}')
print(f'modelroot = {records[0].modelroot}')

style = FAQ
modelroot = faq


In [7]:
# Printing a record shows its string representation: the style and the record name
print(records[2])

FAQ record named fuzzy


### 4.2. Optional XML supporting attributes

These attributes are optional, but if given allow for methods related to the XML representation of the data to be used.

- __xsl_filename__ is a tuple consisting of module path and file name where an XSL file that transforms the XML content to HTML can be found.  Specifying this allows for the record's content to be rendered as HTML using the html() method.

- __xsd_filename__ is a tuple consisting of module path and file name where an XSD file that defines a schema for the record can be found.  Specifying this allows for record contents to be validated against the schema with the valid_xml() method.

```python
    @property
    def xsl_filename(self):
        """tuple: The module path and file name of the record's xsl html transformer"""
        return ('yabadaba.demo', 'FAQ.xsl')

    @property
    def xsd_filename(self):
        """tuple: The module path and file name of the record's xsd schema"""
        return ('yabadaba.demo', 'FAQ.xsd')
```

__Note__: the XML-based methods require that the data model be loaded/built as they use the XML representation of the data. 

In [8]:
# Call build_model() on all records to ensure that the models exist.
# The 'define' record was created from parameter values, so it has no
# model content until build_model() is called.
for record in records:
    record.build_model()

In [9]:
# Validate each record's XML content against the schema given by xsd_filename
for record in records:
    print(record.valid_xml())

True
True
True
True


In [10]:
# Transform each record's XML content to HTML using the XSL file given by xsl_filename.
# NOTE(review): render=True presumably displays the HTML in the Notebook - confirm.
for record in records:
    record.html(render=True)

### 4.3. Class-specific attributes

It is highly encouraged that each Record subclass also defines attributes and methods that are specifically associated with the underlying data.  In this way, the Record objects serve both as a means of interacting with the raw content and interacting with the data in Pythonic ways.  Here, the FAQ class defines attributes for question and answer and allows for the values to be directly set.

```python
    @property
    def question(self):
        """str: The frequently asked question."""
        return self.__question

    @question.setter
    def question(self, value):
        if value is None:
            self.__question = None
        else:
            self.__question = str(value)

    @property
    def answer(self):
        """str: The answer to the frequently asked question."""
        return self.__answer

    @answer.setter
    def answer(self, value):
        if value is None:
            self.__answer = None
        else:
            self.__answer = str(value)
```

In [11]:
# Read the question value through the class property
records[0].question

'What does a FAQ Record represent?'

In [12]:
# Read the answer value through the class property
records[0].answer

'A frequently asked question and the corresponding answer.'

## 5. Data interpretation methods

These are the methods that are defined on a per-schema basis to interpret and convert the data into different representations.

### 5.1. load_model

The load_model method reads the given model contents as a DataModelDict, sets the object's model attribute, and sets the values of any class-specific attributes according to the model contents.  This method is called during init if the model parameter is given.

```python
    def load_model(self, model, name=None):
        """
        Loads record contents from a given model.

        Parameters
        ----------
        model : str or DataModelDict
            The model contents of the record to load.
        name : str, optional
            The name to assign to the record.  Often inferred from other
            attributes if not given.
        """
        super().load_model(model, name=name)

        faq = self.model[self.modelroot]
        self.question = faq['question']
        self.answer = faq['answer']
```

The base Record class also defines a reload_model() method. reload_model() calls load_model() using the current Record model contents.  This allows for the class parameters to be updated after making direct changes to the model contents.


### 5.2. set_values

The set_values method provides a single function that can set most of the class's attribute values. Note that calling set_values does _not_ set or change the value of the model attribute.  This method is called during init if the model parameter is not given.

Here are some different suggested variations of using set_values
- Define setters for all class attributes and have set_values assign all given values using them (seen here).
- Use set_values as the primary means of setting class attributes.  Useful if class attribute values are not independent of each other.
- Don't define a set_values function and don't define setters for the class attributes.  This makes the Record's attributes read-only from the model.  Note that the Record will not be completely read-only as changes can still be made to the Record's model.


```python
    def set_values(self, name=None, question=None, answer=None):
        """
        Set multiple object attributes at the same time.

        Parameters
        ----------
        name : str, optional
            The name to assign to the record.  Often inferred from other
            attributes if not given.
        question : str, optional
            The frequently asked question.
        answer : str, optional
            The answer to the frequently asked question.
        """
        if question is not None:
            self.question = question
        if answer is not None:
            self.answer = answer
        if name is not None:
            self.name = name
```

### 5.3. build_model

The build_model method takes the current values of the class attributes and builds the data model contents based on them.  It then updates the Record's model attribute and returns the new model.  The actions of this method should be consistent with the Record's schema.  

```python
    def build_model(self):
        """
        Generates and returns model content based on the values set to object.
        """
        model = DM()
        model['faq'] = DM()
        model['faq']['question'] = self.question
        model['faq']['answer'] = self.answer

        self._set_model(model)
        return model
```

### 5.4. metadata

The metadata method returns a dictionary containing class attributes.  Ideally, the included fields should only be simple values, such as str, int, float, list and dict, and embedded lists and dicts should be avoided whenever possible to keep the overall structure flat.  This dictionary is used to construct pandas.DataFrames based on sets of records to allow for easy comparison and parsing of the records, so it should contain all meaningful fields for such operations.

```python
    def metadata(self):
        """
        Generates a dict of simple metadata values associated with the record.
        Useful for quickly comparing records and for building pandas.DataFrames
        for multiple records of the same style.
        """
        meta = {}
        meta['name'] = self.name
        meta['question'] = self.question
        meta['answer'] = self.answer
        return meta
```

In [13]:
# Quick demo of building a metadata DataFrame: one row per record, with the
# columns taken from the keys of each record's metadata() dict.  The list of
# dicts is passed straight to pandas so that df is only ever a DataFrame.
df = pd.DataFrame([record.metadata() for record in records])
df

Unnamed: 0,name,question,answer
0,faq,What does a FAQ Record represent?,A frequently asked question and the correspond...
1,woodchuck,How much wood would a woodchuck chuck if a woo...,A woodchuck would chuck as much wood as a wood...
2,fuzzy,Fuzzywuzzy was a bear. Fuzzywuzzy had no hair....,Nope.
3,define,Can I define a FAQ using parameters?,"Yes, you can."


## 6. Queries

The Record classes also contain methods that translate input parameters into delimiting database query operations.  Doing so makes it possible for the Database classes to perform efficient queries that return consistent responses across all of the database styles.

### 6.1. Query object definitions

A queries attribute can be defined which consists of a dict of Query objects.  Specifying the Query objects in this dict makes it possible to automate the creation of the query methods below.

For the queries dict

- The dict's keys correspond to the keyword parameter names that will be recognized by the Record's query methods.  
- The dict's values are Query objects that classify the type of query and what data fields are associated with it.  These can be initialized using load_query() and the following parameters
    - __style__ (*str*) indicates the type of query operation to be performed.
    - __path__ (*str*) provides the period-delimited data model path from model root to the field that the query operates on.
    - __name__ (*str*) indicates the field in the metadata representation that the query operates on.
    - __parent__ (*str or None*) is an optional parameter that allows for queries on the metadata to be inside an embedded dict, i.e. the value being queried is at metadata()\[parent\]\[name\]. Note that this only allows one level of embedding in the metadata to be explored, so metadata should be as flat as possible.
    - __description__ (*str, optional*) Description of the query operation, i.e. what it is searching.

The current list of Query styles
- __str_match__ performs a query on a string data field and identifies matches when one of the given values exactly matches the data field's value.
- __int_match__ performs a query on an integer data field and identifies matches when one of the given values exactly matches the data field's value.
- __float_match__ performs a query on a float data field and identifies matches when one of the given values matches with the data field's value within a set absolute tolerance.  This style takes two additional optional parameters
    - __atol__ (*float, optional*) is the absolute tolerance for the match.  The default value is 1e-5.
    - __unit__ (*str or None, optional*) Unit to use for the given query value.  This should be the unit used in the JSON/XML records for the value in question.  A value of None indicates no unit or conversion needed.
- __date_match__ performs a query on a date data field and identifies matches when one of the given values exactly matches the data field's value.
- __str_contains__ performs a query on a string data field and identifies matches when all of the given values are contained in the data field's value.
- __list_contains__ performs a query on a list data field and identifies matches when all of the given values are contained in the data field's value.

```python
    @property
    def queries(self):
        """dict: Query objects and their associated parameter names."""
        return {
            'question': load_query('str_contains',
                name='question', path=f'{self.modelroot}.question',
                description='Search the FAQ question field to see if it contains certain strings.'),
            'answer': load_query('str_contains',
                name='answer', path=f'{self.modelroot}.answer',
                description='Search the FAQ answer field to see if it contains certain strings.'),
        }
```

In [14]:
# List each query parameter name alongside its Query object
for key, value in records[0].queries.items():
    print(key, value, sep=' : ')

question : query style str_contains
answer : query style str_contains


### 6.2. View all query descriptions for the record with querydoc

As each Record class will have different query terms based on the content that they contain, it is important for end users to know which terms they can search on.  The querydoc attribute of the Record class will return string documentation for the queries based on their descriptions. The querydoc content is formatted as Markdown code, so you can alternately render it as such in a Jupyter Notebook.

In [15]:
# Show the query documentation as plain text (unrendered Markdown)
print(records[0].querydoc)

# FAQ Query Parameters

- __question__ (str_contains): Search the FAQ question field to see if it contains certain strings.
- __answer__ (str_contains): Search the FAQ answer field to see if it contains certain strings.



In [16]:
# Render querydoc as Markdown using IPython display. The extra '##' prefix is
# optional and just reduces the header size of the title here
display(Markdown('##' + records[0].querydoc))

### FAQ Query Parameters

- __question__ (str_contains): Search the FAQ question field to see if it contains certain strings.
- __answer__ (str_contains): Search the FAQ answer field to see if it contains certain strings.


### 6.3. Querying and parsing metadata using pandasfilter

The pandasfilter() method takes a pandas.DataFrame containing the Record's metadata fields and returns a filtering mask based on the given keyword arguments.  This parses the DataFrame and only returns the entries that correspond to the data query operations.  Note that while this is used by some of the Database querying operations, pandasfilter() can also be used to directly operate on a DataFrame object either generated manually or obtained from a less rigorous query.

The base Record class defines pandasfilter() in such a way that it will automatically build and apply filters for all Queries in queries.  Defining pandasfilter() in the child class is entirely optional, but makes it possible to manually adjust, replace or add query operations beyond what the Query objects can handle.  If pandasfilter() is defined in the subclass, it should always call the super() version to handle the "name" query.

Below is a template pandasfilter() method for use in a child record class if needed. 

```python
    def pandasfilter(self, dataframe, name=None, **kwargs):
        """
        Filters a pandas.DataFrame based on kwargs values for the record style.
        
        Parameters
        ----------
        dataframe : pandas.DataFrame
            A table of metadata for multiple records of the record style.
        name : str or list
            The record name(s) to parse by.
        **kwargs : any
            Any of the record style-specific search parameters.
        
        Returns
        -------
        pandas.Series
            Boolean map of matching values
        """
        matches = super().pandasfilter(dataframe, name=name, **kwargs)
        
        # Apply additional filters as follows:
        newtest = dataframe[CONDITION]             # Test for a condition on the dataframe
        matches = (matches & newtest)              # Update the matches mask 
        
        return matches
```

In [17]:
# Show the metadata DataFrame built in section 5.4 for reference
df

Unnamed: 0,name,question,answer
0,faq,What does a FAQ Record represent?,A frequently asked question and the correspond...
1,woodchuck,How much wood would a woodchuck chuck if a woo...,A woodchuck would chuck as much wood as a wood...
2,fuzzy,Fuzzywuzzy was a bear. Fuzzywuzzy had no hair....,Nope.
3,define,Can I define a FAQ using parameters?,"Yes, you can."


In [18]:
# Find which records have a question that contains "FAQ".
# The result is a boolean mask with one entry per row of the DataFrame.
records[0].pandasfilter(df, question='FAQ')

0     True
1    False
2    False
3     True
dtype: bool

### 6.4. Build database queries using mongoquery and cdcsquery

The mongoquery() and cdcsquery() methods construct Mongo-style queries according to the given kwargs.  Each method returns the generated query operations as a dict.  The associated Database query operations transmit the query dict to the database, which in turn only returns records that adhere to the query.

The base Record class defines mongoquery() and cdcsquery() methods in such a way that it will automatically generate all query operations associated with the defined Query objects in the queries attribute.  Defining these methods in the child class is entirely optional, but makes it possible to manually adjust, replace or add query operations beyond what the Query objects can handle.  If the mongoquery() method is defined, it should always call the super() version as the base Record handles the "name" query.  The cdcsquery() method does not include the "name" parameter as the related operation is handled elsewhere.


Below are template mongoquery() and cdcsquery() methods for use in a child record class if needed. Note that operations between the two methods are typically identical except that

- cdcs query operations only operate on record contents and paths start with the record's root.
- mongo query operations can operate on record metadata as well, so queries to record contents need to start with 'content.' prior to the record's root.  Defining operations around non-content queries is not recommended as they would be unique to mongo.

```python
    def mongoquery(self, name=None, **kwargs):
        """
        Builds a Mongo-style query based on kwargs values for the record style.
        
        Parameters
        ----------
        name : str or list
            The record name(s) to parse by.
        **kwargs : any
            Any of the record style-specific search parameters.
        
        Returns
        -------
        dict
            The Mongo-style query
        """     
        mquery = super().mongoquery(name=name, **kwargs)
        
        # Apply additional filters as follows:
        newquery = {'content.root.path': CONDITION}    # Define the query operation
        mquery['$and'].append(newquery)                # Add the query to the top level $and 
        
        return mquery

    def cdcsquery(self, **kwargs):
        """
        Builds a CDCS-style query based on kwargs values for the record style.
        
        Parameters
        ----------
        **kwargs : any
            Any of the record style-specific search parameters.
        
        Returns
        -------
        dict
            The CDCS-style query
        """
        mquery = super().cdcsquery(**kwargs)
               
        # Apply additional filters as follows:
        newquery = {'root.path': CONDITION}            # Define the query operation
        mquery['$and'].append(newquery)                # Add the query to the top level $and 
               
        return mquery
```

In [19]:
# Build the Mongo-style query dict for questions that contain "Record".
# Paths to record contents are prefixed with 'content.' for Mongo.
records[0].mongoquery(question='Record')

{'$and': [{}, {'$and': [{'content.faq.question': {'$regex': 'Record'}}]}]}

In [20]:
# Build the same query in CDCS style, where paths start at the record's root
records[0].cdcsquery(question='Record')

{'$and': [{}, {'$and': [{'faq.question': {'$regex': 'Record'}}]}]}

## 7. Next steps

Once a Record subclass has been defined it can be integrated into the yabadaba database tools by adding it to the recordmanager.  This can be done by adding the class to the recordmanager's loaded_styles as shown below, but it is highly recommended to follow the procedure described in the next Notebook.

In [21]:
# Register the FAQ class with the recordmanager under its style name
recordmanager.loaded_styles['FAQ'] = FAQ

In [22]:
# List which record styles passed or failed import; FAQ should appear as passed
recordmanager.check_styles()

Record styles that passed import:
- FAQ: <class '__main__.FAQ'>
Record styles that failed import:

