#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Retrieves data and metadata from RDS.
"""

# Built-in/Generic Imports
import json
import sys
import urllib

# ``import urllib`` alone does not load the ``request``/``parse``/``error``
# submodules on Python 3, and on Python 2 ``HTTPError`` lives in ``urllib2``
# rather than ``urllib``. Bind the three callables once so the rest of the
# module is version agnostic.
if sys.version_info[0] >= 3:
    import urllib.error
    import urllib.parse
    import urllib.request

    _urlencode = urllib.parse.urlencode
    _urlopen = urllib.request.urlopen
    _HTTPError = urllib.error.HTTPError
else:
    import urllib2

    _urlencode = urllib.urlencode
    _urlopen = urllib2.urlopen
    _HTTPError = urllib2.HTTPError

__version__ = "0.1.0"
__author__ = "Metadata Technology North America Inc."
__email__ = "mtna@mtna.us"
__maintainer__ = "Sean Lucas"
__credits__ = ["Pascal Heus", "Andrew DeCarlo"]
__status__ = "Development"
__license__ = "Apache-2.0"
__copyright__ = "Copyright 2020, Metadata Technology North America Inc."

# Largest number of records requested from the server in a single call.
_PAGE_SIZE = 500


class DataProduct(object):
    """
    Holds information to connect to a data product and allows methods for querying it.

    Parameters
    ----------
    host : str, required
        The url hosting an RDS server
    catalog_id : str, required
        ID of the catalog
    dataproduct_id : str, optional
        ID of the data product. Default is None. Every method except
        ``catalog`` raises ``ValueError`` while it is None.
    """

    def __init__(self, host, catalog_id, dataproduct_id=None):
        self.host = host
        self.catalog_id = catalog_id
        self.dataproduct_id = dataproduct_id
        # Not read anywhere in this module; kept so code that accesses the
        # attribute on existing instances keeps working.
        self.param_delim = ""

    def select(
        self,
        cols=None,
        where=None,
        orderby=None,
        groupby=None,
        collimit=1000,
        coloffset=0,
        inject_metadata=True,
        totals=False,
        limit=1000,
        offset=0,
    ):
        """
        Queries the data product for a set of records.

        Parameters
        ----------
        cols : list of str, optional
            the columns of the records to be returned. The default is None which queries for all columns.
        where : list of str, optional
            filtering by comparative and conjunctive operators. The default is None.
        orderby : list of str, optional
            orders the records by one or more columns. The default is None.
        groupby : list of str, optional
            groups the records by one or more columns. The default is None.
        collimit : int, optional
            limit of the columns in the data frame. The default is 1000.
        coloffset : int, optional
            offset of the columns in the data frame. The default is 0.
        inject_metadata : bool, optional
            flag for if metadata should be used/returned with the data frame. The default is True.
        totals : bool, optional
            flag for if the totals should be returned with the data frame. The default is False.
        limit : int, optional
            limit for the records in the data frame. The default is 1000.
        offset : int, optional
            offset for the records in the data frame. The default is 0.

        Returns
        -------
        results : RdsResults
            A wrapper object for the records, column names and metadata.

        Raises
        ------
        ValueError
            If the catalog or data product ID is missing, or the server
            answers with an HTTP error.
        """
        api_call = self._get_url("query") + "/select?"
        params = {}
        for name, value in (
            ("cols", cols),
            ("where", where),
            ("orderby", orderby),
            ("groupby", groupby),
            ("collimit", collimit),
            ("coloffset", coloffset),
            ("metadata", inject_metadata),
            ("totals", totals),
        ):
            params.update(self._get_param(value, name))

        results = self._batch(api_call, params, limit, offset)
        metadata = _get_metadata(results) if inject_metadata else None
        return _get_rds_results(results, metadata, cols)

    def tabulate(
        self,
        dims=None,
        measure=None,
        where=None,
        orderby=None,
        totals=False,
        inject_metadata=True,
        inject=False,
    ):
        """
        Queries the data product for a set of tabulated records.

        Parameters
        ----------
        dims : list of str, optional
            dimensions of the tabulation, represents the rows and columns of a table. Default is None.
        measure : list of str, optional
            value the tabulation is calculating from the measures, represents the values in the
            cells of a table. The default is None, which lets the server pick its default
            (documented upstream as count(*)).
        where : list of str, optional
            filtering by comparative and conjunctive operators. The default is None.
        orderby : list of str, optional
            orders the records by one or more columns. The default is None.
        totals : bool, optional
            flag for if the totals should be returned with the data frame. The default is False.
        inject_metadata : bool, optional
            flag for if metadata should be used/returned with the data frame. The default is True.
        inject : bool, optional
            flag for if the code labels should be used over the code values. The default is False.

        Returns
        -------
        results : RdsResults
            A wrapper object for the records, column names and metadata.

        Raises
        ------
        ValueError
            If the catalog or data product ID is missing, or the server
            answers with an HTTP error.
        """
        api_call = self._get_url("query") + "/tabulate?"
        params = {}
        for name, value in (
            ("dims", dims),
            ("measure", measure),
            ("where", where),
            ("orderby", orderby),
            ("metadata", inject_metadata),
            ("inject", inject),
            ("totals", totals),
        ):
            params.update(self._get_param(value, name))

        results = self._batch(api_call, params)
        metadata = _get_metadata(results) if inject_metadata else None
        # Either argument may be None (measure is by default), so normalise
        # both before concatenating the fallback column names.
        columns = _as_list(dims) + _as_list(measure)
        return _get_rds_results(results, metadata, columns)

    def catalog(self):
        """
        Gets the metadata for a catalog in JSON format.

        Returns
        -------
        metadata : JSON
            Detailed information surrounding the catalog.

        Raises
        ------
        ValueError
            If the host or catalog ID is missing, or the server answers with
            an HTTP error.
        """
        if self.catalog_id is None:
            raise ValueError("Catalog ID must be specified")
        return self._fetch_json(self._base_url() + "/catalog/" + self.catalog_id)

    def dataproduct(self):
        """
        Gets the metadata for the dataproduct in JSON format.

        Returns
        -------
        metadata : JSON
            Detailed information surrounding the dataproduct.
        """
        return self._fetch_json(self._get_url("catalog"))

    def variable(self, variable=None):
        """
        Gets the metadata for one or all variables in JSON format.

        Parameters
        ----------
        variable : str, optional
            Name of the variable you want the metadata of. The default is None which
            will return metadata for all variables.

        Returns
        -------
        metadata : JSON
            Detailed information surrounding the variable(s).
        """
        api_call = self._get_url("catalog")
        if variable is None:
            api_call += "/variables"
        else:
            api_call += "/variable/" + variable
        return self._fetch_json(api_call)

    def classification(self, classification=None):
        """
        Gets the metadata for one or all classifications in JSON format.

        Parameters
        ----------
        classification : str, optional
            Name of the classification you want the metadata of. The default is None which
            will return metadata for all classifications.

        Returns
        -------
        metadata : JSON
            Detailed information surrounding the classification(s).
        """
        api_call = self._get_url("catalog")
        if classification is None:
            api_call += "/classifications"
        else:
            api_call += "/classification/" + classification
        return self._fetch_json(api_call)

    def code(self, classification, limit=20):
        """
        Gets the metadata for codes in JSON format.

        Parameters
        ----------
        classification : str
            The name of the classification you want the codes' metadata of.
        limit : int, optional
            The amount of codes you want returned. The default is 20.

        Returns
        -------
        metadata : JSON
            Detailed information surrounding the code(s).
        """
        api_call = (
            self._get_url("catalog") + "/classification/" + classification + "/codes?"
        )
        # _get_param returns a dict; it has to be url-encoded before it can be
        # appended to the query string.
        api_call += _urlencode(self._get_param(limit, "limit"))
        return self._fetch_json(api_call)

    def profile(self, variable):
        """
        Gets a profile on a variable that contains statistical information.

        Parameters
        ----------
        variable : str
            The name of the variable you want the profile of.

        Returns
        -------
        profile : JSON
            Detailed information surrounding a profile on a variable.
        """
        api_call = self._get_url("catalog") + "/variables/profile?"
        api_call += _urlencode(self._get_param(variable, "cols"))
        return self._fetch_json(api_call)

    def _base_url(self):
        """Return ``<host>/rds/api`` without a doubled slash."""
        if self.host is None:
            raise ValueError("Host must be specified")
        return self.host.rstrip("/") + "/rds/api"

    def _get_url(self, endpoint):
        """Return the URL of ``endpoint`` for this catalog and data product.

        Raises ``ValueError`` when the host, catalog ID or data product ID
        is None.
        """
        if self.catalog_id is None:
            raise ValueError("Catalog ID must be specified")

        if self.dataproduct_id is None:
            raise ValueError("Data Product ID must be specified")

        return "/".join(
            [self._base_url(), endpoint, self.catalog_id, self.dataproduct_id]
        )

    def _get_param(self, param_values, param_name):
        """Return ``{param_name: text}`` for a query parameter, or ``{}``.

        Lists and tuples are joined with commas, booleans are written in
        lowercase (``true``/``false``) and ``None`` omits the parameter.
        """
        if param_values is None:
            return {}
        if isinstance(param_values, (list, tuple)):
            text = ",".join(_format_value(value) for value in param_values)
        else:
            text = _format_value(param_values)
        return {param_name: text}

    def _fetch_json(self, api_call):
        """Request ``api_call``, decode the JSON body and close the response."""
        response = _get_response(api_call)
        try:
            return json.load(response)
        finally:
            response.close()

    def _batch(self, api_call, params, limit=10000, offset=0):
        """Fetch up to ``limit`` records in pages of at most ``_PAGE_SIZE``.

        Requests stop when ``limit`` is exhausted or a page does not report
        ``info.moreRows`` as true. Returns the decoded pages in request order.
        """
        results = []
        page_params = dict(params)  # leave the caller's dict untouched
        while limit > 0:
            page_size = min(limit, _PAGE_SIZE)
            page_params.update(self._get_param(offset, "offset"))
            page_params.update(self._get_param(page_size, "limit"))

            result = self._fetch_json(api_call + _urlencode(page_params))
            results.append(result)

            offset += page_size
            limit -= page_size
            if not result.get("info", {}).get("moreRows", False):
                break

        return results


class RdsResults(object):
    """A wrapper object that binds the records, the column names, and metadata on the columns together."""

    def __init__(self, records, columns, metadata):
        self.records = records
        self.columns = columns
        self.metadata = metadata

    def __repr__(self):
        return "RdsResults(records=%d, columns=%r)" % (
            len(self.records),
            self.columns,
        )


def _format_value(value):
    """Return the query-string text of one parameter value."""
    if isinstance(value, bool):
        return str(value).lower()
    return str(value)


def _as_list(value):
    """Return ``value`` as a new list: ``None`` -> ``[]``, scalar -> ``[value]``."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _get_response(api_call):
    """Open ``api_call`` and return the response object.

    HTTP error statuses are re-raised as ``ValueError`` carrying the status
    code; other failures (e.g. connection errors) propagate unchanged.
    """
    try:
        return _urlopen(api_call)
    except _HTTPError as e:
        raise ValueError("Error " + str(e.code) + ": Invalid Query")


def _get_metadata(results):
    """Return the variable metadata for a batched query.

    NOTE(review): each page is the same query with a different offset, so
    every page is presumed to describe the same columns. Only the first page
    that carries ``variables`` is used; concatenating all pages would repeat
    each column once per page.
    """
    for result in results:
        variables = result.get("variables")
        if variables:
            return list(variables)
    return []


def _get_rds_results(results, metadata, columns):
    """Bundle the pages of a query into a single ``RdsResults``.

    Column names come from the metadata (``label`` when present, otherwise
    ``name``); without metadata the requested ``columns`` are used, with
    ``None`` meaning no names are known.
    """
    if metadata is not None:
        col_names = [
            variable["label"] if "label" in variable else variable["name"]
            for variable in metadata
        ]
    else:
        col_names = _as_list(columns)

    records = []
    for result in results:
        records.extend(result["records"])

    return RdsResults(records, col_names, metadata)
+ [![Build Status](https://travis-ci.com/mtna/rds-python.svg?branch=master)](https://travis-ci.org/mtna/rds-python) + [![Coverage Status](https://coveralls.io/repos/github/mtna/rds-python/badge.svg?branch=master&service=github)](https://coveralls.io/github/mtna/rds-python?branch=master) + ![Release Version](https://img.shields.io/badge/release-0.1.0-blue) + ![Python Version](https://img.shields.io/badge/python-2.7|3.6|3.7|3.8-blue) + [![License](https://img.shields.io/badge/license-apache_2.0-green)](https://www.apache.org/licenses/LICENSE-2.0) + [![Code Style](https://img.shields.io/badge/code_style-black-black)](https://pypi.org/project/black/) + + This python module utilizes MTNA's Rich Data Services API to quickly and efficiently access data sets and metadata stored in our repository. Through RDS, you can easily perform complex queries and tabulations on the data you are interested in while also getting back any relevant metadata. + + RDS greatly simplifies the long process of finding the data to begin with, cleaning and transforming the data, and converting the data into a dataframe. All of this is done in a single step using our queries. This lets you focus on analyzing and visualizing the data instead of managing it. + + **Contents:** + - [Announcements](#announcements) + - [Installation](#installation) + - [Usage](#usage) + - [About](#about) + - [Software](#software) + - [License](#license) + + ## Announcements + ### Version v0.1.0 released + The initial version of **RDS** Python allows you to take advantage of our powerful database framework through its select queries, tabulation queries, and metadata retrieval. All features for our query system are available through this python API. 
+ {release date} + + ## Installation + ### Using pip + Use the package manager [pip](https://pip.pypa.io/en/stable/) to install rds python + ```bash + pip install rds + ``` + + ## Usage + Through the **RDS** API, you are able to query for records of data as well as perform a tabulation. Both a simple query and a tabulation contain options for grouping, ordering and filtering of the data, as well as specifying if metadata is wanted or not. + + The data returned by a query/tabulation will be contained within an `RdsResults` object. This object has three properties: one is the records of data that can be used to build out a dataframe for a graph or chart, one is the column names for each column of data in the records, and the last is a collection of metadata in JSON format that provides information that can be used for better analysis of your data. + + ### Select Query + Imagine that you would like to get some demographic data in the United States. You look through our **Catalog** and see that we have the data you are interested in. The first thing you would need to do to access this data is to establish a link to the demographic dataset that we host in our repository. To do this, you simply create a `DataProduct` with the URL of the RDS host, the **ID** of the catalog that contains the dataproduct, and the **ID** of the dataproduct that contains the demographic information. + ```python + from rds import DataProduct + + dataproduct = DataProduct("host_url", "catalog_id", "dataproduct_id") + ``` + + Once the `DataProduct` is created, you can perform your query and get back the results (which contain the records for a dataframe). If you wanted to know how many people were born between the years 1900 and 1950 for each year, you could perform the following query. 
+ ```python + results = dataproduct.select(cols=["year_of_birth", "amount_born:count(*)"], where=["year_of_birth>1900"], orderby=["year_of_birth"], groupby=["year_of_birth"], limit=50) + ``` + + This query tells **RDS** that we want the year of birth for each record as well as the number of records with that year of birth (where we are renaming the column to "amount_born"). We then filter for everyone born after 1900. We also make sure the data is in the correct order and then group the data by year of birth so that we only have a single record returned per year. Setting the limit to 50 ensures we only get data for the years 1901 to 1950 (assuming there are no missing years of data). + + After obtaining the data, you can pull out the records and columns and place them directly into a dataframe for use in a graph or chart. Below we demonstrate by building out a simple line plot of people born per year, utilizing the pandas, seaborn and matplotlib packages. + ```python + import matplotlib.pyplot as plt + import pandas as pd + import seaborn as sns + + dataframe = pd.DataFrame(results.records, columns=results.columns) + + sns.lineplot(data=dataframe, x=dataframe.columns[0], y=dataframe.columns[1]) + plt.show() + ``` + + ### Tabulation Query + A tabulation query is used almost identically to a select query, except it uses different parameters as a tabulation is more useful for checking the relationships between columns of data. + + If you wanted to know the number of males/females for each race in the census, you would perform the below tabulation query. + ```python + results = dataproduct.tabulate(dims=["sex", "race"], measure=["count(*)"], orderby=["race"], inject=True) + ``` + + You can think of the parameter `dims` as the dimension of a tabulation table, and the parameter `measure` as the value that you want in each cell of the table. One thing you may notice that is new is the `inject` parameter. This signifies that we want to replace any "coded" values with their more readable labels. 
Sex can be an example of a "coded" value, as the data is often coded as "1" to refer to male and "2" to refer to female. Since "1" and "2" would not be very descriptive in a chart, **RDS** gives you the ability to replace them with what the codes actually mean. + + ### Metadata + Metadata can be directly asked for on any of our resources. This includes catalogs, data products, variables, classifications, and codes. The metadata contains extensive information on what the resource is and what it is used for. + + ## About + This project is developed and maintained by [MTNA](https://www.mtna.us/). + + More detailed documentation about what the current version of RDS can do can be found [here](https://documenter.getpostman.com/view/2220438/SzS4QmXD?version=latest#intro). + + If you are interested in using the RDS framework directly, you can visit our site [here](https://www2.richdataservices.com/). + + ## Software + Compatible with Python 2.7 and Python 3.6 and higher. + + If using python 3, it is recommended that you utilize [pandas](https://pandas.pydata.org/) dataframes when working with any records returned from an RDS query. + + There are no dependencies required to run RDS Python. 
+ + ## License + [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) + +Platform: UNKNOWN +Classifier: Topic :: Database :: Database Engines/Servers +Classifier: Natural Language :: English +Classifier: Development Status :: 3 - Alpha +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: License :: OSI Approved :: Apache Software License +Classifier: Operating System :: OS Independent +Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4 +Description-Content-Type: text/markdown diff --git a/rds.egg-info/SOURCES.txt b/rds.egg-info/SOURCES.txt new file mode 100644 index 0000000..272c5c6 --- /dev/null +++ b/rds.egg-info/SOURCES.txt @@ -0,0 +1,8 @@ +README.md +setup.py +rds/__init__.py +rds/dataproduct.py +rds.egg-info/PKG-INFO +rds.egg-info/SOURCES.txt +rds.egg-info/dependency_links.txt +rds.egg-info/top_level.txt \ No newline at end of file diff --git a/rds.egg-info/dependency_links.txt b/rds.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/rds.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/rds.egg-info/top_level.txt b/rds.egg-info/top_level.txt new file mode 100644 index 0000000..de07d2f --- /dev/null +++ b/rds.egg-info/top_level.txt @@ -0,0 +1 @@ +rds