<a href="https://colab.research.google.com/github/mafux777/Alation_Article/blob/master/Alation_API_Training_Spring_2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Alation API Training Worksheet

We need to import just a small number of libraries to create an Alation instance. You can use this class to test against any official or unofficial API.

In [0]:
import pandas as pd
import os
import requests

import time
import json

import pprint
# One shared pretty-printer so every cell dumps API responses the same way.
pp = pprint.PrettyPrinter(
    indent=4,
)

The class AlationInstance is created with a URL, username and password.

In [0]:
import urllib

class AlationInstance:
    """Handle to an Alation server defined by a URL.

    A server admin user name and password must be provided; all API actions
    are run as that user. Creating an instance talks to the server twice:
    once to obtain an API token (sent with "official" API calls) and once to
    log in and cache the session cookies (sent with all other calls).
    """

    # Seconds to wait between two polls of the job status API, so that a
    # long-running job does not turn into a busy loop against the server.
    JOB_POLL_SECONDS = 1

    def __init__(self, host, account, password, verify=True):
        """Connect to the Alation server at *host* as *account*.

        host: base URL of the server, without a trailing slash
              (API paths such as '/login/' are appended directly).
        account: the up to 30 chars user name, often the email, but for long
                 emails could be cut off.
        password: could be the LDAP password, as well.
        verify: passed to Requests for every call. Requests verifies SSL
                certificates for HTTPS requests by default and raises an
                SSLError if it is unable to verify the certificate.
        """
        self.host = host
        self.verify = verify
        self.account = account
        self.password = password
        self.token = self.get_token()
        self.headers = self.login(account, password)

    def login(self, account, password):
        """Log in through the web form and return the headers for later calls.

        The returned dict carries the CSRF token, a Cookie header with the
        CSRF token and session ID, and a Referer. A missing session ID is
        only logged (via the notebook-level ``log_me``), not raised, so the
        caller still gets a headers dict back.
        """
        url = self.host + '/login/'

        # The session is only needed to collect cookies, so close it afterwards.
        with requests.Session() as session:
            # Fetch the login page first so the session picks up the csrftoken cookie.
            session.get(url, verify=self.verify)
            csrftoken = session.cookies.get('csrftoken')

            # login with user name and password (and token)
            payload = {"csrfmiddlewaretoken": csrftoken, "ldap_user": account, "password": password}
            log_me("Logging in to {}".format(url))
            session.post(url, data=payload, verify=self.verify, headers={"Referer": url})

            # get the session ID and store it for all future API calls
            sessionid = session.cookies.get('sessionid')

        if not sessionid:
            log_me('No session ID, probably wrong user name / password')

        return {"X-CSRFToken": csrftoken,
                "Cookie": f"csrftoken={csrftoken}; sessionid={sessionid}",
                "Referer": url
                }

    def get_token(self):
        """Return the API token text for the configured account.

        getToken is tried first (for users who never generated a token); if
        the response text is "EXISTING", changeToken is used instead (for
        users who already have one).
        """
        change_token = "/api/v1/changeToken/"  # if you already have a token, use this url
        new_token = "/api/v1/getToken/"  # if you have never generated a token, use this url
        data = dict(username=self.account, password=self.password)
        response = requests.post(self.host + new_token, data=data, verify=self.verify)
        api_token = response.text
        if api_token == "EXISTING":
            response = requests.post(self.host + change_token, data=data, verify=self.verify)
            api_token = response.text
        return api_token

    def _headers_for(self, api, official):
        """Build request headers: API token if official, session cookies if not.

        The cached login headers are copied before the Referer is set, so one
        call never changes the headers seen by another.
        """
        if official:
            return dict(token=self.token)
        headers = dict(self.headers)
        headers['Referer'] = self.host + api
        return headers

    def _wait_for_job(self, job_id):
        """Poll the job status API until the job is no longer 'running'.

        Returns the last status response. Polling also stops if the response
        is not a parsed JSON object (generic_api_get returns raw bytes on
        errors), in which case that raw response is returned.
        """
        url_job = "/api/v1/bulk_metadata/job/"
        params = dict(id=job_id)
        while True:
            status = self.generic_api_get(api=url_job, params=params, official=True)
            if not isinstance(status, dict) or status.get('status') != 'running':
                return status
            time.sleep(self.JOB_POLL_SECONDS)

    def generic_api_post(self, api, params=None, body=None, official=False):
        """POST *body* as JSON to *api* and return the parsed response.

        If the response contains a 'job_id', the job status API is polled
        until the job has finished and the final job status is returned
        instead. If the response body is not JSON, the raw content (bytes)
        is returned.
        """
        r = requests.post(self.host + api, json=body, params=params,
                          headers=self._headers_for(api, official), verify=self.verify)
        try:
            r_parsed = r.json()
        except ValueError:
            # Not a JSON body (e.g. an HTML error page): hand back the raw content
            return r.content

        # do we need to ask the job status API for help?
        if isinstance(r_parsed, dict) and 'job_id' in r_parsed:
            return self._wait_for_job(r_parsed['job_id'])
        return r_parsed

    def generic_api_put(self, api, params=None, body=None):
        """PUT *body* as JSON to *api* using the session cookies; return the raw content."""
        r = requests.put(self.host + api, json=body, params=params,
                         headers=self._headers_for(api, official=False), verify=self.verify)
        return r.content

    def generic_api_get(self, api, headers=None, params=None, official=False):
        """GET *api*, with API token if official or Cookie if not.

        If the caller sends *headers*, they are used as-is and need to contain
        the API token or the cookie themselves. Returns the parsed JSON for a
        200/201 response; returns the raw content (bytes) for any other status
        or when the body is not JSON.
        """
        headers_final = headers if headers else self._headers_for(api, official)
        r = requests.get(self.host + api, headers=headers_final, params=params, verify=self.verify)
        if r.status_code in [200, 201]:
            try:
                return r.json()
            except ValueError:
                return r.content  # for LogicalMetadata API which does not use standard JSON
        return r.content



Let's create our first AlationInstance object:

In [27]:
def log_me(text):
    """Minimal logger for this notebook: print *text* to standard output."""
    print(text)

import getpass

# Never commit a real password to a shared notebook: the password comes from
# the ALATION_PASSWORD environment variable, or is prompted for when unset.
# Host and account can be overridden the same way and default to the training server.
alation = AlationInstance(
    os.environ.get('ALATION_HOST', 'http://18.218.6.215'),
    os.environ.get('ALATION_USER', 'matthias+1@alation.com'),
    os.environ.get('ALATION_PASSWORD') or getpass.getpass('Alation password: '),
)

Logging in to http://18.218.6.215/login/


**Queries** in Alation are documents of SQL text. They can have multiple SQL statements, and their content is editable by their creator. They are associated with a single data source on creation and cannot be moved to a different data source.

**Queries** can be in draft state (unsaved), saved, or published. If a **query** is published it is also saved.

When a **query** is executed, each statement (CREATE, UPDATE, INSERT, SELECT, etc) is executed in turn by the database, and Alation stores metadata in an **execution event**. If the statement returns a **result**, Alation caches up to 16MB of that result on the server (unless the user is running an export, in which case only a 1000 row result preview is cached). These cached results (sometimes whole, sometimes partial) are available via API or in the Alation catalog until they expire. The expiration period is configurable on your server, but defaults to 1 week. Small **results** (by default less than 1MB) are stored indefinitely, with no expiration date.

In addition to an **execution event**, there is also a **session** and a **batch** associated with the execution of a query. If a query contains multiple **statements**, then multiple **events** -- one event per query statement -- would be created and would belong to a single **execution batch**. Each execution of a query in Compose will create a new batch; a single **session** could have many such batches. In the case of scheduled queries, a new session is created for each run and each such session will have only a single batch; each scheduled query is run via a designated Celery task.

# Summary



---
term | dependency | description
---- | ---------- | -----------
Query | N/A | a tab in Compose, one or more statements
Statement | ^Query | separated by ";"
Session | N/A | a period of time
(Execution) Batch | ^Session | Click on "run"
(Execution) Event | ^Batch | One per statement
(Execution) Result | ^Event | The results of one statement






In [10]:
# Describe a saved query on data source 42, then create it through the official API.
my_query = {
    "datasource_id": 42,
    "content": "select count(*) from banana;",
    "saved": True,
    "author": {
        "id": 1,
        "email": "matthias.funke@alation.com",
        "username": "matthias.funke@alation.com",
    },
}
q = alation.generic_api_post(api="/integration/v1/query/", body=my_query, official=True)
pp.pprint(q)



{   'autosave_content': 'select count(*) from banana;',
    'catalog_url': '/query/89/',
    'compose_url': '/compose/query/89/',
    'content': 'select count(*) from banana;',
    'datasource': {   'id': 42,
                      'title': 'Fennel',
                      'uri': 'postgresql://fennel2.cluster-cingqyuv6npc.us-east-2.rds.amazonaws.com:5433/fennel',
                      'url': '/data/42/'},
    'datasource_id': 42,
    'description': '',
    'has_unsaved_changes': False,
    'id': 89,
    'published': False,
    'saved': True,
    'schedules': [],
    'title': '',
    'ts_last_saved': '2020-04-19T06:19:48.322846-07:00',
    'url': '/integration/v1/query/89/'}


The previous query, alas, has no title and no description. At least it has a number.

Since there is no official API to execute the query, we will schedule a different [query](http://18.218.6.215/compose/query/85/) to run automatically and see what we get.

In [11]:
# Read query 85 back through the official API and display it.
q = alation.generic_api_get(api="/integration/v1/query/85/", official=True)
pp.pprint(q)

{   'autosave_content': 'select schema_name, table_name, attribute_name, '
                        'count(1) as freq\n'
                        'from public.zucchini\n'
                        'group by 1, 2, 3\n'
                        'order by 4 DESC\n'
                        'limit 1000;\n'
                        '\n'
                        'select now();',
    'catalog_url': '/query/85/',
    'compose_url': '/compose/query/85/',
    'content': 'select schema_name, table_name, attribute_name, count(1) as '
               'freq\n'
               'from public.zucchini\n'
               'group by 1, 2, 3\n'
               'order by 4 DESC\n'
               'limit 1000;\n'
               '\n'
               'select now();',
    'datasource': {   'id': 42,
                      'title': 'Fennel',
                      'uri': 'postgresql://fennel2.cluster-cingqyuv6npc.us-east-2.rds.amazonaws.com:5433/fennel',
                      'url': '/data/42/'},
    'datasource_id': 42,
    'de

This output gives us interesting details about the automatically executed results. Perhaps we can use some of them?

In [12]:
# List the execution sessions that belong to the query fetched above.
params = {"query_id": q["id"]}
exec_session = alation.generic_api_get(
    api="/integration/v1/query/execution_session/",
    params=params,
    official=True,
)
pp.pprint(exec_session)

[   {   'batch_ids': [195, 196, 199, 203, 204, 205, 206],
        'client_session_id': '2020-04-19T11:35:08.036Z#15d15b7373134',
        'id': 80,
        'is_scheduled': False,
        'query_id': 85,
        'sandbox_id': 'compose-85',
        'ts_start': '2020-04-19T04:36:14.269717-07:00'},
    {   'batch_ids': [200, 201],
        'client_session_id': '2020-04-19T05:20:00.261962#218a53c4',
        'id': 83,
        'is_scheduled': True,
        'query_id': 85,
        'sandbox_id': 'schedule-1',
        'ts_start': '2020-04-19T05:20:00.264583-07:00'},
    {   'batch_ids': [202, 207],
        'client_session_id': '2020-04-19T05:41:00.208143#936dda61',
        'id': 84,
        'is_scheduled': True,
        'query_id': 85,
        'sandbox_id': 'schedule-1',
        'ts_start': '2020-04-19T05:41:00.211132-07:00'},
    {   'batch_ids': [208],
        'client_session_id': '2020-04-19T06:04:00.233709#80b85921',
        'id': 85,
        'is_scheduled': True,
        'query_id': 85,
     

The result shows a small bug in the Alation code: for a scheduled execution run, the timestamp embedded in `client_session_id` is not in UTC (compare it with `ts_start`).

Let's get the batch details by using the execution batch API



In [13]:
# Take the first batch of the last session in the list and fetch its details.
batch_id = exec_session[-1]["batch_ids"][0]
batch = alation.generic_api_get(
    api="/integration/v1/query/execution_batch/{}/".format(batch_id),
    official=True,
)
pp.pprint(batch)

{   'events': [   {'id': 390, 'index_in_batch': 1},
                  {'id': 389, 'index_in_batch': 0}],
    'id': 209,
    'query_id': 85,
    'session_id': 86}


Let's get the execution event details.

In [23]:
# Build result links from the server the instance is connected to, so they
# stay correct when the notebook is pointed at a different Alation host.
base_url = alation.host

# One execution event per statement in the batch: show the event details,
# followed by a link to (and the title of) its result.
for event in batch['events']:
    event_id = event['id']
    exec_event = alation.generic_api_get(f"/integration/v1/query/execution_event/{event_id}/", official=True)
    print('-----------------------------')
    pp.pprint(exec_event)
    print('-----------------------------')
    print(f"{base_url}{exec_event['result']['url']} ({exec_event['result']['title']})")

-----------------------------
{   'batch_id': 209,
    'canceled': False,
    'datasource': {'id': 42, 'title': 'Fennel', 'url': '/data/42/'},
    'db_username': None,
    'elapsed_seconds': 0.040502,
    'execution_error': None,
    'id': 390,
    'index_in_batch': 1,
    'num_result_rows': 1,
    'query_id': 85,
    'result': {   'byte_count': 35,
                  'data_schema': [   {   'name': 'now',
                                         'norm_type': 'TIMESTAMP',
                                         'original_name': 'now',
                                         'type': 'timestamptz'}],
                  'deleted': False,
                  'expired': False,
                  'id': 111,
                  'query': {   'description': '<p>i have decided to publish '
                                              'this to enlighten the '
                                              'world</p>\n',
                               'id': 85,
                               'title': 'b

Now let's use an unofficial API to get hold of the actual result.

In [22]:
# Column names come from the result's data schema; the rows come from the
# unofficial result-data endpoint for the last event fetched above.
data_schema = exec_event["result"]["data_schema"]
result_id = exec_event["result"]["id"]
cols = [column["name"] for column in data_schema]
res = alation.generic_api_get("/ajax/execution_result_data/{}/".format(result_id))
pd.DataFrame(res, columns=cols)

Unnamed: 0,schema_name,table_name,attribute_name,freq
0,public,inventory,store,18
1,public,employees,emp_yrs,18
2,public,inventory,store_state,18
3,public,parts,cat,18
4,public,employees,emp_name,18
...,...,...,...,...
995,loan_data,full_loan_details_full,settlement_term,5
996,loan_data,full_loan_details_full,num_tl_30dpd,5
997,bank,uic_credit_data_2,bill_amt4,5
998,bank,uic_credit_data_2,education,5


# More fun with the unofficial API

In [16]:
# Fetch a single catalog object (object type + numeric ID) through the unofficial API.
otype = 'table'
object_id = 869  # not named `id`, which would shadow the builtin id() for all later cells
t = alation.generic_api_get(f'/api/{otype}/{object_id}/')

pp.pprint(t)

{   'auto_title_status': 'HIGH_CONFIDENCE',
    'base_table': None,
    'bucket_attributes': None,
    'constraint_text': None,
    'custom_fields': [   {   'allowed_otypes': None,
                             'builtin_name': None,
                             'can_edit': True,
                             'can_view': True,
                             'field_type': 'PICKER',
                             'id': 10298,
                             'name_plural': '(01NT)Data Object',
                             'name_singular': '(01NT)Data Object',
                             'tooltip_text': ''},
                         {   'allowed_otypes': [   'user',
                                                   'groupprofile',
                                                   'groupprofile'],
                             'builtin_name': 'steward',
                             'can_edit': True,
                             'can_view': True,
                             'field_type': 'OBJECT_SE

## Get a file handle to this Notebook

This code won't work for you unless you have access to the file.

In [18]:
from google.colab import auth
from googleapiclient.discovery import build

# Authenticate the Colab user before building the Drive v3 client.
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Look the notebook up by its exact name.
response = drive_service.files().list(
    q="name='Alation API Training Spring 2020.ipynb'",
    spaces='drive',
    fields='nextPageToken, files(id, name)',
).execute()
files = response.get('files', [])
if not files:
    # Fail with a clear message instead of a bare IndexError on files[0].
    raise FileNotFoundError(
        "No Drive file named 'Alation API Training Spring 2020.ipynb' is visible to this account"
    )
file = files[0]
print('Found file: {} {}'.format(file.get('name'), file.get('id')))
alation_notebook = file.get('id')
file



Found file: Alation API Training Spring 2020.ipynb 16BiqA8Oy53iwQumPhIOOI3hfh-GFjntd


{'id': '16BiqA8Oy53iwQumPhIOOI3hfh-GFjntd',
 'name': 'Alation API Training Spring 2020.ipynb'}

Copy the notebook (running this cell only defines the function)

In [0]:
def copy_notebook(name):
    """Copy the Drive file identified by the global ``alation_notebook``.

    The copy is created under *name* using the global ``drive_service``
    client. Returns an HTML paragraph whose link points to the copy on Colab.
    """
    copy_request = drive_service.files().copy(fileId=alation_notebook, body={"name": name})
    new_id = copy_request.execute()['id']
    url = f"https://colab.research.google.com/drive/{new_id}"
    return f'<p><a href="{url}" rel="noopener noreferrer" target="_blank">{url}</a></p>'

In [21]:
# Create a copy of the notebook; the cell output is the HTML link to it.
copy_notebook(name="My friend.ipynb")

'<p><a href="https://colab.research.google.com/drive/1EhpkAbCK10rYaVBIXi39B7uKVgRODtIG" rel="noopener noreferrer" target="_blank">https://colab.research.google.com/drive/1EhpkAbCK10rYaVBIXi39B7uKVgRODtIG</a></p>'