<a href="https://colab.research.google.com/github/mafux777/Alation_Article/blob/master/Alation_API_Training_July_2021.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Alation API Training Worksheet

We need to import just a small number of libraries to create an Alation Instance. You can use this Class to test against any official or unofficial API.

In [16]:
import pandas as pd
import os
import requests

import time
import json

import pprint
pp = pprint.PrettyPrinter(indent=4)

The class AlationInstance is created with a URL, username and password.

In [17]:
import urllib

# The AlationInstance class is a handle to an Alation server defined by a URL
# A server admin user name and password needs to be provided and all API actions
# will be run as that user
class AlationInstance():
    """Handle to an Alation server identified by its base URL.

    All API actions run as the user whose credentials are passed to the
    constructor. Calls made with ``official=True`` authenticate with the API
    token returned by ``get_token``; all other calls send the session cookie
    and CSRF token collected by ``login``.
    """

    # Pause between two polls of the job status API, in seconds.
    JOB_POLL_SECONDS = 1

    def __init__(self, host, account, password, verify=True):
        """Store the connection settings, then fetch an API token and log in.

        host: base URL of the Alation server
        account: the up to 30 chars user name, often the email, but for long
            emails could be cut off
        password: could be the LDAP password, as well
        verify: forwarded to every ``requests`` call. By default SSL
            verification is enabled, and Requests will throw a SSLError if
            it is unable to verify the certificate.
        """
        self.host = host
        self.verify = verify
        self.account = account
        self.password = password
        self.token = self.get_token()
        self.headers = self.login(account, password)

    def _url(self, api):
        """Join the host and an API path with exactly one slash in between."""
        return self.host.rstrip('/') + '/' + api.lstrip('/')

    def _build_headers(self, api, official=False):
        """Return the request headers for one call to ``api``.

        Official calls only carry the API token. Other calls get a copy of
        the cached login headers with ``Referer`` pointing at the target URL;
        copying keeps ``self.headers`` unchanged between calls.
        """
        if official:
            return dict(token=self.token)
        headers = dict(self.headers)
        headers['Referer'] = self._url(api)
        return headers

    @staticmethod
    def _json_or_content(response):
        """Return the decoded JSON body, or the raw bytes if it is not JSON."""
        try:
            return response.json()
        except ValueError:
            # e.g. an HTML error page, or an API that does not use standard JSON
            return response.content

    def login(self, account, password):
        """Log in through the web login form and return the session headers.

        account: the up to 30 chars user name, often the email, but for long
            emails could be cut off
        password: could be the LDAP password, as well

        Returns a dict with the ``X-CSRFToken``, ``Cookie`` and ``Referer``
        headers. A missing session ID is only reported through the
        module-level ``log_me`` function; the headers are returned regardless.
        """
        url = self._url('/login/')

        with requests.Session() as session:
            # the first GET makes the server hand out the CSRF cookie
            session.get(url, verify=self.verify)
            csrftoken = session.cookies.get('csrftoken')

            # login with user name and password (and token)
            payload = {"csrfmiddlewaretoken": csrftoken, "ldap_user": account, "password": password}
            log_me("Logging in to {}".format(url))
            session.post(url, data=payload, verify=self.verify, headers={"Referer": url})

            # get the session ID and store it for all future API calls
            sessionid = session.cookies.get('sessionid')

        if not sessionid:
            log_me('No session ID, probably wrong user name / password')

        return {"X-CSRFToken": csrftoken,
                "Cookie": f"csrftoken={csrftoken}; sessionid={sessionid}",
                "Referer": url
                }

    def get_token(self):
        """Return the API token for the stored account as text.

        The token is first requested from the getToken endpoint; if that
        answers "EXISTING", the changeToken endpoint is called instead.
        """
        change_token = "/api/v1/changeToken/"  # if you already have a token, use this url
        new_token = "/api/v1/getToken/"  # if you have never generated a token, use this url
        data = dict(username=self.account, password=self.password)
        response = requests.post(self._url(new_token), data=data, verify=self.verify)
        api_token = response.text
        if api_token == "EXISTING":
            response = requests.post(self._url(change_token), data=data, verify=self.verify)
            api_token = response.text
        return api_token

    def generic_api_post_form(self, api, params=None, data=None, official=False):
        """POST ``data`` form-encoded to ``api``.

        Returns the decoded JSON reply, or the raw bytes if the reply is not
        JSON.
        """
        r = requests.post(self._url(api), data=data, params=params,
                          headers=self._build_headers(api, official), verify=self.verify)
        return self._json_or_content(r)

    def generic_api_post(self, api, params=None, body=None, official=False):
        """POST ``body`` as JSON to ``api``.

        When the reply is a JSON object containing ``job_id``, the job status
        API is polled until the job is no longer running and the final status
        is returned instead. Otherwise returns the decoded JSON reply, or the
        raw bytes if the reply is not JSON.
        """
        r = requests.post(self._url(api), json=body, params=params,
                          headers=self._build_headers(api, official), verify=self.verify)
        r_parsed = self._json_or_content(r)
        # do we need to ask the job status API for help?
        if isinstance(r_parsed, dict) and 'job_id' in r_parsed:
            return self._wait_for_job(r_parsed['job_id'])
        return r_parsed

    def _wait_for_job(self, job_id):
        """Poll the job status API until the job stops running; return the last reply."""
        params = dict(id=job_id)
        url_job = "/api/v1/bulk_metadata/job/"
        while True:
            status = self.generic_api_get(api=url_job, params=params, official=True)
            # a non-dict reply is an error body: hand it back instead of looping forever
            if not isinstance(status, dict) or status.get('status') != 'running':
                return status
            time.sleep(self.JOB_POLL_SECONDS)

    def generic_api_put(self, api, params=None, body=None):
        """PUT ``body`` as JSON to ``api`` with the session headers; return the raw reply bytes."""
        r = requests.put(self._url(api), json=body, params=params,
                         headers=self._build_headers(api), verify=self.verify)
        return r.content

    def generic_api_patch(self, api, params=None, body=None):
        """PATCH ``body`` as JSON to ``api`` with the session headers; return the raw reply bytes."""
        r = requests.patch(self._url(api), json=body, params=params,
                           headers=self._build_headers(api), verify=self.verify)
        return r.content

    def generic_api_get(self, api, headers=None, params=None, official=False):
        """GET ``api``, with API token if official or session cookie if not.

        If the caller sends ``headers``, they are used as given and need to
        contain the API token or the cookie themselves.

        Returns the decoded JSON for a 200/201 reply (raw bytes if that reply
        is not JSON, e.g. the LogicalMetadata API), and the raw bytes for any
        other status code.
        """
        if headers:
            # caller has supplied the headers
            headers_final = headers
        else:
            headers_final = self._build_headers(api, official)
        r = requests.get(self._url(api), headers=headers_final, params=params, verify=self.verify)
        if r.status_code in [200, 201]:
            return self._json_or_content(r)
        return r.content


Let's create our first AlationInstance object:

In [18]:
def log_me(text):
    """Print ``text``; this is the logging hook AlationInstance.login calls."""
    print(text)

# Connection details are read from the environment so that no secret is stored
# in the notebook; the password is prompted for when ALATION_PASSWORD is unset.
from getpass import getpass

alation = AlationInstance(
    os.environ.get('ALATION_HOST', 'https://2021-2-sandbox.alationproserv.com/'),
    os.environ.get('ALATION_ACCOUNT', 'olen.musuk@bitcoin.com'),
    os.environ.get('ALATION_PASSWORD') or getpass('Alation password: '),
)

Logging in to https://2021-2-sandbox.alationproserv.com//login/


**Queries** in Alation are documents of SQL text. They can have multiple SQL statements, and their content is editable by their creator. They are associated with a single data source on creation and cannot be moved to a different data source.

**Queries** can be in draft state (unsaved), saved, or published. If a **query** is published it is also saved.

When a **query** is executed, each statement (CREATE, UPDATE, INSERT, SELECT, etc) is executed in turn by the database, and Alation stores metadata in an **execution event**. If the statement returns a **result**, Alation caches up to 16MB of that result on the server (unless the user is running an export, in which case only a 1000 row result preview is cached). These cached results (sometimes whole, sometimes partial) are available via API or in the Alation catalog until they expire. The expiration period is configurable on your server, but defaults to 1 week. Small **results** (by default less than 1MB) are stored indefinitely, with no expiration date.

In addition to an **execution event**, there is also a **session** and a **batch** associated with the execution of a query. If a query contains multiple **statements**, then multiple **events** -- one event per query statement -- would be created and would belong to a single **execution batch**. Each execution of a query in Compose will create a new batch; a single **session** could have many such batches. In the case of scheduled queries, a new session is created for each run and each such session will have only a single batch; each scheduled query is run via a designated Celery task.

# Summary



---
term | dependency | description
---- | ---------- | -----------
Query | N/A | a tab in Compose; one or more statements
Statement | ^Query | separated by ";"
Session | N/A | a period of time
(Execution) Batch | ^Session | Click on "run"
(Execution) Event | ^Batch | One per statement
(Execution) Result | ^Event | The results of one statement






In [23]:
# SQL used by the following cells: symbol, name and the 24-hour volume divided
# by 1e6 and rounded, for rows where that volume is not null, largest first.
query_text="""
SELECT
  symbol,
  name,
  round("metrics.market_data.volume_last_24_hours"/1e6) as volume
FROM crypto.crypto_large 
WHERE "metrics.market_data.volume_last_24_hours" is not null
ORDER BY 3 DESC
"""

In [24]:
# Payload for the query API: the data source the query belongs to, its SQL
# text, and the saved flag.
# An "author" entry may be added as well, e.g.
#   "author": dict(id=9, email='matthias+2@alation.com', username='matthias+2@alation.com')
my_query = {
    "datasource_id": 22,
    "content": query_text,
    "saved": True,
}
q = alation.generic_api_post(api='/integration/v1/query/', body=my_query, official=True)
pp.pprint(q)



{   'autosave_content': '\n'
                        'SELECT\n'
                        '  symbol,\n'
                        '  name,\n'
                        '  '
                        'round("metrics.market_data.volume_last_24_hours"/1e6) '
                        'as volume\n'
                        'FROM crypto.crypto_large \n'
                        'WHERE "metrics.market_data.volume_last_24_hours" is '
                        'not null\n'
                        'ORDER BY 3 DESC\n',
    'catalog_url': '/query/22/',
    'compose_url': '/compose/query/22/',
    'content': '\n'
               'SELECT\n'
               '  symbol,\n'
               '  name,\n'
               '  round("metrics.market_data.volume_last_24_hours"/1e6) as '
               'volume\n'
               'FROM crypto.crypto_large \n'
               'WHERE "metrics.market_data.volume_last_24_hours" is not null\n'
               'ORDER BY 3 DESC\n',
    'datasource': {   'id': 22,
                      'titl

The previous query, alas, has no title and no description. At least it has a number.

Since there is no official API to execute the query, we will schedule a different [query](http://18.218.6.215/compose/query/85/) to run automatically and see what we get.

In [25]:
# Fetch query 19 with token authentication (official=True) and pretty-print the reply.
q = alation.generic_api_get('/integration/v1/query/19/', official=True)
pp.pprint(q)

(b'\n\n<!DOCTYPE html>\n<html>\n  <head>\n    <link rel="stylesheet" href="/'
 b'static/CACHE/css/output.b3389db0ad10.css" type="text/css" />\n    <title>'
 b'\n      Server Error - Alation\n    </title>\n  </head>\n  <body>\n    <d'
 b"iv class='error-page'>\n      <div class='alation-bot-image' id='robot-im"
 b"g-500'>\n          <a href='/'><img src='/static/img/robot500.png'></a>\n "
 b"     </div>\n      <div class='error-text'>\n        <p class='alation-bot"
 b'-quote\'>"I seem to be experiencing technical difficulties."</p>\n        '
 b"<div class='error-message'>\n          <p class='status-code'>500</p>\n   "
 b"       <p class='status-message'>\n            Please try again after we'"
 b've made some repairs.\n          </p>\n        </div>\n      </div>\n    </d'
 b'iv>\n  </body>\n</html>\n')


In [61]:
# Same request as in the previous cell: fetch query 19 and pretty-print the reply.
q = alation.generic_api_get('/integration/v1/query/19/', official=True)
pp.pprint(q)

(b'\n\n<!DOCTYPE html>\n<html>\n  <head>\n    <link rel="stylesheet" href="/'
 b'static/CACHE/css/output.b3389db0ad10.css" type="text/css" />\n    <title>'
 b'\n      Server Error - Alation\n    </title>\n  </head>\n  <body>\n    <d'
 b"iv class='error-page'>\n      <div class='alation-bot-image' id='robot-im"
 b"g-500'>\n          <a href='/'><img src='/static/img/robot500.png'></a>\n "
 b"     </div>\n      <div class='error-text'>\n        <p class='alation-bot"
 b'-quote\'>"I seem to be experiencing technical difficulties."</p>\n        '
 b"<div class='error-message'>\n          <p class='status-code'>500</p>\n   "
 b"       <p class='status-message'>\n            Please try again after we'"
 b've made some repairs.\n          </p>\n        </div>\n      </div>\n    </d'
 b'iv>\n  </body>\n</html>\n')


This output gives us interesting details about the automatically executed results. Perhaps we can use some of them?

In [26]:
# List the execution sessions of query 19.
# The id is hard-coded here; q['id'] could be used instead when q holds a query.
params = {"query_id": 19}
exec_session = alation.generic_api_get(
    '/integration/v1/query/execution_session/',
    params=params,
    official=True,
)
pp.pprint(exec_session)

[   {   'batch_ids': [32],
        'client_session_id': '2021-07-20T01:15:00.403972#c44c32f0',
        'id': 17,
        'is_scheduled': True,
        'query_id': 19,
        'sandbox_id': 'schedule-1',
        'ts_start': '2021-07-20T01:15:00.406536-07:00'},
    {   'batch_ids': [33, 35, 36],
        'client_session_id': '2021-07-20T08:16:03.032Z#a6ae48b646e7e',
        'id': 18,
        'is_scheduled': False,
        'query_id': 19,
        'sandbox_id': 'compose-19',
        'ts_start': '2021-07-20T01:33:28.551237-07:00'},
    {   'batch_ids': [37],
        'client_session_id': '2021-07-20T02:00:00.449814#ad8a3069',
        'id': 20,
        'is_scheduled': True,
        'query_id': 19,
        'sandbox_id': 'schedule-1',
        'ts_start': '2021-07-20T02:00:00.452403-07:00'},
    {   'batch_ids': [39],
        'client_session_id': '2021-07-20T09:12:45.724Z#e0bd9c5cc93f8',
        'id': 22,
        'is_scheduled': False,
        'query_id': 19,
        'sandbox_id': 'compose-19',
 

The result shows a small bug in the Alation code: for a scheduled execution run, the timestamp does not show the UTC time.

Let's get the batch details by using the execution batch API



In [27]:
# Look up the first batch of the last entry in exec_session.
last_session = exec_session[-1]
batch_id = last_session['batch_ids'][0]
batch = alation.generic_api_get(
    f'/integration/v1/query/execution_batch/{batch_id}/',
    official=True,
)
pp.pprint(batch)

{   'events': [   {'id': 50, 'index_in_batch': 0},
                  {'id': 51, 'index_in_batch': 1}],
    'id': 47,
    'query_id': 19,
    'session_id': 30}


Let's get the execution event details.

In [28]:
base_url = "https://2021-2-sandbox.alationproserv.com"

# Fetch and print every execution event of the batch, followed by the link to
# its result. exec_event keeps the last event once the loop has finished.
separator = '-----------------------------'
for event in batch['events']:
    event_id = event['id']
    exec_event = alation.generic_api_get(
        f"/integration/v1/query/execution_event/{event_id}/",
        official=True,
    )
    print(separator)
    pp.pprint(exec_event)
    print(separator)
    event_result = exec_event['result']
    print(f"{base_url}{event_result['url']} ({event_result['title']})")

-----------------------------
{   'batch_id': 47,
    'canceled': False,
    'datasource': {'id': 22, 'title': 'Demo Database', 'url': '/data/22/'},
    'db_username': 'olen.musuk',
    'elapsed_seconds': 0.096235,
    'execution_error': None,
    'id': 50,
    'index_in_batch': 0,
    'num_result_rows': 10,
    'query_id': 19,
    'result': {   'byte_count': 266,
                  'data_schema': [   {   'name': 'symbol',
                                         'norm_type': 'STRING',
                                         'original_name': 'symbol',
                                         'type': 'text'},
                                     {   'name': 'name',
                                         'norm_type': 'STRING',
                                         'original_name': 'name',
                                         'type': 'text'},
                                     {   'name': 'volume',
                                         'norm_type': 'FLOAT',
                 

Now let's use an unofficial API to get hold of the actual result.

In [29]:
# Download the rows of the last event's result; the column names come from
# that result's data_schema.
result = exec_event['result']
result_id = result['id']  # the original cell noted 148 here
data_schema = result['data_schema']
cols = [column['name'] for column in data_schema]
res = alation.generic_api_get(f'/ajax/execution_result_data/{result_id}/')
pd.DataFrame(res, columns=cols)

Unnamed: 0,symbol,name,volume
0,USDT,Tether,82033
1,BTC,Bitcoin,48801
2,ETH,Ethereum,30133
3,BUSD,Binance USD,5958
4,XRP,XRP,3703
...,...,...,...
95,XVS,VENUS,60
96,MANA,Decentraland,60
97,ZEN,Horizen,59
98,TRB,Tellor,57


The email which we normally get contains a link to download the file. It is created like this...

In [11]:
# Print the CSV download link for result_id.
print(f'{base_url}/ajax/get_result_table_data/{result_id}/?csv=1')

https://2021-2-sandbox.alationproserv.com/ajax/get_result_table_data/8/?csv=1


# Alternative to using Alation Compose

You can use a combination of Pandas and psycopg2 to get the data quickly and easily.

In [30]:
from getpass import getpass

from psycopg2 import connect, extensions, DatabaseError

# The database password is read from the environment (or prompted for) so that
# it is not stored in the notebook.
connection = connect(host='fennel2.cluster-cingqyuv6npc.us-east-2.rds.amazonaws.com',
                     port=5433,
                     user='olen.musuk',
                     password=os.environ.get('DEMO_DB_PASSWORD') or getpass('Database password: '),
                     dbname='_MySQL__Analytics')

# Run the same SQL directly against the database; the connection is closed
# again even if the query fails.
try:
    df = pd.read_sql(query_text, connection)
finally:
    connection.close()
df


  """)


Unnamed: 0,symbol,name,volume
0,USDT,Tether,82033.0
1,BTC,Bitcoin,48801.0
2,ETH,Ethereum,30133.0
3,BUSD,Binance USD,5958.0
4,XRP,XRP,3703.0
...,...,...,...
1700,CS,Credits,0.0
1701,,Pibble,0.0
1702,MEX,MEX,0.0
1703,XSR,XSR,0.0


## Trust Flags Galore


In [19]:
# https://2021-2-sandbox.alationproserv.com/schema/40/
# NOTE(review): the URL above names schema 40 while the request filters on
# schema_id=41 — confirm which one is intended.

my_tables = alation.generic_api_get('/integration/v2/table/', params={"schema_id": 41}, official=True)

# Post one WARNING flag per table returned above and print each reply.
for t in my_tables:
    flag = {
        "flag_type": "WARNING",
        "subject": {"id": t['id'], "otype": "table"},
        "flag_reason": f"Table with funny name {t['name']} looks fishy.",
    }
    my_flag = alation.generic_api_post('/integration/flag/', body=flag, official=True)
    print(my_flag)




In [22]:
# Display the table list returned by the table API.
my_tables

[{'base_table_key': None,
  'custom_fields': [],
  'description': '',
  'ds_id': 22,
  'id': 796,
  'key': '22.census.state_lu',
  'name': 'state_lu',
  'partition_columns': None,
  'partition_definition': None,
  'schema_id': 41,
  'schema_name': 'census',
  'sql': None,
  'table_comment': 'Original source: https://demo-sales.alationcatalog.com/table/835/',
  'table_type': 'TABLE',
  'title': '',
  'url': '/table/796/'},
 {'base_table_key': None,
  'custom_fields': [],
  'description': '',
  'ds_id': 22,
  'id': 834,
  'key': '22.census.zip_pop',
  'name': 'zip_pop',
  'partition_columns': None,
  'partition_definition': None,
  'schema_id': 41,
  'schema_name': 'census',
  'sql': None,
  'table_comment': 'Original source: https://demo-sales.alationcatalog.com/table/322/',
  'table_type': 'TABLE',
  'title': '',
  'url': '/table/834/'},
 {'base_table_key': None,
  'custom_fields': [],
  'description': '',
  'ds_id': 22,
  'id': 856,
  'key': '22.census.census_data1',
  'name': 'census

In [20]:
# https://2021-2-sandbox.alationproserv.com/ajax/change_num_sample_rows/table/753/
# NOTE(review): the URL above shows table 753, the call below targets table 897.
# Form-encoded POST of new_value=0 for that table, using the session cookie.
t = 897
r = alation.generic_api_post_form(
    f'/ajax/change_num_sample_rows/table/{t}/',
    data={"new_value": 0},
)
r

{'status': 'ok'}

In [21]:
# https://2021-2-sandbox.alationproserv.com/ajax/set_attr_sensitivity/19759/
# NOTE(review): the URL above shows id 19759, the call below targets 21357.
# Form-encoded POST of action=mark_sensitive for that id, using the session cookie.
col = 21357
r = alation.generic_api_post_form(
    f'/ajax/set_attr_sensitivity/{col}/',
    data={"action": "mark_sensitive"},
)
r

{'status': 'ok'}

# More fun with the unofficial API

Let's see what the unofficial API reveals about our Query...


In [52]:
# Load the reply of /api/query/ into a DataFrame and show the row with the
# greatest last_executed_at value.
t = alation.generic_api_get('/api/query/')

df = pd.DataFrame(t)
by_last_run = df.sort_values('last_executed_at', ascending=False)
by_last_run.iloc[0]

id                                                                    19
display_content        SELECT\n  symbol,\n  name,\n  round("metrics.m...
otype                                                              query
icon                                                               query
url                                                           /query/19/
form_url                                                 /query/19/form/
dict_url                                                       /dict/22/
history_url                                           /query/19/history/
num_clones                                                             0
last_edited_at                          2021-07-20T01:11:25.641794-07:00
last_autosaved_at                                                   None
last_saved_at                           2021-07-20T01:11:54.175186-07:00
last_published_at                       2021-07-20T01:11:54.184538-07:00
last_executed_at                        2021-07-20T

In [53]:
# PATCH the object with discarded=True and show the flag from the reply.
otype = 'query'
query_id = 21  # not called "id", which would shadow the builtin for the rest of the notebook

p = alation.generic_api_patch(f'/api/{otype}/{query_id}/', body=dict(discarded=True))
p_parsed = json.loads(p)  # generic_api_patch returns the raw reply bytes
p_parsed['discarded']


True

## Get a file handle to this Notebook

This code won't work for you unless you have access to the file.

In [None]:
from google.colab import auth
from googleapiclient.discovery import build

# Authenticate the Colab user, then open a Drive v3 client.
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Search Drive for the notebook by name and keep the first match.
listing = drive_service.files().list(
    q="name='Alation API Training Spring 2020.ipynb'",
    spaces='drive',
    fields='nextPageToken, files(id, name)',
)
response = listing.execute()
matches = response.get('files', [])
file = matches[0]
print('Found file: {} {}'.format(file.get('name'), file.get('id')))
alation_notebook = file.get('id')
file



Found file: Alation API Training Spring 2020.ipynb 16BiqA8Oy53iwQumPhIOOI3hfh-GFjntd


{'id': '16BiqA8Oy53iwQumPhIOOI3hfh-GFjntd',
 'name': 'Alation API Training Spring 2020.ipynb'}

Copy the notebook (running this cell only defines the method)

In [None]:
def copy_notebook(name):
    """Copy the notebook stored in ``alation_notebook`` under the new ``name``.

    Uses the module-level ``drive_service`` client and returns an HTML
    paragraph linking to the copy in Colab.
    """
    link_template = '<p><a href="https://colab.research.google.com/drive/{0}" rel="noopener noreferrer" target="_blank">https://colab.research.google.com/drive/{0}</a></p>'
    copy_request = drive_service.files().copy(fileId=alation_notebook, body={"name": name})
    response = copy_request.execute()
    return link_template.format(response['id'])

In [None]:
# Create a copy named "My friend.ipynb"; the cell shows the returned HTML link.
copy_notebook("My friend.ipynb")

'<p><a href="https://colab.research.google.com/drive/1EhpkAbCK10rYaVBIXi39B7uKVgRODtIG" rel="noopener noreferrer" target="_blank">https://colab.research.google.com/drive/1EhpkAbCK10rYaVBIXi39B7uKVgRODtIG</a></p>'