# Deployment Insights ETL

ETL for the Deployment Insights database.

### Ensure that the Aerospike Database is running

In [1]:
# Launch `asd`, discarding both stdout and stderr.
!asd >& /dev/null
# Report whether a process named exactly "asd" is running.
!pgrep -x asd >/dev/null && echo "Aerospike database is running!" || echo "**Aerospike database is not running!**"

Aerospike database is running!


### Initialize Client
Initialize Python Client used to access features stored in the Aerospike feature store.

In [2]:
import aerospike
import sys
# Connect to the database; `client` is used by the load step in a later cell.
config = {
  'hosts': [ ('127.0.0.1', 3000) ]
}
try:
  client = aerospike.client(config).connect()
except Exception as exc:
  # Catch Exception instead of using a bare `except:` so KeyboardInterrupt and
  # SystemExit still propagate, and report why the connection failed.
  print("failed to connect to the cluster with", config['hosts'])
  print("reason:", exc)
  sys.exit(1)
print('Client initialized and connected to database')

Client initialized and connected to database


# Read JSON Documents

In [3]:
import json

def readJsonDataFromFile(jsonFile):
    """Read a JSON document from a file and return the parsed object.

    Args:
        jsonFile: Path of the JSON file to read.

    Returns:
        The Python object produced by parsing the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # The context manager closes the file even when parsing fails.
    with open(jsonFile, "r") as f:
        return json.load(f)

# Parse one sample ascinfo file; the following cells use `ascinfo_json`.
json_file = '../data/20220115_004012_ascinfo.json' # example json file
ascinfo_json = readJsonDataFromFile(json_file)
print('Read ascinfo json file:', json_file)

Read ascinfo json file: ../data/20220115_004012_ascinfo.json


In [4]:
# extract top level info: timestamp and cluster
def get_timestamp(json):
    """Return the first top-level key of the document (used as the timestamp).

    Raises IndexError when the mapping has no keys.
    """
    return [*json.keys()][0]

def get_cluster_items(json):
    """Return the (key, value) pairs found under the first top-level entry.

    The value stored under the document's first key is expected to be a
    mapping; its items are returned as a list. Raises IndexError when the
    document is empty.
    """
    first_entry = list(json.values())[0]
    return [*first_entry.items()]

def get_node_items(cluster_items):
    """Return the (key, value) pairs of the first cluster's node mapping.

    `cluster_items` is a sequence of (name, mapping) pairs; only the first
    pair is used.
    """
    first_cluster = cluster_items[0]
    nodes = first_cluster[1]
    return [*nodes.items()]

# The first top-level key of the document is the timestamp.
timestamp = get_timestamp(ascinfo_json)
print('timestamp:', timestamp)

# Only the first cluster entry under the timestamp is used.
cluster_itmes = get_cluster_items(ascinfo_json)
cluster_name = cluster_itmes[0][0]
print('cluster name:', cluster_name)

# (node, subtree) pairs of that first cluster.
node_items = get_node_items(cluster_itmes)
num_nodes = len(node_items)
print('number of nodes:', num_nodes)

timestamp: 2022-01-15 00:40:12 UTC
cluster name: conv-euw1
number of nodes: 6


# Collect Insights
Collect the following cluster insights:

- name
- timestamp
- num_nodes 
- num_ns
- features: List (xdr, strong_consistency, single_bin, data_in_index, ...)
- num_objects
- storage_engines
- num_device_bytes
- num_memory_bytes

Collect the following namespace insights:
- name
- strong_consistency
- num_secondary_indices
- num_sets
- num_bins
- num_device_bytes
- num_memory_bytes
- num_objects
- replication_factor
- single_bin (true/false)
- data_in_index (true/false)
- storage_engine


### Namespace Info

Collect features for each distinct namespace from each node and aggregate stats (device/mem_bytes, objects, storage_engines):
```
for each node:
    for each namespace:
        if new, 
            add namespace to namespaces map
            copy features
        # else: ensure settings are same
        Aggregate stats      
```

In [5]:
def get_namespace_info(node_items):
    """Collect per-namespace settings and stats from the nodes' subtrees.

    Args:
        node_items: Iterable of (node, node_subtree) pairs. Each subtree must
            provide ['as_stat']['statistics']['namespace'], a mapping of
            namespace name to that namespace's subtree.

    Returns:
        dict mapping namespace name to a dict with the keys name, num_bins,
        num_sets, num_sindex, replication_factor, storage_engine,
        ns_cluster_size, master_objects, objects, single_bin,
        strong_consistency and data_in_index. Values are copied as found in
        the input (they are not converted to numbers).

    Raises:
        KeyError: If a namespace lacks 'service', or its 'service' section
            lacks 'storage-engine', 'master_objects' or 'objects'.

    Optional entries fall back to defaults instead of raising: missing
    'bin', 'set' or 'sindex' sections count as empty, and missing
    'single-bin', 'strong-consistency' or 'data-in-index' settings are
    reported as 'false'.
    """
    # Potential future additions:
    #   - secondary index count and types
    #   - total device bytes + total memory bytes (aggregate from sets)
    namespace_info = {}
    for _node, node_subtree in node_items:
        namespaces = node_subtree['as_stat']['statistics']['namespace']
        for ns, ns_subtree in namespaces.items():
            if ns in namespace_info:
                # Only the first node reporting a namespace contributes values;
                # nothing is aggregated or cross-checked across nodes.
                # NOTE(review): if 'objects'/'master_objects' are per-node
                # figures they should probably be summed here - confirm.
                continue
            service = ns_subtree['service']
            namespace_info[ns] = {
                'name': ns,
                'num_bins': ns_subtree.get('bin', {}).get('bin_names', 0),
                'num_sets': len(ns_subtree.get('set', {})),
                'num_sindex': len(ns_subtree.get('sindex', {})),
                'replication_factor': service.get('replication-factor', 0),
                'storage_engine': service['storage-engine'],
                'ns_cluster_size': service.get('ns_cluster_size', 0),
                'master_objects': service['master_objects'],
                'objects': service['objects'],
                'single_bin': service.get('single-bin', 'false'),
                'strong_consistency': service.get('strong-consistency', 'false'),
                'data_in_index': service.get('data-in-index', 'false'),
            }
    return namespace_info

# Build the per-namespace summary for the sample file and show it.
namespace_info = get_namespace_info(node_items)
print(namespace_info)

{'memory': {'name': 'memory', 'num_bins': '5', 'num_sets': 2, 'num_sindex': 0, 'replication_factor': '3', 'storage_engine': 'memory', 'ns_cluster_size': '6', 'master_objects': '226', 'objects': '604', 'single_bin': 'false', 'strong_consistency': 'false', 'data_in_index': 'false'}}


### Cluster Info

Collect features at the cluster level:

- Case, customer, timestamp, num_nodes
- edition, asd-build
- Aggregated from namespaces: num_ns, features, device/mem bytes, objects, storage engines.


In [6]:
def get_cluster_info(timestamp, cluster_name, node_items, namespace_info):
    """Build the cluster-level summary from node metadata and namespace info.

    Args:
        timestamp: Value stored under 'timestamp'.
        cluster_name: Value stored under 'name'.
        node_items: Sequence of (node, node_subtree) pairs; 'edition' and
            'asd_build' are read from the first node's
            ['as_stat']['meta_data'] only.
        namespace_info: Mapping of namespace name to a dict providing
            'objects', 'storage_engine', 'single_bin', 'strong_consistency'
            and 'data_in_index'.

    Returns:
        dict with keys timestamp, name, num_nodes, edition, asd_build,
        num_namespaces, num_objects, storage_engines and features_in_use.
        'storage_engines' is a sorted list, so the result does not depend on
        set iteration order. 'features_in_use' is returned as a set so that
        set_features_in_use() can extend it and convert it to a list.

    Raises:
        IndexError: If node_items is empty.
        KeyError: If the first node has no 'asd_build' metadata.
    """
    meta_data = node_items[0][1]['as_stat']['meta_data']

    num_objects = 0
    storage_engines = set()
    features_in_use = set()
    for nsinfo in namespace_info.values():
        # Object counts arrive as strings, so convert before summing.
        num_objects += int(nsinfo['objects'])
        storage_engines.add(nsinfo['storage_engine'])
        for feature in ('single_bin', 'strong_consistency', 'data_in_index'):
            # Settings are the strings 'true' / 'false', not booleans.
            if nsinfo[feature] == 'true':
                features_in_use.add(feature)

    return {
        'timestamp': timestamp,
        'name': cluster_name,
        'num_nodes': len(node_items),
        'edition': meta_data.get('edition', 'unspecified'),
        'asd_build': meta_data['asd_build'],
        'num_namespaces': len(namespace_info),
        'num_objects': num_objects,
        'storage_engines': sorted(storage_engines),
        'features_in_use': features_in_use,
    }

def set_features_in_use(node_items, cluster_info):
    """Add node-level features to cluster_info['features_in_use'].

    'xdr' is added when any node has a non-empty 'xdr' entry under
    ['as_stat']['config'].

    Args:
        node_items: Iterable of (node, node_subtree) pairs.
        cluster_info: Cluster summary dict; updated in place. Its
            'features_in_use' entry may be a set, a list, or absent.

    Returns:
        The same cluster_info dict, with 'features_in_use' replaced by a
        sorted list.
    """
    # Copy into a fresh set so the function also works when the entry is
    # already a list, i.e. when it is called a second time on the same dict.
    features = set(cluster_info.get('features_in_use', ()))
    for _node, node_subtree in node_items:
        if node_subtree['as_stat']['config'].get('xdr'):
            features.add('xdr')
        # infer other features
    # Sorted for a deterministic result (set iteration order is not stable).
    cluster_info['features_in_use'] = sorted(features)
    return cluster_info
    
# Build the cluster summary, then add node-level features (this also turns
# 'features_in_use' from a set into a list).
cluster_info = get_cluster_info(timestamp, cluster_name, node_items, namespace_info)
cluster_info = set_features_in_use(node_items, cluster_info)
print(cluster_info)


{'timestamp': '2022-01-15 00:40:12 UTC', 'name': 'conv-euw1', 'num_nodes': 6, 'edition': 'Aerospike Enterprise Edition build 5.5.0.9', 'asd_build': '5.5.0.9', 'num_namespaces': 1, 'num_objects': 604, 'storage_engines': ['memory'], 'features_in_use': []}


# Load Into Database
Insert the case record with the case number as the (user) key, and bins: case_num, customer, timestamp, clusters, and namespaces.

In [7]:
NAMESPACE = 'test'
SET = 'insights'
def load(case_num, customer, timestamp, cluster_info, namespace_info):
    """Write one case record to the database.

    The record key is (NAMESPACE, SET, case_num); the bins written are
    case_num, customer, timestamp, clusters and namespaces. Returns None.
    """
    record_key = (NAMESPACE, SET, case_num)
    bins = {
        'case_num': case_num,
        'customer': customer,
        'timestamp': timestamp,
        'clusters': cluster_info,
        'namespaces': namespace_info,
    }
    client.put(record_key, bins)

# Sample case number and customer name for the single-file walkthrough.
CASE_NUM = 100
CUST_NAME = 'Widgets, Inc.'
load(CASE_NUM, CUST_NAME, timestamp, cluster_info, namespace_info)
print('record inserted into the database')

record inserted into the database


## Validate Data in Database

In [8]:
# List all records in test.insights using aql's raw output mode.
!aql -c "set output raw; select * from test.insights"

set output raw
OUTPUT = RAW
select * from test.insights
*************************** 1. row ***************************
case_num: 100
customer: "Widgets, Inc."
timestamp: "2022-01-15 00:40:12 UTC"
clusters: MAP('{"asd_build":"5.5.0.9", "name":"conv-euw1", "storage_engines":["memory"], "features_in_use":[], "num_namespaces":1, "timestamp":"2022-01-15 00:40:12 UTC", "num_nodes":6, "num_objects":604, "edition":"Aerospike Enterprise Edition build 5.5.0.9"}')
namespaces: MAP('{"memory":{"ns_cluster_size":"6", "replication_factor":"3", "num_bins":"5", "storage_engine":"memory", "num_sets":2, "name":"memory", "data_in_index":"false", "single_bin":"false", "objects":"604", "master_objects":"226", "num_sindex":0, "strong_consistency":"false"}}')

1 row in set (0.249 secs)

OK




# Batch ETL 
Now we ETL all ascinfo.json files in the "data" directory.

In [9]:
import glob
def process_batch_etl(data_glob='../data/*_ascinfo.json', start_case_num=200):
    """Run the ETL for every ascinfo file matching a glob pattern.

    Each file is parsed, summarized with get_namespace_info(),
    get_cluster_info() and set_features_in_use(), and written with load().
    Progress is printed for every file.

    Args:
        data_glob: Glob pattern selecting the ascinfo JSON files.
        start_case_num: Case numbers are assigned sequentially starting at
            start_case_num + 1.

    Returns:
        None.

    Files are processed in sorted path order so that the case number and
    customer name assigned to each file are the same on every run
    (glob.glob() returns paths in arbitrary order).
    """
    CUST_NAMES = ['Widgets, Inc','Wares Corp','Parts Ltd','Component Factory','Modular Design','We Assemble']
    json_files = sorted(glob.glob(data_glob))
    for case_num, json_file in enumerate(json_files, start=start_case_num + 1):
        json_info = readJsonDataFromFile(json_file)
        print('Read ascinfo json file:', json_file)

        timestamp = get_timestamp(json_info)
        print('timestamp:', timestamp)

        cluster_items = get_cluster_items(json_info)
        cluster_name = cluster_items[0][0]
        print('cluster name:', cluster_name)

        node_items = get_node_items(cluster_items)
        num_nodes = len(node_items)
        print('number of nodes:', num_nodes)

        namespace_info = get_namespace_info(node_items)
        print(namespace_info)

        cluster_info = get_cluster_info(timestamp, cluster_name, node_items, namespace_info)
        cluster_info = set_features_in_use(node_items, cluster_info)
        print(cluster_info)

        # Placeholder customer names are assigned round-robin by case number.
        cust_name = CUST_NAMES[case_num % len(CUST_NAMES)]
        load(case_num, cust_name, timestamp, cluster_info, namespace_info)
        print('record inserted into the database')

        print('\n')
    return
    
# Run the ETL over every matching file in the data directory.
process_batch_etl()
print('batch etl done.')

Read ascinfo json file: ../data/20190315_203751_ascinfo.json
timestamp: 2019-03-15 20:39:59 UTC
cluster name: null
number of nodes: 34
{'users': {'name': 'users', 'num_bins': '9', 'num_sets': 2, 'num_sindex': 0, 'replication_factor': 0, 'storage_engine': 'device', 'ns_cluster_size': 0, 'master_objects': '1377143747', 'objects': '2361400096', 'single_bin': 'false', 'strong_consistency': 'false', 'data_in_index': 'false'}, 'context': {'name': 'context', 'num_bins': '14', 'num_sets': 1, 'num_sindex': 0, 'replication_factor': 0, 'storage_engine': 'device', 'ns_cluster_size': 0, 'master_objects': '111906126', 'objects': '221789094', 'single_bin': 'false', 'strong_consistency': 'false', 'data_in_index': 'false'}}
{'timestamp': '2019-03-15 20:39:59 UTC', 'name': 'null', 'num_nodes': 34, 'edition': 'unspecified', 'asd_build': '3.13.0.11', 'num_namespaces': 2, 'num_objects': 2583189190, 'storage_engines': ['device'], 'features_in_use': ['xdr']}
record inserted into the database


Read ascinfo j

In [10]:
# List all records in test.insights using aql's raw output mode.
!aql -c "set output raw; select * from test.insights"

set output raw
OUTPUT = RAW
select * from test.insights
*************************** 1. row ***************************
case_num: 201
customer: "Component Factory"
timestamp: "2019-03-15 20:39:59 UTC"
clusters: MAP('{"asd_build":"3.13.0.11", "name":"null", "storage_engines":["device"], "features_in_use":["xdr"], "num_namespaces":2, "timestamp":"2019-03-15 20:39:59 UTC", "num_nodes":34, "num_objects":2583189190, "edition":"unspecified"}')
namespaces: MAP('{"users":{"ns_cluster_size":0, "replication_factor":0, "num_bins":"9", "storage_engine":"device", "num_sets":2, "name":"users", "data_in_index":"false", "single_bin":"false", "objects":"2361400096", "master_objects":"1377143747", "num_sindex":0, "strong_consistency":"false"}, "context":{"ns_cluster_size":0, "replication_factor":0, "num_bins":"14", "storage_engine":"device", "num_sets":1, "name":"context", "data_in_index":"false", "single_bin":"false", "objects":"221789094", "master_objects":"111906126", "num_sindex":0, "strong_consisten

# Trino SQL Queries
For the following queries to execute in the notebook, you must have a Trino server running at port 8080 of the host, connected to this container's Aerospike database via the Aerospike Trino Connector.

In [11]:
# show schemas (namespaces)
!../trino --server host.docker.internal:8080 --catalog aerospike --schema test --execute 'show schemas';


"information_schema"
"test"


In [12]:
# show tables (sets)
!../trino --server host.docker.internal:8080 --catalog aerospike --schema test --execute 'show tables';

"__default"
"insights"
"redis"


In [13]:
# get customer name for customers using feature 'single_bin'
!../trino --server host.docker.internal:8080 --catalog aerospike --schema test --execute "select customer from insights where contains(cast(json_extract(clusters, '$.features_in_use') as array(VARCHAR)),'single_bin')" ;


"Parts Ltd"


In [14]:
# get customer name and release for customers using feature 'xdr' on a 5.x release
# (the pattern is anchored: an unanchored '5.*' matches any build string that contains a 5, e.g. 3.13.5.1)
!../trino --server host.docker.internal:8080 --catalog aerospike --schema test --execute "select customer, json_extract(clusters, '$.asd_build') as release from insights where contains(cast(json_extract(clusters, '$.features_in_use') as array(VARCHAR)),'xdr')  and regexp_like(cast(json_extract(clusters, '$.asd_build') as VARCHAR), '^5[.]');" ;


"Widgets, Inc","""5.1.0.10"""
"Component Factory","""5.1.0.7"""


In [15]:
# Clean up: truncate the test.insights set.
!aql -c "truncate test.insights"

truncate test.insights
OK


