In [1]:
import scape.registry as R
import scape.sql as sql
import os,json
from getpass import getuser,getpass

import sqlalchemy
from sqlalchemy import Column, Integer, String, Numeric, DATETIME
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from getpass import (getpass, getuser)

In [2]:
# Prepare the URL with authentication data

pwd = getpass()  # prompt for the database password without echoing it

# SQLAlchemy 1.4+ builds URLs through URL.create(); calling URL(...) directly is
# deprecated there and fails on 2.0. Older releases only offer the constructor,
# so fall back to it when create() is missing.
_make_url = getattr(sqlalchemy.engine.url.URL, 'create', sqlalchemy.engine.url.URL)
myurl = _make_url(
    'postgresql',         # SQL backend type
    username='postgres',  # or getuser() if username is the same
    password=pwd,         # password entered at the prompt above
    host='localhost',     # assuming you're on a workstation
    database='lanldb',    # Name of your LANL database
    port=5432)            # Make sure this matches your database configuration

········


In [3]:
# Create a persistent engine that the cells below reuse.
# No `global` statement is needed here: a name assigned at the top level of a
# cell is already a module-level (global) name.
engine = sqlalchemy.create_engine(myurl)

In [4]:
def connect(engine, registry_path='lanldataregistry.json'):
    """Read the registry file and connect the metadata to the database object.

    Args:
        engine: Database engine passed unchanged to every ``sql.SqlDataSource``.
        registry_path: Path of the JSON registry file to load. Defaults to
            ``'lanldataregistry.json'`` (resolved against the current working
            directory), which is the file the original version always read.

    Returns:
        An ``R.Registry`` built from a dict holding one ``sql.SqlDataSource``
        per top-level key of the JSON object; the key is used as the table
        name and its value is wrapped in ``R.TableMetadata``.

    Raises:
        OSError: If the registry file cannot be opened.
        ValueError: If the file does not contain valid JSON.
    """
    with open(registry_path, 'rt') as f:
        overlay = json.load(f)

    # One data source per table described in the registry file.
    data_sources = {
        table: sql.SqlDataSource(engine=engine,
                                 metadata=R.TableMetadata(fields),
                                 table=table,
                                 description='')
        for table, fields in overlay.items()
    }
    return R.Registry(data_sources)

In [5]:
# Build the registry from the engine created above; the cells below query it.
registry=connect(engine)

## Registry exploration to learn about available data sources.

In [6]:
# Evaluate the registry on its own to list the available data sources
registry

0,1,2
auth,SqlDataSource,
dns,SqlDataSource,
flows,SqlDataSource,
proc,SqlDataSource,
redteam,SqlDataSource,


The data source class helps developers know where to look to find translators between SCAPE queries and the native queries issued to the back-end database.

In [7]:
# The has() function identifies the data sources that have a field associated with a particular dimension.
registry.has('hostname')

0,1,2,3,4,5
auth,SqlDataSource,,,,
,,,shost,hostname,src
,,,dhost,hostname,dst
dns,SqlDataSource,,,,
,,,resolvedhost,hostname,"dnsresponse,dst,dns"
,,,shost,hostname,src
flows,SqlDataSource,,,,
,,,shost,hostname,src
,,,dhost,hostname,dst
proc,SqlDataSource,,,,


In [8]:
# We can also ask about a tagged dimension associated with a field:
# 'src:hostname' asks for hostname fields that carry the src tag.
registry.has('src:hostname')

0,1,2,3,4,5
auth,SqlDataSource,,,,
,,,shost,hostname,src
dns,SqlDataSource,,,,
,,,shost,hostname,src
flows,SqlDataSource,,,,
,,,shost,hostname,src
redteam,SqlDataSource,,,,
,,,shost,hostname,src


###### This query tells us all data streams that have hosts acting as the source in some network interaction.

In [9]:
# Multiple tags can be associated with the same field. When the registry is queried with several tags,
# it returns the data streams that contain all of the specified tags. Tags are delimited by colons,
# and the final element in the list is a dimension, not a tag.
registry.has('dst:dns:hostname')

0,1,2,3,4,5
dns,SqlDataSource,,,,
,,,resolvedhost,hostname,"dnsresponse,dst,dns"


In [10]:
# You can also ask about tags without specifying a dimension (leave the part after the last colon empty)
registry.has('dst:')

0,1,2,3,4,5
auth,SqlDataSource,,,,
,,,duser,username,dst
,,,dhost,hostname,dst
dns,SqlDataSource,,,,
,,,resolvedhost,hostname,"dnsresponse,dst,dns"
flows,SqlDataSource,,,,
,,,dport,port,dst
,,,dhost,hostname,dst
redteam,SqlDataSource,,,,
,,,dhost,hostname,dst


## Explore a particular data stream

In [11]:
# Select a data stream of interest by looking it up in the registry by name
dns=registry['dns']

In [12]:
# Display the knowledge engineering for this data stream: each field with its dimension and tags
dns

0,1,2
Field,Dim,Tags
id,primarykey,
resolvedhost,hostname,"dnsresponse,dst,dns"
shost,hostname,src
time,time,seconds


In [27]:
# Get all data from this data stream as a list of dictionaries,
# then show only the first five records to keep the output short
alldns=dns.select().run(out='list')
alldns[:5]

[{'id': 0, 'resolvedhost': 'C5030', 'shost': 'C4653', 'time': 2},
 {'id': 1, 'resolvedhost': 'C16712', 'shost': 'C5782', 'time': 2},
 {'id': 2, 'resolvedhost': 'C419', 'shost': 'C1191', 'time': 6},
 {'id': 3, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 15},
 {'id': 4, 'resolvedhost': 'C5030', 'shost': 'C2436', 'time': 18}]

In [26]:
# You can also get a list of records by calling list()
alldns=dns.select().list()
alldns[:5]

[{'id': 0, 'resolvedhost': 'C5030', 'shost': 'C4653', 'time': 2},
 {'id': 1, 'resolvedhost': 'C16712', 'shost': 'C5782', 'time': 2},
 {'id': 2, 'resolvedhost': 'C419', 'shost': 'C1191', 'time': 6},
 {'id': 3, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 15},
 {'id': 4, 'resolvedhost': 'C5030', 'shost': 'C2436', 'time': 18}]

The call below returns the results in a pandas dataframe. It's the same as calling

    alldnsdf=dns.select().run(out='pandas')

In [15]:
# Get all data from this data stream as a pandas DataFrame
alldnsdf=dns.select().pandas()
# Preview the first rows only; rendering the whole frame floods the notebook output
alldnsdf.head(10)

Unnamed: 0,id,time,shost,resolvedhost
0,0,2,C4653,C5030
1,1,2,C5782,C16712
2,2,6,C1191,C419
3,3,15,C3380,C22841
4,4,18,C2436,C5030
5,5,31,C161,C2109
6,6,35,C5642,C528
7,7,38,C3380,C22841
8,8,42,C2428,C1065
9,9,42,C2428,C2109


In [16]:
# Limit the result set to a smaller number (Currently buggy)
# NOTE: despite the variable name, limit=6 asks for six records, not ten.
tendns=dns.select(limit=6).list()
tendns

[{'id': 0, 'resolvedhost': 'C5030', 'shost': 'C4653', 'time': 2},
 {'id': 1, 'resolvedhost': 'C16712', 'shost': 'C5782', 'time': 2},
 {'id': 2, 'resolvedhost': 'C419', 'shost': 'C1191', 'time': 6},
 {'id': 3, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 15},
 {'id': 4, 'resolvedhost': 'C5030', 'shost': 'C2436', 'time': 18},
 {'id': 5, 'resolvedhost': 'C2109', 'shost': 'C161', 'time': 31}]

In [17]:
# Ask for a few of the columns by column name (each column name is prefixed with @)
dns.select('@resolvedhost,@shost',limit=6).list()

[{'resolvedhost': 'C5030', 'shost': 'C4653'},
 {'resolvedhost': 'C16712', 'shost': 'C5782'},
 {'resolvedhost': 'C419', 'shost': 'C1191'},
 {'resolvedhost': 'C22841', 'shost': 'C3380'},
 {'resolvedhost': 'C5030', 'shost': 'C2436'},
 {'resolvedhost': 'C2109', 'shost': 'C161'}]

In [18]:
# Ask for columns by dimension (comma-separated dimension names)
dns.select('hostname,time',limit=6).list()

[{'resolvedhost': 'C5030', 'shost': 'C4653', 'time': 2},
 {'resolvedhost': 'C16712', 'shost': 'C5782', 'time': 2},
 {'resolvedhost': 'C419', 'shost': 'C1191', 'time': 6},
 {'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 15},
 {'resolvedhost': 'C5030', 'shost': 'C2436', 'time': 18},
 {'resolvedhost': 'C2109', 'shost': 'C161', 'time': 31}]

In [19]:
# Ask for columns by tag (each tag selector ends with a colon)
dns.select('src:,dst:',limit=6).list()

[{'resolvedhost': 'C5030', 'shost': 'C4653'},
 {'resolvedhost': 'C16712', 'shost': 'C5782'},
 {'resolvedhost': 'C419', 'shost': 'C1191'},
 {'resolvedhost': 'C22841', 'shost': 'C3380'},
 {'resolvedhost': 'C5030', 'shost': 'C2436'},
 {'resolvedhost': 'C2109', 'shost': 'C161'}]

In [20]:
# Ask for columns by tagged dimension (tag:dimension)
dns.select('src:hostname',limit=6).list()

[{'shost': 'C4653'},
 {'shost': 'C5782'},
 {'shost': 'C1191'},
 {'shost': 'C3380'},
 {'shost': 'C2436'},
 {'shost': 'C161'}]

## Filter the datastream to return subsets

In [21]:
# Ask for the records where a dimension has a particular value
dns.select(limit=6).where('hostname=="C3380"').list()

[{'id': 3, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 15},
 {'id': 7, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 38},
 {'id': 30, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 76},
 {'id': 38, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 100},
 {'id': 47, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 137},
 {'id': 60, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 162}]

In [22]:
# Ask for the records where a tagged data item has a particular value
dns.select(limit=6).where('src:=="C3380"').list()

[{'id': 3, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 15},
 {'id': 7, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 38},
 {'id': 30, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 76},
 {'id': 38, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 100},
 {'id': 47, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 137},
 {'id': 60, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 162}]

In [23]:
# Ask for the records where a data item with a particular tagged dimension has a particular value
dns.select(limit=6).where('src:hostname=="C3380"').list()

[{'id': 3, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 15},
 {'id': 7, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 38},
 {'id': 30, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 76},
 {'id': 38, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 100},
 {'id': 47, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 137},
 {'id': 60, 'resolvedhost': 'C22841', 'shost': 'C3380', 'time': 162}]

In [24]:
# TODO: Ask for a match to any of a list of systems of interest -- does not work yet
#list(dns.select(limit=6).where('src:hostname==["C3380","C161"]').run())