In [4]:
import collections
import json
import codecs
from pprint import pprint

In [37]:
def nested_dict_iter(nested):
    for key, value in nested.items():
        if isinstance(value, collections.Mapping):
            for inner_key, inner_value in nested_dict_iter(value):
                yield inner_key, inner_value
        else:
            yield key, value

def printSendersAndReceivers(s,r):
    print("Senders:")
    for e in s: print(e)
    print()
    print("Receivers:")
    for e in r: print(e)
        
def getAllSenderAndReceiverEntities(jsonobj):
    senders,receivers = set(),set()
    for key,value in nested_dict_iter(jsonobj):
        if key == 'cf:sender':
            senders.add(value)
        elif key == 'cf:receiver':
            receivers.add(value)
    return senders,receivers

def getAllValuesForKey(k,nested):
    for key, value in nested.items():
        if k == key:
            yield value
        if isinstance(value, collections.Mapping):
            for inner_key, inner_value in nested_dict_iter(value):
                if inner_key == k:
                    yield inner_value
                
def getAllKeys(nested):
    for key, value in nested.items():
        if isinstance(value, collections.Mapping):
            yield key
            for inner_key, inner_value in nested_dict_iter(value):
                yield inner_key
        else:
            yield key
            
def getDictImmediatelyEnclosingKeyValue(key1,value1,nested):
    for key, value in nested.items():
        if value1 == value and key1 == key:
            yield "<main dict>"
        if isinstance(value, collections.Mapping):
            for inner_key, inner_value in nested_dict_iter(value):
                if value1 == inner_value and key1 == inner_key:
                    yield {prev_key:prev_val}
                prev_key,prev_val = inner_key,inner_value

# CamFlow v0.2.0 - The Thomas example where containers are seen in the graph

In [6]:
d1 = json.load(codecs.open('logs/thomas-container-diff.json', 'r', 'utf-8-sig'))

In [7]:
s,r = getAllSenderAndReceiverEntities(d1)
printSendersAndReceivers(s,r)
#Note: Both sets are the same below.
# The servers running seem to be 
# Proxy http://10.5.0.3:8081/
# Server: httpd = SocketServer.ForkingTCPServer(('10.5.0.2',PORT), Proxy)

Senders:
10.5.0.2:48032
10.5.0.1:38442
10.5.0.3:8081
10.5.0.2:48040
10.5.0.2:48036

Receivers:
10.5.0.2:48032
10.5.0.1:38442
10.5.0.3:8081
10.5.0.2:48040
10.5.0.2:48036


In [8]:
print("All keys in the json dict:")
pprint(set([v for v in getAllKeys(d1)]))

All keys in the json dict:
{'activity',
 'cf',
 'cf:allowed',
 'cf:boot_id',
 'cf:camflow',
 'cf:cid',
 'cf:date',
 'cf:gid',
 'cf:id',
 'cf:ino',
 'cf:jiffies',
 'cf:machine',
 'cf:machine_id',
 'cf:mode',
 'cf:nodename',
 'cf:offset',
 'cf:pathname',
 'cf:pid',
 'cf:receiver',
 'cf:release',
 'cf:secctx',
 'cf:sender',
 'cf:seq',
 'cf:sysname',
 'cf:uid',
 'cf:uuid',
 'cf:version',
 'cf:vpid',
 'entity',
 'host',
 'path',
 'prefix',
 'prov',
 'prov:activity',
 'prov:entity',
 'prov:generatedEntity',
 'prov:informant',
 'prov:informed',
 'prov:label',
 'prov:type',
 'prov:usedEntity',
 'serv',
 'type',
 'used',
 'wasDerivedFrom',
 'wasGeneratedBy',
 'wasInformedBy'}


## Let's look at what the standard says about the structure of this json:  
## [Link to PROV data model spec](https://www.w3.org/TR/prov-dm/ "PROV data model").  

## "At its core, provenance describes the:  
## *use and production* of **entities**  
## *by* **activities**,   
## which may be in various ways *influenced by* **agents**."   

## ![https://www.w3.org/TR/prov-dm/uml/essentials.png](prov-core-structure.png)

## PROV-N in particular looks helpful because "PROV-N (Recommendation), a notation for provenance aimed at human consumption": https://www.w3.org/TR/2013/REC-prov-n-20130430/

## Based on this document and eyeballing the json data, we perform some analysis:


In [9]:
# What are the prov:label and prov:type values in our execution:
print('prov:label values')
pprint(set([v for v in getAllValuesForKey('prov:label',d1)]))

prov:label values
{'[address] IPV4 10.5.0.3',
 '[address] UNIX /var/run/nscd/socket',
 '[fifo] 0',
 '[file] 0',
 '[file] 1',
 '[machine] 1022090165',
 '[machine] 1030075630',
 '[machine] 1048856292',
 '[machine] 1054465990',
 '[machine] 1100367967',
 '[machine] 114986689',
 '[machine] 1237468202',
 '[machine] 1268947775',
 '[machine] 1404656863',
 '[machine] 1480552698',
 '[machine] 1558345364',
 '[machine] 1577082000',
 '[machine] 1941367434',
 '[machine] 2007545844',
 '[machine] 289813543',
 '[machine] 340646718',
 '[machine] 776556187',
 '[machine] 81213873',
 '[machine] 868444599',
 '[machine] 972296071',
 '[machine] 973350807',
 '[mmaped_file] 0',
 '[packet] 10.5.0.1:38442->10.5.0.3:8081 (2059)',
 '[packet] 10.5.0.1:38442->10.5.0.3:8081 (2315)',
 '[packet] 10.5.0.2:48032->10.5.0.3:8081 (33877)',
 '[packet] 10.5.0.2:48032->10.5.0.3:8081 (34133)',
 '[packet] 10.5.0.2:48036->10.5.0.3:8081 (32973)',
 '[packet] 10.5.0.2:48036->10.5.0.3:8081 (33229)',
 '[packet] 10.5.0.2:48036->10.5.0.3

In [10]:
print('prov:type values')
pprint(set([v for v in getAllValuesForKey('prov:type',d1)]))

prov:type values
{'accept',
 'address',
 'clone',
 'connect',
 'create',
 'fifo',
 'file',
 'file_name',
 'getattr',
 'link',
 'mmap',
 'mmap_exec',
 'mmap_read',
 'mmap_write',
 'mmaped_file',
 'named_process',
 'open',
 'packet',
 'perm_read',
 'read',
 'receive',
 'receive_packet',
 'send',
 'send_packet',
 'socket',
 'task',
 'unknown',
 'version_activity',
 'version_entity',
 'write'}


## First hypothesis: in a communication between one container and the other, the entities are packets.

## Why is this helpful? If this is true, one way of reducing the graph is by reducing entities that correspond to packets

In [11]:
#Test the hypothesis that packets are entities:
#Hypothesis: some or all of the keys: (entity, prov:entity, type) can be indicative of whether a provenance
#  entry is about an entity
#Experiment: Check the output of all values that belong to these keys

In [13]:
pprint([v for v in getAllValuesForKey('entity',d1)])
# There's too much data here to be able to discern anything meaningful

[{'1022090165': {'cf:camflow': 'v0.2.0',
                 'cf:date': '2017:01:30T04:47:06',
                 'cf:machine': 'x86_64',
                 'cf:nodename': 'machine6',
                 'cf:release': '4.9.5',
                 'cf:sysname': 'Linux',
                 'cf:version': '#1 SMP Wed Jan 25 05:52:47 UTC 2017',
                 'prov:label': '[machine] 1022090165'},
  '1030075630': {'cf:camflow': 'v0.2.0',
                 'cf:date': '2017:01:25T01:39:32',
                 'cf:machine': 'x86_64',
                 'cf:nodename': 'machine2',
                 'cf:release': '4.9.5',
                 'cf:sysname': 'Linux',
                 'cf:version': '#1 SMP Wed Jan 25 01:33:54 UTC 2017',
                 'prov:label': '[machine] 1030075630'},
  '1048856292': {'cf:camflow': 'v0.2.1',
                 'cf:date': '2017:02:01T00:50:51',
                 'cf:machine': 'x86_64',
                 'cf:nodename': 'localhost.localdomain',
                 'cf:release': '4.9.5',
    

In [15]:
pprint(set([v for v in getAllValuesForKey('prov:entity',d1)]))
# Thomas said this is a combination (perhaps concatenation even) of machine_id, boot_id, 
#    and some other ids that are guaranteed to make it a unique id

{'AAAIAAAAACAXJwEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAAIAAAAACAsJwEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAAIAAAAACB2NQEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAAIAAAAACCONQEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAAIAAAAACDlNQEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAEAAAAAACAWJwEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAEAAAAAACAZJwEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAEAAAAAACAZJwEAAAAAAIdLGEIqRMJJAQAAAAAAAAA=',
 'AAEAAAAAACAbJwEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAEAAAAAACAdJwEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAEAAAAAACAdJwEAAAAAAIdLGEIqRMJJAQAAAAAAAAA=',
 'AAEAAAAAACAfJwEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAEAAAAAACAoJwEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAEAAAAAACAuJwEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAEAAAAAACAyJwEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAEAAAAAACB3NQEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAEAAAAAACB4NQEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAEAAAAAACB6NQEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'AAEAAAAAACCVHwEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'ABAAAAAAACBkIwEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=',
 'ACAAAAAAACAmJwEAAA

In [16]:
pprint(set([v for v in getAllValuesForKey('type',d1)]))

{'AF_UNIX', 'AF_INET'}


In [17]:
pprint(set([v for v in getAllValuesForKey('cf:ino',d1)]))
#Observations: cf:ino 
# 32421,32422,32423,32424,32409,31901,31902 correspond to sockets
# 31112 correspond to fifo

{0, 32421, 32422, 32423, 31112, 32424, 32409, 31901, 31902}


In [18]:
pprint(set([v for v in getAllValuesForKey('cf:gid',d1)]))

{0, 50}


In [22]:
pprint(set([v for v in getAllValuesForKey('cf:machine_id',d1)]))
pprint(set([v for v in getAllValuesForKey('cf:boot_id',d1)]))
# Nothing interesting here

{1237468202}
{1108888455}


In [21]:
pprint(set([v for v in getAllValuesForKey('cf:jiffies',d1)]))

{'4294765090',
 '4294765276',
 '4294765277',
 '4294765298',
 '4294765299',
 '4294765301',
 '4294765302',
 '4294765305',
 '4294765306',
 '4294765349',
 '4294765350',
 '4294765359',
 '4294765370',
 '4294765371',
 '4294765378',
 '4294948641',
 '4294948645',
 '4294954243',
 '4294954256',
 '4294954258',
 '4294954261',
 '4294954262',
 '4294954264',
 '4294954266',
 '4294954282',
 '4294954293',
 '4294957782',
 '4294957786',
 '4294957796',
 '4294957797',
 '4294957808',
 '4294958939',
 '4294958946'}


In [24]:
pprint(set([v for v in getAllValuesForKey('cf:secctx',d1)]))
#This seems key, and Thomas also mentioned this: " It handles SELinux security context, 
#  CGroup and similar things that are used to build "container" 
#  (that do not exist as such in the kernel). 
#  At the minima they are now recorded in the activity/entity data, 
#  and you can setup "filter" based on it. \ie record all provenance from CGroup X, 
#  where the CGroup correspond to your container, or similarly using the security context. "

{'system_u:object_r:svirt_sandbox_file_t:s0:c372,c604',
 'system_u:object_r:svirt_sandbox_file_t:s0:c593,c741',
 'system_u:object_r:unlabeled_t:s0',
 'system_u:system_r:docker_t:s0',
 'system_u:system_r:svirt_lxc_net_t:s0:c372,c604',
 'system_u:system_r:svirt_lxc_net_t:s0:c593,c741'}


In [38]:
dicts = getDictImmediatelyEnclosingKeyValue(key1='cf:secctx',value1='system_u:system_r:svirt_lxc_net_t:s0:c372,c604',nested=d1)
for item in dicts:
    if isinstance(item,dict):
        print(item)
# OK, so, task just might be a sister json dict, and not the parent of cf:secctx, 
#   so i need to tweak the function to get this right. But I can infer something using just good old 
#   search in my text file:
'''
"AQAAAAAAAEDDJQEAAAAAAIdLGEIqRMJJAAAAAAAAAAA=": {
			"cf:id": "75203",
			"prov:type": "task",
			"cf:boot_id": 1108888455,
			"cf:machine_id": 1237468202,
			"cf:version": 0,
			"cf:date": "2017:02:10T14:51:27",
			"cf:jiffies": "4294765090",
			"cf:uid": 0,
			"cf:gid": 0,
			"cf:pid": 2618,
			"cf:vpid": 1,
			"cf:cid": 4026531835,
			"cf:secctx": "system_u:system_r:svirt_lxc_net_t:s0:c372,c604",
			"prov:label": "[task] 0"
		},

"AQAAAAAAAEBmNQEAAAAAAIdLGEIqRMJJAQAAAAAAAAA=": {
			"cf:id": "79206",
			"prov:type": "task",
			"cf:boot_id": 1108888455,
			"cf:machine_id": 1237468202,
			"cf:version": 1,
			"cf:date": "2017:02:10T14:54:35",
			"cf:jiffies": "4294954282",
			"cf:uid": 0,
			"cf:gid": 0,
			"cf:pid": 2850,
			"cf:vpid": 7,
			"cf:cid": 4026531835,
			"cf:secctx": "system_u:system_r:svirt_lxc_net_t:s0:c372,c604",
			"prov:label": "[task] 1"
		},

'''

{'prov:type': 'task'}
{'prov:type': 'task'}
{'prov:type': 'task'}
{'prov:type': 'task'}
{'prov:type': 'task'}
{'prov:type': 'task'}
{'prov:type': 'task'}
{'prov:type': 'task'}
