In [289]:
from hashlib import sha256
from pathlib import Path
from typing import Any, Dict, List, NamedTuple, Optional, Union
import json
import pandas as pd

import sys
sys.path.append("../src")
from kql_query import KqlQuery
# interface
# get_query_ids() 
    # returns a DF of source_path, query_id, query_hash - the idea here is that you
    # (or someone) can check for existing queries based on path. I guess I could also
    # do that in the store - i.e. don't add a new one if the hash is the same, 
    # just overwrite with the new details. Hmm. Maybe you don't need to create a query_id. 
    # I could just do this checking in the data layer comparing source_path and 
    # source_index with existing values. LMK what you think.
#
# add_queries(queries: List[Dict[as described above]])
# add_kql_properties(query_id, properties: Dict[Liam's dict])
# get_filter_lists() - will return a dictionary of lists of unique values of various properties for the UI filtering
#     I could also return lists of unique query names and paths
# find_queries(**kwargs) - this is going to be an interesting one given that we have a flexible set of properties to search on.
#     kwargs lets us specify a flexible list of conditions, examples:
#         source_path="/some/path - exact string match (prob case insensitive)
#         query_name={matches: regex} - match based on a pandas operator like regex, startswith, contains
#         table=["table1", "table2"] - intersection of queries that use both these tables
#     it will return a DF of query_id + basic properties.
# get_query(query_id) - find_queries will return a list, to get all the props for a query, you'd need to call this.
# get_schema(table) 

QueryDict = Dict[str, Union[str, int, Dict[str, Any]]]
QueryList = List[QueryDict]

KqlQueryList = List[KqlQuery]


class QueryUpdate(NamedTuple):
    query: QueryDict
    action: str
    query_id: Optional[str] = None


class DataStore:

    _ATTRIB_INDEXES = {
        "tactics": list,
        "techniques": list
    }
    _KQL_INDEXES = {
        "tables": list,
        "operators": list,
        "fields": list,
        "functions": list,
        "tactics": list,
        "techniques": list,
    }

    _OPER = {
        "startswith": "^{expr}.*",
        "endswith": ".*{expr}$",
        "contains": ".*{expr}.*",
        "matches": "{expr}"
    }

    def __init__(
        self,
        kql_queries: Union[None, KqlQueryList, QueryList] = None,
        json_path: Optional[str] = None,

    ):
        self._json_path = json_path
        if json_path:
            self._data = {query.query_id: KqlQuery(**query) for query in self._read_json_data(json_path)}
        elif kql_queries:
            self._data = {query.query_id: query.asdict() for query in kql_queries}
        # self.attributes = self._extract_attributes()
        self._indexes = {}
        self._create_indexes("attributes")

    @property
    def _data(self):
        """Return internal data."""
        return self._internal_data

    @_data.setter
    def _data(self, value):
        """Set internal data to `value`."""
        self._internal_data = value
        self._data_df = pd.DataFrame(self.queries).set_index("query_id")

    @property
    def queries(self) -> List[KqlQuery]:
        """Get the list of current queries."""
        return [KqlQuery(**query) for query in self._data.values()]

    @property
    def queries_dict(self) -> List[KqlQuery]:
        """Get the list of current queries."""
        return list(self._data.values())

    def to_json(self, file_path: Optional[str] = None) -> Optional[str]:
        """Return the queries as JSON or save to `file_path`, if specified."""
        if file_path:
            Path(file_path).write_text(self.to_json())
        return json.dumps(self.queries_dict)

    def to_df(self) -> pd.DataFrame:
        """Return queries as a pandas DataFrame."""
        return pd.DataFrame(self.queries)

    def get_query_ids(self) -> pd.DataFrame:
        return self._data[["source_path", "query_name" "query_hash"]]

    def add_queries(self, queries: KqlQueryList):
        self._data.update({
            query.query_id: query.asdict() for query in queries
        })

    def add_query(self, query: KqlQuery):
        self._data[query.query_id] = query

    def add_kql_properties(self, query_id: str, kql_properties: Dict[str, Any]):
        self._data[query_id]["kql_properties"] = kql_properties

    
    def get_filter_lists(self, categories: Optional[List[str]] = None) -> Dict[str, List[str]]:
        """Return unique lists of values for each category."""
        return {
            attrib: sorted(ds._indexes[attrib].index.unique())
            for attrib in {**self._ATTRIB_INDEXES, **self._KQL_INDEXES}
            if attrib in self._indexes and (categories is None or attrib in categories)
        }

    def find_queries(self, case: bool = False, **kwargs) -> pd.DataFrame:
        """
        Return matching values as a pandas DataFrame.

        Parameters
        ----------
        case : bool, optional
            Use case-sensitive matching, by default False

        Other Parameters
        ----------------
        kwargs : 
            You can specify search criteria in the general form attrib_name=expression.
            You can specify multiple criteria - all will be ANDed together.
            attrib=value - exact match (case sensitive for strings)
            attrib={operator: value} - match based on a string operator (matches,
            contains, startswith, endswith)
            attrib=["value1", "value2"] - intersection of items that have
            matches for ALL items in the list.
        
        Returns
        -------
        pd.DataFrame
            DataFrame of matching queries
        
        Examples
        --------
        Some examples of expressions:

        - source_path="/some/path" - exact string match (case insensitive)
        - query_name={matches: "AAD.*"} - match based on a  operator like regex, startswith, contains
        - table=["table1", "table2"] - the queries that use both these tables
        
        """
        criteria = True
        for arg_name, arg_expr in kwargs.items():
            
            if isinstance(arg_expr, str):
                criteria &= (self._data_df[arg_name] == arg_expr)
            if isinstance(arg_expr, dict):
                oper, expr = next(iter(arg_expr.items()))
                crit_expr = self._OPER.get(oper)
                if crit_expr:
                    criteria &= (self._data_df[arg_name].str.match(crit_expr.format(expr=expr), case=case))
            if isinstance(arg_expr, list) and arg_name in self._indexes:
                query_ids = None
                # we're looking for queries in the indexes that have a matching value
                for match_value in arg_expr:
                    # matched_ids == all query_ids with this property
                    matched_ids = set(ds._indexes[arg_name][ds._indexes[arg_name].index == match_value]["query_id"].values)
                    # AND this with query_ids (unless None, then just use this as the 
                    # first criterion)
                    query_ids = matched_ids if query_ids is None else matched_ids & query_ids
                # Add the matched query IDs to criteria
                criteria &= (self._data_df.index.isin(query_ids))
        # return the data subset
        return self._data_df[criteria]

    @staticmethod
    def _read_json_data(json_path: str):
        return json.loads(Path(json_path).read_text(encoding="utf-8"))

    def _create_indexes(self, sub_key: str):
        """Create indexes for child items in queries."""
        # create DF with attributes expanded to columns

        exp_df = self._data_df[[sub_key]].apply(lambda x: pd.Series(x[sub_key]), result_type="expand", axis=1)
        for key, data_type in self._ATTRIB_INDEXES.items():
            if data_type == list:
                self._indexes[key] = self._create_list_index(
                    data=exp_df,
                    key_col=key,
                )
            if data_type == dict:
                self._indexes[key] = self._create_dict_index(
                    data=exp_df,
                    key_col=key,
                )


    @staticmethod
    def _create_list_index(data, key_col):
        return data[[key_col]].explode(key_col).reset_index().set_index([key_col])

    @staticmethod
    def _extract_dict_keys(row, col_name):
        if isinstance(row[col_name], dict):
            return {col_name : [inner_val for val in row[col_name].values() for inner_val in val.keys() if isinstance(val, dict) ]}
        return row

    def _create_dict_index(self, data, key_col):
        df_dict_keys = data[[key_col]].apply(lambda x: self._extract_dict_keys(x, key_col), result_type="expand", axis=1)
        return self._create_list_index(df_dict_keys, key_col)

k_queries = [KqlQuery(**q) for q in queries]
ds = DataStore(k_queries)

In [231]:
ds._data_df.to_dict(orient="index").values()

dict_values([{'source_path': '/github.com/foo/0', 'query': 'SecurityAlert\\n| Where foo == bar', 'source_type': 'text', 'source_index': 0, 'query_name': '0', 'attributes': {'description': 'Query one description', 'tactics': ['Exploitation', 'Compromise'], 'techniques': ['T1000', 'T1005'], 'test_dict': {'joins': {'inner': ['one', 'two'], 'outer': ['three', 'four']}}}, 'kql_properties': {}, 'query_hash': '7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7b0feb90f4e12c446a9', 'query_version': 0}, {'source_path': '/github.com/foo/1', 'query': 'SecurityAlert\\n| Where foo == bar', 'source_type': 'text', 'source_index': 1, 'query_name': '1', 'attributes': {'description': 'Query one description', 'tactics': ['Exploitation', 'Compromise'], 'techniques': ['T1001', 'T1015'], 'test_dict': {'joins': {'inner': ['one', 'two'], 'outer': ['three', 'four']}}}, 'kql_properties': {}, 'query_hash': '7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7b0feb90f4e12c446a9', 'query_version': 0}, {'source_path': '/githu

In [279]:
arg_name = "tactics"
arg_expr = ["Compromise", "Exploitation"]
criteria = True
# query_ids = ds._indexes[arg_name][ds._indexes[arg_name].index.isin(arg_expr)]
# criteria &= (ds._data_df.index.isin(query_ids["query_id"].values))
# print(criteria)
# query_ids
# query_ids["query_id"].values
# ds._data_df.columns

if isinstance(arg_expr, list) and arg_name in ds._indexes:
    query_ids = None
    attr_criteria = True
    for match_value in arg_expr:
        print(list(ds._indexes[arg_name][ds._indexes[arg_name].index == match_value]["query_id"].values))
        matched_ids = set(ds._indexes[arg_name][ds._indexes[arg_name].index == match_value]["query_id"].values)
        print("mids", matched_ids)
        query_ids = matched_ids if query_ids is None else matched_ids & query_ids
        print("mv", match_value, query_ids)
    # query_ids = ds._indexes[arg_name][attr_criteria]
    print("qids", query_ids)
    criteria &= (ds._data_df.index.isin(query_ids))

ds._data_df[criteria]
query_ids
ds._data_df[ds._data_df.index.isin(query_ids)]

['d04f5e98-8ada-420a-b095-a852e42089ad', 'ca9b169e-8dd0-40a2-be1e-b143d3cbd79a', '9917fa2c-fe65-4455-92ca-185a0f91a0dd', '8ce0a49e-3bdf-4cd9-9e14-15f5ff68aac2', '7a37d590-600c-4c0a-be7c-b236e4597eaa']
mids {'d04f5e98-8ada-420a-b095-a852e42089ad', '8ce0a49e-3bdf-4cd9-9e14-15f5ff68aac2', '9917fa2c-fe65-4455-92ca-185a0f91a0dd', 'ca9b169e-8dd0-40a2-be1e-b143d3cbd79a', '7a37d590-600c-4c0a-be7c-b236e4597eaa'}
mv Compromise {'d04f5e98-8ada-420a-b095-a852e42089ad', '8ce0a49e-3bdf-4cd9-9e14-15f5ff68aac2', '9917fa2c-fe65-4455-92ca-185a0f91a0dd', 'ca9b169e-8dd0-40a2-be1e-b143d3cbd79a', '7a37d590-600c-4c0a-be7c-b236e4597eaa'}
['d04f5e98-8ada-420a-b095-a852e42089ad', 'ca9b169e-8dd0-40a2-be1e-b143d3cbd79a', '9917fa2c-fe65-4455-92ca-185a0f91a0dd', '8ce0a49e-3bdf-4cd9-9e14-15f5ff68aac2', '7a37d590-600c-4c0a-be7c-b236e4597eaa']
mids {'d04f5e98-8ada-420a-b095-a852e42089ad', '8ce0a49e-3bdf-4cd9-9e14-15f5ff68aac2', '9917fa2c-fe65-4455-92ca-185a0f91a0dd', 'ca9b169e-8dd0-40a2-be1e-b143d3cbd79a', '7a37d590-6

Unnamed: 0_level_0,source_path,query,source_type,source_index,query_name,attributes,kql_properties,query_hash,query_version
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
d04f5e98-8ada-420a-b095-a852e42089ad,/github.com/foo/0,SecurityAlert\n| Where foo == bar,text,0,0,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0
ca9b169e-8dd0-40a2-be1e-b143d3cbd79a,/github.com/foo/1,SecurityAlert\n| Where foo == bar,text,1,1,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0
9917fa2c-fe65-4455-92ca-185a0f91a0dd,/github.com/foo/2,SecurityAlert\n| Where foo == bar,text,6,2,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0
8ce0a49e-3bdf-4cd9-9e14-15f5ff68aac2,/github.com/foo/3,SecurityAlert\n| Where foo == bar,text,0,3,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0
7a37d590-600c-4c0a-be7c-b236e4597eaa,/github.com/foo/4,SecurityAlert\n| Where foo == bar,text,2,4,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0


In [291]:
ds.find_queries(query={"endswith": "bar"}, tactics=["Compromise"])

crit query {'endswith': 'bar'} query_id
1f3e16a8-eff1-4b88-992e-d0b7a91f1b2b    True
9fce33a1-d298-43d3-b5d1-2333eca370eb    True
1db9c848-484d-48d2-82b5-82eef5bf00a1    True
f89052f1-5a65-4ec3-9095-cb4c0c870c8a    True
812accf9-b4ea-4828-93ad-c7062d44ce18    True
Name: query, dtype: bool
mids {'9fce33a1-d298-43d3-b5d1-2333eca370eb', '812accf9-b4ea-4828-93ad-c7062d44ce18', '1f3e16a8-eff1-4b88-992e-d0b7a91f1b2b', '1db9c848-484d-48d2-82b5-82eef5bf00a1'}
crit tactics ['Compromise'] query_id
1f3e16a8-eff1-4b88-992e-d0b7a91f1b2b     True
9fce33a1-d298-43d3-b5d1-2333eca370eb     True
1db9c848-484d-48d2-82b5-82eef5bf00a1     True
f89052f1-5a65-4ec3-9095-cb4c0c870c8a    False
812accf9-b4ea-4828-93ad-c7062d44ce18     True
Name: query, dtype: bool


Unnamed: 0_level_0,source_path,query,source_type,source_index,query_name,attributes,kql_properties,query_hash,query_version
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1f3e16a8-eff1-4b88-992e-d0b7a91f1b2b,/github.com/foo/0,SecurityAlert\n| Where foo == bar,text,4,0,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0
9fce33a1-d298-43d3-b5d1-2333eca370eb,/github.com/foo/1,SecurityAlert\n| Where foo == bar,text,5,1,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0
1db9c848-484d-48d2-82b5-82eef5bf00a1,/github.com/foo/2,SecurityAlert\n| Where foo == bar,text,3,2,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0
812accf9-b4ea-4828-93ad-c7062d44ce18,/github.com/foo/4,SecurityAlert\n| Where foo == bar,text,0,4,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0


In [212]:
ds._data_df.head()
# "8017527f-bdd6-44cb-8512-b8984cdc61b7" in ds._data.index

Unnamed: 0_level_0,source_path,query,source_type,source_index,query_name,attributes,kql_properties,query_hash,query_version
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
d04f5e98-8ada-420a-b095-a852e42089ad,/github.com/foo/0,SecurityAlert\n| Where foo == bar,text,0,0,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0
ca9b169e-8dd0-40a2-be1e-b143d3cbd79a,/github.com/foo/1,SecurityAlert\n| Where foo == bar,text,1,1,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0
9917fa2c-fe65-4455-92ca-185a0f91a0dd,/github.com/foo/2,SecurityAlert\n| Where foo == bar,text,6,2,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0
8ce0a49e-3bdf-4cd9-9e14-15f5ff68aac2,/github.com/foo/3,SecurityAlert\n| Where foo == bar,text,0,3,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0
7a37d590-600c-4c0a-be7c-b236e4597eaa,/github.com/foo/4,SecurityAlert\n| Where foo == bar,text,2,4,"{'description': 'Query one description', 'tact...",{},7125e1eaab3a1960eaae7f7aaa68f68b44eb6abf54cbf7...,0


In [40]:
attr_df = ds._data.apply(lambda x: pd.Series(x.attributes), axis=1)
tactics_df = attr_df[["tactics"]].explode("tactics").reset_index()
display(tactics_df)
tactics_df = tactics_df.set_index(["tactics"])
display(tactics_df)
# tactics_df.iloc[:,"Exploitation"]
tactics_df.loc["Compromise"]

Unnamed: 0,query_id,tactics
0,8017527f-bdd6-44cb-8512-b8984cdc61b7,Exploitation
1,8017527f-bdd6-44cb-8512-b8984cdc61b7,Compromise
2,707a7a6c-7f3f-4927-9975-00e0c4b77ef5,Exploitation
3,707a7a6c-7f3f-4927-9975-00e0c4b77ef5,Compromise
4,6860c851-0758-4b73-a460-a542b4017b98,Exploitation
5,6860c851-0758-4b73-a460-a542b4017b98,Compromise
6,62450409-d53b-4f06-8e38-69c024fb74d5,Exploitation
7,62450409-d53b-4f06-8e38-69c024fb74d5,Compromise
8,b3812073-0cd4-47bc-b0c6-d1b3fc4f1c1e,Exploitation
9,b3812073-0cd4-47bc-b0c6-d1b3fc4f1c1e,Compromise


Unnamed: 0_level_0,query_id
tactics,Unnamed: 1_level_1
Exploitation,8017527f-bdd6-44cb-8512-b8984cdc61b7
Compromise,8017527f-bdd6-44cb-8512-b8984cdc61b7
Exploitation,707a7a6c-7f3f-4927-9975-00e0c4b77ef5
Compromise,707a7a6c-7f3f-4927-9975-00e0c4b77ef5
Exploitation,6860c851-0758-4b73-a460-a542b4017b98
Compromise,6860c851-0758-4b73-a460-a542b4017b98
Exploitation,62450409-d53b-4f06-8e38-69c024fb74d5
Compromise,62450409-d53b-4f06-8e38-69c024fb74d5
Exploitation,b3812073-0cd4-47bc-b0c6-d1b3fc4f1c1e
Compromise,b3812073-0cd4-47bc-b0c6-d1b3fc4f1c1e


Unnamed: 0_level_0,query_id
tactics,Unnamed: 1_level_1
Compromise,8017527f-bdd6-44cb-8512-b8984cdc61b7
Compromise,707a7a6c-7f3f-4927-9975-00e0c4b77ef5
Compromise,6860c851-0758-4b73-a460-a542b4017b98
Compromise,62450409-d53b-4f06-8e38-69c024fb74d5
Compromise,b3812073-0cd4-47bc-b0c6-d1b3fc4f1c1e


## Test data generation

```
source_path: github_path,          # we can prob make up a path for API-sourced queries
source_type: text, markdown, api...,
source_index: 0,                           # if there are multiple queries in a file
name: query_name,                     # either filename, filename_index, or a name from metadata
query: query_text,
context: text,                               # e.g. text from markdown
attributes: DICT {}
```

In [8]:
json_query_data = """
{
    "query_id": "1234291720927310",
    "source_path": "/github.com/foo",
    "source_type": "text",
    "source_index": 0,
    "name": "query_1",
    "query": "SecurityAlert\\n| Where foo == bar",
    "context": "text from markdown",
    "attributes": {
        "description": "Query one description",
        "tactics": ["Exploitation", "Compromise"],
        "techniques": ["T.1055", "T.1345"]
    }
}
"""
template_dict = json.loads(json_query_data)
print(template_dict["query"])

json_kql_parse = """
{
    "FunctionCalls":["count","tostring","make_list","toreal"],
    "Joins":["rightsemi","leftouter"],
    "Operators":["where","extend","summarize","mv-expand","project-away","project"],
    "Tables":["SigninLogs"]
}
"""

json.loads(json_query_data)

SecurityAlert
| Where foo == bar


In [288]:
table_names = ['AADB2CRequestLogs', 'AADDomainServicesAccountLogon',
       'AADDomainServicesAccountManagement',
       'AADDomainServicesDirectoryServiceAccess',
       'AADDomainServicesLogonLogoff', 'AADDomainServicesPolicyChange',
       'AADDomainServicesPrivilegeUse', 'AADManagedIdentitySignInLogs',
       'AADNonInteractiveUserSignInLogs', 'AADProvisioningLogs',
       'AADRiskyServicePrincipals', 'AADRiskyUsers',
       'AADServicePrincipalRiskEvents', 'AADServicePrincipalSignInLogs',
       'AADUserRiskEvents', 'ADFSSignInLogs', 'AlertEvidence',
       'Anomalies', 'AppServiceIPSecAuditLogs',
       'AppServiceServerlessSecurityPluginData', 'ASimDnsActivityLogs',
       'AuditLogs', 'AWSCloudTrail', 'AWSGuardDuty', 'AWSVPCFlow',
       'AZFWApplicationRule', 'AZFWApplicationRuleAggregation',
       'AZFWDnsQuery', 'AZFWIdpsSignature',
       'AZFWInternalFqdnResolutionFailure', 'AZFWNatRule',
       'AZFWNatRuleAggregation', 'AZFWNetworkRule',
       'AZFWNetworkRuleAggregation', 'AZFWThreatIntel', 'AzureActivity',
       'AzureDiagnostics', 'BehaviorAnalytics', 'CloudAppEvents',
       'CommonSecurityLog', 'ConfidentialWatchlist', 'DeviceEvents',
       'DeviceFileCertificateInfo', 'DeviceFileEvents',
       'DeviceImageLoadEvents', 'DeviceInfo', 'DeviceLogonEvents',
       'DeviceNetworkEvents', 'DeviceNetworkInfo', 'DeviceProcessEvents',
       'DeviceRegistryEvents', 'DeviceTvmSecureConfigurationAssessment',
       'DeviceTvmSoftwareInventory', 'DeviceTvmSoftwareVulnerabilities',
       'DSMAzureBlobStorageLogs', 'DSMDataClassificationLogs',
       'DSMDataLabelingLogs', 'DynamicEventCollection',
       'EmailAttachmentInfo', 'EmailEvents', 'EmailPostDeliveryEvents',
       'EmailUrlInfo', 'GCPAuditLogs', 'HDInsightSecurityLogs',
       'HuntingBookmark', 'IdentityDirectoryEvents',
       'IdentityLogonEvents', 'IdentityQueryEvents', 'LinuxAuditLog',
       'McasShadowItReporting', 'NetworkAccessTraffic', 'NetworkSessions',
       'NSPAccessLogs', 'OfficeActivity', 'PowerBIActivity',
       'ProjectActivity', 'ProtectionStatus',
       'PurviewDataSensitivityLogs', 'SecurityAlert', 'SecurityBaseline',
       'SecurityBaselineSummary', 'SecurityDetection', 'SecurityEvent',
       'SecurityIoTRawEvent', 'SecurityRecommendation', 'SentinelAudit',
       'SentinelHealth', 'SigninLogs', 'Syslog',
       'ThreatIntelligenceIndicator', 'Update', 'UrlClickEvents',
       'UserAccessAnalytics', 'UserPeerAnalytics', 'Watchlist',
       'WindowsEvent', 'WindowsFirewall', 'WireData']

field_names = ['SourceType', 'DomainBehaviorVersion', 'OperationName',
       'BookmarkName', 'SentinelResourceId', 'OSName', 'ActualResult',
       'CreatedBy', 'CreatedDateTime', 'LatencySamplingTimeStamp',
       'Environment', 'CorrelationId', 'MachineGroup',
       'SumResponseBodySize', 'RecordId', 'DstUserUpn', 'ResourceId',
       'InitiatingProcessSHA1', 'ObjectId', 'AssetType', 'Title',
       'InitiatingProcessAccountDomain', 'AuthorizationInfo',
       'TargetContextId', 'LogonId', 'CveTags', 'SourceComputerId',
       'ResourceIdentity', 'ClusterName', 'TdoAttributes',
       'EntityMapping', 'DnssecOkBit', 'DeviceCustomString5',
       'TransmittedServices', 'DeviceCustomDate2Label']


import random
import uuid

def get_random_items(data=table_names, count=3):
    return list({
        random.choice(data)
        for _ in range(count)
    })

def get_random_query(index=0):
    tactic_idx = index % 7
    return {
        "query_id": str(uuid.uuid4()),
        "source_path": f"/github.com/foo/{index}",
        "source_type": "text",
        "source_index": random.randint(0, 7),
        "query_name": f"query_{index}",
        "query": "SecurityAlert\\n| Where foo == bar",
        # "context": "text from markdown",
        "attributes": {
            "description": "Query one description",
            "tactics": get_random_items(data=["Exploitation", "Compromise", "LateralMovement"], count=2),
            "techniques": [f"T10{tactic_idx:0>2d}", f"T1{tactic_idx:0>2d}5"],
            "test_dict": {"joins": {"inner": ["one", "two"], "outer": ["three", "four"]}}
        }
    }

queries = [get_random_query(i) for i in range(5)]
json.dumps(queries)

'[{"query_id": "1f3e16a8-eff1-4b88-992e-d0b7a91f1b2b", "source_path": "/github.com/foo/0", "source_type": "text", "source_index": 4, "query_name": "query_0", "query": "SecurityAlert\\\\n| Where foo == bar", "attributes": {"description": "Query one description", "tactics": ["Exploitation", "Compromise"], "techniques": ["T1000", "T1005"], "test_dict": {"joins": {"inner": ["one", "two"], "outer": ["three", "four"]}}}}, {"query_id": "9fce33a1-d298-43d3-b5d1-2333eca370eb", "source_path": "/github.com/foo/1", "source_type": "text", "source_index": 5, "query_name": "query_1", "query": "SecurityAlert\\\\n| Where foo == bar", "attributes": {"description": "Query one description", "tactics": ["LateralMovement", "Compromise"], "techniques": ["T1001", "T1015"], "test_dict": {"joins": {"inner": ["one", "two"], "outer": ["three", "four"]}}}}, {"query_id": "1db9c848-484d-48d2-82b5-82eef5bf00a1", "source_path": "/github.com/foo/2", "source_type": "text", "source_index": 3, "query_name": "query_2", "qu

In [142]:
k_queries = [KqlQuery(**q) for q in queries]
kq_df = pd.DataFrame([k.asdict() for k in k_queries]).set_index("query_id")

def _create_list_index(data, key_col):
    return data[[key_col]].explode(key_col).reset_index().set_index([key_col])

def _create_dict_index(data, key_col):
    df_dict_keys = data[[key_col]].apply(lambda x: _extract_dict_keys(x, key_col), result_type="expand", axis=1)
    return _create_list_index(df_dict_keys, key_col)

# _create_dict_index(kq_df, "attributes")

data = kq_df
key_col = "attributes"
exp_data = data[[key_col]].apply(lambda x: pd.Series(x[key_col]), result_type="expand", axis=1)
# _create_dict_index(exp_data, "test_dict")
def _extract_dict_keys(row, col_name):
    print(type(row))
    if isinstance(row[col_name], dict):
        print("dict", row[col_name].values())
        return {col_name : [inner_val for val in row[col_name].values() for inner_val in val.keys() if isinstance(val, dict) ]}
    return row

out_df = exp_data[["test_dict"]].apply(lambda x: _extract_dict_keys(x, "test_dict"), result_type="expand", axis=1)
type(out_df)
out_df
_create_dict_index(exp_data, "test_dict")

<class 'pandas.core.series.Series'>
dict dict_values([{'inner': ['one', 'two'], 'outer': ['three', 'four']}])
<class 'pandas.core.series.Series'>
dict dict_values([{'inner': ['one', 'two'], 'outer': ['three', 'four']}])
<class 'pandas.core.series.Series'>
dict dict_values([{'inner': ['one', 'two'], 'outer': ['three', 'four']}])
<class 'pandas.core.series.Series'>
dict dict_values([{'inner': ['one', 'two'], 'outer': ['three', 'four']}])
<class 'pandas.core.series.Series'>
dict dict_values([{'inner': ['one', 'two'], 'outer': ['three', 'four']}])
<class 'pandas.core.series.Series'>
dict dict_values([{'inner': ['one', 'two'], 'outer': ['three', 'four']}])
<class 'pandas.core.series.Series'>
dict dict_values([{'inner': ['one', 'two'], 'outer': ['three', 'four']}])
<class 'pandas.core.series.Series'>
dict dict_values([{'inner': ['one', 'two'], 'outer': ['three', 'four']}])
<class 'pandas.core.series.Series'>
dict dict_values([{'inner': ['one', 'two'], 'outer': ['three', 'four']}])
<class 'pa

Unnamed: 0_level_0,query_id
test_dict,Unnamed: 1_level_1
inner,d04f5e98-8ada-420a-b095-a852e42089ad
outer,d04f5e98-8ada-420a-b095-a852e42089ad
inner,ca9b169e-8dd0-40a2-be1e-b143d3cbd79a
outer,ca9b169e-8dd0-40a2-be1e-b143d3cbd79a
inner,9917fa2c-fe65-4455-92ca-185a0f91a0dd
outer,9917fa2c-fe65-4455-92ca-185a0f91a0dd
inner,8ce0a49e-3bdf-4cd9-9e14-15f5ff68aac2
outer,8ce0a49e-3bdf-4cd9-9e14-15f5ff68aac2
inner,7a37d590-600c-4c0a-be7c-b236e4597eaa
outer,7a37d590-600c-4c0a-be7c-b236e4597eaa


In [131]:
exp_data

Unnamed: 0_level_0,description,tactics,techniques,test_dict
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d04f5e98-8ada-420a-b095-a852e42089ad,Query one description,"[Exploitation, Compromise]","[T1000, T1005]","{'joins': {'inner': ['one', 'two'], 'outer': [..."
ca9b169e-8dd0-40a2-be1e-b143d3cbd79a,Query one description,"[Exploitation, Compromise]","[T1001, T1015]","{'joins': {'inner': ['one', 'two'], 'outer': [..."
9917fa2c-fe65-4455-92ca-185a0f91a0dd,Query one description,"[Exploitation, Compromise]","[T1002, T1025]","{'joins': {'inner': ['one', 'two'], 'outer': [..."
8ce0a49e-3bdf-4cd9-9e14-15f5ff68aac2,Query one description,"[Exploitation, Compromise]","[T1003, T1035]","{'joins': {'inner': ['one', 'two'], 'outer': [..."
7a37d590-600c-4c0a-be7c-b236e4597eaa,Query one description,"[Exploitation, Compromise]","[T1004, T1045]","{'joins': {'inner': ['one', 'two'], 'outer': [..."


In [75]:
import hashlib
import json
from dataclasses import dataclass, field, asdict
from typing import Any, Dict,Literal


_SOURCE_TYPES = ["text", "markdown", "sentinel_yaml", "api", "other"]

SourceType = Literal["text", "markdown", "sentinel_yaml", "api", "other"]


def _uuid_str():
    return str(uuid.uuid4())


@dataclass
class KqlQuery:
    """
    Data format for KqlQuery record.
    
    Attributes
    ----------
    source_path : str
        The path to the original file or API identifier.
    query : str
        The raw query string
    source_type : SourceType, optional
        String - the source file/data type. Valid types are:
        text, markdown, sentinel_yaml, api, other
    source_index : int, optional
        The index (0-based) if the query is one of several in the
        file pointed to by source_path. The default is 0.
    query_name : Optional[str]
        The name of the query. If None this will be derived from
        the last element of source_path
    attributes: Dict[str, Any], optional
        Dictionary of any metadata attributes read from the source
        file.
    kql_properties: Dict[str, Any], optional
        Dictionary of properties derived from the KQL query
    query_id: Optional[str], optional
        UUID used to identify the query
    query_hash: int, optional
        Hash of the query text
    query_version: int, optional
        Query version, not currently used. Default is 0

    Examples
    --------
    Create a KqlQuery instance
    >>>> kql = KqlQuery(
    ...     source_path="https://github.com/a/b/file.kql",
    ...     query="SecurityAlert | take 1"
    ... )

    Create a KqlQuery instance from a dict
    >>>> attribs = {
    ...     "source_path": "https://github.com/a/b/file.kql",
    ...     "query": "SecurityAlert | take 1",
    ... }
    ... kql = KqlQuery(**attribs)

    Different default representation
    >>>> kql
    KqlQuery(source_path='https://github.com/a/b/file.kql', query='SecurityAlert... query_version=0)

    As a dict
    >>>> print(kql.asdict())
    {'source_path': 'https://github.com/a/b/file.kql', 'query': 'SecurityAlert... 'query_version': 0}

    As JSON
    print(kql.to_json())
    {"source_path": "https://github.com/a/b/file.kql", "query": "SecurityAlert... "query_version": 0}

    Class method to convert a list of KqlQuery instances to a list of dicts
    >>>> KqlQuery.kql_list_to_pylist([kql, kql])

    Class method to convert a list of KqlQuery instances to JSON
    >>>> KqlQuery.kql_list_to_json([kql, kql])
    '[{"source_path": "https://github.com/a/b/file.kql", "query": "SecurityAlert... "query_version": 0}]'

    Class method to convert list of KqlQuery instances to a DataFrame
    """

    source_path: str
    query: str
    source_type: SourceType = "text"
    source_index: int = 0
    query_name: Optional[str] = None
    attributes: Dict[str, Any] = field(default_factory=dict)
    kql_properties: Dict[str, Any] = field(default_factory=dict)
    query_id: str = field(default_factory=_uuid_str)
    query_hash: int = 0
    query_version: int = 0

    def __post_init__(self):
        if self.source_path is not None:
            self.query_name = self.source_path.split("/")[-1]
        if self.query:
            self.query_hash = hashlib.sha256(bytes(self.query, encoding="utf-8"), usedforsecurity=False).hexdigest()

    def asdict(self):
        return asdict(self)

    def to_json(self):
        return json.dumps(self.asdict())

    # helper methods and properties
    @property
    def source_types(self):
        del self
        return _SOURCE_TYPES

    @staticmethod
    def kql_list_to_pylist(kql_queries: List[KqlQuery]):
        """Return a list of Python dicts from a list of KqlQuery instances."""
        return [
            kql.asdict() for kql in kql_queries
        ]

    @classmethod
    def kql_list_to_json(cls, kql_queries: List[KqlQuery]):
        """Return JSON from a list of KqlQuery instances."""
        return json.dumps(cls.kql_list_to_pylist(kql_queries))

    @classmethod
    def kql_list_to_df(cls, kql_queries: List[KqlQuery]):
        """Return JSON from a list of KqlQuery instances."""
        return pd.DataFrame(cls.kql_list_to_pylist(kql_queries))


In [85]:
# Example usage
import sys
sys.path.append("../src")
from kql_query import KqlQuery

kql = KqlQuery(
    source_path="https://github.com/a/b/file.kql",
    query="SecurityAlert | take 1",
    query_name="foo",
    attributes={"foo": "bar"},
    kql_properties={"operators": []}
)
print(kql)
print(kql.asdict())
print(kql.to_json())

KqlQuery.kql_list_to_pylist([kql, kql])

KqlQuery.kql_list_to_json([kql, kql])

KqlQuery.kql_list_to_df([kql, kql])

KqlQuery(source_path='https://github.com/a/b/file.kql', query='SecurityAlert | take 1', source_type='text', source_index=0, query_name='file.kql', attributes={'foo': 'bar'}, kql_properties={'operators': []}, query_id='be72ead5-04d8-4d3f-a0cb-a28c3f6b73c2', query_hash='c86ed067ad3516435ca23a7a963909069f7702c1038a5833dcc4363977f0f6b2', query_version=0)
{'source_path': 'https://github.com/a/b/file.kql', 'query': 'SecurityAlert | take 1', 'source_type': 'text', 'source_index': 0, 'query_name': 'file.kql', 'attributes': {'foo': 'bar'}, 'kql_properties': {'operators': []}, 'query_id': 'be72ead5-04d8-4d3f-a0cb-a28c3f6b73c2', 'query_hash': 'c86ed067ad3516435ca23a7a963909069f7702c1038a5833dcc4363977f0f6b2', 'query_version': 0}
{"source_path": "https://github.com/a/b/file.kql", "query": "SecurityAlert | take 1", "source_type": "text", "source_index": 0, "query_name": "file.kql", "attributes": {"foo": "bar"}, "kql_properties": {"operators": []}, "query_id": "be72ead5-04d8-4d3f-a0cb-a28c3f6b73c2",

Unnamed: 0,source_path,query,source_type,source_index,query_name,attributes,kql_properties,query_id,query_hash,query_version
0,https://github.com/a/b/file.kql,SecurityAlert | take 1,text,0,file.kql,{'foo': 'bar'},{'operators': []},be72ead5-04d8-4d3f-a0cb-a28c3f6b73c2,c86ed067ad3516435ca23a7a963909069f7702c1038a58...,0
1,https://github.com/a/b/file.kql,SecurityAlert | take 1,text,0,file.kql,{'foo': 'bar'},{'operators': []},be72ead5-04d8-4d3f-a0cb-a28c3f6b73c2,c86ed067ad3516435ca23a7a963909069f7702c1038a58...,0


In [91]:
df = KqlQuery.kql_list_to_df([kql, kql])
queries_list = []
for idx, x in df.iterrows():
    kql_item = KqlQuery(
        source_path=x.source_path,
        query=x.query,
        attributes={
            "tactics": x.attributes.get("tactics")
        }
    )
    queries_list.append(kql_item)

queries_list

[KqlQuery(source_path='https://github.com/a/b/file.kql', query='SecurityAlert | take 1', source_type='text', source_index=0, query_name='file.kql', attributes={'tactics': None}, kql_properties={}, query_id='7711a844-0335-4260-a95a-b29dee555b87', query_hash='c86ed067ad3516435ca23a7a963909069f7702c1038a5833dcc4363977f0f6b2', query_version=0),
 KqlQuery(source_path='https://github.com/a/b/file.kql', query='SecurityAlert | take 1', source_type='text', source_index=0, query_name='file.kql', attributes={'tactics': None}, kql_properties={}, query_id='86a76381-4c02-4a11-8ccf-0b3cb4854521', query_hash='c86ed067ad3516435ca23a7a963909069f7702c1038a5833dcc4363977f0f6b2', query_version=0)]

In [64]:
help(KqlQuery)

Help on class KqlQuery in module __main__:

class KqlQuery(builtins.object)
 |  KqlQuery(source_path: str, query: str, source_type: Literal['text', 'markdown', 'sentinel_yaml', 'api', 'other'] = 'text', source_index: int = 0, query_name: Optional[str] = None, attributes: Dict[str, Any] = <factory>, kql_properties: Dict[str, Any] = <factory>, query_id: Optional[str] = None, query_hash: int = 0, query_version: int = 0) -> None
 |  
 |  Data format for KqlQuery record.
 |  
 |  Attributes
 |  ----------
 |  source_path : str
 |      The path to the original file or API identifier.
 |  query : str
 |      The raw query string
 |  source_type : SourceType, optional
 |      String - the source file/data type. Valid types are:
 |      text, markdown, sentinel_yaml, api, other
 |  source_index : int, optional
 |      The index (0-based) if the query is one of several in the
 |      file pointed to by source_path. The default is 0.
 |  query_name : Optional[str]
 |      The name of the query. I