# Regex definition and storage

## Entity - Regexes
- Account - SID, EMAIL, NTACCT, GUID
- Host - DNS
- IP address - IPV4, IPV6
- URL - URL
- Azure resource - RESOURCEID
- Registry key - REGKEY
- Domain name (DNS) - DNS
- File - LXPATH, WINPATH
- File hash - MD5, SHA1, SHA256

In [23]:
import json
import re


# re.compile
# pathlib

def write_regexes(data, path="./", fileName="regexes"):
    filePathNameWExt = "./" + path + "/" + fileName + ".json"
    with open(filePathNameWExt, "w") as fp:
        json.dump(data, fp)


data = {
    "DNS_REGEX": {
        "regex": r"^((?=[a-z0-9-]{1,63}\.)[a-z0-9]+(-[a-z0-9]+)*\.){1,126}[a-z]{2,63}$",
        "priority": "1",
        "entity": "host",
    },
    "IPV4_REGEX": {
        "regex": r"^(?P<ipaddress>(?:[0-9]{1,3}\.){3}[0-9]{1,3})$",
        "priority": "0",
        "entity": "ipaddress",
    },
    "IPV6_REGEX": {
        "regex": r"^(?<![:.\w])(?:[A-F0-9]{0,4}:){2,7}[A-F0-9]{0,4}(?![:.\w])$",
        "priority": "0",
        "entity": "ipaddress",
    },
    "URL_REGEX": {
        "regex": r"""
            ^
            (?P<protocol>(https?|ftp|telnet|ldap|file)://)
            (?P<userinfo>([a-z0-9-._~!$&\'()*+,;=:]|%[0-9A-F]{2})*@)?
            (?P<host>([a-z0-9-._~!$&\'()*+,;=]|%[0-9A-F]{2})*)
            (:(?P<port>\d*))?
            (/(?P<path>([^?\#"<>\s]|%[0-9A-F]{2})*/?))?
            (\?(?P<query>([a-z0-9-._~!$&'()*+,;=:/?@]|%[0-9A-F]{2})*))?
            (\#(?P<fragment>([a-z0-9-._~!$&'()*+,;=:/?@]|%[0-9A-F]{2})*))?
            $
            """,
        "priority": "0",
        "entity": "url",
    },
    "MD5_REGEX": {
        "regex": r"^(?:^|[^A-Fa-f0-9])(?P<hash>[A-Fa-f0-9]{32})(?:$|[^A-Fa-f0-9])$",
        "priority": "1",
        "entity": "hash",
    },
    "SHA1_REGEX": {
        "regex": r"^(?:^|[^A-Fa-f0-9])(?P<hash>[A-Fa-f0-9]{40})(?:$|[^A-Fa-f0-9])$",
        "priority": "1",
        "entity": "hash",
    },
    "SHA256_REGEX": {
        "regex": r"^(?:^|[^A-Fa-f0-9])(?P<hash>[A-Fa-f0-9]{64})(?:$|[^A-Fa-f0-9])$",
        "priority": "1",
        "entity": "hash",
    },
    "LXPATH_REGEX": {
        "regex": r"""
            ^(?P<root>/+||[.]+)
            (?P<folder>/(?:[^\\/:*?<>|\r\n]+/)*)
            (?P<file>[^/\0<>|\r\n ]+)$
            """,
        "priority": "2",
        "entity": "file",
    },
    "WINPATH_REGEX": {
        "regex": r"""
            ^(?P<root>[a-z]:|\\\\[a-z0-9_.$-]+||[.]+)
            (?P<folder>\\(?:[^\\/:*?"'<>|\r\n]+\\)*)
            (?P<file>[^\\/*?""<>|\r\n ]+)$
            """,
        "priority": "1",
        "entity": "file",
    },
    "WINPROCESS_REGEX": {
        "regex": r"""
            ^(?P<root>[a-z]:|\\\\[a-z0-9_.$-]+||[.]+)?
            (?P<folder>\\(?:[^\\/:*?"'<>|\r\n]+\\)*)?
            (?P<file>[^\\/*?""<>|\r\n ]+\.exe)$
        """,
        "priority": "1",
        "entity": "process",
    },
    'EMAIL_REGEX': {
        'regex': r"^[\w\d._%+-]+@(?:[\w\d-]+\.)+[\w]{2,}$", 
        'priority': '0', 
        'entity': 'account'
    },
    'RESOURCEID_REGEX': {
        'regex': r"(\/[a-z]+\/)[a-z0-9]{8}(-[a-z0-9]{4}){3}-[a-z0-9]{12}(\/[a-z]+\/).*", 
        'priority': '0', 
        'entity': 'azureresource'
    },
    'NTACCT_REGEX': {
        'regex': r"^([^\/:*?\"<>|]){2,15}\\[^\/:*?\"<>|]{2,15}$", 
        'priority': '0', 
        'entity': 'account'
    },
    'SID_REGEX': {
        'regex': r"^S-[\d]+(-[\d]+)+$", 
        'priority': '1', 
        'entity': 'account'
    },
    'REGKEY_REGEX': {
        'regex': r"""("|'|\s)?(?P<hive>HKLM|HKCU|HKCR|HKU|HKEY_(LOCAL_MACHINE|USERS|CURRENT_USER|CURRENT_CONFIG|CLASSES_ROOT))(?P<key>(\\[^"'\\/]+){1,}\\?)("|'|\s)?""", 
        'priority': '1', 
        'entity': 'registrykey'
    },
    'GUID_REGEX': {
        'regex': r"^[a-z0-9]{8}(-[a-z0-9]{4}){3}-[a-z0-9]{12}$", 
        'priority': '1', 
        'data_format': 'uuid'
    },

}

write_regexes(data)

In [24]:
def append_regex(name, regex, priority, entity):
    with open ('regexes.json') as json_file:
        data = json.load(json_file)
        y = {name: {'regex': regex, 'priority': priority, 'entity': entity}}
        data.update(y)
    with open ('regexes.json', 'w') as f:
        json.dump(data, f)

In [25]:
def get_regexes():
    with open('regexes.json') as f:
        return json.load(f)

In [26]:
entity_regexes = get_regexes()
print(entity_regexes)

{'DNS_REGEX': {'regex': '^((?=[a-z0-9-]{1,63}\\.)[a-z0-9]+(-[a-z0-9]+)*\\.){1,126}[a-z]{2,63}$', 'priority': '1', 'entity': 'host'}, 'IPV4_REGEX': {'regex': '^(?P<ipaddress>(?:[0-9]{1,3}\\.){3}[0-9]{1,3})$', 'priority': '0', 'entity': 'ipaddress'}, 'IPV6_REGEX': {'regex': '^(?<![:.\\w])(?:[A-F0-9]{0,4}:){2,7}[A-F0-9]{0,4}(?![:.\\w])$', 'priority': '0', 'entity': 'ipaddress'}, 'URL_REGEX': {'regex': '\n            ^\n            (?P<protocol>(https?|ftp|telnet|ldap|file)://)\n            (?P<userinfo>([a-z0-9-._~!$&\\\'()*+,;=:]|%[0-9A-F]{2})*@)?\n            (?P<host>([a-z0-9-._~!$&\\\'()*+,;=]|%[0-9A-F]{2})*)\n            (:(?P<port>\\d*))?\n            (/(?P<path>([^?\\#"<>\\s]|%[0-9A-F]{2})*/?))?\n            (\\?(?P<query>([a-z0-9-._~!$&\'()*+,;=:/?@]|%[0-9A-F]{2})*))?\n            (\\#(?P<fragment>([a-z0-9-._~!$&\'()*+,;=:/?@]|%[0-9A-F]{2})*))?\n            $\n            ', 'priority': '0', 'entity': 'url'}, 'MD5_REGEX': {'regex': '^(?:^|[^A-Fa-f0-9])(?P<hash>[A-Fa-f0-9]{32})(?:$

# Regex application to tables

In [27]:
# Core MSTICPy initialization for Notebooks
from msticpy.nbtools import nbinit
nbinit.init_notebook(namespace=globals());

# Load query providers (typically you'll be using just one)
qry_prov = QueryProvider("AzureSentinel")

In [28]:
qry_prov.connect(WorkspaceConfig())

In [29]:
len(qry_prov.schema)

331

In [30]:
signin_df = qry_prov.exec_query("SigninLogs | sample 100")
signin_df.head()

Unnamed: 0,TenantId,SourceSystem,TimeGenerated,ResourceId,OperationName,OperationVersion,Category,ResultType,ResultSignature,ResultDescription,DurationMs,CorrelationId,Resource,ResourceGroup,ResourceProvider,Identity,Level,Location,AlternateSignInName,AppDisplayName,AppId,AuthenticationDetails,AuthenticationMethodsUsed,AuthenticationProcessingDetails,AuthenticationRequirement,...,RiskEventTypes,RiskEventTypes_V2,RiskLevelAggregated,RiskLevelDuringSignIn,RiskState,ResourceDisplayName,ResourceIdentity,ServicePrincipalId,ServicePrincipalName,Status,TokenIssuerName,TokenIssuerType,UserAgent,UserDisplayName,UserId,UserPrincipalName,AADTenantId,UserType,FlaggedForReview,IPAddressFromResourceProvider,SignInIdentifier,SignInIdentifierType,ResourceTenantId,HomeTenantId,Type
0,8ecf8077-cf51-4820-aadd-14040956f35d,Azure AD,2021-07-27 11:36:14.794000+00:00,/tenants/4b2462a4-bbee-495a-a0e1-f23ae524cc9c/providers/Microsoft.aadiam,Sign-in activity,1.0,SignInLogs,0,,,0,204fa613-b849-4aab-986c-43786bdec271,Microsoft.aadiam,Microsoft.aadiam,,Traveler Account,4,RU,,Azure Portal,c44b4083-3bb0-49c1-b47d-974e53cbdf3c,"[\r\n {\r\n ""authenticationStepDateTime"": ""2021-07-27T11:36:14.7946273+00:00"",\r\n ""authe...",,"[\r\n {\r\n ""key"": ""IsCAEToken"",\r\n ""value"": ""False""\r\n }\r\n]",singleFactorAuthentication,...,[],[],none,none,none,Windows Azure Service Management API,797f4846-ba00-4fd7-ba43-dac1f8f63013,,,{'errorCode': 0},,AzureAD,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.447...",Traveler Account,fac439e8-41d3-414d-b1c6-ea168a1e80b3,traveleraccount@seccxpninja.onmicrosoft.com,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Member,,,,,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,SigninLogs
1,8ecf8077-cf51-4820-aadd-14040956f35d,Azure AD,2021-05-05 10:44:30.585000+00:00,/tenants/4b2462a4-bbee-495a-a0e1-f23ae524cc9c/providers/Microsoft.aadiam,Sign-in activity,1.0,SignInLogs,16000,,Other,0,420e8936-b427-4d50-b364-5aaa78e86ba9,Microsoft.aadiam,Microsoft.aadiam,,Pavel Morozov,4,RU,,Azure Portal,c44b4083-3bb0-49c1-b47d-974e53cbdf3c,"[\r\n {\r\n ""authenticationStepDateTime"": ""2021-05-05T10:44:30.5857172+00:00"",\r\n ""authe...",,"[\r\n {\r\n ""key"": ""IsCAEToken"",\r\n ""value"": ""False""\r\n }\r\n]",singleFactorAuthentication,...,[],[],none,none,none,Windows Azure Service Management API,797f4846-ba00-4fd7-ba43-dac1f8f63013,,,"{'errorCode': 16000, 'failureReason': 'Other'}",,AzureAD,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.443...",Pavel Morozov,bb8912d4-25b7-4e82-a16f-289774cef163,pmorozov@viacode.com,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Member,,,,,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,,SigninLogs
2,8ecf8077-cf51-4820-aadd-14040956f35d,Azure AD,2021-06-16 15:28:01.513000+00:00,/tenants/4b2462a4-bbee-495a-a0e1-f23ae524cc9c/providers/Microsoft.aadiam,Sign-in activity,1.0,SignInLogs,50126,,Invalid username or password or Invalid on-premise username or password.,0,c8e98a2f-1927-47e5-85f0-02f57bb2c4cf,Microsoft.aadiam,Microsoft.aadiam,,Purview DataCurator,4,US,purviewdc@seccxp.ninja,Microsoft Azure Purview Studio,632d803a-b0c2-49b4-a944-e13c384c04a8,"[\r\n {\r\n ""authenticationStepDateTime"": ""2021-06-16T15:28:01.5131514+00:00"",\r\n ""authe...",,"[\r\n {\r\n ""key"": ""IsCAEToken"",\r\n ""value"": ""False""\r\n }\r\n]",singleFactorAuthentication,...,[],[],none,none,none,Microsoft Graph,00000003-0000-0000-c000-000000000000,,,"{'errorCode': 50126, 'failureReason': 'Invalid username or password or Invalid on-premise userna...",,AzureAD,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.447...",Purview DataCurator,ad75bd68-b15a-465b-8192-0bf2ce84095a,purviewdc@seccxp.ninja,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Member,,,purviewdc@seccxp.ninja,,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,SigninLogs
3,8ecf8077-cf51-4820-aadd-14040956f35d,Azure AD,2021-05-31 13:47:00.875000+00:00,/tenants/4b2462a4-bbee-495a-a0e1-f23ae524cc9c/providers/Microsoft.aadiam,Sign-in activity,1.0,SignInLogs,0,,,0,535d4ef9-ad2a-4ad3-b0d5-c6a22f2ce980,Microsoft.aadiam,Microsoft.aadiam,,Arseny Vasilev,4,RU,,Azure Portal,c44b4083-3bb0-49c1-b47d-974e53cbdf3c,"[\r\n {\r\n ""authenticationStepDateTime"": ""2021-05-31T13:47:00.875543+00:00"",\r\n ""authen...",,"[\r\n {\r\n ""key"": ""IsCAEToken"",\r\n ""value"": ""False""\r\n }\r\n]",singleFactorAuthentication,...,[],[],none,none,none,Windows Azure Service Management API,797f4846-ba00-4fd7-ba43-dac1f8f63013,,,{'errorCode': 0},,AzureAD,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.443...",Arseny Vasilev,9267d02c-5f76-40a9-a9eb-b686f3ca47aa,avasilev@viacode.com,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Guest,,,,,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,5fccd68a-e65e-46ae-96b1-2d896d680249,SigninLogs
4,8ecf8077-cf51-4820-aadd-14040956f35d,Azure AD,2021-05-18 10:06:03.226000+00:00,/tenants/4b2462a4-bbee-495a-a0e1-f23ae524cc9c/providers/Microsoft.aadiam,Sign-in activity,1.0,SignInLogs,0,,,0,0f5ad317-0d7a-42d5-bbce-6b7d7b14cd4e,Microsoft.aadiam,Microsoft.aadiam,,Michał Olczak (PL),4,PL,,Azure Portal,c44b4083-3bb0-49c1-b47d-974e53cbdf3c,"[\r\n {\r\n ""authenticationStepDateTime"": ""2021-05-18T10:06:03.2263377+00:00"",\r\n ""authe...",,"[\r\n {\r\n ""key"": ""IsCAEToken"",\r\n ""value"": ""False""\r\n }\r\n]",singleFactorAuthentication,...,[],[],none,none,none,Windows Azure Service Management API,797f4846-ba00-4fd7-ba43-dac1f8f63013,,,{'errorCode': 0},,AzureAD,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.443...",Michał Olczak (PL),672885e3-2e3b-4514-a620-4dc1dbc8b095,michal.olczak@pwc.com,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Guest,,,,,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,513294a0-3e20-41b2-a970-6d30bf1546fa,SigninLogs


In [31]:
secevent_df = qry_prov.exec_query("SecurityEvent | sample 100")
secevent_df.head()

Unnamed: 0,TenantId,TimeGenerated,SourceSystem,Account,AccountType,Computer,EventSourceName,Channel,Task,Level,EventData,EventID,Activity,SourceComputerId,EventOriginId,MG,TimeCollected,ManagementGroupName,AccessList,AccessMask,AccessReason,AccountDomain,AccountExpires,AccountName,AccountSessionIdentifier,...,TargetUserName,TargetUserSid,TemplateContent,TemplateDSObjectFQDN,TemplateInternalName,TemplateOID,TemplateSchemaVersion,TemplateVersion,TokenElevationType,TransmittedServices,UserAccountControl,UserParameters,UserPrincipalName,UserWorkstations,VirtualAccount,VendorIds,Workstation,WorkstationName,PartitionKey,RowKey,StorageAccount,AzureDeploymentID,AzureTableName,Type,_ResourceId
0,8ecf8077-cf51-4820-aadd-14040956f35d,2021-06-11 16:12:32.930000+00:00,OpsManager,\ADMIN,User,SOC-FW-RDP,Microsoft-Windows-Security-Auditing,Security,12544,16,,4625,4625 - An account failed to log on.,41502da5-21b7-48ec-81c9-baeea8d7d669,5bb62b73-62d6-48d0-98b7-8da16a75b407,00000000-0000-0000-0000-000000000001,2021-06-11 16:12:35.274000+00:00,AOI-8ecf8077-cf51-4820-aadd-14040956f35d,,,,,,,,...,ADMIN,S-1-0-0,,,,,,,,-,,,,,,,,-,,,,,,SecurityEvent,/subscriptions/d1d8779d-38d7-4f06-91db-9cbc8de0176f/resourcegroups/soc-fortinet/providers/micros...
1,8ecf8077-cf51-4820-aadd-14040956f35d,2021-07-27 16:07:34.030000+00:00,OpsManager,NT AUTHORITY\SYSTEM,User,ADFS01.seccxp.ninja,Microsoft-Windows-AppLocker,Microsoft-Windows-AppLocker/EXE and DLL,0,4,"<UserData xmlns=""http://schemas.microsoft.com/win/2004/08/events/event"">\r\n <RuleAndFileData x...",8002,8002 - A process was allowed to run.,20ca974d-fade-4e11-abc4-6fc11997180a,23a7b5c5-d8fa-42a5-b4a4-9b7f04af9653,00000000-0000-0000-0000-000000000001,2021-07-27 16:08:05.355000+00:00,AOI-8ecf8077-cf51-4820-aadd-14040956f35d,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,SecurityEvent,/subscriptions/d1d8779d-38d7-4f06-91db-9cbc8de0176f/resourcegroups/simuland/providers/microsoft....
2,8ecf8077-cf51-4820-aadd-14040956f35d,2021-05-23 15:53:26.993000+00:00,OpsManager,\ADMINISTRATOR,User,SOC-FW-RDP,Microsoft-Windows-Security-Auditing,Security,12544,16,,4625,4625 - An account failed to log on.,41502da5-21b7-48ec-81c9-baeea8d7d669,746bd43b-ce81-45f9-ad53-5d149e2b003e,00000000-0000-0000-0000-000000000001,2021-05-23 15:53:31.542000+00:00,AOI-8ecf8077-cf51-4820-aadd-14040956f35d,,,,,,,,...,ADMINISTRATOR,S-1-0-0,,,,,,,,-,,,,,,,,-,,,,,,SecurityEvent,/subscriptions/d1d8779d-38d7-4f06-91db-9cbc8de0176f/resourcegroups/soc-fortinet/providers/micros...
3,8ecf8077-cf51-4820-aadd-14040956f35d,2021-07-27 16:07:34.023000+00:00,OpsManager,SECCXP\ADFS01$,Machine,ADFS01.seccxp.ninja,Microsoft-Windows-Security-Auditing,Security,13312,8,,4688,4688 - A new process has been created.,20ca974d-fade-4e11-abc4-6fc11997180a,edbab4c6-0323-4552-8c83-de0afed8a790,00000000-0000-0000-0000-000000000001,2021-07-27 16:08:05.355000+00:00,AOI-8ecf8077-cf51-4820-aadd-14040956f35d,,,,,,,,...,-,S-1-0-0,,,,,,,%%1936,,,,,,,,,,,,,,,SecurityEvent,/subscriptions/d1d8779d-38d7-4f06-91db-9cbc8de0176f/resourcegroups/simuland/providers/microsoft....
4,8ecf8077-cf51-4820-aadd-14040956f35d,2021-07-27 16:07:34.050000+00:00,OpsManager,SECCXP\ADFS01$,Machine,ADFS01.seccxp.ninja,Microsoft-Windows-Security-Auditing,Security,12801,8,"<EventData xmlns=""http://schemas.microsoft.com/win/2004/08/events/event"">\r\n <Data Name=""Subje...",4663,4663 - An attempt was made to access an object.,20ca974d-fade-4e11-abc4-6fc11997180a,6f4e61a1-5a1a-4b2f-9996-ade3bed99b63,00000000-0000-0000-0000-000000000001,2021-07-27 16:08:05.355000+00:00,AOI-8ecf8077-cf51-4820-aadd-14040956f35d,%%4432 \t\t\t\t,0x1,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,SecurityEvent,/subscriptions/d1d8779d-38d7-4f06-91db-9cbc8de0176f/resourcegroups/simuland/providers/microsoft....


In [32]:
offact_df = qry_prov.exec_query("OfficeActivity | sample 100")
offact_df.head()

Unnamed: 0,TenantId,Application,UserDomain,UserAgent,RecordType,TimeGenerated,Operation,OrganizationId,OrganizationId_,UserType,UserKey,OfficeWorkload,ResultStatus,ResultReasonType,OfficeObjectId,UserId,UserId_,ClientIP,ClientIP_,Scope,Site_,ItemType,EventSource,Source_Name,MachineDomainInfo,...,ChannelType,ChannelName,ChannelGuid,ExtraProperties,AddOnType,AddonName,TabType,Name,OldValue,NewValue,ItemName,ChatThreadId,ChatName,CommunicationType,AADGroupId,AddOnGuid,AppDistributionMode,TargetUserId,OperationScope,AzureADAppId,OperationProperties,AppId,ClientAppId,Type,_ResourceId
0,8ecf8077-cf51-4820-aadd-14040956f35d,,,,50,2021-07-27 11:11:58+00:00,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F88D275,Exchange,Succeeded,Succeeded,,seb@seccxpninja.onmicrosoft.com,seb@seccxpninja.onmicrosoft.com,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,"[{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]",7ab7862c-4c57-491e-8a45-d52a7e023983,,OfficeActivity,
1,8ecf8077-cf51-4820-aadd-14040956f35d,,,,50,2021-07-07 13:29:47+00:00,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,10032000D105B030,Exchange,Succeeded,Succeeded,,JBritt@seccxpninja.onmicrosoft.com,JBritt@seccxpninja.onmicrosoft.com,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,"[{'Value': 'Bind', 'Name': 'MailAccessType'}, {'Value': 'False', 'Name': 'IsThrottled'}]",7a5fbd1c-3e6d-461a-9075-83049393b3a7,7a5fbd1c-3e6d-461a-9075-83049393b3a7,OfficeActivity,
2,8ecf8077-cf51-4820-aadd-14040956f35d,,,,50,2021-07-27 11:18:13+00:00,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F8A6FC7,Exchange,Succeeded,Succeeded,,MeganB@seccxp.ninja,MeganB@seccxp.ninja,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,"[{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]",414a677a-e50f-46ea-b89c-aebb8a9efbe2,,OfficeActivity,
3,8ecf8077-cf51-4820-aadd-14040956f35d,,,,ExchangeAdmin,2021-06-11 11:18:27+00:00,Set-User,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,DcAdmin,NT AUTHORITY\SYSTEM (Microsoft.Exchange.Management.ForwardSync),Exchange,True,True,NAMPR06A007.PROD.OUTLOOK.COM/Microsoft Exchange Hosted Organizations/seccxpninja.onmicrosoft.com...,NT AUTHORITY\SYSTEM (Microsoft.Exchange.Management.ForwardSync),NT AUTHORITY\SYSTEM (Microsoft.Exchange.Management.ForwardSync),,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,OfficeActivity,
4,8ecf8077-cf51-4820-aadd-14040956f35d,,,,50,2021-07-27 11:18:13+00:00,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F8A6FC7,Exchange,Succeeded,Succeeded,,MeganB@seccxp.ninja,MeganB@seccxp.ninja,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,"[{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]",414a677a-e50f-46ea-b89c-aebb8a9efbe2,,OfficeActivity,


In [33]:
comsec_df = qry_prov.exec_query("CommonSecurityLog | sample 1000")
comsec_df.head()

Unnamed: 0,TenantId,SourceSystem,TimeGenerated,ReceiptTime,DeviceVendor,DeviceProduct,DeviceEventClassID,LogSeverity,OriginalLogSeverity,DeviceAction,SimplifiedDeviceAction,Computer,CommunicationDirection,DeviceFacility,DestinationPort,DestinationIP,DeviceAddress,DeviceName,Message,Protocol,SourcePort,SourceIP,RemoteIP,RemotePort,MaliciousIP,...,DeviceCustomString4,DeviceCustomString4Label,DeviceCustomString5,DeviceCustomString5Label,DeviceCustomString6,DeviceCustomString6Label,DeviceCustomDate1,DeviceCustomDate1Label,DeviceCustomDate2,DeviceCustomDate2Label,FlexDate1,FlexDate1Label,FlexNumber1,FlexNumber1Label,FlexNumber2,FlexNumber2Label,FlexString1,FlexString1Label,FlexString2,FlexString2Label,AdditionalExtensions,StartTime,EndTime,Type,_ResourceId
0,8ecf8077-cf51-4820-aadd-14040956f35d,OpsManager,2021-05-27 10:13:41.583000+00:00,,Fortinet,Fortigate,00013,3,,,,,,,3389.0,10.0.0.5,,,,6.0,57487.0,91.223.67.13,,,,...,,,,,,,,,,,,,,,,,,,,,FortinetFortiGateeventtime=1622110543134663766;FortinetFortiGatetz=-0700;FortinetFortiGatelogid=...,NaT,NaT,CommonSecurityLog,/subscriptions/d1d8779d-38d7-4f06-91db-9cbc8de0176f/resourcegroups/soc-fortinet/providers/micros...
1,8ecf8077-cf51-4820-aadd-14040956f35d,OpsManager,2021-07-07 03:01:29.513000+00:00,,Fortinet,Fortigate,28704,2,,,,,1.0,,443.0,20.44.8.3,,,"Cloud.IT: Microsoft.Azure,",6.0,51002.0,10.0.1.5,,,,...,,,,,,,,,,,,,,,,,,,,,FortinetFortiGateeventtime=1625626912883535736;FortinetFortiGatetz=-0700;FortinetFortiGatelogid=...,NaT,NaT,CommonSecurityLog,/subscriptions/d1d8779d-38d7-4f06-91db-9cbc8de0176f/resourcegroups/soc-fortinet/providers/micros...
2,8ecf8077-cf51-4820-aadd-14040956f35d,OpsManager,2021-06-03 17:45:12.266000+00:00,$cefformatted-receive_time,Palo Alto Networks,PAN-OS,end,1,,,,,,,,10.6.1.5,,,,,,40.76.235.24,,,,...,,,,,,,,,,,,,,,,,,,,,,NaT,NaT,CommonSecurityLog,/subscriptions/d1d8779d-38d7-4f06-91db-9cbc8de0176f/resourcegroups/soc-fortinet/providers/micros...
3,8ecf8077-cf51-4820-aadd-14040956f35d,OpsManager,2021-07-23 01:45:34.986000+00:00,$cefformatted-receive_time,Palo Alto Networks,PAN-OS,end,1,,,,,,,,10.6.1.5,,,,,,20.51.113.252,,,,...,,,,,,,,,,,,,,,,,,,,,,NaT,NaT,CommonSecurityLog,/subscriptions/d1d8779d-38d7-4f06-91db-9cbc8de0176f/resourcegroups/soc-fortinet/providers/micros...
4,8ecf8077-cf51-4820-aadd-14040956f35d,OpsManager,2021-07-27 15:39:32.680000+00:00,,Fortinet,Fortigate,28704,2,,,,,0.0,,443.0,20.44.8.204,,,"Web.Client: HTTPS.BROWSER,",6.0,46320.0,10.0.1.5,,,,...,,,,,,,,,,,,,,,,,,,,,FortinetFortiGateeventtime=1627400380074752837;FortinetFortiGatetz=-0700;FortinetFortiGatelogid=...,NaT,NaT,CommonSecurityLog,/subscriptions/d1d8779d-38d7-4f06-91db-9cbc8de0176f/resourcegroups/soc-fortinet/providers/micros...


In [151]:
from collections import defaultdict

def match_regexes(data, debug=False):
    """Function to apply every regex to every column in the given table.

    Args:
        data (DataFrame): A table/log queried from the connected Azure Sentinel workspace.
        debug (bool, optional): If True, prints the columns for which no match was found. Defaults to False.

    Returns:
        Dictionary: {Column: {Regex: (Match ratio excluding blanks, Total match ratio)}}
    """
    # Dictionary to store results
    full_matches = {}
    # Iterate over each column   
    for col in data.columns:
        # Skip non-string columns
        if data[col].dtype != np.dtype("O"):
            if debug:
                print(f" -- col {col} is type {data[col].dtype}. Skipping")
            continue
        # Iterate over every regex
        for name, regex in entity_regexes.items():
            # Try the regex on the column
            match_series = data[col].str.match(regex['regex'], case=False, flags=re.VERBOSE)
            # If there are more than zero rows in the table
            if len(match_series) > 0:
                # Calculate the match ratios, including blanks (total_match_ratio) 
                # and not including blanks (match_ratio)
                total_match_ratio = match_series.sum() / len(match_series)
                blanks_df = data[col].str.strip() == ""
                num_non_blanks = len(match_series) - blanks_df.sum()
                match_ratio = match_series.sum() / num_non_blanks if num_non_blanks > 0 else total_match_ratio
                # If at least one entry in the column matched the regex 
                if total_match_ratio > 0:
                    # Add the column, regex, and match ratios to the dict
                    # If this column has already matched with a regex
                    if col in full_matches:
                        full_matches[col][name] = match_ratio, total_match_ratio
                    else:
                        full_matches[col] = {}
                        full_matches[col][name] = match_ratio, total_match_ratio
        if col not in full_matches and debug:
            print(f" -- col {col} no match found")
    return full_matches

In [152]:
table_regexes = {}
table_regexes["SigninLogs"] = match_regexes(signin_df)
table_regexes["SecurityEvent"] = match_regexes(secevent_df)
table_regexes["OfficeActivity"] = match_regexes(offact_df)
table_regexes["CommonSecurityLog"] = match_regexes(comsec_df)

import pprint
for table, cols in table_regexes.items():
    print(table)  
    print("-" * len(table))
    pprint.pprint(cols)

SigninLogs
----------
{'AADTenantId': {'GUID_REGEX': (1.0, 1.0)},
 'AlternateSignInName': {'EMAIL_REGEX': (1.0, 0.34)},
 'AppDisplayName': {'DNS_REGEX': (0.01, 0.01)},
 'AppId': {'GUID_REGEX': (1.0, 1.0)},
 'CorrelationId': {'GUID_REGEX': (1.0, 1.0)},
 'HomeTenantId': {'GUID_REGEX': (1.0, 0.98)},
 'IPAddress': {'IPV4_REGEX': (0.94, 0.94), 'IPV6_REGEX': (0.06, 0.06)},
 'Id': {'GUID_REGEX': (1.0, 1.0)},
 'OriginalRequestId': {'GUID_REGEX': (1.0, 1.0)},
 'Resource': {'DNS_REGEX': (1.0, 1.0)},
 'ResourceGroup': {'DNS_REGEX': (1.0, 1.0)},
 'ResourceId': {'LXPATH_REGEX': (1.0, 1.0), 'RESOURCEID_REGEX': (1.0, 1.0)},
 'ResourceIdentity': {'GUID_REGEX': (1.0, 0.95)},
 'ResourceTenantId': {'GUID_REGEX': (1.0, 1.0)},
 'SignInIdentifier': {'EMAIL_REGEX': (1.0, 0.34)},
 'TenantId': {'GUID_REGEX': (1.0, 1.0)},
 'UserId': {'GUID_REGEX': (1.0, 1.0)},
 'UserPrincipalName': {'EMAIL_REGEX': (1.0, 1.0)}}
SecurityEvent
-------------
{'Account': {'NTACCT_REGEX': (0.6923076923076923, 0.63),
             'WIN

In [153]:
def table_match_to_html(table_name, show_guids=False):
    """Return table column matches as HTML table."""
    if table_name not in table_regexes:
        return HTML("No data")
    
    # Create html table header
    table_html = ["<table><thead><tr><th>Column</th><th>Matches</th></tr></thead><tbody>"]

    for col, matches in table_regexes[table_name].items():
        col_html = {}
        for rgx_match, perc_match in matches.items():
            if rgx_match == "GUID_REGEX" and not show_guids:
                continue
            # Get the entity name and priority for this match
            entity_name = entity_regexes.get(rgx_match, {}).get("entity")
            regex_priority = entity_regexes.get(rgx_match, {}).get("priority", 0)
            if not entity_name:
                entity_name = rgx_match
            # Add a row for the column (using a dictionary since we later want to sort
            # based on priority)
            col_html[regex_priority] = (
                f"<b>{entity_name}</b> [p:{regex_priority}] "
                f"(matched {rgx_match} {perc_match[0] * 100:0.1f}%,  "
                f"all rows {perc_match[1] * 100:0.1f}%) "
            )
        # sort the different matches by priority
        sorted_by_pri = [value for key, value in sorted(col_html.items(), key=lambda item: item[0])]
        # join the matches with some space separators
        cols = "&nbsp;&nbsp;".join(sorted_by_pri)
        # add this as an html table row to the table list
        table_html.append(f"<tr><td><b>{col}</b></td><td>{cols}</td><tr>")
    # add a text heading
    header = "<h2>Column entities</h2>"
    # build and return the table html
    return HTML(f"{header} {''.join(table_html)}</tbody></table>")


nbwidgets.SelectItem(item_list=list(qry_prov.schema.keys()), height="300px", action=table_match_to_html)

VBox(children=(Text(value='', description='Filter:', style=DescriptionStyle(description_width='initial')), Sel…

Column,Matches
TenantId,
,
Account,"account [p:0] (matched NTACCT_REGEX 69.2%, all rows 63.0%) file [p:1] (matched WINPATH_REGEX 30.8%, all rows 28.0%)"
,
Computer,"host [p:1] (matched DNS_REGEX 51.0%, all rows 51.0%)"
,
SourceComputerId,
,
EventOriginId,
,


## Modification of match_regexes function for partial (substring) matches

In [154]:
from collections import defaultdict

def match_regexes_partial(data, debug=False):
    full_matches = {}
    for col in data.columns:

        if data[col].dtype != np.dtype("O"):
            if debug:
                print(f" -- col {col} is type {data[col].dtype}. Skipping")
            continue
            
        for name, regex in entity_regexes.items():
            # strip off ^ and $ delimiters
            part_regex = re.sub(r"^\s*\^(.*)\s*\$\s*$", r"\1", regex["regex"])
            match_series = data[col].str.match(part_regex, case=False, flags=re.VERBOSE)
            
            if len(match_series) > 0:
                match_ratio = match_series.sum() / len(match_series)
                blanks_df = data[col].str.strip() == ""
                num_non_blanks = len(match_series) - blanks_df.sum()
                true_match_ratio = match_series.sum() / num_non_blanks if num_non_blanks > 0 else match_ratio
                if match_ratio > 0:
                    if col in full_matches:
                        full_matches[col][name] = true_match_ratio
                    else:
                        full_matches[col] = {}
                        full_matches[col][name] = true_match_ratio
                
        if col not in full_matches and debug:
            print(f" -- col {col} no match found")
    return full_matches

In [155]:
table_regexes_part = {}
table_regexes_part["SigninLogs"] = match_regexes_partial(signin_df)
table_regexes_part["SecurityEvent"] = match_regexes_partial(secevent_df)
table_regexes_part["OfficeActivity"] = match_regexes_partial(offact_df)
table_regexes_part["CommonSecurityLog"] = match_regexes_partial(comsec_df)

import pprint
for table, cols in table_regexes_part.items():
    print(table)
    
    print("-" * len(table))
    pprint.pprint(cols)

SigninLogs
----------
{'AADTenantId': {'GUID_REGEX': 1.0},
 'AlternateSignInName': {'EMAIL_REGEX': 1.0},
 'AppDisplayName': {'DNS_REGEX': 0.01},
 'AppId': {'GUID_REGEX': 1.0},
 'CorrelationId': {'GUID_REGEX': 1.0},
 'HomeTenantId': {'GUID_REGEX': 1.0},
 'IPAddress': {'IPV4_REGEX': 0.94, 'IPV6_REGEX': 0.06},
 'Id': {'GUID_REGEX': 1.0},
 'OriginalRequestId': {'GUID_REGEX': 1.0},
 'Resource': {'DNS_REGEX': 1.0},
 'ResourceGroup': {'DNS_REGEX': 1.0},
 'ResourceId': {'LXPATH_REGEX': 1.0, 'RESOURCEID_REGEX': 1.0},
 'ResourceIdentity': {'GUID_REGEX': 1.0},
 'ResourceTenantId': {'GUID_REGEX': 1.0},
 'SignInIdentifier': {'EMAIL_REGEX': 1.0},
 'TenantId': {'GUID_REGEX': 1.0},
 'UserId': {'GUID_REGEX': 1.0},
 'UserPrincipalName': {'DNS_REGEX': 0.09, 'EMAIL_REGEX': 1.0}}
SecurityEvent
-------------
{'Account': {'NTACCT_REGEX': 0.6923076923076923,
             'WINPATH_REGEX': 0.3076923076923077},
 'AdditionalInfo2': {'NTACCT_REGEX': 1.0},
 'Computer': {'DNS_REGEX': 0.51},
 'EventOriginId': {'GUID_

In [61]:
table_regexes

{'UserAccessAnalytics': {'TenantId': {'GUID_REGEX': (1.0, 1.0)},
  'AADTenantId': {'GUID_REGEX': (1.0, 1.0)},
  'SourceEntityId': {'GUID_REGEX': (1.0, 1.0)},
  'SourceEntityName': {'SHA256_REGEX': (0.010416666666666666, 0.01),
   'GUID_REGEX': (0.041666666666666664, 0.04)},
  'TargetEntityId': {'GUID_REGEX': (1.0, 1.0)},
  'AccessId': {'GUID_REGEX': (1.0, 0.24)}},
 'Usage': {'TenantId': {'GUID_REGEX': (1.0, 1.0)},
  'ResourceUri': {'LXPATH_REGEX': (1.0, 1.0), 'RESOURCEID_REGEX': (1.0, 1.0)},
  'MeterId': {'GUID_REGEX': (1.0, 1.0)},
  'LinkedMeterId': {'GUID_REGEX': (1.0, 1.0)}},
 'UpdateSummary': {'TenantId': {'GUID_REGEX': (1.0, 1.0)},
  'MG': {'GUID_REGEX': (1.0, 1.0)},
  'SourceComputerId': {'GUID_REGEX': (1.0, 1.0)},
  'Computer': {'DNS_REGEX': (0.39, 0.39)},
  'SubscriptionId': {'GUID_REGEX': (1.0, 1.0)},
  'ResourceProvider': {'DNS_REGEX': (1.0, 1.0)},
  'ResourceId': {'LXPATH_REGEX': (1.0, 1.0), 'RESOURCEID_REGEX': (1.0, 1.0)},
  'VMUUID': {'GUID_REGEX': (1.0, 1.0)},
  '_Resourc

In [169]:
def interpret_matches(table_match_dic):
    """For each column apply priority and match percentage logic to assign an entity to the column.
        
    Args:
        table_match_dic (Dict): Output of match_entities function. Dict showing all columns that matched one or more regexes.
    Returns:
        Dictionary: {Table: {Column: Entity}}
    """
    entity_assignments = {}
    for table, cols in table_match_dic.items():
        entity_assignments[table] = {}
        for col, matches in cols.items(): 
            highest_perc = 0
            highest_pri = 3
            isMatch = False
            for rgx_match, perc_match in matches.items():
                # Ignore GUID matches
                if rgx_match == "GUID_REGEX":
                    continue
                # Choose entity corresponding to the regex with the highest total match percentage
                # If tie, choose entity with highest priority
                # 0 has highest priority, 2 is the lowest
                isMatch = True
                regex_priority = int(entity_regexes.get(rgx_match, {}).get("priority", 0))
                regex = rgx_match
                if regex_priority < highest_pri:
                    highest_pri = regex_priority
                    rgx = rgx_match
                if perc_match[0] > highest_perc == True:
                    highest_perc = perc_match
                    regex = rgx_match
                elif perc_match[0] == highest_perc:
                    regex = rgx
            if(isMatch):
                entity_name = entity_regexes.get(regex, {}).get("entity")
                entity_assignments[table][col] = entity_name

    return entity_assignments
    

In [171]:
table_entities = interpret_matches(table_regexes)

for table, cols in table_entities.items():
    print(table)
    
    print("-" * len(table))
    pprint.pprint(cols)

SigninLogs
----------
{'AlternateSignInName': 'account',
 'AppDisplayName': 'host',
 'IPAddress': 'ipaddress',
 'Resource': 'host',
 'ResourceGroup': 'host',
 'ResourceId': 'azureresource',
 'SignInIdentifier': 'account',
 'UserPrincipalName': 'account'}
SecurityEvent
-------------
{'Account': 'account',
 'Computer': 'host',
 'FileHash': 'hash',
 'FilePath': 'account',
 'IpAddress': 'ipaddress',
 'MandatoryLabel': 'account',
 'NewProcessName': 'process',
 'ObjectName': 'file',
 'ParentProcessName': 'process',
 'Process': 'process',
 'ProcessName': 'process',
 'SubjectAccount': 'account',
 'SubjectUserSid': 'account',
 'TargetAccount': 'account',
 'TargetUser': 'account',
 'TargetUserSid': 'account',
 '_ResourceId': 'azureresource'}
OfficeActivity
--------------
{'ClientIP': 'ipaddress',
 'ClientIP_': 'ipaddress',
 'Client_IPAddress': 'ipaddress',
 'LogonUserSid': 'account',
 'MailboxOwnerSid': 'account',
 'MailboxOwnerUPN': 'account',
 'OfficeObjectId': 'url',
 'OrganizationName': 'hos

In [161]:
def create_entity_index(data):
    """Iterates through the interpreted results to create a dict keyed by entity type.

    Args:
        data (Dict): Output of interpret_matches function. Dict of column-entity mappings keyed by table and column.

    Returns:
        Dict: {entity: [(table, col)]}
    """
    entity_dict = {}
    for table, cols in data.items():
        for col, entity in cols.items():
            entity_dict[entity] = []
    for table, cols in data.items():
        for col, entity in cols.items():
            entity_dict[entity].append((table, col))
    return entity_dict

In [172]:
key_entity_dict = create_entity_index(table_entities)

import pprint
for entity, tables_cols in key_entity_dict.items():
    print(entity)
    
    print("-" * len(entity))
    pprint.pprint(tables_cols)

azureresource
-------------
[('SigninLogs', 'ResourceId'),
 ('SecurityEvent', '_ResourceId'),
 ('CommonSecurityLog', '_ResourceId')]
host
----
[('SigninLogs', 'Resource'),
 ('SigninLogs', 'ResourceGroup'),
 ('SigninLogs', 'AppDisplayName'),
 ('SecurityEvent', 'Computer'),
 ('OfficeActivity', 'SourceFileName'),
 ('OfficeActivity', 'SourceFileName_'),
 ('OfficeActivity', 'OrganizationName'),
 ('CommonSecurityLog', 'RequestURL'),
 ('CommonSecurityLog', 'DeviceCustomString5')]
account
-------
[('SigninLogs', 'AlternateSignInName'),
 ('SigninLogs', 'UserPrincipalName'),
 ('SigninLogs', 'SignInIdentifier'),
 ('SecurityEvent', 'Account'),
 ('SecurityEvent', 'FilePath'),
 ('SecurityEvent', 'MandatoryLabel'),
 ('SecurityEvent', 'SubjectAccount'),
 ('SecurityEvent', 'SubjectUserSid'),
 ('SecurityEvent', 'TargetAccount'),
 ('SecurityEvent', 'TargetUser'),
 ('SecurityEvent', 'TargetUserSid'),
 ('OfficeActivity', 'UserKey'),
 ('OfficeActivity', 'UserId'),
 ('OfficeActivity', 'UserId_'),
 ('OfficeAc

In [181]:
output_regexes = {}

for i in range(3):
    table, cols = qry_prov.schema.popitem()
    df = qry_prov.exec_query(f"{table} | sample 100")
    while len(df) == 0:
        table, cols = qry_prov.schema.popitem()
        df = qry_prov.exec_query(f"{table} | sample 100")
    output_regexes[table] = match_regexes(df)
output_entities = interpret_matches(output_regexes)
keyed_entities = create_entity_index(output_entities)
        
for entity, tables_cols in keyed_entities.items():
    print(entity)
    
    print("-" * len(entity))
    pprint.pprint(tables_cols)

azureresource
-------------
[('SecurityNestedRecommendation', 'AssessedResourceId'),
 ('SecurityNestedRecommendation', 'Id'),
 ('SecurityEvent', '_ResourceId')]
url
---
[('SecurityNestedRecommendation', 'RemediationDescription'),
 ('SecurityIncident', 'IncidentUrl')]
account
-------
[('SecurityEvent', 'Account'),
 ('SecurityEvent', 'FilePath'),
 ('SecurityEvent', 'MandatoryLabel'),
 ('SecurityEvent', 'ObjectName'),
 ('SecurityEvent', 'SubjectAccount'),
 ('SecurityEvent', 'SubjectUserSid'),
 ('SecurityEvent', 'TargetAccount'),
 ('SecurityEvent', 'TargetSid'),
 ('SecurityEvent', 'TargetUser'),
 ('SecurityEvent', 'TargetUserSid')]
host
----
[('SecurityEvent', 'Computer'), ('SecurityEvent', 'TargetUserName')]
hash
----
[('SecurityEvent', 'FileHash')]
ipaddress
---------
[('SecurityEvent', 'IpAddress')]
process
-------
[('SecurityEvent', 'NewProcessName'),
 ('SecurityEvent', 'ParentProcessName'),
 ('SecurityEvent', 'Process'),
 ('SecurityEvent', 'ProcessName')]


# Autogenerating queries

In [145]:
query_template = """
{table}
| where TimeGenerated > ago(1d)
| where {ColumnName} == "{{MySearch}}"
"""

In [51]:
table_regexes

{'SigninLogs': {'TenantId': {'GUID_REGEX': (1.0, 1.0)},
  'ResourceId': {'LXPATH_REGEX': (1.0, 1.0), 'RESOURCEID_REGEX': (1.0, 1.0)},
  'CorrelationId': {'GUID_REGEX': (1.0, 1.0)},
  'Resource': {'DNS_REGEX': (1.0, 1.0)},
  'ResourceGroup': {'DNS_REGEX': (1.0, 1.0)},
  'AlternateSignInName': {'EMAIL_REGEX': (1.0, 0.34)},
  'AppDisplayName': {'DNS_REGEX': (0.01, 0.01)},
  'AppId': {'GUID_REGEX': (1.0, 1.0)},
  'Id': {'GUID_REGEX': (1.0, 1.0)},
  'IPAddress': {'IPV4_REGEX': (0.94, 0.94), 'IPV6_REGEX': (0.06, 0.06)},
  'OriginalRequestId': {'GUID_REGEX': (1.0, 1.0)},
  'ResourceIdentity': {'GUID_REGEX': (1.0, 0.95)},
  'UserId': {'GUID_REGEX': (1.0, 1.0)},
  'UserPrincipalName': {'EMAIL_REGEX': (1.0, 1.0)},
  'AADTenantId': {'GUID_REGEX': (1.0, 1.0)},
  'SignInIdentifier': {'EMAIL_REGEX': (1.0, 0.34)},
  'ResourceTenantId': {'GUID_REGEX': (1.0, 1.0)},
  'HomeTenantId': {'GUID_REGEX': (1.0, 0.98)}},
 'SecurityEvent': {'TenantId': {'GUID_REGEX': (1.0, 1.0)},
  'Account': {'WINPATH_REGEX': (

In [146]:
def generate_query(entity_type, search_value):
    email_queries = []
    for table, matches in table_regexes.items():
        for col, regexes in matches.items():
            print(regexes)
            if entity_type in regexes.keys():

                for label, entity in regexes.items():
                    if entity_type == entity:
                        print("found match", table, col, entity)
                        query = query_template.format(table=table, ColumnName=col)
                        email_queries.append(query.format(MySearch=search_value))
    return email_queries

In [147]:
queries=generate_query("EMAIL_REGEX", "michelle.duncan@conocophillips.com")
print(queries)

{'GUID_REGEX': (1.0, 1.0)}
{'GUID_REGEX': (1.0, 1.0)}
{'GUID_REGEX': (1.0, 1.0)}
{'GUID_REGEX': (1.0, 1.0)}
{'GUID_REGEX': (1.0, 1.0)}
{'GUID_REGEX': (1.0, 1.0)}
{'DNS_REGEX': (1.0, 1.0)}
{'LXPATH_REGEX': (1.0, 1.0), 'RESOURCEID_REGEX': (1.0, 1.0)}
{'GUID_REGEX': (1.0, 1.0)}
{'LXPATH_REGEX': (1.0, 1.0), 'RESOURCEID_REGEX': (1.0, 1.0)}
[]


In [148]:
for query in queries:
    query_result=qry_prov.exec_query(query)
    print(query)
    print("-" * len(query))
    display(query_result)