# Generate Signatures for Tables

## MSTICPYConfig

In [41]:
%%writefile msticpyconfig.yaml
AzureSentinel:
  Workspaces:
    ASIHuntOMSWorkspaceV4:
      TenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47
      WorkspaceId: 52b1ab41-869e-4138-9e40-2a4457f09bf0
    CyberSecuritySoc:
      TenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47
      WorkspaceId: 8ecf8077-cf51-4820-aadd-14040956f35d
    Default:
      TenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47
      WorkspaceId: 8ecf8077-cf51-4820-aadd-14040956f35d

Overwriting msticpyconfig.yaml


## Initializing MSTICPy

In [42]:
# Core MSTICPy initialization for Notebooks
# init_notebook is handed this notebook's globals(); the trailing ';' keeps the
# call's return value from being echoed as cell output.
from msticpy.nbtools import nbinit
nbinit.init_notebook(namespace=globals());

# Load query providers (typically you'll be using just one)
# NOTE(review): QueryProvider is not imported explicitly in this cell —
# presumably init_notebook adds it to the namespace passed above; confirm.
qry_prov = QueryProvider("AzureSentinel")

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


Please wait. Loading Kqlmagic extension...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Connect to workspace 
```
WorkspaceConfig(workspace=WS_NAME)
```

By default, uses the Default entry

In [43]:
# Connect the query provider; WorkspaceConfig() is built without a workspace
# name, which (per the note above this cell) selects the "Default" entry.
qry_prov.connect(WorkspaceConfig())

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Read in Tables to be Analyzed

In [44]:
import os

# Read the table names to analyze, one per line. Surrounding whitespace is
# stripped and blank lines are skipped so that a stray empty line in
# tables.txt cannot produce an empty table name (which would otherwise turn
# into a './data/' folder and an empty query further down).
with open('tables.txt') as file:
    tables = [line.strip() for line in file if line.strip()]

# Create one output folder per table. exist_ok=True makes the cell safe to
# re-run without a separate "does it exist" check.
for table in tables:
    os.makedirs(os.path.join('.', 'data', table), exist_ok=True)

In [45]:
def queryTable(tableName, maxAge=1):
    """Pull the last ``maxAge`` weeks of a table, one 7-day window per query.

    Args:
        tableName (str): Name of the table to query.
        maxAge (int): How many weeks back to retrieve; must be at least 1.

    Returns:
        pandas.DataFrame: The weekly query results concatenated in query
        order (most recent week first). Each result keeps its own row index.

    Raises:
        AssertionError: If ``maxAge`` is smaller than 1.
    """
    assert maxAge >= 1, "maxAge must be at least 1 week"

    weeklyResults = []
    for weeks in range(0, maxAge * 7, 7):
        print("Retrieving data for {tableName} from {week} week(s) ago".format(tableName = tableName, week = weeks // 7 + 1))
        # Window runs from (weeks + 7) days ago up to `weeks` days ago
        queryString = "{tableName} | where TimeGenerated between (ago({end}d) .. ago({start}d))".format(tableName = tableName, start = weeks, end = weeks + 7)
        weeklyResults.append(qry_prov.exec_query(queryString))

    # Concatenate once at the end; concatenating inside the loop re-copies the
    # accumulated rows on every iteration.
    return pd.concat(weeklyResults)

In [46]:
def generateData(tables, sample=True, n=52):
    """Query each table and pickle the raw result to ./data/<table>/raw.pkl.

    Args:
        tables (list[str]): Names of the tables to pull.
        sample (bool): If true, pull a random sample of ``n`` rows per table;
            otherwise pull the last ``n`` weeks of data via ``queryTable``.
        n (int): Number of sampled rows when ``sample`` is true, otherwise the
            number of weeks to retrieve.

    Returns:
        None. The data is written to disk; the ./data/<table> folder must
        already exist.
    """
    for table in tables:

        if not sample:
            # Delegate to queryTable for the week-by-week pull. The previous
            # inline copy of that loop referenced undefined names (maxAge,
            # tableName) and built every query from n instead of the loop
            # variable, so this branch could never run.
            df = queryTable(table, n)
        else:
            print("Retrieving a sample of {nSamples} rows from {tableName}.".format(tableName = table, nSamples = n))
            queryString = "{tableName} | sample {nSamples}".format(tableName = table, nSamples = n)
            df = qry_prov.exec_query(queryString)

        df.to_pickle('./data/{table}/raw.pkl'.format(table = table))
        print("{table} pickld and saved in ./data/{table}/raw.pkl\n".format(table = table))

## Sample and Pickle 10,000 Rows

In [47]:
# Positional arguments are sample=True and n=10000: pull a random sample of
# 10,000 rows from every table in `tables` and pickle each result.
generateData(tables, True, 10000)

Retrieving a sample of 10000 rows from OfficeActivity.


<IPython.core.display.Javascript object>

OfficeActivity pickld and saved in ./data/OfficeActivity/raw.pkl

Retrieving a sample of 10000 rows from SigninLogs.


<IPython.core.display.Javascript object>

SigninLogs pickld and saved in ./data/SigninLogs/raw.pkl

Retrieving a sample of 10000 rows from SecurityEvent.


<IPython.core.display.Javascript object>

SecurityEvent pickld and saved in ./data/SecurityEvent/raw.pkl

Retrieving a sample of 10000 rows from CommonSecurityLog.


<IPython.core.display.Javascript object>

CommonSecurityLog pickld and saved in ./data/CommonSecurityLog/raw.pkl



## Read Data from Pickle

In [48]:
import pandas as pd
import numpy as np

# Load every table's pickled raw query result, keyed by table name
rawData = {}
for table in tables:
    picklePath = "data/{tableName}/raw.pkl".format(tableName=table)
    rawData[table] = pd.read_pickle(picklePath)

## Clean Tables

In [49]:
import datetime
import re

# Exact column names you want to filter out of every table
exactMatches = []

# Regular expressions; a column whose name matches any of them is filtered out
regexes = [
    r'^.*[Tt][Ii][Mm][Ee].*$', # Regex for checking if the word time (case insensitive) is in the string. No ignorecase flag
]

cleanData = {}

for table in rawData:

    print('Cleaning table {table}'.format(table=table))

    # Remove features that may be continuous values (i.e. time) using regular
    # expressions and exact matches. The names are collected first and dropped
    # in a single call rather than rebuilding the frame once per column.
    droppedFeatures = []
    for feature in rawData[table].columns:

        # Check if this feature is included in our exactMatches to remove
        if feature in exactMatches:
            print('{feature} due to being an exact match'.format(feature=feature))
            droppedFeatures.append(feature)
            continue

        for regex in regexes:
            if re.match(regex, feature):
                print('{feature} due to being a match with regular expression: {regex}'.format(feature=feature, regex=regex))
                droppedFeatures.append(feature)
                break

    # rawData itself is updated, so the filtered columns are gone from it too
    rawData[table] = rawData[table].drop(columns=droppedFeatures)

    # Convert unhashable types such as lists or dictionaries (and datetimes) to strings
    cleanTable = rawData[table].applymap(lambda x: str(x) if isinstance(x, (list, dict, datetime.datetime)) else x)

    originalOrder = list(cleanTable.columns)

    # Find empty columns (every cell is NaN or an empty string) so they can be
    # restored after de-duplication. NaN must be detected with isna(): a
    # comparison such as `value == np.nan` is always False. Casting to object
    # keeps the comparison with '' a plain element-wise equality check.
    emptyCol = []
    for column in originalOrder:
        values = cleanTable[column]
        if (values.isna() | values.astype(object).eq('')).all():
            emptyCol.append(column)

    # Transpose the cleaned table and drop duplicate rows. Re-transpose to get back to the original table
    dedupedTable = cleanTable.T.drop_duplicates().T

    # drop_duplicates keeps the first column of each identical group, so only
    # the empty columns that were actually removed are added back; re-adding
    # all of them would duplicate the one that survived.
    keptColumns = set(dedupedTable.columns)
    lostEmptyCol = [column for column in emptyCol if column not in keptColumns]
    restoredTable = pd.concat([dedupedTable, cleanTable[lostEmptyCol]], axis=1)

    # Reorder so the remaining columns appear in their original order
    emptyColSet = set(emptyCol)
    finalColumns = [column for column in originalOrder if column in keptColumns or column in emptyColSet]
    cleanTable = restoredTable[finalColumns]

    # Save cleaned data in a pickled file
    cleanTable.to_pickle('./data/{table}/cleaned.pkl'.format(table = table))

    cleanData[table] = cleanTable

cleanData['OfficeActivity']

Cleaning table OfficeActivity
TimeGenerated due to being a match with regular expression: ^.*[Tt][Ii][Mm][Ee].*$
ElevationTime due to being a match with regular expression: ^.*[Tt][Ii][Mm][Ee].*$
ElevationApprovedTime due to being a match with regular expression: ^.*[Tt][Ii][Mm][Ee].*$
Start_Time due to being a match with regular expression: ^.*[Tt][Ii][Mm][Ee].*$
Cleaning table SigninLogs
TimeGenerated due to being a match with regular expression: ^.*[Tt][Ii][Mm][Ee].*$
CreatedDateTime due to being a match with regular expression: ^.*[Tt][Ii][Mm][Ee].*$
ProcessingTimeInMilliseconds due to being a match with regular expression: ^.*[Tt][Ii][Mm][Ee].*$
Cleaning table SecurityEvent
TimeGenerated due to being a match with regular expression: ^.*[Tt][Ii][Mm][Ee].*$
TimeCollected due to being a match with regular expression: ^.*[Tt][Ii][Mm][Ee].*$
NewTime due to being a match with regular expression: ^.*[Tt][Ii][Mm][Ee].*$
PreviousTime due to being a match with regular expression: ^.*[Tt][Ii

Unnamed: 0,TenantId,Application,UserAgent,RecordType,Operation,OrganizationId,UserType,UserKey,OfficeWorkload,ResultStatus,OfficeObjectId,UserId,ClientIP,Site_,ItemType,EventSource,MachineId,Site_Url,SourceRelativeUrl,SourceFileName,SourceFileExtension,Event_Data,Parameters,ExternalAccess,OriginatingServer,...,SendAsUserMailboxGuid,SendOnBehalfOfUserSmtp,SendonBehalfOfUserMailboxGuid,ExtendedProperties,Client,Actor,ActorContextId,ActorIpAddress,InterSystemsId,IntraSystemId,SupportTicketId,TargetContextId,EffectiveOrganization,ElevationApprover,ElevationRequestId,ElevationRole,GenericInfo,AzureActiveDirectory_EventType,AADTarget,ChannelType,OldValue,ChatThreadId,ChatName,TargetUserId,_ResourceId
0,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F8A6FC7,Exchange,Succeeded,,MeganB@seccxp.ninja,,,,,,,,,,,,False,SN4PR0601MB3693 (15.20.4200.000)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F8A6FC7,Exchange,Succeeded,,MeganB@seccxp.ninja,,,,,,,,,,,,False,SN4PR0601MB3693 (15.20.4200.000)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F8A6FC7,Exchange,Succeeded,,MeganB@seccxp.ninja,,,,,,,,,,,,False,DM5PR0601MB3686 (15.20.4200.000)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
3,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F88D275,Exchange,Succeeded,,seb@seccxp.ninja,,,,,,,,,,,,False,DM5PR06MB3180 (15.20.4108.029)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,8ecf8077-cf51-4820-aadd-14040956f35d,,,ExchangeItem,Create,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003EA95AA4,Exchange,Succeeded,,adelevan@seccxpninja.onmicrosoft.com,2603:10b6:5:2ca::6,,,,,,,,,,,False,DM6PR06MB5721 (15.20.4200.000)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F88D275,Exchange,Succeeded,,seb@seccxp.ninja,,,,,,,,,,,,False,DM5PR06MB3180 (15.20.4195.029)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
9996,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F8A6FC7,Exchange,Succeeded,,MeganB@seccxp.ninja,,,,,,,,,,,,False,DM5PR0601MB3686 (15.20.4065.037)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
9997,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F8A6FC7,Exchange,Succeeded,,MeganB@seccxp.ninja,,,,,,,,,,,,False,DM5PR0601MB3686 (15.20.4219.021)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
9998,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F8A6FC7,Exchange,Succeeded,,MeganB@seccxp.ninja,,,,,,,,,,,,False,DM5PR0601MB3686 (15.20.4200.000)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,


## Binarize Data

In [50]:
binData = {}

for table in cleanData:

    # Presence matrix for this table:
    #   1. cells holding only whitespace (or nothing) become NaN
    #      (drop the .replace step if empty strings should count as present)
    #   2. NaN -> 0, every other value -> 1
    binTable = (
        cleanData[table]
        .replace(r'^\s*$', np.nan, regex=True)
        .notnull()
        .astype('int')
    )

    # Persist the binary table next to the raw and cleaned pickles
    binTable.to_pickle('./data/{table}/binarized.pkl'.format(table = table))

    binData[table] = binTable

binData['OfficeActivity']

Unnamed: 0,TenantId,Application,UserAgent,RecordType,Operation,OrganizationId,UserType,UserKey,OfficeWorkload,ResultStatus,OfficeObjectId,UserId,ClientIP,Site_,ItemType,EventSource,MachineId,Site_Url,SourceRelativeUrl,SourceFileName,SourceFileExtension,Event_Data,Parameters,ExternalAccess,OriginatingServer,...,SendAsUserMailboxGuid,SendOnBehalfOfUserSmtp,SendonBehalfOfUserMailboxGuid,ExtendedProperties,Client,Actor,ActorContextId,ActorIpAddress,InterSystemsId,IntraSystemId,SupportTicketId,TargetContextId,EffectiveOrganization,ElevationApprover,ElevationRequestId,ElevationRole,GenericInfo,AzureActiveDirectory_EventType,AADTarget,ChannelType,OldValue,ChatThreadId,ChatName,TargetUserId,_ResourceId
0,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9996,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9997,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9998,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [51]:
def getPresentColumns(signature, columns):
    """Split column names according to the bits of a signature.

    signature -- string of 0s and 1s, one character per column
    columns   -- column names, indexable by position

    Returns a (present, missing) pair of lists: names whose signature
    character is non-zero, and names whose character is zero.
    """
    present, missing = [], []
    for position, flag in enumerate(signature):
        bucket = present if int(flag) else missing
        bucket.append(columns[position])
    return present, missing

In [52]:
def countTypes(row, columns, presentFeatures, featureDict):
    """Tally how often each value occurs in the present features of a row.

    For example, it counts how many times the IP 44.150.161.58 shows up in
    the clientIP column.

    row             -- values of one record, indexable by position
    columns         -- column names, aligned with row by position
    presentFeatures -- names of the features to count; all others are skipped
    featureDict     -- {feature: {value: count}}; updated in place

    Returns the same featureDict object that was passed in.
    """
    for position in range(len(row)):
        featureName = columns[position]
        cellValue = row[position]

        # Only features that are present for this signature are counted
        if featureName in presentFeatures:
            tally = featureDict[featureName]
            tally[cellValue] = tally.get(cellValue, 0) + 1
    return featureDict

## Generate Dictionary of Signatures

In [53]:
import json

# For every table, group rows by their presence signature (the row of the
# binarized table joined into a string of 0s and 1s). Each signature records
# how many rows share it, which features are present/missing, and how often
# each value occurs in the present features.
tableSignatures = {}
for table in binData:
    print('Generating dictionary for {tableName}'.format(tableName = table))
    columns = binData[table].columns
    signatureDict = {}

    # iterrows() yields index *labels*, while .iloc needs *positions*. The two
    # only coincide for an index of exactly 0..n-1, so the position is tracked
    # separately to look up the matching row of the cleaned table.
    for position, (index, row) in enumerate(binData[table].iterrows()):
        signature = ''.join(map(str, row.values.tolist()))
        cleanRow = cleanData[table].iloc[position]

        # If this signature does not exist
        if signature not in signatureDict:

            # Identify Present/Missing features
            present, missing = getPresentColumns(signature, columns)
            # Generate and update number of different data types in the feature dictionary
            featureDict = {i: {} for i in present}
            featureDict = countTypes(cleanRow, columns, present, featureDict)

            signatureDict[signature] = {
                'count': 1,
                'presentFeatures': present,
                'missingFeatures': missing,
                'featureDict': featureDict
            }
        else:
            entry = signatureDict[signature]
            entry['count'] += 1
            entry['featureDict'] = countTypes(cleanRow, columns, entry['presentFeatures'], entry['featureDict'])

    tableSignatures[table] = signatureDict
    with open('./data/{table}/rawDictionary.json'.format(table=table), 'w') as f:
        json.dump(signatureDict, f)

Generating dictionary for OfficeActivity
Generating dictionary for SigninLogs
Generating dictionary for SecurityEvent
Generating dictionary for CommonSecurityLog


In [54]:
# Relies on variables left over from the loop in the previous cell:
# `signatureDict` is the dictionary built for the last table processed and
# `signature` is the signature of that table's last row.
print(signatureDict[signature])

{'count': 379, 'presentFeatures': ['TenantId', 'SourceSystem', 'DeviceVendor', 'DeviceProduct', 'DeviceEventClassID', 'LogSeverity', 'DeviceAction', 'CommunicationDirection', 'DestinationIP', 'SourceIP', 'DeviceVersion', 'Activity', 'ApplicationProtocol', 'DestinationServiceName', 'DestinationHostName', 'FileType', 'ReceivedBytes', 'SentBytes', 'RequestURL', 'RequestClientApplication', 'RequestMethod', 'SourceTranslatedAddress', 'SourceUserPrivileges', 'SourceUserName', 'DeviceCustomNumber1', 'DeviceCustomNumber1Label', 'DeviceCustomString1', 'DeviceCustomString1Label', 'DeviceCustomString2', 'DeviceCustomString2Label', 'DeviceCustomString3', 'DeviceCustomString3Label', 'DeviceCustomString4', 'DeviceCustomString4Label', 'DeviceCustomString5', 'DeviceCustomString5Label', 'DeviceCustomString6', 'DeviceCustomString6Label', 'AdditionalExtensions', 'Type', '_ResourceId'], 'missingFeatures': ['OriginalLogSeverity', 'DeviceFacility', 'DestinationPort', 'DeviceAddress', 'DeviceName', 'Message'