# Generate Signatures for Tables

## MSTICPYConfig

In [95]:
%%writefile msticpyconfig.yaml
AzureSentinel:
  Workspaces:
    ASIHuntOMSWorkspaceV4:
      TenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47
      WorkspaceId: 52b1ab41-869e-4138-9e40-2a4457f09bf0
    CyberSecuritySoc:
      TenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47
      WorkspaceId: 8ecf8077-cf51-4820-aadd-14040956f35d
    Default:
      TenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47
      WorkspaceId: 8ecf8077-cf51-4820-aadd-14040956f35d

Overwriting msticpyconfig.yaml


## Initializing MSTICPy

In [96]:
# Core MSTICPy initialization for Notebooks
# init_notebook is handed this notebook's globals() as its namespace.
# NOTE(review): QueryProvider (used below) is never imported explicitly in this
# cell, so it is presumably injected into globals() by init_notebook - confirm
# against the installed msticpy version.
from msticpy.nbtools import nbinit
nbinit.init_notebook(namespace=globals());

# Load query providers (typically you'll be using just one)
# "AzureSentinel" matches the top-level section name written to msticpyconfig.yaml above.
qry_prov = QueryProvider("AzureSentinel")

## Connect to workspace 
```
WorkspaceConfig(workspace=WS_NAME)
```

By default (when no `workspace` argument is given), `WorkspaceConfig()` uses the `Default` entry from `msticpyconfig.yaml`.

In [97]:
# Connect the query provider using a WorkspaceConfig built with no arguments.
# NOTE(review): WorkspaceConfig is not imported in this notebook; presumably it is
# made available by nbinit.init_notebook(namespace=globals()) - confirm.
qry_prov.connect(WorkspaceConfig())

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Read in Tables to be Analyzed

In [98]:
from pathlib import Path
# Load the names of the tables to analyze from tables.txt (one table name per line).
# On any read problem the notebook keeps going with an empty list.
try:
    with open('tables.txt') as file:
        # Strip whitespace and skip blank lines so they never become table names
        # (an empty name would produce an invalid query and a bogus folder).
        tables = [line.strip() for line in file if line.strip()]
    print('Successfully read from tables.txt')
except FileNotFoundError:
    print('File not found, tables set to [].')
    tables = []
except (OSError, UnicodeDecodeError) as err:
    # Only catch genuine read/decoding failures; a bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit.
    print(f'File was unable to be read, tables set to []. ({err})')
    tables = []


# Create folder if it doesn't exist

for table in tables:
    # parents=True also creates ./data itself on a fresh checkout
    (Path.cwd() / "data" / table).mkdir(parents=True, exist_ok=True)

Successfully read from tables.txt


In [99]:
# Query N and Pickle N weeks worth of data
# Generates and saves raw data from queries

def generateData(tables, sample=True, n=52):
    """Query every table and pickle the raw result to ./data/<table>/raw.pkl.

    Uses the module-level query provider `qry_prov` and `pd` (pandas).

    Args:
        tables: iterable of table names to query.
        sample: if True, pull a random sample of `n` rows per table;
            if False, pull the last `n` weeks, one query per week, and
            concatenate the weekly results.
        n: number of sampled rows (sample=True) or number of weeks (sample=False).

    Returns:
        None. One pickle file is written per table.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # Limit for max rows
    limit = 10001

    for table in tables:

        if not sample:
            # Collect the weekly frames and concatenate once at the end
            # (growing a DataFrame inside the loop is quadratic).
            weeklyFrames = []
            for weeks in range(0, n*7, 7):
                print(f"Retrieving data for {table} from {int(weeks/7) + 1} week(s) ago")
                queryString = f"{table} | where TimeGenerated between (ago({weeks+7}d) .. ago({weeks}d))"
                queryResult = qry_prov.exec_query(queryString)
                # Skip anything that is not a DataFrame (e.g. a failed query)
                # so that pd.concat below cannot choke on it.
                if isinstance(queryResult, pd.DataFrame):
                    weeklyFrames.append(queryResult)
            # pd.concat([]) raises, so fall back to an empty frame when n == 0
            # or when no query returned data.
            df = pd.concat(weeklyFrames) if weeklyFrames else pd.DataFrame()
        else:
            print(f"Retrieving a sample of {n} rows from {table}.")
            queryString = f"{table} | sample {n} | limit {limit}"
            df = qry_prov.exec_query(queryString)

        # Make sure the target folder exists before writing into it
        Path('data', table).mkdir(parents=True, exist_ok=True)
        df.to_pickle(f'./data/{table}/raw.pkl', protocol=5)
        print(f"{table} pickld and saved in ./data/{table}/raw.pkl\n")

## Sample and Pickle 10,000 Rows

In [100]:
# Pull a random sample of 10,000 rows per table and pickle it
generateData(tables, sample=True, n=10000)

Retrieving a sample of 10000 rows from OfficeActivity.


<IPython.core.display.Javascript object>

OfficeActivity pickld and saved in ./data/OfficeActivity/raw.pkl

Retrieving a sample of 10000 rows from SigninLogs.


<IPython.core.display.Javascript object>

SigninLogs pickld and saved in ./data/SigninLogs/raw.pkl

Retrieving a sample of 10000 rows from SecurityEvent.


<IPython.core.display.Javascript object>

SecurityEvent pickld and saved in ./data/SecurityEvent/raw.pkl

Retrieving a sample of 10000 rows from CommonSecurityLog.


<IPython.core.display.Javascript object>

CommonSecurityLog pickld and saved in ./data/CommonSecurityLog/raw.pkl



## Read Data from Pickle

In [102]:
import pandas as pd
import numpy as np

# Map each table name to the raw DataFrame that was pickled for it
rawData = {}
for table in tables:
    picklePath = f"data/{table}/raw.pkl"
    rawData[table] = pd.read_pickle(picklePath)

## Clean Tables

In [103]:
import datetime
import re

# Contain exact matches you want to filter out of table
exactMatches = []

# Contains regular expressions you want to filter out of table
regexes = [
    re.compile(r'^.*[Tt][Ii][Mm][Ee].*$'), # Regex for checking if the word time (case insensitive) is in the string. No ignorecase flag
]


def _isBlankValue(value):
    """Return True when a single cell is an empty string or a missing value (None/NaN/NaT)."""
    if isinstance(value, str):
        return value == ''
    missing = pd.isna(value)
    # pd.isna returns an array for array-like cells; such a cell is not a blank scalar
    return bool(missing) if isinstance(missing, (bool, np.bool_)) else False


def _findEmptyColumns(frame):
    """Return the names of the columns in which every cell is blank, in column order."""
    # Positional access keeps this working even if a column name is repeated.
    return [
        column
        for position, column in enumerate(frame.columns)
        if bool(frame.iloc[:, position].map(_isBlankValue).all())
    ]


cleanData = {}

for table in rawData:

    print(f'Cleaning table {table}')

    # Remove features that may be continuous values (i.e. time) using regular expressions and exact matches
    droppedFeatures = []
    for feature in rawData[table]:

        # Check if this feature is included in our exactMatches to remove
        if feature in exactMatches:
            print(f'{feature} due to being an exact match')
            droppedFeatures.append(feature)
            continue

        for regex in regexes:
            if regex.match(feature):
                print(f'{feature} due to being a match with regular expression: {regex}')
                droppedFeatures.append(feature)
                break

    # As before, the filtered frame replaces the entry in rawData; all columns are
    # dropped in one call instead of copying the frame once per dropped column.
    rawData[table] = rawData[table].drop(columns=droppedFeatures)

    # Remove unhashable types such as lists or dictionaries and convert them to a string
    stringify = lambda x: str(x) if isinstance(x, (list, dict, datetime.datetime)) else x
    filteredTable = rawData[table]
    # DataFrame.applymap is deprecated in newer pandas in favour of DataFrame.map;
    # use whichever this pandas version offers.
    elementwise = filteredTable.map if hasattr(filteredTable, 'map') else filteredTable.applymap
    cleanTable = elementwise(stringify)

    # Finds empty columns to prevent them from being dropped.
    # NaN never compares equal to itself, so blanks are detected with pd.isna
    # rather than `== np.nan`; this also copes with zero-row tables.
    originalColumns = list(cleanTable.columns)
    emptyCol = _findEmptyColumns(cleanTable)

    # Copy columns over to be added back after duplicates are removed
    col = cleanTable[emptyCol]

    # De-duplicate only the non-empty columns. If the empty ones took part, the
    # first of them would survive and then be added a second time below.
    # Transpose, drop duplicate rows, transpose back (same technique as before).
    deduped = cleanTable.drop(columns=emptyCol).T.drop_duplicates().T

    # Add empty columns back into table and restore the original column order
    survivors = set(deduped.columns) | set(emptyCol)
    orderedColumns = [column for column in originalColumns if column in survivors]
    cleanTable = pd.concat([deduped, col], axis=1)[orderedColumns]

    # Save cleaned data in a pickled file
    cleanTable.to_pickle(f'./data/{table}/cleaned.pkl')

    cleanData[table] = cleanTable

cleanData['OfficeActivity']

Cleaning table OfficeActivity
TimeGenerated due to being a match with regular expression: re.compile('^.*[Tt][Ii][Mm][Ee].*$')
ElevationTime due to being a match with regular expression: re.compile('^.*[Tt][Ii][Mm][Ee].*$')
ElevationApprovedTime due to being a match with regular expression: re.compile('^.*[Tt][Ii][Mm][Ee].*$')
Start_Time due to being a match with regular expression: re.compile('^.*[Tt][Ii][Mm][Ee].*$')
Cleaning table SigninLogs
TimeGenerated due to being a match with regular expression: re.compile('^.*[Tt][Ii][Mm][Ee].*$')
CreatedDateTime due to being a match with regular expression: re.compile('^.*[Tt][Ii][Mm][Ee].*$')
ProcessingTimeInMilliseconds due to being a match with regular expression: re.compile('^.*[Tt][Ii][Mm][Ee].*$')
Cleaning table SecurityEvent
TimeGenerated due to being a match with regular expression: re.compile('^.*[Tt][Ii][Mm][Ee].*$')
TimeCollected due to being a match with regular expression: re.compile('^.*[Tt][Ii][Mm][Ee].*$')
NewTime due to being

Unnamed: 0,TenantId,Application,UserAgent,RecordType,Operation,OrganizationId,UserType,UserKey,OfficeWorkload,ResultStatus,OfficeObjectId,UserId,ClientIP,Site_,ItemType,EventSource,MachineId,Site_Url,SourceRelativeUrl,SourceFileName,SourceFileExtension,Event_Data,Parameters,ExternalAccess,OriginatingServer,...,Client,Actor,ActorContextId,ActorIpAddress,InterSystemsId,IntraSystemId,SupportTicketId,TargetContextId,EffectiveOrganization,ElevationApprover,ElevationRequestId,ElevationRole,GenericInfo,AzureActiveDirectory_EventType,AADTarget,ChannelType,AddonName,OldValue,ChatThreadId,ChatName,AppDistributionMode,TargetUserId,OperationScope,AzureADAppId,_ResourceId
0,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F8A6FC7,Exchange,Succeeded,,MeganB@seccxp.ninja,,,,,,,,,,,,False,DM5PR0601MB3686 (15.20.4200.000)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F8A6FC7,Exchange,Succeeded,,MeganB@seccxp.ninja,,,,,,,,,,,,False,DM5PR0601MB3686 (15.20.4200.000)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003C017CC9,Exchange,Succeeded,,FMorris@seccxpninja.onmicrosoft.com,,,,,,,,,,,,False,CO6PR06MB7122 (15.20.4200.000)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
3,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003C017CC9,Exchange,Succeeded,,FMorris@seccxpninja.onmicrosoft.com,,,,,,,,,,,,False,CO6PR06MB7122 (15.20.4200.000)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003C017CC9,Exchange,Succeeded,,FMorris@seccxpninja.onmicrosoft.com,,,,,,,,,,,,False,CO6PR06MB7122 (15.20.4200.000)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320014411A696,Exchange,Succeeded,,lhunter@contosohotels.com,,,,,,,,,,,,False,BYAPR06MB5814 (15.20.4173.030)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,
9996,8ecf8077-cf51-4820-aadd-14040956f35d,,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.447...",SharePointFileOperation,FileUploaded,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,i:0h.f|membership|10032001582e7cd2@live.com,OneDrive,,https://seccxpninja-my.sharepoint.com/personal/lrodriguez_seccxp_ninja/Documents/customer_info_i...,lrodriguez@seccxp.ninja,40.81.121.135,cad52db8-227b-4276-9a00-a63f40f5493f,File,SharePoint,,https://seccxpninja-my.sharepoint.com/personal/lrodriguez_seccxp_ninja/,Documents,customer_info_id-9072600.docx,docx,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
9997,8ecf8077-cf51-4820-aadd-14040956f35d,,Apache-HttpClient/4.5.6 (Java/1.8.0_242),SharePointFileOperation,FileAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,i:0i.t|00000003-0000-0ff1-ce00-000000000000|app@sharepoint,OneDrive,,https://seccxpninja-my.sharepoint.com/personal/lrodriguez_seccxp_ninja/Documents/customer_info_i...,app@sharepoint,40.84.4.93,cad52db8-227b-4276-9a00-a63f40f5493f,File,SharePoint,,https://seccxpninja-my.sharepoint.com/personal/lrodriguez_seccxp_ninja/,Documents,customer_info_id-9072878.docx,docx,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
9998,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F88D275,Exchange,Succeeded,,seb@seccxp.ninja,,,,,,,,,,,,False,DM5PR06MB3180 (15.20.4173.020)\r\n,...,,,,,,,,,,,,,,,,,,,,,,,,,


## Binarize Data

In [104]:
# Presence/absence encoding per table: 1 where a cell holds a value, 0 where it is empty
binData = {}

for table in cleanData:

    # Treat empty / whitespace-only strings as missing values.
    # Skip this replacement if empty strings should count as present.
    blanksAsNaN = cleanData[table].replace(r'^\s*$', np.nan, regex=True)

    # notna() is True for real values and False for NaN; cast that to 1 / 0
    binTable = blanksAsNaN.notna().astype('int')

    # Persist the binary frame next to the other artifacts of this table
    binTable.to_pickle(f'./data/{table}/binarized.pkl')

    binData[table] = binTable

binData['OfficeActivity']

Unnamed: 0,TenantId,Application,UserAgent,RecordType,Operation,OrganizationId,UserType,UserKey,OfficeWorkload,ResultStatus,OfficeObjectId,UserId,ClientIP,Site_,ItemType,EventSource,MachineId,Site_Url,SourceRelativeUrl,SourceFileName,SourceFileExtension,Event_Data,Parameters,ExternalAccess,OriginatingServer,...,Client,Actor,ActorContextId,ActorIpAddress,InterSystemsId,IntraSystemId,SupportTicketId,TargetContextId,EffectiveOrganization,ElevationApprover,ElevationRequestId,ElevationRole,GenericInfo,AzureActiveDirectory_EventType,AADTarget,ChannelType,AddonName,OldValue,ChatThreadId,ChatName,AppDistributionMode,TargetUserId,OperationScope,AzureADAppId,_ResourceId
0,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9996,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9997,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9998,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [105]:
def getPresentColumns(signature, columns):
    """Split column names into present and missing according to a signature.

    Args:
        signature: sequence of 0/1 flags (e.g. the string '1010'), one per column.
        columns: column names, indexable by position, aligned with `signature`.

    Returns:
        (present, missing): two lists of column names, for flags that are
        non-zero and zero respectively.
    """
    # buckets[0] collects the missing names, buckets[1] the present ones
    buckets = ([], [])
    for position, flag in enumerate(signature):
        buckets[1 if int(flag) else 0].append(columns[position])
    missing, present = buckets
    return present, missing

In [106]:
# Counts the number of times a datapoint shows up in the features
# For example, it counts how many times the IP 44.150.161.58 shows up in the clientIP column

def countTypes(row, columns, presentFeatures, featureDict):
    """Add the values of one row to the per-feature value counters.

    Args:
        row: the values of one record, aligned by position with `columns`
            (a pandas Series or any positionally indexable sequence).
        columns: column names, indexable by position.
        presentFeatures: names of the features to count; all others are skipped.
        featureDict: dict mapping feature name -> {value: count}. It is updated
            in place and must already contain a dict for every present feature.

    Returns:
        The same `featureDict` object, updated.
    """
    # On a pandas Series, row[<int>] is a label lookup whose positional fallback
    # is deprecated; .iloc is the explicit positional accessor.
    cells = row.iloc if hasattr(row, 'iloc') else row
    # Set membership instead of scanning the list once per cell
    wanted = set(presentFeatures)

    for index in range(len(row)):

        currentFeature = columns[index]

        # If the feature is missing we won't count it
        if currentFeature not in wanted:
            continue

        value = cells[index]
        counts = featureDict[currentFeature]
        counts[value] = counts.get(value, 0) + 1
    return featureDict

## Generate Dictionary of Signatures

In [107]:
import json

# For every table, group rows by their presence/absence signature (the row of the
# binarized table joined into a string of 0s and 1s) and, per signature, count the
# rows and the distinct values seen in each present feature.
tableSignatures = {}
for table in binData:
    print(f'Generating dictionary for {table}')
    columns = binData[table].columns
    signatureDict = {}

    # `position` is the 0-based row number. The `index` label yielded by iterrows()
    # is not a valid .iloc position when the index is not 0..n-1 (e.g. repeated
    # labels after concatenating several query results), so it is not used for lookup.
    for position, (index, row) in enumerate(binData[table].iterrows()):
        signature = ''.join(map(str, row.values.tolist()))

        # If this signature does not exist
        if signature not in signatureDict:

            # Identify Present/Missing features
            present, missing = getPresentColumns(signature, columns)

            # Start with a zero count and empty per-feature counters;
            # the shared update below handles first and repeat sightings alike.
            signatureDict[signature] = {
                'count': 0,
                'presentFeatures': present,
                'missingFeatures': missing,
                'featureDict': {i: {} for i in present}
            }

        # Generate and update number of different data types in the feature dictionary
        entry = signatureDict[signature]
        entry['count'] += 1
        entry['featureDict'] = countTypes(
            cleanData[table].iloc[position],
            columns,
            entry['presentFeatures'],
            entry['featureDict'],
        )

    tableSignatures[table] = signatureDict
    with open(f'./data/{table}/signatureDictionary.json', 'w') as f:
        json.dump(signatureDict, f)

Generating dictionary for OfficeActivity
Generating dictionary for SigninLogs
Generating dictionary for SecurityEvent
Generating dictionary for CommonSecurityLog


In [108]:
from pprint import pprint

# Pretty-print one example entry.
# NOTE: `signatureDict` and `signature` are not defined in this cell; they are the
# loop variables left over from the signature-generation cell, i.e. the dictionary
# of the last table processed and the signature of the last row iterated.
pprint(signatureDict[signature])

{'count': 450,
 'featureDict': {'Activity': {'Allowed': 448,
                              'Not allowed to browse this category': 2},
                 'AdditionalExtensions': {'reason=Allowed;about.google/assets-products/js/index.min.js?cache=992d56c;about.google/intl/en/products/?tab=wh;outcome=200;cat=Web Search;rulelabel=None;ruletype=None;urlclass=Business Use;devicemodel=NA': 1,
                                          'reason=Allowed;ad.turn.com/r/cs?pid=;9&gdpr=0;us-u.openx.net/w/1.0/cm?id=;c6a5ba0d-ce02-41bd-a1ea-842c68bd5108&ph=;8f5ed5d4-642c-4222-968a-d709c87ac3c8&us_privacy=;&r=;https://cms-xch-chicago.33across.com/match?us_privacy=;&bidder_id=;70&external_user_id=;outcome=302;cat=Corporate Marketing;rulelabel=None;ruletype=None;urlclass=Business Use;devicemodel=NA': 1,
                                          'reason=Allowed;ads.avocet.io/getuid?url=;https://ps.eyeota.net/match?bid=;b2c3gb0&referrer_pid=;6bioi0v&uid={{UUID}}%0A;inet.detik.com/?tag_from=wp_firstnav_detikIn