# Generate Signatures for Tables

## MSTICPYConfig

In [1]:
%%writefile msticpyconfig.yaml
AzureSentinel:
  Workspaces:
    ASIHuntOMSWorkspaceV4:
      TenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47
      WorkspaceId: 52b1ab41-869e-4138-9e40-2a4457f09bf0
    CyberSecuritySoc:
      TenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47
      WorkspaceId: 8ecf8077-cf51-4820-aadd-14040956f35d
    Default:
      TenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47
      WorkspaceId: 8ecf8077-cf51-4820-aadd-14040956f35d

Overwriting msticpyconfig.yaml


## Initiializing MSTICPy

In [2]:
# Core MSTICPy initialization for Notebooks
from msticpy.nbtools import nbinit
nbinit.init_notebook(namespace=globals());

# Load query providers (typically you'll be using just one)
qry_prov = QueryProvider("AzureSentinel")

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


Please wait. Loading Kqlmagic extension...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Connect to workspace 
```
WorkspaceConfig(workspace=WS_NAME)
```

By default, uses the Default entry

In [3]:
qry_prov.connect(WorkspaceConfig())

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Read in Tables to be Analyzed

In [4]:
import os

with open('tables.txt') as file:
    tables = file.read().splitlines()

# Create folder if it doesn't exist
    
for table in tables:
    if not os.path.isdir('./data/' + table):
        os.makedirs('./data/' + table)

In [5]:
# tableName contains the name of the table we are querying (string)
# maxAge is the oldest result we want to retrieve in weeks minimum 1 week(int)

def queryTable(tableName, maxAge=1):

    assert(maxAge >= 1)
    df = pd.DataFrame()
    
    for weeks in range(0, maxAge*7, 7):
        print("Retrieving data for {tableName} from {week} week(s) ago".format(tableName = tableName, week = int(weeks/7) + 1))
        queryString = "{tableName} | where TimeGenerated between (ago({end}d) .. ago({start}d))".format(tableName = tableName, start = weeks, end = weeks + 7)
        queryResult = qry_prov.exec_query(queryString)
        df = pd.concat([df, queryResult])
    return df

In [6]:
# Query N and Pickle N weeks worth of data
# Generates and saves raw data from queries
# Takes in a list of tables, a boolean that determines if we are drawing from a random sample
# and an integer n is either the number of weeks if sample is false or n is the number of smamples

def generateData(tables, sample=True, n=52):
    for table in tables:
        
        # If we are not sampling, use the queryTable to pull n weeks and concatenate that
        # else pull a random sample of n rows
        
        df = pd.DataFrame()
        
        if sample == False:
            for weeks in range(0, maxAge*7, 7):
                print("Retrieving data for {tableName} from {week} week(s) ago".format(tableName = tableName, week = int(n/7) + 1))
                queryString = "{tableName} | where TimeGenerated between (ago({end}d) .. ago({start}d))".format(tableName = tableName, start = n, end = n + 7)
                queryResult = qry_prov.exec_query(queryString)
                df = pd.concat([df, queryResult])
        else:
            print("Retrieving a sample of {nSamples} rows from {tableName}.".format(tableName = table, nSamples = n))
            queryString = "{tableName} | sample {nSamples}".format(tableName = table, nSamples = n)
            df = qry_prov.exec_query(queryString)
        df.to_pickle('./data/{table}/raw.pkl'.format(table = table))
        print("{table} pickld and saved in ./data/{table}/raw.pkl\n".format(table = table))

## Sample and Pickle 10,000 Rows

In [8]:
generateData(tables, True, 10000)

Retrieving a sample of 10000 rows from OfficeActivity.


<IPython.core.display.Javascript object>

OfficeActivity pickld and saved in ./data/OfficeActivity/raw.pkl

Retrieving a sample of 10000 rows from SigninLogs.


<IPython.core.display.Javascript object>

SigninLogs pickld and saved in ./data/SigninLogs/raw.pkl

Retrieving a sample of 10000 rows from SecurityEvent.


<IPython.core.display.Javascript object>

SecurityEvent pickld and saved in ./data/SecurityEvent/raw.pkl

Retrieving a sample of 10000 rows from CommonSecurityLog.


<IPython.core.display.Javascript object>

CommonSecurityLog pickld and saved in ./data/CommonSecurityLog/raw.pkl



## Read Data from Pickle

In [9]:
rawData = {}
for table in tables:
    rawData[table] = pd.read_pickle("data/{tableName}/raw.pkl".format(tableName=table))

## Binarize Tables

In [10]:
binData = {}

for table in rawData:
    
    # Replace empty cells with NaN
    replace = rawData[table].replace(r'^\s+$', np.nan, regex=True)
    
    # Replace NaN values with 0 and all others with 1
    binData[table] = replace.notnull().astype('int')
    binData[table].to_pickle('./data/{table}/binarized.pkl'.format(table = table))

In [11]:
# Takes in a string of 0 and 1 as the signature
# Takes in a list of column names
# Returns a list of features that are present and features that are missing

def getPresentColumns(signature, columns):
    present = []
    missing = []
    for index in range(len(signature)):
        if int(signature[index]):
            present.append(columns[index])
        else:
            missing.append(columns[index])
    return present, missing

In [32]:
# Counts the number of times a datapoint shows up in the features
# For example, it counts how many times the IP 44.150.161.58 shows up in the clientIP column

def countTypes(row, columns, presentFeatures, featureDict):
    for index in range(len(row)):
        
        currentFeature = columns[index]
        value = row[index]
        
        # If the feature is missing we won't count it
        if currentFeature not in presentFeatures:
            continue
            
        #If the value in the column is a list or a dictionary we need to cast to a tuple to use it as a key
        if isinstance(value, list) or isinstance(value, dict):
            value = str(value)

        if value not in featureDict[currentFeature]:
            featureDict[currentFeature][value] = 1
        else:
            featureDict[currentFeature][value] += 1
    return featureDict

In [37]:
print(rawData['OfficeActivity'].iloc[0])

TenantId                                                                   8ecf8077-cf51-4820-aadd-14040956f35d
Application                                                                                                    
UserDomain                                                                                                     
UserAgent                                                                                                      
RecordType                                                                                                   50
                                                                 ...                                           
OperationProperties    [{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]
AppId                                                                      414a677a-e50f-46ea-b89c-aebb8a9efbe2
ClientAppId                                                                                             

## Generate Dictionary of Signatures

In [38]:
tableSignatures = {}
for table in binData:
    print('Generating dictionary for {tableName}'.format(tableName = table))
    columns = binData[table].columns
    signatureDict = {}
    
    for index, row in binData[table].iterrows():
        signature = ''.join(map(str, row.values.tolist()))
        
        # If this signature does not exist
        if signature not in signatureDict:
            
            # Identify Present/Missing features
            present, missing = getPresentColumns(signature, columns)
            # Generate and update number of different data types in the feature dictionary
            featureDict = {i: {} for i in present}
            featureDict = countTypes(rawData[table].iloc[index], columns, present, featureDict)
            
            signatureDict[signature] = {
                'count': 1,
                'presentFeatures': present,
                'missingFeatures': missing,
                'featureDict': featureDict
            }
        else:
            signatureDict[signature]['count'] += 1
            signatureDict[signature]['featureDict'] = countTypes(rawData[table].iloc[index], columns, signatureDict[signature]['presentFeatures'], signatureDict[signature]['featureDict'])
            
    tableSignatures[table] = signatureDict

Generating dictionary for OfficeActivity
Generating dictionary for SigninLogs
Generating dictionary for SecurityEvent
Generating dictionary for CommonSecurityLog


In [47]:
print('count:',signatureDict['11111111111111011111011110111100111101110111111111110111111001111111100111111101011111111111001111011111111010101010101011111111111111111110101111110011']['count'],'\n')
print('presentFeatures:',signatureDict['11111111111111011111011110111100111101110111111111110111111001111111100111111101011111111111001111011111111010101010101011111111111111111110101111110011']['presentFeatures'],'\n')
print('missingFeatures:',signatureDict['11111111111111011111011110111100111101110111111111110111111001111111100111111101011111111111001111011111111010101010101011111111111111111110101111110011']['missingFeatures'],'\n')

for i in signatureDict['11111111111111011111011110111100111101110111111111110111111001111111100111111101011111111111001111011111111010101010101011111111111111111110101111110011']['featureDict']:
    print(i, signatureDict['11111111111111011111011110111100111101110111111111110111111001111111100111111101011111111111001111011111111010101010101011111111111111111110101111110011']['featureDict'][i],'\n')

count: 550 

presentFeatures: ['TenantId', 'SourceSystem', 'TimeGenerated', 'ReceiptTime', 'DeviceVendor', 'DeviceProduct', 'DeviceEventClassID', 'LogSeverity', 'OriginalLogSeverity', 'DeviceAction', 'SimplifiedDeviceAction', 'Computer', 'CommunicationDirection', 'DeviceFacility', 'DestinationIP', 'DeviceAddress', 'DeviceName', 'Message', 'Protocol', 'SourceIP', 'RemoteIP', 'RemotePort', 'MaliciousIP', 'IndicatorThreatType', 'ThreatDescription', 'ThreatConfidence', 'ReportReferenceLink', 'MaliciousIPCountry', 'DeviceVersion', 'Activity', 'ApplicationProtocol', 'DestinationDnsDomain', 'DestinationServiceName', 'DestinationTranslatedAddress', 'DeviceDnsDomain', 'DeviceExternalID', 'DeviceInboundInterface', 'DeviceNtDomain', 'DeviceOutboundInterface', 'DevicePayloadId', 'ProcessName', 'DeviceTranslatedAddress', 'DestinationHostName', 'DestinationMACAddress', 'DestinationNTDomain', 'DestinationUserPrivileges', 'DestinationProcessName', 'DeviceTimeZone', 'DestinationUserID', 'DestinationUse

In [30]:
import ipywidgets

text = ipywidgets.Select(description='Hello World', options=[1,2,3])
text

Select(description='Hello World', options=(1, 2, 3), value=1)

In [31]:
print(text.value)

1


In [57]:
print(rawData['CommonSecurityLog']['Protocol'])

0       6
1       6
2       6
3       6
4        
       ..
9995     
9996    6
9997    6
9998     
9999     
Name: Protocol, Length: 10000, dtype: object
