# Generate Signatures for Tables

## MSTICPYConfig

In [185]:
%%writefile msticpyconfig.yaml
AzureSentinel:
  Workspaces:
    ASIHuntOMSWorkspaceV4:
      TenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47
      WorkspaceId: 52b1ab41-869e-4138-9e40-2a4457f09bf0
    CyberSecuritySoc:
      TenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47
      WorkspaceId: 8ecf8077-cf51-4820-aadd-14040956f35d
    Default:
      TenantId: 72f988bf-86f1-41af-91ab-2d7cd011db47
      WorkspaceId: 8ecf8077-cf51-4820-aadd-14040956f35d

Overwriting msticpyconfig.yaml


## Initializing MSTICPy

In [186]:
# Core MSTICPy initialization for Notebooks
# init_notebook receives this notebook's globals(); the names used further down
# without an explicit import (QueryProvider, WorkspaceConfig, pd, np) are
# presumably injected by this call — TODO confirm against the installed msticpy.
from msticpy.nbtools import nbinit
nbinit.init_notebook(namespace=globals());

# Load query providers (typically you'll be using just one)
# "AzureSentinel" is also the top-level section name written to msticpyconfig.yaml above.
qry_prov = QueryProvider("AzureSentinel")

## Connect to workspace 
```
WorkspaceConfig(workspace=WS_NAME)
```

By default, uses the Default entry

In [187]:
qry_prov.connect(WorkspaceConfig())

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Read in Tables to be Analyzed

In [188]:
import os

with open('tables.txt') as file:
    # One table name per line. Blank or whitespace-only lines are skipped so a
    # spacer line cannot turn into an empty table name (which would create a
    # bare './data/' folder here and an invalid query later on).
    tables = [line.strip() for line in file if line.strip()]

# Create the per-table output folder if it doesn't exist.
# exist_ok=True keeps the cell safe to re-run and removes the gap between
# checking for the folder and creating it.
for table in tables:
    os.makedirs(os.path.join('./data', table), exist_ok=True)

In [189]:
# tableName: name of the table we are querying (string)
# maxAge: how many weeks back we want to retrieve, as a whole number of weeks (int, minimum 1)

def queryTable(tableName, maxAge=1):
    """Pull `maxAge` weeks of rows from a table, one 7-day window per query.

    Parameters
    ----------
    tableName : str
        Name of the table to query.
    maxAge : int, default 1
        Number of weeks to go back; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        The weekly results concatenated in query order (most recent window
        first). The index of each weekly result is kept as-is, not reset.
        An empty DataFrame is returned if no window produced a result.

    Raises
    ------
    AssertionError
        If `maxAge` is smaller than 1.
    """
    assert maxAge >= 1, "maxAge must be at least 1 week"

    weeklyResults = []
    for start in range(0, maxAge * 7, 7):
        print("Retrieving data for {tableName} from {week} week(s) ago".format(tableName = tableName, week = start // 7 + 1))
        queryString = "{tableName} | where TimeGenerated between (ago({end}d) .. ago({start}d))".format(tableName = tableName, start = start, end = start + 7)
        queryResult = qry_prov.exec_query(queryString)
        # pd.concat silently ignores None entries; skip them up front so that a
        # run where every window came back as None still yields an empty frame.
        if queryResult is not None:
            weeklyResults.append(queryResult)

    if not weeklyResults:
        return pd.DataFrame()

    # Concatenate once at the end: growing a frame with concat inside the loop
    # re-copies all earlier rows on every iteration, and seeding it with an
    # empty DataFrame triggers pandas' empty-entry concat deprecation warning.
    return pd.concat(weeklyResults)

In [190]:
# Query N and Pickle N weeks worth of data
# Generates and saves raw data from queries
# Takes in a list of tables, a boolean that determines whether we draw a random sample,
# and an integer n: the number of weeks to pull if sample is False, or the number of sampled rows if sample is True

def generateData(tables, sample=True, n=52):
    """Query each table and pickle the raw result to ./data/<table>/raw.pkl.

    Parameters
    ----------
    tables : iterable of str
        Names of the tables to query.
    sample : bool, default True
        If true, pull a random sample of `n` rows per table.
        If false, pull the last `n` weeks of data via `queryTable`.
    n : int, default 52
        Number of sampled rows (sample is true) or number of weeks
        (sample is false).

    Returns
    -------
    None
        The data is written to disk; nothing is returned.
    """
    for table in tables:

        if sample:
            print("Retrieving a sample of {nSamples} rows from {tableName}.".format(tableName = table, nSamples = n))
            queryString = "{tableName} | sample {nSamples}".format(tableName = table, nSamples = n)
            df = qry_prov.exec_query(queryString)
        else:
            # Delegate the week-by-week pull to queryTable so the time-window
            # arithmetic lives in one place. This branch used names that do
            # not exist in this function (maxAge, tableName) and queried the
            # same window on every iteration.
            df = queryTable(table, maxAge=n)

        df.to_pickle('./data/{table}/raw.pkl'.format(table = table))
        print("{table} pickld and saved in ./data/{table}/raw.pkl\n".format(table = table))

## Sample and Pickle 10,000 Rows

In [191]:
generateData(tables, True, 10000)

Retrieving a sample of 10000 rows from OfficeActivity.


<IPython.core.display.Javascript object>

OfficeActivity pickld and saved in ./data/OfficeActivity/raw.pkl

Retrieving a sample of 10000 rows from SigninLogs.


<IPython.core.display.Javascript object>

SigninLogs pickld and saved in ./data/SigninLogs/raw.pkl

Retrieving a sample of 10000 rows from SecurityEvent.


<IPython.core.display.Javascript object>

SecurityEvent pickld and saved in ./data/SecurityEvent/raw.pkl

Retrieving a sample of 10000 rows from CommonSecurityLog.


<IPython.core.display.Javascript object>

CommonSecurityLog pickld and saved in ./data/CommonSecurityLog/raw.pkl



## Read Data from Pickle

In [192]:
# Load every table's raw pickle into a dict keyed by table name
rawData = {
    table: pd.read_pickle("data/{tableName}/raw.pkl".format(tableName=table))
    for table in tables
}

## Clean Tables

In [194]:
import datetime

cleanData = {}

for table in rawData:

    # Lists and dicts are unhashable, so those cells (and datetime cells) are
    # replaced by their str() form; every other cell is left untouched
    cleanTable = rawData[table].applymap(
        lambda cell: str(cell) if isinstance(cell, (list, dict, datetime.datetime)) else cell
    )

    # De-duplicate columns: transpose so columns become rows, drop repeated rows,
    # then transpose back to the original orientation.
    # Note that columns which are entirely empty/NaN are identical to one another,
    # so they are treated as duplicates too and only one of them survives
    cleanTable = cleanTable.transpose().drop_duplicates().transpose()

    # Save the cleaned table as a pickle and keep it in memory for the next steps
    cleanTable.to_pickle('./data/{table}/cleaned.pkl'.format(table = table))
    cleanData[table] = cleanTable

cleanData['OfficeActivity']

Unnamed: 0,TenantId,Application,UserAgent,RecordType,TimeGenerated,Operation,OrganizationId,UserType,UserKey,OfficeWorkload,ResultStatus,OfficeObjectId,UserId,ClientIP,Site_,ItemType,EventSource,MachineId,Site_Url,SourceRelativeUrl,SourceFileName,SourceFileExtension,Event_Data,Parameters,ExternalAccess,...,SourceSystem,OfficeId,OfficeTenantId,TargetUserOrGroupName,TargetUserOrGroupType,MessageId,Members,TeamName,TeamGuid,ChannelName,ChannelGuid,ExtraProperties,AddOnType,AddonName,ItemName,CommunicationType,AADGroupId,AddOnGuid,AppDistributionMode,OperationScope,AzureADAppId,OperationProperties,AppId,ClientAppId,Type
0,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,2021-06-30 14:10:46+00:00,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F88D275,Exchange,Succeeded,,seb@seccxp.ninja,,,,,,,,,,,,False,...,OfficeActivityManager,b782a9ec-c250-47c8-b215-0950974a992e,$RestApiTenantId$,,,,,,,,,,,,,,,,,,,"[{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]",7ab7862c-4c57-491e-8a45-d52a7e023983,,OfficeActivity
1,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,2021-06-30 14:47:46+00:00,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,10032000D105B030,Exchange,Succeeded,,JBritt@seccxpninja.onmicrosoft.com,,,,,,,,,,,,False,...,OfficeActivityManager,7852c8e3-e90f-4fca-b5ec-df83fc6b6c5f,$RestApiTenantId$,,,,,,,,,,,,,,,,,,,"[{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]",7a5fbd1c-3e6d-461a-9075-83049393b3a7,7a5fbd1c-3e6d-461a-9075-83049393b3a7,OfficeActivity
2,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,2021-06-30 16:44:20+00:00,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,1003200152C92447,Exchange,Succeeded,,jsmith1@contosohotels.com,,,,,,,,,,,,False,...,OfficeActivityManager,b21b8862-7b8d-48c4-b95b-d225428e9998,$RestApiTenantId$,,,,,,,,,,,,,,,,,,,"[{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]",5a2ee4c5-13b8-465b-88d7-75ecf16830ad,3c8e478f-21ca-493a-b87c-c7366d664d54,OfficeActivity
3,8ecf8077-cf51-4820-aadd-14040956f35d,,,ExchangeAdmin,2021-06-30 19:15:07+00:00,Set-User,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,DcAdmin,NT AUTHORITY\SYSTEM (Microsoft.Exchange.Management.ForwardSync),Exchange,True,NAMPR06A007.PROD.OUTLOOK.COM/Microsoft Exchange Hosted Organizations/seccxpninja.onmicrosoft.com...,NT AUTHORITY\SYSTEM (Microsoft.Exchange.Management.ForwardSync),,,,,,,,,,,"[\r\n {\r\n ""Name"": ""Identity"",\r\n ""Value"": ""4b2462a4-bbee-495a-a0e1-f23ae524cc9c\\515d0...",True,...,OfficeActivityManager,9dda3698-04f0-4426-e546-08d93bfb5fb2,$RestApiTenantId$,,,,,,,,,,,,,,,,,,,,,,OfficeActivity
4,8ecf8077-cf51-4820-aadd-14040956f35d,,,ExchangeAdmin,2021-06-30 22:07:01+00:00,Set-User,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,DcAdmin,NT AUTHORITY\SYSTEM (Microsoft.Exchange.Management.ForwardSync),Exchange,True,NAMPR06A007.PROD.OUTLOOK.COM/Microsoft Exchange Hosted Organizations/seccxpninja.onmicrosoft.com...,NT AUTHORITY\SYSTEM (Microsoft.Exchange.Management.ForwardSync),,,,,,,,,,,"[\r\n {\r\n ""Name"": ""Identity"",\r\n ""Value"": ""4b2462a4-bbee-495a-a0e1-f23ae524cc9c\\c08d6...",True,...,OfficeActivityManager,44753bfc-986a-4597-7671-08d93c13635e,$RestApiTenantId$,,,,,,,,,,,,,,,,,,,,,,OfficeActivity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,8ecf8077-cf51-4820-aadd-14040956f35d,,,MicrosoftTeams,2021-06-02 10:39:51+00:00,MemberAdded,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Application,62b732f7-fc71-40bc-b27d-35efcb0509de,MicrosoftTeams,,,Microsoft Teams Sync,,,,,,,,,,,,,...,OfficeActivityManager,103a52e3-8349-5f4b-ad43-ad20ad4a4023,$RestApiTenantId$,,,,"[{'DisplayName': 'RonHD', 'Role': 1, 'UPN': 'RonHD9376@seccxpninja.onmicrosoft.com'}]",SocTeam,19:9695c2f3977a4254975475668bb52751@thread.skype,,,[],,,SocTeam,Team,d05ba55c-593e-4bfa-8011-26e0626b5c14,,,,,,,,OfficeActivity
9996,8ecf8077-cf51-4820-aadd-14040956f35d,,,50,2021-06-03 18:13:00+00:00,MailItemsAccessed,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Regular,100320003F88D275,Exchange,Succeeded,,seb@seccxp.ninja,,,,,,,,,,,,False,...,OfficeActivityManager,709a712e-4688-4780-881b-e546c336a32c,$RestApiTenantId$,,,,,,,,,,,,,,,,,,,"[{'Value': 'Bind', 'Name': 'MailAccessType'}, {'Value': 'False', 'Name': 'IsThrottled'}]",7ab7862c-4c57-491e-8a45-d52a7e023983,,OfficeActivity
9997,8ecf8077-cf51-4820-aadd-14040956f35d,,,MicrosoftTeams,2021-06-01 11:03:36+00:00,MemberAdded,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Application,62b732f7-fc71-40bc-b27d-35efcb0509de,MicrosoftTeams,,,Microsoft Teams Sync,,,,,,,,,,,,,...,OfficeActivityManager,f432dd7c-f725-5036-9b13-c0c2c92e8701,$RestApiTenantId$,,,,"[{'DisplayName': 'AATPService', 'Role': 1, 'UPN': 'AATPService8977@seccxpninja.onmicrosoft.com'}]",SocTeam,19:9695c2f3977a4254975475668bb52751@thread.skype,,,[],,,SocTeam,Team,d05ba55c-593e-4bfa-8011-26e0626b5c14,,,,,,,,OfficeActivity
9998,8ecf8077-cf51-4820-aadd-14040956f35d,,,MicrosoftTeams,2021-06-01 11:03:36+00:00,MemberAdded,4b2462a4-bbee-495a-a0e1-f23ae524cc9c,Application,62b732f7-fc71-40bc-b27d-35efcb0509de,MicrosoftTeams,,,Microsoft Teams Sync,,,,,,,,,,,,,...,OfficeActivityManager,f432dd7c-f725-5036-9b13-c0c2c92e8701,$RestApiTenantId$,,,,"[{'DisplayName': 'AATPService', 'Role': 1, 'UPN': 'AATPService8977@seccxpninja.onmicrosoft.com'}]",SocTeam,19:9695c2f3977a4254975475668bb52751@thread.skype,,,[],,,SocTeam,Team,d05ba55c-593e-4bfa-8011-26e0626b5c14,,,,,,,,OfficeActivity


## Binarize Data

In [195]:
binData = {}

for table in cleanData:

    # Step 1: blank / whitespace-only cells become NaN
    #         (remove the .replace(...) step if empty strings should count as present)
    # Step 2: NaN -> 0, anything else -> 1
    binTable = (
        cleanData[table]
        .replace(r'^\s*$', np.nan, regex=True)
        .notnull()
        .astype('int')
    )

    # Save the binary table as a pickle and keep it in memory
    binTable.to_pickle('./data/{table}/binarized.pkl'.format(table = table))
    binData[table] = binTable

binData['OfficeActivity']

Unnamed: 0,TenantId,Application,UserAgent,RecordType,TimeGenerated,Operation,OrganizationId,UserType,UserKey,OfficeWorkload,ResultStatus,OfficeObjectId,UserId,ClientIP,Site_,ItemType,EventSource,MachineId,Site_Url,SourceRelativeUrl,SourceFileName,SourceFileExtension,Event_Data,Parameters,ExternalAccess,...,SourceSystem,OfficeId,OfficeTenantId,TargetUserOrGroupName,TargetUserOrGroupType,MessageId,Members,TeamName,TeamGuid,ChannelName,ChannelGuid,ExtraProperties,AddOnType,AddonName,ItemName,CommunicationType,AADGroupId,AddOnGuid,AppDistributionMode,OperationScope,AzureADAppId,OperationProperties,AppId,ClientAppId,Type
0,1,0,0,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,...,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1
1,1,0,0,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,...,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1
2,1,0,0,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,...,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1
3,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,1,...,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,1,...,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,0,0,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,1,1,1,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,1
9996,1,0,0,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,...,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1
9997,1,0,0,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,1,1,1,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,1
9998,1,0,0,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,1,1,1,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,1


In [196]:
# Takes in a string of 0 and 1 as the signature
# Takes in a list of column names
# Returns a list of features that are present and features that are missing

def getPresentColumns(signature, columns):
    """Split column names into present and missing according to a signature.

    Parameters
    ----------
    signature : str
        String of '0'/'1' characters, one per column position.
    columns : sequence
        Column names, indexable by position.

    Returns
    -------
    tuple of (list, list)
        (present, missing): names whose signature character is non-zero,
        and names whose signature character is zero, both in column order.
    """
    present, missing = [], []
    for position, flag in enumerate(signature):
        # int('1') is truthy -> present, int('0') is falsy -> missing
        bucket = present if int(flag) else missing
        bucket.append(columns[position])
    return present, missing

In [260]:
# Counts the number of times a datapoint shows up in the features
# For example, it counts how many times the IP 44.150.161.58 shows up in the clientIP column

def countTypes(row, columns, presentFeatures, featureDict):
    """Add one row's values to the per-feature value counters.

    Parameters
    ----------
    row : iterable
        The values of one data row, in column order (e.g. a pandas Series
        or a list).
    columns : iterable
        Column names, in the same order as `row`.
    presentFeatures : iterable
        Names of the features to count; values of other columns are ignored.
    featureDict : dict
        Maps feature name -> {value: occurrence count}. Must already hold an
        entry for every present feature. It is updated in place.

    Returns
    -------
    dict
        The same `featureDict` object, after the update.
    """
    # Set membership is constant-time; the list lookup ran once per cell.
    presentSet = set(presentFeatures)

    # Pair names with values by iteration instead of row[index]: an integer key
    # on a Series with string labels relies on positional fallback, which
    # pandas has deprecated.
    for currentFeature, value in zip(columns, row):

        # If the feature is missing we won't count it
        if currentFeature not in presentSet:
            continue

        counts = featureDict[currentFeature]
        counts[value] = counts.get(value, 0) + 1
    return featureDict

## Generate Dictionary of Signatures

In [261]:
import json

# Build, per table, a dictionary keyed by row signature (the row's 0/1 values
# joined into one string). Each entry records how many rows share the
# signature, which features are present/missing, and value counts for the
# present features. The result is also written to rawDictionary.json.
tableSignatures = {}
for table in binData:
    print('Generating dictionary for {tableName}'.format(tableName = table))
    columns = binData[table].columns
    signatureDict = {}

    # iterrows() yields the index *label*, while the matching clean row is
    # fetched with .iloc, which is positional. Track the position explicitly so
    # both frames stay aligned even when the index is not 0..n-1 (for example
    # weekly query results concatenated without resetting the index).
    for position, (index, row) in enumerate(binData[table].iterrows()):
        signature = ''.join(map(str, row.values.tolist()))
        cleanRow = cleanData[table].iloc[position]

        # If this signature does not exist
        if signature not in signatureDict:

            # Identify Present/Missing features
            present, missing = getPresentColumns(signature, columns)
            # Start one empty counter per present feature and count this row's values
            featureDict = {i: {} for i in present}
            featureDict = countTypes(cleanRow, columns, present, featureDict)

            signatureDict[signature] = {
                'count': 1,
                'presentFeatures': present,
                'missingFeatures': missing,
                'featureDict': featureDict
            }
        else:
            entry = signatureDict[signature]
            entry['count'] += 1
            entry['featureDict'] = countTypes(cleanRow, columns, entry['presentFeatures'], entry['featureDict'])

    tableSignatures[table] = signatureDict
    with open('./data/{table}/rawDictionary.json'.format(table=table), 'w') as f:
        json.dump(signatureDict, f)

Generating dictionary for OfficeActivity
Generating dictionary for SigninLogs
Generating dictionary for SecurityEvent
Generating dictionary for CommonSecurityLog


In [259]:
import re

word = 'sdsdTiMedasdsd'
re.match(r'^.*[Tt][Ii][Mm][Ee].*$',word)

<re.Match object; span=(0, 14), match='sdsdTiMedasdsd'>

## Clean Dictionary

In [266]:
# Attempt to remove data that is continuous and non-categorical
# To do so, we count, for each feature, how many distinct values occur at most twice
# Signatures with fewer than 10 rows are skipped; otherwise, if that count divided by the signature's row count exceeds the threshold (0.9), we trim the feature from the dictionary
# We can also clean on known continuous value identifiers, such as TimeGenerated
# In addition, we can use a regular expression that checks for the presence of the word time

import re

threshold = 0.9

# Feature names to drop when they match exactly
exactMatches = []

# Regular expressions; a feature whose name matches any of them is dropped
regexes = [
    r'^.*[Tt][Ii][Mm][Ee].*$', # Regex for checking if the word time (case insensitive) is in the string. No ignorecase flag
]

for table, signatures in tableSignatures.items():
    print('Cleaning table {table}\n'.format(table=table))
    for sigEntry in signatures.values():

        # With fewer than 10 rows for a signature there is too little data to judge its features
        if sigEntry['count'] < 10:
            continue

        sigFeatures = sigEntry['featureDict']

        # Iterate over a snapshot of the keys because entries are deleted inside the loop
        for feature in list(sigFeatures):

            # Rule 1: the feature name is listed in exactMatches
            if feature in exactMatches:
                del sigFeatures[feature]
                print('{feature} due to being an exact match'.format(feature=feature))
                continue

            # Rule 2: the feature name matches one of the regular expressions (first match wins)
            matchedRegex = next((regex for regex in regexes if re.match(regex, feature)), None)
            if matchedRegex is not None:
                del sigFeatures[feature]
                print('{feature} due to being an exact match with regular expression: {regex}'.format(feature=feature, regex=matchedRegex))
                continue

            # Rule 3: count the distinct values that show up once or twice in this column
            singles = sum(1 for occurrences in sigFeatures[feature].values() if occurrences <= 2)

            # If that count relative to the signature's row count is above the
            # (arbitrary) threshold, treat the feature as continuous and remove it
            ratio = singles / sigEntry['count']
            if ratio > threshold:
                del sigFeatures[feature]
                print('{feature} was removed with a variation ratio of {ratio}'.format(feature=feature, ratio=ratio))
    print('Table {table} cleaned\n'.format(table=table))

Cleaning table OfficeActivity

Table OfficeActivity cleaned

Cleaning table SigninLogs

Table SigninLogs cleaned

Cleaning table SecurityEvent

Table SecurityEvent cleaned

Cleaning table CommonSecurityLog

Table CommonSecurityLog cleaned



In [267]:
print(tableSignatures['OfficeActivity'])



In [181]:
signature = '11101111001011001111000000001110000111000010000000000000000000000000000000000000111'

# Debug aid: uncomment to print every column next to its signature bit
# for j in range(len(list(columns))):
#     print(columns[j], signature[j])

# Look the entry up once, then print its summary fields followed by the
# per-feature value counts
signatureInfo = signatureDict[signature]

for label, field in (('count:', 'count'), ('presentFeatures:', 'presentFeatures'), ('missingFeatures:', 'missingFeatures')):
    print(label, signatureInfo[field], '\n')

for feature, valueCounts in signatureInfo['featureDict'].items():
    print(feature, valueCounts, '\n')

count: 2004 

presentFeatures: ['TenantId', 'SourceSystem', 'TimeGenerated', 'DeviceVendor', 'DeviceProduct', 'DeviceEventClassID', 'LogSeverity', 'CommunicationDirection', 'DestinationPort', 'DestinationIP', 'Message', 'Protocol', 'SourcePort', 'SourceIP', 'DeviceVersion', 'Activity', 'ApplicationProtocol', 'DeviceExternalID', 'DeviceInboundInterface', 'DeviceOutboundInterface', 'ExternalID', 'AdditionalExtensions', 'Type', '_ResourceId'] 

missingFeatures: ['ReceiptTime', 'OriginalLogSeverity', 'DeviceAction', 'DeviceFacility', 'DeviceAddress', 'DeviceName', 'MaliciousIP', 'ThreatSeverity', 'IndicatorThreatType', 'ThreatConfidence', 'ReportReferenceLink', 'MaliciousIPLongitude', 'MaliciousIPLatitude', 'MaliciousIPCountry', 'EventCount', 'DestinationServiceName', 'DestinationTranslatedAddress', 'DestinationTranslatedPort', 'DeviceTranslatedAddress', 'DestinationHostName', 'DestinationProcessId', 'DestinationUserName', 'FileType', 'ReceivedBytes', 'SentBytes', 'RequestURL', 'RequestCli

In [21]:
import ipywidgets

text = ipywidgets.Select(description='Hello World', options=[1,2,3])
text

Select(description='Hello World', options=(1, 2, 3), value=1)

In [22]:
print(text.value)

1


In [23]:
print(signatureDict)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [29]:
print(rawData['OfficeActivity'].iloc[0])

TenantId                                                                   8ecf8077-cf51-4820-aadd-14040956f35d
Application                                                                                                    
UserDomain                                                                                                     
UserAgent                                                                                                      
RecordType                                                                                                   50
                                                                 ...                                           
OperationProperties    [{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]
AppId                                                                      414a677a-e50f-46ea-b89c-aebb8a9efbe2
ClientAppId                                                                                             

In [63]:
# Keep only the first occurrence of each column name, then check whether
# OrganizationId and OrganizationId_ hold the same value in every row
test = rawData['OfficeActivity']
keepFirstOccurrence = ~test.columns.duplicated()
test = test.loc[:, keepFirstOccurrence]
print((test['OrganizationId'] == test['OrganizationId_']).all())

True


In [62]:
test.columns.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [68]:
all(test['OrganizationId'] == test['OrganizationId_'])

True

In [65]:
test.T.duplicated

TypeError: unhashable type: 'list'

In [None]:
if isinstance(value, list) or isinstance(value, dict):
    value = str(value)

In [74]:
# Check whether the first OperationProperties cell holds a Python list.
# This is the last expression of the cell so the notebook displays the result;
# the stray `rawData()` call was dropped because rawData is a dict and calling
# it raises TypeError.
isinstance(rawData['OfficeActivity']['OperationProperties'].iloc[0], list)

True

In [90]:
arr = test

# Print the name of every column whose first value is a list or a dict
for columnName in arr:
    firstValue = arr[columnName].iloc[0]
    if isinstance(firstValue, (list, dict)):
        print(columnName)

# Cast OperationProperties to str inside the frame, then narrow arr down to
# that single column (cast to str again) and print it
arr = arr.astype({'OperationProperties': str})
arr = arr['OperationProperties'].astype(str)
print(arr)

OperationProperties
0       [{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]
1       [{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]
2       [{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]
3       [{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]
4       [{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]
                                                  ...                                           
9995    [{'Value': 'Bind', 'Name': 'MailAccessType'}, {'Value': 'False', 'Name': 'IsThrottled'}]
9996    [{'Value': 'Bind', 'Name': 'MailAccessType'}, {'Value': 'False', 'Name': 'IsThrottled'}]
9997    [{'Value': 'Bind', 'Name': 'MailAccessType'}, {'Value': 'False', 'Name': 'IsThrottled'}]
9998                                                                                        None
9999    [{

In [88]:
print(arr['OperationProperties'],'\n\n\n')
print(test['OperationProperties'])

0       [{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]
1       [{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]
2       [{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]
3       [{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]
4       [{'Name': 'MailAccessType', 'Value': 'Bind'}, {'Name': 'IsThrottled', 'Value': 'False'}]
                                                  ...                                           
9995    [{'Value': 'Bind', 'Name': 'MailAccessType'}, {'Value': 'False', 'Name': 'IsThrottled'}]
9996    [{'Value': 'Bind', 'Name': 'MailAccessType'}, {'Value': 'False', 'Name': 'IsThrottled'}]
9997    [{'Value': 'Bind', 'Name': 'MailAccessType'}, {'Value': 'False', 'Name': 'IsThrottled'}]
9998                                                                                        None
9999    [{'Value': 'Bind', 'Na

In [110]:
# Stringify list/dict cells so every cell is hashable, then de-duplicate
# columns by transposing, dropping repeated rows and transposing back
newtest = test
newtest = newtest.applymap(
    lambda cell: str(cell) if isinstance(cell, (list, dict)) else cell
)
clean = newtest.transpose().drop_duplicates().transpose()

8ecf8077-cf51-4820-aadd-14040956f35d
