In [2]:
import sys
import time

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [64]:
# Make the repository root and its Core directory importable. Both entries
# are relative paths, so they resolve against the notebook's working directory.
sys.path.extend(["../../", "../../Core"])

In [65]:
from Core import Deduplication, Variables, Tokenizer, Generator, FastClustering

## Windows Dataset

In [99]:
def loadDataset(filepath):
    """Read a text file and return its lines as a list of strings.

    Parameters
    ----------
    filepath : str
        Path of the file to read. It is opened in text mode ("r") with the
        platform's default encoding.

    Returns
    -------
    list of str
        One entry per line, in file order, with the newline character
        removed. Blank lines are kept as empty strings.
    """
    with open(filepath, "r") as ifp:
        # Iterating the handle yields one line at a time, newline included.
        return [line.replace('\n', '') for line in ifp]

In [122]:
# Load the Windows sample log (one string per line) and preview the first ten lines.
logs = loadDataset("../datasets/Windows/Windows_2k.log")
logs[:10]

['2016-09-28 04:30:30, Info                  CBS    Loaded Servicing Stack v6.1.7601.23505 with Core: C:\\Windows\\winsxs\\amd64_microsoft-windows-servicingstack_31bf3856ad364e35_6.1.7601.23505_none_681aa442f6fed7f0\\cbscore.dll',
 '2016-09-28 04:30:31, Info                  CSI    00000001@2016/9/27:20:30:31.455 WcpInitialize (wcp.dll version 0.0.0.6) called (stack @0x7fed806eb5d @0x7fef9fb9b6d @0x7fef9f8358f @0xff83e97c @0xff83d799 @0xff83db2f)',
 '2016-09-28 04:30:31, Info                  CSI    00000002@2016/9/27:20:30:31.458 WcpInitialize (wcp.dll version 0.0.0.6) called (stack @0x7fed806eb5d @0x7fefa006ade @0x7fef9fd2984 @0x7fef9f83665 @0xff83e97c @0xff83d799)',
 '2016-09-28 04:30:31, Info                  CSI    00000003@2016/9/27:20:30:31.458 WcpInitialize (wcp.dll version 0.0.0.6) called (stack @0x7fed806eb5d @0x7fefa1c8728 @0x7fefa1c8856 @0xff83e474 @0xff83d7de @0xff83db2f)',
 '2016-09-28 04:30:31, Info                  CBS    Ending TrustedInstaller initialization.',
 '2016

## Detecting patterns

### Deduplication

In [123]:
# Run the Deduplication transform and report the line count before and after
# (presumably it drops repeated lines; the saved output shows 2000 -> 1281).
# NOTE: `logs` is rebound here, so re-running this cell operates on the
# already transformed list rather than on the freshly loaded file.
print ("No of titles before dedup: %d" % len(logs))
dedup = Deduplication()
logs = dedup.transform(logs)
print ("No of titles after dedup: %d" % len(logs))

No of titles before dedup: 2000
No of titles after dedup: 1281


In [124]:
# Wrap the current log lines in a one-column DataFrame.
# NOTE(review): this frame is not read anywhere before `df` is rebuilt from
# `logs_t` in the clustering cell, so this cell looks like a leftover.
df = pd.DataFrame(logs, columns=["logs"])

## Pre-tokenization transformation

1. Replace `,` with `<space>,` and pad each bracket character (`[`, `]`, `{`, `}`, `(`, `)`, `<`, `>`) with a space.

In [125]:
# Pad punctuation with a space so each symbol can be split from the text next
# to it during tokenization. Opening symbols get a space after them, closing
# symbols (and the comma) get a space before them.
#
# '>' is padded on its left like the other closing brackets; padding it on the
# right would leave it glued to the preceding text ("<foo>" -> "< foo> ").
#
# NOTE: this rewrites `logs` in place and is not idempotent -- running the cell
# twice pads the already padded symbols again. Reload `logs` before re-running.
# TODO: use regular expressions
PUNCTUATION_PADDING = (
    (',', ' ,'),
    ('[', '[ '), (']', ' ]'),
    ('{', '{ '), ('}', ' }'),
    ('(', '( '), (')', ' )'),
    ('<', '< '), ('>', ' >'),
)

for i, t in enumerate(logs):
    for symbol, padded in PUNCTUATION_PADDING:
        t = t.replace(symbol, padded)

    logs[i] = t

In [126]:
# Preview the first ten lines after punctuation padding.
logs[:10]

['2016-09-28 04:30:30 , Info                  CBS    Loaded Servicing Stack v6.1.7601.23505 with Core: C:\\Windows\\winsxs\\amd64_microsoft-windows-servicingstack_31bf3856ad364e35_6.1.7601.23505_none_681aa442f6fed7f0\\cbscore.dll',
 '2016-09-28 04:30:31 , Info                  CSI    00000001@2016/9/27:20:30:31.455 WcpInitialize ( wcp.dll version 0.0.0.6 ) called ( stack @0x7fed806eb5d @0x7fef9fb9b6d @0x7fef9f8358f @0xff83e97c @0xff83d799 @0xff83db2f )',
 '2016-09-28 04:30:31 , Info                  CSI    00000002@2016/9/27:20:30:31.458 WcpInitialize ( wcp.dll version 0.0.0.6 ) called ( stack @0x7fed806eb5d @0x7fefa006ade @0x7fef9fd2984 @0x7fef9f83665 @0xff83e97c @0xff83d799 )',
 '2016-09-28 04:30:31 , Info                  CSI    00000003@2016/9/27:20:30:31.458 WcpInitialize ( wcp.dll version 0.0.0.6 ) called ( stack @0x7fed806eb5d @0x7fefa1c8728 @0x7fefa1c8856 @0xff83e474 @0xff83d7de @0xff83db2f )',
 '2016-09-28 04:30:31 , Info                  CBS    Ending TrustedInstaller initial

### Variable Detection

In [127]:
# Build the variable detector from the JSON schema in Core. With verbose=True
# it prints each class and pattern as it is loaded (see the cell output).
var = Variables("../../Core/schema.variables.json", verbose=True)

Loading Classes
	 Loaded {GUID}
	 Loaded {DATE}
	 Loaded {TIME}
	 Loaded {DATETIME}
	 Loaded {NUMBER}
	 Loaded {DECIMAL}
	 Loaded {INTEGER}
	 Loaded {IPV4}
	 Loaded {IPV6}
	 Loaded {IP}
	 Loaded {FILENAME}
	 Loaded {PYTHON_FILENAME}
Loading Patterns
	 Loading {^[{(]?[0-9A-F]{8}[-]?([0-9A-F]{4}[-]?){3}[0-9A-F]{12}[)}]?$} {Class: GUID}
	 Loading {\d{4}[\-\s]{1}\d{2}[\-\s]{1}\d{2}} {Class: DATE}
	 Loading {\d{2}[\-\s]{1}\d{2}(?:[\-\s]{1}\d{2})*} {Class: DATE}
	 Loading {\d{2}[\:]\d{2}(?:\:[\d]{2})*(?:\.[\d]{3,6})*} {Class: TIME}
	 Loading {([-]*[\d]+)} {Class: INTEGER}
	 Loading {[^A-Za-z0-9\-\.]([\d]+)(?:[^A-Za-z0-9\-\.]|$)} {Class: INTEGER}
	 Loading {[-]*[\d]*[\.][\d]+} {Class: DECIMAL}
	 Loading {^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$} {Class: IPV4}
	 Loading {([a-zA-Z0-9\\\.]*\.py)} {Class: PYTHON_FILENAME}


In [128]:
# Run variable detection over every padded line; in the preview below, matched
# values appear as placeholders such as @DATE, @TIME and @IPV4.
logs_t = var.TransformAll(logs)

In [129]:
# Preview the first ten lines after variable detection.
# NOTE(review): in the saved output the version string "0.0.0.6" is tagged
# @IPV4, and the day in "2016/9/27:20:30:31.455" is absorbed into @TIME. The
# schema patterns look too permissive for this dataset -- verify.
logs_t[:10]

['@DATE @TIME , Info                  CBS    Loaded Servicing Stack v6.1.7601.23505 with Core: C:\\Windows\\winsxs\\amd64_microsoft-windows-servicingstack_31bf3856ad364e35_6.1.7601.23505_none_681aa442f6fed7f0\\cbscore.dll',
 '@DATE @TIME , Info                  CSI    00000001@2016/9/@TIME WcpInitialize ( wcp.dll version @IPV4 ) called ( stack @0x7fed806eb5d @0x7fef9fb9b6d @0x7fef9f8358f @0xff83e97c @0xff83d799 @0xff83db2f )',
 '@DATE @TIME , Info                  CSI    00000002@2016/9/@TIME WcpInitialize ( wcp.dll version @IPV4 ) called ( stack @0x7fed806eb5d @0x7fefa006ade @0x7fef9fd2984 @0x7fef9f83665 @0xff83e97c @0xff83d799 )',
 '@DATE @TIME , Info                  CSI    00000003@2016/9/@TIME WcpInitialize ( wcp.dll version @IPV4 ) called ( stack @0x7fed806eb5d @0x7fefa1c8728 @0x7fefa1c8856 @0xff83e474 @0xff83d7de @0xff83db2f )',
 '@DATE @TIME , Info                  CBS    Ending TrustedInstaller initialization.',
 '@DATE @TIME , Info                  CBS    Starting the Trusted

## Fast Clustering

In [130]:
## Variables
# Threshold passed to FastClustering for this dataset.
THRESHOLD = 0.2

# Generate tokens
tokenizer = Tokenizer()
tokens = tokenizer.transform(logs_t)

# Generate level 1 clusters
fc = FastClustering(threshold=THRESHOLD)

df = pd.DataFrame(logs_t, columns=['title_t'])
df['label'] = fc.transform(tokens)
print ("No of clusters generated: %d" % fc.getLabelCount())

# Generate the patterns
generator = Generator()

# value_counts() returns the labels ordered by descending frequency, so the
# largest clusters are printed first. Compute it once and reuse it.
label_counts = df['label'].value_counts()

for label, count in zip(label_counts.index, label_counts.values):
    # Skip clusters that contain a single line.
    if count <= 1:
        continue

    members = df.loc[df['label'] == label, 'title_t'].values
    print ("[Count: %d], Pattern: %s" % (count, generator.generate(members)))

No of clusters generated: 28
[Count: 641], Pattern: @DATE @TIME , Info                  *    * * * * * *
[Count: 558], Pattern: @DATE @TIME , Info                  CBS    Read out cached package applicability for package: * , ApplicableState: @INTEGER , CurrentState:@INTEGER
[Count: 13], Pattern: @DATE @TIME , Info                  CBS    Failed to * * * [ HRESULT = * - * ]
[Count: 8], Pattern: @DATE @TIME , Info                  CBS    Unloading offline registry hive: { bf1a281b-ad7b-4476-ac95-f47682990ce7 *
[Count: 8], Pattern: @DATE @TIME , Info                  CBS    Loading offline registry hive: * , into registry key '{ bf1a281b-ad7b-4476-ac95-f47682990ce7 * from path *
[Count: 7], Pattern: @DATE @TIME , Info                  CBS    Expecting attribute name [ HRESULT = 0x800f080d - CBS_E_MANIFEST_INVALID_ITEM ]
[Count: 6], Pattern: @DATE @TIME , Info                  CSI    * WcpInitialize ( wcp.dll version @IPV4 ) called ( stack * * * * * * )
[Count: 4], Pattern: @DATE @TIME , 

## Linux Dataset

In [131]:
# Load the Linux sample log, deduplicate it, pad punctuation, then run
# variable detection. Loading happens in this cell, so re-running it starts
# again from the file contents.
logs = loadDataset("../datasets/Linux/Linux_2k.log")

print ("No of titles before dedup: %d" % len(logs))
dedup = Deduplication()
logs = dedup.transform(logs)
print ("No of titles after dedup: %d" % len(logs))

# Pad punctuation with a space so each symbol can be split from the text next
# to it. Opening symbols get a space after them, closing symbols (and the
# comma) a space before them, and '=' a space on both sides.
# '>' is padded on its left like the other closing brackets; padding it on the
# right would leave it glued to the preceding text ("<foo>" -> "< foo> ").
# TODO: use regular expressions
PUNCTUATION_PADDING = (
    (',', ' ,'),
    ('[', '[ '), (']', ' ]'),
    ('{', '{ '), ('}', ' }'),
    ('(', '( '), (')', ' )'),
    ('<', '< '), ('>', ' >'),
    ('=', ' = '),
)

for i, t in enumerate(logs):
    for symbol, padded in PUNCTUATION_PADDING:
        t = t.replace(symbol, padded)

    logs[i] = t

logs_t = var.TransformAll(logs)

No of titles before dedup: 2000
No of titles after dedup: 2000


In [132]:
# Preview the first ten Linux lines after punctuation padding.
logs[:10]

['Jun 14 15:16:01 combo sshd( pam_unix )[ 19939 ]: authentication failure; logname =  uid = 0 euid = 0 tty = NODEVssh ruser =  rhost = 218.188.2.4 ',
 'Jun 14 15:16:02 combo sshd( pam_unix )[ 19937 ]: check pass; user unknown',
 'Jun 14 15:16:02 combo sshd( pam_unix )[ 19937 ]: authentication failure; logname =  uid = 0 euid = 0 tty = NODEVssh ruser =  rhost = 218.188.2.4 ',
 'Jun 15 02:04:59 combo sshd( pam_unix )[ 20882 ]: authentication failure; logname =  uid = 0 euid = 0 tty = NODEVssh ruser =  rhost = 220-135-151-1.hinet-ip.hinet.net  user = root',
 'Jun 15 02:04:59 combo sshd( pam_unix )[ 20884 ]: authentication failure; logname =  uid = 0 euid = 0 tty = NODEVssh ruser =  rhost = 220-135-151-1.hinet-ip.hinet.net  user = root',
 'Jun 15 02:04:59 combo sshd( pam_unix )[ 20883 ]: authentication failure; logname =  uid = 0 euid = 0 tty = NODEVssh ruser =  rhost = 220-135-151-1.hinet-ip.hinet.net  user = root',
 'Jun 15 02:04:59 combo sshd( pam_unix )[ 20885 ]: authentication failure

In [133]:
# Preview the first ten Linux lines after variable detection.
# NOTE(review): in the saved output the host "220-135-151-1.hinet-ip.hinet.net"
# becomes "2@DATE5-151-1.hinet-ip.hinet.net", i.e. a DATE pattern matches inside
# the hostname. The schema pattern probably needs boundaries -- verify.
logs_t[:10]

['Jun @INTEGER @TIME combo sshd( pam_unix )[ @INTEGER ]: authentication failure; logname =  uid = @INTEGER euid = @INTEGER tty = NODEVssh ruser =  rhost = @IPV4 ',
 'Jun @INTEGER @TIME combo sshd( pam_unix )[ @INTEGER ]: check pass; user unknown',
 'Jun @INTEGER @TIME combo sshd( pam_unix )[ @INTEGER ]: authentication failure; logname =  uid = @INTEGER euid = @INTEGER tty = NODEVssh ruser =  rhost = @IPV4 ',
 'Jun @INTEGER @TIME combo sshd( pam_unix )[ @INTEGER ]: authentication failure; logname =  uid = @INTEGER euid = @INTEGER tty = NODEVssh ruser =  rhost = 2@DATE5-151-1.hinet-ip.hinet.net  user = root',
 'Jun @INTEGER @TIME combo sshd( pam_unix )[ @INTEGER ]: authentication failure; logname =  uid = @INTEGER euid = @INTEGER tty = NODEVssh ruser =  rhost = 2@DATE5-151-1.hinet-ip.hinet.net  user = root',
 'Jun @INTEGER @TIME combo sshd( pam_unix )[ @INTEGER ]: authentication failure; logname =  uid = @INTEGER euid = @INTEGER tty = NODEVssh ruser =  rhost = 2@DATE5-151-1.hinet-ip.hine

In [134]:
## Variables
# Threshold passed to FastClustering for this dataset.
THRESHOLD = 0.6

# Generate tokens
tokenizer = Tokenizer()
tokens = tokenizer.transform(logs_t)

# Generate level 1 clusters
fc = FastClustering(threshold=THRESHOLD)

df = pd.DataFrame(logs_t, columns=['title_t'])
df['label'] = fc.transform(tokens)
print ("No of clusters generated: %d" % fc.getLabelCount())

# Generate the patterns
generator = Generator()

# value_counts() returns the labels ordered by descending frequency, so the
# largest clusters are printed first. Compute it once and reuse it.
label_counts = df['label'].value_counts()

for label, count in zip(label_counts.index, label_counts.values):
    # Skip clusters that contain a single line.
    if count <= 1:
        continue

    members = df.loc[df['label'] == label, 'title_t'].values
    print ("[Count: %d], Pattern: %s" % (count, generator.generate(members)))

No of clusters generated: 44
[Count: 653], Pattern: * @INTEGER @TIME combo ftpd[ @INTEGER ]: connection from @IPV4 ( * ) at * * @INTEGER @TIME @INTEGER 
[Count: 416], Pattern: * @INTEGER @TIME combo * pam_unix )[ @INTEGER ]: authentication failure; logname =  uid = @INTEGER euid = @INTEGER tty = * ruser =  rhost = *  * * *
[Count: 256], Pattern: Jul  @INTEGER @TIME combo ftpd[ @INTEGER ]: connection from @IPV4 ( * ) at * Jul  @INTEGER @TIME @INTEGER 
[Count: 150], Pattern: * @INTEGER @TIME combo * pam_unix )[ @INTEGER ]: session * for user * by ( uid = @INTEGER )
[Count: 112], Pattern: * @INTEGER @TIME combo * pam_unix )[ @INTEGER ]: * * * * *
[Count: 97], Pattern: Jul  @INTEGER @TIME combo * pam_unix )[ @INTEGER ]: * * * * * by * uid = @INTEGER )
[Count: 74], Pattern: Jul  @INTEGER @TIME combo sshd( pam_unix )[ @INTEGER ]: authentication failure; logname =  uid = @INTEGER euid = @INTEGER tty = NODEVssh ruser =  rhost = *  user = *
[Count: 34], Pattern: * @INTEGER @TIME combo logrotate

## Android Dataset

In [135]:
# Load the Android sample log, deduplicate it, pad punctuation, then run
# variable detection. Loading happens in this cell, so re-running it starts
# again from the file contents.
logs = loadDataset("../datasets/Android/Android_2k.log")

print ("No of titles before dedup: %d" % len(logs))
dedup = Deduplication()
logs = dedup.transform(logs)
print ("No of titles after dedup: %d" % len(logs))

# Pad punctuation with a space so each symbol can be split from the text next
# to it. Opening symbols get a space after them, closing symbols (and the
# comma) a space before them, and '=' a space on both sides.
# '>' is padded on its left like the other closing brackets; padding it on the
# right would leave it glued to the preceding text ("<foo>" -> "< foo> ").
# TODO: use regular expressions
PUNCTUATION_PADDING = (
    (',', ' ,'),
    ('[', '[ '), (']', ' ]'),
    ('{', '{ '), ('}', ' }'),
    ('(', '( '), (')', ' )'),
    ('<', '< '), ('>', ' >'),
    ('=', ' = '),
)

for i, t in enumerate(logs):
    for symbol, padded in PUNCTUATION_PADDING:
        t = t.replace(symbol, padded)

    logs[i] = t

logs_t = var.TransformAll(logs)

No of titles before dedup: 2000
No of titles after dedup: 1988


In [136]:
# Preview the first ten Android lines after punctuation padding.
logs[:10]

['03-17 16:13:38.811  1702  2395 D WindowManager: printFreezingDisplayLogsopening app wtoken  =  AppWindowToken{ 9f4ef63 token = Token{ a64f992 ActivityRecord{ de9231d u0 com.tencent.qt.qtl/.activity.info.NewsDetailXmlActivity t761 } } } , allDrawn =  false , startingDisplayed  =   false , startingMoved  =   false , isRelaunching  =   false',
 '03-17 16:13:38.819  1702  8671 D PowerManagerService: acquire lock = 233570404 , flags = 0x1 , tag = "View Lock" , name = com.android.systemui , ws = null , uid = 10037 , pid = 2227',
 '03-17 16:13:38.820  1702  8671 D PowerManagerService: ready = true ,policy = 3 ,wakefulness = 1 ,wksummary = 0x23 ,uasummary = 0x1 ,bootcompleted = true ,boostinprogress = false ,waitmodeenable = false ,mode = false ,manual = 38 ,auto = -1 ,adj = 0.0userId = 0',
 '03-17 16:13:38.839  1702  2113 V WindowManager: Skipping AppWindowToken{ df0798e token = Token{ 78af589 ActivityRecord{ 3b04890 u0 com.tencent.qt.qtl/com.tencent.video.player.activity.PlayerActivity t76

In [137]:
# Preview the first ten Android lines after variable detection.
logs_t[:10]

['@DATE @TIME  @INTEGER  @INTEGER D WindowManager: printFreezingDisplayLogsopening app wtoken  =  AppWindowToken{ 9f4ef63 token = Token{ a64f992 ActivityRecord{ de9231d u0 com.tencent.qt.qtl/.activity.info.NewsDetailXmlActivity t761 } } } , allDrawn =  false , startingDisplayed  =   false , startingMoved  =   false , isRelaunching  =   false',
 '@DATE @TIME  @INTEGER  @INTEGER D PowerManagerService: acquire lock = @INTEGER , flags = 0x1 , tag = "View Lock" , name = com.android.systemui , ws = null , uid = @INTEGER , pid = @INTEGER',
 '@DATE @TIME  @INTEGER  @INTEGER D PowerManagerService: ready = true ,policy = @INTEGER ,wakefulness = @INTEGER ,wksummary = 0x23 ,uasummary = 0x1 ,bootcompleted = true ,boostinprogress = false ,waitmodeenable = false ,mode = false ,manual = @INTEGER ,auto = @INTEGER ,adj = 0.0userId = @INTEGER',
 '@DATE @TIME  @INTEGER  @INTEGER V WindowManager: Skipping AppWindowToken{ df0798e token = Token{ 78af589 ActivityRecord{ 3b04890 u0 com.tencent.qt.qtl/com.tence

In [138]:
## Variables
# Threshold passed to FastClustering for this dataset.
THRESHOLD = 0.6

# Generate tokens
tokenizer = Tokenizer()
tokens = tokenizer.transform(logs_t)

# Generate level 1 clusters
fc = FastClustering(threshold=THRESHOLD)

df = pd.DataFrame(logs_t, columns=['title_t'])
df['label'] = fc.transform(tokens)
print ("No of clusters generated: %d" % fc.getLabelCount())

# Generate the patterns
generator = Generator()

# value_counts() returns the labels ordered by descending frequency, so the
# largest clusters are printed first. Compute it once and reuse it.
label_counts = df['label'].value_counts()

for label, count in zip(label_counts.index, label_counts.values):
    # Skip clusters that contain a single line.
    if count <= 1:
        continue

    members = df.loc[df['label'] == label, 'title_t'].values
    print ("[Count: %d], Pattern: %s" % (count, generator.generate(members)))

No of clusters generated: 54
[Count: 693], Pattern: @DATE @TIME  @INTEGER  @INTEGER * * * * * * * * * * *
[Count: 237], Pattern: @DATE @TIME  @INTEGER  @INTEGER I * * * * * * * * * * * * * * * *
[Count: 200], Pattern: @DATE @TIME  @INTEGER  @INTEGER I PhoneStatusBar: setSystemUiVisibility vis = * mask = * oldVal = * newVal = * diff = * fullscreenStackVis = @INTEGER dockedStackVis = @INTEGER , fullscreenStackBounds = Rect( @INTEGER , @INTEGER - @INTEGER , @INTEGER ) , dockedStackBounds = Rect( @INTEGER , @INTEGER - @INTEGER , @INTEGER )
[Count: 159], Pattern: @DATE @TIME  @INTEGER  @INTEGER D PowerManagerService: ready = true ,policy = @INTEGER ,wakefulness = @INTEGER ,wksummary = * ,uasummary = 0x1 ,bootcompleted = true ,boostinprogress = false ,waitmodeenable = false ,mode = false ,manual = @INTEGER ,auto = @INTEGER ,adj = 0.0userId = @INTEGER
[Count: 72], Pattern: @DATE @TIME  @INTEGER  @INTEGER * * * * = * , * = * * * = * * * = * * * * *
[Count: 71], Pattern: @DATE @TIME  @INTEGER  

## OpenSSH Logs

In [141]:
# Load the OpenSSH sample log, deduplicate it, pad punctuation, then run
# variable detection. Loading happens in this cell, so re-running it starts
# again from the file contents.
logs = loadDataset("../datasets/OpenSSH/SSH_2k.log")

print ("No of titles before dedup: %d" % len(logs))
dedup = Deduplication()
logs = dedup.transform(logs)
print ("No of titles after dedup: %d" % len(logs))

# Pad punctuation with a space so each symbol can be split from the text next
# to it. Opening symbols get a space after them, closing symbols (and the
# comma) a space before them, and '=' a space on both sides.
# '>' is padded on its left like the other closing brackets; padding it on the
# right would leave it glued to the preceding text ("<foo>" -> "< foo> ").
# TODO: use regular expressions
PUNCTUATION_PADDING = (
    (',', ' ,'),
    ('[', '[ '), (']', ' ]'),
    ('{', '{ '), ('}', ' }'),
    ('(', '( '), (')', ' )'),
    ('<', '< '), ('>', ' >'),
    ('=', ' = '),
)

for i, t in enumerate(logs):
    for symbol, padded in PUNCTUATION_PADDING:
        t = t.replace(symbol, padded)

    logs[i] = t

logs_t = var.TransformAll(logs)

No of titles before dedup: 2000
No of titles after dedup: 2000


In [142]:
# Preview the first ten OpenSSH lines after punctuation padding.
logs[:10]

['Dec 10 06:55:46 LabSZ sshd[ 24200 ]: reverse mapping checking getaddrinfo for ns.marryaldkfaczcz.com [ 173.234.31.186 ] failed - POSSIBLE BREAK-IN ATTEMPT!',
 'Dec 10 06:55:46 LabSZ sshd[ 24200 ]: Invalid user webmaster from 173.234.31.186',
 'Dec 10 06:55:46 LabSZ sshd[ 24200 ]: input_userauth_request: invalid user webmaster [ preauth ]',
 'Dec 10 06:55:46 LabSZ sshd[ 24200 ]: pam_unix( sshd:auth ): check pass; user unknown',
 'Dec 10 06:55:46 LabSZ sshd[ 24200 ]: pam_unix( sshd:auth ): authentication failure; logname =  uid = 0 euid = 0 tty = ssh ruser =  rhost = 173.234.31.186 ',
 'Dec 10 06:55:48 LabSZ sshd[ 24200 ]: Failed password for invalid user webmaster from 173.234.31.186 port 38926 ssh2',
 'Dec 10 06:55:48 LabSZ sshd[ 24200 ]: Connection closed by 173.234.31.186 [ preauth ]',
 'Dec 10 07:02:47 LabSZ sshd[ 24203 ]: Connection closed by 212.47.254.145 [ preauth ]',
 'Dec 10 07:07:38 LabSZ sshd[ 24206 ]: Invalid user test9 from 52.80.34.196',
 'Dec 10 07:07:38 LabSZ sshd[ 24

In [143]:
# Preview the first ten OpenSSH lines after variable detection.
logs_t[:10]

['Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: reverse mapping checking getaddrinfo for ns.marryaldkfaczcz.com [ @IPV4 ] failed - POSSIBLE BREAK-IN ATTEMPT!',
 'Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: Invalid user webmaster from @IPV4',
 'Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: input_userauth_request: invalid user webmaster [ preauth ]',
 'Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: pam_unix( sshd:auth ): check pass; user unknown',
 'Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: pam_unix( sshd:auth ): authentication failure; logname =  uid = @INTEGER euid = @INTEGER tty = ssh ruser =  rhost = @IPV4 ',
 'Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: Failed password for invalid user webmaster from @IPV4 port @INTEGER ssh2',
 'Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: Connection closed by @IPV4 [ preauth ]',
 'Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: Connection closed by @IPV4 [ preauth ]',
 'Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: Invalid user test9 from @IPV4',
 'Dec @INTEGER @TIME L

In [144]:
## Variables
# Threshold passed to FastClustering for this dataset.
THRESHOLD = 0.6

# Generate tokens
tokenizer = Tokenizer()
tokens = tokenizer.transform(logs_t)

# Generate level 1 clusters
fc = FastClustering(threshold=THRESHOLD)

df = pd.DataFrame(logs_t, columns=['title_t'])
df['label'] = fc.transform(tokens)
print ("No of clusters generated: %d" % fc.getLabelCount())

# Generate the patterns
generator = Generator()

# value_counts() returns the labels ordered by descending frequency, so the
# largest clusters are printed first. Compute it once and reuse it.
label_counts = df['label'].value_counts()

for label, count in zip(label_counts.index, label_counts.values):
    # Skip clusters that contain a single line.
    if count <= 1:
        continue

    members = df.loc[df['label'] == label, 'title_t'].values
    print ("[Count: %d], Pattern: %s" % (count, generator.generate(members)))

No of clusters generated: 10
[Count: 523], Pattern: Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: * * for * * * * * * * * *
[Count: 504], Pattern: Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: * * * authentication * logname =  uid = @INTEGER euid = @INTEGER tty = ssh ruser =  rhost = *  * * *
[Count: 448], Pattern: Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: * * * * * * * * * * * *
[Count: 384], Pattern: Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: Received disconnect from * 11: * * * * * [ preauth ]
[Count: 85], Pattern: Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: reverse mapping checking getaddrinfo for * [ @IPV4 ] failed - POSSIBLE BREAK-IN ATTEMPT!
[Count: 45], Pattern: Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: error: Received disconnect from 103.99.0.122: 14: No more user authentication methods available. [ preauth ]
[Count: 7], Pattern: Dec @INTEGER @TIME LabSZ sshd[ @INTEGER ]: PAM service( sshd ) ignoring max retries; @INTEGER >  @INTEGER
[Count: 2], Pattern: Dec @INTEGER @TIME LabSZ ss

## Apache Logs

In [145]:
# Load the Apache sample log, deduplicate it, pad punctuation, then run
# variable detection. Loading happens in this cell, so re-running it starts
# again from the file contents.
logs = loadDataset("../datasets/Apache/Apache_2k.log")

print ("No of titles before dedup: %d" % len(logs))
dedup = Deduplication()
logs = dedup.transform(logs)
print ("No of titles after dedup: %d" % len(logs))

# Pad punctuation with a space so each symbol can be split from the text next
# to it. Opening symbols get a space after them, closing symbols (and the
# comma) a space before them, and '=' a space on both sides.
# '>' is padded on its left like the other closing brackets; padding it on the
# right would leave it glued to the preceding text ("<foo>" -> "< foo> ").
# TODO: use regular expressions
PUNCTUATION_PADDING = (
    (',', ' ,'),
    ('[', '[ '), (']', ' ]'),
    ('{', '{ '), ('}', ' }'),
    ('(', '( '), (')', ' )'),
    ('<', '< '), ('>', ' >'),
    ('=', ' = '),
)

for i, t in enumerate(logs):
    for symbol, padded in PUNCTUATION_PADDING:
        t = t.replace(symbol, padded)

    logs[i] = t

logs_t = var.TransformAll(logs)

No of titles before dedup: 2000
No of titles after dedup: 1461


In [146]:
# Preview the first ten Apache lines after punctuation padding.
logs[:10]

['[ Sun Dec 04 04:47:44 2005 ] [ notice ] workerEnv.init(  ) ok /etc/httpd/conf/workers2.properties',
 '[ Sun Dec 04 04:47:44 2005 ] [ error ] mod_jk child workerEnv in error state 6',
 '[ Sun Dec 04 04:51:08 2005 ] [ notice ] jk2_init(  ) Found child 6725 in scoreboard slot 10',
 '[ Sun Dec 04 04:51:09 2005 ] [ notice ] jk2_init(  ) Found child 6726 in scoreboard slot 8',
 '[ Sun Dec 04 04:51:09 2005 ] [ notice ] jk2_init(  ) Found child 6728 in scoreboard slot 6',
 '[ Sun Dec 04 04:51:14 2005 ] [ notice ] workerEnv.init(  ) ok /etc/httpd/conf/workers2.properties',
 '[ Sun Dec 04 04:51:18 2005 ] [ error ] mod_jk child workerEnv in error state 6',
 '[ Sun Dec 04 04:51:37 2005 ] [ notice ] jk2_init(  ) Found child 6736 in scoreboard slot 10',
 '[ Sun Dec 04 04:51:38 2005 ] [ notice ] jk2_init(  ) Found child 6733 in scoreboard slot 7',
 '[ Sun Dec 04 04:51:38 2005 ] [ notice ] jk2_init(  ) Found child 6734 in scoreboard slot 9']

In [147]:
# Preview the first ten Apache lines after variable detection.
logs_t[:10]

['[ Sun Dec @INTEGER @TIME @INTEGER ] [ notice ] workerEnv.init(  ) ok /etc/httpd/conf/workers2.properties',
 '[ Sun Dec @INTEGER @TIME @INTEGER ] [ error ] mod_jk child workerEnv in error state @INTEGER',
 '[ Sun Dec @INTEGER @TIME @INTEGER ] [ notice ] jk2_init(  ) Found child @INTEGER in scoreboard slot @INTEGER',
 '[ Sun Dec @INTEGER @TIME @INTEGER ] [ notice ] jk2_init(  ) Found child @INTEGER in scoreboard slot @INTEGER',
 '[ Sun Dec @INTEGER @TIME @INTEGER ] [ notice ] jk2_init(  ) Found child @INTEGER in scoreboard slot @INTEGER',
 '[ Sun Dec @INTEGER @TIME @INTEGER ] [ notice ] workerEnv.init(  ) ok /etc/httpd/conf/workers2.properties',
 '[ Sun Dec @INTEGER @TIME @INTEGER ] [ error ] mod_jk child workerEnv in error state @INTEGER',
 '[ Sun Dec @INTEGER @TIME @INTEGER ] [ notice ] jk2_init(  ) Found child @INTEGER in scoreboard slot @INTEGER',
 '[ Sun Dec @INTEGER @TIME @INTEGER ] [ notice ] jk2_init(  ) Found child @INTEGER in scoreboard slot @INTEGER',
 '[ Sun Dec @INTEGER @T

In [149]:
## Variables
# Threshold passed to FastClustering for this dataset.
THRESHOLD = 0.3

# Generate tokens
tokenizer = Tokenizer()
tokens = tokenizer.transform(logs_t)

# Generate level 1 clusters
fc = FastClustering(threshold=THRESHOLD)

df = pd.DataFrame(logs_t, columns=['title_t'])
df['label'] = fc.transform(tokens)
print ("No of clusters generated: %d" % fc.getLabelCount())

# Generate the patterns
generator = Generator()

# value_counts() returns the labels ordered by descending frequency, so the
# largest clusters are printed first. Compute it once and reuse it.
label_counts = df['label'].value_counts()

for label, count in zip(label_counts.index, label_counts.values):
    # Skip clusters that contain a single line.
    if count <= 1:
        continue

    members = df.loc[df['label'] == label, 'title_t'].values
    print ("[Count: %d], Pattern: %s" % (count, generator.generate(members)))

No of clusters generated: 7
[Count: 836], Pattern: [ * Dec @INTEGER @TIME @INTEGER ] [ notice ] jk2_init(  ) Found child @INTEGER in scoreboard slot @INTEGER
[Count: 332], Pattern: [ * Dec @INTEGER @TIME @INTEGER ] [ error ] mod_jk child * * * state @INTEGER
[Count: 247], Pattern: [ * Dec @INTEGER @TIME @INTEGER ] [ notice ] workerEnv.init(  ) ok /etc/httpd/conf/workers2.properties
[Count: 32], Pattern: [ * Dec @INTEGER @TIME @INTEGER ] [ error ] [ client @IPV4 ] Directory index forbidden by rule: /var/www/html/
[Count: 12], Pattern: [ * Dec @INTEGER @TIME @INTEGER ] [ error ] jk2_init(  ) Can't find child @INTEGER in scoreboard
[Count: 2], Pattern: [ Mon Dec @INTEGER @TIME @INTEGER ] [ error ] mod_jk child init @INTEGER @INTEGER
