In [1]:
"""Anomaly Detection Example"""
from __future__ import print_function
import os
import sys
import argparse
import math
from collections import Counter

# Third Party Imports
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans

# Local imports
from zat import log_to_dataframe
from zat import dataframe_to_matrix

from pprint import pprint


def entropy(string):
    """Compute the Shannon entropy (in bits per symbol) of a string.

    Args:
        string: The sequence whose symbol distribution is measured (normally
            a str). Values that have no length, such as None or a float NaN,
            are treated as empty.

    Returns:
        float: -sum(p * log2(p)) over the relative frequency p of every
        distinct symbol; 0.0 for empty or missing input.
    """
    try:
        lns = float(len(string))
    except TypeError:
        # Missing values have no len(); score them as 0.0 rather than raising,
        # so a single missing entry cannot abort a whole Series.map() call.
        return 0.0
    if not lns:
        return 0.0
    p = Counter(string)
    return -sum(count/lns * math.log(count/lns, 2) for count in p.values())



In [2]:
pd.set_option('display.width', 1000)

# Collect args from the command line.
# The positional is optional (nargs='?'): this code runs inside a notebook
# kernel, where sys.argv typically holds the kernel's own launch arguments
# instead of a user-supplied log path. A required positional makes argparse
# abort the cell with SystemExit whenever argv carries no stray token.
parser = argparse.ArgumentParser()
parser.add_argument('zeek_log', type=str, nargs='?',
                    help='Specify a zeek log to run ZeekLogReader test on')
args, commands = parser.parse_known_args()

# Check for unknown args
# (left disabled -- presumably because the kernel's own arguments would always
# be reported as unrecognized; confirm before re-enabling)
# if commands:
#     print('Unrecognized args: %s' % commands)
#     sys.exit(1)

# The notebook always analyses this log; whatever argparse picked up from the
# kernel's argv is deliberately overridden here.
args.zeek_log="../portscan/dns.log"
if args.zeek_log:
    # File may have a tilde in it
    args.zeek_log = os.path.expanduser(args.zeek_log)

    # Sanity check either http or dns log; the feature list depends on the log type
    if 'http' in args.zeek_log:
        log_type = 'http'
        features = ['id.resp_p', 'method', 'resp_mime_types', 'request_body_len']
    elif 'dns' in args.zeek_log:
        log_type = 'dns'
        features = ['Z', 'proto', 'qtype_name', 'query_length', 'answer_length', 'entropy']
    else:
        print('This example only works with Zeek with http.log or dns.log files..')
        sys.exit(1)

    # Create a Pandas dataframe from a Zeek log
    try:
        log_to_df = log_to_dataframe.LogToDataFrame()
        zeek_df = log_to_df.create_dataframe(args.zeek_log)
        pprint(zeek_df.head())
        print(zeek_df.shape)
    except IOError:
        print('Could not open or parse the specified logfile: %s' % args.zeek_log)
        sys.exit(1)
    print('Read in {:d} Rows...'.format(len(zeek_df)))

    # Using Pandas we can easily and efficiently compute additional data metrics
    # Here we use the vectorized operations of Pandas/Numpy to compute query length
    # We'll also compute entropy of the query
    if log_type == 'dns':
        zeek_df['query_length'] = zeek_df['query'].str.len()
        zeek_df['answer_length'] = zeek_df['answers'].str.len()
        # entropy is applied per query string; no wrapper lambda is needed
        zeek_df['entropy'] = zeek_df['query'].map(entropy)

    # Use the zat DataframeToMatrix class
   

                                              uid       id.orig_h  id.orig_p    id.resp_h  id.resp_p proto  trans_id rtt                              query  qclass  ... rcode  rcode_name AA  TC RD RA  Z                       answers                   TTLs  rejected
ts                                                                                                                                                                   ...                                                                                                 
2020-07-21 04:59:58.985475063  C32bE64gxOFjXXAJlf  192.168.29.132       5353  224.0.0.251       5353   udp         0 NaT        132.29.168.192.in-addr.arpa    <NA>  ...     0     NOERROR  T   F  F  F  0  android.local,192.168.29.132  120.000000,120.000000         F
2020-07-21 05:00:27.864109039  Cdqp0e3XB6sN8S3Xx4   192.168.29.10       5353  224.0.0.251       5353   udp         0 NaT  142.26.168.192.in-addr.arpa.local       1  ...  <NA>         NaN  F   F  F  F  0

In [9]:
 to_matrix = dataframe_to_matrix.DataFrameToMatrix()
zeek_matrix = to_matrix.fit_transform(zeek_df[features])
print(zeek_matrix.shape)

# Train/fit and Predict anomalous instances using the Isolation Forest model
odd_clf = IsolationForest(contamination=0.1)  # Marking 20% as odd
odd_clf.fit(zeek_matrix)

# Now we create a new dataframe using the prediction from our classifier
predictions = odd_clf.predict(zeek_matrix)
odd_df = zeek_df[features][predictions == -1]
display_df = zeek_df[predictions == -1].copy()

# Now we're going to explore our odd observations with help from KMeans
odd_matrix = to_matrix.fit_transform(odd_df)
num_clusters = min(len(odd_df), 4)  # 4 clusters unless we have less than 4 observations
display_df['cluster'] = KMeans(n_clusters=num_clusters).fit_predict(odd_matrix)
print(odd_matrix.shape)

# Now group the dataframe by cluster
if log_type == 'dns':
    features += ['query']
else:
    features += ['host']
cluster_groups = display_df[features+['cluster']].groupby('cluster')

# Now print out the details for each cluster


Normalizing column Z...
Cannot normalize series (div by 0) so not normalizing...
Normalizing column query_length...
Normalizing column answer_length...
(68, 18)
Normalizing column Z...
Cannot normalize series (div by 0) so not normalizing...
Normalizing column query_length...
Normalizing column answer_length...
(6, 13)


In [10]:
# Report every outlier cluster: a size header followed by its first rows
print('<<< Outliers Detected! >>>')
for cluster_id, members in cluster_groups:
    header = '\nCluster {:d}: {:d} observations'.format(cluster_id, len(members))
    print(header)
    pprint(members.head())

# Overall shape of the outlier dataframe
print(display_df.shape)

<<< Outliers Detected! >>>

Cluster 0: 1 observations
                               Z proto qtype_name  query_length  answer_length   entropy                    query                    query  cluster
ts                                                                                                                                                 
2020-07-21 05:06:41.849042892  0   udp        NaN            23           14.0  3.522712  rohanelementaryos.local  rohanelementaryos.local        0

Cluster 1: 2 observations
                               Z proto qtype_name  query_length  answer_length   entropy              query              query  cluster
ts                                                                                                                                     
2020-07-21 05:06:42.060451984  0   udp       AAAA            17            NaN  3.337175  rohanelementaryos  rohanelementaryos        1
2020-07-21 05:06:42.061758041  0   udp       AAAA            17    

In [5]:
# Display zeek_df as the cell's result (the rendered output abbreviates the middle rows and columns)
zeek_df

Unnamed: 0_level_0,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,trans_id,rtt,query,qclass,...,TC,RD,RA,Z,answers,TTLs,rejected,query_length,answer_length,entropy
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-21 04:59:58.985475063,C32bE64gxOFjXXAJlf,192.168.29.132,5353,224.0.0.251,5353,udp,0,NaT,132.29.168.192.in-addr.arpa,,...,F,F,F,0,"android.local,192.168.29.132","120.000000,120.000000",F,27,28.0,3.574358
2020-07-21 05:00:27.864109039,Cdqp0e3XB6sN8S3Xx4,192.168.29.10,5353,224.0.0.251,5353,udp,0,NaT,142.26.168.192.in-addr.arpa.local,1,...,F,F,F,0,,,F,33,,3.801377
2020-07-21 05:00:27.866831064,Cdqp0e3XB6sN8S3Xx4,192.168.29.10,5353,224.0.0.251,5353,udp,0,NaT,142.26.168.192.in-addr.arpa.local,1,...,F,F,F,0,,,F,33,,3.801377
2020-07-21 05:00:28.888165951,Cdqp0e3XB6sN8S3Xx4,192.168.29.10,5353,224.0.0.251,5353,udp,0,NaT,142.26.168.192.in-addr.arpa.local,1,...,F,F,F,0,,,F,33,,3.801377
2020-07-21 05:00:28.890889168,Cdqp0e3XB6sN8S3Xx4,192.168.29.10,5353,224.0.0.251,5353,udp,0,NaT,142.26.168.192.in-addr.arpa.local,1,...,F,F,F,0,,,F,33,,3.801377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-07-21 05:10:45.569783926,Cn3bS61UC4e7F0ZGo9,192.168.29.130,5353,224.0.0.251,5353,udp,9,NaT,_googlecast._tcp.local,1,...,F,F,F,0,,,F,22,,3.356492
2020-07-21 05:11:05.640989065,CyCW6r4T1zwY9YkLXl,192.168.29.130,5353,224.0.0.251,5353,udp,10,NaT,_googlecast._tcp.local,1,...,F,F,F,0,,,F,22,,3.356492
2020-07-21 05:11:25.715147972,Ca3j6EKkFdZaJJGva,192.168.29.130,5353,224.0.0.251,5353,udp,11,NaT,_googlecast._tcp.local,1,...,F,F,F,0,,,F,22,,3.356492
2020-07-21 05:11:45.578205109,CSBNxF8EC6BxWchGh,192.168.29.130,5353,224.0.0.251,5353,udp,12,NaT,_googlecast._tcp.local,1,...,F,F,F,0,,,F,22,,3.356492


In [8]:
# List every column name currently present in zeek_df
pprint(zeek_df.columns)


Index(['uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto', 'trans_id', 'rtt', 'query', 'qclass', 'qclass_name', 'qtype', 'qtype_name', 'rcode', 'rcode_name', 'AA', 'TC', 'RD', 'RA', 'Z', 'answers', 'TTLs', 'rejected', 'query_length', 'answer_length', 'entropy'], dtype='object')
