# plan:

1. select top red team users (by number or percentage of compromised events)
2. make a **per-event** function that saves the user's lines with features; keep only logons (authentication orientation) and apply the other filters
3. make a per-5-minute step function that saves the user's activity aggregated over 5-minute windows

Features to implement:
1. fraction from the same box for destination computer, hours
2. is remote, is same domain
3. logon type: is network or anything else?
4. authentication type: is NTLM or anything else, fraction from the same box
5. time sine
6. maybe user domain fraction?

# imports, funcs, consts

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
from itertools import islice
from collections import Counter
import networkx as nx

In [9]:
# Import SparkSession
from pyspark.sql import SparkSession
# Create a Spark Session
# master("local[*]") runs Spark locally in this process; getOrCreate() reuses an
# already-running session if there is one. The resulting `spark` object is the
# global session used by get_window_user_batch to build Spark DataFrames.
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Check Spark Session Information
# (bare expression: the notebook renders the session summary as the cell output)
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/17 03:03:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


23/10/17 03:04:04 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [11]:
import pyspark.sql.functions as f
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

In [2]:
def plot_bar_for_dict(dictionary, title, to_normalize = False):
    """Draw a bar chart with one bar per dictionary entry.

    Parameters
    ----------
    dictionary : dict
        Maps bar labels to numeric values (e.g. category -> count).
    title : str
        Title of the figure.
    to_normalize : bool, default False
        When True, every value is converted to a percentage of the sum of
        all values before plotting; otherwise the raw values are plotted.

    Raises
    ------
    ZeroDivisionError
        If ``to_normalize`` is True and the values sum to zero.
    """
    plt.figure()
    if to_normalize:
        # Compute the total once instead of once per key.
        total = sum(dictionary.values())
        values_to_plot = {key: 100 * value / total for key, value in dictionary.items()}
    else:
        values_to_plot = dictionary

    # An empty dictionary cannot be unpacked through zip(*...); draw an empty chart instead.
    if values_to_plot:
        names, heights = zip(*values_to_plot.items())
    else:
        names, heights = (), ()

    plt.bar(names, heights)
    plt.xticks(rotation=75)
    # The values are percentages only when they were normalized.
    plt.ylabel('%' if to_normalize else 'count')
    plt.title(title)
    plt.show()

In [3]:
def auth_type_lambda(x):
    """Normalize a raw authentication type string.

    Every value starting with 'MICROSOFT_AUTHENT' is collapsed into the single
    label 'microsoft_auth'. Any other value has underscores replaced by spaces,
    is lower-cased, and has the word 'authentication' shortened to 'auth'.
    """
    if not x.startswith('MICROSOFT_AUTHENT'):
        spaced_lowercase = x.replace('_', ' ').lower()
        return spaced_lowercase.replace('authentication', 'auth')
    return 'microsoft_auth'

def extract_fields_from_line(line_list):
    """Turn one comma-split auth log line into a tuple of typed fields.

    ``line_list`` is expected to hold nine strings obtained by splitting the
    ``str()`` of a bytes line on commas, so the first item still carries the
    leading ``b'`` and the last item the closing quote; both are sliced off here.

    Returns the tuple (time, source_user, destination_user, source_computer,
    destination_computer, authentication_type, logon_type,
    authentication_orientation, success_failure), where ``time`` is an int and
    ``authentication_type`` has been normalized by ``auth_type_lambda``.
    """
    # Drop the two-character "b'" prefix before converting the timestamp.
    time = int(line_list[0][2:])

    middle_fields = line_list[1:-1]
    (source_user,
     destination_user,
     source_computer,
     destination_computer,
     raw_authentication_type,
     logon_type,
     authentication_orientation) = middle_fields

    # Drop the trailing quote character left over from the bytes repr.
    success_failure = line_list[-1][:-1]

    return (time,
            source_user,
            destination_user,
            source_computer,
            destination_computer,
            auth_type_lambda(raw_authentication_type),
            logon_type,
            authentication_orientation,
            success_failure)

# Smoke test: the input mimics str(bytes_line).split(','), hence the leading "b'"
# on the first item and the closing quote on the last one.
print(extract_fields_from_line(["b'1", 'C101$@DOM1', 'C101$@DOM1', 
                          'C988', 'C988', 'MICROSOFT_AUTHENTICATION_PACKAGE_V6', 
                          'Network', 'LogOff', "Success'"]))

(1, 'C101$@DOM1', 'C101$@DOM1', 'C988', 'C988', 'microsoft_auth', 'Network', 'LogOff', 'Success')


In [4]:
# User names that get_per_event_user_batch refuses to process.
USER_BLACK_LIST = ["SYSTEM", "ANONYMOUS LOGON", "LOCAL SERVICE", "NETWORK SERVICE"]
# NOTE(review): not referenced by any code visible in this notebook section.
MAX_NUM_LINES = int(1e8)
# Column names of the raw auth log, in file order.
AUTH_COLUMNS =['time', 'source_user_at_domain','destination_user_at_domain',
                              'source_computer','destination_computer','authentication_type',
                              'logon_type','authentication_orientation','success_failure']

# Normalized (see auth_type_lambda) authentication types whose events are dropped.
AUTENTICATION_TYPE_BLACK_LIST = ['wave', 'netware auth package v1 0', 'setuid', 'tivoliap', 
                                 'cygwinlsa', 'acronis relogon auth package']

# Normalized authentication types used for the extended one-hot encoding.
AUTH_TYPES = ['microsoft_auth', 'netware auth package v1 0', 'kerberos', '?', 'negotiate', 'ntlm']

# Logon types used for the extended one-hot encoding.
LOGON_TYPES = ['Batch','CachedInteractive', 'Interactive','Network',
               'NetworkCleartext','NewCredentials',
               'RemoteInteractive','Service', 'Unlock', '?']

HARD_LOGON_TYPES = ['Network'] # only Network logons were seen in red team events
# NOTE(review): 'nltm' looks like a misspelling of 'ntlm' (the spelling used in
# AUTH_TYPES and produced by auth_type_lambda), so the resulting 'is_nltm'
# feature can never be 1 - the sample output shows it as all zeros. Fixing the
# value renames the column to 'is_ntlm', which get_window_user_batch and any
# previously saved CSVs refer to as 'is_nltm'; change them together.
HARD_AUTH_TYPES = ['nltm'] # same reason: only NTLM was seen in red team events

# few red team users

In [5]:
# Load the red team events (time,user@domain,source_computer,destination_computer).
# The handle is deliberately NOT called `f`: in this notebook `f` is the alias of
# pyspark.sql.functions, and rebinding it here breaks get_window_user_batch when
# the cells are run top to bottom.
with gzip.open('redteam.txt.gz', 'rb') as red_team_file:
    file_content = red_team_file.readlines()

lines = [x.strip() for x in file_content]
# Decode the bytes properly instead of splitting their repr ("b'...'"), so no
# leftover "b'" prefix or trailing quote has to be stripped afterwards.
lines_for_pd = [x.decode('utf-8').split(',') for x in lines if x]

red_team = pd.DataFrame(lines_for_pd, columns=['time', 'user_at_domain', 'source_computer', 'destination_computer'])
red_team['time'] = red_team['time'].astype(int)
red_team[['user', 'domain']] = red_team.user_at_domain.str.split('@', expand=True)

num_top_red_team_users = 5
# value_counts() is already sorted by descending frequency, so its first rows are the top users.
top_red_team_users = red_team.user_at_domain.value_counts().head(num_top_red_team_users).index.tolist()
print(f'Taking top {num_top_red_team_users} users with most red team events: {top_red_team_users}')

Taking top 5 users with most red team events: ['U66@DOM1', 'U3005@DOM1', 'U737@DOM1', 'U1653@DOM1', 'U293@DOM1']


In [6]:
# Lookup table: (time, user@domain) -> (source computer, destination computer).
# Rows sharing the same (time, user) key overwrite each other; the last one wins.
red_team_keys = zip(red_team.time, red_team.user_at_domain)
red_team_values = zip(red_team.source_computer, red_team.destination_computer)
red_team_dict = dict(zip(red_team_keys, red_team_values))

red_team_dict[(150885, 'U620@DOM1')]

('C17693', 'C1003')

# make per event function to save user lines with features. 

In [7]:
def get_user_red_team_events_dict(user, red_team_df):
    """Collect the red team events of a single user.

    Parameters
    ----------
    user : str
        Value to match against the ``user_at_domain`` column.
    red_team_df : pandas.DataFrame
        Needs the columns ``user_at_domain``, ``time``, ``source_computer``
        and ``destination_computer``.

    Returns
    -------
    dict
        Maps event time to ``(source_computer, destination_computer)``; empty
        when the user has no rows. Rows sharing a time overwrite each other.
    """
    rows_of_user = red_team_df[red_team_df.user_at_domain == user]
    if rows_of_user.empty:
        return {}

    computer_pairs = zip(rows_of_user.source_computer, rows_of_user.destination_computer)
    return dict(zip(rows_of_user.time, computer_pairs))
    

def get_per_event_user_batch(user, 
                             auth_path,
                             user_red_team_events_dict,
                             base_output_path = None,
                             possible_auth_types = HARD_AUTH_TYPES,
                             possible_logon_types = HARD_LOGON_TYPES,
                             user_black_list = USER_BLACK_LIST):
    """Build the per-event feature table of one user from the gzipped auth log.

    The whole file at ``auth_path`` is streamed once. An event is kept when its
    source user equals ``user``, its orientation is 'LogOn' or 'LogOff', it is
    not a "same user on the same computer" event, and its normalized
    authentication type is not in ``AUTENTICATION_TYPE_BLACK_LIST``.

    Parameters
    ----------
    user : str
        Source user in ``name@domain`` form.
    auth_path : str
        Path of the gzipped auth log.
    user_red_team_events_dict : dict or None
        Maps event time to ``(source_computer, destination_computer)`` for the
        red team events of this user; an event matching both is labelled 1.
    base_output_path : str, optional
        When given, the table is also written to
        ``{base_output_path}/{user with '@' replaced by '_'}.csv``.
    possible_auth_types, possible_logon_types : list of str
        Types that get an ``is_<type>`` indicator column. A type present in
        both lists gets ``is_auth_<type>`` and ``is_logon_<type>`` instead, so
        the resulting frame never has duplicate column names.
    user_black_list : list of str
        User names (with or without the ``@domain`` part) that are not processed.

    Returns
    -------
    pandas.DataFrame or None
        One row per kept event (possibly zero rows), or None when the user is
        black-listed or its name ends with '$'.
    """
    # CHECK IF USER IS IN BLACK LIST OR ENDS WITH $
    # The black list holds bare names, so compare the part before '@' as well.
    user_name = user.split('@')[0]
    if user in user_black_list or user_name in user_black_list:
        print(f'Forbidden user {user} from the black list')
        return None
    if user_name.endswith('$'):
        print(f'Forbidden user {user} ends with $')
        return None

    red_team_events = user_red_team_events_dict or {}

    # OHE COLUMN NAMES: built once, before the loop, so they exist even when no event matches
    auth_types_columns = [f'is_{possible_auth_type}' for possible_auth_type in possible_auth_types]
    logon_types_columns = [f'is_{possible_logon_type}' for possible_logon_type in possible_logon_types]
    colliding_columns = set(auth_types_columns) & set(logon_types_columns)
    if colliding_columns:
        auth_types_columns = [f'is_auth_{possible_auth_type}' if f'is_{possible_auth_type}' in colliding_columns
                              else f'is_{possible_auth_type}'
                              for possible_auth_type in possible_auth_types]
        logon_types_columns = [f'is_logon_{possible_logon_type}' if f'is_{possible_logon_type}' in colliding_columns
                               else f'is_{possible_logon_type}'
                               for possible_logon_type in possible_logon_types]

    columns = ['time', 'hour24_sine', 'day7_sine',
               'source_user',
               'destination_user',
               'source_computer',
               'destination_computer',
               'authentication_orientation',
               'is_success',
               'is_remote',
               'is_same_user',
               'destination_computer_fraction_from_the_same_box',
               'source_computer_fraction_from_the_same_box',
               'hour_fraction_from_the_same_box',
               'authentication_type_fraction_from_the_same_box'] + auth_types_columns + logon_types_columns + ['is_red_team_action']

    user_lines = []
    user_destination_computer_counter = Counter()
    user_source_computer_counter = Counter()
    user_hour_counter = Counter()
    user_authentication_type_counter = Counter()

    total_logons = 0

    with gzip.open(auth_path, 'rb') as auth_file:

        for line in auth_file:
            line_parts = str(line.strip()).split(',')

            # Cheap pre-filter: the source user is the second field, so lines of
            # other users are skipped before any further parsing.
            if len(line_parts) < 2 or line_parts[1] != user:
                continue

            time, source_user, destination_user, source_computer, destination_computer, authentication_type, logon_type, authentication_orientation, success_failure = extract_fields_from_line(line_parts)

            # CONDITIONS
            if authentication_orientation not in ('LogOn', 'LogOff'):
                continue
            # Drop events where neither the user nor the computer changes.
            if source_user == destination_user and source_computer == destination_computer:
                continue
            if authentication_type in AUTENTICATION_TYPE_BLACK_LIST:
                continue

            # BINARY FEATURES
            authentication_orientation = int(authentication_orientation == 'LogOn')
            is_success = int(success_failure == 'Success')
            is_remote = int(source_computer != destination_computer)
            is_same_user = int(source_user == destination_user)
            is_red_team_action = int(red_team_events.get(time) == (source_computer, destination_computer))

            # TIME SINE FOR PERIODICITY (24-hour and 7-day periods, whole hours/days)
            hours_since_start = time // (60 * 60)
            hour24 = hours_since_start % 24
            hour24_sine = np.sin(hours_since_start / 12 * np.pi)
            day7_sine = np.sin((time // (24 * 60 * 60)) / 7 * 2 * np.pi)

            # FRACTION FROM THE SAME BOX: DESTINATION COMPUTER, HOUR24, SOURCE COMPUTER, AUTHENTICATION TYPE!
            # Running share of the user's kept events so far (current one included)
            # that have the same value as the current event.
            user_destination_computer_counter[destination_computer] += 1
            user_source_computer_counter[source_computer] += 1
            user_hour_counter[hour24] += 1
            user_authentication_type_counter[authentication_type] += 1
            total_logons += 1

            destination_computer_fraction_from_the_same_box = user_destination_computer_counter[destination_computer] / total_logons
            source_computer_fraction_from_the_same_box = user_source_computer_counter[source_computer] / total_logons
            hour_fraction_from_the_same_box = user_hour_counter[hour24] / total_logons
            authentication_type_fraction_from_the_same_box = user_authentication_type_counter[authentication_type] / total_logons

            # COLLECT ALL FEATURES
            feature_line = [time, hour24_sine, day7_sine,
                            source_user,
                            destination_user,
                            source_computer,
                            destination_computer,
                            authentication_orientation,
                            is_success,
                            is_remote,
                            is_same_user,
                            destination_computer_fraction_from_the_same_box,
                            source_computer_fraction_from_the_same_box,
                            hour_fraction_from_the_same_box,
                            authentication_type_fraction_from_the_same_box]

            # ADDING OHE FEATURES (a single indicator when only one type is given)
            feature_line += [int(possible_auth_type == authentication_type) for possible_auth_type in possible_auth_types]
            feature_line += [int(possible_logon_type == logon_type) for possible_logon_type in possible_logon_types]

            feature_line += [is_red_team_action]
            user_lines.append(feature_line)

    # CREATE DF
    print(f'Collected {len(user_lines)} lines with {len(columns)} features')
    user_features_df = pd.DataFrame(user_lines, columns = columns)
    if base_output_path:
        user_name_for_save = user.replace('@', '_')
        path_to_save = f'{base_output_path}/{user_name_for_save}.csv'
        user_features_df.to_csv(path_to_save, index=False)
    return user_features_df

In [71]:
# Build the compact per-event table (single auth-type and logon-type indicator)
# for the user with the most red team events.
user = top_red_team_users[0]
user_red_team_events_dict = get_user_red_team_events_dict(user, red_team)


per_event_user_batch = get_per_event_user_batch(user, 
                                             auth_path = 'auth.txt.gz',
                                             user_red_team_events_dict = user_red_team_events_dict,
                                             base_output_path = None,
                                             possible_auth_types = HARD_AUTH_TYPES,
                                             possible_logon_types = HARD_LOGON_TYPES,
                                             user_black_list = USER_BLACK_LIST)
display(per_event_user_batch.head())

Collected 507266 lines with 18 features


Unnamed: 0,time,hour24_sine,day7_sine,source_user,destination_user,source_computer,destination_computer,authentication_orientation,is_success,is_remote,is_same_user,destination_computer_fraction_from_the_same_box,source_computer_fraction_from_the_same_box,hour_fraction_from_the_same_box,authentication_type_fraction_from_the_same_box,is_nltm,is_Network,is_red_team_action
0,11,0.0,0.0,U66@DOM1,U66@DOM1,C1823,C1732,1,1,1,1,1.0,1.0,1.0,1.0,0,1,0
1,15,0.0,0.0,U66@DOM1,U66@DOM1,C1823,C1952,1,1,1,1,0.5,1.0,1.0,1.0,0,1,0
2,21,0.0,0.0,U66@DOM1,U66@DOM1,C1823,C1952,1,1,1,1,0.666667,1.0,1.0,1.0,0,1,0
3,25,0.0,0.0,U66@DOM1,U66@DOM1,C1823,C72,1,1,1,1,0.25,1.0,1.0,1.0,0,1,0
4,31,0.0,0.0,U66@DOM1,U66@DOM1,C1823,C1952,1,1,1,1,0.6,1.0,1.0,1.0,0,1,0


In [72]:
per_event_user_batch.shape

(507266, 18)

In [73]:
# Same user and log as above, but with one indicator column per entry of
# AUTH_TYPES and LOGON_TYPES instead of the single "hard" type of each kind.
extended_per_event_user_batch = get_per_event_user_batch(user, 
                                             auth_path = 'auth.txt.gz',
                                             user_red_team_events_dict = user_red_team_events_dict,
                                             base_output_path = None,
                                             possible_auth_types = AUTH_TYPES,
                                             possible_logon_types = LOGON_TYPES,
                                             user_black_list = USER_BLACK_LIST)
print(f'Dataframe shape (same as (number of lines, number of features)): {extended_per_event_user_batch.shape}')
display(extended_per_event_user_batch.head())

Collected 507266 lines with 32 features
Dataframe shape (same as (number of lines, number of features)): (507266, 32)


Unnamed: 0,time,hour24_sine,day7_sine,source_user,destination_user,source_computer,destination_computer,authentication_orientation,is_success,is_remote,...,is_CachedInteractive,is_Interactive,is_Network,is_NetworkCleartext,is_NewCredentials,is_RemoteInteractive,is_Service,is_Unlock,is_?,is_red_team_action
0,11,0.0,0.0,U66@DOM1,U66@DOM1,C1823,C1732,1,1,1,...,0,0,1,0,0,0,0,0,0,0
1,15,0.0,0.0,U66@DOM1,U66@DOM1,C1823,C1952,1,1,1,...,0,0,1,0,0,0,0,0,0,0
2,21,0.0,0.0,U66@DOM1,U66@DOM1,C1823,C1952,1,1,1,...,0,0,1,0,0,0,0,0,0,0
3,25,0.0,0.0,U66@DOM1,U66@DOM1,C1823,C72,1,1,1,...,0,0,1,0,0,0,0,0,0,0
4,31,0.0,0.0,U66@DOM1,U66@DOM1,C1823,C1952,1,1,1,...,0,0,1,0,0,0,0,0,0,0


# make per 5 minutes step function to save aggregated user activity in 5 minutes frame

In [27]:
# NOTE: per_event_user_batch_read is assigned further down (the pd.read_csv cell);
# that cell has to be executed before this one.
print(per_event_user_batch_read.columns)

Index(['time', 'hour24_sine', 'day7_sine', 'source_user', 'destination_user',
       'source_computer', 'destination_computer', 'authentication_orientation',
       'is_success', 'is_remote', 'is_same_user',
       'destination_computer_fraction_from_the_same_box',
       'source_computer_fraction_from_the_same_box',
       'hour_fraction_from_the_same_box',
       'authentication_type_fraction_from_the_same_box', 'is_nltm',
       'is_Network', 'is_red_team_action'],
      dtype='object')


In [34]:
def _window_size_to_seconds(window_size):
    """Convert a window size such as '5 minutes', '2 hours' or '300' to seconds."""
    parts = str(window_size).split()
    if not parts:
        raise ValueError('window_size must look like "5 minutes"')
    steps = int(parts[0])
    # A bare number is interpreted as seconds.
    unit = parts[1].lower() if len(parts) > 1 else 'seconds'
    if unit.startswith('day'):
        seconds_per_step = 24 * 60 * 60
    elif unit.startswith('hour'):
        seconds_per_step = 60 * 60
    elif unit.startswith('min'):
        seconds_per_step = 60
    elif unit.startswith('s'):
        seconds_per_step = 1
    else:
        raise ValueError(f'Unknown window size unit: {parts[1]!r}')
    window_size_seconds = steps * seconds_per_step
    if window_size_seconds <= 0:
        raise ValueError(f'window_size must be positive, got {window_size!r}')
    return window_size_seconds


def get_window_user_batch(per_event_user_batch,
                         window_size = '5 minutes',
                         base_output_path = None,
                         possible_auth_types = HARD_AUTH_TYPES,
                         possible_logon_types = HARD_LOGON_TYPES,
                         user_black_list = USER_BLACK_LIST):
    """Aggregate a per-event feature table into fixed-size time windows.

    Events are grouped by ``window_index = int(time / window size in seconds)``
    using the global Spark session, and one row per non-empty window is returned.

    Output columns:
    window_index, start_time / finish_time (min / max event time),
    number_of_actions, average_hour24_sine, average_day7_sine,
    number_of_logons (sum of authentication_orientation),
    success_rate, num_successful_logons, remote_rate, num_remote_logons,
    same_user_rate, the minimum of each ``*_fraction_from_the_same_box`` column,
    ``is_<type>_p`` (mean of every one-hot column derived from
    ``possible_auth_types`` / ``possible_logon_types`` that exists in the input),
    has_red_team_action (max) and number_red_team_actions (sum).

    Parameters
    ----------
    per_event_user_batch : pandas.DataFrame
        Table with the columns produced by get_per_event_user_batch.
    window_size : str
        '<number> <unit>' with unit starting with day / hour / min / s;
        a bare number means seconds.
    base_output_path : str, optional
        When given, the result is also written to
        ``{base_output_path}/{source user with '@' replaced by '_'}.csv``.
        NOTE(review): get_per_event_user_batch uses the same file name pattern,
        so use a different directory to avoid overwriting its output.
    possible_auth_types, possible_logon_types : list of str
        Types whose ``is_<type>`` columns are averaged per window.
    user_black_list : list of str
        Not used by this function; kept for signature compatibility.

    Returns
    -------
    pandas.DataFrame
        One row per window, sorted by ``window_index``.

    Raises
    ------
    ValueError
        If ``window_size`` cannot be interpreted or is not positive.
    """
    window_size_seconds = _window_size_to_seconds(window_size)

    # One-hot columns to average: derived from the requested types (order kept,
    # duplicates removed) and restricted to the columns the input really has.
    requested_ohe_columns = list(dict.fromkeys(
        f'is_{possible_type}' for possible_type in list(possible_auth_types) + list(possible_logon_types)))
    available_columns = set(per_event_user_batch.columns)
    ohe_columns = [column for column in requested_ohe_columns if column in available_columns]
    missing_ohe_columns = [column for column in requested_ohe_columns if column not in available_columns]
    if missing_ohe_columns:
        print(f'Skipping one-hot columns that are missing from the batch: {missing_ohe_columns}')

    # The identifier columns are strings and are not aggregated.
    columns_to_drop = ['source_user', 'destination_user',
                       'source_computer', 'destination_computer']
    per_event_user_batch_pysp = spark.createDataFrame(per_event_user_batch.drop(columns = columns_to_drop))

    aggregations = [
        f.min('time').alias('start_time'),
        f.max('time').alias('finish_time'),
        f.count('*').alias('number_of_actions'),
        f.mean('hour24_sine').alias('average_hour24_sine'),
        f.mean('day7_sine').alias('average_day7_sine'),
        f.sum('authentication_orientation').alias('number_of_logons'),
        f.mean('is_success').alias('success_rate'),
        f.sum('is_success').alias('num_successful_logons'),
        f.mean('is_remote').alias('remote_rate'),
        f.sum('is_remote').alias('num_remote_logons'),
        f.mean('is_same_user').alias('same_user_rate'),
        f.min('destination_computer_fraction_from_the_same_box').alias('min_destination_computer_fraction_from_the_same_box'),
        f.min('source_computer_fraction_from_the_same_box').alias('min_source_computer_fraction_from_the_same_box'),
        f.min('hour_fraction_from_the_same_box').alias('min_hour_fraction_from_the_same_box'),
        f.min('authentication_type_fraction_from_the_same_box').alias('min_authentication_type_fraction_from_the_same_box'),
    ]
    aggregations += [f.mean(column).alias(f'{column}_p') for column in ohe_columns]
    aggregations += [
        f.max('is_red_team_action').alias('has_red_team_action'),
        f.sum('is_red_team_action').alias('number_red_team_actions'),
    ]

    windowed_per_event_user_batch_pysp = per_event_user_batch_pysp\
    .withColumn('window_index', (f.col('time') / window_size_seconds).cast(IntegerType()))\
    .groupby('window_index')\
    .agg(*aggregations)

    # Spark does not guarantee the order of grouped rows, so sort after collecting.
    windowed_per_event_user_batch_pd = windowed_per_event_user_batch_pysp.toPandas().sort_values('window_index')
    if base_output_path:
        user = per_event_user_batch.source_user.values[0]
        user_name_for_save = user.replace('@', '_')
        path_to_save = f'{base_output_path}/{user_name_for_save}.csv'
        windowed_per_event_user_batch_pd.to_csv(path_to_save, index=False)
    return windowed_per_event_user_batch_pd

In [35]:
# Reload a previously saved per-event table and keep only its first 100,000 rows.
per_event_user_batch_read = pd.read_csv('per_event_user_batch_u66_dom1.csv').head(int(1e5))
# NOTE: not the last expression of the cell, so the notebook does not render this preview.
per_event_user_batch_read.head(3)

# Aggregate the events into 5-minute windows.
windowed_per_event_user_batch_pd = get_window_user_batch(per_event_user_batch_read,
                         window_size = '5 minutes',
                         base_output_path = None,
                         possible_auth_types = HARD_AUTH_TYPES,
                         possible_logon_types = HARD_LOGON_TYPES,
                         user_black_list = USER_BLACK_LIST)

display(windowed_per_event_user_batch_pd.head(5))

Unnamed: 0,window_index,start_time,finish_time,number_of_actions,average_hour24_sine,average_day7_sine,number_of_logons,success_rate,num_successful_logons,remote_rate,num_remote_logons,same_user_rate,min_destination_computer_fraction_from_the_same_box,min_source_computer_fraction_from_the_same_box,min_hour_fraction_from_the_same_box,min_authentication_type_fraction_from_the_same_box,is_nltm_p,is_Network_p,has_red_team_action,number_red_team_actions
319,0,11,274,36,0.0,0.0,36,1.0,36,1.0,36,1.0,0.028571,1.0,1.0,0.066667,0.0,1.0,0,0
70,1,311,574,36,0.0,0.0,36,1.0,36,1.0,36,1.0,0.014493,1.0,1.0,0.115385,0.0,1.0,0,0
291,2,611,874,33,0.0,0.0,33,1.0,33,1.0,33,1.0,0.032967,1.0,1.0,0.114943,0.0,1.0,0,0
85,3,911,1174,38,0.0,0.0,38,1.0,38,1.0,38,1.0,0.02459,1.0,1.0,0.113821,0.0,1.0,0,0
164,4,1211,1474,39,0.0,0.0,39,1.0,39,1.0,39,1.0,0.02454,1.0,1.0,0.113208,0.0,1.0,0,0
