In [1]:
import pandas as pd
import os

Data source:

A. D. Kent, “Comprehensive, Multi-Source Cybersecurity Events,” Los Alamos National Laboratory,

http://dx.doi.org/10.17021/1179829, 2015.

@Misc{kent-2015-cyberdata1,

  author =     {Alexander D. Kent},
  
  title =      {{Comprehensive, Multi-Source Cyber-Security Events}},
  
  year =       {2015},
  
  howpublished = {Los Alamos National Laboratory},
  
  doi = {10.17021/1179829}
  
  
}

In [2]:
# The auth CSV extracts ship without a header row, so the column names are
# spelled out here and attached to the frame in the next cell.
header = [
    "Time",
    "SourceUserDomain",
    "DestUserDomain",
    "SourceComputer",
    "DestComputer",
    "AuthType",
    "LogonType",
    "AuthOrientation",
    "SuccessFailure",
]

# Smaller extracts of the same file, kept for quicker experiments.
# Uncomment one of these (and the matching lines in later cells) to use it.
#
# first 10k rows:
# authTenK = pd.read_csv('./Data/CSVs/auth10k.csv', header=None)
# authTenK.head()
#
# first 50k rows:
# authFiftyK = pd.read_csv('./Data/CSVs/auth50k.csv', header=None)
# authFiftyK.head()

# Load the first 100k rows; header=None stops pandas from treating the first
# data row as column names.
authHundredK = pd.read_csv('./Data/CSVs/auth100k.csv', header=None)
authHundredK.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1,ANONYMOUS LOGON@C586,ANONYMOUS LOGON@C586,C1250,C586,NTLM,Network,LogOn,Success
1,1,ANONYMOUS LOGON@C586,ANONYMOUS LOGON@C586,C586,C586,?,Network,LogOff,Success
2,1,C101$@DOM1,C101$@DOM1,C988,C988,?,Network,LogOff,Success
3,1,C1020$@DOM1,SYSTEM@C1020,C1020,C1020,Negotiate,Service,LogOn,Success
4,1,C1021$@DOM1,C1021$@DOM1,C1021,C625,Kerberos,Network,LogOn,Success


In [3]:
# Attach the descriptive column names from `header` to the authentication data.
# The frame was read with header=None, so until this point its columns are just
# the integer positions 0..8. Assigning to `.columns` renames them in place.

# authTenK.columns = header
# authFiftyK.columns= header
authHundredK.columns = header

# Display the first rows to confirm the rename worked.
# authTenK.head()
# authFiftyK.head()
authHundredK.head()

Unnamed: 0,Time,SourceUserDomain,DestUserDomain,SourceComputer,DestComputer,AuthType,LogonType,AuthOrientation,SuccessFailure
0,1,ANONYMOUS LOGON@C586,ANONYMOUS LOGON@C586,C1250,C586,NTLM,Network,LogOn,Success
1,1,ANONYMOUS LOGON@C586,ANONYMOUS LOGON@C586,C586,C586,?,Network,LogOff,Success
2,1,C101$@DOM1,C101$@DOM1,C988,C988,?,Network,LogOff,Success
3,1,C1020$@DOM1,SYSTEM@C1020,C1020,C1020,Negotiate,Service,LogOn,Success
4,1,C1021$@DOM1,C1021$@DOM1,C1021,C625,Kerberos,Network,LogOn,Success


In [4]:
# Drop the columns that are not used as model inputs, keeping only
# SourceUserDomain, LogonType, AuthOrientation and the SuccessFailure label.
# Each column is listed exactly once (the earlier list repeated "DestUserDomain").
# Uncomment the variant matching the data file you loaded.
#
# NOTE: this rebinds the frame to its reduced version, so running the cell a
# second time raises a KeyError because the columns are already gone; re-run
# the load and rename cells first.

# authTenK = authTenK.drop(["Time", "DestUserDomain", "SourceComputer", "DestComputer", "AuthType"], axis=1)
# authTenK.head()

# authFiftyK = authFiftyK.drop(["Time", "DestUserDomain", "SourceComputer", "DestComputer", "AuthType"], axis=1)
# authFiftyK.head()

authHundredK = authHundredK.drop(["Time", "DestUserDomain", "SourceComputer", "DestComputer", "AuthType"], axis=1)
authHundredK.head()

Unnamed: 0,SourceUserDomain,LogonType,AuthOrientation,SuccessFailure
0,ANONYMOUS LOGON@C586,Network,LogOn,Success
1,ANONYMOUS LOGON@C586,Network,LogOff,Success
2,C101$@DOM1,Network,LogOff,Success
3,C1020$@DOM1,Service,LogOn,Success
4,C1021$@DOM1,Network,LogOn,Success


In [5]:
# Build the model inputs and targets.
# X is one space-separated string per row ("<user@domain> <logon type> <orientation>")
# so it can be fed to a text vectorizer; y is the Success/Failure label.
# Uncomment the pair matching the data file you loaded.

# X = authTenK["SourceUserDomain"] + ' ' + authTenK["LogonType"] + ' ' + authTenK["AuthOrientation"]
# y = authTenK["SuccessFailure"]

# X = authFiftyK["SourceUserDomain"] + ' ' + authFiftyK["LogonType"] + ' ' + authFiftyK["AuthOrientation"]
# y = authFiftyK["SuccessFailure"]

X = (
    authHundredK["SourceUserDomain"]
    + ' ' + authHundredK["LogonType"]
    + ' ' + authHundredK["AuthOrientation"]
)
y = authHundredK["SuccessFailure"]

# sanity check: both should have one entry per row
print(X.shape, y.shape)

(100000,) (100000,)


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; the standalone `joblib` package (installed as a scikit-learn dependency)
# is the supported replacement. Fall back to the old location only for legacy
# environments that lack the standalone package.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Split into train/test sets. stratify=y keeps the Success/Failure ratio the
# same in both splits; random_state fixes the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Turn the text rows into token-count vectors. The vectorizer is fitted on the
# training split only and persisted so the same vocabulary can be reused at
# prediction time.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
joblib.dump(vectorizer, 'vectorizer.pkl')
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)


# Encode the string labels as integers. The encoder is persisted as well so
# predictions can be mapped back to label names later.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
joblib.dump(label_encoder, 'label_encoder.pkl')

y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Convert the integer labels to one-hot vectors for categorical cross-entropy.
y_train_categorical = to_categorical(y_train_encoded)
y_test_categorical = to_categorical(y_test_encoded)

print(y_test_categorical.shape, y_train_categorical.shape)
print(X_train_vec.shape)

Using TensorFlow backend.


(25000, 2) (75000, 2)
(75000, 4545)


In [7]:
# Define the network: a single 100-unit ReLU hidden layer fed by the
# vectorized features, followed by a 2-way softmax output layer.
from keras.models import Sequential
from keras.layers import Dense

model = Sequential([
    Dense(units=100, activation='relu', input_dim=X_train_vec.shape[1]),
    Dense(units=2, activation='softmax'),
])

In [8]:
# Configure training (Adam optimizer, categorical cross-entropy on the one-hot
# labels, accuracy as the reported metric) ...
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# ... then train for 10 epochs on the vectorized training split, reshuffling
# the samples every epoch and logging one line per epoch.
model.fit(x=X_train_vec, y=y_train_categorical, epochs=10, verbose=2, shuffle=True)

Epoch 1/10
 - 38s - loss: 0.0206 - acc: 0.9956
Epoch 2/10
 - 36s - loss: 0.0042 - acc: 0.9987
Epoch 3/10
 - 38s - loss: 0.0034 - acc: 0.9988
Epoch 4/10
 - 37s - loss: 0.0033 - acc: 0.9989
Epoch 5/10
 - 37s - loss: 0.0030 - acc: 0.9989
Epoch 6/10
 - 38s - loss: 0.0031 - acc: 0.9988
Epoch 7/10
 - 37s - loss: 0.0032 - acc: 0.9989
Epoch 8/10
 - 37s - loss: 0.0030 - acc: 0.9989
Epoch 9/10
 - 37s - loss: 0.0030 - acc: 0.9990
Epoch 10/10
 - 37s - loss: 0.0030 - acc: 0.9989


<keras.callbacks.History at 0x26475218f60>

In [9]:
# Score the trained network on the held-out test split; evaluate() returns
# the loss followed by the metrics passed to compile() (here: accuracy).
model_loss, model_accuracy = model.evaluate(
    x=X_test_vec,
    y=y_test_categorical,
    verbose=2,
)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: 0.004097415732244517, Accuracy: 0.99876


In [10]:
# Spot-check the model on the first ten rows of the held-out test split.
#
# `Sequential.predict_classes` has been removed from current Keras releases;
# taking the argmax over the softmax outputs of `predict` gives the same class
# indices and works on old and new versions alike.
encoded_predictions = model.predict(X_test_vec[:10]).argmax(axis=-1)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
print(f"Predicted classes: {prediction_labels}")
# Compare against the labels of the SAME rows: the predictions come from
# X_test_vec, so the ground truth must come from y_test (not y_train).
print(f"Actual Labels: {list(y_test[:10])}")

Predicted classes: ['Success' 'Success' 'Success' 'Success' 'Success' 'Success' 'Success'
 'Success' 'Success' 'Success']
Actual Labels: ['Success', 'Success', 'Success', 'Success', 'Success', 'Success', 'Success', 'Success', 'Success', 'Success']


In [11]:
# Persist the trained network to an HDF5 file in the working directory so it
# can be reloaded later without retraining.
model.save(filepath="auth_model_trained.h5")

In [None]:
# Read the trained network back from the HDF5 file written by the save cell,
# replacing the in-memory `model`.
from keras.models import load_model

model = load_model(filepath="auth_model_trained.h5")

In [13]:
# Stand-alone inference: everything needed to run the trained model in a fresh
# kernel, using the artifacts written by the training cells
# (vectorizer.pkl, label_encoder.pkl, auth_model_trained.h5).

# import dependencies
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone `joblib` package and fall back to the old
# location only for legacy environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from keras.models import load_model

# Load the fitted vectorizer and label encoder.
# SECURITY NOTE: joblib files are pickles, and unpickling can execute arbitrary
# code. Only load .pkl files that you produced yourself or otherwise trust.
vectorizer = joblib.load('vectorizer.pkl')
label_encoder = joblib.load('label_encoder.pkl')

# load the trained network
model = load_model("auth_model_trained.h5")

# The input is assumed to be in the same format as the original auth.txt.gz
# file, minus the trailing SuccessFailure column (that is what gets predicted).

# column names for the label-free input
header = ["Time", "SourceUserDomain", "DestUserDomain", "SourceComputer", "DestComputer", "AuthType", "LogonType", "AuthOrientation"]

# Read the data to score (replace the path with your own file), name the
# columns, and drop the ones the model was not trained on. Each dropped column
# is listed exactly once (the earlier list repeated "DestUserDomain").
test_df = pd.read_csv('./Data/CSVs/tester.csv', header=None)
test_df.columns = header
test_df = test_df.drop(["Time", "DestUserDomain", "SourceComputer", "DestComputer", "AuthType"], axis=1)

# Build the same space-separated text representation used for training.
test = test_df["SourceUserDomain"] + ' ' + test_df["LogonType"] + ' ' + test_df["AuthOrientation"]

# Vectorize, predict, and map the class indices back to label names.
# `Sequential.predict_classes` has been removed from current Keras releases;
# argmax over the softmax outputs of `predict` yields the same class indices.
test_vec = vectorizer.transform(test)
encoded_test = np.argmax(model.predict(test_vec), axis=-1)
predict_label = label_encoder.inverse_transform(encoded_test)

# Report one line per input row. A single-row file prints exactly what it did
# before; a multi-row file no longer has every prediction after the first
# silently discarded.
for label in predict_label:
    print("Logon Prediction: " + label)

Logon Succes or Failure: Success
