In [16]:
import pandas as pd 
import re
import nltk
nltk.download("punkt")
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from datetime import datetime   

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


DATA LOADING AND INSPECTION

In [17]:
# Load the structured log data
structured_log_df = pd.read_csv("Linux_2k.log_structured.csv")
structured_log_df
# Load template csv for labeling insights
template_df = pd.read_csv("Linux_2k.log_templates.csv")
template_df

# Check if required missing
missing_values = structured_log_df.isnull().sum()
missing_values

LineId             0
Month              0
Date               0
Time               0
Level              0
Component          0
PID              151
Content            0
EventId            0
EventTemplate      0
dtype: int64

PREPROCESSING

In [18]:
# Labeling
# Generalized criteria for harmful log messages
harmful_criteria = [
    "authentication failure",
    "user unknown",
    "permission denied",
    "error",
    "failed",
    "denied",
    "abnormal",
    "critical"
    ] 
structured_log_df["Label"] = structured_log_df["Content"].apply(lambda x: "harmful" if any(criterion in str(x) for criterion in harmful_criteria) else "non-harmful"
)                                                                             
structured_log_df.to_csv("labeled.csv")


In [19]:
# Mapping 'Label' to numerical values: "harmful" -> 1, "normal" -> 0
label_mapping = {"harmful": -1, "non-harmful": 1}
structured_log_df["Label"] = structured_log_df["Label"].map(label_mapping)


In [20]:
# Assuming a default year for the datetime conversion
default_year = 2023

# Creating a new column 'Timestamp' by concatenating 'Month', 'Date', 'Time', and the default year
structured_log_df["Timestamp"] = structured_log_df.apply(lambda row: datetime.strptime(f"{default_year} {row['Month']} {row['Date']} {row['Time']}", '%Y %b %d %H:%M:%S'), axis=1)

In [21]:
#Cleaning Content
def clean_text(text):
    # Remove special characters and symbols
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    
    # Remove extra white spaces
    text = text.strip()
    
    # Lowercasing
    text = text.lower()

    return text

# Apply the cleaning function to the 'Content' column
structured_log_df["Content"] = structured_log_df["Content"].apply(clean_text)

In [22]:
#Tokenization
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# Apply the tokenization function to the 'Content' column
structured_log_df["Content"] = structured_log_df["Content"].apply(tokenize_text)


In [23]:
# Stop word removal
# Defining a basic list of English stop words
basic_stopwords = set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
    "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
    "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from", "up", "down",
    "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here",
    "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
    "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
    "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
])

# Removing stop words
structured_log_df["Content"] = structured_log_df["Content"].apply(lambda x: [word for word in x if word not in basic_stopwords])

In [24]:
# Lemmitization
# Download the necessary NLTK data 
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to get the WordNet part of speech tag
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith("J"):
        return wordnet.ADJ
    elif treebank_tag.startswith("V"):
        return wordnet.VERB
    elif treebank_tag.startswith("N"):
        return wordnet.NOUN
    elif treebank_tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if unknown

# Function to lemmatize a list of tokens
def lemmatize_tokens(tokens):
    # Get the part of speech for each token
    pos_tags = nltk.pos_tag(tokens)
    
    # Apply lemmatization considering the part of speech
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tags]
    return lemmatized_tokens

# Apply the lemmatization function
structured_log_df["Content"] = structured_log_df["Content"].apply(lemmatize_tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Since the data is already tokenized, we define a tokenizer function that just returns its input
def identity_tokenizer(text):
    return text

# Initialize the TF-IDF Vectorizer with the identity tokenizer
vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False)

# Fit and transform the 'Content' column to get the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(structured_log_df["Content"])

# Show the shape of the TF-IDF matrix
tfidf_matrix.shape



(2000, 340)

MODEL TRAINING

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Fit and transform the 'Content' column to get the TF-IDF features
X = tfidf_matrix

# The labels are directly taken from the 'Label' column
y = structured_log_df["Label"]

In [28]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [31]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    "C": [0.1, 1, 10, 100],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"]
}

# Initialize the Grid Search with cross-validation
grid_search = GridSearchCV(SVC(), param_grid, refit=True, verbose=2, cv=5)

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Find the best parameters
best_parameters = grid_search.best_params_

# Retrieve the best model from the grid search
best_svm_model = grid_search.best_estimator_

best_parameters


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   0.0s
[CV] END ...................C=0.1, gamma=auto, k

{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}

MODEL EVALUATION

In [32]:
# Test on training split
y_pred = best_svm_model.predict(X_test)

# Evaluate the model's performance
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[133   0]
 [  0 267]]
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       133
           1       1.00      1.00      1.00       267

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

