In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

from gensim.models import Word2Vec
from scapy.all import PcapReader
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from read_pcaps import pcap_to_dataframe

In [None]:
# Switch read by the two data-loading cells below. When True they load previously
# saved .pkl files; when False the raw-packet cell parses the pcap captures instead
# (the feature cell has no non-pickle fallback).
READ_FROM_PKL = True

In [None]:
# Load the raw packet tables for both captures, either from cached pickles or by
# parsing the original capture files.
if READ_FROM_PKL:
    mirai_df = pd.read_pickle("../data/blog_eda/mirai.pkl")
    benign_df = pd.read_pickle("../data/blog_eda/benign.pkl")
else:
    # Open each capture in a `with` block so its file handle is closed as soon as
    # the conversion finishes (or fails) instead of staying open for the life of
    # the kernel.
    # NOTE(review): assumes pcap_to_dataframe reads the capture eagerly and returns
    # a fully materialised DataFrame - confirm against read_pcaps.py.
    with PcapReader("../data/blog_eda/mirai.pcap") as pcap_reader_mirai:
        mirai_df = pcap_to_dataframe(pcap_reader_mirai)
    with PcapReader("../data/blog_eda/benign.pcapng") as pcap_reader_benign:
        benign_df = pcap_to_dataframe(pcap_reader_benign)

In [None]:
# Load the engineered feature tables for both captures from their saved pickles.
# NOTE: unpickling can execute arbitrary code, so only load .pkl files that come
# from a source you trust (this matters if they were downloaded rather than
# generated locally).
if not READ_FROM_PKL:
    # There is no way to rebuild the features in this notebook. Stop here with the
    # instructions instead of printing them and letting a later cell fail with an
    # unrelated NameError on mirai_features_df / benign_features_df.
    raise RuntimeError(
        "Error! Feature pkl not saved. Please run blog_fe_2.ipynb, blog_fe_3.ipynb, blog_fe_4.ipynb or download from here: https://drive.google.com/drive/folders/1dBQhbQtIk_fbbb80G5pSVV3hbWYJY7fv?usp=sharing"
    )

mirai_features_df = pd.read_pickle("../data/blog_fe/mirai_features.pkl")
benign_features_df = pd.read_pickle("../data/blog_fe/benign_features.pkl")

In [None]:
# Discard every row that has at least one missing value, for both captures.
mirai_df, benign_df = (frame.dropna() for frame in (mirai_df, benign_df))

# Add labels

In [None]:
# Tag every row with its class id: 1 = malicious (Mirai capture), 0 = benign.
for frame, class_id in ((mirai_features_df, 1), (benign_features_df, 0)):
    frame["Label"] = class_id

In [None]:
# Stack the malicious and benign feature tables (in that order) into one frame and
# renumber the rows so the index is unique.
labelled_frames = (mirai_features_df, benign_features_df)
concatenated_df = pd.concat(labelled_frames, ignore_index=True)

In [None]:
# The concatenated frame holds all malicious rows followed by all benign rows;
# shuffle them so the classes are interleaved. The fixed random_state keeps the
# resulting order reproducible.
shuffled_df = shuffle(concatenated_df, random_state=42)

In [None]:
# Feature selection only works on numeric inputs. Show every column together with
# its dtype so that any non-numeric column can be spotted here; the column names on
# their own do not reveal the types.
shuffled_df.dtypes

In [None]:
# from sklearn.preprocessing import StandardScaler

# # Initialize StandardScaler
# scaler = StandardScaler()
# columns_to_scale = ["Numeric Source IP", "Numeric Destination IP"]

# # Apply Standardization to the DataFrame
# df_standardized = pd.DataFrame(scaler.fit_transform(shuffled_df[columns_to_scale]), columns=columns_to_scale)

# # Print the standardized DataFrame
# print(df_standardized)


In [None]:
# columns_to_scale = ["interarrival", "log_interarrival"]

# # Apply Standardization to the DataFrame
# df_standardized = pd.DataFrame(
#     scaler.fit_transform(shuffled_df[columns_to_scale]), columns=columns_to_scale
# )

# Print the standardized DataFrame
# print(df_standardized)

In [None]:
# df_standardized = pd.concat(
#     [df_standardized, shuffled_df.drop(columns=columns_to_scale)], axis=1
# )

In [None]:
# df_standardized = df_standardized.dropna()

In [None]:
# print("NaN values in shuffled_df:", df_standardized.isnull().sum().sum())

In [None]:
# Remove the embedding column before the numeric checks below.
# errors="ignore" keeps this cell re-runnable: shuffled_df is rebound here, so on a
# second run the column is already gone and a plain drop would raise KeyError.
shuffled_df = shuffled_df.drop(columns=["payload_embedding"], errors="ignore")

# View the remaining columns as float64; this raises if a value cannot be
# converted to a number.
shuffled_df_values = shuffled_df.values.astype(np.float64)

# Boolean mask with one entry per row: True when the row holds at least one +/-inf.
inf_rows = np.isinf(shuffled_df_values).any(axis=1)

# Keep only the rows without infinite values.
shuffled_df = shuffled_df[~inf_rows]

In [None]:
# Index labels of the rows that contain at least one +/-inf value.
# `axis` has to be passed by keyword: DataFrame.any() takes keyword-only arguments
# in pandas 2.x, so the positional form `.any(1)` raises TypeError there.
inf_rows = shuffled_df.index[np.isinf(shuffled_df).any(axis=1)]

# Remove those rows by label.
shuffled_df = shuffled_df.drop(inf_rows)

# Display the DataFrame after removing rows with infinite values
print("\nDataFrame after removing rows with infinite values:")
print(shuffled_df)

In [None]:
# Drop any row that still contains a missing value, so the frame handed to the
# train/test split has neither infinities nor NaNs.
shuffled_df = shuffled_df.dropna()

# Split data
- Train
- Test

In [None]:
# Separate the target column (0 = benign, 1 = malicious) from the feature columns.
y = shuffled_df["Label"]
X = shuffled_df.drop(columns="Label")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Filter

In [None]:
# Unwrap the training features into a plain NumPy array; anything that is not a
# DataFrame is passed through unchanged.
if isinstance(X_train, pd.DataFrame):
    X_train_np = X_train.values
else:
    X_train_np = X_train

# Same for the training labels: unwrap a Series, pass anything else through.
if isinstance(y_train, pd.Series):
    y_train_np = y_train.values
else:
    y_train_np = y_train


In [None]:
# Display the dimensions of the training feature array as a quick sanity check.
X_train_np.shape

In [None]:
# Display the training labels to confirm they were converted as expected.
y_train_np

In [None]:
# Largest finite value a float64 can represent.
float64_max = np.finfo(np.float64).max

# Element-wise flags: +/-inf entries, and entries whose magnitude exceeds float64_max.
inf_mask = np.isinf(X_train_np)
large_values_mask = np.abs(X_train_np) > float64_max

# Positions of the flagged entries, one index array per axis.
inf_indices = np.where(inf_mask)
large_values_indices = np.where(large_values_mask)

# Sorted, de-duplicated first-axis (row) indices flagged by either check.
flagged_rows = np.concatenate((inf_indices[0], large_values_indices[0]))
problematic_indices = np.unique(flagged_rows)

# Report the offending rows followed by their positions in X_train_np.
for heading, payload in (
    ("Problematic values:", X_train_np[problematic_indices]),
    ("Indices of problematic values:", problematic_indices),
):
    print(heading)
    print(payload)

In [None]:
# Filter-style feature selection: score every feature with f_classif, keep the
# k_best highest-scoring ones, then train and evaluate a random forest on them.
k_best = 5  # You can adjust this value based on your dataset and requirements
selector = SelectKBest(f_classif, k=k_best)
X_train_selected = selector.fit_transform(X_train_np, y_train_np)

# Train a classifier using the selected features
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_selected, y_train_np)

# The selector was fitted on a plain NumPy array, so give it the test set in the
# same form. Passing the DataFrame directly makes scikit-learn warn that X has
# feature names the selector was fitted without.
X_test_np = X_test.values if isinstance(X_test, pd.DataFrame) else X_test
X_test_selected = selector.transform(X_test_np)

# Make predictions and evaluate the performance
y_pred = clf.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)

print(f"Selected {k_best} features using SelectKBest and f_classif.")
print(f"Accuracy on the test set: {accuracy}")

# Wrapper

# PCA