In [23]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, precision_recall_fscore_support
import matplotlib.pyplot as plt 

In [24]:
df = pd.read_csv('dataset/Severity_Clear_10to6.csv') 
df 

Unnamed: 0,date_time,host_ip,host_name,host_category,host_status,severity,CPU_load,CPU_Util,Mem_util,In_traffic,Out_traffic
0,2023-01-01 00:00:00,10.20.20.4,DQP_Server1,Server,Device Down,Critical,0,0,0,0,0
1,2023-01-01 01:00:00,10.20.20.4,DQP_Server1,Server,Device OK,Clear,9,9,9,9,9
2,2023-01-01 02:00:00,10.20.20.4,DQP_Server1,Server,Device OK,Clear,7,7,7,7,7
3,2023-01-01 03:00:00,10.20.20.4,DQP_Server1,Server,Device OK,Clear,5,5,5,5,5
4,2023-01-01 04:00:00,10.20.20.4,DQP_Server1,Server,Device OK,Clear,6,6,6,6,6
...,...,...,...,...,...,...,...,...,...,...,...
17515,2023-12-31 19:00:00,10.20.20.1,DQP_Sw,Switch,Device OK,Clear,6,6,6,6,6
17516,2023-12-31 20:00:00,10.20.20.1,DQP_Sw,Switch,Device Down,Critical,0,0,0,0,0
17517,2023-12-31 21:00:00,10.20.20.1,DQP_Sw,Switch,Device OK,Clear,6,6,6,6,6
17518,2023-12-31 22:00:00,10.20.20.1,DQP_Sw,Switch,Device OK,Clear,6,6,6,6,6


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17520 entries, 0 to 17519
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date_time      17520 non-null  object
 1   host_ip        17520 non-null  object
 2   host_name      17520 non-null  object
 3   host_category  17520 non-null  object
 4   host_status    17520 non-null  object
 5   severity       17520 non-null  object
 6   CPU_load       17520 non-null  int64 
 7   CPU_Util       17520 non-null  int64 
 8   Mem_util       17520 non-null  int64 
 9   In_traffic     17520 non-null  int64 
 10  Out_traffic    17520 non-null  int64 
dtypes: int64(5), object(6)
memory usage: 1.5+ MB


In [26]:
# np.random.seed(42)
# tf.random.set_seed(42)
 
df = df[['date_time','CPU_load', 'CPU_Util', 'Mem_util', 'In_traffic', 'Out_traffic', 'host_ip', 'host_status']]
df 

Unnamed: 0,date_time,CPU_load,CPU_Util,Mem_util,In_traffic,Out_traffic,host_ip,host_status
0,2023-01-01 00:00:00,0,0,0,0,0,10.20.20.4,Device Down
1,2023-01-01 01:00:00,9,9,9,9,9,10.20.20.4,Device OK
2,2023-01-01 02:00:00,7,7,7,7,7,10.20.20.4,Device OK
3,2023-01-01 03:00:00,5,5,5,5,5,10.20.20.4,Device OK
4,2023-01-01 04:00:00,6,6,6,6,6,10.20.20.4,Device OK
...,...,...,...,...,...,...,...,...
17515,2023-12-31 19:00:00,6,6,6,6,6,10.20.20.1,Device OK
17516,2023-12-31 20:00:00,0,0,0,0,0,10.20.20.1,Device Down
17517,2023-12-31 21:00:00,6,6,6,6,6,10.20.20.1,Device OK
17518,2023-12-31 22:00:00,6,6,6,6,6,10.20.20.1,Device OK


In [27]:
 df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17520 entries, 0 to 17519
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date_time    17520 non-null  object
 1   CPU_load     17520 non-null  int64 
 2   CPU_Util     17520 non-null  int64 
 3   Mem_util     17520 non-null  int64 
 4   In_traffic   17520 non-null  int64 
 5   Out_traffic  17520 non-null  int64 
 6   host_ip      17520 non-null  object
 7   host_status  17520 non-null  object
dtypes: int64(5), object(3)
memory usage: 1.1+ MB


In [28]:
# shuffled_data = data.sample(frac=1, random_state=42)  # frac=1 means shuffle all rows, random_state for reproducibility
 
# # Sort shuffled data according to 'date_time'
# data = shuffled_data.sort_values(by='date_time')
 
# Feature engineering and preprocessing
df['date_time'] = pd.to_datetime(df['date_time'])
df.set_index('date_time', inplace=True)
df 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_time'] = pd.to_datetime(df['date_time'])


Unnamed: 0_level_0,CPU_load,CPU_Util,Mem_util,In_traffic,Out_traffic,host_ip,host_status
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-01-01 00:00:00,0,0,0,0,0,10.20.20.4,Device Down
2023-01-01 01:00:00,9,9,9,9,9,10.20.20.4,Device OK
2023-01-01 02:00:00,7,7,7,7,7,10.20.20.4,Device OK
2023-01-01 03:00:00,5,5,5,5,5,10.20.20.4,Device OK
2023-01-01 04:00:00,6,6,6,6,6,10.20.20.4,Device OK
...,...,...,...,...,...,...,...
2023-12-31 19:00:00,6,6,6,6,6,10.20.20.1,Device OK
2023-12-31 20:00:00,0,0,0,0,0,10.20.20.1,Device Down
2023-12-31 21:00:00,6,6,6,6,6,10.20.20.1,Device OK
2023-12-31 22:00:00,6,6,6,6,6,10.20.20.1,Device OK


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17520 entries, 2023-01-01 00:00:00 to 2023-12-31 23:00:00
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CPU_load     17520 non-null  int64 
 1   CPU_Util     17520 non-null  int64 
 2   Mem_util     17520 non-null  int64 
 3   In_traffic   17520 non-null  int64 
 4   Out_traffic  17520 non-null  int64 
 5   host_ip      17520 non-null  object
 6   host_status  17520 non-null  object
dtypes: int64(5), object(2)
memory usage: 1.1+ MB


In [30]:
# Identify categorical columns (assuming they are of type 'object')
categorical_columns = df.select_dtypes(include=['object']).columns
 
# Apply label encoding to all categorical columns
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = le.fit_transform(df[column])


Unnamed: 0_level_0,CPU_load,CPU_Util,Mem_util,In_traffic,Out_traffic,host_ip,host_status
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-01-01 00:00:00,0,0,0,0,0,1,0
2023-01-01 01:00:00,9,9,9,9,9,1,1
2023-01-01 02:00:00,7,7,7,7,7,1,1
2023-01-01 03:00:00,5,5,5,5,5,1,1
2023-01-01 04:00:00,6,6,6,6,6,1,1
...,...,...,...,...,...,...,...
2023-12-31 19:00:00,6,6,6,6,6,0,1
2023-12-31 20:00:00,0,0,0,0,0,0,0
2023-12-31 21:00:00,6,6,6,6,6,0,1
2023-12-31 22:00:00,6,6,6,6,6,0,1


In [33]:
# List of features to scale
columns_to_scale = ['CPU_load', 'CPU_Util', 'Mem_util', 'In_traffic', 'Out_traffic']
 
# Normalize the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[columns_to_scale])
scaled_features

array([[-1.00921127, -1.08556812, -1.09529539, -1.08937509, -1.09242194],
       [-0.53766306, -0.55125553, -0.55082103, -0.55187669, -0.55039613],
       [-0.64245155, -0.66999166, -0.67181533, -0.67132078, -0.67084631],
       ...,
       [-0.6948458 , -0.72935973, -0.73231248, -0.73104282, -0.7310714 ],
       [-0.6948458 , -0.72935973, -0.73231248, -0.73104282, -0.7310714 ],
       [-0.5900573 , -0.6106236 , -0.61131818, -0.61159873, -0.61062122]])

In [35]:
# Combine scaled numerical features with encoded categorical features
encoded_features = df[categorical_columns].values
encoded_features

array([[1, 0],
       [1, 1],
       [1, 1],
       ...,
       [0, 1],
       [0, 1],
       [0, 1]])

In [38]:
combined_features = np.hstack((scaled_features, encoded_features))
pd.DataFrame(combined_features,columns=df.columns) # to show the combined features 

Unnamed: 0,CPU_load,CPU_Util,Mem_util,In_traffic,Out_traffic,host_ip,host_status
0,-1.009211,-1.085568,-1.095295,-1.089375,-1.092422,1.0,0.0
1,-0.537663,-0.551256,-0.550821,-0.551877,-0.550396,1.0,1.0
2,-0.642452,-0.669992,-0.671815,-0.671321,-0.670846,1.0,1.0
3,-0.747240,-0.788728,-0.792810,-0.790765,-0.791296,1.0,1.0
4,-0.694846,-0.729360,-0.732312,-0.731043,-0.731071,1.0,1.0
...,...,...,...,...,...,...,...
17515,-0.694846,-0.729360,-0.732312,-0.731043,-0.731071,0.0,1.0
17516,-1.009211,-1.085568,-1.095295,-1.089375,-1.092422,0.0,0.0
17517,-0.694846,-0.729360,-0.732312,-0.731043,-0.731071,0.0,1.0
17518,-0.694846,-0.729360,-0.732312,-0.731043,-0.731071,0.0,1.0


In [39]:
combined_features.shape

(17520, 7)

In [40]:
df['host_status'].values[0 + 48]

1

In [41]:
len(combined_features)

17520

In [42]:
# This process is particularly useful in time series analysis and sequence modeling tasks
def create_sequences(combined_features, target_column, window_size):
    sequences = []
    for i in range(len(combined_features) - window_size):
        sequences.append((combined_features[i:i + window_size], target_column[i + window_size]))
    return np.array(sequences)

window_size = 48 # window_size: The size of the sliding window to create sequences,a window size of 48 is specified, which likely corresponds to 48 hours (2 days) of data.

sequences = create_sequences(combined_features, df['host_status'].values, window_size)
sequences

  return np.array(sequences)


array([[array([[-1.00921127, -1.08556812, -1.09529539, -1.08937509, -1.09242194,
                 1.        ,  0.        ],
               [-0.53766306, -0.55125553, -0.55082103, -0.55187669, -0.55039613,
                 1.        ,  1.        ],
               [-0.64245155, -0.66999166, -0.67181533, -0.67132078, -0.67084631,
                 1.        ,  1.        ],
               [-0.74724004, -0.78872779, -0.79280964, -0.79076487, -0.79129649,
                 1.        ,  1.        ],
               [-0.6948458 , -0.72935973, -0.73231248, -0.73104282, -0.7310714 ,
                 1.        ,  1.        ],
               [-0.5900573 , -0.6106236 , -0.61131818, -0.61159873, -0.61062122,
                 1.        ,  1.        ],
               [-0.48526881, -0.49188747, -0.49032388, -0.49215464, -0.49017104,
                 1.        ,  1.        ],
               [-0.5900573 , -0.6106236 , -0.61131818, -0.61159873, -0.61062122,
                 1.        ,  1.        ],
        

In [44]:
sequences.shape

(17472, 2)

In [45]:
display(pd.DataFrame(sequences[0][0]))

Unnamed: 0,0,1,2,3,4,5,6
0,-1.009211,-1.085568,-1.095295,-1.089375,-1.092422,1.0,0.0
1,-0.537663,-0.551256,-0.550821,-0.551877,-0.550396,1.0,1.0
2,-0.642452,-0.669992,-0.671815,-0.671321,-0.670846,1.0,1.0
3,-0.74724,-0.788728,-0.79281,-0.790765,-0.791296,1.0,1.0
4,-0.694846,-0.72936,-0.732312,-0.731043,-0.731071,1.0,1.0
5,-0.590057,-0.610624,-0.611318,-0.611599,-0.610621,1.0,1.0
6,-0.485269,-0.491887,-0.490324,-0.492155,-0.490171,1.0,1.0
7,-0.590057,-0.610624,-0.611318,-0.611599,-0.610621,1.0,1.0
8,-0.694846,-0.72936,-0.732312,-0.731043,-0.731071,1.0,1.0
9,-0.485269,-0.491887,-0.490324,-0.492155,-0.490171,1.0,1.0


In [19]:
sequences[0][1]

1

In [46]:
X = np.array([seq[0] for seq in sequences])
y = np.array([seq[1] for seq in sequences])

In [47]:
X

array([[[-1.00921127, -1.08556812, -1.09529539, ..., -1.09242194,
          1.        ,  0.        ],
        [-0.53766306, -0.55125553, -0.55082103, ..., -0.55039613,
          1.        ,  1.        ],
        [-0.64245155, -0.66999166, -0.67181533, ..., -0.67084631,
          1.        ,  1.        ],
        ...,
        [-0.74724004, -0.78872779, -0.79280964, ..., -0.79129649,
          1.        ,  1.        ],
        [-0.74724004, -0.78872779, -0.79280964, ..., -0.79129649,
          1.        ,  1.        ],
        [-0.48526881, -0.49188747, -0.49032388, ..., -0.49017104,
          1.        ,  1.        ]],

       [[-0.53766306, -0.55125553, -0.55082103, ..., -0.55039613,
          1.        ,  1.        ],
        [-0.64245155, -0.66999166, -0.67181533, ..., -0.67084631,
          1.        ,  1.        ],
        [-0.74724004, -0.78872779, -0.79280964, ..., -0.79129649,
          1.        ,  1.        ],
        ...,
        [-0.74724004, -0.78872779, -0.79280964, ..., -

In [48]:
pd.DataFrame(X[0])

Unnamed: 0,0,1,2,3,4,5,6
0,-1.009211,-1.085568,-1.095295,-1.089375,-1.092422,1.0,0.0
1,-0.537663,-0.551256,-0.550821,-0.551877,-0.550396,1.0,1.0
2,-0.642452,-0.669992,-0.671815,-0.671321,-0.670846,1.0,1.0
3,-0.74724,-0.788728,-0.79281,-0.790765,-0.791296,1.0,1.0
4,-0.694846,-0.72936,-0.732312,-0.731043,-0.731071,1.0,1.0
5,-0.590057,-0.610624,-0.611318,-0.611599,-0.610621,1.0,1.0
6,-0.485269,-0.491887,-0.490324,-0.492155,-0.490171,1.0,1.0
7,-0.590057,-0.610624,-0.611318,-0.611599,-0.610621,1.0,1.0
8,-0.694846,-0.72936,-0.732312,-0.731043,-0.731071,1.0,1.0
9,-0.485269,-0.491887,-0.490324,-0.492155,-0.490171,1.0,1.0


In [50]:
X.shape

(17472, 48, 7)

In [49]:
y

array([1, 1, 1, ..., 1, 1, 1])

In [51]:
 
# Ensure labels are integers (necessary for sparse_categorical_crossentropy)
y = y.astype(int)

# Split df into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train

array([[[-1.00921127, -1.08556812, -1.09529539, ..., -1.09242194,
          1.        ,  0.        ],
        [-0.53766306, -0.55125553, -0.55082103, ..., -0.55039613,
          1.        ,  1.        ],
        [-0.64245155, -0.66999166, -0.67181533, ..., -0.67084631,
          1.        ,  1.        ],
        ...,
        [-0.74724004, -0.78872779, -0.79280964, ..., -0.79129649,
          1.        ,  1.        ],
        [-0.74724004, -0.78872779, -0.79280964, ..., -0.79129649,
          1.        ,  1.        ],
        [-0.48526881, -0.49188747, -0.49032388, ..., -0.49017104,
          1.        ,  1.        ]],

       [[-0.53766306, -0.55125553, -0.55082103, ..., -0.55039613,
          1.        ,  1.        ],
        [-0.64245155, -0.66999166, -0.67181533, ..., -0.67084631,
          1.        ,  1.        ],
        [-0.74724004, -0.78872779, -0.79280964, ..., -0.79129649,
          1.        ,  1.        ],
        ...,
        [-0.74724004, -0.78872779, -0.79280964, ..., -

In [53]:
 X.shape

(17472, 48, 7)

In [52]:
(window_size, X.shape[-1])

(48, 7)

In [None]:
from tensorflow.keras.layers import LSTM
 
# Define the LSTM model for multiclass classification
model_lstm = Sequential([
    LSTM(50, activation='relu', input_shape=(window_size, X.shape[-1]), return_sequences=True),
    Dropout(0.2),
    LSTM(50, activation='relu'),
    Dropout(0.2),
    Dense(50, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])
 
model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
 
# Train the LSTM model with class weights
history_lstm = model_lstm.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping], class_weight=class_weights)
 

In [None]:
# Evaluate the LSTM model
test_loss_lstm, test_accuracy_lstm = model_lstm.evaluate(X_test, y_test)
print(f'Test Accuracy (LSTM): {test_accuracy_lstm}')
 
# Predict the test set
y_pred_prob_lstm = model_lstm.predict(X_test)
y_pred_lstm = np.argmax(y_pred_prob_lstm, axis=1)
 

In [None]:
# Calculate classification metrics
conf_matrix_lstm = confusion_matrix(y_test, y_pred_lstm)
class_report_lstm = classification_report(y_test, y_pred_lstm)
roc_auc_lstm = roc_auc_score(y_test, y_pred_prob_lstm, multi_class='ovo', average='macro')
 
precision_lstm, recall_lstm, f1_lstm, _ = precision_recall_fscore_support(y_test, y_pred_lstm, average='macro')
sensitivity_lstm = recall_lstm
 
print(f"Confusion Matrix:\n{conf_matrix_lstm}")
print(f"Classification Report:\n{class_report_lstm}")
print(f"ROC AUC Score: {roc_auc_lstm}")
print(f"Sensitivity: {sensitivity_lstm}")
print(f"F1 Score: {f1_lstm}")

In [None]:



 
# Plot ROC Curve for each class
fpr_lstm = {}
tpr_lstm = {}
roc_auc_lstm = {}
for i in range(num_classes):
    fpr_lstm[i], tpr_lstm[i], _ = roc_curve(y_test, y_pred_prob_lstm[:, i], pos_label=i)
    roc_auc_lstm[i] = auc(fpr_lstm[i], tpr_lstm[i])
 
plt.figure()
for i in range(num_classes):
    plt.plot(fpr_lstm[i], tpr_lstm[i], label=f'ROC curve (area = {roc_auc_lstm[i]:0.2f}) for class {i}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve - LSTM')
plt.legend(loc='lower right')
plt.show()
 
# Plot loss and accuracy over epochs
plt.figure()
plt.plot(history_lstm.history['loss'], label='train_loss')
plt.plot(history_lstm.history['val_loss'], label='val_loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss over epochs - LSTM')
plt.legend()
plt.show()
 
plt.figure()
plt.plot(history_lstm.history['accuracy'], label='train_accuracy')
plt.plot(history_lstm.history['val_accuracy'], label='val_accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Accuracy over epochs - LSTM')
plt.legend()
plt.show()
 
model_lstm.save('lstm_model.h5')
 
# Generate predictions for the next 48 time steps
num_predictions = 48
last_window = combined_features[-window_size:]  # Get the last 'window_size' rows
 
predictions_lstm = []
predicted_features_lstm = []
 
for _ in range(num_predictions):
    # Predict next value
    prediction_lstm = model_lstm.predict(last_window[np.newaxis, :, :])
    predicted_class_lstm = np.argmax(prediction_lstm, axis=1)  # Get predicted class
    predictions_lstm.append(predicted_class_lstm[0])
    # Approximate new feature values for the new time step
    new_feature_row_lstm = np.random.randn(len(columns_to_scale))  # Assuming normal distribution for simplicity
    encoded_categorical_values = np.random.randint(0, np.max(encoded_features, axis=0), size=encoded_features.shape[1])
    new_combined_features = np.hstack((new_feature_row_lstm, encoded_categorical_values))
    predicted_features_lstm.append(new_combined_features)
    # Update the window with the new prediction and feature values
    last_window = np.vstack([last_window[1:], new_combined_features])  # Slide window
 
# Convert predictions back to original labels
predicted_labels_lstm = label_encoders['host_status'].inverse_transform(predictions_lstm)
 
# Separate the scaled numerical and encoded categorical features
predicted_features_lstm = np.array(predicted_features_lstm)
predicted_numerical_features_lstm = predicted_features_lstm[:, :len(columns_to_scale)]
predicted_categorical_features_lstm = predicted_features_lstm[:, len(columns_to_scale):]
 
# Convert scaled predicted numerical features back to original scale
predicted_numerical_original_scale_lstm = scaler.inverse_transform(predicted_numerical_features_lstm)
 
# Create a DataFrame for the next 48 time steps
future_dates_lstm = pd.date_range(start=data.index[-1] + pd.Timedelta(hours=1), periods=num_predictions, freq='H')
predicted_df_lstm = pd.DataFrame(predicted_numerical_original_scale_lstm, columns=columns_to_scale, index=future_dates_lstm)
 
# Revert the label-encoded categorical columns back to their original form
for i, column in enumerate(categorical_columns):
    predicted_df_lstm[column] = label_encoders[column].inverse_transform(predicted_categorical_features_lstm[:, i].astype(int))
 
predicted_df_lstm['host_status'] = predicted_labels_lstm
 
# Display the final DataFrame
predicted_df_lstm[['CPU_load', 'CPU_Util', 'Mem_util', 'In_traffic', 'Out_traffic', 'host_ip', 'host_status']]