In [3]:
import pandas as pd

# Load the dataset (Replace 'your_file.csv' with the actual filename)
df = pd.read_csv('/kaggle/input/syntheticdata/dataSynthetic.csv')

# Define the columns of interest
columns_of_interest = [
    "Pod Status", "Pod Reason", "Pod Event Type", "Pod Event Reason", 
    "Pod Event Source", "Pod Event Reason", "Event Reason", "Event Source"
]

# Check which columns exist in the dataset (to avoid KeyErrors)
existing_columns = [col for col in columns_of_interest if col in df.columns]

# Get unique values for the existing string columns
unique_string_values = {col: df[col].dropna().unique().tolist() for col in existing_columns}

# Print the results
for col, values in unique_string_values.items():
    print(f"Column: {col}\nUnique String Values: {values}\n")


Column: Pod Status
Unique String Values: ['Running', 'Unknown', 'CrashLoopBackOff', 'Pending', 'Error', 'ContainerCreating', 'NotFound']

Column: Pod Reason
Unique String Values: []

Column: Pod Event Type

Column: Pod Event Reason
Unique String Values: ['Pulled', 'Created', 'BackOff', 'Started', 'NodeNotReady', 'Pulling', 'Killing']

Column: Pod Event Source
Unique String Values: ['kubelet', 'node-controller']

Column: Event Reason
Unique String Values: ['OOMKilling', 'RegisteredNode', 'NoVMEventScheduled', 'No recent events', 'KubeletIsUp', 'NodeNotReady', 'RedeployScheduled', 'ContainerRuntimeIsUp']

Column: Event Source
Unique String Values: ['kernel-monitor', 'node-controller', 'custom-scheduledevents-consolidated-condition-plugin-monitor', 'kubelet-custom-plugin-monitor', 'container-runtime-custom-plugin-monitor', 'custom-scheduledevents-consolidated-plugin-monitor']



In [4]:
import pandas as pd

# Load the dataset (Replace 'your_file.csv' with the actual filename)
df = pd.read_csv('/kaggle/input/syntheticdata/dataSynthetic.csv')

# Trim column names to remove any leading/trailing spaces
df.rename(columns=lambda x: x.strip(), inplace=True)

# 🚨 **Pod Status Failures**
pod_status_failure = df["Pod Status"].isin(["Unknown", "CrashLoopBackOff", "Error", "Pending", "NotFound"])

# 🚨 **Pod Event Type Issues (Warnings)**
pod_event_failure = df["Pod Event Type"].isin(["Warning"])

# 🚨 **Pod Event Reasons Indicating Failure**
pod_event_reason_failure = df["Pod Event Reason"].isin(["BackOff", "NodeNotReady", "Killing"])

# 🚨 **System Event Failures (Critical Issues)**
event_reason_failure = df["Event Reason"].isin(["OOMKilling", "NoVMEventScheduled"])

# 🚨 **High Resource Usage (CPU/Memory Overload)**
resource_failure = (df["CPU Usage (%)"] > 90) | (df["Memory Usage (%)"] > 90)

# 🚨 **Pod Restarts (Frequent Restarts Indicate Failures)**
pod_restart_failure = df["Pod Restarts"] > 5

# 🚨 **Network Issues (Packet Drops)**
network_failure = (df["Network Receive Packets Dropped (p/s)"] > 0) | (df["Network Transmit Packets Dropped (p/s)"] > 0)

# 🚨 **Disk I/O Failures (High Read/Write Activity)**
disk_failure = (df["FS Reads Total (MB)"] > 1000) | (df["FS Writes Total (MB)"] > 1000)

# Combine all failure conditions
df["Fail"] = (
    pod_status_failure | 
    pod_event_failure | 
    pod_event_reason_failure | 
    event_reason_failure | 
    resource_failure | 
    pod_restart_failure | 
    network_failure | 
    disk_failure
).astype(int)

# Print failure statistics
print("Failure Count Summary:\n", df["Fail"].value_counts())

# Print a sample of failure cases
print("\nSample Failure Cases:\n", df[df["Fail"] == 1].head())

# df is now updated with the "Fail" column and ready for further analysis


Failure Count Summary:
 Fail
1    95307
0     4693
Name: count, dtype: int64

Sample Failure Cases:
              Timestamp                                           Pod Name  \
0  2024-04-19 21:05:48          opentelemetry-demo-redis-68779558bb-4mjtr   
1  2024-04-19 20:58:41       opentelemetry-demo-frontend-76f486559f-szlsh   
2  2024-04-18 22:13:38  opentelemetry-demo-frauddetectionservice-64cb6...   
3  2024-04-22 12:30:14  opentelemetry-demo-frauddetectionservice-64cb6...   
4  2024-04-22 15:54:45  opentelemetry-demo-recommendationservice-7697d...   

   CPU Usage (%)  Memory Usage (%) Pod Status  Pod Reason  Pod Restarts  \
0       0.230313         53.101612    Running         NaN             0   
1      14.612577          0.141409    Running         NaN             2   
2       0.603365         12.191371    Running         NaN             0   
3       0.077004          0.201204    Unknown         NaN             0   
4       1.060736         43.678164    Running         NaN    

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [5]:
# Filter only failed cases where Fail == 1
df_failed = df[df["Fail"] == 1]
df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Timestamp,Pod Name,CPU Usage (%),Memory Usage (%),Pod Status,Pod Reason,Pod Restarts,Ready Containers,Total Containers,Pod Event Type,...,Network Transmit Packets (p/s),Network Receive Packets Dropped (p/s),Network Transmit Packets Dropped (p/s),FS Reads Total (MB),FS Writes Total (MB),FS Reads/Writes Total (MB),FS Reads Bytes Total (MB),FS Writes Bytes Total (MB),FS Reads/Writes Bytes Total (MB),Fail
0,2024-04-19 21:05:48,opentelemetry-demo-redis-68779558bb-4mjtr,0.230313,53.101612,Running,,0,1,1,No recent events,...,1.135357,2.153989,1.185063,0.000485,0.001514,0.001785,0.000310,0.000000,0.000331,1
1,2024-04-19 20:58:41,opentelemetry-demo-frontend-76f486559f-szlsh,14.612577,0.141409,Running,,2,1,1,Normal,...,356.949251,1.460284,10.050480,0.001362,0.000666,0.001240,0.000049,0.000000,0.000000,1
2,2024-04-18 22:13:38,opentelemetry-demo-frauddetectionservice-64cb6...,0.603365,12.191371,Running,,0,1,1,Normal,...,3.042058,3.968989,4.581325,0.001177,0.328530,0.357402,0.000326,0.003142,0.003002,1
3,2024-04-22 12:30:14,opentelemetry-demo-frauddetectionservice-64cb6...,0.077004,0.201204,Unknown,,0,0,0,No recent events,...,1.345098,0.861935,5.033863,0.000000,0.000771,0.000000,0.000396,0.000000,0.000205,1
4,2024-04-22 15:54:45,opentelemetry-demo-recommendationservice-7697d...,1.060736,43.678164,Running,,0,1,1,No recent events,...,0.094615,2.542782,7.614585,0.000000,0.001226,0.000493,0.000000,0.000279,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2024-04-18 21:34:18,opentelemetry-demo-frontendproxy-588c77dd7c-l4wcd,3.726941,0.276247,Running,,0,1,1,Normal,...,480.307024,12.942186,6.872772,0.001660,0.002174,0.001344,0.000976,0.000553,0.000000,1
99996,2024-04-18 15:32:45,opentelemetry-demo-emailservice-dd9b599db-hctwt,0.012213,4.863887,Running,,0,1,1,Normal,...,0.000000,0.000000,0.000000,0.000000,0.000220,0.001230,0.000261,0.000000,0.000530,0
99997,2024-04-18 15:57:40,opentelemetry-demo-shippingservice-86ccddbd5b-...,0.009438,0.983323,Running,,0,1,1,Normal,...,0.007754,0.000000,0.011464,0.000000,0.001411,0.000000,0.001104,0.000000,0.000000,1
99998,2024-04-22 11:01:44,opentelemetry-demo-shippingservice-86ccddbd5b-...,0.009156,60.118184,Running,,0,1,1,No recent events,...,0.025412,0.000000,0.030217,0.000028,0.001853,0.000000,0.000162,0.000000,0.000000,1


In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout, Input
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

# 🔹 Load the preprocessed dataset
df = df.copy()  # Ensure df has the "Fail" column
df_failed = df[df["Fail"] == 1]
df_normal = df[df["Fail"] == 0]

# 🔹 Select numerical features for training
features = [
    "CPU Usage (%)", "Memory Usage (%)", "Pod Restarts",
    "Memory Usage (MB)", "Memory Requests (%)", 
    "Network Receive Bytes", "Network Transmit Bytes",
    "FS Reads Total (MB)", "FS Writes Total (MB)"
]

# 🔹 Normalize the dataset
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# 🔹 Convert DataFrame to NumPy arrays
X = df[features].values
y = df["Fail"].values

# 🔹 Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =====================================================================
# 🔹 1️⃣ LSTM Model for Failure Prediction
# =====================================================================
lstm_model = Sequential([
    LSTM(64, activation="relu", return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(32, activation="relu"),
    Dense(1, activation="sigmoid")
])

lstm_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
lstm_model.fit(X_train.reshape(-1, X_train.shape[1], 1), y_train, epochs=10, batch_size=32)

# Evaluate LSTM
y_pred_lstm = (lstm_model.predict(X_test.reshape(-1, X_test.shape[1], 1)) > 0.5).astype(int)
print("\n🔹 LSTM Classification Report:\n", classification_report(y_test, y_pred_lstm))

# =====================================================================
# 🔹 2️⃣ GRU Model
# =====================================================================
gru_model = Sequential([
    GRU(64, activation="relu", return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    GRU(32, activation="relu"),
    Dense(1, activation="sigmoid")
])

gru_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
gru_model.fit(X_train.reshape(-1, X_train.shape[1], 1), y_train, epochs=10, batch_size=32)

# Evaluate GRU
y_pred_gru = (gru_model.predict(X_test.reshape(-1, X_test.shape[1], 1)) > 0.5).astype(int)
print("\n🔹 GRU Classification Report:\n", classification_report(y_test, y_pred_gru))

# =====================================================================
# 🔹 3️⃣ Autoencoder for Anomaly Detection
# =====================================================================
input_dim = X_train.shape[1]

autoencoder = Sequential([
    Dense(64, activation="relu", input_shape=(input_dim,)),
    Dense(32, activation="relu"),
    Dense(64, activation="relu"),
    Dense(input_dim, activation="sigmoid")
])

autoencoder.compile(optimizer="adam", loss="mse")
autoencoder.fit(X_train, X_train, epochs=10, batch_size=32)

# Reconstruction error for anomalies
reconstruction_error = np.mean(np.abs(autoencoder.predict(X_test) - X_test), axis=1)
threshold = np.percentile(reconstruction_error, 95)
y_pred_autoencoder = (reconstruction_error > threshold).astype(int)
print("\n🔹 Autoencoder Classification Report:\n", classification_report(y_test, y_pred_autoencoder))

# =====================================================================
# 🔹 4️⃣ Isolation Forest for Anomaly Detection
# =====================================================================
iso_forest = IsolationForest(contamination=0.1, random_state=42)
iso_forest.fit(X_train)
y_pred_iso = iso_forest.predict(X_test)
y_pred_iso = np.where(y_pred_iso == -1, 1, 0)  # Convert -1 (anomaly) to 1 (failure)

print("\n🔹 Isolation Forest Classification Report:\n", classification_report(y_test, y_pred_iso))

# =====================================================================
# 🔹 5️⃣ LLMs (Mistral, BERT, GPT) for Log Analysis (Optional)
# =====================================================================
# You can fine-tune an LLM using Hugging Face Transformers for failure prediction
# Example: Fine-tuning a BERT model to classify failure-related logs
# Uncomment below for LLM-based failure classification

# from transformers import pipeline
# model_name = "bert-base-uncased"
# failure_detector = pipeline("text-classification", model=model_name)
# sample_logs = df["Pod Event Message"].fillna("").tolist()
# failure_predictions = failure_detector(sample_logs)
# print("\n🔹 LLM Failure Predictions:\n", failure_predictions)

  super().__init__(**kwargs)


Epoch 1/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.9520 - loss: 0.1987
Epoch 2/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9525 - loss: 0.1598
Epoch 3/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9521 - loss: 0.1498
Epoch 4/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9515 - loss: 0.1414
Epoch 5/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9528 - loss: 0.1298
Epoch 6/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9535 - loss: 0.1221
Epoch 7/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9548 - loss: 0.1140
Epoch 8/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9546 - loss: 0.1122
Epoch 9/10
[1m2500/250

  super().__init__(**kwargs)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.9533 - loss: 0.1955
Epoch 2/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.9521 - loss: 0.1613
Epoch 3/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.9525 - loss: 0.1436
Epoch 4/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9524 - loss: 0.1298
Epoch 5/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9546 - loss: 0.1192
Epoch 6/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.9554 - loss: 0.1155
Epoch 7/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.9569 - loss: 0.1085
Epoch 8/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.9566 - loss: 0.1063
Epoch 9/10
[1m2500/2500[0

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - loss: 0.6715
Epoch 2/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.5874
Epoch 3/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.5935
Epoch 4/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.5862
Epoch 5/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.5919
Epoch 6/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.5931
Epoch 7/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.5898
Epoch 8/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.5903
Epoch 9/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.5853
Epoch 10/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/