In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import os
import pickle

# Load dataset
def load_data(file_path):
    """Read the pod dataset from CSV and convert age columns to seconds.

    The columns 'Pod Event Age' and 'Event Age', when present, are parsed as
    colon-separated durations (e.g. ``"01:02:03"`` -> 3723) and replaced by
    their integer total. A value without a colon is treated as plain seconds.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        The loaded DataFrame with the age columns converted.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If a duration component is not an integer literal.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found")

    def to_seconds(raw):
        # Walk the components right to left: seconds, minutes, hours, ...
        # so each one is weighted by the matching power of 60.
        total = 0
        for power, part in enumerate(reversed(str(raw).split(':'))):
            total += int(part) * 60 ** power
        return total

    frame = pd.read_csv(file_path)
    print("Columns:", frame.columns.tolist())
    for age_column in ('Pod Event Age', 'Event Age'):
        if age_column in frame.columns:
            frame[age_column] = frame[age_column].apply(to_seconds)
    return frame

# Feature and target columns
def get_feature_target_columns(df):
    """Split the DataFrame's columns into model features and failure targets.

    Targets are the known failure-indicator columns that are present in
    ``df``. Features are every remaining column with a numeric dtype.

    Args:
        df: Dataset whose columns should be classified.

    Returns:
        A ``(feature_columns, available_targets)`` tuple of lists. Features
        keep the column order of ``df``; targets keep the order of the known
        target list.

    Raises:
        ValueError: If none of the known target columns exist in ``df``.
    """
    target_columns = [
        'Pod_Status_Failure', 'Pod_Event_Failure', 'Pod_Event_Reason_Failure',
        'Resource_Failure', 'Pod_Restart_Failure', 'Network_Failure', 'Disk_Failure'
    ]
    available_targets = [col for col in target_columns if col in df.columns]
    if not available_targets:
        raise ValueError("No target columns found!")

    excluded = set(available_targets)
    # Select by "is numeric" rather than "is not object": text columns can
    # also be stored with the pandas string dtype (or as category/datetime),
    # and none of those can be fed to the scaler or the network.
    feature_columns = [
        col for col in df.columns
        if col not in excluded and pd.api.types.is_numeric_dtype(df[col])
    ]
    return feature_columns, available_targets

# Create a single target (any failure = 1, no failure = 0)
def create_combined_target(df, target_columns):
    """Add an 'Any_Failure' column holding the row-wise maximum of the targets.

    With 0/1 indicator targets this is 1 when at least one failure type is
    flagged and 0 otherwise.

    Args:
        df: Dataset containing the target columns; modified in place.
        target_columns: Names of the individual failure-indicator columns.

    Returns:
        The same ``df`` object, now including 'Any_Failure'.
    """
    failure_flags = df[target_columns]
    df['Any_Failure'] = failure_flags.max(axis="columns")
    return df

# Forecast features using moving averages
def forecast_features(pod_data, feature_columns, forecast_periods=1):
    """Estimate the next value of each feature for a single pod.

    With fewer than three observations the last observed value is used;
    otherwise the estimate is the mean of the last (up to five) observations.

    Args:
        pod_data: Rows belonging to one pod.
        feature_columns: Names of the feature columns to forecast.
        forecast_periods: Accepted for interface compatibility; not used by
            the current implementation.

    Returns:
        A dict mapping each feature name to its forecast value.
    """
    def estimate(series_values):
        history_length = len(series_values)
        if history_length < 3:
            # Too little history to average: carry the last value forward.
            return series_values[-1]
        window_size = min(5, history_length)
        return np.mean(series_values[-window_size:])

    return {
        feature: estimate(pod_data[feature].values)
        for feature in feature_columns
    }

# Build and train a Keras single-output model
def build_and_train_model(X_train, y_train, epochs=50, batch_size=32):
    """Build a small dense binary classifier and fit it with early stopping.

    The network is two ReLU hidden layers (128 and 64 units), each followed
    by 30% dropout, and a single sigmoid output named 'output'.

    Args:
        X_train: Training features; the second dimension sets the input size.
        y_train: Binary training labels.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``fit``.

    Returns:
        A ``(model, history)`` tuple: the trained Keras model and the
        ``History`` object returned by ``fit``.
    """
    n_features = X_train.shape[1]

    # Stack the hidden layers in order: Dense -> Dropout for each width.
    network_input = Input(shape=(n_features,))
    hidden = network_input
    for units in (128, 64):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(0.3)(hidden)
    prediction = Dense(1, activation='sigmoid', name='output')(hidden)  # Single output

    model = Model(inputs=network_input, outputs=prediction)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Stop once validation loss has not improved for 5 epochs and roll back
    # to the best weights seen.
    stopper = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    fit_options = {
        'epochs': epochs,
        'batch_size': batch_size,
        'validation_split': 0.2,  # 20% of the training data is held out for validation
        'callbacks': [stopper],
        'verbose': 1,
    }
    history = model.fit(X_train, y_train, **fit_options)

    return model, history

# Predict failures and save model
def predict_pod_failures(pod_data, feature_columns, target_columns, forecast_time_steps=1):
    """Train the failure classifier, persist it, evaluate it and forecast per pod.

    Steps: derive the combined 'Any_Failure' label, hold out 10% of the rows
    as a test set, standardise the features, train the Keras model, save the
    model to 'model.h5' and the scaler to 'scaler.pkl', evaluate on the test
    set, then predict a failure flag for up to 20 pods from their forecast
    feature values.

    Args:
        pod_data: Dataset with the feature columns, the target columns and a
            'Pod Name' column. An 'Any_Failure' column is added to it in place.
        feature_columns: Names of the numeric feature columns.
        target_columns: Names of the individual failure-indicator columns.
        forecast_time_steps: Accepted for interface compatibility; not used
            by the current implementation.

    Returns:
        A tuple ``(evaluation_results, future_predictions, scaler, history)``:
        ``evaluation_results`` holds the test-set loss and accuracy under the
        keys 'val_loss' and 'val_acc'; ``future_predictions`` maps pod name to
        ``{'Any_Failure': 0 or 1}``; ``scaler`` is the fitted StandardScaler;
        ``history`` is the Keras training history.
    """
    print("Preparing data...")
    pod_data = create_combined_target(pod_data, target_columns)  # Add combined target
    X = pod_data[feature_columns]
    y = pod_data['Any_Failure']

    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # Fit the scaler on the training rows only. Fitting it on the full dataset
    # would leak test-set statistics into training and into the saved scaler.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_columns)

    print("Training Keras model...")
    model, history = build_and_train_model(X_train, y_train)

    # Save the model and scaler
    model.save("model.h5")
    with open("scaler.pkl", "wb") as f:
        pickle.dump(scaler, f)
    print("Model saved as 'model.h5' and scaler saved as 'scaler.pkl'")

    # Evaluate on the held-out test data (key names kept for existing callers)
    evaluation = model.evaluate(X_test, y_test, verbose=0)
    evaluation_results = {'val_loss': evaluation[0], 'val_acc': evaluation[1]}

    # Forecast for a sample of pods. Missing pod names are skipped because
    # they never match any row and therefore have nothing to forecast from.
    sample_pods = pod_data['Pod Name'].dropna().unique()[:20]
    future_predictions = {}

    print("Forecasting future failures...")
    if len(sample_pods) > 0:
        forecast_rows = [
            forecast_features(pod_data[pod_data['Pod Name'] == pod_name], feature_columns)
            for pod_name in sample_pods
        ]
        future_features = pd.DataFrame(forecast_rows, columns=feature_columns)
        future_features_scaled = scaler.transform(future_features)

        # One batched predict call instead of one call per pod.
        probabilities = model.predict(future_features_scaled, verbose=0).ravel()
        future_predictions = {
            pod_name: {'Any_Failure': int(probability > 0.5)}
            for pod_name, probability in zip(sample_pods, probabilities)
        }

    return evaluation_results, future_predictions, scaler, history

# Plot feature trends
def plot_feature_trends(pod_data, pod_name, feature_columns, max_features=3):
    """Plot the history of the first few features of one pod and save it as PNG.

    One subplot per feature is drawn, stacked vertically, and the figure is
    written to ``trends_<pod_name>.png`` in the current directory.

    Args:
        pod_data: Dataset containing a 'Pod Name' column and the features.
        pod_name: Name of the pod whose rows should be plotted.
        feature_columns: Candidate feature names; only the leading
            ``max_features`` of them are plotted.
        max_features: Maximum number of features to plot.

    Returns:
        None. A message is printed when there is nothing to plot.
    """
    pod_df = pod_data[pod_data['Pod Name'] == pod_name]
    if pod_df.empty:
        print(f"No data found for pod {pod_name}")
        return None

    features_to_plot = list(feature_columns[:max_features])
    if not features_to_plot:
        # plt.subplots rejects a grid with zero rows, so bail out early.
        print(f"No features to plot for pod {pod_name}")
        return None

    # squeeze=False always yields a 2-D array of axes, so the single-feature
    # case needs no special handling.
    fig, axs = plt.subplots(
        len(features_to_plot), 1,
        figsize=(10, 3 * len(features_to_plot)),
        squeeze=False,
    )

    for ax, feature in zip(axs[:, 0], features_to_plot):
        values = pod_df[feature].values
        ax.plot(range(len(values)), values, 'b-', label='Historical')
        ax.set_title(f'{feature} for {pod_name}')
        ax.legend()

    fig.tight_layout()
    output_path = f"trends_{pod_name}.png"
    fig.savefig(output_path)
    plt.close(fig)  # release this figure explicitly
    print(f"Saved plot to {output_path}")

# Plot training history
def plot_training_history(history):
    """Save the training/validation loss and accuracy curves as PNG files.

    Writes 'training_history_loss.png' and 'training_history_accuracy.png'
    to the current directory.

    Args:
        history: Object whose ``history`` dict holds the per-epoch series
            'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
    """
    # (training key, validation key, metric label, output file)
    panels = (
        ('loss', 'val_loss', 'Loss', 'training_history_loss.png'),
        ('accuracy', 'val_accuracy', 'Accuracy', 'training_history_accuracy.png'),
    )

    for train_key, val_key, metric, filename in panels:
        plt.figure(figsize=(10, 5))
        plt.plot(history.history[train_key], label=f'Training {metric}')
        plt.plot(history.history[val_key], label=f'Validation {metric}')
        plt.title(f'Model {metric} During Training')
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.legend()
        plt.savefig(filename)
        plt.close()

    print("Saved training history plots")

# Main function
def main(file_path):
    """Run the full pipeline: load, train, report, and plot.

    Args:
        file_path: Path of the CSV dataset to process.
    """
    print("Starting execution...")
    pod_data = load_data(file_path)
    feature_columns, target_columns = get_feature_target_columns(pod_data)
    n_rows = len(pod_data)
    n_features = len(feature_columns)
    n_targets = len(target_columns)
    print(f"Dataset: {n_rows} rows, {n_features} features, {n_targets} targets")

    results = predict_pod_failures(pod_data, feature_columns, target_columns)
    evaluation_results, future_predictions, scaler, history = results

    # Report the metrics returned by predict_pod_failures.
    print("\nModel Evaluation Results:")
    print(f"Validation Loss: {evaluation_results['val_loss']:.4f}")
    print(f"Validation Accuracy: {evaluation_results['val_acc']:.4f}")

    print("\nFuture Predictions:")
    for pod_name, predictions in future_predictions.items():
        print(f"\n{pod_name}:")
        verdict = 'Failure' if predictions['Any_Failure'] == 1 else 'No Failure'
        print(f"  Any_Failure: {verdict}")

    # Use the pod of the first row as the example for the trend plot.
    first_pod = pod_data['Pod Name'].iloc[0]
    plot_feature_trends(pod_data, first_pod, feature_columns)
    plot_training_history(history)

if __name__ == "__main__":
    # Location of the dataset inside the Kaggle environment; point this at
    # your own copy of the CSV when running elsewhere.
    file_path = "/kaggle/input/fulldata/Transformed_Dataset/transformed_dataset.csv"
    main(file_path=file_path)

Starting execution...
Columns: ['Pod Status', 'Pod Event Type', 'Pod Event Reason', 'CPU Usage (%)', 'Memory Usage (%)', 'Pod Restarts', 'Network Receive Packets Dropped (p/s)', 'Network Transmit Packets Dropped (p/s)', 'FS Reads Total (MB)', 'FS Writes Total (MB)', 'Pod Name', 'Node Name', 'Ready Containers', 'Total Containers', 'Pod Event Age', 'Pod Event Source', 'Event Age', 'Event Source', 'Hour', 'Day', 'Month', 'Weekday', 'Pod_Status_Failure', 'Pod_Event_Failure', 'Pod_Event_Reason_Failure', 'Resource_Failure', 'Pod_Restart_Failure', 'Network_Failure', 'Disk_Failure']
Dataset: 100000 rows, 20 features, 7 targets
Preparing data...
Splitting data...
Training Keras model...
Epoch 1/50
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0137 - val_accuracy: 1.0000 - val_loss: 7.1250e-07
Epoch 2/50
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 1.0000 - loss: 4.7331e-06 - val_accuracy: 1.0