In [1]:
! pip install flaml[automl] matplotlib openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting flaml[automl]
  Downloading FLAML-2.3.3-py3-none-any.whl.metadata (16 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading FLAML-2.3.3-py3-none-any.whl (314 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.2/314.2 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[

In [2]:
import pandas as pd
import numpy as np
from flaml import AutoML
from sklearn.model_selection import train_test_split
import joblib
from joblib import Parallel, delayed
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Step 1: Load and Preprocess the Data
traffic = pd.read_csv("/kaggle/input/public-sector-network/traffic.csv")
nodes = pd.read_csv("/kaggle/input/public-sector-network/nodes.csv")

# Join each traffic record with its node's attributes, derive calendar features
# from the combined Date/Time strings, then keep only the modelling columns
# (which drops the helper DateTime column again).
data = (
    traffic.merge(nodes, on="Node_ID")
    .assign(DateTime=lambda frame: pd.to_datetime(frame['Date'] + ' ' + frame['Time']))
    .assign(
        Year=lambda frame: frame['DateTime'].dt.year,
        Month=lambda frame: frame['DateTime'].dt.month,
        Day=lambda frame: frame['DateTime'].dt.day,
        Hour=lambda frame: frame['DateTime'].dt.hour,
    )
    .loc[:, [
        'Node_ID', 'Year', 'Month', 'Day', 'Hour',
        'Data_Usage_GB', 'Peak_Usage_GB', 'Downtime_Events',
        'Type', 'Region', 'Population_Served',
        'Connectivity_Status', 'Existing_Infrastructure',
    ]]
)

# One-hot encode the categorical columns; the first level of each is dropped
# and therefore acts as the implicit baseline.
data = pd.get_dummies(
    data,
    columns=['Type', 'Region', 'Connectivity_Status', 'Existing_Infrastructure'],
    drop_first=True,
)

# Step 2: Prepare Data for Regression
targets = ['Data_Usage_GB', 'Peak_Usage_GB', 'Downtime_Events']
X = data.drop(columns=targets)
y_data_usage = data['Data_Usage_GB']
y_peak_usage = data['Peak_Usage_GB']
y_downtime_events = data['Downtime_Events']

# Split into train and test sets.
# A single call partitions the features and all three targets with the same
# shuffle, so X_train/X_test are guaranteed to line up with every target.
# (Three separate calls only stay aligned while each one repeats exactly the
# same test_size and random_state.)
(
    X_train, X_test,
    y_train_data, y_test_data,
    y_train_peak, y_test_peak,
    y_train_downtime, y_test_downtime,
) = train_test_split(
    X, y_data_usage, y_peak_usage, y_downtime_events,
    test_size=0.2, random_state=42,
)

# Step 3: Train Regression Models Using FLAML
# Search configuration shared by all three targets.
settings = dict(
    time_budget=60,  # 1 minute
    metric="rmse",
    task="regression",
    seed=42,
)

# One independent AutoML search per target.
automl_data_usage, automl_peak_usage, automl_downtime_events = AutoML(), AutoML(), AutoML()

print("Training Data Usage Regression Model...")
automl_data_usage.fit(X_train=X_train, y_train=y_train_data, **settings)

print("Training Peak Usage Regression Model...")
automl_peak_usage.fit(X_train=X_train, y_train=y_train_peak, **settings)

print("Training Downtime Events Regression Model...")
automl_downtime_events.fit(X_train=X_train, y_train=y_train_downtime, **settings)

# Persist the fitted AutoML objects; the prediction helper reloads them from these paths.
joblib.dump(
    automl_data_usage,
    '/kaggle/working/data_usage_regression_model.pkl',
)
joblib.dump(
    automl_peak_usage,
    '/kaggle/working/peak_usage_regression_model.pkl',
)
joblib.dump(
    automl_downtime_events,
    '/kaggle/working/downtime_events_regression_model.pkl',
)
print("Models saved to disk.")

# Step 4: Evaluate Models on Test Set
def evaluate_model(model, X_test, y_test, model_name):
    """Print the RMSE and MAE of ``model`` on a held-out set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature rows to score.
        y_test: True target values matching ``X_test``.
        model_name: Label used as the prefix of the printed line.

    Returns:
        None. The metrics are printed, not returned.
    """
    predicted = model.predict(X_test)
    mae = mean_absolute_error(y_test, predicted)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, predicted))
    print(f"{model_name} - RMSE: {rmse}, MAE: {mae}")

# Score every fitted model against its own held-out target, in training order.
_evaluation_plan = [
    ("\nEvaluating Data Usage Regression Model...", automl_data_usage, y_test_data, "Data Usage Regression Model"),
    ("\nEvaluating Peak Usage Regression Model...", automl_peak_usage, y_test_peak, "Peak Usage Regression Model"),
    ("\nEvaluating Downtime Events Regression Model...", automl_downtime_events, y_test_downtime, "Downtime Events Regression Model"),
]
for _banner, _fitted_model, _y_true, _label in _evaluation_plan:
    print(_banner)
    evaluate_model(_fitted_model, X_test, _y_true, _label)

# Step 5: Optimized Prediction Function (Hourly Predictions for a Single Date and Time Range)
def predict_for_node(node_data, date, start_hour, end_hour, data_usage_model, peak_usage_model, downtime_events_model, X_columns):
    """
    Predict for a single node for a specific date and time range.

    Args:
        node_data: Mapping of one node's attributes. Must contain 'Node_ID' and
            the categorical keys 'Type', 'Region', 'Connectivity_Status' and
            'Existing_Infrastructure'.
        date: Object exposing ``year``, ``month``, ``day`` and ``strftime``.
        start_hour: First hour to predict (inclusive).
        end_hour: Last hour to predict (inclusive).
        data_usage_model: Fitted model with ``predict`` for Data_Usage_GB.
        peak_usage_model: Fitted model with ``predict`` for Peak_Usage_GB.
        downtime_events_model: Fitted model with ``predict`` for Downtime_Events.
        X_columns: Feature columns, in order, that the models were trained on.

    Returns:
        A list with one dict per hour holding 'Date', 'Time', 'Node_ID',
        'Data_Usage_GB', 'Peak_Usage_GB' and 'Downtime_Events' (rounded to int).
        Empty when ``start_hour > end_hour``.
    """
    hours = list(range(start_hour, end_hour + 1))
    if not hours:
        return []

    # Build every hourly row up front so each model is queried once, not once per hour.
    input_df = pd.DataFrame([
        {
            'Year': date.year,
            'Month': date.month,
            'Day': date.day,
            'Hour': hour,
            **node_data
        }
        for hour in hours
    ])

    # One-hot encode WITHOUT drop_first: this frame holds a single node, so every
    # categorical column has exactly one level and drop_first=True would discard
    # it, silently scoring every node as the baseline category.
    input_df = pd.get_dummies(
        input_df,
        columns=['Type', 'Region', 'Connectivity_Status', 'Existing_Infrastructure'],
    )
    # Align to the training layout: training columns absent here become 0, and
    # dummies the training frame never had (its dropped baseline levels) are discarded.
    input_df = input_df.reindex(columns=list(X_columns), fill_value=0)

    # Predict
    data_usage_preds = data_usage_model.predict(input_df)
    peak_usage_preds = peak_usage_model.predict(input_df)
    downtime_events_preds = downtime_events_model.predict(input_df)

    date_label = date.strftime('%Y-%m-%d')
    predictions = []
    for position, hour in enumerate(hours):
        predictions.append({
            'Date': date_label,
            'Time': f"{hour:02d}:00:00",
            'Node_ID': node_data['Node_ID'],
            'Data_Usage_GB': data_usage_preds[position],
            'Peak_Usage_GB': peak_usage_preds[position],
            'Downtime_Events': int(round(downtime_events_preds[position]))  # Round to integer
        })
    return predictions

def predict_data_peak_downtime_for_all_nodes(date, start_time, end_time):
    """
    Predict for all nodes in parallel for a specific date and time range.

    Args:
        date: Date to predict for; forwarded unchanged to ``predict_for_node``.
        start_time: Clock string; only the part before the first ':' is used,
            as the first hour.
        end_time: Clock string; only the part before the first ':' is used,
            as the last hour (inclusive).

    Returns:
        A DataFrame with one row per node per hour.

    Relies on the module-level ``nodes`` frame and the training feature frame ``X``.
    """
    # Reload the three persisted models a single time and share them across all nodes.
    usage_model = joblib.load('/kaggle/working/data_usage_regression_model.pkl')
    peak_model = joblib.load('/kaggle/working/peak_usage_regression_model.pkl')
    downtime_model = joblib.load('/kaggle/working/downtime_events_regression_model.pkl')

    # Minutes and seconds are ignored: only the leading hour field is kept.
    first_hour = int(start_time.split(':')[0])
    last_hour = int(end_time.split(':')[0])

    # One plain dict of attributes per node row.
    node_records = nodes.to_dict('records')

    # Fan the per-node work out over all available cores.
    per_node_task = delayed(predict_for_node)
    per_node_rows = Parallel(n_jobs=-1)(
        per_node_task(record, date, first_hour, last_hour, usage_model, peak_model, downtime_model, X.columns)
        for record in node_records
    )

    # Concatenate the per-node lists into one flat list of row dicts.
    flat_rows = []
    for rows in per_node_rows:
        flat_rows.extend(rows)
    return pd.DataFrame(flat_rows)

# Step 6: Run the Chatbot-Like Interface with Predefined Values
if __name__ == "__main__":
    # Fixed example inputs standing in for user input: a date plus a start/end clock time.
    date, start_time, end_time = pd.to_datetime("2025-10-15"), "11:30:00", "12:30:00"

    # Predict every node over the requested window and show the combined table.
    predictions_df = predict_data_peak_downtime_for_all_nodes(
        date=date,
        start_time=start_time,
        end_time=end_time,
    )
    print("\nPrediction Results for All Nodes:")
    print(predictions_df)

Training Data Usage Regression Model...
[flaml.automl.logger: 01-25 12:35:47] {1728} INFO - task = regression
[flaml.automl.logger: 01-25 12:35:47] {1739} INFO - Evaluation method: holdout
[flaml.automl.logger: 01-25 12:35:47] {1838} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 01-25 12:35:47] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost']
[flaml.automl.logger: 01-25 12:35:47] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 01-25 12:35:47] {2393} INFO - Estimated sufficient time budget=1079s. Estimated necessary time budget=9s.
[flaml.automl.logger: 01-25 12:35:47] {2442} INFO -  at 0.6s,	estimator lgbm's best error=43.6106,	best estimator lgbm's best error=43.6106
[flaml.automl.logger: 01-25 12:35:47] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 01-25 12:35:48] {2442} INFO -  at 0.6s,	estimator lgbm's best error=43.5519,	best estimator lgbm's best