In [1]:
! pip install flaml[automl] matplotlib openml

Collecting openml
  Downloading openml-0.15.0-py3-none-any.whl.metadata (9.9 kB)
Collecting flaml[automl]
  Downloading FLAML-2.3.3-py3-none-any.whl.metadata (16 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Downloading openml-0.15.0-py3-none-any.whl (157 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.0/158.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading FLAML-2.3.3-py3-none-any.whl (314 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.2/314.2 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB

In [2]:
import pandas as pd
import numpy as np
from flaml import AutoML
from sklearn.model_selection import train_test_split
import joblib
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load and Preprocess the Data

# Read the two raw tables: traffic records and node metadata
traffic = pd.read_csv("/kaggle/input/public-sector-network/traffic.csv")
nodes = pd.read_csv("/kaggle/input/public-sector-network/nodes.csv")

# Attach node attributes to each traffic record, matching on Node_ID
data = traffic.merge(nodes, on="Node_ID")

# Persist the merged table (before any feature engineering) to '/kaggle/working/'
merged_data_path = '/kaggle/working/traffic_merged_data.csv'
data.to_csv(merged_data_path, index=False)
print(f"Merged dataset saved to {merged_data_path}")

# Feature engineering
# Combine 'Date' and 'Time' into one timestamp, then split out its calendar parts
timestamps = pd.to_datetime(data['Date'] + ' ' + data['Time'])
data = data.assign(
    DateTime=timestamps,
    Year=timestamps.dt.year,
    Month=timestamps.dt.month,
    Day=timestamps.dt.day,
    Hour=timestamps.dt.hour,
)

# Keep only the modelling columns (identifier, calendar parts, targets, node attributes)
kept_columns = [
    'Node_ID', 'Year', 'Month', 'Day', 'Hour',
    'Data_Usage_GB', 'Peak_Usage_GB',
    'Type', 'Region', 'Population_Served',
    'Connectivity_Status', 'Existing_Infrastructure',
]
data = data[kept_columns]

# One-hot encode the categorical node attributes, dropping the first level of each
data = pd.get_dummies(
    data,
    columns=['Type', 'Region', 'Connectivity_Status', 'Existing_Infrastructure'],
    drop_first=True,
)

# Step 2: Prepare Data for Regression

# The two quantities to be predicted
targets = ['Data_Usage_GB', 'Peak_Usage_GB']

# Features are every remaining column; each target gets its own series
X = data.drop(columns=targets)
y_data_usage = data[targets[0]]
y_peak_usage = data[targets[1]]

# One 80-20 split shared by the features and both targets, so all of them
# are partitioned on exactly the same rows
(
    X_train, X_test,
    y_train_data, y_test_data,
    y_train_peak, y_test_peak,
) = train_test_split(X, y_data_usage, y_peak_usage, test_size=0.2, random_state=42)

# Step 3: Train Regression Models Using FLAML

# Data Usage model: search configuration for the FLAML run
settings = dict(
    time_budget=600,  # 10 minutes
    metric="rmse",  # Use RMSE for regression
    task="regression",
    log_file_name="data_usage_regression.log",
    seed=42,
)
automl_data_usage = AutoML()
print("\nTraining Data Usage Regression Model...")
automl_data_usage.fit(X_train=X_train, y_train=y_train_data, **settings)

# Peak Usage model: identical configuration, only the log file differs
settings = {**settings, "log_file_name": "peak_usage_regression.log"}
automl_peak_usage = AutoML()
print("\nTraining Peak Usage Regression Model...")
automl_peak_usage.fit(X_train=X_train, y_train=y_train_peak, **settings)

# Step 4: Evaluate the Models on the Test Set

def _rmse_and_r2(y_true, y_pred):
    """Return (RMSE, R²) for one set of held-out targets and predictions."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse, r2_score(y_true, y_pred)

# Score the Data Usage model on the held-out rows
y_pred_data = automl_data_usage.predict(X_test)
rmse_data, r2_data = _rmse_and_r2(y_test_data, y_pred_data)
print(f"\nData Usage Model - RMSE: {rmse_data}, R²: {r2_data}")

# Score the Peak Usage model on the same held-out rows
y_pred_peak = automl_peak_usage.predict(X_test)
rmse_peak, r2_peak = _rmse_and_r2(y_test_peak, y_pred_peak)
print(f"Peak Usage Model - RMSE: {rmse_peak}, R²: {r2_peak}")

# Step 5: Save the Models

# Serialize each fitted AutoML object to its own pickle under /kaggle/working/
for _fitted_model, _pickle_path in (
    (automl_data_usage, '/kaggle/working/data_usage_regression_model.pkl'),
    (automl_peak_usage, '/kaggle/working/peak_usage_regression_model.pkl'),
):
    joblib.dump(_fitted_model, _pickle_path)
print("\nModels saved to disk.")

# Step 6: Pre-set User Input Values

def get_predefined_input():
    """
    Return the hard-coded example request used to demonstrate prediction.

    Returns a 3-tuple ``(user_input, start_date, end_date)`` where
    ``user_input`` is a dict of node attributes and the two dates are
    'YYYY-MM-DD' strings delimiting the prediction window.
    """
    # Example prediction window: (start date, end date)
    example_window = ('2025-10-01', '2025-10-07')

    # Example node description; keys mirror the raw (pre-encoding) column names
    example_node = dict(
        Node_ID='Node_123',
        Population_Served=5000,
        Region='Russia',
        Type='School',
        Existing_Infrastructure='Yes',
        Connectivity_Status='Connected',
    )

    return (example_node, *example_window)

# Step 7: Integrate with the Prediction Function (Updated for Daily Predictions)

def predict_data_and_peak_usage(input_data, start_date, end_date,
                                data_usage_model=None, peak_usage_model=None,
                                feature_columns=None):
    """
    Predict data usage and peak usage based on user input for a date range.

    input_data: A dictionary containing the raw (un-encoded) input features.
    start_date: Start date for prediction (YYYY-MM-DD).
    end_date: End date for prediction (YYYY-MM-DD), inclusive.
    data_usage_model: Optional fitted model exposing ``predict``; when None the
        model saved at '/kaggle/working/data_usage_regression_model.pkl' is loaded.
    peak_usage_model: Optional fitted model exposing ``predict``; when None the
        model saved at '/kaggle/working/peak_usage_regression_model.pkl' is loaded.
    feature_columns: Optional ordered column labels the models were trained on;
        when None the columns of the notebook-level training frame ``X`` are used.

    Returns a DataFrame with one row per day and the columns
    'Date', 'Data_Usage_GB' and 'Peak_Usage_GB'. An empty date range yields an
    empty DataFrame with those columns.
    """
    # Fall back to the artifacts written by the training step
    if data_usage_model is None:
        data_usage_model = joblib.load('/kaggle/working/data_usage_regression_model.pkl')
    if peak_usage_model is None:
        peak_usage_model = joblib.load('/kaggle/working/peak_usage_regression_model.pkl')
    if feature_columns is None:
        feature_columns = X.columns

    output_columns = ['Date', 'Data_Usage_GB', 'Peak_Usage_GB']

    # Generate a date range from start_date to end_date
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    if len(date_range) == 0:
        # Nothing to predict (e.g. end_date precedes start_date)
        return pd.DataFrame(columns=output_columns)

    # Build one feature row per day in a single frame so each model predicts once
    input_df = pd.DataFrame([input_data] * len(date_range))
    input_df['Year'] = date_range.year.to_numpy()
    input_df['Month'] = date_range.month.to_numpy()
    input_df['Day'] = date_range.day.to_numpy()

    # Set Hour to 0 (placeholder: predictions are made per day, not per hour)
    input_df['Hour'] = 0

    # One-hot encode the categorical inputs WITHOUT drop_first. Every row carries a
    # single level per categorical, so drop_first=True would drop that only level
    # and the user's categorical choices would never reach the model.
    categorical_columns = ['Type', 'Region', 'Connectivity_Status', 'Existing_Infrastructure']
    present_categoricals = [col for col in categorical_columns if col in input_df.columns]
    if present_categoricals:
        input_df = pd.get_dummies(input_df, columns=present_categoricals, dtype=int)

    # Align to the training layout: dummy columns unknown to training (including the
    # baseline level dropped at training time) are discarded, training columns that
    # are absent are added as 0, and the training column order is restored.
    input_df = input_df.reindex(columns=feature_columns, fill_value=0)

    # Predict all days at once
    data_usage_pred = data_usage_model.predict(input_df)
    peak_usage_pred = peak_usage_model.predict(input_df)

    # Assemble the per-day results
    predictions_df = pd.DataFrame({
        'Date': list(date_range.strftime('%Y-%m-%d')),
        'Data_Usage_GB': data_usage_pred,
        'Peak_Usage_GB': peak_usage_pred,
    })

    return predictions_df

# Step 8: Run the Prediction with Pre-set Input

if __name__ == "__main__":
    # Fetch the canned example request: node attributes plus the date window
    user_input, start_date, end_date = get_predefined_input()

    # Produce one prediction row per day in the window
    predictions_df = predict_data_and_peak_usage(
        input_data=user_input,
        start_date=start_date,
        end_date=end_date,
    )

    # Show the heading followed by the prediction table
    print("\nPrediction Results:", predictions_df, sep="\n")

Merged dataset saved to /kaggle/working/traffic_merged_data.csv

Training Data Usage Regression Model...
[flaml.automl.logger: 01-25 00:52:37] {1728} INFO - task = regression
[flaml.automl.logger: 01-25 00:52:37] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 01-25 00:52:37] {1838} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 01-25 00:52:38] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost']
[flaml.automl.logger: 01-25 00:52:38] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 01-25 00:52:38] {2393} INFO - Estimated sufficient time budget=2203s. Estimated necessary time budget=19s.
[flaml.automl.logger: 01-25 00:52:38] {2442} INFO -  at 0.6s,	estimator lgbm's best error=43.4689,	best estimator lgbm's best error=43.4689
[flaml.automl.logger: 01-25 00:52:38] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 01-25 00:52:38] {2442} INFO -  at 0.8s,	es