In [1]:
! pip install flaml[automl] matplotlib openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting flaml[automl]
  Downloading FLAML-2.3.3-py3-none-any.whl.metadata (16 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading FLAML-2.3.3-py3-none-any.whl (314 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.2/314.2 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[

In [2]:
import os
import numpy as np
import pandas as pd
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib

# Step 1: Read the two raw tables from the Kaggle input directory
procurement = pd.read_csv("/kaggle/input/public-sector-network/procurement.csv")
nodes = pd.read_csv("/kaggle/input/public-sector-network/nodes.csv")

# Step 2: Join procurement rows to their node attributes on the shared Node_ID key.
# No `how` is given, so this is pandas' default inner join.
data = procurement.merge(nodes, on="Node_ID")

# Step 3: Persist the joined table under '/kaggle/working/' as procurement_merged_data.csv
merged_data_path = '/kaggle/working/procurement_merged_data.csv'
data.to_csv(merged_data_path, index=False)

# Step 4: Read the joined table back from disk, failing loudly if the file is missing
if not os.path.exists(merged_data_path):
    raise FileNotFoundError(f"Merged dataset not found at {merged_data_path}.")
data = pd.read_csv(merged_data_path)

# Step 5: Preprocess the data
# Columns removed before modelling
columns_to_drop = [
    'Deployment_ID', 'Latitude', 'Longitude', 'Vendor_Details',
    'Connectivity_Status', 'Existing_Infrastructure', 'Quantity',
]

# Integer code assigned to each organisation type; Series.map turns any label that is
# not a key of this dict into NaN.
type_mapping = {
    'Government Office': 1,
    'Health Center': 2,
    'School': 3
}

# Drop the unused columns, encode 'Type' as an integer, then one-hot encode 'Region' and
# 'Equipment_Used' (drop_first=True omits the first category of each as the baseline).
data = (
    data
    .drop(columns=columns_to_drop)
    .assign(Type=lambda frame: frame['Type'].map(type_mapping))
    .pipe(pd.get_dummies, columns=['Region', 'Equipment_Used'], drop_first=True)
)

# Step 6: Define the target variables and features
y_cost = data['Cost_USD']  # Target variable for cost prediction
y_time = data['Time_Taken_Days']  # Target variable for time prediction
# Every non-target column is used as a feature.
# NOTE(review): Node_ID (the merge key) is not among the dropped columns, so it is still a
# feature here — confirm that an identifier is intended as a model input.
X = data.drop(columns=['Cost_USD', 'Time_Taken_Days'])

# Step 7: Split into train and test sets
if X.shape[0] == 0:  # Ensure the dataset is not empty
    raise ValueError("The dataset is empty after preprocessing. Check the data loading and preprocessing steps.")

# Split the features and BOTH targets in a single call. All arrays are partitioned with the
# same row indices, so the cost and time targets are guaranteed to line up with
# X_train / X_test instead of relying on two separate calls repeating the same seed.
(
    X_train, X_test,
    y_cost_train, y_cost_test,
    y_time_train, y_time_test,
) = train_test_split(X, y_cost, y_time, test_size=0.2, random_state=42)

# Step 8: Train Regression Models Using FLAML

# Search settings common to both regressors; only the log file differs per target.
shared_settings = {
    "time_budget": 3600,  # 1 hour per model
    "metric": "rmse",  # Use RMSE for regression
    "task": "regression",
    "seed": 42,
}

# Model for Cost Prediction
settings_cost = {**shared_settings, "log_file_name": "cost_prediction.log"}
automl_cost = AutoML()
automl_cost.fit(X_train=X_train, y_train=y_cost_train, **settings_cost)

# Model for Time Prediction
settings_time = {**shared_settings, "log_file_name": "time_prediction.log"}
automl_time = AutoML()
automl_time.fit(X_train=X_train, y_train=y_time_train, **settings_time)

# Step 9: Evaluate Model Performance on Test Data

def _rmse_and_mae(y_true, y_pred):
    """Return (RMSE, MAE) of the predictions against the true values."""
    return (
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
    )

# Predictions of each fitted model on the held-out rows
y_cost_pred = automl_cost.predict(X_test)
y_time_pred = automl_time.predict(X_test)

# Error metrics per target
rmse_cost, mae_cost = _rmse_and_mae(y_cost_test, y_cost_pred)
rmse_time, mae_time = _rmse_and_mae(y_time_test, y_time_pred)

# Report: cost model first, then time model
print("Cost Prediction Model Evaluation:")
print(f"RMSE: {rmse_cost:.2f}")
print(f"MAE: {mae_cost:.2f}\n")

print("Time Prediction Model Evaluation:")
print(f"RMSE: {rmse_time:.2f}")
print(f"MAE: {mae_time:.2f}\n")

# Step 10: Save the Models
# Each fitted AutoML object is serialised to its own file under '/kaggle/working/'.
for fitted_model, model_path in (
    (automl_cost, '/kaggle/working/cost_prediction_model.pkl'),
    (automl_time, '/kaggle/working/time_prediction_model.pkl'),
):
    joblib.dump(fitted_model, model_path)

# Step 11: Create a Function for User Input

def predict_cost_and_time(input_data):
    """
    Predict cost and time for a single record of user input.

    Parameters
    ----------
    input_data : dict
        Feature values for one record. Keys that are training columns
        (``X.columns``) are used as given. For any other key, if
        ``"<key>_<value>"`` is a training column (the naming produced by
        ``pd.get_dummies``, e.g. ``{"Region": "Russia"}`` -> ``"Region_Russia"``),
        that indicator column is set to 1 unless the caller already supplied it.
        Remaining unknown keys are ignored, and training columns that are still
        missing are filled with 0. A raw value with no matching indicator column
        (the dropped baseline category, or a label not seen in training) therefore
        leaves all indicators of that group at 0.

    Returns
    -------
    tuple
        ``(cost, time)``: the first element of each model's prediction.
    """
    # Load the saved models (files written by this notebook's save step)
    cost_model = joblib.load('/kaggle/working/cost_prediction_model.pkl')
    time_model = joblib.load('/kaggle/working/time_prediction_model.pkl')

    training_columns = X.columns
    features = dict(input_data)  # work on a copy so the caller's dict is not modified

    # Expand raw categorical values into their one-hot indicator columns. Without this a
    # raw entry such as 'Region': 'Russia' would be discarded by the column alignment
    # below and every Region_* indicator would silently be 0.
    for key, value in input_data.items():
        if key in training_columns:
            continue
        indicator = f"{key}_{value}"
        if indicator in training_columns:
            features.setdefault(indicator, 1)

    # Align to the training layout: same columns, same order, absent columns filled with 0
    input_df = pd.DataFrame([features]).reindex(columns=training_columns, fill_value=0)

    # Predict
    cost_pred = cost_model.predict(input_df)
    time_pred = time_model.predict(input_df)

    return cost_pred[0], time_pred[0]

# Step 12: Predefined Input Values
# Set predefined input values
valid_equipment = ['cable', 'switch', 'antenna', 'router']  # Valid equipment types

predefined_input = {
    'Node_ID': 'Node_123',
    'Population_Served': 5000,
    'Region': 'Russia',  # Raw label (also used in the report); its one-hot columns are set below
    'Type': 1,  # Government Office
    # NOTE(review): 'Quantity' is dropped from the features during preprocessing, so this
    # value is discarded when the input is aligned to the training columns.
    'Quantity': 195
}

# Validate and set the equipment
equipment = 'switch'  # Predefined equipment
if equipment not in valid_equipment:
    raise ValueError(f"Invalid equipment. Please enter one of: {valid_equipment}")

# Set the one-hot encoded columns for the region. The training features contain only
# Region_* indicator columns, so the raw 'Region' label by itself would not reach the model.
# If no Region_* column matches (baseline category or unseen label) all indicators stay 0.
region_columns = [col for col in X.columns if col.startswith("Region_")]
for col in region_columns:
    predefined_input[col] = 1 if col == f"Region_{predefined_input['Region']}" else 0

# Set the one-hot encoded columns for the equipment in the same way
equipment_columns = [col for col in X.columns if col.startswith("Equipment_Used_")]
for col in equipment_columns:
    predefined_input[col] = 1 if col == f"Equipment_Used_{equipment}" else 0

# Predict using predefined input
cost_pred, time_pred = predict_cost_and_time(predefined_input)

# Display results (the organization type label is hard-coded to match 'Type': 1 above)
print(f"\nRecommended Deployment Plan for Node_ID: {predefined_input['Node_ID']}, Region: {predefined_input['Region'].capitalize()}, Organization Type: Government Office:")
print(f"Equipment: {equipment.capitalize()}")
print(f"Predicted Cost (USD): {cost_pred:.2f}")
print(f"Predicted Time (Days): {time_pred:.2f}")

[flaml.automl.logger: 01-25 12:30:31] {1728} INFO - task = regression
[flaml.automl.logger: 01-25 12:30:31] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 01-25 12:30:31] {1838} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 01-25 12:30:31] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost']
[flaml.automl.logger: 01-25 12:30:31] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 01-25 12:30:31] {2393} INFO - Estimated sufficient time budget=1085s. Estimated necessary time budget=9s.
[flaml.automl.logger: 01-25 12:30:31] {2442} INFO -  at 0.4s,	estimator lgbm's best error=41824.7780,	best estimator lgbm's best error=41824.7780
[flaml.automl.logger: 01-25 12:30:31] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 01-25 12:30:31] {2442} INFO -  at 0.5s,	estimator lgbm's best error=41793.2021,	best estimator lgbm's best error=41793.2021
[flaml.automl.logg