In [1]:
! pip install flaml[automl] matplotlib openml

Collecting openml
  Downloading openml-0.15.0-py3-none-any.whl.metadata (9.9 kB)
Collecting flaml[automl]
  Downloading FLAML-2.3.3-py3-none-any.whl.metadata (16 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Downloading openml-0.15.0-py3-none-any.whl (157 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.0/158.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading FLAML-2.3.3-py3-none-any.whl (314 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.2/314.2 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB

In [2]:
import os
import numpy as np
import pandas as pd
from flaml import AutoML
from sklearn.model_selection import train_test_split
import joblib
from sklearn.metrics import accuracy_score

# Read the node inventory and the maintenance log from the Kaggle input dataset
nodes = pd.read_csv("/kaggle/input/public-sector-network/nodes.csv")
maintenance = pd.read_csv("/kaggle/input/public-sector-network/maintenance.csv")

# Join the maintenance records to their node attributes via the shared Node_ID key
data = maintenance.merge(nodes, on="Node_ID")

# Persist the joined table (without the index) to the /kaggle/working/ directory
merged_data_path = '/kaggle/working/maintenance_merged_data.csv'
data.to_csv(merged_data_path, index=False)
print(f"Merged data saved to {merged_data_path}")

# Preprocess the data
# Drop the log/technician identifiers and the coordinates; they are not used as features
data = data.drop(columns=['Log_ID', 'Technician_ID', 'Latitude', 'Longitude'])

# Map 'Type' to numerical values
# NOTE: Series.map turns any value missing from the mapping into NaN.
type_mapping = {
    'Government Office': 1,
    'Health Center': 2,
    'School': 3
}
data['Type'] = data['Type'].map(type_mapping)

# Map 'Connectivity_Status' to numerical values
connectivity_mapping = {
    'Connected': 1,
    'Unconnected': 2
}
data['Connectivity_Status'] = data['Connectivity_Status'].map(connectivity_mapping)

# Map 'Existing_Infrastructure' to numerical values
infrastructure_mapping = {
    'Yes': 1,
    'No': 2
}
data['Existing_Infrastructure'] = data['Existing_Infrastructure'].map(infrastructure_mapping)

# Convert Node_ID to numeric (extract the number from 'Node_1', 'Node_2', etc.).
# The pattern is a raw string so that `\d` is not parsed as an (invalid) string
# escape, and expand=False makes str.extract return a Series instead of a
# one-column DataFrame before it is assigned back to the column.
data['Node_ID'] = data['Node_ID'].str.extract(r'(\d+)', expand=False).astype(int)

# One-hot encode remaining categorical columns (e.g., 'Region').
# drop_first=True omits the first Region level, which becomes the implicit baseline.
data = pd.get_dummies(data, columns=['Region'], drop_first=True)

# Separate the label column from the feature columns
y = data['Issue_Type']  # Target variable
X = data.drop(columns=['Issue_Type'])  # Features

# Hold out 20% of the rows as a test set (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# AutoML goal and constraints, passed to fit() as keyword arguments
automl_settings = dict(
    time_budget=3600,  # 1 hour
    metric="accuracy",
    task="classification",
    log_file_name="maintenance.log",
)

# Run the AutoML search on the training split only
automl = AutoML()
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

# Persist the fitted AutoML object so it can be re-loaded for inference
model_path = '/kaggle/working/maintenance_model.pkl'
joblib.dump(automl, model_path)

# Report the estimator selected by the search
print("Best model:", automl.model.estimator)

# Score the fitted model on the held-out test split
y_pred = automl.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on Test Set: {accuracy * 100:.2f}%")

def predict_issue_occurrence(input_data):
    """Predict with the model stored at the module-level ``model_path``.

    The model file is re-loaded from disk on every call.

    Args:
        input_data: Feature rows handed unchanged to the loaded model's
            ``predict`` method.

    Returns:
        The value returned by the loaded model's ``predict`` for ``input_data``.
    """
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    return joblib.load(model_path).predict(input_data)

# Pre-set input values
node_id = "Node_1"  # Example Node ID
Node_ID = int(node_id.split('_')[1])  # Extract numeric part
Type = 1  # Government Office
Region = "Russia"  # Example Region
Population_Served = 5000  # Example Population Served
Connectivity_Status = 1  # Connected
Existing_Infrastructure = 1  # Yes
Resolution_Time_Hours = 4  # Example Resolution Time

# Create a DataFrame from the pre-set input data
input_data = pd.DataFrame({
    'Node_ID': [Node_ID],
    'Type': [Type],
    'Region': [Region],
    'Population_Served': [Population_Served],
    'Connectivity_Status': [Connectivity_Status],
    'Existing_Infrastructure': [Existing_Infrastructure],
    'Resolution_Time_Hours': [Resolution_Time_Hours]  # Include resolution time
})

# One-hot encode Region WITHOUT drop_first. With a single input row there is only
# one Region level, so drop_first=True would discard that level's column and the
# Region value would never reach the model (every Region_* feature would be 0).
input_data = pd.get_dummies(input_data, columns=['Region'])

# Align with the training feature layout: columns the model was not trained on
# (including the baseline Region level dropped during training) are removed, and
# training columns absent from this row are filled with 0.
input_data = input_data.reindex(columns=X_train.columns, fill_value=0)

# Predict the issue occurrence
prediction = predict_issue_occurrence(input_data)
print(f"Predicted Issue Type: {prediction[0]}")

Merged data saved to /kaggle/working/maintenance_merged_data.csv
[flaml.automl.logger: 01-25 00:43:27] {1728} INFO - task = classification
[flaml.automl.logger: 01-25 00:43:27] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 01-25 00:43:27] {1838} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 01-25 00:43:27] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost', 'lrl1']
[flaml.automl.logger: 01-25 00:43:27] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 01-25 00:43:27] {2393} INFO - Estimated sufficient time budget=1327s. Estimated necessary time budget=33s.
[flaml.automl.logger: 01-25 00:43:27] {2442} INFO -  at 0.4s,	estimator lgbm's best error=0.5875,	best estimator lgbm's best error=0.5875
[flaml.automl.logger: 01-25 00:43:27] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 01-25 00:43:27] {2442} INFO -  at 0.5s,	estimator lgbm's best erro