In [1]:
! pip install flaml[automl] matplotlib openml

Collecting openml
  Downloading openml-0.15.0-py3-none-any.whl.metadata (9.9 kB)
Collecting flaml[automl]
  Downloading FLAML-2.3.3-py3-none-any.whl.metadata (16 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Downloading openml-0.15.0-py3-none-any.whl (157 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.0/158.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading FLAML-2.3.3-py3-none-any.whl (314 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.2/314.2 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB

In [2]:
import os
import numpy as np
import pandas as pd
from flaml import AutoML
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# Step 1: Load and Preprocess the Data

def load_and_preprocess_data(environment_filepath, nodes_filepath,
                             merged_data_path='/kaggle/working/environment_merged_data.csv'):
    """
    Load the two CSV files, merge them on 'Region', and build the feature
    matrix and encoded target used for training.

    Parameters
    ----------
    environment_filepath : str or path-like
        CSV whose 'Region_Name' column is renamed to 'Region' before merging.
    nodes_filepath : str or path-like
        CSV that is merged with the environment data on 'Region' (inner join).
    merged_data_path : str, path-like or None, optional
        Where the raw merged frame is written as CSV (before any columns are
        dropped or encoded). Pass None to skip writing. Defaults to the
        original Kaggle working-directory location.

    Returns
    -------
    X : pandas.DataFrame
        All remaining columns except 'Disaster_Risk_Level', with the four
        categorical columns replaced by their one-hot encoded columns.
    y : pandas.Series
        'Disaster_Risk_Level' encoded as integer labels.
    label_encoder : LabelEncoder
        Fitted on the target; needed to decode predictions.
    onehot_encoder : OneHotEncoder
        Fitted on the lower-cased categorical columns; needed to encode new
        inputs the same way.

    Notes
    -----
    The merged frame must contain 'Region_ID', 'Latitude', 'Longitude',
    'Type', 'Connectivity_Status', 'Existing_Infrastructure' and
    'Disaster_Risk_Level'; a missing column raises KeyError.
    """
    categorical_columns = ['Region', 'Type', 'Connectivity_Status', 'Existing_Infrastructure']

    # Load the datasets; align the join key name with the nodes file
    environment = pd.read_csv(environment_filepath).rename(columns={'Region_Name': 'Region'})
    nodes = pd.read_csv(nodes_filepath)

    # Merge datasets on 'Region' (default is inner join)
    data = pd.merge(environment, nodes, on="Region")

    # Save the merged data (still containing every column) for inspection
    if merged_data_path is not None:
        data.to_csv(merged_data_path, index=False)
        print(f"Merged data saved to {merged_data_path}")

    # Drop identifier/location columns that are not used as features
    data = data.drop(columns=['Region_ID', 'Latitude', 'Longitude'])

    # Normalise categorical text to lowercase so the encoder categories do
    # not depend on input casing. Callers must lower-case prediction inputs
    # the same way before using the returned encoder.
    for column in categorical_columns:
        data[column] = data[column].str.lower()

    # Encode the target column 'Disaster_Risk_Level' into numeric labels
    label_encoder = LabelEncoder()
    data['Disaster_Risk_Level'] = label_encoder.fit_transform(data['Disaster_Risk_Level'])

    # One-hot encode the categorical columns
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_features = onehot_encoder.fit_transform(data[categorical_columns])
    encoded_features_df = pd.DataFrame(
        encoded_features,
        columns=onehot_encoder.get_feature_names_out(categorical_columns),
        # Reuse the frame's index so the column-wise concat below aligns
        # row-for-row regardless of what index the merge produced.
        index=data.index,
    )

    # Combine encoded features with the rest of the dataset
    data = pd.concat([data.drop(columns=categorical_columns), encoded_features_df], axis=1)

    # Split into features (X) and target (y)
    X = data.drop(columns=['Disaster_Risk_Level'])
    y = data['Disaster_Risk_Level']

    return X, y, label_encoder, onehot_encoder

# Step 2: Train the AutoML Model

def train_automl_model(X_train, y_train, time_budget=3600,
                       log_file_name="environment_classification.log", seed=None):
    """
    Fit a FLAML AutoML classifier and report its accuracy on the training data.

    Parameters
    ----------
    X_train : features passed to ``AutoML.fit`` as ``X_train``.
    y_train : target labels passed to ``AutoML.fit`` as ``y_train``.
    time_budget : int, optional
        Search time budget in seconds. Defaults to 3600 (1 hour), the value
        this notebook originally hard-coded; lower it for quick iterations.
    log_file_name : str, optional
        File FLAML writes its search log to.
    seed : int or None, optional
        Forwarded to ``AutoML.fit`` as ``seed`` when given, so that repeated
        runs are comparable. None keeps FLAML's own default.

    Returns
    -------
    The fitted ``AutoML`` instance.
    """
    # Initialize AutoML
    automl = AutoML()

    # Specify AutoML settings
    automl_settings = {
        "time_budget": time_budget,
        "metric": "accuracy",  # Use accuracy for classification
        "task": "classification",
        "log_file_name": log_file_name,
    }
    if seed is not None:
        automl_settings["seed"] = seed

    # Train the model
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

    # NOTE: this score is computed on the same data the model was fitted on,
    # so it is an optimistic figure, not an estimate of held-out performance.
    accuracy = automl.score(X_train, y_train)
    print(f"Model training accuracy: {accuracy:.4f}")

    return automl

# Step 3: Save the Model, Label Encoder, and OneHotEncoder

def save_model_and_encoders(model, label_encoder, onehot_encoder, model_path, label_encoder_path, onehot_encoder_path):
    """
    Write the model and both encoders to disk with joblib.

    Each object is dumped to its matching path, in the order
    model, label encoder, one-hot encoder.
    """
    artifacts = (
        (model, model_path),
        (label_encoder, label_encoder_path),
        (onehot_encoder, onehot_encoder_path),
    )
    for artifact, destination in artifacts:
        joblib.dump(artifact, destination)

# Step 4: Pre-set User Input

def get_predefined_input():
    """
    Return the fixed sample record used to demonstrate a prediction.

    The result is a new dict on every call, holding one value per input
    feature (four categorical strings and four numeric values).
    """
    return dict(
        Region='Russia',
        Temperature_C=25.0,
        Humidity_Percent=60.0,
        Past_Disruptions=3,
        Population_Served=10000,
        Connectivity_Status='Connected',
        Existing_Infrastructure='Yes',
        Type='Health Center',
    )

# Step 5: Integrate with the Prediction Function

def predict_risk_level(input_data, model_path, label_encoder_path, onehot_encoder_path, X_columns):
    """
    Predict the disaster risk level for a single input record.

    Parameters
    ----------
    input_data : dict
        One record of input features. Must contain the keys 'Region', 'Type',
        'Connectivity_Status' and 'Existing_Infrastructure' (string values,
        any casing) plus whatever other feature columns the model was
        trained on.
    model_path, label_encoder_path, onehot_encoder_path : str or path-like
        joblib files holding the trained model, the target label encoder and
        the fitted one-hot encoder.
    X_columns : sequence of str
        Feature column names, in the order used at training time.

    Returns
    -------
    The decoded risk level for the record (first element of the label
    encoder's ``inverse_transform`` output).
    """
    # SECURITY: joblib.load unpickles arbitrary objects; only load artifacts
    # that this notebook (or another trusted source) produced.
    model = joblib.load(model_path)
    label_encoder = joblib.load(label_encoder_path)
    onehot_encoder = joblib.load(onehot_encoder_path)

    # Convert input data to a single-row DataFrame
    input_df = pd.DataFrame([input_data])

    categorical_columns = ['Region', 'Type', 'Connectivity_Status', 'Existing_Infrastructure']

    # The encoder is fitted on lower-cased text during preprocessing. Apply
    # the same normalisation here; otherwise values such as 'Russia' are
    # unknown categories and, with handle_unknown='ignore', silently encode
    # to all zeros, discarding every categorical feature.
    for column in categorical_columns:
        input_df[column] = input_df[column].str.lower()

    # One-hot encode the categorical columns with the fitted encoder
    encoded_features = onehot_encoder.transform(input_df[categorical_columns])
    encoded_features_df = pd.DataFrame(
        encoded_features,
        columns=onehot_encoder.get_feature_names_out(categorical_columns),
        index=input_df.index,
    )

    # Combine encoded features with the rest of the input data
    input_df = pd.concat([input_df.drop(columns=categorical_columns), encoded_features_df], axis=1)

    # Match the training layout: same columns, same order, with any column
    # absent from the input filled with 0.
    input_df = input_df.reindex(columns=X_columns, fill_value=0)

    # Predict the risk level and decode it back to the original label
    risk_level_encoded = model.predict(input_df)
    risk_level = label_encoder.inverse_transform(risk_level_encoded)

    return risk_level[0]

# Step 6: Run the End-to-End Pipeline (train, save, predict on the pre-set input)

if __name__ == "__main__":
    # Artifact locations, named once so the save and predict steps agree
    model_file = '/kaggle/working/environment_automl_model.pkl'
    label_encoder_file = '/kaggle/working/label_encoder.pkl'
    onehot_encoder_file = '/kaggle/working/onehot_encoder.pkl'

    # Build the training matrix and fitted encoders from the two CSV inputs.
    # (Result names are kept as-is: in a notebook they become kernel globals
    # that later cells may read.)
    X, y, label_encoder, onehot_encoder = load_and_preprocess_data(
        "/kaggle/input/public-sector-network/environment.csv",
        "/kaggle/input/public-sector-network/nodes.csv",
    )

    # Fit the AutoML classifier on the full dataset
    automl_model = train_automl_model(X, y)

    # Persist the model together with both encoders
    save_model_and_encoders(
        automl_model,
        label_encoder,
        onehot_encoder,
        model_file,
        label_encoder_file,
        onehot_encoder_file,
    )

    # Score the fixed sample record using the artifacts just written
    user_input = get_predefined_input()
    risk_level = predict_risk_level(
        user_input,
        model_file,
        label_encoder_file,
        onehot_encoder_file,
        X.columns,
    )

    # Report the outcome
    print("\nPrediction Results:")
    print(f"The predicted disaster risk level for the organization type in the region is: {risk_level}")

Merged data saved to /kaggle/working/environment_merged_data.csv
[flaml.automl.logger: 01-25 00:48:21] {1728} INFO - task = classification
[flaml.automl.logger: 01-25 00:48:21] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 01-25 00:48:21] {1838} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 01-25 00:48:22] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost', 'lrl1']
[flaml.automl.logger: 01-25 00:48:22] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 01-25 00:48:22] {2393} INFO - Estimated sufficient time budget=5984s. Estimated necessary time budget=148s.
[flaml.automl.logger: 01-25 00:48:22] {2442} INFO -  at 1.0s,	estimator lgbm's best error=0.1500,	best estimator lgbm's best error=0.1500
[flaml.automl.logger: 01-25 00:48:22] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 01-25 00:48:22] {2442} INFO -  at 1.0s,	estimator lgbm's best err