In [1]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
#from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
#from sklearn.linear_model import Lasso
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.linear_model import Ridge
#from sklearn.svm import SVR
#from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import time 
import os

# ---------------------------------------------------------------------------
# Data loading and feature/target definition
# ---------------------------------------------------------------------------
all_data = pd.read_csv("combined_data.csv")

# Packets per second is disabled for full buffer traffic, so every
# "Full Buffer" row is assigned a fixed value of 100 for that feature.
is_full_buffer = all_data["Generated_traffic"] == "Full Buffer"
all_data.loc[is_full_buffer, "Packets_per_second"] = 100

# Input features, grouped by the preprocessing each group receives.
categorical_columns = [
    "Communication_link",
    "Power_save_mechanism",
    "Generated_traffic",
    "Transport_protocol",
]
numerical_columns = ["Packets_per_second", "Sta_count"]
# Full scenario description: categorical features first, then numerical ones.
scenario_list = categorical_columns + numerical_columns

# Metrics to predict; one model is tuned per entry.
target_columns = [
    "Uplink_delay",
    "Downlink_delay",
    "Uplink_pcktloss",
    "Downlink_pcktloss",
    "Uplink_thrpt",
    "Downlink_thrpt",
    "sta_nrg",
    "ap_nrg",
    "sta_overhead",
    "ap_overhead",
]

# Missing target values are replaced by 0 before any model sees them.
all_data[target_columns] = all_data[target_columns].fillna(0)
scenario_set = all_data[scenario_list]

# ---------------------------------------------------------------------------
# Preprocessing + estimator pipeline
# ---------------------------------------------------------------------------
# Categories not seen while fitting are encoded as all-zero rows instead of
# raising at transform time.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("one_hot_encoder", categorical_preprocessor, categorical_columns),
        ("standard_scaler", numerical_preprocessor, numerical_columns),
    ]
)

# Fixed random_state keeps the boosted trees reproducible between runs.
model = make_pipeline(preprocessor, GradientBoostingRegressor(random_state=42))

# Hyperparameter search space handed to GridSearchCV.
# make_pipeline names each step after its lowercased class name, so every key
# is "<step name>__<estimator parameter>".
_GBR_STEP = "gradientboostingregressor"
param_grid = {
    f"{_GBR_STEP}__{name}": values
    for name, values in (
        ("n_estimators", [100, 200, 300]),
        ("learning_rate", [0.01, 0.1, 0.2]),
        ("max_depth", [3, 5, 7]),
        ("min_samples_split", [2, 5, 10]),
        ("min_samples_leaf", [1, 2, 4]),
    )
}

# Alternative grids for the estimators whose imports are commented out at the
# top of the file (swap the final pipeline step and use the matching keys):
#   decisiontreeregressor__max_depth: [3, 5, 10, None]
#   decisiontreeregressor__min_samples_split: [2, 5, 10]
#   decisiontreeregressor__min_samples_leaf: [1, 2, 4]
#   ridge__alpha: [0.1, 1, 10, 100, 1000]
#   lasso__alpha: [0.1, 1, 10, 100, 1000]
#   randomforestregressor__n_estimators: [100, 200, 300]
#   randomforestregressor__max_depth: [None, 10, 20, 30]
#   randomforestregressor__min_samples_split: [2, 5, 10]
#   randomforestregressor__min_samples_leaf: [1, 2, 4]
#   svr__kernel: ['linear', 'poly', 'rbf', 'sigmoid']
#   svr__C: [0.1, 1, 10, 100]
#   svr__epsilon: [0.01, 0.1, 0.2]

# Tune, evaluate and time one model per target metric.
# results_dict maps each target column name to a single-row DataFrame holding
# the metrics collected for that target.
results_dict = {}
for target_column in target_columns:
    # Defensive fill: any NaN left in the target is treated as 0.
    target = all_data[target_column].fillna(0)
    # Hold out 20% of the rows; the fixed seed gives every target the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        scenario_set, target, test_size=0.2, random_state=42
    )

    # Use GridSearchCV to search for the best hyperparameters with cross-validation.
    # Durations are measured with time.perf_counter(): it is monotonic and
    # high-resolution, whereas time.time() can be coarse or jump when the system
    # clock is adjusted, which matters for the very short prediction interval.
    # NOTE: the "training time" covers the whole grid search (every candidate on
    # every fold) plus the final refit of the best candidate, not a single fit.
    train_time_start = time.perf_counter()
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    train_time_end = time.perf_counter()

    # Cross-validate the best configuration on the training split only.
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_mse_scores = -cv_scores  # scorer returns negated MSE; flip the sign back
    cv_rmse_scores = np.sqrt(cv_mse_scores)  # per-fold RMSE

    # Time prediction on the held-out test split.
    predict_time_start = time.perf_counter()
    y_pred = best_model.predict(X_test)
    predict_time_end = time.perf_counter()

    # Collect results as a one-row frame so it can be appended to a CSV later.
    results_df = pd.DataFrame({
        'ML model': ['Gradient Boosting Regression'],
        'Mean cross-validated MSE': [np.mean(cv_mse_scores)],
        'Mean cross-validated RMSE': [np.mean(cv_rmse_scores)],
        'Best model hyperparameters': [str(grid_search.best_params_)],
        'model MSE': [mean_squared_error(y_test, y_pred)],
        'model R² score': [r2_score(y_test, y_pred)],
        'model Training Time': [train_time_end - train_time_start],
        'model Prediction Time': [predict_time_end - predict_time_start]
    })
    # Store results DataFrame in the dictionary, keyed by target name.
    results_dict[target_column] = results_df

# Persist the per-target results: one CSV per target column inside results_dir.
# If a CSV already exists, the new row is appended to its contents so results
# from successive runs (e.g. different models) accumulate in the same file.
results_dir = 'ml-results/'
# Create the output directory on first use; without this, to_csv raises
# OSError when the directory does not exist yet.
os.makedirs(results_dir, exist_ok=True)

for target_column, df in results_dict.items():
    file_path = os.path.join(results_dir, f"{target_column}.csv")
    if os.path.exists(file_path):
        # Read existing data, append the new results, and rewrite the file
        # without the index column.
        existing_df = pd.read_csv(file_path)
        combined_df = pd.concat([existing_df, df], ignore_index=True)
        combined_df.to_csv(file_path, index=False)
        print(f"Results for Target Column '{target_column}' appended to '{file_path}'")
    else:
        # No previous results for this target: start a new file without index.
        df.to_csv(file_path, index=False)
        print(f"Results for Target Column '{target_column}' saved to '{file_path}'")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Results for Target Column 'Uplink_delay' appended to 'ml-results/Uplink_delay.csv'
Results for Target Column 'Downlink_delay' appended to 'ml-results/Downlink_delay.csv'
Results for Target Column 'Uplink_pcktloss' appended to 'ml-results/Uplink_pcktloss.csv'
Results for Target Column 'Downlink_pcktloss' appended to 'ml-results/Downlink_pcktloss.csv'
Results for Target Column 'Uplink_thrpt' appended to 'ml-results/Uplink_thrpt.csv'
Results for Target Column 'Downlink_thrpt' appended to 'ml-results/Downlink_thrpt.csv'
Results for Target Column 'sta_nrg' appended to 'ml-results/sta_nrg.csv'
Results for Target Column 'ap_nrg' appended to 'ml-results/ap_nrg.csv'
Results for Target Column 'sta_overhead' appended to 'ml-results/sta_overhead.csv'
Results for Target Column 'ap_overhead' appended to 'ml-results/ap_overhead.csv'
