<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
LightGBM for Regression 
</p>

In [2]:
# Libraries for data manipulation and visualization
import numpy as np                               # For numerical operations
import pandas as pd                              # For data manipulation
import matplotlib.pyplot as plt                  # For plotting
import seaborn as sns                            # For advanced data visualization

# Libraries for model building and evaluation
from sklearn.model_selection import (              # For cross-validation, splitting data, and grid search
    KFold, 
    train_test_split, 
    GridSearchCV
)
from sklearn.metrics import (                     # For model evaluation metrics
    mean_squared_error, 
    r2_score, 
    mean_absolute_percentage_error, 
    median_absolute_error
)

# LightGBM library
import lightgbm as lgb                            # For LightGBM Regressor

# ================================
# Set Seed for Reproducibility
# ================================
RANDOM_STATE = 50
np.random.seed(RANDOM_STATE)

# Suppress specific FutureWarning about 'force_all_finite'
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, message=".*'force_all_finite'.*")
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.utils._tags")

In [3]:
# Define the path to the dataset
dataset_path = '../../all_data_files/cleaned_dataset_per_device.csv'

# Load the dataset
try:
    df = pd.read_csv(dataset_path)
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print(f"File not found at the specified path: {dataset_path}")
    import sys
    sys.exit()

# Display dataset information
print("\nDataset Information:")
df.info()

print("\nFirst Five Rows of the Dataset:")
display(df.head())

Dataset loaded successfully.

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656317 entries, 0 to 656316
Data columns (total 20 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   time         656317 non-null  object 
 1   device_id    656317 non-null  object 
 2   co2          656317 non-null  float64
 3   humidity     656317 non-null  float64
 4   pm25         656317 non-null  float64
 5   pressure     656317 non-null  float64
 6   temperature  656317 non-null  float64
 7   rssi         656317 non-null  float64
 8   snr          656317 non-null  float64
 9   SF           656317 non-null  int64  
 10  frequency    656317 non-null  float64
 11  f_count      656317 non-null  float64
 12  p_count      656317 non-null  float64
 13  toa          656317 non-null  float64
 14  distance     656317 non-null  int64  
 15  c_walls      656317 non-null  int64  
 16  w_walls      656317 non-null  int64  
 17  exp_pl       656

Unnamed: 0,time,device_id,co2,humidity,pm25,pressure,temperature,rssi,snr,SF,frequency,f_count,p_count,toa,distance,c_walls,w_walls,exp_pl,n_power,esp
0,2024-09-26 11:02:08.387851+00:00,ED0,539.0,49.34,0.39,299.69,24.57,-48.0,8.5,9,867.5,82.0,109.0,0.246784,10,0,0,65.4,-57.073822,-48.573822
1,2024-09-26 11:03:08.309590+00:00,ED0,540.0,49.33,0.8,299.77,24.59,-48.0,12.8,8,867.3,83.0,110.0,0.133632,10,0,0,65.4,-61.022142,-48.222142
2,2024-09-26 11:04:08.368448+00:00,ED0,537.0,49.28,0.71,299.7,24.62,-48.0,8.0,8,868.5,84.0,111.0,0.133632,10,0,0,65.4,-56.63892,-48.63892
3,2024-09-26 11:05:08.405529+00:00,ED0,537.0,49.34,0.56,299.69,24.63,-49.0,11.0,8,867.9,85.0,112.0,0.133632,10,0,0,66.4,-60.331956,-49.331956
4,2024-09-26 11:06:08.455112+00:00,ED0,534.0,49.28,0.6,299.73,24.64,-46.0,9.2,8,867.5,86.0,113.0,0.133632,10,0,0,63.4,-55.693058,-46.493058


In [4]:
# Define feature columns and target
feature_columns = [
    'distance', 'frequency', 'c_walls', 'w_walls', 
    'co2', 'humidity', 'pm25', 'pressure', 
    'temperature', 'snr'
]
target_column = 'exp_pl'

# Verify that all required columns exist
missing_columns = set(feature_columns + [target_column]) - set(df.columns)
if missing_columns:
    raise ValueError(f"The following required columns are missing in the dataset: {missing_columns}")

# Extract features and target
all_features = df[feature_columns].values
PL_all = df[target_column].values

# Perform train-test split (80-20 split)
X_train_all, X_test_all, PL_train_all, PL_test_all = train_test_split(
    all_features, PL_all, test_size=0.2, random_state=RANDOM_STATE
)

print("Train-test split completed.")

Train-test split completed.


In [5]:
# Define the fixed max_depth values
max_depth_values = [1, 2, 3, 4]

# Define other hyperparameters for grid search
param_grid = {
    'learning_rate': [0.01, 0.05],            # Step size shrinkage
    'n_estimators': [50, 80, 100],     # Number of boosting iterations
    'subsample': [0.2],                 # Subsample ratio of the training instances
    'colsample_bytree': [0.2],          # Subsample ratio of columns when constructing each tree
    'min_child_weight': [1],            # Minimum sum of instance weight (hessian) needed in a child
    'reg_alpha': [0.5],                 # L1 regularization term on weights
    'reg_lambda': [0.5]                 # L2 regularization term on weights
}

# Dictionary to store the best models for each max_depth
best_models = {}
best_params_per_depth = {}
best_scores_per_depth = {}

# Iterate over each max_depth and perform grid search
for depth in max_depth_values:
    print(f"\nPerforming Grid Search for max_depth={depth}...")
    
    # Set num_leaves based on max_depth
    if depth == 1:
        num_leaves_list = [2]
    elif depth == 2:
        num_leaves_list = [3]
    elif depth == 3:
        num_leaves_list = [7]
    
    # Update param_grid with the appropriate num_leaves
    current_param_grid = param_grid.copy()
    current_param_grid['num_leaves'] = num_leaves_list
    
    print(f"  Using num_leaves: {num_leaves_list}")
    
    # Create a base LightGBM Regressor with the current max_depth
    lgb_reg = lgb.LGBMRegressor(
        max_depth=depth,
        objective='regression',                       # Regression task
        random_state=RANDOM_STATE, 
        n_jobs=-1,
        verbosity=-1  # Suppress informational messages
    )
    
    # Initialize Grid Search
    grid_search = GridSearchCV(
        estimator=lgb_reg,
        param_grid=current_param_grid,
        cv=5,                                         # 5-fold cross-validation
        scoring='neg_mean_squared_error',             # Using negative MSE for comparison
        verbose=0,
        n_jobs=-1
    )
    
    # Perform Grid Search with Early Stopping
    grid_search.fit(X_train_all, PL_train_all)
    
    # Retrieve the best parameters and corresponding score
    best_params = grid_search.best_params_
    best_neg_mse = grid_search.best_score_
    best_mse = -best_neg_mse  # Convert from negative MSE to MSE
    
    best_models[depth] = grid_search.best_estimator_
    best_params_per_depth[depth] = best_params
    best_scores_per_depth[depth] = best_mse
    
    print(f"  Best Parameters for max_depth={depth}: {best_params}")
    print(f"  Best CV MSE for max_depth={depth}: {best_mse:.4f}")


Performing Grid Search for max_depth=1...
  Using num_leaves: [2]
  Best Parameters for max_depth=1: {'colsample_bytree': 0.2, 'learning_rate': 0.05, 'min_child_weight': 1, 'n_estimators': 100, 'num_leaves': 2, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'subsample': 0.2}
  Best CV MSE for max_depth=1: 63.0219

Performing Grid Search for max_depth=2...
  Using num_leaves: [3]
  Best Parameters for max_depth=2: {'colsample_bytree': 0.2, 'learning_rate': 0.05, 'min_child_weight': 1, 'n_estimators': 100, 'num_leaves': 3, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'subsample': 0.2}
  Best CV MSE for max_depth=2: 52.1129

Performing Grid Search for max_depth=3...
  Using num_leaves: [7]
  Best Parameters for max_depth=3: {'colsample_bytree': 0.2, 'learning_rate': 0.05, 'min_child_weight': 1, 'n_estimators': 100, 'num_leaves': 7, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'subsample': 0.2}
  Best CV MSE for max_depth=3: 44.6034

Performing Grid Search for max_depth=4...
  Using num_leaves: [7]
  Best Paramete

In [6]:
# Initialize a list to store evaluation metrics for the best models
evaluation_metrics = []

print("\nEvaluating the best LightGBM models from Grid Search...\n")

for depth in best_models:
    print(f"Evaluating model with max_depth={depth}...")
    
    # Retrieve the best model for the current depth
    best_model = best_models[depth]
    
    # Make predictions on the training set
    PL_train_pred = best_model.predict(X_train_all)
    # Make predictions on the test set
    PL_test_pred = best_model.predict(X_test_all)
    
    # Calculate metrics
    train_mse = mean_squared_error(PL_train_all, PL_train_pred)
    test_mse = mean_squared_error(PL_test_all, PL_test_pred)
    train_r2 = r2_score(PL_train_all, PL_train_pred)
    test_r2 = r2_score(PL_test_all, PL_test_pred)
    test_rmse = np.sqrt(test_mse)
    test_mape = mean_absolute_percentage_error(PL_test_all, PL_test_pred)
    test_median_ae = median_absolute_error(PL_test_all, PL_test_pred)
    
    # Append metrics to the list
    evaluation_metrics.append({
        'max_depth': depth,
        'Model': 'LightGBM (GridSearch)',
        'Training Loss (MSE)': train_mse,
        'Test Loss (MSE)': test_mse,
        'Test RMSE': test_rmse,
        'R² Score': test_r2,
        'Test MAPE (%)': test_mape * 100,
        'Test Median AE': test_median_ae
    })
    
    print(f"  Model with max_depth={depth} - MSE: Train={train_mse:.4f}, Test={test_mse:.4f}, "
          f"R²: Train={train_r2:.4f}, Test={test_r2:.4f}, "
          f"MAPE: Test={test_mape*100:.2f}%, Median AE: Test={test_median_ae:.4f}\n")

# Convert the metrics list to a DataFrame
evaluation_df = pd.DataFrame(evaluation_metrics)

print(f"\nModel Evaluation Metrics:")
# Display the DataFrame
display(evaluation_df)


Evaluating the best LightGBM models from Grid Search...

Evaluating model with max_depth=1...
  Model with max_depth=1 - MSE: Train=63.0033, Test=62.6220, R²: Train=0.8262, Test=0.8272, MAPE: Test=6.60%, Median AE: Test=4.2438

Evaluating model with max_depth=2...
  Model with max_depth=2 - MSE: Train=52.1649, Test=51.8005, R²: Train=0.8561, Test=0.8571, MAPE: Test=5.67%, Median AE: Test=3.8179

Evaluating model with max_depth=3...
  Model with max_depth=3 - MSE: Train=44.5722, Test=44.4499, R²: Train=0.8771, Test=0.8774, MAPE: Test=5.20%, Median AE: Test=3.4454

Evaluating model with max_depth=4...
  Model with max_depth=4 - MSE: Train=44.3085, Test=44.1747, R²: Train=0.8778, Test=0.8781, MAPE: Test=5.17%, Median AE: Test=3.4230


Model Evaluation Metrics:


Unnamed: 0,max_depth,Model,Training Loss (MSE),Test Loss (MSE),Test RMSE,R² Score,Test MAPE (%),Test Median AE
0,1,LightGBM (GridSearch),63.003308,62.622003,7.913407,0.827209,6.602695,4.243784
1,2,LightGBM (GridSearch),52.164938,51.800475,7.197255,0.857068,5.670651,3.817864
2,3,LightGBM (GridSearch),44.572164,44.449895,6.667075,0.877351,5.197158,3.445359
3,4,LightGBM (GridSearch),44.308498,44.174656,6.646402,0.87811,5.171214,3.423037


<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
Cross Validation : Could be redundant anyway! 
</p>

# Define the number of folds for cross-validation
n_folds = 5

# Initialize KFold with consistent RANDOM_STATE
kf = KFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)

# Dictionary to store cross-validation results
cv_results_dict = {depth: [] for depth in max_depth_values}

print("\nPerforming K-Fold Cross-Validation for Each Best Model...\n")

for depth in max_depth_values:
    if depth not in best_models:
        print(f"No best model found for max_depth={depth}. Skipping cross-validation.")
        continue
    
    model = best_models[depth]
    params = best_params_per_depth[depth]
    
    print(f"Cross-Validation for max_depth={depth} with parameters: {params}")
    
    fold = 1
    for train_idx, val_idx in kf.split(X_train_all):
        print(f"  Training fold {fold}...")
        
        # Split the data for the current fold
        X_train_fold, X_val_fold = X_train_all[train_idx], X_train_all[val_idx]
        PL_train_fold, PL_val_fold = PL_train_all[train_idx], PL_train_all[val_idx]
        
        # Instantiate a new model with the best parameters
        lgb_cv = lgb.LGBMRegressor(
            max_depth=depth,
            num_leaves=params['num_leaves'],
            learning_rate=params['learning_rate'],
            n_estimators=params['n_estimators'],
            subsample=params['subsample'],
            colsample_bytree=params['colsample_bytree'],
            min_child_weight=params['min_child_weight'],
            reg_alpha=params['reg_alpha'],
            reg_lambda=params['reg_lambda'],
            objective='regression',
            random_state=RANDOM_STATE,  # Ensured consistency
            n_jobs=-1
        )
        
        # Train the model on the current fold
        lgb_cv.fit(X_train_fold, PL_train_fold)
        
        # Predict on validation set
        PL_val_pred = lgb_cv.predict(X_val_fold)
        
        # Calculate metrics
        val_mse = mean_squared_error(PL_val_fold, PL_val_pred)
        val_rmse = np.sqrt(val_mse)
        val_r2 = r2_score(PL_val_fold, PL_val_pred)
        val_mape = mean_absolute_percentage_error(PL_val_fold, PL_val_pred)
        val_median_ae = median_absolute_error(PL_val_fold, PL_val_pred)
        
        # Append metrics to the dictionary
        cv_results_dict[depth].append({
            'Fold': fold,
            'Validation Loss (MSE)': round(val_mse, 4),
            'Validation RMSE': round(val_rmse, 4),
            'R² Score': round(val_r2, 4),
            'Validation MAPE (%)': round(val_mape * 100, 2),
            'Validation Median AE': round(val_median_ae, 4)
        })
        
        print(f"  Fold {fold} - MSE: {val_mse:.4f}, RMSE: {val_rmse:.4f}, "
              f"R²: {val_r2:.4f}, MAPE: {val_mape*100:.2f}%, "
              f"Median AE: {val_median_ae:.4f}\n")
        fold += 1

# Create a DataFrame for each max_depth's CV results and display them
for depth in max_depth_values:
    if depth not in cv_results_dict or not cv_results_dict[depth]:
        print(f"\nNo cross-validation results available for max_depth={depth}.")
        continue
    
    print(f"\nK-Fold Cross-Validation Results for max_depth={depth}:")
    cv_results_df = pd.DataFrame(cv_results_dict[depth])
    display(cv_results_df)
    
    # Summary statistics
    cv_summary = cv_results_df.agg(['mean', 'std']).round(4).reset_index()
    cv_summary.rename(columns={'index': 'Metric'}, inplace=True)
    print(f"\nCross-Validation Summary for max_depth={depth}:")
    display(cv_summary)