In [1]:
import pandas as pd

data = pd.read_csv('/content/drive/MyDrive/ELM/merged_CSV/merged_132K.csv').dropna()
data = data.drop_duplicates(subset=['.geo'])

print('Dataset Features :',data.columns.tolist())

Dataset Features : ['system:index', 'City', 'Class', 'HH', 'HH_asm', 'HH_contrast', 'HH_corr', 'HH_dent', 'HH_diss', 'HH_dvar', 'HH_ent', 'HH_idm', 'HH_imcorr1', 'HH_imcorr2', 'HH_inertia', 'HH_maxcorr', 'HH_prom', 'HH_savg', 'HH_sent', 'HH_shade', 'HH_svar', 'HH_var', 'HV', 'HV_asm', 'HV_contrast', 'HV_corr', 'HV_dent', 'HV_diss', 'HV_dvar', 'HV_ent', 'HV_idm', 'HV_imcorr1', 'HV_imcorr2', 'HV_inertia', 'HV_maxcorr', 'HV_prom', 'HV_savg', 'HV_sent', 'HV_shade', 'HV_svar', 'HV_var', 'NSAI1', 'NSAI2', 'SISAI', 'VH', 'VH_asm', 'VH_contrast', 'VH_corr', 'VH_dent', 'VH_diss', 'VH_dvar', 'VH_ent', 'VH_idm', 'VH_imcorr1', 'VH_imcorr2', 'VH_inertia', 'VH_maxcorr', 'VH_prom', 'VH_savg', 'VH_sent', 'VH_shade', 'VH_svar', 'VH_var', 'VV', 'VV_asm', 'VV_contrast', 'VV_corr', 'VV_dent', 'VV_diss', 'VV_dvar', 'VV_ent', 'VV_idm', 'VV_imcorr1', 'VV_imcorr2', 'VV_inertia', 'VV_maxcorr', 'VV_prom', 'VV_savg', 'VV_sent', 'VV_shade', 'VV_svar', 'VV_var', 'indbiMax', 'indbiMedian', 'mndwiMax', 'mndwiMedian'

In [19]:
# CLASSIFIER based Best Feature Combination
# Version 14.4 (11 September 2023)
# add addtional accuracy matrix for best feature combination model

# Import necessary libraries
import time

# Record the current time
start_time = time.time()

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb

# Define a function for Accuracy Assessment
def evaluate_accuracy(y_true, y_pred):

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    confusion = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)

    print("\nAccuracy Metrics:\n")

    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1 Score: {f1:.3f}")
    print(f"Kappa: {kappa:.3f}")

    print("\nConfusion Matrix:")
    print(confusion)

    print("\nClassification Report:")
    print(report)

    return accuracy, precision, recall, f1, kappa, confusion, report


# Define a function for Classifier based model training and evaluation
def train_and_evaluate_classifier(X_train, X_test, y_train, y_test):
    # Define the classifiers
    rf_model = RandomForestClassifier(n_estimators=250, random_state=100, n_jobs=-1)
    xgb_model = xgb.XGBClassifier(n_estimators=250, random_state=100, n_jobs=-1)

    # Create a VotingClassifier with soft voting
    ensemble_model = VotingClassifier(
        estimators=[('rf', rf_model), ('xgb', xgb_model)],
        voting='soft'
    ).fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = ensemble_model.predict(X_test)

    # Calculate accuracy metrics
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average=None)[-1] # for UIS class only
    report = classification_report(y_test, predictions)
    confusion = confusion_matrix(y_test, predictions)

    return accuracy, confusion, report, predictions, f1

selected_features =[
    # 'HH', 'HH_asm', 'HH_contrast', 'HH_corr', 'HH_dent', 'HH_diss', 'HH_dvar', 'HH_ent', 'HH_idm', 'HH_imcorr1', 'HH_imcorr2', 'HH_inertia', 'HH_prom', 'HH_savg', 'HH_sent', 'HH_shade', 'HH_svar', 'HH_var',
    # 'HV', 'HV_asm', 'HV_contrast', 'HV_corr', 'HV_dent', 'HV_diss', 'HV_dvar', 'HV_ent', 'HV_idm', 'HV_imcorr1', 'HV_imcorr2', 'HV_inertia', 'HV_prom', 'HV_savg', 'HV_sent', 'HV_shade', 'HV_svar', 'HV_var',
    'NSAI1', 'NSAI2', 'SISAI',
    # 'VH', 'VH_asm', 'VH_contrast', 'VH_corr', 'VH_dent', 'VH_diss', 'VH_dvar', 'VH_ent', 'VH_idm', 'VH_imcorr1', 'VH_imcorr2', 'VH_inertia', 'VH_prom', 'VH_savg', 'VH_sent', 'VH_shade', 'VH_svar', 'VH_var',
    # 'VV', 'VV_asm', 'VV_contrast', 'VV_corr', 'VV_dent', 'VV_diss', 'VV_dvar', 'VV_ent', 'VV_idm', 'VV_imcorr1', 'VV_imcorr2', 'VV_inertia', 'VV_prom', 'VV_savg', 'VV_sent', 'VV_shade', 'VV_svar', 'VV_var',
    'indbiMax', 'indbiMedian', 'mndwiMax', 'mndwiMedian', 'mndwiSD', 'ndbiMedian', 'ndbiMin', 'ndbiSD', 'nduiMedian', 'nduiMin', 'nduiSD', 'ndviMax', 'ndviMedian', 'ndviSD',
    's2_aerosols', 's2_blue', 's2_green', 's2_nir', 's2_red', 's2_redEdge1', 's2_redEdge2', 's2_redEdge3', 's2_redEdge4', 's2_swir1', 's2_swir2', 's2_waterVapor', 'swiRedMedian',
    'viirs'
                   ]

def main():
    # Load the CSV file into a DataFrame
    data = pd.read_csv('/content/drive/MyDrive/ELM/merged_CSV/merged_132K.csv').dropna()
    data = data.drop_duplicates(subset=['.geo'])

    print('Dataset Features :',data.columns.tolist())

    # Define sample sizes for each class seperately
    # sample_sizes = {        0: 3000,        1: 3000,        2: 3000,        3: 3000,        4: 6000        }
    sample_sizes = {
        0: 100,
        1: 100,
        2: 100,
        3: 100,
        4: 100        }

    grouped = data.groupby('Class')# Group by class
    sampled = grouped.apply(lambda x: x.sample(n=sample_sizes[x.name], replace=True).reset_index(drop=True))  # Sample from each group according to the defined sample sizes
    sampled = [sampled] # Convert to list
    data = pd.concat(sampled) # Concatenate samples

    target = data['Class']
    data = data[selected_features]

    print('Selected Features :',data.columns.tolist())
    print('Total sample numbers in each classes :')
    print(target.value_counts())
    print('n_estimators=250')
    print()

    # Assuming 'data' is your DataFrame and 'target' is the target variable
    features = data.columns.tolist()

    # Record the best accuracy and feature combination
    best_accuracy = 0
    best_combination = []

    # Create a dictionary to store the accuracy of each feature
    feature_accuracies = {}

    # Loop until no new feature is added to the best combination
    while True:
        best_feature = None

        for feature in features:
            # Add the current feature to the best combination
            combination = best_combination + [feature]

            # Split the data into training and test sets (with a random seed)
            X_train, X_test, y_train, y_test = train_test_split(data[combination], target, test_size=0.20, random_state=0)

            # Train the model and evaluate it
            accuracy = train_and_evaluate_classifier(X_train, X_test, y_train, y_test)[4] # selected 4 for F1_score

            # Store the accuracy of the feature
            feature_accuracies[feature] = accuracy

            # If this combination has the best accuracy so far, record it
            if accuracy > best_accuracy:    # for regressor <, for classifiers >
                best_accuracy = accuracy
                best_feature = feature
                best_accuracy = accuracy
                best_X_train = X_train
                best_X_test = X_test
                best_y_train = y_train
                best_y_test = y_test
        # If no new feature was added to the best combination, stop the loop
        if best_feature is None:
            break

        # Sort the features by their accuracies in ascending order and print them
        sorted_features = sorted(feature_accuracies.items(), key=lambda item: item[1], reverse=False)

        # If we're evaluating a single feature, print its accuracy
        if len(combination) == 1:
            for feature, accuracy in sorted_features:
                print(f"Accuracy: {accuracy:.4f}, Feature: {feature}")

        # Otherwise, add the best feature to the best combination
        best_combination.append(best_feature)
        features.remove(best_feature)

        print(f"Accuracy: {best_accuracy:.4f}, Combination: {best_combination}")

    print(f"Best accuracy: {best_accuracy:.4f}, Best feature combination: {best_combination}")

    # Call function train_and_evaluate_regressor. Mainly to define the variable predictions
    accuracy, confusion, report, predictions, f1 = train_and_evaluate_classifier(best_X_train, best_X_test, best_y_train, best_y_test)

    # use the prediction variable to  get accuracy metrics
    accuracy_metrics = evaluate_accuracy(best_y_test, predictions)

if __name__ == "__main__":
   main()

# Calculate and print the time it took to train the model
training_time = time.time() - start_time
print(f"Training time: {training_time} seconds")

Dataset Features : ['system:index', 'City', 'Class', 'HH', 'HH_asm', 'HH_contrast', 'HH_corr', 'HH_dent', 'HH_diss', 'HH_dvar', 'HH_ent', 'HH_idm', 'HH_imcorr1', 'HH_imcorr2', 'HH_inertia', 'HH_maxcorr', 'HH_prom', 'HH_savg', 'HH_sent', 'HH_shade', 'HH_svar', 'HH_var', 'HV', 'HV_asm', 'HV_contrast', 'HV_corr', 'HV_dent', 'HV_diss', 'HV_dvar', 'HV_ent', 'HV_idm', 'HV_imcorr1', 'HV_imcorr2', 'HV_inertia', 'HV_maxcorr', 'HV_prom', 'HV_savg', 'HV_sent', 'HV_shade', 'HV_svar', 'HV_var', 'NSAI1', 'NSAI2', 'SISAI', 'VH', 'VH_asm', 'VH_contrast', 'VH_corr', 'VH_dent', 'VH_diss', 'VH_dvar', 'VH_ent', 'VH_idm', 'VH_imcorr1', 'VH_imcorr2', 'VH_inertia', 'VH_maxcorr', 'VH_prom', 'VH_savg', 'VH_sent', 'VH_shade', 'VH_svar', 'VH_var', 'VV', 'VV_asm', 'VV_contrast', 'VV_corr', 'VV_dent', 'VV_diss', 'VV_dvar', 'VV_ent', 'VV_idm', 'VV_imcorr1', 'VV_imcorr2', 'VV_inertia', 'VV_maxcorr', 'VV_prom', 'VV_savg', 'VV_sent', 'VV_shade', 'VV_svar', 'VV_var', 'indbiMax', 'indbiMedian', 'mndwiMax', 'mndwiMedian'

In [16]:
# REGRESSOR based  Best Feature Combination
# Version 14.4.1 (11 September 2023)
# add addtional accuracy matrix for best feature combination model

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Define a function for Accuracy Assessment for regressor ML model
def evaluate_accuracy_regressor(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print("\nRegression Metrics:\n")
    print(f"Mean Squared Error (MSE): {mse:.3f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.3f}")
    print(f"Mean Absolute Error (MAE): {mae:.3f}")
    print(f"R^2 Score: {r2:.3f}")

    return mse, rmse, mae, r2

# Define a function for REGRESSION based model training and evaluation
def train_and_evaluate_regressor(X_train, X_test, y_train, y_test):
    # Define the regression models
    rf_model = RandomForestRegressor(n_estimators=250, random_state=100, n_jobs=-1)
    xgb_model = XGBRegressor(n_estimators=250, random_state=100, n_jobs=-1)

    # Create a VotingRegressor with soft voting
    ensemble_model = VotingRegressor(
        estimators=[('rf', rf_model), ('xgb', xgb_model)]
    )

    # Train the VotingRegressor
    ensemble_model.fit(X_train, y_train)

    # Make predictions
    predictions = ensemble_model.predict(X_test)

    # Calculate regression metrics
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    return mse, rmse, mae, r2, predictions

selected_features =[
    'HH', 'HH_asm', 'HH_contrast', 'HH_corr', 'HH_dent', 'HH_diss', 'HH_dvar', 'HH_ent', 'HH_idm', 'HH_imcorr1', 'HH_imcorr2', 'HH_inertia', 'HH_prom', 'HH_savg', 'HH_sent', 'HH_shade', 'HH_svar', 'HH_var',
    'HV', 'HV_asm', 'HV_contrast', 'HV_corr', 'HV_dent', 'HV_diss', 'HV_dvar', 'HV_ent', 'HV_idm', 'HV_imcorr1', 'HV_imcorr2', 'HV_inertia', 'HV_prom', 'HV_savg', 'HV_sent', 'HV_shade', 'HV_svar', 'HV_var',
    'NSAI1', 'NSAI2', 'SISAI',
    'VH', 'VH_asm', 'VH_contrast', 'VH_corr', 'VH_dent', 'VH_diss', 'VH_dvar', 'VH_ent', 'VH_idm', 'VH_imcorr1', 'VH_imcorr2', 'VH_inertia', 'VH_prom', 'VH_savg', 'VH_sent', 'VH_shade', 'VH_svar', 'VH_var',
    'VV', 'VV_asm', 'VV_contrast', 'VV_corr', 'VV_dent', 'VV_diss', 'VV_dvar', 'VV_ent', 'VV_idm', 'VV_imcorr1', 'VV_imcorr2', 'VV_inertia', 'VV_prom', 'VV_savg', 'VV_sent', 'VV_shade', 'VV_svar', 'VV_var',
    'indbiMax', 'indbiMedian', 'mndwiMax', 'mndwiMedian', 'mndwiSD', 'ndbiMedian', 'ndbiMin', 'ndbiSD', 'nduiMedian', 'nduiMin', 'nduiSD', 'ndviMax', 'ndviMedian', 'ndviSD',
    's2_aerosols', 's2_blue', 's2_green', 's2_nir', 's2_red', 's2_redEdge1', 's2_redEdge2', 's2_redEdge3', 's2_redEdge4', 's2_swir1', 's2_swir2', 's2_waterVapor', 'swiRedMedian',
    'viirs'
                   ]



def main():
    # Load the CSV file into a DataFrame
    data = pd.read_csv('/content/drive/MyDrive/ELM/merged_CSV/merged_132K.csv').dropna()
    data = data.drop_duplicates(subset=['.geo'])

    print('Dataset Features :',data.columns.tolist())

    # Define sample sizes for each class seperately
    # sample_sizes = {        0: 3000,        1: 3000,        2: 3000,        3: 3000,        4: 6000        }
    sample_sizes = {        0: 150,        1: 150,        2: 150,        3: 150,        4: 150        }

    grouped = data.groupby('Class')# Group by class
    sampled = grouped.apply(lambda x: x.sample(n=sample_sizes[x.name], replace=True).reset_index(drop=True))  # Sample from each group according to the defined sample sizes
    sampled = [sampled] # Convert to list
    data = pd.concat(sampled) # Concatenate samples

    target = data['Class']
    y = target
    data = data[selected_features]

    print('Selected Features :',data.columns.tolist())
    print('Total sample numbers in each classes :')
    print(target.value_counts())
    print('n_estimators=250')
    print()

    # Assuming 'data' is your DataFrame and 'target' is the target variable
    features = data.columns.tolist()

    # Record the best accuracy and feature combination
    best_accuracy = 0
    best_combination = []

    # Create a dictionary to store the accuracy of each feature
    feature_accuracies = {}

    best_X_train, best_X_test, best_y_train, best_y_test = None, None, None, None

    # Loop until no new feature is added to the best combination
    while True:
        best_feature = None

        for feature in features:
            # Add the current feature to the best combination
            combination = best_combination + [feature]

            # Split the data into training and test sets (with a random seed)
            X_train, X_test, y_train, y_test = train_test_split(data[combination], target, test_size=0.20, random_state=0)

            # Train the model and evaluate it
            accuracy = train_and_evaluate_regressor(X_train, X_test, y_train, y_test)[1] # selected 1 for RMSE

            # Store the accuracy of the feature
            feature_accuracies[feature] = accuracy

            # If this combination has the best accuracy so far, record it
            if accuracy < best_accuracy:    # for regressor <, for classifiers >
                best_accuracy = accuracy
                best_feature = feature
                best_accuracy = accuracy
                best_X_train = X_train
                best_X_test = X_test
                best_y_train = y_train
                best_y_test = y_test
        # If no new feature was added to the best combination, stop the loop
        if best_feature is None:
            break

        # Sort the features by their accuracies in ascending order and print them
        sorted_features = sorted(feature_accuracies.items(), key=lambda item: item[1], reverse=True)

        # If we're evaluating a single feature, print its accuracy
        if len(combination) == 1:
            for feature, accuracy in sorted_features:
                print(f"Accuracy: {accuracy:.4f}, Feature: {feature}")

        # Otherwise, add the best feature to the best combination
        best_combination.append(best_feature)
        features.remove(best_feature)

        print(f"Accuracy: {best_accuracy:.4f}, Combination: {best_combination}")

    print(f"Best accuracy: {best_accuracy:.4f}, Best feature combination: {best_combination}")

    # Call function train_and_evaluate_regressor. Mainly to define the variable predictions
    # accuracy, confusion, report, predictions, f1 = train_and_evaluate_classifier(best_X_train, best_X_test, best_y_train, best_y_test)
    # Train model with new best feature combo
    mse, rmse, mae, r2, predictions = train_and_evaluate_regressor(
        best_X_train, best_X_test, best_y_train, best_y_test
    )
    # use the prediction variable to  get accuracy metrics
    accuracy_metrics = evaluate_accuracy_regressor(best_y_test, predictions)

if __name__ == "__main__":
   main()
# Code for testing best Regressor based ML features

Dataset Features : ['system:index', 'City', 'Class', 'HH', 'HH_asm', 'HH_contrast', 'HH_corr', 'HH_dent', 'HH_diss', 'HH_dvar', 'HH_ent', 'HH_idm', 'HH_imcorr1', 'HH_imcorr2', 'HH_inertia', 'HH_maxcorr', 'HH_prom', 'HH_savg', 'HH_sent', 'HH_shade', 'HH_svar', 'HH_var', 'HV', 'HV_asm', 'HV_contrast', 'HV_corr', 'HV_dent', 'HV_diss', 'HV_dvar', 'HV_ent', 'HV_idm', 'HV_imcorr1', 'HV_imcorr2', 'HV_inertia', 'HV_maxcorr', 'HV_prom', 'HV_savg', 'HV_sent', 'HV_shade', 'HV_svar', 'HV_var', 'NSAI1', 'NSAI2', 'SISAI', 'VH', 'VH_asm', 'VH_contrast', 'VH_corr', 'VH_dent', 'VH_diss', 'VH_dvar', 'VH_ent', 'VH_idm', 'VH_imcorr1', 'VH_imcorr2', 'VH_inertia', 'VH_maxcorr', 'VH_prom', 'VH_savg', 'VH_sent', 'VH_shade', 'VH_svar', 'VH_var', 'VV', 'VV_asm', 'VV_contrast', 'VV_corr', 'VV_dent', 'VV_diss', 'VV_dvar', 'VV_ent', 'VV_idm', 'VV_imcorr1', 'VV_imcorr2', 'VV_inertia', 'VV_maxcorr', 'VV_prom', 'VV_savg', 'VV_sent', 'VV_shade', 'VV_svar', 'VV_var', 'indbiMax', 'indbiMedian', 'mndwiMax', 'mndwiMedian'

ValueError: ignored

In [None]:
# Building soft ensemble model with the best feature combination
# use after Version 14.3 of Best feature Combination findings
# 5 Class ML model training
# Version 7 (September 13, 2023)

# Import libraries
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    cohen_kappa_score
)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib


# Define model evaluation function for classifer ML model
def evaluate_classifier_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average=None)[-1]
    recall = recall_score(y_true, y_pred, average=None)[-1]
    f1 = f1_score(y_true, y_pred, average=None)[-1]
    confusion = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)

    print("Accuracy: {:.3f}".format(accuracy))
    print("Precision: {:.3f}".format(precision))
    print("Recall: {:.3f}".format(recall))
    print("F1 Score: {:.3f}".format(f1))
    print("Kappa: {:.3f}".format(kappa))
    print("\nConfusion Matrix:")
    print(confusion)
    print("\nClassification Report:")
    print(report)

    return accuracy, precision, recall, f1, kappa, confusion, report

# Define model evaluation function for regressor ML model
def evaluate_regressor_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    predictions = ensemble_model.predict(X_test)

    print("Mean Squared Error (MSE): {:.3f}".format(mse))
    print("Root Mean Squared Error (RMSE): {:.3f}".format(rmse))
    print("Mean Absolute Error (MAE): {:.3f}".format(mae))
    print("R^2 Score: {:.3f}".format(r2))
    print("Predictions: {:.3f}".format(predictions))

    return mse, rmse, mae, r2, predictions

# Load data
# # Load data and drop any rows contain NaN values
data = pd.read_csv('/content/drive/MyDrive/ELM/merged_CSV/merged_132K.csv').dropna()
data = data.drop_duplicates(subset=['.geo'])
# data = merged_df.dropna()

# # Define sample sizes
# sample_sizes = {
#     0: 10000,
#     1: 10000,
#     2: 10000,
#     3: 10000,
#     4: 10000
#     }
    # Define sample sizes
sample_sizes = {
    0: 1000,
    1: 1000,
    2: 1000,
    3: 1000,
    4: 1000
    }

grouped = data.groupby('Class')# Group by class
sampled = grouped.apply(lambda x: x.sample(n=sample_sizes[x.name], replace=True).reset_index(drop=True))  # Sample from each group according to the defined sample sizes
sampled = [sampled] # Convert to list
sampled = pd.concat(sampled) # Concatenate samples

# Verify counts
print('Total sample numbers in each classes :')
print(sampled['Class'].value_counts())
print()

SFM = ['SISAI']
MFM = ['swirSoil', 'CI', 'Green', 'NIR', 'DBSI', 'NSAI2', 'Blue']

models = [SFM, MFM]
model_name = ['SFM', 'MFM']


# Start of the loop function
for i in range(len(models)):
    # Split data
    X = sampled[models[i]]
    y = sampled['Class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

    # # Train a Classifier models
    # rf = RandomForestClassifier(n_estimators=500, random_state=100).fit(X_train, y_train)
    # xgb = xgb.XGBClassifier(n_estimators=500, random_state=100).fit(X_train, y_train)
    # ensemble_model = VotingClassifier(estimators=[('Random Forest', rf), ('XGBoost', xgb)], voting='soft').fit(X_train, y_train)

    # Train a Regressor Model
    rf = RandomForestRegressor(n_estimators=500, random_state=100).fit(X_train, y_train)
    xgb = xgb.XGBRegressor(n_estimators=500, random_state=100).fit(X_train, y_train)
    ensemble_model = VotingRegressor(estimators=[('Random Forest', rf), ('XGBoost', xgb)]).fit(X_train, y_train)

    # # Save model
    # joblib.dump(ensemble_model, f'/content/drive/MyDrive/ELM/TrainedModel/SEM_UIS_{models[i]}_230911.pkl')

    # Evaluate model
    ensemble_pred = ensemble_model.predict(X_test)
    ensemble_accuracy = accuracy_score(y_test, ensemble_pred)

    conf_mat_rf = confusion_matrix(y_test, rf_pred)  # confusion matrix for RF Classifier
    conf_mat_xgb = confusion_matrix(y_test, xgb_pred)  # confusion matrix for XGB Classifier
    conf_mat_ensemble = confusion_matrix(y_test, ensemble_pred)  # confusion matrix for Soft Ensemble Classifier

    # Additional metrics
    ensemble_precision = precision_score(y_test, ensemble_pred, average=None)
    ensemble_recall = recall_score(y_test, ensemble_pred, average=None)
    ensemble_f1 = f1_score(y_test, ensemble_pred, average=None)

    print('Model :', i+1)
    print('dataset shape :', X.shape)
    print('X.columns.tolist() :',X.columns.tolist())

    # Make predictions
    y_pred = ensemble_pred

    # Evaluate model
    evaluate_model(y_test, y_pred)



  # # print('confusion matrix :')
  # print(conf_mat_ensemble)
  # print()

  # # Print ensemble metrics
  # # print('Ensemble Test Metrics:')
  # print(i+1, ':','Ensemble_OA: ', ensemble_accuracy)

  # # Precision, Recall and F1_Score is for Impervious class only.
  # print(i+1, ':','Precision: ', ensemble_precision[3])
  # print(i+1, ':','Recall: ', ensemble_recall[3])
  # print(i+1, ':','F1_Score: ', ensemble_f1[3])
  # # print()


In [None]:
# Building soft ensemble model with the best feature combination
# use after Version 14.2 of Best feature Combination findings
# 5 Class ML model training
# Version 6 (September 09, 2023)

# Import libraries
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    cohen_kappa_score
)
import joblib


# Define model evaluation function
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average=None)[3]
    recall = recall_score(y_true, y_pred, average=None)[3]
    f1 = f1_score(y_true, y_pred, average=None)[3]
    confusion = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)

    print("Accuracy: {:.3f}".format(accuracy))
    print("Precision: {:.3f}".format(precision))
    print("Recall: {:.3f}".format(recall))
    print("F1 Score: {:.3f}".format(f1))
    print("Kappa: {:.3f}".format(kappa))

    print("\nConfusion Matrix:")
    print(confusion)

    print("\nClassification Report:")
    print(report)

    return accuracy, precision, recall, f1, kappa, confusion, report

# Load data
# # Load data and drop any rows contain NaN values
data = pd.read_csv('/content/drive/MyDrive/ELM/merged_CSV/merged_60K_Soil_Five_Classes.csv').dropna()
data = data.drop_duplicates(subset=['.geo'])
# data = merged_df.dropna()

# Define sample sizes
sample_sizes = {
    0: 10000,
    1: 10000,
    2: 10000,
    3: 10000,
    4: 10000
    }

grouped = data.groupby('Class')# Group by class
sampled = grouped.apply(lambda x: x.sample(n=sample_sizes[x.name], replace=True).reset_index(drop=True))  # Sample from each group according to the defined sample sizes
sampled = [sampled] # Convert to list
sampled = pd.concat(sampled) # Concatenate samples

# Verify counts
print('Total sample numbers in each classes :')
print(sampled['Class'].value_counts())
print()

SFM = ['SISAI']
MFM = ['swirSoil', 'CI', 'Green', 'NIR', 'DBSI', 'NSAI2', 'Blue']

models = [SFM, MFM]
model_name = ['SFM', 'MFM']


# Train models
# Start of the loop function
for i in range(len(models)):

  # Split the data into features (X) and labels (y)
  # X = data.drop('Class', axis=1)
  X = sampled[models[i]] # , 'Class'
  y = sampled['Class']

  # Split data
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

  # Final Random Forest Model
  rf_final = RandomForestClassifier(n_estimators=500, random_state=100)
  rf_final.fit(X_train, y_train)
  rf_pred = rf_final.predict(X_test)
  rf_accuracy = accuracy_score(y_test, rf_pred)

  # Final XGB Model
  xgb_final = xgb.XGBClassifier(n_estimators=500, random_state=100)
  xgb_final.fit(X_train, y_train)
  xgb_pred = xgb_final.predict(X_test)
  xgb_accuracy = accuracy_score(y_test, xgb_pred)

  # Create a VotingClassifier with all the individual classifiers
  ensemble_model = VotingClassifier(
      estimators=[
          ('Random Forest', rf_final),
          ('XGBoost', xgb_final)
                  ],
                  voting='soft'
  )

  # Fit model with data
  ensemble_model.fit(X_train, y_train)

  # Save the trained Ensembled model to disk
  # SEM stands for Soft Ensemble Model
  joblib.dump(ensemble_model, f'/content/drive/MyDrive/ELM/TrainedModel/SEM_Soil_{model_name[i]}_230911.pkl')

  # Ensemble predictions & Accuracy
  ensemble_pred = ensemble_model.predict(X_test)
  ensemble_accuracy = accuracy_score(y_test, ensemble_pred)

  conf_mat_rf = confusion_matrix(y_test, rf_pred)  # confusion matrix for RF Classifier
  conf_mat_xgb = confusion_matrix(y_test, xgb_pred)  # confusion matrix for XGB Classifier
  conf_mat_ensemble = confusion_matrix(y_test, ensemble_pred)  # confusion matrix for Soft Ensemble Classifier

  # Additional metrics
  ensemble_precision = precision_score(y_test, ensemble_pred, average=None)
  ensemble_recall = recall_score(y_test, ensemble_pred, average=None)
  ensemble_f1 = f1_score(y_test, ensemble_pred, average=None)

  print('Model :', i+1)
  print('dataset shape :', X.shape)
  print('X.columns.tolist() :',X.columns.tolist())

  # Make predictions
  y_pred = ensemble_pred

  # Evaluate model
  evaluate_model(y_test, y_pred)



  # # print('confusion matrix :')
  # print(conf_mat_ensemble)
  # print()

  # # Print ensemble metrics
  # # print('Ensemble Test Metrics:')
  # print(i+1, ':','Ensemble_OA: ', ensemble_accuracy)

  # # Precision, Recall and F1_Score is for Impervious class only.
  # print(i+1, ':','Precision: ', ensemble_precision[3])
  # print(i+1, ':','Recall: ', ensemble_recall[3])
  # print(i+1, ':','F1_Score: ', ensemble_f1[3])
  # # print()


In [None]:
# Finding Best Feature Combination for building ML Model
# Version 14 (11 September 2023)
# removing pipeline transformer and Scaler as these are unnecessary for the classifier's we are using.

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb

# Define a function for model training and evaluation
def train_and_evaluate(X_train, X_test, y_train, y_test):
    # Define the classifiers
    rf_model = RandomForestClassifier(n_estimators=250, random_state=100, n_jobs=-1)
    xgb_model = xgb.XGBClassifier(n_estimators=250, random_state=100, n_jobs=-1)

    # Create a VotingClassifier with soft voting
    ensemble_model = VotingClassifier(
        estimators=[('rf', rf_model), ('xgb', xgb_model)],
        voting='soft'
    )

    # Train the VotingClassifier
    ensemble_model.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = ensemble_model.predict(X_test)
    accuracy = f1_score(y_test, predictions, average=None)[3] #3 is for soil class

    return accuracy

def main():
    # Load the CSV file into a DataFrame
    data = pd.read_csv('/content/drive/MyDrive/ELM/merged_CSV/merged_60K_Soil_Five_Classes.csv').dropna()
    data = data.drop_duplicates(subset=['.geo'])

    print('data.columns.tolist() :',data.columns.tolist())

    # Define sample sizes for each class seperately
    sample_sizes = {
        0: 10000,
        1: 10000,
        2: 10000,
        3: 10000,
        4: 10000
        }

    grouped = data.groupby('Class')# Group by class
    sampled = grouped.apply(lambda x: x.sample(n=sample_sizes[x.name], replace=True).reset_index(drop=True))  # Sample from each group according to the defined sample sizes
    sampled = [sampled] # Convert to list
    data = pd.concat(sampled) # Concatenate samples

    target = data['Class']
    data = data.drop(['Class','.geo','system:index', 'City'], axis=1)

    # Verify counts
    print('Total sample numbers in each classes :')
    print(target.value_counts())
    print('n_estimators=250')
    print()

    # Assuming 'data' is your DataFrame and 'target' is the target variable
    features = data.columns.tolist()

    # Record the best accuracy and feature combination
    best_accuracy = 0
    best_combination = []

    # Create a dictionary to store the accuracy of each feature
    feature_accuracies = {}

    # Loop until no new feature is added to the best combination
    while True:
        best_feature = None

        for feature in features:
            # Add the current feature to the best combination
            combination = best_combination + [feature]

            # Split the data into training and test sets (with a random seed)
            X_train, X_test, y_train, y_test = train_test_split(data[combination], target, test_size=0.20, random_state=0)

            # Train the model and evaluate it
            accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)

            # Store the accuracy of the feature
            feature_accuracies[feature] = accuracy

            # If this combination has the best accuracy so far, record it
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature = feature

        # If no new feature was added to the best combination, stop the loop
        if best_feature is None:
            break

        # Sort the features by their accuracies in ascending order and print them
        sorted_features = sorted(feature_accuracies.items(), key=lambda item: item[1])

        # If we're evaluating a single feature, print its accuracy
        if len(combination) == 1:
            for feature, accuracy in sorted_features:
                print(f"Accuracy: {accuracy}, Feature: {feature}")

        # Otherwise, add the best feature to the best combination
        best_combination.append(best_feature)
        features.remove(best_feature)

        print(f"Accuracy: {best_accuracy}", f"Combination: {best_combination}")

    print(f"Best accuracy: {best_accuracy}")
    print(f"Best feature combination: {best_combination}")

if __name__ == "__main__":
   main()

data.columns.tolist() : ['system:index', 'Blue', 'CI', 'City', 'Class', 'DBSI', 'Green', 'NDSI2010', 'NDSI2015', 'NIR', 'NSAI1', 'NSAI2', 'RNDSI', 'Red', 'SWIR1', 'SWIR2', 'swirSoil', '.geo']
Total sample numbers in each classes :
0    10000
1    10000
2    10000
3    10000
4    10000
Name: Class, dtype: int64
n_estimators=250

Accuracy: 0.7859281437125749, Feature: NIR
Accuracy: 0.792079207920792, Feature: NDSI2015
Accuracy: 0.7956452710365163, Feature: NSAI2
Accuracy: 0.7964847363552268, Feature: NSAI1
Accuracy: 0.8111888111888111, Feature: RNDSI
Accuracy: 0.814466885861907, Feature: CI
Accuracy: 0.8192346424974823, Feature: Blue
Accuracy: 0.8298792985652471, Feature: NDSI2010
Accuracy: 0.8408348001005782, Feature: Green
Accuracy: 0.8434393638170975, Feature: SWIR1
Accuracy: 0.8496672022033509, Feature: DBSI
Accuracy: 0.8562156764852721, Feature: SWIR2
Accuracy: 0.8618172665492071, Feature: Red
Accuracy: 0.9001956947162427, Feature: swirSoil
Accuracy: 0.9001956947162427 Combination: 

In [None]:
# Finding Best Feature Combination for building ML Model
# Version 13 (11 September 2023)

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb

# Define a function for model training and evaluation
def train_and_evaluate(X_train, X_test, y_train, y_test):
    # Create a pipeline for data preprocessing and model training
    pipeline = Pipeline([
        ('scaling', StandardScaler()),
        ('classification', VotingClassifier(
            estimators=[
                ('Random Forest', RandomForestClassifier(n_estimators=250, random_state=100, n_jobs=-1)),
                ('XGBoost', xgb.XGBClassifier(n_estimators=250, random_state=100, n_jobs=-1))
            ],
            voting='soft'
        ))
    ])

    # Train the model using the pipeline
    pipeline.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = pipeline.predict(X_test)
    accuracy = f1_score(y_test, predictions, average=None)[3] #3 is for soil class

    return accuracy

def main():
    # Load the CSV file into a DataFrame
    data = pd.read_csv('/content/drive/MyDrive/ELM/merged_CSV/merged_60K_Soil_Five_Classes.csv').dropna()
    data = data.drop_duplicates(subset=['.geo'])

    print('data.columns.tolist() :',data.columns.tolist())

    # Define sample sizes
    sample_sizes = {
        0: 10000,
        1: 10000,
        2: 10000,
        3: 10000,
        4: 10000
        }

    grouped = data.groupby('Class')# Group by class
    sampled = grouped.apply(lambda x: x.sample(n=sample_sizes[x.name], replace=True).reset_index(drop=True))  # Sample from each group according to the defined sample sizes
    sampled = [sampled] # Convert to list
    data = pd.concat(sampled) # Concatenate samples

    target = data['Class']
    data = data.drop(['Class','.geo','system:index', 'City'], axis=1)

    # Verify counts
    print('Total sample numbers in each classes :')
    print(target.value_counts())
    print('n_estimators=250')
    print()

    # Assuming 'data' is your DataFrame and 'target' is the target variable
    features = data.columns.tolist()

    # Record the best accuracy and feature combination
    best_accuracy = 0
    best_combination = []

    # Create a dictionary to store the accuracy of each feature
    feature_accuracies = {}

    # Loop until no new feature is added to the best combination
    while True:
        best_feature = None

        for feature in features:
            # Add the current feature to the best combination
            combination = best_combination + [feature]

            # Split the data into training and test sets (with a random seed)
            X_train, X_test, y_train, y_test = train_test_split(data[combination], target, test_size=0.20, random_state=0)

            # Train the model and evaluate it
            accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)

            # Store the accuracy of the feature
            feature_accuracies[feature] = accuracy

            # If this combination has the best accuracy so far, record it
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature = feature

        # If no new feature was added to the best combination, stop the loop
        if best_feature is None:
            break

        # Sort the features by their accuracies in ascending order and print them
        sorted_features = sorted(feature_accuracies.items(), key=lambda item: item[1])

        # If we're evaluating a single feature, print its accuracy
        if len(combination) == 1:
            for feature, accuracy in sorted_features:
                print(f"Accuracy: {accuracy}, Feature: {feature}")

        # Otherwise, add the best feature to the best combination
        best_combination.append(best_feature)
        features.remove(best_feature)

        print(f"Accuracy: {best_accuracy}", f"Combination: {best_combination}")

    print(f"Best accuracy: {best_accuracy}")
    print(f"Best feature combination: {best_combination}")

if __name__ == "__main__":
   main()

data.columns.tolist() : ['system:index', 'Blue', 'CI', 'City', 'Class', 'DBSI', 'Green', 'NDSI2010', 'NDSI2015', 'NIR', 'NSAI1', 'NSAI2', 'RNDSI', 'Red', 'SWIR1', 'SWIR2', 'swirSoil', '.geo']
Total sample numbers in each classes :
0    10000
1    10000
2    10000
3    10000
4    10000
Name: Class, dtype: int64
n_estimators=250

Accuracy: 0.7940955716787591, Feature: NIR
Accuracy: 0.7957584140156755, Feature: NSAI1
Accuracy: 0.8000935234977788, Feature: NDSI2015
Accuracy: 0.8094696107443661, Feature: NSAI2
Accuracy: 0.8101969872537659, Feature: CI
Accuracy: 0.814608048383345, Feature: RNDSI
Accuracy: 0.8301606922126082, Feature: Blue
Accuracy: 0.8326866012325953, Feature: NDSI2010
Accuracy: 0.835192069392813, Feature: Green
Accuracy: 0.8466533466533467, Feature: SWIR1
Accuracy: 0.8572759022118743, Feature: DBSI
Accuracy: 0.8610905502097213, Feature: SWIR2
Accuracy: 0.8615308397324746, Feature: Red
Accuracy: 0.9005158437730287, Feature: swirSoil
Accuracy: 0.9005158437730287 Combination: 

KeyboardInterrupt: ignored

In [None]:
# 10th Sempember 2023
# Use XGB only
# Testing with 150k Samples
# n_estimator = 300
# Version 12
# improve the code with function

# different Class will have different numbers of samples
# Now, instead RF classifier, this code uses Soft Voting ensemble classifier with RF and XGB classifier.

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import pandas as pd
import xgboost as xgb
import numpy as np

# Load the CSV file into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/ELM/merged_CSV/merged_60K_Soil_Five_Classes.csv').dropna()
data = data.drop_duplicates(subset=['.geo'])
print('data.columns.tolist() :',data.columns.tolist())

# # Define a function for model training and evaluation
# def train_and_evaluate(X_train, X_test, y_train, y_test):
#     # Create a pipeline for data preprocessing and model training
#     pipeline = Pipeline([
#         ('scaling', StandardScaler()),
#         ('classification', VotingClassifier(
#             estimators=[
#                 ('Random Forest', RandomForestClassifier(n_estimators=300, random_state=100, n_jobs=-1)),
#                 ('XGBoost', xgb.XGBClassifier(n_estimators=300, random_state=100, n_jobs=-1))
#             ],
#             voting='soft'
#         ))
#     ])

#     # Train the model using the pipeline
#     pipeline.fit(X_train, y_train)

#     # Make predictions and calculate accuracy
#     predictions = pipeline.predict(X_test)
#     accuracy = f1_score(y_test, predictions, average=None)[3] #3 is for soil class

#     return accuracy

# Define a function for model training and evaluation (XGB Only)
def train_and_evaluate(X_train, X_test, y_train, y_test):
    # Create a pipeline for data preprocessing and model training
    pipeline = Pipeline([
        ('scaling', StandardScaler()),
        ('classification', xgb.XGBClassifier(n_estimators=300, random_state=100, n_jobs=-1))
    ])

    # Train the model using the pipeline
    pipeline.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = pipeline.predict(X_test)
    accuracy = f1_score(y_test, predictions, average=None)[3]

    return accuracy

# Define sample sizes
sample_sizes = {
    0: 10000,
    1: 10000,
    2: 10000,
    3: 10000,
    4: 10000
    }

grouped = data.groupby('Class')# Group by class
sampled = grouped.apply(lambda x: x.sample(n=sample_sizes[x.name], replace=True).reset_index(drop=True))  # Sample from each group according to the defined sample sizes
sampled = [sampled] # Convert to list
data = pd.concat(sampled) # Concatenate samples

target = data['Class']
data = data.drop(['Class','.geo','system:index', 'City'], axis=1)

# Verify counts
print('Total sample numbers in each classes :')
print(target.value_counts())
print()

# Assuming 'data' is your DataFrame and 'target' is the target variable
features = data.columns.tolist()

# Record the best accuracy and feature combination
best_accuracy = 0
best_combination = []

# Loop over all features
for _ in range(len(features)):
    best_feature = None

    for feature in features:
        # Add the current feature to the best combination
        combination = best_combination + [feature]

        # Split the data into training and test sets (with a random seed)
        X_train, X_test, y_train, y_test = train_test_split(data[combination], target, test_size=0.20, random_state=0)

        # Train the model and evaluate it
        accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)

        # If this combination has the best accuracy so far, record it
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_feature = feature

    # Add the best feature of this iteration to the best combination
    if best_feature is not None:
        best_combination.append(best_feature)
        features.remove(best_feature)

    print(f"Accuracy: {best_accuracy}", f"Combination: {best_combination}")

print(f"Best accuracy: {best_accuracy}")
print(f"Best feature combination: {best_combination}")

data.columns.tolist() : ['system:index', 'Blue', 'CI', 'City', 'Class', 'DBSI', 'Green', 'NDSI2010', 'NDSI2015', 'NIR', 'NSAI1', 'NSAI2', 'RNDSI', 'Red', 'SWIR1', 'SWIR2', 'swirSoil', '.geo']
Total sample numbers in each classes :
0    10000
1    10000
2    10000
3    10000
4    10000
Name: Class, dtype: int64

Accuracy: 0.826995718962478 Combination: ['swirSoil']
Accuracy: 0.9090031378228336 Combination: ['swirSoil', 'NIR']
Accuracy: 0.9599802371541502 Combination: ['swirSoil', 'NIR', 'Green']
Accuracy: 0.9707776126795442 Combination: ['swirSoil', 'NIR', 'Green', 'CI']
Accuracy: 0.9747274529236869 Combination: ['swirSoil', 'NIR', 'Green', 'CI', 'NDSI2010']
Accuracy: 0.9747274529236869 Combination: ['swirSoil', 'NIR', 'Green', 'CI', 'NDSI2010']
Accuracy: 0.9747274529236869 Combination: ['swirSoil', 'NIR', 'Green', 'CI', 'NDSI2010']
Accuracy: 0.9747274529236869 Combination: ['swirSoil', 'NIR', 'Green', 'CI', 'NDSI2010']
Accuracy: 0.9747274529236869 Combination: ['swirSoil', 'NIR', 'Gree

KeyboardInterrupt: ignored

In [None]:
# 10th Sempember 2023
# Testing with 180k Samples
# n_estimator = 300
# attempt 11
# improve the code with function

# different Class will have different numbers of samples
# Now, instead RF classifier, this code uses Soft Voting ensemble classifier with RF and XGB classifier.

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import pandas as pd
import xgboost as xgb
import numpy as np

# Load the CSV file into a DataFrame
# data = pd.read_csv('merged_182K_Chapter5_Five_Class_Model.csv')
data = pd.read_csv('/content/drive/MyDrive/ELM/merged_CSV/merged_60K_Soil_Five_Classes.csv').dropna()
data = data.drop_duplicates(subset=['.geo'])

# # Define a function for model training and evaluation
# def train_and_evaluate(X_train, X_test, y_train, y_test):
#     # Create a pipeline for data preprocessing and model training
#     pipeline = Pipeline([
#         ('scaling', StandardScaler()),
#         ('classification', VotingClassifier(
#             estimators=[
#                 ('Random Forest', RandomForestClassifier(n_estimators=300, random_state=100, n_jobs=-1)),
#                 ('XGBoost', xgb.XGBClassifier(n_estimators=300, random_state=100, n_jobs=-1))
#             ],
#             voting='soft'
#         ))
#     ])

#     # Rest of the function
#     pipeline.fit(X_train, y_train)
#     predictions = pipeline.predict(X_test)
#     accuracy = f1_score(y_test, predictions, average=None)[4]

#     return accuracy

# Define a function for model training and evaluation (XGB Only)
def train_and_evaluate(X_train, X_test, y_train, y_test):
    # Create a pipeline for data preprocessing and model training
    pipeline = Pipeline([
        ('scaling', StandardScaler()),
        ('classification', xgb.XGBClassifier(n_estimators=300, random_state=100, n_jobs=-1))
    ])

    pipeline.fit(X_train, y_train)  # Train the model using the pipeline
    predictions = pipeline.predict(X_test)    # Make predictions and calculate accuracy
    accuracy = f1_score(y_test, predictions, average=None)[3]

    return accuracy


# Define sample sizes
sample_sizes = {
    0: 10000,
    1: 10000,
    2: 10000,
    3: 10000,
    4: 10000
    }

# Grouped, sampled and concentrated them together into single dataframe
grouped = data.groupby('Class')# Group by class
sampled = grouped.apply(lambda x: x.sample(n=sample_sizes[x.name], replace=True).reset_index(drop=True))  # Sample from each group according to the defined sample sizes
sampled = [sampled] # Convert to list
data = pd.concat(sampled) # Concatenate samples

# select the terget data column
target = data['Class']
data = data.drop(['Class','.geo','system:index','City'], axis=1)

# Verify counts
print('Total sample numbers in each classes :')
print(target.value_counts())
print()

# Assuming 'data' is your DataFrame and 'target' is the target variable
features = data.columns.tolist()

# Record the best accuracy and feature combination
best_accuracy = 0
best_combination = []

# Loop over all features
for _ in range(len(features)):
    best_feature = None

    # Standardize
    scaler = StandardScaler().fit(data)
    standardized_data = scaler.transform(data)

    # Split
    X_train, X_test, y_train, y_test = train_test_split(standardized_data, target, test_size=0.20, random_state=0)

    # Feature selection
    for feature in features:

        # Pass standardized data
        accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)

        # Add the current feature to the best combination
        combination = best_combination + [feature]

        # If this combination has the best accuracy so far, record it
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_feature = feature

    # Add the best feature of this iteration to the best combination
    if best_feature is not None:
        best_combination.append(best_feature)
        features.remove(best_feature)

    print(f"Accuracy: {best_accuracy}", f"Combination: {best_combination}")

print(f"Best accuracy: {best_accuracy}")
print(f"Best feature combination: {best_combination}")

In [None]:
# 10th Sempember 2023
# Testing with 150k Samples
# n_estimator = 300
# attempt 10
# improve the code with function

# different Class will have different numbers of samples
# Now, instead RF classifier, this code uses Soft Voting ensemble classifier with RF and XGB classifier.

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import pandas as pd
import xgboost as xgb
import numpy as np

# Load the CSV file into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/ELM/merged_CSV/merged_60K_Soil_Five_Classes.csv').dropna()
data = data.drop_duplicates(subset=['.geo'])
print('data.columns.tolist() :',data.columns.tolist())

# Define a function for model training and evaluation
def train_and_evaluate(X_train, X_test, y_train, y_test):
    # Create a pipeline for data preprocessing and model training
    pipeline = Pipeline([
        ('scaling', StandardScaler()),
        ('classification', VotingClassifier(
            estimators=[
                ('Random Forest', RandomForestClassifier(n_estimators=300, random_state=100, n_jobs=-1)),
                ('XGBoost', xgb.XGBClassifier(n_estimators=300, random_state=100, n_jobs=-1))
            ],
            voting='soft'
        ))
    ])

    # Train the model using the pipeline
    pipeline.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = pipeline.predict(X_test)
    accuracy = f1_score(y_test, predictions, average=None)[3] #3 is for soil class

    return accuracy

# Define sample sizes
sample_sizes = {
    0: 10000,
    1: 10000,
    2: 10000,
    3: 10000,
    4: 10000
    }

grouped = data.groupby('Class')# Group by class
sampled = grouped.apply(lambda x: x.sample(n=sample_sizes[x.name], replace=True).reset_index(drop=True))  # Sample from each group according to the defined sample sizes
sampled = [sampled] # Convert to list
data = pd.concat(sampled) # Concatenate samples

target = data['Class']
data = data.drop(['Class','.geo','system:index', 'City'], axis=1)

# Verify counts
print('Total sample numbers in each classes :')
print(target.value_counts())
print()

# Assuming 'data' is your DataFrame and 'target' is the target variable
features = data.columns.tolist()

# Record the best accuracy and feature combination
best_accuracy = 0
best_combination = []

# Loop over all features
for _ in range(len(features)):
    best_feature = None

    for feature in features:
        # Add the current feature to the best combination
        combination = best_combination + [feature]

        # Split the data into training and test sets (with a random seed)
        X_train, X_test, y_train, y_test = train_test_split(data[combination], target, test_size=0.20, random_state=0)

        # Train the model and evaluate it
        accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)

        # If this combination has the best accuracy so far, record it
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_feature = feature

    # Add the best feature of this iteration to the best combination
    if best_feature is not None:
        best_combination.append(best_feature)
        features.remove(best_feature)

    print(f"Accuracy: {best_accuracy}", f"Combination: {best_combination}")

print(f"Best accuracy: {best_accuracy}")
print(f"Best feature combination: {best_combination}")

data.columns.tolist() : ['system:index', 'Blue', 'CI', 'City', 'Class', 'DBSI', 'Green', 'NDSI2010', 'NDSI2015', 'NIR', 'NSAI1', 'NSAI2', 'RNDSI', 'Red', 'SWIR1', 'SWIR2', 'swirSoil', '.geo']
Total sample numbers in each classes :
0    10000
1    10000
2    10000
3    10000
4    10000
Name: Class, dtype: int64

Accuracy: 0.9002457002457003 Combination: ['swirSoil']
Accuracy: 0.9405594405594406 Combination: ['swirSoil', 'NDSI2015']
Accuracy: 0.9636947394418374 Combination: ['swirSoil', 'NDSI2015', 'NDSI2010']
Accuracy: 0.9724223602484472 Combination: ['swirSoil', 'NDSI2015', 'NDSI2010', 'NSAI1']
Accuracy: 0.9768599154018413 Combination: ['swirSoil', 'NDSI2015', 'NDSI2010', 'NSAI1', 'NSAI2']
Accuracy: 0.9785322016974538 Combination: ['swirSoil', 'NDSI2015', 'NDSI2010', 'NSAI1', 'NSAI2', 'Red']
Accuracy: 0.9785322016974538 Combination: ['swirSoil', 'NDSI2015', 'NDSI2010', 'NSAI1', 'NSAI2', 'Red']
Accuracy: 0.9785322016974538 Combination: ['swirSoil', 'NDSI2015', 'NDSI2010', 'NSAI1', 'NSAI

In [None]:
# 10th Sempember 2023
# Testing with 15k samples
# attempt 9
# improve the code with function

# different Class will have different numbers of samples
# Now, instead RF classifier, this code uses Soft Voting ensemble classifier with RF and XGB classifier.

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import pandas as pd
import xgboost as xgb
import numpy as np

# Load the CSV file into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/Chapter5Model//merged_CSV/merged_162K_Chapter5_Five_Class_Model.csv').dropna()
data = data.drop_duplicates(subset=['.geo'])

# # Define a function for model training and evaluation
# def train_and_evaluate(X_train, X_test, y_train, y_test):
#     # Create a pipeline for data preprocessing and model training
#     pipeline = Pipeline([
#         ('scaling', StandardScaler()),
#         ('classification', VotingClassifier(
#             estimators=[
#                 ('Random Forest', RandomForestClassifier(n_estimators=500, random_state=100)),
#                 ('XGBoost', xgb.XGBClassifier(n_estimators=500, random_state=100))
#             ],
#             voting='soft'
#         ))
#     ])

# Define a function for model training and evaluation
def train_and_evaluate(X_train, X_test, y_train, y_test):
    # Create a pipeline for data preprocessing and model training
    pipeline = Pipeline([
        ('scaling', StandardScaler()),
        ('classification', VotingClassifier(
            estimators=[
                ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=100, n_jobs=-1)),
                ('XGBoost', xgb.XGBClassifier(n_estimators=100, random_state=100, n_jobs=-1))
            ],
            voting='soft'
        ))
    ])

    # Train the model using the pipeline
    pipeline.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = pipeline.predict(X_test)
    accuracy = f1_score(y_test, predictions, average=None)[4]

    return accuracy

# Define sample sizes
sample_sizes = {
    0: 2500,
    1: 2500,
    2: 2500,
    3: 2500,
    4: 5000
    }

grouped = data.groupby('Class')# Group by class
sampled = grouped.apply(lambda x: x.sample(n=sample_sizes[x.name], replace=True).reset_index(drop=True))  # Sample from each group according to the defined sample sizes
sampled = [sampled] # Convert to list
data = pd.concat(sampled) # Concatenate samples

target = data['Class']
data = data.drop(['Class','.geo','system:index','City', 'NDBI', 'UI', 'SwiRed', 'INDBI', 'NBAI', 'BLFEI'], axis=1)

# Verify counts
print('Total sample numbers in each classes :')
print(target.value_counts())
print()

# Assuming 'data' is your DataFrame and 'target' is the target variable
features = data.columns.tolist()

# Record the best accuracy and feature combination
best_accuracy = 0
best_combination = []

# Loop over all features
for _ in range(len(features)):
    best_feature = None

    for feature in features:
        # Add the current feature to the best combination
        combination = best_combination + [feature]

        # Split the data into training and test sets (with a random seed)
        X_train, X_test, y_train, y_test = train_test_split(data[combination], target, test_size=0.20, random_state=0)

        # Train the model and evaluate it
        accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)

        # If this combination has the best accuracy so far, record it
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_feature = feature

    # Add the best feature of this iteration to the best combination
    if best_feature is not None:
        best_combination.append(best_feature)
        features.remove(best_feature)

    print(f"Accuracy: {best_accuracy}", f"Combination: {best_combination}")

print(f"Best accuracy: {best_accuracy}")
print(f"Best feature combination: {best_combination}")

Total sample numbers in each classes :
4    5000
0    2500
1    2500
2    2500
3    2500
Name: Class, dtype: int64

Accuracy: 0.7328671328671329 Combination: ['SISAI']
Accuracy: 0.8165225744476465 Combination: ['SISAI', 'L8_swir1']
Accuracy: 0.8684337349397591 Combination: ['SISAI', 'L8_swir1', 'mndwiMedian']
Accuracy: 0.8960155490767736 Combination: ['SISAI', 'L8_swir1', 'mndwiMedian', 'ndviMax']
Accuracy: 0.9092702169625245 Combination: ['SISAI', 'L8_swir1', 'mndwiMedian', 'ndviMax', 'NSAI2']
Accuracy: 0.9158050221565731 Combination: ['SISAI', 'L8_swir1', 'mndwiMedian', 'ndviMax', 'NSAI2', 'ndviMedian']
Accuracy: 0.9193706981317601 Combination: ['SISAI', 'L8_swir1', 'mndwiMedian', 'ndviMax', 'NSAI2', 'ndviMedian', 'L8_blue']
Accuracy: 0.9239823442864149 Combination: ['SISAI', 'L8_swir1', 'mndwiMedian', 'ndviMax', 'NSAI2', 'ndviMedian', 'L8_blue', 'swirSoil']
Accuracy: 0.9239823442864149 Combination: ['SISAI', 'L8_swir1', 'mndwiMedian', 'ndviMax', 'NSAI2', 'ndviMedian', 'L8_blue', 'sw

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/Chapter5Model//merged_CSV/merged_162K_Chapter5_Five_Class_Model.csv').dropna()
data = data.drop_duplicates(subset=['.geo'])

print(data['Class'].value_counts())

4    61752
2    39673
3    29236
1    28119
0     3247
Name: Class, dtype: int64


In [None]:
# 10th September 2023
# attempt 8
# different Class will have different numbers of samples
# Now, instead RF classifier, this code uses Soft Voting ensemble classifier with RF and XGB classifier.

import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
import xgboost as xgb

# Load the CSV file into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/Chapter5Model//merged_CSV/merged_162K_Chapter5_Five_Class_Model.csv').dropna()
data = data.drop_duplicates(subset=['.geo'])

# Define sample sizes
sample_sizes = {
    0: 25000,
    1: 25000,
    2: 25000,
    3: 25000,
    4: 50000
    }

grouped = data.groupby('Class')# Group by class
sampled = grouped.apply(lambda x: x.sample(n=sample_sizes[x.name], replace=True).reset_index(drop=True))  # Sample from each group according to the defined sample sizes
sampled = [sampled] # Convert to list
data = pd.concat(sampled) # Concatenate samples

target = data['Class']
data = data.drop(['Class','.geo','system:index','City', 'NDBI', 'UI', 'SwiRed', 'INDBI', 'NBAI', 'BLFEI'], axis=1)

# Verify counts
print('Total sample numbers in each classes :')
print(target.value_counts())
print()

# Assuming 'data' is your DataFrame and 'target' is the target variable
features = data.columns.tolist()

# Record the best accuracy and feature combination
best_accuracy = 0
best_combination = []

# Loop over all features
for _ in range(len(features)):
    best_feature = None

    for feature in features:
        # Add the current feature to the best combination
        combination = best_combination + [feature]

        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(data[combination], target, test_size=0.20)

        # Normalize the data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.fit_transform(X_test)

        # Train the model using Random Forest and XGBoost classifiers
        rf_model = RandomForestClassifier(n_estimators=500, random_state=100)
        xgb_model = xgb.XGBClassifier(n_estimators=500, random_state=100)

        ensemble_model = VotingClassifier(
            estimators=[
                ('Random Forest', rf_model),
                ('XGBoost', xgb_model)
            ],
            voting='soft'
        )

        ensemble_model.fit(X_train, y_train)

        # Make predictions and calculate accuracy
        predictions = ensemble_model.predict(X_test)
        rf_f1 = f1_score(y_test, predictions, average=None)[4]  # the last "[4]" is for the impervious surface area. This is the fifth element of the accuracy matrix. hence the 4th element of the index.

        # If this combination has the best accuracy so far, record it
        if rf_f1 > best_accuracy:
            best_accuracy = rf_f1
            best_feature = feature

    # Add the best feature of this iteration to the best combination
    if best_feature is not None:
        best_combination.append(best_feature)
        features.remove(best_feature)

    print(f"Accuracy: {best_accuracy}", f"Combination: {best_combination}")

print(f"Best accuracy: {best_accuracy}")
print(f"Best feature combination: {best_combination}")

Total sample numbers in each classes :
4    50000
0    25000
1    25000
2    25000
3    25000
Name: Class, dtype: int64



KeyboardInterrupt: ignored

In [None]:
# attempt 7
# Now, instead RF classifier, this code uses Soft Voting ensemble classifier with RF and XGB classifier.

import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import pandas as pd
import xgboost as xgb

# Load the CSV file into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/Chapter5Model//merged_CSV/merged_162K_Chapter5_Five_Class_Model.csv').dropna()

n = 1000 # Define sample size
grouped = data.groupby('Class') # Group by class
sampled = grouped.apply(lambda x: x.sample(n=n, replace=True).reset_index(drop=True)) # Sample from each group
sampled = [sampled] # Convert to list
data = pd.concat(sampled)  # Concatenate samples

target = data['Class']
data = data.drop(['Class','.geo','system:index','City', 'NDBI', 'UI', 'SwiRed', 'INDBI', 'NBAI', 'BLFEI'], axis=1)

# Verify counts
print('Total sample numbers in each classes :')
print(target.value_counts())
print()

# Assuming 'data' is your DataFrame and 'target' is the target variable
features = data.columns.tolist()

# Record the best accuracy and feature combination
best_accuracy = 0
best_combination = []

# Loop over all features
for _ in range(len(features)):
    best_feature = None

    for feature in features:
        # Add the current feature to the best combination
        combination = best_combination + [feature]

        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(data[combination], target, test_size=0.25)

        # Train the model using Random Forest and XGBoost classifiers
        rf_model = RandomForestClassifier(n_estimators=100, random_state=100)
        xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=100)

        ensemble_model = VotingClassifier(
            estimators=[
                ('Random Forest', rf_model),
                ('XGBoost', xgb_model)
            ],
            voting='soft'
        )

        ensemble_model.fit(X_train, y_train)

        # Make predictions and calculate accuracy
        predictions = ensemble_model.predict(X_test)
        rf_f1 = f1_score(y_test, predictions, average=None)[4]  # the last "[4]" is for the impervious surface area. This is the fifth element of the accuracy matrix. hence the 4th element of the index.

        # If this combination has the best accuracy so far, record it
        if rf_f1 > best_accuracy:
            best_accuracy = rf_f1
            best_feature = feature

    # Add the best feature of this iteration to the best combination
    if best_feature is not None:
        best_combination.append(best_feature)
        features.remove(best_feature)

    print(f"Accuracy: {best_accuracy}", f"Combination: {best_combination}")

print(f"Best accuracy: {best_accuracy}")
print(f"Best feature combination: {best_combination}")


Total sample numbers in each classes :
0    1000
1    1000
2    1000
3    1000
4    1000
Name: Class, dtype: int64

Accuracy: 0.5775193798449613 Combination: ['SISAI']
Accuracy: 0.735632183908046 Combination: ['SISAI', 'L8_swir2']
Accuracy: 0.7907869481765836 Combination: ['SISAI', 'L8_swir2', 'L8_nir']
Accuracy: 0.80859375 Combination: ['SISAI', 'L8_swir2', 'L8_nir', 'ndviMax']
Accuracy: 0.848030018761726 Combination: ['SISAI', 'L8_swir2', 'L8_nir', 'ndviMax', 'nduiMin']
Accuracy: 0.8637992831541218 Combination: ['SISAI', 'L8_swir2', 'L8_nir', 'ndviMax', 'nduiMin', 'ndbiMin']
Accuracy: 0.8637992831541218 Combination: ['SISAI', 'L8_swir2', 'L8_nir', 'ndviMax', 'nduiMin', 'ndbiMin']
Accuracy: 0.8740458015267176 Combination: ['SISAI', 'L8_swir2', 'L8_nir', 'ndviMax', 'nduiMin', 'ndbiMin', 'NSAI1']
Accuracy: 0.8740458015267176 Combination: ['SISAI', 'L8_swir2', 'L8_nir', 'ndviMax', 'nduiMin', 'ndbiMin', 'NSAI1']
Accuracy: 0.8740458015267176 Combination: ['SISAI', 'L8_swir2', 'L8_nir', 'nd

KeyboardInterrupt: ignored

In [None]:
# Attempt 6
# Professional grade output with 24k samples in each class. Total 120k samples
# Only using RF model
# Now in each iteration one best contribution bands will be added to the best_combination.

import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Load the CSV file into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/Chapter5Model//merged_CSV/merged_162K_Chapter5_Five_Class_Model.csv').dropna()

n = 24000 # Define sample size
grouped = data.groupby('Class') # Group by class
sampled = grouped.apply(lambda x: x.sample(n=n, replace=True).reset_index(drop=True)) # Sample from each group
sampled = [sampled] # Convert to list
data = pd.concat(sampled)  # Concatenate samples

target = data['Class']
data = data.drop(['Class','.geo','system:index','City', 'NDBI', 'UI', 'SwiRed', 'INDBI', 'NBAI', 'BLFEI'], axis=1)

# Verify counts
print('Total sample numbers in each classes :')
print(target.value_counts())
print()

# Assuming 'data' is your DataFrame and 'target' is the target variable
features = data.columns.tolist()

# Record the best accuracy and feature combination
best_accuracy = 0
best_combination = []

# Loop over all features
for _ in range(len(features)):
    best_feature = None

    for feature in features:
        # Add the current feature to the best combination
        combination = best_combination + [feature]

        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(data[combination], target, test_size=0.25)

        # Train the model
        model = RandomForestClassifier()
        model.fit(X_train, y_train)

        # Make predictions and calculate accuracy
        predictions = model.predict(X_test)
        rf_f1 = f1_score(y_test, predictions, average=None)[4]  # the last "[4]" is for the impervious surface area. This is the fifth element of the accuracy matrix. hence the 4th element of the index.

        # If this combination has the best accuracy so far, record it
        if rf_f1 > best_accuracy:
            best_accuracy = rf_f1
            best_feature = feature

    # Add the best feature of this iteration to the best combination
    if best_feature is not None:
        best_combination.append(best_feature)
        features.remove(best_feature)

    print(f"Combination: {best_combination}",f"Accuracy: {best_accuracy}")

print(f"Best accuracy: {best_accuracy}")
print(f"Best feature combination: {best_combination}")


Total sample numbers in each classes :
0    24000
1    24000
2    24000
3    24000
4    24000
Name: Class, dtype: int64

Combination: ['SISAI'] Accuracy: 0.6767376729186115
Combination: ['SISAI', 'swirSoil'] Accuracy: 0.793420828362025


KeyboardInterrupt: ignored

In [None]:
# Attempt 5
# Now in each iteration one best contribution bands will be added to the best_combination.

import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Load the CSV file into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/Chapter5Model//merged_CSV/merged_162K_Chapter5_Five_Class_Model.csv').dropna()

n = 3000 # Define sample size
grouped = data.groupby('Class') # Group by class
sampled = grouped.apply(lambda x: x.sample(n=n, replace=False).reset_index(drop=True)) # Sample from each group
sampled = [sampled] # Convert to list
data = pd.concat(sampled)  # Concatenate samples

target = data['Class']
data = data.drop(['Class','.geo','system:index','City', 'NDBI', 'UI', 'SwiRed', 'INDBI', 'NBAI', 'BLFEI'], axis=1)

# Verify counts
print('Total sample numbers in each classes :')
print(target.value_counts())
print()

# Assuming 'data' is your DataFrame and 'target' is the target variable
features = data.columns.tolist()

# Record the best accuracy and feature combination
best_accuracy = 0
best_combination = []

# Loop over all features
for _ in range(len(features)):
    best_feature = None

    for feature in features:
        # Add the current feature to the best combination
        combination = best_combination + [feature]

        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(data[combination], target, test_size=0.25)

        # Train the model
        model = RandomForestClassifier()
        model.fit(X_train, y_train)

        # Make predictions and calculate accuracy
        predictions = model.predict(X_test)
        rf_f1 = f1_score(y_test, predictions, average=None)[4]  # the last "[4]" is for the impervious surface area. This is the fifth element of the accuracy matrix. hence the 4th element of the index.

        # If this combination has the best accuracy so far, record it
        if rf_f1 > best_accuracy:
            best_accuracy = rf_f1
            best_feature = feature

    # Add the best feature of this iteration to the best combination
    if best_feature is not None:
        best_combination.append(best_feature)
        features.remove(best_feature)

    print(f"Combination: {best_combination}",f"Accuracy: {best_accuracy}")

print(f"Best accuracy: {best_accuracy}")
print(f"Best feature combination: {best_combination}")


Total sample numbers in each classes :
0    3000
1    3000
2    3000
3    3000
4    3000
Name: Class, dtype: int64

Combination: ['SISAI'] Accuracy: 0.5288919102651258
Combination: ['SISAI', 'ndbiMin'] Accuracy: 0.6964980544747081
Combination: ['SISAI', 'ndbiMin', 'NSAI2'] Accuracy: 0.782608695652174
Combination: ['SISAI', 'ndbiMin', 'NSAI2', 'mndwiMedian'] Accuracy: 0.8215873015873015
Combination: ['SISAI', 'ndbiMin', 'NSAI2', 'mndwiMedian', 'ndviMedian'] Accuracy: 0.8464673913043478
Combination: ['SISAI', 'ndbiMin', 'NSAI2', 'mndwiMedian', 'ndviMedian', 'L8_swir1'] Accuracy: 0.8695652173913043
Combination: ['SISAI', 'ndbiMin', 'NSAI2', 'mndwiMedian', 'ndviMedian', 'L8_swir1', 'ndbiMedian'] Accuracy: 0.8810289389067524
Combination: ['SISAI', 'ndbiMin', 'NSAI2', 'mndwiMedian', 'ndviMedian', 'L8_swir1', 'ndbiMedian'] Accuracy: 0.8810289389067524
Combination: ['SISAI', 'ndbiMin', 'NSAI2', 'mndwiMedian', 'ndviMedian', 'L8_swir1', 'ndbiMedian'] Accuracy: 0.8810289389067524
Combination: ['S

In [None]:
# best decide best model based on F1-Score of Impervious class.
# Attempt 4

import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Load the CSV file into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/Chapter5Model//merged_CSV/merged_162K_Chapter5_Five_Class_Model.csv').dropna()

# Assuming 'target' is the name of the target variable column

n = 1000 # Define sample size
grouped = data.groupby('Class') # Group by class
sampled = grouped.apply(lambda x: x.sample(n=n, replace=False).reset_index(drop=True)) # Sample from each group
sampled = [sampled] # Convert to list
data = pd.concat(sampled)  # Concatenate samples

target = data['Class']
data = data.drop(['Class','.geo','system:index','City', 'NDBI', 'UI', 'SwiRed', 'INDBI', 'NBAI', 'BLFEI'], axis=1)

# Assuming 'data' is your DataFrame and 'target' is the target variable
features = data.columns.tolist()

# Record the best accuracy and feature combination
best_accuracy = 0
best_combination = None

# Loop over all possible combinations of features
for r in range(1, len(features) + 1):
    for combination in itertools.combinations(features, r):
        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(data[list(combination)], target, test_size=0.25)

        # Train the model
        model = RandomForestClassifier()
        model.fit(X_train, y_train)

        # Make predictions and calculate accuracy
        predictions = model.predict(X_test)
        # accuracy = accuracy_score(y_test, predictions)
        rf_f1 = f1_score(y_test, predictions, average=None)[4]  # the last "[4]" is for the impervious surface area. This is the fifth element of the accuracy matrix. hence the 4th element of the index.

        # If this combination has the best accuracy so far, record it
        if rf_f1 > best_accuracy:
            best_accuracy = rf_f1
            best_combination = combination
    print(f"Combination: {best_combination}",f"Accuracy: {best_accuracy}")

print(f"Best accuracy: {best_accuracy}")
print(f"Best feature combination: {best_combination}")


Combination: ('swirSoil',) Accuracy: 0.5046728971962617
Combination: ('L8_swir1', 'ndviMax') Accuracy: 0.7398568019093078
Combination: ('L8_nir', 'L8_swir2', 'ndviMax') Accuracy: 0.8151898734177214


KeyboardInterrupt: ignored

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Five class training and best accuracy depends on the Overall Accuracy. Later model focus Impervious class only
# attempt 2
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Load the CSV file into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/Chapter5Model//merged_CSV/merged_162K_Chapter5_Five_Class_Model.csv').dropna()

# Assuming 'target' is the name of the target variable column

n = 2000 # Define sample size
grouped = data.groupby('Class') # Group by class
sampled = grouped.apply(lambda x: x.sample(n=n, replace=True).reset_index(drop=True)) # Sample from each group
sampled = [sampled] # Convert to list
data = pd.concat(sampled)  # Concatenate samples

target = data['Class']
data = data.drop(['Class','.geo','system:index','City', 'NDBI', 'UI', 'SwiRed', 'INDBI', 'NBAI', 'BLFEI'], axis=1)

# Assuming 'data' is your DataFrame and 'target' is the target variable
features = data.columns.tolist()

# Record the best accuracy and feature combination
best_accuracy = 0
best_combination = None

# Loop over all possible combinations of features
for r in range(1, len(features) + 1):
    for combination in itertools.combinations(features, r):
        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(data[list(combination)], target, test_size=0.2)

        # Train the model
        model = RandomForestClassifier()
        model.fit(X_train, y_train)

        # Make predictions and calculate accuracy
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)

        # If this combination has the best accuracy so far, record it
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_combination = combination
    print(f"Combination: {best_combination}",f"Accuracy: {best_accuracy}")

print(f"Best accuracy: {best_accuracy}")
print(f"Best feature combination: {best_combination}")


In [None]:
# Code using cross velidation
# Attempt 3

import itertools
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Load the CSV file into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/Chapter5Model//merged_CSV/merged_162K_Chapter5_Five_Class_Model.csv').dropna()

n = 1000 # Define sample size
grouped = data.groupby('Class') # Group by class
sampled = grouped.apply(lambda x: x.sample(n=n, replace=True).reset_index(drop=True)) # Sample from each group
sampled = [sampled] # Convert to list
data = pd.concat(sampled)  # Concatenate samples

target = data['Class']
data = data.drop(['Class','.geo','system:index','City', 'NDBI', 'UI', 'SwiRed', 'INDBI', 'NBAI', 'BLFEI'], axis=1)


# Assuming 'data' is your DataFrame and 'target' is the target variable
features = data.columns.tolist()

# Record the best accuracy and feature combination
best_accuracy = 0
best_combination = None

# Loop over all possible combinations of features
for r in range(1, len(features) + 1):
    for combination in itertools.combinations(features, r):
        # Train the model using cross-validation
        model = RandomForestClassifier()
        scores = cross_val_score(model, data[list(combination)], target, cv=5)

        # Calculate average accuracy
        accuracy = scores.mean()

        # If this combination has the best accuracy so far, record it
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_combination = combination
    print(f"Combination: {best_combination}",f"Accuracy: {best_accuracy}")

print(f"Best accuracy: {best_accuracy}")
print(f"Best feature combination: {best_combination}")
