# Random Forest Regression

In [1]:
# The wildcard import stays first so that every explicit import below takes
# precedence over any same-named symbol exported by train_utils.
from train_utils import *

import json
import os
from datetime import date, datetime

import numpy as np
import torch
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from torch.utils.data import DataLoader
from torchvision import transforms

from dataset.dataset_loader import SNDataset, SNDatasetClimate, myNormalize, myToTensor, Augmentations, RFTransform, TensorCenterPixels


In [2]:
# Make sure the 'results' output folder exists in the current directory.
# os.makedirs(..., exist_ok=True) does nothing when the folder is already there,
# which replaces the separate exists-check followed by os.mkdir().
os.makedirs('results', exist_ok=True)

In [3]:
# Remember when the run started; the value is stored in the results log at the end.
now = datetime.now()
start_string = f"{now:%Y-%m-%d %H:%M:%S}"
print("Current Date and Time:", start_string)

Current Date and Time: 2023-05-17 16:03:34


In [4]:
# Work out who is running the notebook from the drive of the working directory.
# USER selects the machine-specific dataset paths and run sizes in later cells.
NAFISEH = "Nafiseh"
MOIEN = "Moien"

# The drive letter can be reported in either case ("C:" or "c:"), so compare the
# lower-cased drive component instead of searching the raw path for "c:" / "d:".
_cwd_drive = os.path.splitdrive(os.getcwd())[0].lower()

if _cwd_drive == "d:":
    USER = MOIEN
elif _cwd_drive == "c:":
    USER = NAFISEH
else:
    # Any other location (different drive, UNC share, non-Windows path) is unsupported.
    raise Exception("Unknown user")

USER

'Nafiseh'

In [5]:
# Device-agnostic setup: use the GPU when torch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [6]:
# Run-wide switches, gathered here so they are easy to find and change.
OC_MAX: int = 87           # passed to RFTransform as oc_max
USE_CLIMATE: bool = False  # True -> SNDatasetClimate (image + climate inputs), False -> SNDataset
USE_SRTM: bool = True      # True -> bands 12 and 13 are used in addition to bands 0-11

### No Normalization
Random Forest doesn't need normalization, so I added the RF transform; it only reshapes the image into channels-first format.
Then I used myTransform to resize to 64x64.

You can test my Normalize transform by uncommenting the line in the cell below.

### Cut Center
Cuts a 2x2 square from the center of the image.
If `interpolate_center_pixel` is set to True, then the center pixel is interpolated from the 4 surrounding pixels.

In [7]:
# Optional min-max normalisation, left disabled (see the "No Normalization" note above).
#mynorm = myNormalize(img_bands_min_max =[[(0,7),(0,1)], [(7,12),(-1,1)], [(12), (-4,2963)], [(13), (0, 90)]], oc_min = 0, oc_max = 200)

# Individual steps of the preprocessing pipeline, in the order they are applied.
rf_transform = RFTransform(oc_min=0, oc_max=OC_MAX)
my_to_tensor = myToTensor()
cut_center = TensorCenterPixels(
    pixel_radius=1,
    interpolate_center_pixel=False,
)

# Chain the steps into the single transform handed to the datasets.
pipeline_steps = [rf_transform, my_to_tensor, cut_center]
transform = transforms.Compose(pipeline_steps)

### Bands to use

In [8]:
# Band indices passed to the datasets: 0-11 always, plus 12 and 13 when USE_SRTM is set.
bands = list(range(14)) if USE_SRTM else list(range(12))

# Machine-specific dataset locations, keyed by USER.
# Every backslash is escaped explicitly. The previous literals relied on sequences such
# as '\p', '\S', '\d' and '\l' being passed through unchanged, which Python flags as
# invalid escape sequences. The resulting strings are unchanged, including the trailing
# separator on Moien's folders and its absence on Nafiseh's.
_DATASET_PATHS = {
    MOIEN: {
        'train': 'D:\\python\\SoilNet\\dataset\\l8_images\\train\\',
        'test': 'D:\\python\\SoilNet\\dataset\\l8_images\\test\\',
        'csv': 'D:\\python\\SoilNet\\dataset\\LUCAS_2015_all.csv',
        'climate': 'D:\\python\\SoilNet\\dataset\\Climate\\All\\filled\\',
    },
    NAFISEH: {
        'train': 'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-3\\SoilNet\\dataset\\l8_images\\train',
        'test': 'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-3\\SoilNet\\dataset\\l8_images\\test',
        'csv': 'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-3\\SoilNet\\dataset\\LUCAS_2015_all.csv',
        'climate': 'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-3\\SoilNet\\dataset\\Climate\\All\\filled',
    },
}


def _build_dataset(split):
    """Create the dataset for ``split`` ('train' or 'test') on the current USER's machine.

    Returns an SNDatasetClimate (image folder, CSV, climate folder) when USE_CLIMATE is
    set, otherwise an SNDataset (image folder, CSV). Both receive the selected ``bands``
    and the shared ``transform``.

    Raises:
        KeyError: if USER or ``split`` has no entry in ``_DATASET_PATHS``.
    """
    paths = _DATASET_PATHS[USER]
    if USE_CLIMATE:
        return SNDatasetClimate(paths[split], paths['csv'], paths['climate'],
                                l8_bands=bands, transform=transform, normalize_climate=False)
    return SNDataset(paths[split], paths['csv'], l8_bands=bands, transform=transform)


train_ds = _build_dataset('train')
test_ds = _build_dataset('test')

In [9]:
# Sanity check: spatial shape of the first two bands of the first training sample.
first_band = train_ds[0][0][0]
second_band = train_ds[0][0][1]
(first_band.shape, second_band.shape)

(torch.Size([2, 2]), torch.Size([2, 2]))

In [10]:
# CONFIG
# DataLoader settings per machine: larger values on Nafiseh's machine, smaller ones otherwise.
if USER == NAFISEH:
    NUM_WORKERS, TRAIN_BATCH_SIZE, TEST_BATCH_SIZE = 6, 32, 32
else:
    NUM_WORKERS, TRAIN_BATCH_SIZE, TEST_BATCH_SIZE = 2, 4, 4


In [11]:
# Batched readers over the two datasets; only the training loader shuffles its samples.
train_dl = DataLoader(
    train_ds,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)
test_dl = DataLoader(
    test_ds,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

In [12]:
def _flatten_features(features):
    """Turn one batch of inputs into a 2-D array with one row per sample.

    With USE_CLIMATE the batch is an (images, climate) pair: each part is flattened per
    sample and the two are placed side by side, e.g. (32, 4 * 12) image values followed
    by (32, 14 * 61) climate values. Without it the batch is a single image tensor.
    """
    if USE_CLIMATE:
        images_np = features[0].numpy()
        climate_np = features[1].numpy()
        images_flat = images_np.reshape(images_np.shape[0], -1)
        climate_flat = climate_np.reshape(climate_np.shape[0], -1)
        return np.concatenate([images_flat, climate_flat], axis=1)
    features_np = features.numpy()
    return features_np.reshape(features_np.shape[0], -1)


# Run the whole training loader once and collect every batch as NumPy arrays.
train_rows, train_targets = [], []
for features, target in train_dl:
    train_rows.append(_flatten_features(features))
    train_targets.append(target.numpy())

# Stack the batches: X_processed is (num_samples, num_features), y_processed is (num_samples,).
X_processed = np.concatenate(train_rows, axis=0)
y_processed = np.concatenate(train_targets, axis=0)

In [13]:
# Report shape, dtype and memory footprint of the assembled training arrays.
train_nbytes = X_processed.nbytes
print(X_processed.shape, X_processed.dtype,"|",y_processed.shape, y_processed.dtype)
print(f"Memory size of the Train array is {train_nbytes/(1024**2)} MB or {train_nbytes/(1024**3)} GB" )

(15296, 56) float32 | (15296,) float32
Memory size of the Train array is 3.267578125 MB or 0.0031909942626953125 GB


## Grid Search.
I don't know what the best parameters for the random forest are. <span style="color: green;">Please change them and let me know what works best</span>. Thank you.

In [14]:
# Define the grid of hyperparameters to search over
param_dist = {
    # 'n_estimators': randint(30, 1000),
    'n_estimators': [1, 2, 5, 10, 20, 30, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
    'max_depth': [1, 2, 3, 4, 5, 10],
    # 'max_features': [1.0, 'sqrt'],
    'min_samples_split': [1, 2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10, 15, 20],
    'max_leaf_nodes': [1, 2, 5, 10, 15, 20],
}

### Larger Grid

In [15]:
# # DEEP SEARCH
# Disabled alternative search space, kept for reference only; nothing below is executed.
# To use it, pass it as `param_distributions` in place of `param_dist`.
# NOTE(review): recent scikit-learn releases name the criteria 'squared_error' and
# 'absolute_error'; the former 'mse' / 'mae' spellings are no longer accepted.
# NOTE(review): oob_score=True and a non-None max_samples both appear to require
# bootstrap=True, so some sampled combinations would fail to fit - confirm before enabling.
# param_grid = {
#     'n_estimators': [10, 20, 30],
#     'max_depth': [None, 5, 10, 20, 30],
#     'max_features': ['sqrt', 'log2'],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'criterion': ['squared_error', 'absolute_error'],
#     'bootstrap': [True, False],
#     'oob_score': [True, False],
#     'max_samples': [0.5, 0.75, None],
#     'max_leaf_nodes': [None, 10, 20],
#     'min_impurity_decrease': [0.0, 0.1],
#     'ccp_alpha': [0.0, 0.1],
#     'warm_start': [True, False]
# }

In [16]:
# Search budget: a full search on Nafiseh's machine, a minimal smoke run anywhere else.
SEED = 42
if USER == NAFISEH:
    NUM_ITTERS, CV = 50, 10
else:
    NUM_ITTERS, CV = 1, 2

In [17]:
# Base estimator. random_state=SEED fixes the forest's own randomness so that the
# fitted trees and their cross-validation scores are repeatable between runs.
rfr_ = RandomForestRegressor(random_state=SEED)

# Randomized hyper-parameter search over param_dist.
# NOTE: no `scoring` is passed, so candidates are ranked by the estimator's default
# score rather than by the RMSE that is reported at the end of the notebook.
rfr = RandomizedSearchCV(
    estimator=rfr_,
    param_distributions=param_dist,
    n_iter=NUM_ITTERS,  # Number of combinations sampled from param_dist
    cv=CV,  # Cross-validation folds
    random_state=SEED,  # Makes the sampled combinations repeatable
    n_jobs=-1,  # Run the independent fits on all available cores
)

In [18]:
# Run the search on the training arrays; the fitted search object is the cell's output.
rfr.fit(X=X_processed, y=y_processed)

180 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nkakhani\anaconda3\envs\pytorchGPU\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\nkakhani\anaconda3\envs\pytorchGPU\lib\site-packages\sklearn\ensemble\_forest.py", line 340, in fit
    self._validate_params()
  File "c:\Users\nkakhani\anaconda3\envs\pytorchGPU\lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\nkakhani\anaconda3\envs\pytorchGPU\lib\site-packages\sklearn\utils\_param_valid

In [19]:
# Hyper-parameter combination selected by the search.
best_params = rfr.best_params_
print(best_params)

{'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 20, 'max_leaf_nodes': 20, 'max_depth': 10}


### Testing a Random Image

In [20]:
# Predict one randomly chosen test sample and print it next to its true value.
# The sample is read from the dataset once, so the features and the label are taken
# from the same fetch instead of indexing test_ds again for every field.
n_rand = np.random.randint(0, len(test_ds))
sample = test_ds[n_rand]
sample_features, sample_target = sample[0], sample[1]

if USE_CLIMATE:
    # features are an (image, climate) pair: flatten both and place them side by side
    new_image = sample_features[0].numpy()
    new_climate = sample_features[1].numpy()
    new_features_processed = np.concatenate(
        [new_image.reshape(1, -1), new_climate.reshape(1, -1)], axis=1)
else:
    # features are a single image tensor: flatten it into one row
    new_image = sample_features.numpy()
    new_features_processed = new_image.reshape(1, -1)

y_pred = rfr.predict(new_features_processed)
print("y_pred: ", y_pred[0], "|" ,"y_true: ", sample_target.numpy())

y_pred:  15.875988239077675 | y_true:  12.7


## RMSE for the whole test dataset

In [21]:
# Build the evaluation arrays from the test loader.
# NOTE: X_processed / y_processed are rebound here from the training arrays to the test
# arrays. Re-running the fit cell after this one would therefore train on test data.
test_rows, test_targets = [], []
for features, target in test_dl:
    # With climate data the batch is an (images, climate) pair, otherwise one image tensor.
    tensors = (features[0], features[1]) if USE_CLIMATE else (features,)
    # Flatten every part to (batch_size, -1) and join the parts column-wise,
    # e.g. (32, 4 * 12) image values followed by (32, 14 * 61) climate values.
    flat_parts = [t.numpy().reshape(t.shape[0], -1) for t in tensors]
    test_rows.append(np.concatenate(flat_parts, axis=1))
    test_targets.append(target.numpy())

X_processed = np.concatenate(test_rows, axis=0)  # (num_samples, num_features)
y_processed = np.concatenate(test_targets, axis=0)  # (num_samples,)

In [22]:
# Predict every test sample with the refitted best estimator.
y_pred = rfr.predict(X=X_processed)

In [23]:
# Root-mean-squared error between the test labels and the predictions.
test_mse = mean_squared_error(y_processed, y_pred)
rmse = np.sqrt(test_mse)
print('RMSE:', rmse)

RMSE: 21.26010056233226


In [24]:
# Record the finish time, plus a filesystem-friendly variant used in the result file name.
now = datetime.now()
finish_string = f"{now:%Y-%m-%d %H:%M:%S}"
file_name = f"{now:D_%Y_%m_%d_T_%H_%M}"
print("Current Date and Time:", finish_string)
print("File Name:", file_name)

Current Date and Time: 2023-05-17 18:14:36
File Name: D_2023_05_17_T_18_14


In [25]:
# Everything needed to interpret this run later: score, switches, search budget,
# search space, winning parameters and wall-clock times.
log_json = {
    'RMSE': rmse,
    'USE_CLIMATE': USE_CLIMATE,
    'USE_SRTM': USE_SRTM,
    'NUM_ITTERS': NUM_ITTERS,
    'CV': CV,
    'SEED': SEED,
    # stored as text so the search space is logged exactly as it was written
    'param_dist': str(param_dist),
    'BEST_PARAMS': rfr.best_params_,
    'TIME': {'start': start_string, 'finish': finish_string},
}

log_json

{'RMSE': 21.26010056233226,
 'USE_CLIMATE': False,
 'USE_SRTM': True,
 'NUM_ITTERS': 50,
 'CV': 10,
 'SEED': 42,
 'param_dist': "{'n_estimators': [1, 2, 5, 10, 20, 30, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500], 'max_depth': [1, 2, 3, 4, 5, 10], 'min_samples_split': [1, 2, 5, 10], 'min_samples_leaf': [1, 2, 5, 10, 15, 20], 'max_leaf_nodes': [1, 2, 5, 10, 15, 20]}",
 'BEST_PARAMS': {'n_estimators': 300,
  'min_samples_split': 5,
  'min_samples_leaf': 20,
  'max_leaf_nodes': 20,
  'max_depth': 10},
 'TIME': {'start': '2023-05-17 16:03:34', 'finish': '2023-05-17 18:14:36'}}

In [26]:
# Write the run log to results/, named after the finish time and the user.
results_path = f"results/RF_{file_name}_{USER}.json"
with open(results_path, "w") as out_file:
    json.dump(log_json, out_file, indent=4)