# Random Forest Regression

In [1]:
from train_utils import *
import torch
from torch.utils.data import DataLoader
from dataset.dataset_loader import SNDataset, SNDatasetClimate, myNormalize, myToTensor, Augmentations, RFTransform, TensorCenterPixels
from torchvision import transforms
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import numpy as np
from sklearn.metrics import mean_squared_error


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
os.getcwd()

NAFISEH = "Nafiseh"
MOIEN = "Moien"

if "d:" in os.getcwd():
    USER = MOIEN
elif "c:" in os.getcwd():
    USER = NAFISEH
else:
    raise Exception("Unknown user")

USER

'Moien'

In [3]:
# Setup device-agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
USE_CLIMATE = True
USE_SRTM = True

### No Normalization
Random Forest doens't need normalization. So I addedthe RF transfom, it only reshapes the image into channels first format.
then used myTransfomr to resize to 64x64.

You can test my Normalize transform by uncommenting the line in the cell below.

### Cut Center
Cuts a 2x2 square from the center of the image.
If `interplate_center_pixel` is set to True, then the center pixel is interpolated from the 4 surrounding pixels.

In [5]:
#mynorm = myNormalize(img_bands_min_max =[[(0,7),(0,1)], [(7,12),(-1,1)], [(12), (-4,2963)], [(13), (0, 90)]], oc_min = 0, oc_max = 200)
rf_transform = RFTransform()
my_to_tensor = myToTensor()
cut_center = TensorCenterPixels(pixel_radius=1 ,interpolate_center_pixel=True)
transform = transforms.Compose([rf_transform,my_to_tensor,cut_center])

### Bands to use

In [6]:
bands = [0,1,2,3,4,5,6,7,8,9,10,11] if not USE_SRTM else [0,1,2,3,4,5,6,7,8,9,10,11,12,13]

########################################################################################
################################# IF Not USE_LSTM_BRANCH ###############################
########################################################################################

if not USE_CLIMATE: # NOT USING THE CLIMATE DATA
    if USER == MOIEN:
        train_ds = SNDataset('D:\python\SoilNet\dataset\l8_images\\train\\','D:\python\SoilNet\dataset\LUCAS_2015_all.csv',l8_bands=bands, transform=transform)
    elif USER == NAFISEH:
        train_ds = SNDataset('C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\l8_images\\train',\
                            'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\LUCAS_2015_all.csv',l8_bands=bands, transform=transform) #Nafiseh 
    if USER == MOIEN:
        test_ds = SNDataset('D:\python\SoilNet\dataset\l8_images\\test\\','D:\python\SoilNet\dataset\LUCAS_2015_all.csv',l8_bands=bands, transform=transform)
    elif USER == NAFISEH:
        test_ds = SNDataset('C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\l8_images\\test',\
                            'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\LUCAS_2015_all.csv',l8_bands=bands, transform=transform) #Nafiseh 
        
########################################################################################
################################### IF USE_LSTM_BRANCH #################################
########################################################################################
else: # USING THE CLIMATE DATA
    if USER == MOIEN:
        train_ds = SNDatasetClimate('D:\python\SoilNet\dataset\l8_images\\train\\',
                                    'D:\python\SoilNet\dataset\LUCAS_2015_all.csv',
                                    "D:\\python\\SoilNet\\dataset\\Climate\\All\\filled\\",
                                    l8_bands=bands, transform=transform)
    elif USER == NAFISEH:
        train_ds = SNDatasetClimate('C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\l8_images\\train',\
                            'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\LUCAS_2015_all.csv'
                            'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\Climate\\All\\filled',
                            l8_bands=bands, transform=transform) #Nafiseh 
    if USER == MOIEN:
        test_ds = SNDatasetClimate('D:\python\SoilNet\dataset\l8_images\\test\\',
                                'D:\python\SoilNet\dataset\LUCAS_2015_all.csv',
                                "D:\\python\\SoilNet\\dataset\\Climate\\All\\filled\\",
                                l8_bands=bands, transform=transform)
    elif USER == NAFISEH:
        test_ds = SNDatasetClimate('C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\l8_images\\test',\
                            'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\LUCAS_2015_all.csv',
                            'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\Climate\\All\\filled',
                            l8_bands=bands, transform=transform) #Nafiseh

In [9]:
train_ds[0][0][0].shape, train_ds[0][0][1].shape

(torch.Size([14, 1, 1]), torch.Size([61, 14]))

In [10]:
# CONFIG
NUM_WORKERS = 6 if USER == NAFISEH else 2
TRAIN_BATCH_SIZE = 32 if USER == NAFISEH else 4
TEST_BATCH_SIZE = 32 if USER == NAFISEH else 4


In [11]:
train_dl = DataLoader(train_ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
test_dl = DataLoader(test_ds, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

In [16]:
# Preprocess the data using the DataLoader
X_processed = []
y_processed = []
for batch_idx, (features, target) in enumerate(train_dl):
    images_np = features[0].numpy()
    climate_np = features[1].numpy()
    # Preprocess the features as needed
    images_processed = images_np.reshape(images_np.shape[0], -1) # Flatten the images with shape (batch_size, num_pixels * num_bands) -> e.g: (32, 4 * 12) if 4 pixel is being used or (32, 1 * 12) if 1 pixel is being used 
    climate_processed = climate_np.reshape(climate_np.shape[0], -1) # Flatten the climate data with shape (batch_size, num_climate_features * sequence_length) -> e.g: (32, 14*61) if 14 climate feature is being used and the each feature is a sequence of 61 months
    features_processed = np.concatenate([images_processed, climate_processed], axis=1)
    X_processed.append(features_processed)
    y_processed.append(target.numpy())

X_processed = np.concatenate(X_processed, axis=0)  # (DataLoader Length, num_pixels * num_bands + num_climate_features * sequence_length)
y_processed = np.concatenate(y_processed, axis=0)  # (DataLoader Length,)

In [17]:
print(X_processed.shape, X_processed.dtype,"|",y_processed.shape, y_processed.dtype)
print(f"Memory size of the Train array is {X_processed.nbytes/(1024**2)} MB or {X_processed.nbytes/(1024**3)} GB" )

(52, 868) float32 | (52,) float32
Memory size of the Train array is 0.17218017578125 MB or 0.00016814470291137695 GB


## Grid Search.
I don't know what are the best parameters for the random forest. <span style="color: green;">Please change them and let me know what works best</span>. Thank you

In [25]:
# Define the grid of hyperparameters to search over
param_dist = {
    'n_estimators': randint(90, 100),
    'max_depth': [10, 20, 30],
    # 'max_features': [1.0, 'sqrt'],
    # 'min_samples_split': randint(2, 10),
    # 'min_samples_leaf': randint(1, 10)
}

### Larger Grid

In [26]:
# # DEEP SEARCH
# param_grid = {
#     'n_estimators': [10, 20, 30],
#     'max_depth': [None, 5, 10, 20, 30],
#     'max_features': ['sqrt', 'log2'],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'criterion': ['mse', 'mae'],
#     'bootstrap': [True, False],
#     'oob_score': [True, False],
#     'max_samples': [0.5, 0.75, None],
#     'max_leaf_nodes': [None, 10, 20],
#     'min_impurity_decrease': [0.0, 0.1],
#     'ccp_alpha': [0.0, 0.1],
#     'warm_start': [True, False]
# }

In [27]:
# Define RandomForestRegressor
rfr_ = RandomForestRegressor()
# Define the randomized search object
rfr = RandomizedSearchCV(
    estimator=rfr_,
    param_distributions=param_dist,
    n_iter=1, # Number of Combinations from the grid to try
    cv=10, # Corss Validation Folds
    random_state=42
)

In [28]:
rfr.fit(X_processed, y_processed)

In [29]:
print(rfr.best_params_)

{'max_depth': 30, 'n_estimators': 93}


### Testing a Random Image

In [33]:
# Use the trained model to predict on a new image
n_rand = np.random.randint(0, len(test_ds))
new_image = test_ds[n_rand][0][0].numpy()
new_climate = test_ds[n_rand][0][1].numpy()
new_image_processed = new_image.reshape(1, -1)
new_climate_processed = new_climate.reshape(1, -1)
new_features_processed = np.concatenate([new_image_processed, new_climate_processed], axis=1)
y_pred = rfr.predict(new_features_processed)
print("y_pred: ", y_pred[0], "|" ,"y_true: ", test_ds[n_rand][1].numpy())

y_pred:  19.247312189430318 | y_true:  18.7


## RMSE for the whole dataset.

In [34]:
# Preprocess the data using the DataLoader
X_processed = []
y_processed = []
for batch_idx, (features, target) in enumerate(train_dl):
    images_np = features[0].numpy()
    climate_np = features[1].numpy()
    # Preprocess the features as needed
    images_processed = images_np.reshape(images_np.shape[0], -1) # Flatten the images with shape (batch_size, num_pixels * num_bands) -> e.g: (32, 4 * 12) if 4 pixel is being used or (32, 1 * 12) if 1 pixel is being used 
    climate_processed = climate_np.reshape(climate_np.shape[0], -1) # Flatten the climate data with shape (batch_size, num_climate_features * sequence_length) -> e.g: (32, 14*61) if 14 climate feature is being used and the each feature is a sequence of 61 months
    features_processed = np.concatenate([images_processed, climate_processed], axis=1)
    X_processed.append(features_processed)
    y_processed.append(target.numpy())

X_processed = np.concatenate(X_processed, axis=0)  # (DataLoader Length, num_pixels * num_bands + num_climate_features * sequence_length)
y_processed = np.concatenate(y_processed, axis=0)  # (DataLoader Length,)

In [35]:
y_pred = rfr.predict(X_processed)

In [36]:
rmse = np.sqrt(mean_squared_error(y_processed, y_pred))
print('RMSE:', rmse)

RMSE: 2.9635462537634716
