# Random Forest Regression

In [61]:
from train_utils import *
import torch
from torch.utils.data import DataLoader
from dataset.dataset_loader import SNDataset, myNormalize, myToTensor, Augmentations, RFTransform
from torchvision import transforms
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import mean_squared_error

In [62]:
import os

NAFISEH = "Nafiseh"
MOIEN = "Moien"

# Identify who is running the notebook from the drive that appears in the
# current working directory. The drive letter can be reported in either case
# ("D:\..." or "d:\..."), so the path is lower-cased before matching.
cwd_lower = os.getcwd().lower()

if "d:" in cwd_lower:
    USER = MOIEN
elif "c:" in cwd_lower:
    USER = NAFISEH
else:
    # Neither known drive was found; stop early instead of guessing paths later.
    raise Exception("Unknown user")

USER

'Moien'

In [63]:
# Setup device-agnostic code: pick the GPU when CUDA is available, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

### No Normalization
Random Forest doesn't need normalization, so I added the RF transform; it only reshapes the image into channels-first format.
Then I used myTransform to resize to 64x64.

You can test my Normalize transform by uncommenting the line in the cell below.

In [64]:
# Optional normalisation step, left disabled; uncomment to create it.
#mynorm = myNormalize(img_bands_min_max =[[(0,7),(0,1)], [(7,12),(-1,1)], [(12), (-4,2963)], [(13), (0, 90)]], oc_min = 0, oc_max = 200)

# Preprocessing pipeline applied to every sample: RFTransform, then myToTensor.
rf_transform, my_to_tensor = RFTransform(), myToTensor()
pipeline_steps = [rf_transform, my_to_tensor]
transform = transforms.Compose(pipeline_steps)

### Bands to use

In [65]:
# Indices of the image bands passed to the dataset as `l8_bands`: 0 through 11.
bands = list(range(12))

In [66]:
# Per-user dataset locations. Every backslash is written as "\\" so that no
# path relies on an invalid escape sequence (e.g. "\p", "\S", "\d", "\l");
# the resulting path strings are unchanged.
if USER == MOIEN:
    train_dir = 'D:\\python\\SoilNet\\dataset\\l8_images\\train\\'
    test_dir = 'D:\\python\\SoilNet\\dataset\\l8_images\\test\\'
    csv_path = 'D:\\python\\SoilNet\\dataset\\LUCAS_2015_all.csv'
elif USER == NAFISEH:
    train_dir = 'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\l8_images\\train'
    test_dir = 'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\l8_images\\test'
    csv_path = 'C:\\Users\\nkakhani\\_Multimodal\\SoilNet-1\\dataset\\LUCAS_2015_all.csv'
else:
    # Fail here with a clear message rather than with a NameError further down.
    raise ValueError(f"No dataset paths configured for user {USER!r}")

# Both splits read the same CSV and use the same bands and transform;
# only the image directory differs.
train_ds = SNDataset(train_dir, csv_path, l8_bands=bands, transform=transform)
test_ds = SNDataset(test_dir, csv_path, l8_bands=bands, transform=transform)

In [76]:
# CONFIG
# DataLoader settings depend on which user runs the notebook.
if USER == NAFISEH:
    NUM_WORKERS, TRAIN_BATCH_SIZE, TEST_BATCH_SIZE = 6, 32, 32
else:
    NUM_WORKERS, TRAIN_BATCH_SIZE, TEST_BATCH_SIZE = 2, 4, 4

# RF CONFIG
N_ESTIMATORS = 100  # number of trees in the forest
MAX_DEPTH = 10      # maximum depth of each tree

In [68]:
# Batched loaders over the two datasets; only the training loader shuffles.
train_dl = DataLoader(
    train_ds,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)
test_dl = DataLoader(
    test_ds,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

# <span style="color:red; font-weight:bold;">Please Be Careful; The Cell below might consume a lot of RAM</span>
We are loading the whole dataset into RAM in order to pass it through the Random Forest Regressor.
* Open up the Task Manager and monitor the RAM usage; if it passes 30 GB, you might want to stop the execution of the code or force-kill VS Code using the Task Manager.
* By my calculations it should not take more than 4-8 GB of RAM, but it might be different for you.

```python

In [69]:
# Preprocess the data using the DataLoader:
# collect every training batch as NumPy arrays, then stack them.
train_feature_batches = []
train_target_batches = []
for images, labels in train_dl:
    images_np = images.numpy()
    # Flatten each sample to a single row: (batch, ...) -> (batch, n_features)
    train_feature_batches.append(images_np.reshape(len(images_np), -1))
    train_target_batches.append(labels.numpy())

X_processed = np.concatenate(train_feature_batches, axis=0)
y_processed = np.concatenate(train_target_batches, axis=0)

In [70]:
# Report shape and dtype of both arrays, then the feature matrix's memory footprint.
print(X_processed.shape, X_processed.dtype,"|",y_processed.shape, y_processed.dtype)
train_nbytes = X_processed.nbytes
print(f"Memory size of the Train array is {train_nbytes/(1024**2)} MB or {train_nbytes/(1024**3)} GB" )

(52, 49152) float32 | (52,) float32
Memory size of the Train array is 9.75 MB or 0.009521484375 GB


## Train the Random Forest Regressor
This code took `1 minute` to run on my machine with a sample of 50 images, so for you it might take around `7hrs`.

It used only 15% of my CPU. Since your CPU is faster than mine, it might take less time even if it also uses only 15% of yours.


In [71]:
# Train a RandomForestRegressor on the processed data
rfr = RandomForestRegressor(n_estimators=N_ESTIMATORS, max_depth=MAX_DEPTH, random_state=42)
rfr.fit(X_processed, y_processed)

### Testing a Random Image

In [72]:
# Use the trained model to predict on a new image
n_rand = np.random.randint(0, len(test_ds))
new_image = test_ds[n_rand][0].numpy()
new_image_processed = new_image.reshape(1, -1)
y_pred = rfr.predict(new_image_processed)
print("y_pred: ", y_pred[0], "|" ,"y_true: ", test_ds[n_rand][1].numpy())

y_pred:  15.13699984550476 | y_true:  13.7


## RMSE for the whole test dataset

In [73]:
# Preprocess the data using the DataLoader (test split this time).
# The results are stored under the same names as the training arrays,
# so X_processed / y_processed hold the test data from here on.
test_feature_batches, test_target_batches = [], []
for images, labels in test_dl:
    images_np = images.numpy()
    n_samples = images_np.shape[0]
    # One flattened row per sample
    test_feature_batches.append(images_np.reshape(n_samples, -1))
    test_target_batches.append(labels.numpy())

X_processed = np.concatenate(test_feature_batches, axis=0)
y_processed = np.concatenate(test_target_batches, axis=0)

In [74]:
# Predict a target for every row of X_processed (at this point rebuilt from test_dl).
y_pred = rfr.predict(X_processed)

In [75]:
# Root-mean-squared error between the true targets and the predictions.
mse = mean_squared_error(y_processed, y_pred)
rmse = np.sqrt(mse)
print('RMSE:', rmse)

RMSE: 3.6248083302021628
