<a href="https://colab.research.google.com/github/masonritchotte/SoilRegression/blob/3-finalsetup/ImageProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initializing Values and Data for Models

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from google.colab import files
import os
import zipfile
from PIL import Image
import shutil


os.environ['KAGGLE_CONFIG_DIR'] = "/content"

Use Kaggle to download dataset directly into runtime

In [3]:
!kaggle datasets download -d bnneett/soilstuffpp

Downloading soilstuffpp.zip to /content
 92% 118M/129M [00:01<00:00, 76.0MB/s]
100% 129M/129M [00:01<00:00, 83.8MB/s]


Extract files from zipped dataset

In [12]:
zip_ref = zipfile.ZipFile('soilstuffpp.zip', 'r') #Opens the zip file in read mode
zip_ref.extractall('/content/') #Extracts the files into the /tmp folder
zip_ref.close()

# Data Intake (Carbon & Nitrogen Concentration)

In [46]:
# Replace path with the path to your actual text file for carbon data
carbon_data_file_path = '/content/carbondata.txt'

# Initialize an empty list to store the carbon data
carbon_data = []

# Open the file and read the lines
with open(carbon_data_file_path, 'r') as file:
    for line in file:
        # Convert each line to a float and append to the list
        # You might need to adjust the conversion depending on the format of your numbers
        carbon_data.append(float(line.strip()))

print(carbon_data)  # This will print out the list of carbon data


[7.031554344, 3.964311551, 4.056773071, 0.9126810004, 0.6960307436, 0.2506006665, 0.0, 3.237992445, 2.337853183, 2.343648841, 0.7061660351, 0.6086380768, 0.4231493666, 0.2130257449, 3.335775061, 2.496328928, 2.193573979, 0.6180469716, 0.252476742, 0.1667946308, 0.1376720901, 3.909415971, 3.20750435, 2.886914159, 0.7854783647, 0.459141842, 0.2700636943, 4.732968162, 3.208573464, 3.190376569, 0.9107418752, 0.4050530366, 0.2483817552, 0.2089194685, 2.805161836, 1.793886891, 1.664405246, 0.6572029443, 0.2751137737, 0.2194896205, 0.2060262683, 6.721915285, 1.737633704, 0.8082622362, 5.030954708, 5.345057977, 1.565592414, 0.5872953286, 0.3538600054, 0.2766272189, 10.77628316, 6.469539688, 4.543040603, 1.207888611, 0.4921092822, 0.2789907812, 0.2752313941, 10.14722681, 6.609356507, 5.189678813, 0.9488408491, 0.475505136, 8.747855918, 2.805649042, 1.719393259, 0.4983455734, 0.3550038114, 0.2271657491, 0.2410333532, 8.877070976, 2.414188239, 1.5943154, 0.7772972091, 0.5295955986, 0.3411564026, 

In [34]:
# Replace path with the path to your actual text file for nitrogen data
nitrogen_data_file_path = '/content/nitrogendata.txt'

# Initialize an empty list to store the carbon data
nitrogen_data = []

# Open the file and read the lines
with open(nitrogen_data_file_path, 'r') as file:
    for line in file:
        # Convert each line to a float and append to the list
        # You might need to adjust the conversion depending on the format of your numbers
        nitrogen_data.append(float(line.strip()))

print(nitrogen_data)  # This will print out the list of nitrogen data


[0.5064277367, 0.2845430432, 0.2990918484, 0.07313149042, 0.04837279299, 0.01291756013, 0.0, 0.2482460874, 0.1736690936, 0.1683283698, 0.04018831094, 0.03565243965, 0.02580179064, 0.0, 0.264833493, 0.2019089574, 0.1586671956, 0.04377832715, 0.0, 0.0, 0.0, 0.3098927294, 0.2269460625, 0.216518562, 0.05273529657, 0.02732987155, 0.0, 0.3680246491, 0.242828466, 0.2428272564, 0.05963190849, 0.01518948887, 0.0, 0.0, 0.1903955997, 0.1389342923, 0.1085481682, 0.04468980021, 0.01285578382, 0.0, 0.0, 0.3766951281, 0.09425843203, 0.04490345757, 0.3193222548, 0.3276003276, 0.07384869879, 0.02905347915, 0.01884461567, 0.01035502959, 0.6312467123, 0.3512635001, 0.2172548641, 0.07466947775, 0.02121160699, 0.008664309974, 0.0, 0.6155603503, 0.355073818, 0.2700793618, 0.05624572043, 0.02963088385, 0.5844609618, 0.198156827, 0.1342931995, 0.03228149463, 0.01306762496, 0.0, 0.0, 0.5931683371, 0.1823011615, 0.1224099839, 0.06120450465, 0.03742725078, 0.01372468286, 1.043068641, 0.4111321948, 0.2379613205, 

In [39]:
# Replace 'your_carbon_data.txt' with the path to your actual text file
mgc02_data_file_path = '/content/mgc02pergram.txt'  # Update with your file path

# Initialize an empty list to store the carbon data
mgc02_data = []

# Open the file and read the lines
with open(mgc02_data_file_path, 'r') as file:
    for line in file:
        # Convert each line to a float and append to the list
        # You might need to adjust the conversion depending on the format of your numbers
        mgc02_data.append(float(line.strip()))

print(mgc02_data)  # This will print out the list of carbon data


[2.675623389, 1.832589458, 0.9009793016, 0.1385681896, 0.2149792223, 0.07365937413, 0.4228454808, 2.122576515, 0.8976664342, 0.5406540947, 0.2334875784, 0.1838275553, 0.4367109439, 0.1781878888, 2.098350307, 1.293886733, 0.577776445, 0.8486912144, 0.2328482368, 0.06133386948, 0.04299799028, 1.953824827, 0.8533061973, 0.8651955978, 0.1230602169, 0.09819780664, 0.0676459196, 2.523714617, 0.9617914546, 0.725765017, 0.2886143675, 0.03686461938, 0.0, 0.0, 1.574383702, 0.655553702, 0.4471356295, 0.1717006091, 0.01228759306, 0.0, 0.04294654959, 1.90824409, 0.3739688797, 0.03679482089, 1.660565785, 1.697043241, 0.3374035376, 0.1535489245, 0.0, 0.0, 2.798637489, 1.193505105, 0.4121281066, 0.0, 0.006139505956, 0.0, 0.0, 3.494135853, 1.117613082, 0.854158268, 0.03066082305, 0.006136138992, 3.634118837, 1.050065039, 0.3492549672, 0.06738330852, 0.05537709759, 0.01230294641, 0.0, 3.41615162, 0.718071461, 0.1658080156, 0.04299369885, 0.006149014583, 0.1165982669, 5.230167199, 2.33810209, 0.920971834

In [14]:
# Directory where the images are located
image_directory = '/content/pp/'  # Replace with the directory containing your images

# Get a list of filenames in the directory and sort them alphanumerically
filenames = sorted(os.listdir(image_directory))

# Loop through the sorted filenames
for i, filename in enumerate(filenames):
    # Construct the new filename
    new_filename = f"{i+1}_pp.jpg"

    # Rename the file
    old_filepath = os.path.join(image_directory, filename)
    new_filepath = os.path.join(image_directory, new_filename)
    os.rename(old_filepath, new_filepath)

# Load images
images = {}
for filename in os.listdir(image_directory):
    image_path = os.path.join(image_directory, filename)
    if os.path.isfile(image_path):
        image = cv2.imread(image_path)
        images[filename] = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

In [18]:
def extract_median_rgb(image, center, radius):
    """
    Extract the median RGB values from the center circle of the image.
    :param image: Input image
    :param center: Center coordinates of the circle
    :param radius: Radius of the circle
    :return: Median RGB values
    """
    mask = np.zeros(image.shape[:2], dtype="uint8")
    cv2.circle(mask, center, radius, 255, -1)
    masked = cv2.bitwise_and(image, image, mask=mask)
    median_rgb = [np.median(masked[:,:,i][mask==255]) for i in range(3)]
    #mask_colored = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)

    # Create an image that shows the mask as a semi-transparent overlay on the original image
    #overlay = cv2.addWeighted(image, 0.7, mask_colored, 0.3, 0)

    return median_rgb#, overlay

# Example usage (you need to define the center and radius as per your image)
# center = (x, y)  # Replace with the center of your circle
# radius = r  # Replace with the radius of your circle
# median_rgbs = [extract_median_rgb(image, center, radius) for image in images.values()]


In [40]:
centerpoint = (1530, 1300)
radius = 250 # 135
median_rgbs = [extract_median_rgb(image, centerpoint, radius) for image in images.values()]

# These lines are for determining how the mask is affecting the images
#for i, (filename, image) in enumerate(images.items()):
#    median_rgb, overlay_image = extract_median_rgb(image, centerpoint, radius)
#    plt.figure(figsize=(10, 10))
#    plt.imshow(cv2.cvtColor(overlay_image, cv2.COLOR_BGR2RGB))
#    plt.title(f"Overlay for {filename}")
#    plt.show()

# Create a DataFrame
df = pd.DataFrame(median_rgbs, columns=['Red', 'Green', 'Blue'])
df['Nitrogen'] = nitrogen_data
df['mgCO2'] = mgc02_data
df['Carbon'] = carbon_data

# Random Search using Random Forest

In [33]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[['Red', 'Green', 'Blue', 'mgCO2', 'Nitrogen']], df['Carbon'], test_size=0.2, random_state=42)

# Hyperparameter grid to sample from during fitting
random_grid = {
    'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the Random Forest Regressor
rf = RandomForestRegressor()

# Random search of parameters
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=70, cv=2, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
rf_random.fit(X_train, y_train)

# Get the best model
best_random = rf_random.best_estimator_

# Predictions on the test set using the best model
predictions = best_random.predict(X_test)

# Evaluate the model on the test set
mse_test = mean_squared_error(y_test, predictions)
print(f"Test Mean Squared Error with best random model: {mse_test}")

# Display the best hyperparameters
print(f"Best hyperparameters: {rf_random.best_params_}")

# Display actual vs predicted values for each point in the test set using the best model
comparison_df = pd.DataFrame({'Actual Carbon': y_test, 'Predicted Carbon': predictions})
comparison_df = comparison_df.reset_index(drop=True)
print(comparison_df)

Fitting 2 folds for each of 70 candidates, totalling 140 fits


  warn(


Test Mean Squared Error with best random model: 0.14466425611761452
Best hyperparameters: {'n_estimators': 1400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': True}
    Actual Carbon  Predicted Carbon
0        3.753918          3.360213
1        1.207889          1.387381
2        2.414188          2.521813
3        5.345058          4.648686
4        5.030955          4.129449
5        0.219490          0.246696
6        3.207504          3.913026
7        0.394477          0.244356
8        0.706166          0.588653
9        7.031554          7.225529
10       0.252477          0.226966
11       0.910742          0.814800
12       0.529596          0.556725
13       0.208919          0.220295
14       2.639057          2.780169
15       0.696031          0.762513
16       5.857052          6.736255
17       3.314152          3.377801
18       0.423149          0.492107
19       0.405053          0.318432


# Random Forest Alone

In [45]:
# Split the data into features and target
X_train, X_test, y_train, y_test = train_test_split(df[['Red', 'Green', 'Blue', 'Nitrogen']], df['Carbon'], test_size=0.3, random_state=42)

# Init range of hyperparameters for grid search
parameters = {
    'kernel': ['rbf'],
    'C': [1e2, 1e3, 1e4, 1e5],
    'gamma': [0.001, 0.0001],
    'epsilon': [0.1, 0.2, 0.3, 0.5]
}

#svr = SVR()

# Initialize Grid Search with cross-validation
#grid_search = GridSearchCV(svr, parameters, cv=5, scoring='neg_mean_squared_error')

# Fit Grid Search on the training data
#grid_search.fit(X_train, y_train)

# Get the best model
#best_model = grid_search.best_estimator_

random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)

predictions = random_forest_model.predict(X_test)

mse_test = mean_squared_error(y_test, predictions)

print(f"Test Mean Squared Error with best model: {mse_test}")

# Display the best hyperparameters
#print(f"Best hyperparameters: {grid_search.best_params_}")

# View a single prediction
# For example, we can predict the carbon content for the first test sample
# single_prediction = model.predict(X_test.iloc[[0]])
# print(f"Actual carbon content for first test sample: {y_test.iloc[0]}")
# print(f"Predicted carbon content for first test sample: {single_prediction[0]}")
comparison_df = pd.DataFrame({'Actual Carbon': y_test, 'Predicted Carbon': predictions})
comparison_df = comparison_df.reset_index(drop=True)
print(comparison_df)

Test Mean Squared Error with best model: 0.09549464999472194
    Actual Carbon  Predicted Carbon
0        3.753918          3.403073
1        1.207889          1.388217
2        2.414188          2.776285
3        5.345058          4.808837
4        5.030955          4.505649
5        0.219490          0.228459
6        3.207504          3.850800
7        0.394477          0.218253
8        0.706166          0.594485
9        7.031554          7.088297
10       0.252477          0.224854
11       0.910742          0.877341
12       0.529596          0.572564
13       0.208919          0.194989
14       2.639057          2.761748
15       0.696031          0.755816
16       5.857052          6.358054
17       3.314152          3.494950
18       0.423149          0.494956
19       0.405053          0.312894
20       0.278991          0.249508
21       3.792446          4.214872
22       0.270064          0.216662
23       1.737634          1.553366
24       8.877071          9.657074
25 

# Random Forest Multi Output Regressor

In [31]:
from sklearn.multioutput import MultiOutputRegressor

# Multi-output target
y = df[['Carbon', 'Nitrogen']]

# Train/test split remains the same
X_train, X_test, y_train, y_test = train_test_split(df[['Red', 'Green', 'Blue']], y, test_size=0.2, random_state=42)

# Multi-output model
multi_output_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))

# Fit the model
multi_output_model.fit(X_train, y_train)

# Predictions on the test set
predictions = multi_output_model.predict(X_test)

# You can evaluate each target separately
mse_test_carbon = mean_squared_error(y_test['Carbon'], predictions[:, 0])
mse_test_nitrogen = mean_squared_error(y_test['Nitrogen'], predictions[:, 1])

print(f"Test Mean Squared Error for Carbon: {mse_test_carbon}")
print(f"Test Mean Squared Error for Nitrogen: {mse_test_nitrogen}")

comparison_df = pd.DataFrame({
    'Actual Carbon': y_test['Carbon'],
    'Predicted Carbon': predictions[:, 0],
    'Actual Nitrogen': y_test['Nitrogen'],
    'Predicted Nitrogen': predictions[:, 1]
})

# Reset the index of the DataFrame for better visualization
comparison_df.reset_index(drop=True, inplace=True)

# Display the comparison DataFrame
print(comparison_df)


Test Mean Squared Error for Carbon: 12.187635747036973
Test Mean Squared Error for Nitrogen: 0.05038677449811626
    Actual Carbon  Predicted Carbon  Actual Nitrogen  Predicted Nitrogen
0        3.753918          2.649720         0.253548            0.146930
1        1.207889          0.880892         0.074669            0.047836
2        2.414188          0.986716         0.182301            0.077899
3        5.345058          3.354053         0.327600            0.206363
4        5.030955          0.524148         0.319322            0.040052
5        0.219490          3.018922         0.000000            0.189341
6        3.207504          0.586993         0.226946            0.044046
7        0.394477          2.783721         0.000000            0.205569
8        0.706166          1.736205         0.040188            0.141804
9        7.031554          2.051100         0.506428            0.126313
10       0.252477          1.233133         0.000000            0.084531
11       0.