<a href="https://colab.research.google.com/github/masonritchotte/SoilRegression/blob/1-graphtestandrandomforest/ImageProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
# Import necessary libraries
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from google.colab import files
import os
import zipfile
from PIL import Image
import shutil


os.environ['KAGGLE_CONFIG_DIR'] = "/content"

In [2]:
!kaggle datasets download -d masonritchotte/soil-sample-carbon-regression

Downloading soil-sample-carbon-regression.zip to /content
 99% 287M/291M [00:05<00:00, 44.5MB/s]
100% 291M/291M [00:05<00:00, 53.2MB/s]


In [3]:
zip_ref = zipfile.ZipFile('soil-sample-carbon-regression.zip', 'r') #Opens the zip file in read mode
zip_ref.extractall('/content/goo') #Extracts the files into the /tmp folder
zip_ref.close()

In [6]:
def get_time_taken(image_path):
    """
    Extracts the time taken from the image EXIF data.
    """
    with Image.open(image_path) as img:
        exif_data = img._getexif()
        # EXIF Tags: https://www.exiv2.org/tags.html
        # 36867 is the tag for DateTimeOriginal
        time_taken = exif_data.get(36867) if exif_data else '0000:00:00 00:00:00'
        return time_taken


In [7]:
# Replace 'your_carbon_data.txt' with the path to your actual text file
carbon_data_file_path = '/content/carbondata.txt'  # Update with your file path

# Initialize an empty list to store the carbon data
carbon_data = []

# Open the file and read the lines
with open(carbon_data_file_path, 'r') as file:
    for line in file:
        # Convert each line to a float and append to the list
        # You might need to adjust the conversion depending on the format of your numbers
        carbon_data.append(float(line.strip()))

print(carbon_data)  # This will print out the list of carbon data


[7.031554344, 3.964311551, 4.056773071, 0.9126810004, 0.6960307436, 0.2506006665, 0.0, 3.237992445, 2.337853183, 2.343648841, 0.7061660351, 0.6086380768, 0.4231493666, 0.2130257449, 3.335775061, 2.496328928, 2.193573979, 0.6180469716, 0.252476742, 0.1667946308, 0.1376720901, 3.909415971, 3.20750435, 2.886914159, 0.7854783647, 0.459141842, 0.2700636943, 4.732968162, 3.208573464, 3.190376569, 0.9107418752, 0.4050530366, 0.2483817552, 0.2089194685, 2.805161836, 1.793886891, 1.664405246, 0.6572029443, 0.2751137737, 0.2194896205, 0.2060262683, 6.721915285, 1.737633704, 0.8082622362, 5.030954708, 5.345057977, 1.565592414, 0.5872953286, 0.3538600054, 0.2766272189, 10.77628316, 6.469539688, 4.543040603, 1.207888611, 0.4921092822, 0.2789907812, 0.2752313941, 10.14722681, 6.609356507, 5.189678813, 0.9488408491, 0.475505136, 8.747855918, 2.805649042, 1.719393259, 0.4983455734, 0.3550038114, 0.2271657491, 0.2410333532, 8.877070976, 2.414188239, 1.5943154, 0.7772972091, 0.5295955986, 0.3411564026, 

In [9]:
# Code to upload files
image_directory = '/content/goo'

images_with_time = []
for filename in os.listdir(image_directory):
    if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        image_path = os.path.join(image_directory, filename)
        time_taken = get_time_taken(image_path)
        images_with_time.append((image_path, time_taken))

# Sort images based on time taken
images_with_time.sort(key=lambda x: x[1])

# Rename images sequentially
for i, (image_path, _) in enumerate(images_with_time, start=1):
    new_name = f"Image{i}.JPG"  # Change the extension if needed
    new_path = os.path.join(image_directory, new_name)
    shutil.move(image_path, new_path)

# Load images
images = {}
for filename in os.listdir(image_directory):
    image_path = os.path.join(image_directory, filename)
    if os.path.isfile(image_path):
        image = cv2.imread(image_path)
        images[filename] = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

In [33]:
def extract_median_rgb(image, center, radius):
    """
    Extract the median RGB values from the center circle of the image.
    :param image: Input image
    :param center: Center coordinates of the circle
    :param radius: Radius of the circle
    :return: Median RGB values
    """
    mask = np.zeros(image.shape[:2], dtype="uint8")
    cv2.circle(mask, center, radius, 255, -1)
    masked = cv2.bitwise_and(image, image, mask=mask)
    median_rgb = [np.median(masked[:,:,i][mask==255]) for i in range(3)]
    #mask_colored = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)

    # Create an image that shows the mask as a semi-transparent overlay on the original image
    #overlay = cv2.addWeighted(image, 0.7, mask_colored, 0.3, 0)

    return median_rgb#, overlay

# Example usage (you need to define the center and radius as per your image)
# center = (x, y)  # Replace with the center of your circle
# radius = r  # Replace with the radius of your circle
# median_rgbs = [extract_median_rgb(image, center, radius) for image in images.values()]


In [45]:
centerpoint = (1530, 1300)
radius = 150 # 135
median_rgbs = [extract_median_rgb(image, centerpoint, radius) for image in images.values()]


#for i, (filename, image) in enumerate(images.items()):
#    median_rgb, overlay_image = extract_median_rgb(image, centerpoint, radius)
#    plt.figure(figsize=(10, 10))
#    plt.imshow(cv2.cvtColor(overlay_image, cv2.COLOR_BGR2RGB))
#    plt.title(f"Overlay for {filename}")
#    plt.show()

# Create a DataFrame
df = pd.DataFrame(median_rgbs, columns=['Red', 'Green', 'Blue'])
df['Carbon'] = carbon_data

In [46]:
# Split the data into features and target
X_train, X_test, y_train, y_test = train_test_split(df[['Red', 'Green', 'Blue']], df['Carbon'], test_size=0.2, random_state=42)

# Init range of hyperparameters for grid search
parameters = {
    'kernel': ['rbf'],
    'C': [1e2, 1e3, 1e4, 1e5],
    'gamma': [0.001, 0.0001],
    'epsilon': [0.1, 0.2, 0.3, 0.5]
}

#svr = SVR()

# Initialize Grid Search with cross-validation
#grid_search = GridSearchCV(svr, parameters, cv=5, scoring='neg_mean_squared_error')

# Fit Grid Search on the training data
#grid_search.fit(X_train, y_train)

# Get the best model
#best_model = grid_search.best_estimator_

random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)

predictions = random_forest_model.predict(X_test)

mse_test = mean_squared_error(y_test, predictions)

#print(f"Test Mean Squared Error with best model: {mse_test}")

# Display the best hyperparameters
#print(f"Best hyperparameters: {grid_search.best_params_}")

# View a single prediction
# For example, we can predict the carbon content for the first test sample
# single_prediction = model.predict(X_test.iloc[[0]])
# print(f"Actual carbon content for first test sample: {y_test.iloc[0]}")
# print(f"Predicted carbon content for first test sample: {single_prediction[0]}")
comparison_df = pd.DataFrame({'Actual Carbon': y_test, 'Predicted Carbon': predictions})
comparison_df = comparison_df.reset_index(drop=True)
print(comparison_df)

    Actual Carbon  Predicted Carbon
0        3.753918          2.535533
1        1.207889          2.719154
2        2.414188          1.577309
3        5.345058          3.532456
4        5.030955          0.440131
5        0.219490          2.635789
6        3.207504          2.785447
7        0.394477          1.812552
8        0.706166          1.736194
9        7.031554          3.941081
10       0.252477          7.004839
11       0.910742          0.866512
12       0.529596          5.481398
13       0.208919          3.817686
14       2.639057          1.449907
15       0.696031          0.669687
16       5.857052          1.319579
17       3.314152          8.624177
18       0.423149          0.773157
19       0.405053          8.410260
