In [1]:
#Mount Google Drive so files under it can be read by path
from google.colab import drive
# Mount point used by the data path in the load cell
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#Load the CSV from the mounted drive and preview it
DATA_PATH = '/content/drive/MyDrive/OMDENA_PROJECTS/merged_v4.csv'
df = pd.read_csv(DATA_PATH)
# Report the dimensions first, then the first five rows (head() defaults to 5)
print('shape:', df.shape)
print(df.head())

shape: (2438, 17)
   longitude  latitude       Area Soil group   Land class   Soil type   pH  \
0    89.2767   25.5678  Mithpukur      belab  high ground  Clay loam   4.6   
1    89.2767   25.5678  Mithpukur      belab  high ground  Clay loam   5.3   
2    89.2767   25.5678  Mithpukur      belab  high ground  Clay loam   4.9   
3    89.2767   25.5678  Mithpukur      belab  high ground  Clay loam   5.2   
4    89.2767   25.5678  Mithpukur     Noadda  high ground  Clay loam   5.3   

    SOC  Nitrogen  Potassium  Phosphorus  Sulfur  Boron  Zinc  Sand  Silt  \
0  1.07      0.05       0.09        13.3    13.5   0.27  0.95  33.0  33.0   
1  1.08      0.11       0.17        20.5    27.8   0.30  1.04  33.0  33.0   
2  1.87      0.08       0.35        21.7    27.8   0.32  1.16  33.0  33.0   
3  1.51      0.08       0.50        18.5    25.6   0.26  1.05  33.0  33.0   
4  1.59      0.08       0.17        12.0    26.4   0.24  0.74  33.0  33.0   

   Clay  
0  33.0  
1  33.0  
2  33.0  
3  33.0  


In [4]:
#One-hot encode the categorical columns
# Each listed column is replaced by one indicator column per category.
# NOTE(review): df is overwritten, so re-running this cell alone fails because
# the original columns no longer exist -- re-run the load cell first.
categorical_cols = ['Soil type', 'Land class']
df = pd.get_dummies(df, columns=categorical_cols)
df.shape

(2438, 28)

In [5]:
#Define feature matrix X and target y
# Columns kept out of the features: coordinates, location/soil-group labels,
# Boron, Zinc, and the target itself (SOC).
excluded_cols = ['latitude', 'longitude', 'Soil group', 'Area', 'Boron', 'Zinc', 'SOC']
y = df['SOC']
X = df.drop(columns=excluded_cols)

In [7]:
#Standardize the feature columns
from sklearn.preprocessing import StandardScaler
# NOTE(review): this scaler is fit on every row, before the train/test split
# below, and the model pipelines add their own StandardScaler step as well --
# confirm whether this global scaling step is still wanted.
scaler = StandardScaler()
scaler.fit(X)
# X is replaced by the array returned from the scaler
X = scaler.transform(X)

In [10]:
#Split into train and test sets (20% held out, fixed seed)
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
#Fit random forest regression model
from sklearn.ensemble import RandomForestRegressor
# Pipeline must be imported in this cell: the only other visible import of it is
# in the ridge cell further down, so running the notebook top to bottom on a
# fresh kernel would otherwise raise NameError here.
from sklearn.pipeline import Pipeline
# Create a pipeline with StandardScaler and a random forest regressor
# (50 trees, depth capped at 7, fixed random_state so runs are repeatable)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(n_estimators=50, max_depth=7, random_state=42))
])

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions for both splits
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

# Evaluate the model on the held-out test split
print("SOC R²:", r2_score(y_test, y_pred_test))
print("SOC MAE:", mean_absolute_error(y_test, y_pred_test))

SOC R²: 0.8226483950437187
SOC MAE: 0.16077150978853297


In [9]:
#Ridge regression model
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
# Standardize the features, then fit Ridge with its default settings
ridge_steps = [
    ('scaler', StandardScaler()),
    ('regressor', Ridge()),
]
pipeline = Pipeline(steps=ridge_steps)

# Fit on the training split only
pipeline.fit(X_train, y_train)

# Predictions for the training and test splits
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

# Report R² and mean absolute error on the test split
test_r2 = r2_score(y_test, y_pred_test)
test_mae = mean_absolute_error(y_test, y_pred_test)
print("SOC R²:", test_r2)
print("SOC MAE:", test_mae)

SOC R²: 0.8211742850006953
SOC MAE: 0.16460633258363938
