* **Tasfiq Kamran**
* tasfiq.kamran@gmail.com

In [None]:
# Switch the working directory to the dataset folder so the relative CSV path used later resolves.
# NOTE(review): the path looks like a Colab Google Drive mount — presumably Drive must be mounted first; confirm.
%cd /content/drive/MyDrive/Dataset/ipage

/content/drive/MyDrive/Dataset/ipage


# Imports

In [None]:
import numpy as np
import pandas as pd
import scipy
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import os

# sklearn
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict

# Data processing and Training

In [None]:
# Load the cleaned soil dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('IPAGE_SoilData_cleaned.csv')

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,Area,Data Collection Year,soil group,Land class,knit (surface),pH,SOC (%),Nitrogen N (%),Potassium K (meq/100),Phosphorus P (ug/g),Sulfur S (ug/g),Boron B (ug/g),Zinc Zn (ug/g)
0,Mithpukur,2005,belab,high ground,Clay loam,5.0,1.27,0.08,0.15,19.6,37.7,0.26,0.86
1,Mithpukur,2005,belab,high ground,Clay loam,4.9,1.47,0.09,0.25,4.1,32.0,0.25,0.75
2,Mithpukur,2005,belab,high ground,Clay loam,4.6,1.07,0.05,0.09,13.3,13.5,0.27,0.95
3,Mithpukur,2005,belab,high ground,Clay loam,5.2,1.51,0.06,0.3,20.2,30.0,0.28,1.0
4,Mithpukur,2005,belab,high ground,Clay loam,5.3,1.08,0.11,0.17,20.5,27.8,0.3,1.04


In [None]:
# Keep only the numerical measurements plus a single categorical feature (Area);
# the remaining descriptive columns are discarded.

df = df.drop(
    columns=[
        'Data Collection Year',
        'soil group',
        'Land class',
        'knit (surface)',
    ]
)

In [None]:
# Columns to predict: SOC, boron (B) and zinc (Zn).
target_columns = ['SOC (%)', 'Boron B (ug/g)', 'Zinc Zn (ug/g)']
# Every other column of the frame is treated as an input feature.
feature_columns = list(filter(lambda name: name not in target_columns, df.columns))

In [None]:
X = df.loc[:, feature_columns]  # feature matrix
y = df.loc[:, target_columns]   # target matrix

In [None]:
# Partition the feature columns by dtype: object columns get encoded,
# int64/float64 columns get scaled.

numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include='object').columns

In [None]:
# Feature preprocessing: one-hot encode the categorical columns (dropping the
# first level of each) and standardise the numerical columns.

feature_preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols),
    ('num', StandardScaler(), numerical_cols),
])

In [None]:
# Hold out 20% of the rows for testing, stratified on the Area column.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=df['Area'],
    random_state=42,
)


In [None]:
# K-Fold split: build the shared splitter and preview the partition sizes of each fold.
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
    # Adjacent string literals are concatenated by the parser, so the message can
    # span two source lines without the indentation leaking into the printed
    # text (a backslash continuation inside the literal embeds it).
    print(f'Fold:{fold}, Train set: {len(train_index)}, '
          f'Test set:{len(test_index)}')

Fold:0, Train set: 444,     Test set:50
Fold:1, Train set: 444,     Test set:50
Fold:2, Train set: 444,     Test set:50
Fold:3, Train set: 444,     Test set:50
Fold:4, Train set: 445,     Test set:49
Fold:5, Train set: 445,     Test set:49
Fold:6, Train set: 445,     Test set:49
Fold:7, Train set: 445,     Test set:49
Fold:8, Train set: 445,     Test set:49
Fold:9, Train set: 445,     Test set:49


In [None]:
# Standardise the targets with statistics learned from the training split only,
# then apply the same transform to the test split.

target_scaler = StandardScaler().fit(y_train)
y_train_scaled = target_scaler.transform(y_train)
y_test_scaled = target_scaler.transform(y_test)

In [None]:
# Candidate hyper-parameter values shared between the per-model grids below.
alpha = [1, 5, 10]
solver = ['sparse_cg']
l1_ratio = [0.3, 0.5, 0.7, 1.0]
max_iter = [100, 500, 1000]
n_estimator = [100, 300, 500]
learning_rate = [0.1, 0.5, 1.0, 5.0]
loss_func = ['linear', 'square']
min_samples_split = [2, 3, 5, 7]
min_samples_leaf = [1, 3, 5, 7]

# Per-model search grids, keyed by a short model tag.
search_params = {
    'ridge': {'alpha': alpha, 'max_iter': max_iter},
    'lasso': {'alpha': alpha, 'max_iter': max_iter},
    'elasticNet': {'alpha': alpha, 'max_iter': max_iter, 'l1_ratio': l1_ratio},
    'adaboost': {'n_estimators': n_estimator, 'learning_rate': learning_rate, 'loss': loss_func},
    'bagging': {'n_estimators': n_estimator},
    'rf': {
        'n_estimators': n_estimator,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
    },
    'xgb': {'n_estimators': n_estimator, 'learning_rate': learning_rate},
}

# Model registry: display name -> unfitted estimator. Insertion order is the
# order in which the models are evaluated. Estimators wrapped in
# MultiOutputRegressor are fitted once per target column.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(random_state=0),
    'Lasso Regression': Lasso(random_state=0),
    'ElasticNet': ElasticNet(random_state=0),
    'Adaboost Regression': MultiOutputRegressor(AdaBoostRegressor(random_state=0)),
    'Bagging Regression': MultiOutputRegressor(BaggingRegressor(random_state=0)),
    'Random Forest': RandomForestRegressor(random_state=0),
    'Support Vector Regression': MultiOutputRegressor(SVR()),
    'XGBoost': XGBRegressor(objective='reg:squarederror', random_state=0),
}

model_list = [*models]

In [None]:
# Display the registered model names.
model_list

['Linear Regression',
 'Ridge Regression',
 'Lasso Regression',
 'ElasticNet',
 'Adaboost Regression',
 'Bagging Regression',
 'Random Forest',
 'Support Vector Regression',
 'XGBoost']

In [None]:
# Initialize dictionary to store results; evaluate_model fills it with one
# metrics entry per model, keyed by the model name.
results = {}

def cross_validation(reg_model, model_name, X, y, cv):
    """Cross-validate ``reg_model`` behind the shared feature preprocessor.

    The regressor is placed in a pipeline after the module-level
    ``feature_preprocessor``, scored with ``cross_validate`` on negative MSE
    and R2, and a per-fold plus fold-averaged report is printed.

    Parameters
    ----------
    reg_model : estimator
        Regressor used as the final pipeline step.
    model_name : str
        Label shown in the printed report.
    X, y
        Features and targets, forwarded unchanged to ``cross_validate``.
    cv
        Cross-validation strategy, forwarded as ``cv`` to ``cross_validate``.

    Returns
    -------
    dict
        Fold-averaged metrics under the keys ``'MSE'``, ``'RMSE'`` and ``'R2'``.
    """
    model_pipeline = Pipeline(steps=[
        ('preprocessor', feature_preprocessor),
        ('regressor', reg_model),
    ])

    scoring = ['neg_mean_squared_error', 'r2']
    score = cross_validate(model_pipeline, X, y, cv=cv, scoring=scoring)

    # The scorer reports MSE negated, so flip the sign back to a positive error.
    mse_arr = -1 * score['test_neg_mean_squared_error']
    rmse_arr = np.sqrt(mse_arr)

    mse = mse_arr.mean()
    rmse = rmse_arr.mean()  # mean of the per-fold RMSEs, not sqrt of the mean MSE
    r2 = score['test_r2'].mean()

    print('-' * 20 + f"{model_name} Cross validation" + '-' * 20)
    print(f"MSE scores: {mse_arr}\n")
    print(f"RMSE scores: {rmse_arr}\n")

    print(f"Mean MSE: {round(mse, 2)}\n")
    # Label fixed: the value printed here is the mean RMSE (was "Mean RSE").
    print(f"Mean RMSE: {round(rmse, 2)}\n")
    print(f"Mean R2: {round(r2, 2)}")
    print('--' * 60 + '\n')

    return {'MSE': mse, 'RMSE': rmse, 'R2': r2}

# Helper that reports and records the metrics of one model's predictions
def evaluate_model(y_true, y_pred, model_name):
    """Print and store the error metrics of ``y_pred`` against ``y_true``.

    MSE and MAE are computed per target (``multioutput='raw_values'``), while
    R² is a single variance-weighted score across targets. The three metrics
    are saved in the module-level ``results`` dict under ``model_name``.
    """
    per_target_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    per_target_mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
    weighted_r2 = r2_score(y_true, y_pred, multioutput='variance_weighted')

    print(f"{model_name} Performance:")
    print(f"MSE: {per_target_mse}")
    print(f"MAE: {per_target_mae}")
    print(f"R² Score: {weighted_r2}")

    results[model_name] = {
        'MSE': per_target_mse,
        'MAE': per_target_mae,
        'R² Score': weighted_r2,
    }
    print("-" * 30)


print("Starting Cross Validation.....\n")

# Cross-validate every registered model on the training split with scaled targets.
for model_name, model in models.items():
    cross_validation(
        reg_model=model,
        model_name=model_name,
        X=X_train,
        y=y_train_scaled,
        cv=kf,
    )

Starting Cross Validation.....

--------------------Linear Regression Cross validation--------------------
MSE scores: [ 0.41930744  0.33872462  0.47438811  1.70808662  0.35444051 25.56243846
  0.40043312  2.67919466  0.4333085   0.28997424]

RMSE scores: [0.64753953 0.58200054 0.68875838 1.30693788 0.59534906 5.05593102
 0.63279785 1.63682457 0.65826173 0.53849256]

Mean MSE: 3.27

Mean RSE: 1.23

Mean R2: -5.11
------------------------------------------------------------------------------------------------------------------------

--------------------Ridge Regression Cross validation--------------------
MSE scores: [ 0.42106883  0.33006214  0.47359571  1.71365808  0.35352331 26.26355956
  0.40474699  2.67549266  0.43346435  0.29017259]

RMSE scores: [0.64889817 0.57451034 0.68818291 1.30906764 0.59457827 5.12479849
 0.63619729 1.63569333 0.65838009 0.53867671]

Mean MSE: 3.34

Mean RSE: 1.24

Mean R2: -5.26
-----------------------------------------------------------------------------

**Linear Regression**, **Ridge**, **Lasso** and **ElasticNet** didn't perform well; the performance of **Lasso** and **ElasticNet** was similar. **XGBoost** didn't perform well either.

The **Support Vector Regression** model outperformed all the other models. **Random Forest** also performed very well, and **AdaBoost** and **Bagging** showed good performance too.

**Outliers** were not handled during this training run.

# Handling Outliers

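Experiment: replace outliers (values more than 1.5 × IQR below the first quartile or above the third quartile of a column) with that column's **median** value.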

In [None]:
# Replace IQR outliers in every numerical feature and target column with the
# median of that column.
# NOTE(review): quartiles and medians are computed on the whole `df` rather than
# on a training subset, and `df` is overwritten in place, so re-running this
# cell re-applies the rule to already-treated data — confirm this is intended.

for col in [*numerical_cols, *target_columns]:
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)

    lower_bound = q1 - whisker
    upper_bound = q3 + whisker

    is_outlier = (values < lower_bound) | (values > upper_bound)
    df[col] = np.where(is_outlier, values.median(), values)

In [None]:
# Rebuild the feature/target frames from the current `df`, then redo the
# stratified 80/20 split and the target standardisation.
X = df.loc[:, feature_columns]
y = df.loc[:, target_columns]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=df['Area'], random_state=42
)

target_scaler = StandardScaler().fit(y_train)
y_train_scaled = target_scaler.transform(y_train)
y_test_scaled = target_scaler.transform(y_test)

In [None]:
# Cross-validate every registered model again, using the current training split
# and scaled targets.
for model_name, model in models.items():
    cross_validation(model, model_name, X_train, y_train_scaled, kf)

--------------------Linear Regression Cross validation--------------------
MSE scores: [0.5301417  0.79197769 0.55409905 0.55460643 0.66767431 0.77104066
 0.7667371  0.66002254 0.60013208 0.72156832]

RMSE scores: [0.7281083  0.88993128 0.7443783  0.74471903 0.8171134  0.87808921
 0.87563526 0.81241772 0.77468192 0.84945178]

Mean MSE: 0.66

Mean RSE: 0.81

Mean R2: 0.31
------------------------------------------------------------------------------------------------------------------------

--------------------Ridge Regression Cross validation--------------------
MSE scores: [0.53178192 0.77712673 0.55753996 0.54635665 0.66765767 0.76884364
 0.77912649 0.66593028 0.60474133 0.724845  ]

RMSE scores: [0.72923379 0.88154791 0.74668598 0.73915942 0.81710322 0.87683729
 0.88268142 0.81604552 0.77765116 0.85137829]

Mean MSE: 0.66

Mean RSE: 0.81

Mean R2: 0.31
------------------------------------------------------------------------------------------------------------------------

---------

Other than **Lasso** and **ElasticNet**, the overall performance of all the other models improved significantly. The most significant improvement was shown by **Linear Regression**, **Ridge Regression** and **XGBoost**, all of which previously had negative $R^2$ values.