In [43]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV

import dagshub
dagshub.init(repo_owner='Omdena', repo_name='IPage', mlflow=True)

import mlflow

In [44]:
# Initialize MLflow
mlflow.set_experiment('Taylor_v3_SOC_Baseline_20241225')
mlflow.autolog()

2024/12/25 11:20:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/12/25 11:20:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2024/12/25 11:20:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [45]:
data = pd.read_csv('merged_v3.csv')

data.info()
data['Area'].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2584 entries, 0 to 2583
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   longitude   2584 non-null   float64
 1   latitude    2584 non-null   float64
 2   Area        2584 non-null   object 
 3   Soil group  2584 non-null   object 
 4   Land class  2584 non-null   object 
 5   Soil type   2584 non-null   object 
 6   pH          2584 non-null   float64
 7   SOC         2584 non-null   float64
 8   Nitrogen    2584 non-null   float64
 9   Potassium   2584 non-null   float64
 10  Phosphorus  2584 non-null   float64
 11  Sulfur      2584 non-null   float64
 12  Boron       2584 non-null   float64
 13  Zinc        2584 non-null   float64
 14  Sand        2584 non-null   float64
 15  Silt        2584 non-null   float64
 16  Clay        2584 non-null   float64
dtypes: float64(13), object(4)
memory usage: 343.3+ KB


array(['Mithpukur', 'Pirgacha ', 'Gangachara', 'Kaunia upazila',
       'Taraganj Thana', 'Bauchi', 'Taraba', 'Plateau', 'Kaduna',
       'Nasarawa', 'Niger', 'Kebbi', 'Kano', 'Kwara', 'Katsina',
       'Adamawa'], dtype=object)

In [46]:
data.drop(columns=['longitude', 'latitude', 'Soil group', 'Boron', 'Zinc'], inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2584 entries, 0 to 2583
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        2584 non-null   object 
 1   Land class  2584 non-null   object 
 2   Soil type   2584 non-null   object 
 3   pH          2584 non-null   float64
 4   SOC         2584 non-null   float64
 5   Nitrogen    2584 non-null   float64
 6   Potassium   2584 non-null   float64
 7   Phosphorus  2584 non-null   float64
 8   Sulfur      2584 non-null   float64
 9   Sand        2584 non-null   float64
 10  Silt        2584 non-null   float64
 11  Clay        2584 non-null   float64
dtypes: float64(9), object(3)
memory usage: 242.4+ KB


In [47]:
# Define columns
numerical_cols = ['pH', 'Nitrogen', 'Potassium', 'Phosphorus',
                  'Sulfur', 'Sand', 'Silt', 'Clay']

categorical_cols = ['Area', 'Land class', 'Soil type']

target_col = 'SOC'

In [48]:
models = {
    'RandomForest': RandomForestRegressor(random_state=0),
    'XGB': XGBRegressor(random_state=0)
}

In [49]:


# Split data into features (X) and targets (y)
X = data.drop(columns=target_col)
y = data[target_col]

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        # One-hot encode categorical features
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
        # Scale numerical features
        ('scaler', MinMaxScaler(), numerical_cols)
    ],
    remainder='drop'  # Drop all other columns not specified (e.g., raw categorical columns)
)

# Train and test multiple models
for model_name, model in models.items():

    # Create pipeline
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('model', model)
    ])
        
    # Train the pipeline
    pipeline.fit(X_train, y_train)
        
    # Predict on the test set
    y_pred = pipeline.predict(X_test)

    # Evaluate and display results
    r2 = r2_score(y_test, y_pred)
    print(f"R² for {model_name}: {r2}")
    print("-" * 30)

        


2024/12/25 11:20:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '53910fd4a1564bb189526704c5b2e728', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run sedate-kite-138 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/53910fd4a1564bb189526704c5b2e728
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
R² for RandomForest: 0.7986670251808461
------------------------------


2024/12/25 11:20:43 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'bf9c58f6233c4f908965e6e035d7d7b2', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run masked-snipe-932 at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1/runs/bf9c58f6233c4f908965e6e035d7d7b2
🧪 View experiment at: https://dagshub.com/Omdena/IPage.mlflow/#/experiments/1
R² for XGB: 0.7707157252058728
------------------------------


In [52]:
# Upload using the DagsHub client, to a DVC tracked folder also called "data".
# Follow the instructions that appear to authorize the request.
from dagshub import upload_files
upload_files('Omdena/IPage', 'merged_v3.csv')