In [1]:
import pandas as pd
# Loading data
data = pd.read_csv("preprocessed_700.csv")


In [2]:
# Convert Yield from t/ha to kg/m²
data['Yield'] = data['Yield'] * 0.1

In [5]:
nitrogen_ratio = []
phosphorus_ratio = []
potassium_ratio = []

for entry in data['N-P-K Ratio']:
  ratio = entry.split(':')
  nitrogen_ratio.append(float(ratio[0]))
  phosphorus_ratio.append(float(ratio[1]))
  potassium_ratio.append(float(ratio[2]))

In [7]:
data['Nitrogen_Ratio'] = nitrogen_ratio
data['Phosphorus_Ratio'] = phosphorus_ratio
data['Potassium_Ratio'] = potassium_ratio
data.drop(columns=['N-P-K Ratio'], inplace=True)

In [8]:
# Define features and target
X = data.drop('Yield', axis=1)
y = data['Yield']

In [9]:
data.head()

Unnamed: 0,Name,Fertility,Photoperiod,Temperature,Rainfall,pH,Light_Hours,Light_Intensity,Rh,Nitrogen,Phosphorus,Potassium,Yield,Category_pH,Soil_Type,Season,Nitrogen_Ratio,Phosphorus_Ratio,Potassium_Ratio
0,Apple,High,Day Neutral,21.063204,1932.402709,6.567764,12.716549,860.189066,92.677579,89.266502,40.330099,180.63574,1.284748,low_acidic,Sandy Loam,Fall,10.0,10.0,10.0
1,Apple,High,Day Neutral,19.511305,1589.295994,6.784538,13.54456,797.66076,92.293923,92.80815,37.131922,179.042979,1.389429,neutral,Sandy Loam,Fall,10.0,10.0,10.0
2,Apple,High,Day Neutral,23.045662,1269.789133,6.619155,12.330668,910.861369,91.798926,84.24859,38.693498,163.604138,1.33722,low_acidic,Sandy Loam,Fall,10.0,10.0,10.0
3,Apple,High,Short Day Period,17.986016,1944.180144,6.638623,12.96534,922.725203,92.74271,84.780429,43.950592,173.881606,1.180157,neutral,Sandy Loam,Spring,10.0,10.0,10.0
4,Apple,High,Day Neutral,23.775354,1790.352815,6.654898,12.895817,821.411003,90.98153,91.197126,45.56447,174.324935,1.066052,neutral,Sandy Loam,Fall,10.0,10.0,10.0


In [10]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# List of categorical and numerical columns
categorical_features = ['Name', 'Fertility', 'Photoperiod', 'Category_pH', 'Soil_Type', 'Season']
numerical_features = ['Temperature', 'Rainfall', 'pH', 'Light_Hours', 'Light_Intensity', 'Rh', 'Nitrogen', 'Phosphorus', 'Potassium', 'Nitrogen_Ratio', 'Phosphorus_Ratio', 'Potassium_Ratio']

# Preprocessing pipeline for categorical and numerical data
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ])


In [11]:
from sklearn.model_selection import train_test_split
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Create a pipeline with preprocessing and Random Forest model
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
rf_model.fit(X_train, y_train)

In [13]:
# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f'Random Forest MSE: {mse_rf}')
print(f'Random Forest R^2: {r2_rf}')

Random Forest MSE: 0.013452098705943494
Random Forest R^2: 0.993465817489565


In [14]:
import xgboost as xgb

# Create a pipeline with preprocessing and XGBoost model
xgb_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(n_estimators=100, random_state=42))
])

# Train the model
xgb_model.fit(X_train, y_train)

In [15]:
# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f'XGBoost MSE: {mse_xgb}')
print(f'XGBoost R^2: {r2_xgb}')

XGBoost MSE: 0.012850291599053824
XGBoost R^2: 0.9937581374879869


INFERENCE SESSION FOR A CUSTOM INPUT

In [16]:
def predict_yield(custom_input, model):
    # Convert custom input into a DataFrame
    input_df = pd.DataFrame([custom_input])
    
    # Extract N-P-K ratios from custom input
    ratio = input_df['N-P-K Ratio'].iloc[0].split(':')
    input_df['Nitrogen_Ratio'] = float(ratio[0])
    input_df['Phosphorus_Ratio'] = float(ratio[1])
    input_df['Potassium_Ratio'] = float(ratio[2])
    
    # Drop the original 'N-P-K Ratio' column
    input_df.drop(columns=['N-P-K Ratio'], inplace=True)
    
    # Preprocess the input and make a prediction
    yield_pred = model.predict(input_df)
    
    return yield_pred[0]

In [18]:
# Custom input for prediction
custom_input = {
    'Name': 'Apple',
    'Fertility': 'High',
    'Photoperiod': 'Day Neutral',
    'N-P-K Ratio': '10:10:10',
    'Temperature': 22.0,
    'Rainfall': 1500.0,
    'pH': 6.5,
    'Light_Hours': 12.0,
    'Light_Intensity': 800.0,
    'Rh': 90.0,
    'Nitrogen': 100.0,
    'Phosphorus': 50.0,
    'Potassium': 150.0,
    'Category_pH': 'low_acidic',
    'Soil_Type': 'Sandy Loam',
    'Season': 'Fall'
}

# Predict yield using Random Forest model
rf_yield = predict_yield(custom_input, rf_model)
print(f'Predicted Yield with Random Forest: {rf_yield:.2f} kg/m²')

# Predict yield using XGBoost model
xgb_yield = predict_yield(custom_input, xgb_model)
print(f'Predicted Yield with XGBoost: {xgb_yield:.2f} kg/m²')

Predicted Yield with Random Forest: 1.01 kg/m²
Predicted Yield with XGBoost: 1.06 kg/m²


From the predicted yield using the custom input the yield prediction of 1.06 kg/m² aligns prefectly with our dataset so XGBoost is a better choice for custom data prediction