# 📈 AgriTech - Crop Yield Prediction Demo

This notebook demonstrates the **Crop Yield Prediction** model from the AgriTech project.

It predicts the expected crop yield (in tons/hectare) based on:
- Crop type and cultivation year
- Season and geographic state
- Area, rainfall, fertilizer and pesticide usage

---
**GitHub:** https://github.com/omroy07/AgriTech

## 📦 Step 1: Install Dependencies

In [None]:
!pip install numpy pandas scikit-learn matplotlib seaborn xgboost joblib

## 📚 Step 2: Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print('‚úÖ All libraries imported!')

## 📊 Step 3: Load Sample Yield Dataset

In [None]:
# Sample dataset - subset of Indian crop production data
# Full dataset: https://www.kaggle.com/datasets/srinivas1/agricuture-crops-production-in-india

data = {
    'Crop':            ['Wheat','Rice','Maize','Cotton','Sugarcane','Wheat','Rice','Groundnut',
                        'Soybean','Bajra','Jowar','Mustard','Barley','Chickpea','Lentil',
                        'Wheat','Rice','Maize','Cotton','Sugarcane'],
    'Crop_Year':       [2022,2022,2022,2022,2022,2023,2023,2023,
                        2023,2023,2023,2022,2022,2021,2021,
                        2021,2021,2021,2021,2021],
    'Season':          ['Rabi','Kharif','Kharif','Kharif','Whole Year','Rabi','Kharif','Kharif',
                        'Kharif','Kharif','Kharif','Rabi','Rabi','Rabi','Rabi',
                        'Rabi','Kharif','Kharif','Kharif','Whole Year'],
    'State':           ['Punjab','West Bengal','Karnataka','Maharashtra','Uttar Pradesh',
                        'Haryana','Andhra Pradesh','Gujarat','Madhya Pradesh','Rajasthan',
                        'Maharashtra','Rajasthan','Himachal Pradesh','Madhya Pradesh','Bihar',
                        'Uttar Pradesh','Tamil Nadu','Bihar','Telangana','Maharashtra'],
    'Area':            [1500,2500,800,3200,4000,1800,3000,1200,
                        2000,900,700,1100,400,600,300,
                        1400,2800,750,2900,3800],
    'Annual_Rainfall': [750,1600,900,650,1100,680,1200,580,
                        1000,400,700,350,1500,550,900,
                        720,1400,950,700,1050],
    'Fertilizer':      [200,350,180,275,500,220,320,150,
                        280,90,110,160,80,70,60,
                        195,340,170,265,490],
    'Pesticide':       [15,22,10,30,45,18,25,8,
                        20,5,7,12,4,6,3,
                        14,21,9,28,42],
    'Production':      [6300,14500,2480,7680,280000,7740,17400,1440,
                        6200,1530,1260,1980,840,780,240,
                        5880,15680,2325,7540,268300]
}

df = pd.DataFrame(data)
# Calculate yield per hectare
df['Yield'] = df['Production'] / df['Area']

print('üìä Dataset shape:', df.shape)
print('\nSample data:')
df[['Crop','Season','State','Area','Annual_Rainfall','Fertilizer','Pesticide','Yield']].head(8)

## 🔧 Step 4: Preprocessing - Encode Categorical Features

In [None]:
# One LabelEncoder per categorical column, kept as separate objects so each
# one remembers the label-to-integer mapping of its own column.
le_crop, le_season, le_state = LabelEncoder(), LabelEncoder(), LabelEncoder()

# assign() returns a new frame, so the raw `df` is left untouched
df_encoded = df.assign(
    Crop_enc=le_crop.fit_transform(df['Crop']),
    Season_enc=le_season.fit_transform(df['Season']),
    State_enc=le_state.fit_transform(df['State']),
)

# Model inputs (in this column order) and the regression target
features = [
    'Crop_enc', 'Crop_Year', 'Season_enc', 'State_enc',
    'Area', 'Annual_Rainfall', 'Fertilizer', 'Pesticide',
]
X = df_encoded.loc[:, features]
y = df_encoded['Yield']

print('‚úÖ Features shape:', X.shape)
print('\nEncoded feature sample:')
print(X.head(4))

## 🏋️ Step 5: Train Multiple Models and Compare

In [None]:
# Candidate locations for a pre-trained model, checked in order; the first
# path that exists wins.
model_paths = [
    '../Crop Yield Prediction/model.pkl',
    '../Crop Yield Prediction/yield_model.pkl',
    'yield_model.pkl'
]

best_model = None

# Fit the scaler on every code path, not only when demo models are trained.
# Later cells call scaler.transform() no matter where the model came from, and
# an unfitted StandardScaler raises NotFittedError at that point.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

for path in model_paths:
    if os.path.exists(path):
        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code. Only place model files from a trusted source at these paths.
        # NOTE(review): a loaded model will receive inputs scaled by the demo
        # scaler fitted above — confirm it was trained on the same columns and
        # the same scaling before trusting its predictions.
        best_model = joblib.load(path)
        print(f'‚úÖ Pre-trained model loaded from: {path}')
        break

if best_model is None:
    print('‚ö†Ô∏è No pre-trained model found. Training demo models...')

    models = {
        'Linear Regression':        LinearRegression(),
        'Random Forest':            RandomForestRegressor(n_estimators=100, random_state=42),
        'Gradient Boosting':        GradientBoostingRegressor(n_estimators=100, random_state=42)
    }

    results = []
    print('\nüìä Model Comparison (trained on full demo dataset):')
    print('-' * 60)
    print(f'{"Model":<25} {"R¬≤ Score":<12} {"MAE":<12} {"RMSE"}')
    print('-' * 60)

    # Each model is fitted and scored on the same rows, so the metrics below
    # are in-sample: they show how well a model reproduces the demo data, not
    # how well it generalizes to unseen records.
    for name, m in models.items():
        m.fit(X_scaled, y)
        preds = m.predict(X_scaled)
        r2  = r2_score(y, preds)
        mae = mean_absolute_error(y, preds)
        rmse = np.sqrt(mean_squared_error(y, preds))
        print(f'{name:<25} {r2:<12.4f} {mae:<12.2f} {rmse:.2f}')
        results.append((r2, name, m))

    print('-' * 60)
    # Keep the fitted model with the highest (in-sample) R² score
    best_r2, best_name, best_model = max(results, key=lambda x: x[0])
    print(f'\nüèÜ Best model: {best_name} (R¬≤ = {best_r2:.4f})')

## 🌾 Step 6: Predict Yield for New Inputs

In [None]:
def predict_yield(crop, crop_year, season, state, area,
                  annual_rainfall, fertilizer, pesticide,
                  model, scaler, le_crop, le_season, le_state):
    """
    Predict crop yield for a single set of input parameters.

    Parameters
    ----------
    crop, season, state
        Category labels, encoded with ``le_crop``, ``le_season`` and
        ``le_state`` respectively.
    crop_year, area, annual_rainfall, fertilizer, pesticide
        Numeric inputs, placed in the feature row as given.
    model
        Fitted estimator exposing ``predict``.
    scaler
        Fitted transformer exposing ``transform``.
    le_crop, le_season, le_state
        Fitted encoders exposing ``transform``.

    Returns
    -------
    tuple
        ``(predicted_yield, None)`` on success, where the yield (tons per
        hectare) is rounded to 3 decimals, or ``(None, message)`` when an
        encoder raises ``ValueError`` for a label.
    """
    try:
        crop_enc   = le_crop.transform([crop])[0]
        season_enc = le_season.transform([season])[0]
        state_enc  = le_state.transform([state])[0]
    except ValueError as e:
        return None, f'Unknown category: {e}'

    # Single-row feature matrix, in the positional order the pipeline expects.
    row = np.array([[crop_enc, crop_year, season_enc, state_enc,
                     area, annual_rainfall, fertilizer, pesticide]])

    # A scaler fitted on a DataFrame records its column names and warns when it
    # is later handed a bare array. Label the row with the scaler's own names
    # (positionally, so the numbers are unchanged) to give it matching input.
    column_names = getattr(scaler, 'feature_names_in_', None)
    if column_names is not None and len(column_names) == row.shape[1]:
        row = pd.DataFrame(row, columns=list(column_names))

    features_scaled = scaler.transform(row)
    predicted_yield = model.predict(features_scaled)[0]
    return round(predicted_yield, 3), None


# --- Test Predictions ---
# Each tuple: (crop, year, season, state, area, rainfall, fertilizer, pesticide)
test_cases = [
    ('Wheat', 2024, 'Rabi', 'Punjab', 1500, 750, 200, 15),
    ('Rice', 2024, 'Kharif', 'West Bengal', 2500, 1600, 350, 22),
    ('Sugarcane', 2024, 'Whole Year', 'Uttar Pradesh', 4000, 1100, 500, 45),
    ('Maize', 2023, 'Kharif', 'Karnataka', 800, 900, 180, 10),
    ('Cotton', 2024, 'Kharif', 'Maharashtra', 3200, 650, 275, 30),
]

rule = '-' * 75
print('\nüåæ Yield Predictions:')
print(rule)
print(f'{"Crop":<12} {"Year":<6} {"Season":<12} {"State":<18} {"Yield (t/ha)"}')
print(rule)

for case in test_cases:
    # Only the first four fields are echoed in the table; the whole tuple is
    # forwarded positionally to predict_yield.
    crop, year, season, state = case[:4]
    yield_val, err = predict_yield(*case, best_model, scaler, le_crop, le_season, le_state)
    if not err:
        print(f'{crop:<12} {year:<6} {season:<12} {state:<18} {yield_val}')
    else:
        print(f'{crop:<12} {year:<6} {season:<12} {state:<18} ERROR: {err}')
print(rule)

## 📦 Step 7: Batch Predict from JSON File

In [None]:
# Batch prediction: read samples from sample_yield_input.json (expected next to
# the notebook), predict each one, and tabulate predicted vs expected yield.
try:
    # JSON is read as UTF-8 explicitly instead of relying on the platform's
    # default text encoding.
    with open('sample_yield_input.json', 'r', encoding='utf-8') as f:
        sample_data = json.load(f)

    print(f'üìÇ Loaded {len(sample_data["batch_predictions"])} samples from sample_yield_input.json')
    results = []
    for s in sample_data['batch_predictions']:
        yld, err = predict_yield(
            s['crop'], s['crop_year'], s['season'], s['state'],
            s['area'], s['annual_rainfall'], s['fertilizer'], s['pesticide'],
            best_model, scaler, le_crop, le_season, le_state
        )
        if err:
            # Report why a sample could not be predicted; otherwise the row
            # would show predicted=None with no explanation.
            print(f"  Prediction failed for {s['crop']}: {err}")
        results.append({'crop': s['crop'], 'predicted': yld, 'expected': s.get('expected_yield_tons','N/A')})

    results_df = pd.DataFrame(results)
    print('\nüìä Batch Prediction Results:')
    print(results_df.to_string(index=False))
except FileNotFoundError:
    print('‚ö†Ô∏è sample_yield_input.json not found. Please ensure it is in the same directory.')

## 📈 Step 8: Visualize Predicted vs Expected Yield

In [None]:
# Run the selected model over the full feature matrix X for plotting
X_scaled_all = scaler.transform(X)
y_pred = best_model.predict(X_scaled_all)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_scatter, ax_bar = axes

# Left panel: actual vs predicted yield, with the y = x reference line
ax_scatter.scatter(y, y_pred, color='#2E8B57', alpha=0.7, edgecolors='#006400', s=80)
max_val = max(y.max(), np.max(y_pred))
ax_scatter.plot([0, max_val], [0, max_val], 'r--', linewidth=1.5, label='Perfect Prediction')
ax_scatter.set(
    xlabel='Actual Yield (t/ha)',
    ylabel='Predicted Yield (t/ha)',
    title='Actual vs Predicted Yield',
)
ax_scatter.legend()

# Right panel: mean recorded yield per crop, highest first
df['Predicted_Yield'] = y_pred
crop_avg = df.groupby('Crop')['Yield'].mean().sort_values(ascending=False)
colors = plt.cm.Greens(np.linspace(0.4, 0.9, len(crop_avg)))
ax_bar.bar(crop_avg.index, crop_avg.values, color=colors)
ax_bar.set(
    xlabel='Crop',
    ylabel='Average Yield (t/ha)',
    title='Average Yield by Crop Type',
)
ax_bar.tick_params(axis='x', rotation=45)

# Save a copy to disk, then render inline
fig.tight_layout()
fig.savefig('yield_prediction_chart.png', dpi=150, bbox_inches='tight')
plt.show()
print('üìä Chart saved as yield_prediction_chart.png')

## 🧪 Step 9: Try Your Own Values!

In [None]:
# EDIT THESE VALUES to predict for your farm

my_crop = 'Wheat'         # Crop name
my_crop_year = 2024       # Year of cultivation
my_season = 'Rabi'        # Kharif | Rabi | Zaid | Whole Year
my_state = 'Punjab'       # Indian state
my_area = 1500.0          # Area in hectares
my_rainfall = 750.5       # Annual rainfall in mm
my_fertilizer = 200.0     # Fertilizer in kg/ha
my_pesticide = 15.0       # Pesticide in kg/ha

# ------- Run Prediction -------
predicted_yield, err = predict_yield(
    my_crop, my_crop_year, my_season, my_state,
    my_area, my_rainfall, my_fertilizer, my_pesticide,
    best_model, scaler, le_crop, le_season, le_state,
)

if not err:
    # Scale the per-hectare prediction up to the whole cultivated area
    total_production = predicted_yield * my_area
    banner = '=' * 45
    divider = '-' * 45
    report_lines = [
        banner,
        '      üìà YIELD PREDICTION RESULT',
        banner,
        f'  Crop             : {my_crop}',
        f'  Year             : {my_crop_year}',
        f'  Season           : {my_season}',
        f'  State            : {my_state}',
        f'  Area             : {my_area} ha',
        divider,
        f'  ‚úÖ Yield          : {predicted_yield} tons/ha',
        f'  üì¶ Total Produce  : {total_production:.1f} tons',
        banner,
    ]
    for line in report_lines:
        print(line)
else:
    print(f'‚ùå Error: {err}')