# Feature Engineering (Optimized)

This notebook builds the training and validation datasets by merging three real data sources, using vectorized operations for efficiency:
1. **Historical Crop Performance**: Actual yields per crop-province-year
2. **Soil Test Data**: Real farmer soil conditions (NPK, pH) aggregated by province
3. **Climate Data**: 5-year averages (2020-2024) for temperature, rainfall, humidity

The key improvements are:
- **Vectorized Feature Extraction**: Replaced slow `iterrows()` loop with fully vectorized operations.
- **Efficient Data Merging**: Optimized merging of crop, climate, and soil data.
- **Dynamic Suitability Score**: Scores each record by scaling its yield against the 95th percentile of that crop's historical yields, clipped to the 0–100 range.

In [1]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
from sklearn.model_selection import train_test_split

# Add app to path to import modules.
# NOTE(review): "../" is resolved against the kernel's working directory, so this
# presumably expects the kernel to be started from the notebook's own folder — confirm.
project_root = Path("../").resolve()
if str(project_root) not in sys.path:
    # Guarded so that re-running this cell does not keep appending duplicate entries.
    sys.path.append(str(project_root))

# Project-local imports must come after the sys.path tweak above.
from app.services.data_loader import DataLoader
from app.services.feature_extractor import FeatureExtractor

# Initialize services
data_loader = DataLoader()
data_loader.load_all_data()
feature_extractor = FeatureExtractor()

print("Data loaded successfully!")

Loading datasets...


Datasets loaded. Creating unified database and climate averages...
Pre-calculated climate averages for 80 provinces.
Initialization complete.
Data loaded successfully!


In [2]:
# Load the historical performance table and derive a yield column from it:
# production volume divided by planted/harvested area.
# NOTE(review): the column is written onto the object returned by the loader; if
# that is the loader's own frame rather than a copy, the loader sees it too — confirm.
historical_perf = data_loader.historical_performance
historical_perf['yield_per_ha'] = historical_perf['Volume_Production'].div(
    historical_perf['Area_Planted_Harvested']
)

# Keep only rows whose yield is present, not +inf (division by a zero area) and
# strictly positive. The result is an independent copy, safe to modify later.
yield_values = historical_perf['yield_per_ha']
has_usable_yield = (
    yield_values.notna()
    & yield_values.ne(float('inf'))
    & yield_values.gt(0)
)
historical_perf_clean = historical_perf.loc[has_usable_yield].copy()
print(f"Valid yield records: {len(historical_perf_clean)}")

Valid yield records: 93369


In [3]:
# Merge datasets
def _normalize_province(names):
    """Strip surrounding whitespace and title-case province names so join keys line up."""
    return names.str.strip().str.title()


def _most_common_level(levels, default='Medium'):
    """Return the most frequent value in `levels`, or `default` when there is none.

    `Series.mode()` ignores NaN and returns its results sorted, so ties resolve to
    the first value in sort order; a group with no non-null values yields an empty
    result and falls back to `default`.
    """
    modes = levels.mode()  # computed once per group
    return modes.iloc[0] if not modes.empty else default


historical_perf_clean['Province_normalized'] = _normalize_province(historical_perf_clean['Province'])
rows_with_yield = len(historical_perf_clean)

# Inner join against the climate averages, which are keyed by their index.
# NOTE(review): assumes that index already uses the same stripped/title-cased
# province names — confirm against DataLoader.
merged_data = pd.merge(historical_perf_clean, data_loader.climate_averages, left_on='Province_normalized', right_index=True, how='inner')
rows_after_climate = len(merged_data)

# Collapse the individual soil tests to one row per province:
# most common N/P/K level and mean pH bounds.
soil_data = pd.read_csv(data_loader.data_dir / "soil_test_data.csv")
soil_data['province_normalized'] = _normalize_province(soil_data['province'])
province_soil_agg = soil_data.groupby('province_normalized').agg(
    nitrogen=('nitrogen', _most_common_level),
    phosphorus=('phosphorus', _most_common_level),
    potassium=('potassium', _most_common_level),
    ph_min=('ph_min', 'mean'),
    ph_max=('ph_max', 'mean')
).reset_index()
merged_data = pd.merge(merged_data, province_soil_agg, left_on='Province_normalized', right_on='province_normalized', how='inner')
print(f"Total merged records: {len(merged_data)}")

# Both joins are inner joins, so records whose province has no match are dropped
# without any error. Print the row count at each stage to keep that loss visible.
print(f"Row counts: {rows_with_yield} with valid yield -> {rows_after_climate} after climate join -> {len(merged_data)} after soil join")

Total merged records: 65487


In [4]:
# Build the feature table in one vectorized call.
print("Extracting features using vectorized operations...")
feature_df = feature_extractor.extract_features_vectorized(merged_data, data_loader.unified_crop_db)
print(f"Feature extraction complete. {len(feature_df)} records created.")

# Dynamic suitability score.
# Reference yield per crop = 95th percentile of all valid historical yields
# (despite the variable name, this is not the absolute maximum).
max_yield_per_crop = historical_perf_clean.groupby('Crop')['yield_per_ha'].quantile(0.95)

# Score = (actual yield / reference yield) * 80 + 20, limited to [0, 100].
# Crops with no reference yield fall back to 20. The helper column exists only
# for the duration of the chain and is removed before the result is bound.
feature_df = (
    feature_df
    .assign(max_yield=lambda frame: frame['crop_name'].map(max_yield_per_crop).fillna(20))
    .assign(
        suitability_score=lambda frame: (
            (frame['actual_yield'] / frame['max_yield']) * 80.0 + 20.0
        ).clip(lower=0.0, upper=100.0)
    )
    .drop(columns=['max_yield'])
)
print("Suitability scores calculated.")

Extracting features using vectorized operations...


Feature extraction complete. 65487 records created.
Suitability scores calculated.


In [5]:
# Split data into training and validation sets (80/20, fixed seed for reproducibility)
train_set, val_set = train_test_split(feature_df, test_size=0.2, random_state=42)

# Sanity check: the split must partition the feature table without losing rows.
assert len(train_set) + len(val_set) == len(feature_df), "split lost or duplicated rows"

print(f"Training set shape: {train_set.shape}")
print(f"Validation set shape: {val_set.shape}")

# Save the datasets to a temporary directory.
# parents=True creates any missing parent directory as well, so the cell no longer
# fails with FileNotFoundError on systems where /tmp does not exist yet.
output_dir = Path("/tmp/models")
output_dir.mkdir(parents=True, exist_ok=True)
train_set.to_csv(output_dir / "training_dataset.csv", index=False)
val_set.to_csv(output_dir / "validation_dataset.csv", index=False)

print(f"\nTraining and validation datasets saved to {output_dir}")

Training set shape: (52389, 14)
Validation set shape: (13098, 14)



Training and validation datasets saved to /tmp/models
