In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
import os

# --- This code is from Phase 3, we need it to create the features ---
def create_features(df):
    df = df.sort_values(['market_name', 'commodity', 'date'])
    df['price_yesterday'] = df.groupby(['market_name', 'commodity'])['modal_price'].shift(1)
    df['price_last_week'] = df.groupby(['market_name', 'commodity'])['modal_price'].shift(7)
    df['price_avg_7days'] = df.groupby(['market_name', 'commodity'])['modal_price'].shift(1).rolling(7).mean()
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['arrivals_yesterday'] = df.groupby(['market_name', 'commodity'])['arrivals_qty'].shift(1)
    df = df.dropna()
    return df

# Load data and create features
df = pd.read_csv('data/market_prices.csv')
df['date'] = pd.to_datetime(df['date'])
df_features = create_features(df)
# --- End of Phase 3 code ---


def train_price_model(df_features, commodity='Tomato'):
    """
    Train a model to predict crop prices
    """
    # Filter data for specific commodity
    data = df_features[df_features['commodity'] == commodity].copy()

    # Prepare features (X) and target (y)
    feature_columns = [
        'price_yesterday', 'price_last_week', 'price_avg_7days',
        'day_of_week', 'month', 'is_weekend', 'arrivals_yesterday',
        'market_lat', 'market_lon'
    ]
    X = data[feature_columns]
    y = data['modal_price']

    # Split data: 80% for training, 20% for testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=False # Time series data should not be shuffled
    )

    # Initialize and train the model
    print(f"Training model for {commodity}...")
    model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)

    # Test the model on data it has never seen
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred) ** 0.5

    print(f"\nModel Performance for {commodity}:")
    print(f"Mean Absolute Error (MAE): ₹{mae:.2f}")
    print(f"Root Mean Square Error (RMSE): ₹{rmse:.2f}")

    # Save the trained model and the feature list to the 'models' folder
    joblib.dump(model, f'models/{commodity}_price_model.pkl')
    joblib.dump(feature_columns, f'models/{commodity}_features.pkl')
    print(f"\nModel for {commodity} saved successfully!")

    return model, feature_columns

# Train and save the model for Tomatoes
model, features = train_price_model(df_features, 'Tomato')

Training model for Tomato...

Model Performance for Tomato:
Mean Absolute Error (MAE): ₹226.70
Root Mean Square Error (RMSE): ₹281.61

Model for Tomato saved successfully!
