In [None]:
# Cell 1: Imports and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import statsmodels.api as sm
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

# Ensure the output directory for saved models exists (no error if it is already there)
models_dir = 'models_ensemble_tuned'
os.makedirs(models_dir, exist_ok=True)

print("All libraries ready!")

All libraries ready!


In [None]:
# Cell 2: Load and Initial Clean
# Read the raw records and parse the month/day/year date strings into datetimes.
raw_prices = pd.read_csv('BetelPrice.csv')
raw_prices['Date'] = pd.to_datetime(raw_prices['Date'], format='%m/%d/%Y')

# Order chronologically, discard the stray columns when they are present,
# and drop every row that has no price at all.
df = (
    raw_prices
    .sort_values('Date')
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 6', ' '], errors='ignore')
    .dropna(subset=['Price'])
)

# Keep strictly positive prices only, then renumber the rows from zero.
has_positive_price = df['Price'] > 0
df = df[has_positive_price].reset_index(drop=True)

print(f"Total rows after initial clean: {len(df)}")

Total rows after initial clean: 9429


In [None]:
# Cell 3: Outlier Removal per Commercial Type (IQR Method)
#
# For each commercial type, rows whose Price lies outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (computed on that type's own prices) are dropped.
# Rows whose 'Commercial Type' is not listed below are discarded as well.
#
# NOTE: this cell overwrites `df` with the filtered frame, so running it a second
# time filters the already-filtered data again. Re-run Cell 2 first if you need to
# repeat this step.
commercial_types = ['Peedunu', 'Keti', 'Kanda', 'Korikan']

# Collect the per-type results and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows each time.
cleaned_subsets = []

for ctype in commercial_types:
    print(f"\n--- {ctype} ---")
    subset = df[df['Commercial Type'] == ctype].copy()
    print(f"Original rows: {len(subset)} | Price range: {subset['Price'].min()} - {subset['Price'].max()}")

    # IQR outlier detection
    Q1 = subset['Price'].quantile(0.25)
    Q3 = subset['Price'].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    before = len(subset)
    # between() is inclusive on both ends, i.e. lower <= Price <= upper
    subset = subset[subset['Price'].between(lower, upper)]
    after = len(subset)

    print(f"After outlier removal: {after} rows (removed {before - after})")
    print(f"New price range: {subset['Price'].min()} - {subset['Price'].max()}")

    cleaned_subsets.append(subset)

df_clean = pd.concat(cleaned_subsets, ignore_index=True)

df = df_clean
print(f"\nFinal total rows after outlier removal across all types: {len(df)}")


--- Peedunu ---
Original rows: 3776 | Price range: 200 - 30000
After outlier removal: 3590 rows (removed 186)
New price range: 200 - 12000

--- Keti ---
Original rows: 1888 | Price range: 50 - 10000
After outlier removal: 1812 rows (removed 76)
New price range: 50 - 5000

--- Kanda ---
Original rows: 1888 | Price range: 250 - 20000
After outlier removal: 1785 rows (removed 103)
New price range: 250 - 11000

--- Korikan ---
Original rows: 1877 | Price range: 20 - 4000
After outlier removal: 1757 rows (removed 120)
New price range: 20 - 1800

Final total rows after outlier removal across all types: 8944


In [None]:
# Cell 4: Training Loop with Tuned LSTM + SARIMA Ensemble
#
# For every commercial type this cell:
#   1. builds the feature matrix (log price, cyclical date features, one-hot categoricals),
#   2. splits the sliding-window sequences chronologically 70/15/15 into train/val/test,
#   3. trains a bidirectional LSTM and a SARIMA model using the training period only,
#   4. picks the LSTM/SARIMA blend weight with the lowest MAPE on the validation period,
#   5. reports RMSE / MAE / MAPE on the test period and saves the artifacts.
time_steps = 60
results = {}

# Seed NumPy and TensorFlow so repeated runs start from the same random state.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)


def _mape(y_true, y_pred):
    """Return the mean absolute percentage error (in percent) of y_pred against y_true."""
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


def _to_price(scaled_values, fitted_scaler):
    """Undo the min-max scaling and the log transform; returns a flat array of prices."""
    scaled_column = np.asarray(scaled_values).reshape(-1, 1)
    return np.exp(fitted_scaler.inverse_transform(scaled_column)).flatten()


def _build_lstm(n_steps, n_features):
    """Build and compile the two-layer bidirectional LSTM regressor (MSE loss, Adam)."""
    model = Sequential()
    model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(n_steps, n_features)))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005), loss='mse')
    return model


print("\nStarting fine-tuned training with LSTM + SARIMA ensemble...\n")

for ctype in commercial_types:
    print("="*70)
    print(f"FINE-TUNING ENSEMBLE MODEL FOR: {ctype.upper()}")
    print("="*70)

    subset = df[df['Commercial Type'] == ctype].copy()
    print(f"Rows: {len(subset)}")

    if len(subset) < 300:
        print(f"Skipping {ctype} - insufficient data after cleaning.\n")
        continue

    # Add cyclical date features
    subset['Month_sin'] = np.sin(2 * np.pi * subset['Date'].dt.month / 12)
    subset['Month_cos'] = np.cos(2 * np.pi * subset['Date'].dt.month / 12)
    subset['DayOfYear_sin'] = np.sin(2 * np.pi * subset['Date'].dt.dayofyear / 365.25)
    subset['DayOfYear_cos'] = np.cos(2 * np.pi * subset['Date'].dt.dayofyear / 365.25)

    # Encode categoricals
    cat_cols = ['District', 'Market Type', 'Quality Grade']
    subset_encoded = pd.get_dummies(subset, columns=cat_cols, dtype=float, drop_first=True)

    # Features: Price is kept in column 0 because it is also the prediction target
    feature_cols = ['Price'] + [col for col in subset_encoded.columns if col not in ['Date', 'Price', 'Commercial Type']]
    data = subset_encoded[feature_cols].values.astype(np.float32)

    # Sequence i uses rows [i, i + time_steps) as input and row i + time_steps as target.
    n_sequences = len(data) - time_steps
    if n_sequences < 50:
        print(f"Not enough sequences for {ctype}.\n")
        continue

    # 70/15/15 split: train/val/test (chronological, no shuffling)
    train_idx = int(0.7 * n_sequences)
    val_idx = int(0.85 * n_sequences)
    # Rows [0, train_end) are exactly the rows the training sequences can see
    # (inputs and targets); everything from train_end on belongs to val/test.
    train_end = train_idx + time_steps

    # Log + scale Price. The scaler is fitted on the training rows only so that the
    # validation/test price range does not leak into training; later rows may
    # therefore fall slightly outside [0, 1].
    log_price = np.log(data[:, [0]])
    scaler = MinMaxScaler()
    scaler.fit(log_price[:train_end])
    data_scaled = data.copy()
    data_scaled[:, 0] = scaler.transform(log_price).flatten()

    # Create sequences for LSTM
    X = np.array([data_scaled[i:i + time_steps] for i in range(n_sequences)])
    y = data_scaled[time_steps:, 0]

    X_train, X_val, X_test = X[:train_idx], X[train_idx:val_idx], X[val_idx:]
    y_train, y_val, y_test = y[:train_idx], y[train_idx:val_idx], y[val_idx:]

    # === Tuned LSTM Model ===
    lstm_model = _build_lstm(time_steps, X.shape[2])

    early_stop = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True, verbose=0)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-6, verbose=0)

    lstm_model.fit(X_train, y_train, epochs=200, batch_size=64, validation_data=(X_val, y_val),
                   callbacks=[early_stop, lr_scheduler], verbose=0)

    # LSTM predictions on val and test, converted back to the price scale
    lstm_val = _to_price(lstm_model.predict(X_val, verbose=0), scaler)
    lstm_test = _to_price(lstm_model.predict(X_test, verbose=0), scaler)

    # === SARIMA Model ===
    # Fit on the training period only and forecast one contiguous horizon that
    # covers validation followed by test, so each forecast lines up with the
    # matching y_val / y_test target.
    # NOTE(review): the seasonal period of 12 is applied to consecutive rows of the
    # series, not to calendar months - confirm this matches the data's frequency.
    price_series = subset['Price'].values
    n_val, n_test = len(X_val), len(X_test)
    sarima_fit = None
    try:
        sarima_model = sm.tsa.SARIMAX(price_series[:train_end], order=(1,1,1), seasonal_order=(1,1,1,12))
        sarima_train_fit = sarima_model.fit(disp=False)
        sarima_forecast = np.asarray(sarima_train_fit.forecast(steps=n_val + n_test)).flatten()
        sarima_val = sarima_forecast[:n_val]
        sarima_test = sarima_forecast[n_val:]
        # The artifact that gets saved is extended with the remaining observations
        # (parameters are NOT re-estimated), so forecasting from the saved model
        # starts after the last known price rather than after the training period.
        sarima_fit = sarima_train_fit.append(price_series[train_end:], refit=False)
    except Exception as exc:
        print(f"SARIMA failed for {ctype} ({exc}). Using LSTM only.")
        # Placeholders only: with the weight forced to 1.0 below they contribute nothing.
        sarima_val = np.zeros(n_val)
        sarima_test = np.zeros(n_test)

    # Val truths for weight tuning
    y_val_true = _to_price(y_val, scaler)

    # Tune ensemble weights on validation set
    if sarima_fit is None:
        # No usable SARIMA component: the ensemble is the LSTM alone.
        best_weight = 1.0
    else:
        weights = np.linspace(0, 1, 11)  # 0 to 1 in 0.1 steps
        best_weight = 0.5
        best_mape = float('inf')
        for w in weights:
            val_ensemble = w * lstm_val + (1 - w) * sarima_val
            val_mape = _mape(y_val_true, val_ensemble)
            if val_mape < best_mape:
                best_mape = val_mape
                best_weight = w

    print(f"Best ensemble weight (LSTM): {best_weight:.2f} (tuned on validation)")

    # Apply best weight to test
    ensemble_test = best_weight * lstm_test + (1 - best_weight) * sarima_test

    y_test_true = _to_price(y_test, scaler)

    rmse = np.sqrt(mean_squared_error(y_test_true, ensemble_test))
    mae = mean_absolute_error(y_test_true, ensemble_test)
    mape = _mape(y_test_true, ensemble_test)

    print(f"ENSEMBLE RESULTS → RMSE: {rmse:.0f} | MAE: {mae:.0f} | MAPE: {mape:.1f}%\n")

    results[ctype] = {'RMSE': rmse, 'MAE': mae, 'MAPE': mape}

    # Save models
    lstm_model.save(f'models_ensemble_tuned/lstm_{ctype}.h5')
    joblib.dump(scaler, f'models_ensemble_tuned/scaler_{ctype}.pkl')
    if sarima_fit is not None:
        # Skipped when SARIMA failed; the saved weight of 1.0 then means the
        # SARIMA component is not used for this commercial type.
        joblib.dump(sarima_fit, f'models_ensemble_tuned/sarima_{ctype}.pkl')
    joblib.dump(best_weight, f'models_ensemble_tuned/weight_{ctype}.pkl')
    print(f"Models saved for {ctype}\n")


Starting fine-tuned training with LSTM + SARIMA ensemble...

FINE-TUNING ENSEMBLE MODEL FOR: PEEDUNU
Rows: 3590




Best ensemble weight (LSTM): 1.00 (tuned on validation)
ENSEMBLE RESULTS → RMSE: 2637 | MAE: 2111 | MAPE: 33.0%

Models saved for Peedunu

FINE-TUNING ENSEMBLE MODEL FOR: KETI
Rows: 1812




Best ensemble weight (LSTM): 1.00 (tuned on validation)
ENSEMBLE RESULTS → RMSE: 970 | MAE: 745 | MAPE: 31.6%

Models saved for Keti

FINE-TUNING ENSEMBLE MODEL FOR: KANDA
Rows: 1785




Best ensemble weight (LSTM): 1.00 (tuned on validation)
ENSEMBLE RESULTS → RMSE: 2325 | MAE: 1861 | MAPE: 25.6%

Models saved for Kanda

FINE-TUNING ENSEMBLE MODEL FOR: KORIKAN
Rows: 1757




Best ensemble weight (LSTM): 1.00 (tuned on validation)
ENSEMBLE RESULTS → RMSE: 331 | MAE: 243 | MAPE: 44.2%

Models saved for Korikan



In [None]:
# Cell 5: Final Summary
# Print one line of test-set metrics per commercial type; types that were skipped
# during training (no entry in `results`) are reported as such.
heavy_rule = "="*80

print("\n" + heavy_rule)
print("FINAL TUNED ENSEMBLE PERFORMANCE SUMMARY (After Outlier Removal)")
print(heavy_rule)
for ctype in commercial_types:
    if ctype not in results:
        print(f"{ctype:9} → Skipped")
        continue
    metrics = results[ctype]
    print(f"{ctype:9} → MAPE: {metrics['MAPE']:5.1f}%  |  MAE: {metrics['MAE']:7.0f}  |  RMSE: {metrics['RMSE']:7.0f}")
print(heavy_rule)
print("Improvement expected over previous version!")


FINAL TUNED ENSEMBLE PERFORMANCE SUMMARY (After Outlier Removal)
Peedunu   → MAPE:  33.0%  |  MAE:    2111  |  RMSE:    2637
Keti      → MAPE:  31.6%  |  MAE:     745  |  RMSE:     970
Kanda     → MAPE:  25.6%  |  MAE:    1861  |  RMSE:    2325
Korikan   → MAPE:  44.2%  |  MAE:     243  |  RMSE:     331
Improvement expected over previous version!
