In [1]:
# Cell 1: Imports and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import os

# Create folder to save models
os.makedirs('models', exist_ok=True)

print("All libraries imported successfully!")

All libraries imported successfully!


In [2]:
# Cell 2: Load and Clean the Data
df = pd.read_csv('BetelPrice.csv')

# Convert Date column
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')

# Sort by date
df = df.sort_values('Date').reset_index(drop=True)

# Remove known garbage columns (if present)
df = df.drop(columns=['Unnamed: 6', ' '], errors='ignore')

# Clean Price: remove NaN or non-positive
df = df.dropna(subset=['Price'])
df = df[df['Price'] > 0].reset_index(drop=True)

print(f"Final clean rows: {len(df)}")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
print(f"Price range: {df['Price'].min()} to {df['Price'].max()}")

Final clean rows: 9429
Date range: 2016-01-05 00:00:00 to 2025-11-25 00:00:00
Price range: 20 to 30000


In [3]:
# Cell 3: Check Data Distribution per Commercial Type
print(df['Commercial Type'].value_counts())

Commercial Type
Peedunu    3776
Keti       1888
Kanda      1888
Korikan    1877
Name: count, dtype: int64


In [4]:
# Cell 4: Training Loop - One Model per Commercial Type
commercial_types = ['Peedunu', 'Keti', 'Kanda', 'Korikan']
results = {}
time_steps = 60  # Use past 60 days to predict next day

print("Starting training of 4 separate LSTM models...\n")

for ctype in commercial_types:
    print("="*60)
    print(f"TRAINING MODEL FOR: {ctype.upper()}")
    print("="*60)

    # Filter data for this commercial type
    subset = df[df['Commercial Type'] == ctype].copy()
    print(f"Rows available: {len(subset)}")

    if len(subset) < 300:
        print(f"⚠️  Too few rows for {ctype}. Skipping.\n")
        continue

    # Add cyclical date features for seasonality
    subset['Month_sin'] = np.sin(2 * np.pi * subset['Date'].dt.month / 12)
    subset['Month_cos'] = np.cos(2 * np.pi * subset['Date'].dt.month / 12)
    subset['DayOfYear_sin'] = np.sin(2 * np.pi * subset['Date'].dt.dayofyear / 365.25)
    subset['DayOfYear_cos'] = np.cos(2 * np.pi * subset['Date'].dt.dayofyear / 365.25)

    # One-hot encode remaining categorical columns
    cat_cols = ['District', 'Market Type', 'Quality Grade']
    subset_encoded = pd.get_dummies(subset, columns=cat_cols, dtype=float, drop_first=True)

    # Define features (Price first)
    feature_cols = ['Price'] + [col for col in subset_encoded.columns
                                if col not in ['Date', 'Price', 'Commercial Type']]
    data = subset_encoded[feature_cols].values.astype(np.float32)

    # Log transform Price
    data_log = data.copy()
    data_log[:, 0] = np.log(data[:, 0])

    # Scale only the Price column
    scaler = MinMaxScaler(feature_range=(0, 1))
    data_scaled = data_log.copy()
    data_scaled[:, 0] = scaler.fit_transform(data_log[:, [0]]).flatten()

    # Create sequences
    X, y = [], []
    for i in range(len(data_scaled) - time_steps):
        X.append(data_scaled[i:i + time_steps])
        y.append(data_scaled[i + time_steps, 0])

    X = np.array(X)
    y = np.array(y)

    if len(X) < 50:
        print(f"⚠️  Not enough sequences for {ctype}. Skipping.\n")
        continue

    print(f"Sequences created: {len(X)} → Shape: {X.shape}")

    # Train-test split (80% train)
    split_idx = int(0.8 * len(X))
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    # Build LSTM model
    model = Sequential()
    model.add(LSTM(64, return_sequences=True, input_shape=(time_steps, X.shape[2])))
    model.add(Dropout(0.3))
    model.add(LSTM(64))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
                  loss='mse')

    # Early stopping
    early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=0)

    # Train
    print("Training model...")
    history = model.fit(X_train, y_train,
                        epochs=200,
                        batch_size=32,
                        validation_split=0.1,
                        callbacks=[early_stop],
                        verbose=1)

    print(f"Training finished after {len(history.history['loss'])} epochs\n")

    # Evaluate on test set
    y_pred_scaled = model.predict(X_test, verbose=0)
    y_pred = np.exp(scaler.inverse_transform(y_pred_scaled))
    y_true = np.exp(scaler.inverse_transform(y_test.reshape(-1, 1))).flatten()

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    print(f"RESULTS → RMSE: {rmse:.0f} | MAE: {mae:.0f} | MAPE: {mape:.1f}%\n")

    # Store results
    results[ctype] = {'RMSE': rmse, 'MAE': mae, 'MAPE': mape}

    # Save model and scaler
    model.save(f'models/lstm_model_{ctype}.h5')
    joblib.dump(scaler, f'models/scaler_{ctype}.pkl')
    print(f"Model and scaler saved → models/lstm_model_{ctype}.h5 & scaler_{ctype}.pkl\n")

Starting training of 4 separate LSTM models...

TRAINING MODEL FOR: PEEDUNU
Rows available: 3776
Sequences created: 3716 → Shape: (3716, 60, 8)


  super().__init__(**kwargs)


Training model...
Epoch 1/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - loss: 0.0683 - val_loss: 0.0118
Epoch 2/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0168 - val_loss: 0.0114
Epoch 3/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0137 - val_loss: 0.0107
Epoch 4/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0136 - val_loss: 0.0105
Epoch 5/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0122 - val_loss: 0.0104
Epoch 6/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0117 - val_loss: 0.0128
Epoch 7/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0120 - val_loss: 0.0106
Epoch 8/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0110 - val_loss: 0.0109
Epoch 9/200
[1m84/84



RESULTS → RMSE: 3270 | MAE: 2233 | MAPE: 56.0%

Model and scaler saved → models/lstm_model_Peedunu.h5 & scaler_Peedunu.pkl

TRAINING MODEL FOR: KETI
Rows available: 1888
Sequences created: 1828 → Shape: (1828, 60, 7)
Training model...
Epoch 1/200


  super().__init__(**kwargs)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - loss: 0.1071 - val_loss: 0.0192
Epoch 2/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0234 - val_loss: 0.0212
Epoch 3/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0169 - val_loss: 0.0196
Epoch 4/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0162 - val_loss: 0.0175
Epoch 5/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0150 - val_loss: 0.0172
Epoch 6/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0138 - val_loss: 0.0166
Epoch 7/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0131 - val_loss: 0.0172
Epoch 8/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0134 - val_loss: 0.0164
Epoch 9/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━



RESULTS → RMSE: 1252 | MAE: 971 | MAPE: 63.5%

Model and scaler saved → models/lstm_model_Keti.h5 & scaler_Keti.pkl

TRAINING MODEL FOR: KANDA
Rows available: 1888
Sequences created: 1828 → Shape: (1828, 60, 7)
Training model...
Epoch 1/200


  super().__init__(**kwargs)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - loss: 0.0828 - val_loss: 0.0078
Epoch 2/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0214 - val_loss: 0.0076
Epoch 3/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0185 - val_loss: 0.0086
Epoch 4/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0175 - val_loss: 0.0077
Epoch 5/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0162 - val_loss: 0.0078
Epoch 6/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0142 - val_loss: 0.0112
Epoch 7/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0153 - val_loss: 0.0074
Epoch 8/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0143 - val_loss: 0.0081
Epoch 9/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━



RESULTS → RMSE: 3288 | MAE: 2355 | MAPE: 39.7%

Model and scaler saved → models/lstm_model_Kanda.h5 & scaler_Kanda.pkl

TRAINING MODEL FOR: KORIKAN
Rows available: 1877
Sequences created: 1817 → Shape: (1817, 60, 7)
Training model...
Epoch 1/200


  super().__init__(**kwargs)


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - loss: 0.1490 - val_loss: 0.0317
Epoch 2/200
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0224 - val_loss: 0.0292
Epoch 3/200
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0173 - val_loss: 0.0302
Epoch 4/200
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0168 - val_loss: 0.0353
Epoch 5/200
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0164 - val_loss: 0.0297
Epoch 6/200
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0146 - val_loss: 0.0297
Epoch 7/200
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0138 - val_loss: 0.0284
Epoch 8/200
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0134 - val_loss: 0.0283
Epoch 9/200
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━



RESULTS → RMSE: 443 | MAE: 289 | MAPE: 127.8%

Model and scaler saved → models/lstm_model_Korikan.h5 & scaler_Korikan.pkl



In [5]:
# Cell 5: Final Summary
print("\n" + "="*70)
print("FINAL PERFORMANCE SUMMARY (Test Set)")
print("="*70)
for ctype in commercial_types:
    if ctype in results:
        m = results[ctype]
        print(f"{ctype:9} → MAPE: {m['MAPE']:5.1f}%  |  MAE: {m['MAE']:7.0f}  |  RMSE: {m['RMSE']:7.0f}")
    else:
        print(f"{ctype:9} → Skipped (insufficient data)")
print("="*70)


FINAL PERFORMANCE SUMMARY (Test Set)
Peedunu   → MAPE:  56.0%  |  MAE:    2233  |  RMSE:    3270
Keti      → MAPE:  63.5%  |  MAE:     971  |  RMSE:    1252
Kanda     → MAPE:  39.7%  |  MAE:    2355  |  RMSE:    3288
Korikan   → MAPE: 127.8%  |  MAE:     289  |  RMSE:     443
