In [None]:
# Install required packages
# %pip install pandas numpy scikit-learn plotly lightgbm optuna


In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)


In [2]:
# 1. Data Loading
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
print(df_train.head())
print("----------------------------------------")
print(df_test.head())

         date  store  item  sales
0  2013-01-01      1     1     13
1  2013-01-02      1     1     11
2  2013-01-03      1     1     14
3  2013-01-04      1     1     13
4  2013-01-05      1     1     10
----------------------------------------
   id        date  store  item
0   0  2018-01-01      1     1
1   1  2018-01-02      1     1
2   2  2018-01-03      1     1
3   3  2018-01-04      1     1
4   4  2018-01-05      1     1


In [3]:
# Convert date columns to datetime
df_train['date'] = pd.to_datetime(df_train['date'])
df_test['date'] = pd.to_datetime(df_test['date'])

In [4]:
# 2. Initial Data Exploration
print("\nTraining Data Info:")
print("-" * 50)
print(df_train.info())
print("\nTest Data Info:")
print("-" * 50)
print(df_test.info())

# Check missing values
print("\nMissing Values in Training Data:")
print(df_train.isnull().sum())
print("\nMissing Values in Test Data:")
print(df_test.isnull().sum())

# Display basic statistics
print("\nTraining Data Statistics:")
print(df_train.describe())



Training Data Info:
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   date    913000 non-null  datetime64[ns]
 1   store   913000 non-null  int64         
 2   item    913000 non-null  int64         
 3   sales   913000 non-null  int64         
dtypes: datetime64[ns](1), int64(3)
memory usage: 27.9 MB
None

Test Data Info:
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   id      45000 non-null  int64         
 1   date    45000 non-null  datetime64[ns]
 2   store   45000 non-null  int64         
 3   item    45000 non-null  int64         
dtypes: datetime64[ns](1), int64(3)
memory us

In [5]:
# 3. Feature Engineering
def create_features(df):
    """Create time-based features from date column"""
    df = df.copy()
    
    # Basic time features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    
    # Lag features (for training data only)
    if 'sales' in df.columns:
        # Group by store and item
        grouped = df.groupby(['store', 'item'])
        
        # Create lag features
        df['sales_lag_7'] = grouped['sales'].transform(lambda x: x.shift(7))
        df['sales_lag_14'] = grouped['sales'].transform(lambda x: x.shift(14))
        df['sales_lag_30'] = grouped['sales'].transform(lambda x: x.shift(30))
        
        # Create rolling mean features
        df['sales_rolling_mean_7'] = grouped['sales'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())
        df['sales_rolling_mean_14'] = grouped['sales'].transform(lambda x: x.rolling(window=14, min_periods=1).mean())
        df['sales_rolling_mean_30'] = grouped['sales'].transform(lambda x: x.rolling(window=30, min_periods=1).mean())
    
    return df

# Apply feature engineering
print("Applying feature engineering...")
df_train = create_features(df_train)
df_test = create_features(df_test)

# Encode categorical variables
le_store = LabelEncoder()
le_item = LabelEncoder()

df_train['store_encoded'] = le_store.fit_transform(df_train['store'])
df_train['item_encoded'] = le_item.fit_transform(df_train['item'])

df_test['store_encoded'] = le_store.transform(df_test['store'])
df_test['item_encoded'] = le_item.transform(df_test['item'])

# Define features for modeling
base_features = ['year', 'month', 'day', 'day_of_week', 'quarter', 'store_encoded', 'item_encoded']
lag_features = ['sales_lag_7', 'sales_lag_14', 'sales_lag_30',
                'sales_rolling_mean_7', 'sales_rolling_mean_14', 'sales_rolling_mean_30']

# Final feature list for training (excluding early dates due to lag features)
features = base_features + lag_features
df_train = df_train.dropna()  # Remove rows with NaN from lag features

print("\nFeatures created:", features)


Applying feature engineering...

Features created: ['year', 'month', 'day', 'day_of_week', 'quarter', 'store_encoded', 'item_encoded', 'sales_lag_7', 'sales_lag_14', 'sales_lag_30', 'sales_rolling_mean_7', 'sales_rolling_mean_14', 'sales_rolling_mean_30']


In [6]:
# 4. Train-Validation-Test Split
print("Preparing train-validation-test split...")

# Sort by date to prevent data leakage
df_train = df_train.sort_values('date').reset_index(drop=True)

# Split into train, validation, test
n_val = 45000
n_test = 45000
n_train = len(df_train) - n_val - n_test

if len(df_train) < n_train + n_val + n_test:
    raise ValueError("Not enough data in df_train to split into 3 sets of 45000 each.")

train_data = df_train.iloc[:n_train]
val_data = df_train.iloc[n_train:n_train + n_val]
test_data = df_train.iloc[n_train + n_val:n_train + n_val + n_test]

X_train = train_data[features]
y_train = train_data['sales']
X_val = val_data[features]
y_val = val_data['sales']
X_test = test_data[features]
y_test = test_data['sales']

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")
print(f"Test data shape: {X_test.shape}")

# Prepare test features for competition/test set
X_test_competition = df_test[base_features]  # Note: lag features will be calculated during prediction


Preparing train-validation-test split...
Training data shape: (808000, 13)
Validation data shape: (45000, 13)
Test data shape: (45000, 13)


In [7]:
 # 5. Model Training and Evaluation

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

performance_results = []

def evaluate_model(y_true, y_pred, dataset_name=""):
    """Calculate and print model evaluation metrics"""
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    print(f"\n{dataset_name} Metrics:")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2 Score: {r2:.4f}")
    
    return rmse, mae, r2

# สร้าง dictionary สำหรับเก็บโมเดลและผลลัพธ์
models = {}

# 1. LightGBM
print("Training LightGBM model...")
lgbm_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42,
    verbose=-1
)
lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)
val_pred_lgbm = lgbm_model.predict(X_val)
print("LightGBM Results:")
rmse_lgbm, mae_lgbm, r2_lgbm = evaluate_model(y_val, val_pred_lgbm, "LightGBM Validation")
models['LightGBM'] = lgbm_model
performance_results.append({
    'Model': 'LightGBM',
    'RMSE': rmse_lgbm,
    'MAE': mae_lgbm,
    'R2': r2_lgbm
})

# 2. RandomForestRegressor
print("\nTraining RandomForestRegressor model...")
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)
val_pred_rf = rf_model.predict(X_val)
print("RandomForestRegressor Results:")
rmse_rf, mae_rf, r2_rf = evaluate_model(y_val, val_pred_rf, "RandomForest Validation")
models['RandomForest'] = rf_model
performance_results.append({
    'Model': 'RandomForest',
    'RMSE': rmse_rf,
    'MAE': mae_rf,
    'R2': r2_rf
})

# 3. GradientBoostingRegressor
print("\nTraining GradientBoostingRegressor model...")
gb_model = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
gb_model.fit(X_train, y_train)
val_pred_gb = gb_model.predict(X_val)
print("GradientBoostingRegressor Results:")
rmse_gb, mae_gb, r2_gb = evaluate_model(y_val, val_pred_gb, "GradientBoosting Validation")
models['GradientBoosting'] = gb_model
performance_results.append({
    'Model': 'GradientBoosting',
    'RMSE': rmse_gb,
    'MAE': mae_gb,
    'R2': r2_gb
})

# แสดงตารางสรุป performance ของแต่ละโมเดล
import pandas as pd
performance_df = pd.DataFrame(performance_results)
print("\nSummary of performance of each model (Validation Set):")
display(performance_df)


Training LightGBM model...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[202]	valid_0's rmse: 7.70284	valid_0's l2: 59.3337
LightGBM Results:

LightGBM Validation Metrics:
RMSE: 7.70
MAE: 5.94
R2 Score: 0.9487

Training RandomForestRegressor model...
RandomForestRegressor Results:

RandomForest Validation Metrics:
RMSE: 7.85
MAE: 6.06
R2 Score: 0.9466

Training GradientBoostingRegressor model...
GradientBoostingRegressor Results:

GradientBoosting Validation Metrics:
RMSE: 7.68
MAE: 5.94
R2 Score: 0.9490

Summary of performance of each model (Validation Set):


Unnamed: 0,Model,RMSE,MAE,R2
0,LightGBM,7.702839,5.944186,0.948654
1,RandomForest,7.853852,6.060022,0.946621
2,GradientBoosting,7.678399,5.937578,0.94898


In [8]:
# 6. Generate Predictions for Next 3 Months
print("Generating predictions for next 3 months...")

# Create future dates for prediction
last_date = df_train['date'].max()
future_dates = pd.date_range(
    start=last_date + timedelta(days=1),
    end=last_date + timedelta(days=90),
    freq='D'
)

# Create prediction dataframe
future_df = pd.DataFrame()
future_df['date'] = future_dates

# Create combinations of store and item
store_items = df_train[['store', 'item']].drop_duplicates()
future_predictions = []

# Get the last 30 days of actual data for calculating lag features
last_30_days = df_train[df_train['date'] > last_date - timedelta(days=30)].copy()

# Generate predictions for each store-item combination
for _, row in store_items.iterrows():
    store, item = row['store'], row['item']
    
    # Create temporary dataframe for this store-item combination
    temp_df = future_df.copy()
    temp_df['store'] = store
    temp_df['item'] = item
    
    # Get historical data for this store-item combination
    hist_data = last_30_days[
        (last_30_days['store'] == store) & 
        (last_30_days['item'] == item)
    ].copy()
    
    # Initialize predictions list
    all_predictions = []
    current_data = hist_data.copy()
    
    # Predict day by day to properly calculate lag features
    for date in future_dates:
        # Create a single row for prediction
        pred_row = pd.DataFrame({'date': [date], 'store': [store], 'item': [item]})
        
        # Apply basic feature engineering
        pred_row = create_features(pred_row)
        pred_row['store_encoded'] = le_store.transform([store])[0]
        pred_row['item_encoded'] = le_item.transform([item])[0]
        
        # Calculate lag features using current_data
        if len(current_data) >= 30:  # Ensure we have enough historical data
            pred_row['sales_lag_7'] = current_data['sales'].iloc[-7]
            pred_row['sales_lag_14'] = current_data['sales'].iloc[-14]
            pred_row['sales_lag_30'] = current_data['sales'].iloc[-30]
            
            # Calculate rolling means
            pred_row['sales_rolling_mean_7'] = current_data['sales'].iloc[-7:].mean()
            pred_row['sales_rolling_mean_14'] = current_data['sales'].iloc[-14:].mean()
            pred_row['sales_rolling_mean_30'] = current_data['sales'].iloc[-30:].mean()
        else:
            # If not enough history, use means from training data
            store_item_means = df_train[
                (df_train['store'] == store) & 
                (df_train['item'] == item)
            ]['sales'].mean()
            
            pred_row['sales_lag_7'] = store_item_means
            pred_row['sales_lag_14'] = store_item_means
            pred_row['sales_lag_30'] = store_item_means
            pred_row['sales_rolling_mean_7'] = store_item_means
            pred_row['sales_rolling_mean_14'] = store_item_means
            pred_row['sales_rolling_mean_30'] = store_item_means
        
        # Make predictions using all models
        X_pred = pred_row[features]
        
        # Get predictions from each model
        pred_lgbm = models['LightGBM'].predict(X_pred)[0]
        pred_rf = models['RandomForest'].predict(X_pred)[0]
        pred_gb = models['GradientBoosting'].predict(X_pred)[0]
        
        # Calculate ensemble prediction (weighted average based on R2 scores)
        weights = {
            'LightGBM': r2_lgbm,
            'RandomForest': r2_rf,
            'GradientBoosting': r2_gb
        }
        total_weight = sum(weights.values())
        weights = {k: v/total_weight for k, v in weights.items()}
        
        prediction = (
            pred_lgbm * weights['LightGBM'] +
            pred_rf * weights['RandomForest'] +
            pred_gb * weights['GradientBoosting']
        )
        
        # Store predictions
        pred_row['predicted_sales'] = prediction
        pred_row['predicted_sales_lgbm'] = pred_lgbm
        pred_row['predicted_sales_rf'] = pred_rf
        pred_row['predicted_sales_gb'] = pred_gb
        
        all_predictions.append(pred_row)
        
        # Update current_data with the new prediction for next iteration
        pred_row['sales'] = prediction
        current_data = pd.concat([current_data, pred_row])
    
    # Combine all predictions for this store-item
    store_item_predictions = pd.concat(all_predictions, ignore_index=True)
    future_predictions.append(store_item_predictions)

# Combine all predictions
future_predictions_df = pd.concat(future_predictions, ignore_index=True)

print("Predictions generated successfully!")

# Display sample of predictions with all model results
print("\nSample of predictions:")
cols_to_show = ['date', 'store', 'item', 'predicted_sales', 
                'predicted_sales_lgbm', 'predicted_sales_rf', 'predicted_sales_gb']
print(future_predictions_df[cols_to_show].head())


Generating predictions for next 3 months...
Predictions generated successfully!

Sample of predictions:
        date  store  item  predicted_sales  predicted_sales_lgbm  \
0 2018-01-01      1     1        13.871081             13.421795   
1 2018-01-02      1     1        16.790505             15.857703   
2 2018-01-03      1     1        17.527291             17.072816   
3 2018-01-04      1     1        18.528199             18.393318   
4 2018-01-05      1     1        19.326813             19.339011   

   predicted_sales_rf  predicted_sales_gb  
0           14.308676           13.883705  
1           18.012386           16.504142  
2           18.858113           16.654097  
3           19.117714           18.074983  
4           19.178360           19.462704  


In [11]:
# 7. Visualize Results
def plot_predictions(actual_df, pred_df, store=None, item=None):
    """Plot actual vs predicted sales for specific store and item"""
    
    fig = go.Figure()
    
    # Filter data if store and item are specified
    if store is not None and item is not None:
        actual_df = actual_df[
            (actual_df['store'] == store) & 
            (actual_df['item'] == item)
        ]
        pred_df = pred_df[
            (pred_df['store'] == store) & 
            (pred_df['item'] == item)
        ]
    
    # Aggregate daily sales if showing all stores/items
    actual_daily = actual_df.groupby('date')['sales'].sum().reset_index()
    
    # Plot actual sales
    fig.add_trace(go.Scatter(
        x=actual_daily['date'],
        y=actual_daily['sales'],
        name='Actual Sales',
        line=dict(color='blue')
    ))
    
    # Plot predictions from each model
    colors = {
        'predicted_sales': 'red',      # Ensemble
        'predicted_sales_lgbm': 'green',  # LightGBM
        'predicted_sales_rf': 'orange',   # Random Forest
        'predicted_sales_gb': 'purple'    # Gradient Boosting
    }
    
    names = {
        'predicted_sales': 'Ensemble Prediction',
        'predicted_sales_lgbm': 'LightGBM Prediction',
        'predicted_sales_rf': 'Random Forest Prediction',
        'predicted_sales_gb': 'Gradient Boosting Prediction'
    }
    
    for pred_col, color in colors.items():
        pred_daily = pred_df.groupby('date')[pred_col].sum().reset_index()
        
        fig.add_trace(go.Scatter(
            x=pred_daily['date'],
            y=pred_daily[pred_col],
            name=names[pred_col],
            line=dict(
                color=color,
                dash='dash' if pred_col == 'predicted_sales' else 'dot'
            )
        ))
    
    # Update layout
    title = 'Sales Prediction: '
    title += f'Store {store}, Item {item}' if store and item else 'All Stores and Items'
    
    fig.update_layout(
        title=title,
        xaxis_title='Date',
        yaxis_title='Sales',
        hovermode='x unified',
        template='plotly_white',
        width=1200,
        height=700,
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.01
        )
    )
    
    return fig

# Plot overall predictions
print("Creating visualizations...")
# Combine training and validation data for plotting
full_data = pd.concat([train_data, val_data])
overall_plot = plot_predictions(full_data, future_predictions_df)
overall_plot.show()

# Example: Plot predictions for top 3 selling items
print("\nAnalyzing top selling items...")
top_items = df_train.groupby(['store', 'item'])['sales'].sum().sort_values(ascending=False).head(3)
print("\nTop 3 store-item combinations by total sales:")
for (store, item), sales in top_items.items():
    print(f"Store: {store}, Item: {item}, Total Sales: {sales:,.0f}")
    # ใช้ full_data แทน val_data
    specific_plot = plot_predictions(full_data, future_predictions_df, store, item)
    specific_plot.show()

# Calculate and display model contribution to final predictions
print("\nAnalyzing model contributions to ensemble predictions...")
weights = {
    'LightGBM': r2_lgbm,
    'RandomForest': r2_rf,
    'GradientBoosting': r2_gb
}
total_weight = sum(weights.values())
weights = {k: v/total_weight * 100 for k, v in weights.items()}

print("\nModel weights in ensemble prediction:")
for model, weight in weights.items():
    print(f"{model}: {weight:.1f}%")

Creating visualizations...



Analyzing top selling items...

Top 3 store-item combinations by total sales:
Store: 2, Item: 28, Total Sales: 203,786


Store: 2, Item: 15, Total Sales: 203,645


Store: 2, Item: 18, Total Sales: 195,590



Analyzing model contributions to ensemble predictions...

Model weights in ensemble prediction:
LightGBM: 33.4%
RandomForest: 33.3%
GradientBoosting: 33.4%


In [22]:
import pandas as pd
import plotly.graph_objects as go
from datetime import datetime

def plot_predictions_full(
    actual_df: pd.DataFrame,
    pred_df: pd.DataFrame,
    store: int | None = None,
    item: int | None = None,
    date_col: str = 'date',
    sales_col: str = 'sales',
    store_col: str = 'store',
    item_col: str = 'item',
    pred_config: dict | None = None,
    fill_internal_missing: str | None = 'zero',  # 'zero' | 'interpolate' | None
    show_split_line: bool = True,
    debug: bool = False
):
    """
    แสดง actual vs prediction แบบไม่ลาก actual ลงศูนย์ในช่วงอนาคต
    - fill_internal_missing มีผลเฉพาะภายในช่วงวันที่มี actual จริง (min_actual .. max_actual)
      แต่จะไม่เติมหลัง max_actual
    """
    if pred_config is None:
        pred_config = {
            'predicted_sales':      {'name':'Ensemble Prediction','color':'red','dash':'dash'},
            'predicted_sales_lgbm': {'name':'LightGBM Prediction','color':'green','dash':'dot'},
            'predicted_sales_rf':   {'name':'Random Forest Prediction','color':'orange','dash':'dot'},
            'predicted_sales_gb':   {'name':'Gradient Boosting Prediction','color':'purple','dash':'dot'},
        }

    # Ensure datetime
    actual_df = actual_df.copy()
    pred_df   = pred_df.copy()
    if not pd.api.types.is_datetime64_any_dtype(actual_df[date_col]):
        actual_df[date_col] = pd.to_datetime(actual_df[date_col])
    if not pd.api.types.is_datetime64_any_dtype(pred_df[date_col]):
        pred_df[date_col] = pd.to_datetime(pred_df[date_col])

    # Filter
    if store is not None and item is not None:
        actual_sub = actual_df[(actual_df[store_col]==store) & (actual_df[item_col]==item)].copy()
        pred_sub   = pred_df[(pred_df[store_col]==store) & (pred_df[item_col]==item)].copy()
        title_suffix = f"Store {store}, Item {item}"
    else:
        actual_sub = actual_df
        pred_sub   = pred_df
        title_suffix = "All Stores and Items"

    # Aggregate actual
    actual_daily = (actual_sub
                    .groupby(date_col, as_index=False)[sales_col]
                    .sum()
                    .rename(columns={sales_col: 'actual_sales'}))

    if actual_daily.empty:
        raise ValueError("ไม่มี actual data หลัง filter แล้ว")

    # Aggregate predictions
    pred_cols = [c for c in pred_sub.columns if c in pred_config]
    pred_daily = {c: pred_sub.groupby(date_col, as_index=False)[c].sum() for c in pred_cols}

    # Date bounds
    min_actual_date = actual_daily[date_col].min()
    max_actual_date = actual_daily[date_col].max()

    # รวมวันอนาคตเพื่อแนบเส้นแนวนอน (ถ้าไม่มี prediction ก็แค่ช่วง actual)
    pred_dates_all = []
    for c in pred_cols:
        pred_dates_all.extend(pred_daily[c][date_col].tolist())
    max_total_date = max([max_actual_date] + pred_dates_all) if pred_dates_all else max_actual_date

    full_range = pd.date_range(min_actual_date, max_total_date, freq='D')

    # Reindex actual
    actual_full = actual_daily.set_index(date_col).reindex(full_range)
    actual_full.index.name = date_col

    # เติมเฉพาะภายในช่วง historical (<= max_actual_date)
    hist_mask = actual_full.index <= max_actual_date
    if fill_internal_missing == 'zero':
        actual_full.loc[hist_mask, 'actual_sales'] = actual_full.loc[hist_mask, 'actual_sales'].fillna(0)
    elif fill_internal_missing == 'interpolate':
        # interpolate เฉพาะภายในช่วง hist แล้วค่อย mask กลับ
        hist_series = actual_full.loc[hist_mask, 'actual_sales']
        actual_full.loc[hist_mask, 'actual_sales'] = hist_series.interpolate()
    else:
        # ไม่เติมอะไร ปล่อย NaN → จะเกิดช่องว่างถ้าขาดวัน
        pass

    # หลัง max_actual_date ปล่อย NaN (เพื่อจบเส้น)
    # ถ้ามีช่องว่างบางวัน "ก่อน" max_actual_date ที่ยัง NaN (กรณี fill_internal_missing=None)
    # plotly จะสร้างช่อง break ตามต้องการ

    # Reindex predictions
    pred_full = {c: pred_daily[c].set_index(date_col).reindex(full_range) for c in pred_cols}

    # First prediction date (วันแรกที่มีค่าไม่ NaN)
    first_pred_date = None
    if pred_cols:
        candidates = []
        for c in pred_cols:
            non_na = pred_full[c].dropna()
            if not non_na.empty:
                candidates.append(non_na.index.min())
        if candidates:
            first_pred_date = min(candidates)

    if debug:
        print("max_actual_date:", max_actual_date)
        print("first_pred_date:", first_pred_date)

    # Figure
    fig = go.Figure()

    # Actual trace (NaN หลัง max_actual_date จะทำให้เส้นหยุด)
    fig.add_trace(go.Scatter(
        x=actual_full.index,
        y=actual_full['actual_sales'],
        name='Actual Sales',
        mode='lines',
        line=dict(color='blue')
    ))

    # Predictions
    for c in pred_cols:
        cfg = pred_config[c]
        fig.add_trace(go.Scatter(
            x=pred_full[c].index,
            y=pred_full[c][c],
            name=cfg.get('name', c),
            mode='lines',
            line=dict(color=cfg.get('color','red'),
                      dash=cfg.get('dash','dash'))
        ))

    # Vertical line (shape) เฉพาะถ้ามี prediction
    if show_split_line and first_pred_date is not None and pd.notna(first_pred_date):
        fp = first_pred_date.to_pydatetime()
        fig.add_shape(
            type="line",
            x0=fp, x1=fp,
            y0=0, y1=1,
            xref="x", yref="paper",
            line=dict(color="black", dash="dot", width=1)
        )
        fig.add_annotation(
            x=fp, y=1, xref="x", yref="paper",
            text="Forecast Start",
            showarrow=False, yshift=10,
            font=dict(size=12)
        )

    fig.update_layout(
        title=f"Sales Prediction: {title_suffix}",
        xaxis_title="Date",
        yaxis_title="Sales",
        template="plotly_white",
        hovermode="x unified",
        width=1200,
        height=700,
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
        margin=dict(l=40,r=20,t=60,b=40)
    )

    return fig


In [24]:
full_actual_df = df_train  
overall_fig = plot_predictions_full(full_actual_df, future_predictions_df)
overall_fig.show()