# Prediksi Penjualan E-Commerce Indonesia

In [None]:
# Install required libraries
!pip install pandas numpy matplotlib seaborn statsmodels scikit-learn plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Time series specific libraries
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error
import plotly.graph_objects as go
from plotly.subplots import make_subplots

print("Libraries imported successfully!")

1. Generate Synthetic Indonesian E-Commerce Data

In [None]:
# Set random seed for reproducibility
np.random.seed(42)

# Generate dates: 4 years of daily data (2019-01-01 through 2022-12-31)
dates = pd.date_range(start='2019-01-01', end='2022-12-31', freq='D')

# Create synthetic sales data with:
# - Trend: Growing e-commerce in Indonesia
# - Seasonality: Weekly and yearly patterns
# - Events: Ramadan, Christmas, New Year, 9.9, 10.10, 11.11, 12.12 sales

def generate_ecommerce_sales(dates):
    """Build a synthetic daily sales series for the given dates.

    The series is the product of four multiplicative components plus
    additive Gaussian noise, clipped at zero:

    * trend: a linear ramp from 1000 to 5000 raised to the power 1.2
      (a power-law curve, not a true exponential);
    * weekly factor: ``1 + 0.3 * sin(2*pi*dayofweek/7)`` with Monday = 0;
    * yearly factor: ``1 + 0.4 * sin(2*pi*dayofyear/365)``;
    * event multiplier: 1 plus fixed uplifts for April-May (a fixed-month
      approximation of Ramadan), 15-31 December, and the 9.9, 10.10, 11.11
      and 12.12 shopping days.

    NOTE(review): with Monday = 0 the weekly sinusoid peaks mid-week and is
    lowest on Saturday/Sunday, which contradicts the notebook's stated intent
    of "higher sales on weekends". The formula is kept unchanged so the
    generated numbers stay the same; confirm the intended phase.

    Args:
        dates: A ``pandas.DatetimeIndex`` or any sequence accepted by
            ``pandas.DatetimeIndex``.

    Returns:
        A 1-D ``numpy.ndarray`` of non-negative floats, one per date.
        Noise is drawn from NumPy's global random state, so call
        ``np.random.seed`` beforehand for reproducible output.
    """
    dates = pd.DatetimeIndex(dates)
    n = len(dates)

    # Calendar fields as plain arrays so every step below is element-wise.
    month = np.asarray(dates.month)
    day = np.asarray(dates.day)
    day_of_week = np.asarray(dates.dayofweek)
    day_of_year = np.asarray(dates.dayofyear)

    # Base trend: power-law growth over the whole period.
    trend = np.linspace(1000, 5000, n) ** 1.2

    # Periodic components.
    weekly_seasonality = 1 + 0.3 * np.sin(2 * np.pi * day_of_week / 7)
    yearly_seasonality = 1 + 0.4 * np.sin(2 * np.pi * day_of_year / 365)

    # Event multiplier, built with boolean masks instead of a per-day loop.
    sales_events = np.ones(n)
    sales_events[(month == 4) | (month == 5)] += 0.8    # approximate Ramadan
    sales_events[(month == 12) & (day >= 15)] += 0.6    # Christmas / New Year

    # Double-date shopping days: (month, day, uplift).
    double_date_events = (
        (9, 9, 1.2),
        (10, 10, 1.2),
        (11, 11, 1.5),
        (12, 12, 1.3),
    )
    for event_month, event_day, uplift in double_date_events:
        sales_events[(month == event_month) & (day == event_day)] += uplift

    # Additive Gaussian noise (single draw, so seeding stays reproducible).
    noise = np.random.normal(0, 200, n)

    sales = trend * weekly_seasonality * yearly_seasonality * sales_events + noise

    # Sales cannot be negative.
    return np.maximum(sales, 0)

# Produce the synthetic sales values for every date in the range
sales_data = generate_ecommerce_sales(dates)

# Assemble the frame with calendar helper columns and index it by date
df = pd.DataFrame({
    'date': dates,
    'sales': sales_data,
    'day_of_week': dates.dayofweek,
    'month': dates.month,
    'year': dates.year,
    'day_of_year': dates.dayofyear,
}).set_index('date')

# Quick sanity output
print("Data generated successfully!")
print(f"Dataset shape: {df.shape}")
print("\nFirst 10 rows:")
print(df.head(10))

2. Exploratory Data Analysis

In [None]:
# Basic statistics of the daily sales series
print("=== STATISTIK DESKRIPTIF ===")
print(f"Periode Data: {df.index.min()} hingga {df.index.max()}")
print(f"Total Hari: {len(df)}")
print(f"Total Penjualan: Rp {df['sales'].sum():,.0f}")
print(f"Rata-rata Penjualan Harian: Rp {df['sales'].mean():,.0f}")
print(f"Standar Deviasi: Rp {df['sales'].std():,.0f}")

# Monthly and yearly totals. Offset objects are used instead of the 'M'/'Y'
# string aliases, which newer pandas releases deprecate; the month-end and
# year-end bins they produce are the same.
monthly_sales = df.resample(pd.offsets.MonthEnd())['sales'].sum()
yearly_sales = df.resample(pd.offsets.YearEnd())['sales'].sum()

print("\n=== PENJUALAN TAHUNAN ===")
for year_end, yearly_total in yearly_sales.items():
    print(f"{year_end.year}: Rp {yearly_total:,.0f}")

# 2x2 overview figure
plt.figure(figsize=(15, 10))

# Panel 1: full daily series
plt.subplot(2, 2, 1)
plt.plot(df.index, df['sales'], linewidth=1, alpha=0.7)
plt.title('Trend Penjualan E-Commerce Harian (2019-2022)')
plt.xlabel('Tanggal')
plt.ylabel('Penjualan (Rp)')
plt.grid(True, alpha=0.3)

# Panel 2: monthly totals
plt.subplot(2, 2, 2)
monthly_sales.plot(kind='line')
plt.title('Penjualan Bulanan')
plt.xlabel('Bulan')
plt.ylabel('Penjualan (Rp)')
plt.grid(True, alpha=0.3)

# Panel 3: mean daily sales per calendar month (index is month number 1..12)
plt.subplot(2, 2, 3)
monthly_avg = df.groupby('month')['sales'].mean()
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
plt.bar(months, monthly_avg)
plt.title('Rata-rata Penjualan Bulanan')
plt.xlabel('Bulan')
plt.ylabel('Rata-rata Penjualan (Rp)')
plt.xticks(rotation=45)

# Panel 4: mean daily sales per weekday (index 0 = Monday ... 6 = Sunday)
plt.subplot(2, 2, 4)
weekly_avg = df.groupby('day_of_week')['sales'].mean()
days = ['Senin', 'Selasa', 'Rabu', 'Kamis', 'Jumat', 'Sabtu', 'Minggu']
plt.bar(days, weekly_avg)
plt.title('Rata-rata Penjualan Harian dalam Minggu')
plt.xlabel('Hari')
plt.ylabel('Rata-rata Penjualan (Rp)')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

3. Time Series Decomposition

In [None]:
# Weekly means give a smoother series for the decomposition plots
weekly_df = df.resample('W').mean()

# Multiplicative decomposition with a yearly cycle (52 weekly observations)
decomposition = seasonal_decompose(
    weekly_df['sales'],
    model='multiplicative',
    period=52,
)

# One stacked panel per component: (series, title, y-axis label)
plt.figure(figsize=(15, 12))

panels = [
    (decomposition.observed, 'Trend Penjualan (Observasi)', 'Penjualan'),
    (decomposition.trend, 'Trend', 'Trend'),
    (decomposition.seasonal, 'Seasonal', 'Seasonal'),
    (decomposition.resid, 'Residual', 'Residual'),
]

for position, (component, panel_title, y_label) in enumerate(panels, start=1):
    plt.subplot(4, 1, position)
    plt.plot(weekly_df.index, component)
    plt.title(panel_title)
    plt.ylabel(y_label)
    # Only the bottom panel carries the shared date label
    if position == len(panels):
        plt.xlabel('Tanggal')
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

4. Stationarity Test and ACF/PACF Analysis

In [None]:
# Augmented Dickey-Fuller test for stationarity
def check_stationarity(timeseries):
    """Run an Augmented Dickey-Fuller test and print a short report.

    Missing values are dropped before testing. The report shows the test
    statistic, the p-value, the critical values, and a verdict using a
    0.05 significance threshold.

    Args:
        timeseries: Series to test; must support ``dropna()``.

    Returns:
        The p-value of the ADF test.
    """
    print("=== UJI STASIONERITAS (Augmented Dickey-Fuller Test) ===")

    adf_result = adfuller(timeseries.dropna())
    adf_statistic = adf_result[0]
    p_value = adf_result[1]
    critical_values = adf_result[4]

    print(f'ADF Statistic: {adf_statistic:.6f}')
    print(f'p-value: {p_value:.6f}')
    print('Critical Values:')
    for level, threshold in critical_values.items():
        print(f'\t{level}: {threshold:.3f}')

    verdict = (
        "→ Series STASIONER (p-value ≤ 0.05)"
        if p_value <= 0.05
        else "→ Series TIDAK STASIONER (p-value > 0.05)"
    )
    print(verdict)

    return p_value

# Check stationarity of the original series
p_value_original = check_stationarity(df['sales'])

# If the series is not stationary, take the first difference and re-test;
# otherwise the original series is used as-is.
if p_value_original > 0.05:
    print("\nMenerapkan differencing untuk membuat data stasioner...")
    sales_diff = df['sales'].diff().dropna()
    p_value_diff = check_stationarity(sales_diff)
else:
    sales_diff = df['sales']

# ACF and PACF are drawn on the series selected above (differenced when the
# raw series failed the ADF test), since the AR/MA orders are read from the
# stationary series rather than from the trending raw data.
plt.figure(figsize=(15, 5))

plt.subplot(1, 2, 1)
plot_acf(sales_diff.dropna(), lags=40, ax=plt.gca())
plt.title('Autocorrelation Function (ACF)')

plt.subplot(1, 2, 2)
plot_pacf(sales_diff.dropna(), lags=40, ax=plt.gca())
plt.title('Partial Autocorrelation Function (PACF)')

plt.tight_layout()
plt.show()

5. ARIMA Model Implementation

In [None]:
# Chronological train/test split: everything before split_date is training
split_date = '2022-06-01'
train = df.loc[df.index < split_date, 'sales']
test = df.loc[df.index >= split_date, 'sales']

print(f"Training data: {train.index.min()} hingga {train.index.max()}")
print(f"Testing data: {test.index.min()} hingga {test.index.max()}")
print(f"Training samples: {len(train)}")
print(f"Testing samples: {len(test)}")

# Fit ARIMA model
print("\n=== TRAINING MODEL ARIMA ===")

# The (p, d, q) order is set manually here; an automated order search could
# be substituted.
model_arima = ARIMA(train, order=(2, 1, 2))
model_arima_fit = model_arima.fit()

print(model_arima_fit.summary())

# Forecast one value per test day. The values are attached to the test index
# positionally: passing the forecast Series straight to pd.Series(..., index=)
# would re-align by label and silently yield NaN if the labels differed.
arima_forecast = model_arima_fit.forecast(steps=len(test))
arima_predictions = pd.Series(np.asarray(arima_forecast), index=test.index)

# Error metrics on the hold-out period
mae_arima = mean_absolute_error(test, arima_predictions)
rmse_arima = np.sqrt(mean_squared_error(test, arima_predictions))
# MAPE divides by the actual values, so it is undefined for zero-sales days.
mape_arima = np.mean(np.abs((test - arima_predictions) / test)) * 100

print("\n=== EVALUASI MODEL ARIMA ===")
print(f"MAE: Rp {mae_arima:,.0f}")
print(f"RMSE: Rp {rmse_arima:,.0f}")
print(f"MAPE: {mape_arima:.2f}%")

6. SARIMA Model (Seasonal ARIMA)

In [None]:
# SARIMA model to capture the weekly seasonality
print("\n=== TRAINING MODEL SARIMA ===")

# SARIMA parameters: (p,d,q) x (P,D,Q,s) where s=7 for weekly seasonality
model_sarima = SARIMAX(train,
                       order=(1, 1, 1),
                       seasonal_order=(1, 1, 1, 7),
                       enforce_stationarity=False,
                       enforce_invertibility=False)

model_sarima_fit = model_sarima.fit(disp=False)

print(model_sarima_fit.summary())

# Forecast one value per test day. The values are attached to the test index
# positionally: passing the forecast Series straight to pd.Series(..., index=)
# would re-align by label and silently yield NaN if the labels differed.
sarima_forecast = model_sarima_fit.forecast(steps=len(test))
sarima_predictions = pd.Series(np.asarray(sarima_forecast), index=test.index)

# Error metrics on the hold-out period
mae_sarima = mean_absolute_error(test, sarima_predictions)
rmse_sarima = np.sqrt(mean_squared_error(test, sarima_predictions))
# MAPE divides by the actual values, so it is undefined for zero-sales days.
mape_sarima = np.mean(np.abs((test - sarima_predictions) / test)) * 100

print("\n=== EVALUASI MODEL SARIMA ===")
print(f"MAE: Rp {mae_sarima:,.0f}")
print(f"RMSE: Rp {rmse_sarima:,.0f}")
print(f"MAPE: {mape_sarima:.2f}%")

7. Model Comparison and Visualization

In [None]:
# Side-by-side error metrics for both models; MAPE is recomputed here from
# the stored predictions
mape_values = [
    np.mean(np.abs((test - predicted) / test)) * 100
    for predicted in (arima_predictions, sarima_predictions)
]
comparison = pd.DataFrame({
    'Model': ['ARIMA', 'SARIMA'],
    'MAE': [mae_arima, mae_sarima],
    'RMSE': [rmse_arima, rmse_sarima],
    'MAPE': mape_values,
})

print("=== PERBANDINGAN MODEL ===")
print(comparison)

# Two stacked panels: full history on top, a two-month zoom below
plt.figure(figsize=(15, 10))

# Top panel: training data, actual test data and both forecasts
plt.subplot(2, 1, 1)
plt.plot(train.index, train, label='Training Data', alpha=0.7)
plt.plot(test.index, test, label='Actual Test Data', color='black', linewidth=2)
forecast_lines = (
    (arima_predictions, 'ARIMA Prediction'),
    (sarima_predictions, 'SARIMA Prediction'),
)
for predicted, line_label in forecast_lines:
    plt.plot(predicted.index, predicted, label=line_label, linestyle='--')
plt.title('Perbandingan Prediksi Model ARIMA vs SARIMA')
plt.xlabel('Tanggal')
plt.ylabel('Penjualan (Rp)')
plt.legend()
plt.grid(True, alpha=0.3)

# Bottom panel: the same three series restricted to August-September 2022
plt.subplot(2, 1, 2)
zoom_start = '2022-08-01'
zoom_end = '2022-09-30'
zoom_test = test[zoom_start:zoom_end]
zoom_arima = arima_predictions[zoom_start:zoom_end]
zoom_sarima = sarima_predictions[zoom_start:zoom_end]

plt.plot(zoom_test.index, zoom_test,
         label='Actual', color='black', linewidth=2, marker='o')
plt.plot(zoom_arima.index, zoom_arima,
         label='ARIMA', linestyle='--', marker='s')
plt.plot(zoom_sarima.index, zoom_sarima,
         label='SARIMA', linestyle='--', marker='^')
plt.title('Prediksi vs Aktual (Zoom Agustus-September 2022)')
plt.xlabel('Tanggal')
plt.ylabel('Penjualan (Rp)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

8. Future Forecasting

In [None]:
# Forecast the next 90 days with SARIMA refitted on the full history.
# NOTE(review): SARIMA is assumed to be the better model here; this cell does
# not consult the comparison table, so confirm before relying on it.
final_model = SARIMAX(df['sales'],
                      order=(1, 1, 1),
                      seasonal_order=(1, 1, 1, 7),
                      enforce_stationarity=False,
                      enforce_invertibility=False)

final_model_fit = final_model.fit(disp=False)

# Future dates: the 90 calendar days following the last observation
forecast_horizon = 90
last_date = df.index[-1]
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1),
                             periods=forecast_horizon, freq='D')

# A single forecast object supplies both the point forecast and its interval,
# so the two always come from the same computation.
forecast_obj = final_model_fit.get_forecast(steps=forecast_horizon)
future_forecast = forecast_obj.predicted_mean
# Attach values positionally to avoid label re-alignment producing NaN.
future_predictions = pd.Series(np.asarray(future_forecast), index=future_dates)

# Confidence intervals (conf_int() defaults to the 95% level)
confidence_intervals = forecast_obj.conf_int()

# Visualization of future forecast
plt.figure(figsize=(15, 8))

# Plot historical data (last 180 days) + forecast
plot_start = df.index[-180]
recent_history = df.loc[plot_start:, 'sales']

plt.plot(recent_history.index, recent_history,
         label='Data Historis', color='blue', linewidth=2)
plt.plot(future_predictions.index, future_predictions,
         label='Prediksi 90 Hari ke Depan', color='red', linewidth=2, linestyle='--')

# Shaded confidence band around the forecast
plt.fill_between(future_dates,
                 confidence_intervals.iloc[:, 0],
                 confidence_intervals.iloc[:, 1],
                 color='red', alpha=0.2, label='Interval Kepercayaan 95%')

# Vertical marker separating history from forecast. It must be drawn before
# plt.legend() so that its label is included in the legend.
plt.axvline(x=last_date, color='green', linestyle=':', alpha=0.7, label='Mulai Prediksi')

plt.title('Prediksi Penjualan E-Commerce 90 Hari ke Depan')
plt.xlabel('Tanggal')
plt.ylabel('Penjualan (Rp)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Print forecast summary
print("=== PREDIKSI 90 HARI KE DEPAN ===")
print(f"Rata-rata Prediksi Penjualan Harian: Rp {future_predictions.mean():,.0f}")
print(f"Total Prediksi Penjualan 90 Hari: Rp {future_predictions.sum():,.0f}")
print("\nPrediksi 10 Hari Pertama:")
for date, pred in future_predictions.head(10).items():
    print(f"{date.strftime('%d %b %Y')}: Rp {pred:,.0f}")

9. Interactive Visualization with Plotly

In [None]:
# Two stacked interactive panels: history on top, forecast below
fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=('Trend Penjualan Historis', 'Prediksi Masa Depan'),
    vertical_spacing=0.1,
)

# Row 1: the full historical series
historical_trace = go.Scatter(
    x=df.index,
    y=df['sales'],
    name='Data Historis',
    line=dict(color='blue'),
    hovertemplate='Tanggal: %{x}<br>Penjualan: Rp %{y:,.0f}<extra></extra>',
)
fig.add_trace(historical_trace, row=1, col=1)

# Row 2: the point forecast
forecast_trace = go.Scatter(
    x=future_predictions.index,
    y=future_predictions,
    name='Prediksi',
    line=dict(color='red', dash='dash'),
    hovertemplate='Tanggal: %{x}<br>Prediksi: Rp %{y:,.0f}<extra></extra>',
)
fig.add_trace(forecast_trace, row=2, col=1)

# Row 2: confidence band drawn as one closed polygon - lower bound left to
# right, then upper bound right to left
lower_bound = confidence_intervals.iloc[:, 0]
upper_bound = confidence_intervals.iloc[:, 1]
band_x = pd.concat([pd.Series(future_dates), pd.Series(future_dates[::-1])])
band_y = pd.concat([lower_bound, upper_bound[::-1]])

band_trace = go.Scatter(
    x=band_x,
    y=band_y,
    fill='toself',
    fillcolor='rgba(255,0,0,0.2)',
    line=dict(color='rgba(255,255,255,0)'),
    name='Interval Kepercayaan 95%',
    hoverinfo='skip',
)
fig.add_trace(band_trace, row=2, col=1)

fig.update_layout(
    title_text='Analisis Time Series Penjualan E-Commerce Indonesia',
    height=800,
    showlegend=True,
    hovermode='x unified',
)

# Same axis titles on both rows
for subplot_row in (1, 2):
    fig.update_xaxes(title_text='Tanggal', row=subplot_row, col=1)
for subplot_row in (1, 2):
    fig.update_yaxes(title_text='Penjualan (Rp)', row=subplot_row, col=1)

fig.show()

10. Business Insights and Recommendations

In [None]:
# Generate business insights
print("=== INSIGHTS BISNIS UNTUK E-COMMERCE INDONESIA ===\n")

# Growth analysis: mean daily sales of the first vs. the last calendar year
first_year_avg = df[df.index.year == 2019]['sales'].mean()
last_year_avg = df[df.index.year == 2022]['sales'].mean()
growth_rate = ((last_year_avg - first_year_avg) / first_year_avg) * 100

print("1. PERTUMBUHAN BISNIS:")
print(f"   • Rata-rata penjualan 2019: Rp {first_year_avg:,.0f}")
print(f"   • Rata-rata penjualan 2022: Rp {last_year_avg:,.0f}")
print(f"   • Pertumbuhan: {growth_rate:.1f}% dalam 3 tahun\n")

# Seasonal patterns. monthly_avg is indexed by month number (1..12), while
# the months list is 0-based, hence the "- 1" when looking up names.
best_month = monthly_avg.idxmax()
worst_month = monthly_avg.idxmin()
print("2. POLA MUSIMAN:")
print(f"   • Bulan terbaik: {months[best_month-1]} (Rp {monthly_avg.max():,.0f})")
print(f"   • Bulan terendah: {months[worst_month-1]} (Rp {monthly_avg.min():,.0f})")

# Event analysis. Lookups are label-based (.loc) on the month number:
# Ramadan is approximated by April and May, the year-end holidays by December.
overall_monthly_mean = monthly_avg.mean()
ramadan_effect = monthly_avg.loc[[4, 5]].mean() / overall_monthly_mean - 1
christmas_effect = monthly_avg.loc[12] / overall_monthly_mean - 1

# The sign comes from the format spec so a below-average month is not
# rendered as "+-x%".
print(f"   • Efek Ramadan: {ramadan_effect*100:+.1f}% di atas rata-rata")
print(f"   • Efek Natal/Tahun Baru: {christmas_effect*100:+.1f}% di atas rata-rata\n")

# Weekend (Saturday = 5, Sunday = 6) vs. weekday comparison
weekend_sales = df[df['day_of_week'] >= 5]['sales'].mean()
weekday_sales = df[df['day_of_week'] < 5]['sales'].mean()
weekend_effect = (weekend_sales - weekday_sales) / weekday_sales * 100

print("3. POLA MINGGUAN:")
print(f"   • Rata-rata weekday: Rp {weekday_sales:,.0f}")
print(f"   • Rata-rata weekend: Rp {weekend_sales:,.0f}")
print(f"   • Weekend {weekend_effect:+.1f}% lebih tinggi dari weekday\n")

# Forecast insights: mean of the 90-day forecast vs. last year's daily mean
next_quarter_avg = future_predictions.mean()
growth_forecast = ((next_quarter_avg - last_year_avg) / last_year_avg) * 100

print("4. PREDIKSI KE DEPAN:")
print(f"   • Rata-rata prediksi 90 hari: Rp {next_quarter_avg:,.0f}")
print(f"   • Perkiraan pertumbuhan: {growth_forecast:+.1f}% vs tahun lalu")
print(f"   • Total prediksi Q1: Rp {future_predictions.sum():,.0f}")

print("\n5. REKOMENDASI STRATEGIS:")
print("   • Tingkatkan stok dan promosi selama event e-commerce (9.9, 10.10, 11.11, 12.12)")
print("   • Optimalkan campaign selama Ramadan dan liburan akhir tahun")
print("   • Fokus pada weekend marketing untuk meningkatkan penjualan")
print("   • Siapkan strategi inventory management berdasarkan prediksi musiman")