<a href="https://colab.research.google.com/github/nithish-2405/My-projects/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install --upgrade pip -q
!pip install prophet statsmodels pmdarima seaborn plotly scikit-learn -q
!pip install tensorflow -q
import os, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.api import VAR
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import json
from concurrent.futures import ThreadPoolExecutor
import time
# Suppress every Python warning and lower TensorFlow's logger to ERROR so the
# notebook output only shows the pipeline's own messages.
warnings.filterwarnings("ignore")
tf.get_logger().setLevel('ERROR')
# Seed NumPy's and TensorFlow's global random generators with one fixed value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
# Directory that receives every CSV and PNG written by this script; created
# up front if it does not exist yet.
OUT_DIR = "/content/fast_forecasts"
os.makedirs(OUT_DIR, exist_ok=True)
print("⚡ FAST Multivariate Forecasting Pipeline")
print("=" * 50)
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between *y_true* and *y_pred*."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Actual values whose magnitude is below 1e-8 are replaced by 1e-8 in the
    denominator so the division never hits an exact zero.
    """
    near_zero = np.abs(y_true) < 1e-8
    safe_denominator = np.where(near_zero, 1e-8, y_true)
    relative_error = (y_true - y_pred) / safe_denominator
    return np.mean(np.abs(relative_error)) * 100

def calculate_quick_metrics(y_true, y_pred, series_cols):
    """Score a forecast frame against the actuals.

    Args:
        y_true: DataFrame of observed values.
        y_pred: DataFrame of predicted values with the same columns.
        series_cols: Column names to score individually.

    Returns:
        A dict with 'Overall_RMSE' and 'Overall_MAPE' (computed over all
        values of both frames flattened into one vector) and 'Per_Series',
        which maps each name in *series_cols* to its own 'RMSE' and 'MAPE'.
        Every metric is converted to a plain Python float.
    """
    actual_flat = y_true.values.flatten()
    predicted_flat = y_pred.values.flatten()
    summary = {
        'Overall_RMSE': float(rmse(actual_flat, predicted_flat)),
        'Overall_MAPE': float(mape(actual_flat, predicted_flat)),
    }
    summary['Per_Series'] = {
        name: {
            'RMSE': float(rmse(y_true[name], y_pred[name])),
            'MAPE': float(mape(y_true[name], y_pred[name])),
        }
        for name in series_cols
    }
    return summary
print("\n📊 Loading Dataset...")
start_time = time.time()

DATA_PATH = "/content/saleshourly.csv"
if not os.path.exists(DATA_PATH):
    # Colab-only fallback: prompt for an upload and use the first uploaded file.
    from google.colab import files
    print("Upload your CSV file:")
    uploaded = files.upload()
    DATA_PATH = list(uploaded.keys())[0]
df = pd.read_csv(DATA_PATH)
# Use the 'datum' column as the timestamp when present, else the first column.
time_col = 'datum' if 'datum' in df.columns else df.columns[0]
df['ds'] = pd.to_datetime(df[time_col], errors='coerce')
# errors='coerce' turns unparseable timestamps into NaT; drop those rows so
# they cannot end up in the DatetimeIndex that is resampled below.
df = df.dropna(subset=['ds']).sort_values('ds').set_index('ds')
series_cols = ['M01AB', 'M01AE', 'N02BA', 'N02BE', 'N05B', 'N05C', 'R03', 'R06']
available_cols = [col for col in series_cols if col in df.columns]
if not available_cols:
    # None of the expected columns exist: fall back to the first 8 numeric ones.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    series_cols = numeric_cols.tolist()[:8]
else:
    series_cols = available_cols
# Hourly grid: average within each hour, carry the last value forward over
# gaps, and zero-fill whatever is still missing at the start.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
data = df[series_cols].astype(float).resample('H').mean().ffill().fillna(0)
# Hold out the last 168 hourly rows (7 * 24) as the test set.
H = 168
if len(data) <= H:
    raise ValueError(
        f"Need more than {H} hourly rows to hold out a test set, got {len(data)}"
    )
train_data = data.iloc[:-H]
test_data = data.iloc[-H:]
print(f"✅ Data loaded in {time.time() - start_time:.1f}s")
print(f"Series: {len(series_cols)} | Train: {len(train_data)} | Test: {len(test_data)}")
def train_fast_prophet(train_data, test_data, series_cols):
    """Fit one Prophet model per series and forecast the test horizon.

    Args:
        train_data: DataFrame of training values; its index supplies the
            'ds' column handed to Prophet.
        test_data: DataFrame whose length is the forecast horizon and whose
            index labels the returned forecasts.
        series_cols: Names of the columns to model, one Prophet fit each.

    Returns:
        DataFrame of 'yhat' forecasts indexed like *test_data*, one column
        per entry of *series_cols*.
    """
    print("🔮 Training Prophet (Fast Mode)...")
    start_time = time.time()
    horizon = len(test_data)

    def train_single_prophet(col):
        prophet_data = train_data[[col]].reset_index()
        prophet_data.columns = ['ds', 'y']
        model = Prophet(
            yearly_seasonality=False,
            weekly_seasonality=True,
            daily_seasonality=True,
            changepoint_prior_scale=0.1,
            interval_width=0.8
        )
        model.fit(prophet_data)
        # Only the future rows are needed, so skip predicting over the whole
        # training history that the original sliced away afterwards.
        future = model.make_future_dataframe(
            periods=horizon, freq='H', include_history=False
        )
        forecast = model.predict(future)
        return col, forecast['yhat'].values

    # catch_warnings() mutates process-wide state and is not thread-safe, so
    # it is entered once here instead of inside every worker thread.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        with ThreadPoolExecutor(max_workers=4) as executor:
            results = list(executor.map(train_single_prophet, series_cols))
    forecasts = dict(results)
    forecast_df = pd.DataFrame(forecasts, index=test_data.index)
    print(f"✅ Prophet completed in {time.time() - start_time:.1f}s")
    return forecast_df
def train_fast_var(train_data, test_data):
    """Fit a VAR model with AIC-selected lag order and forecast the test horizon.

    The lag order is selected on at most the last 2000 training rows
    (searching up to 8 lags) and clamped to the range 1..6; the model itself
    is then fitted on the full training frame.

    Args:
        train_data: DataFrame of training values, one column per series.
        test_data: DataFrame whose length is the forecast horizon and whose
            index labels the returned forecasts.

    Returns:
        DataFrame of forecasts indexed like *test_data*, with the same
        columns as *train_data*.
    """
    print("📈 Training VAR (Fast Mode)...")
    start_time = time.time()
    subset_size = min(2000, len(train_data))
    train_subset = train_data.iloc[-subset_size:]
    try:
        lag_order = VAR(train_subset).select_order(maxlags=8)
        selected_lag = lag_order.selected_orders.get('aic', 4)
    except Exception as exc:
        # Best-effort lag search: keep going with a default, but say so.
        print(f"⚠️ VAR lag selection failed ({exc}); using lag=4")
        selected_lag = 4
    # Lower bound of 1: with a lag of 0 the slice values[-0:] below would hand
    # the entire history to forecast() instead of the last `lag` rows.
    optimal_lag = max(1, min(int(selected_lag), 6))
    var_model = VAR(train_data).fit(optimal_lag)
    forecast = var_model.forecast(train_data.values[-optimal_lag:], steps=len(test_data))
    # Label the output with the columns that were actually modelled rather
    # than the module-level series_cols global.
    forecast_df = pd.DataFrame(forecast, index=test_data.index, columns=train_data.columns)
    print(f"✅ VAR (lag={optimal_lag}) completed in {time.time() - start_time:.1f}s")
    return forecast_df
def train_fast_sarima(train_data, test_data, series_cols):
    """Fit a fixed-order ARIMA(2, 1, 1) per series and forecast the test horizon.

    Despite the name, no seasonal terms are used. If fitting or forecasting a
    series fails, that series falls back to repeating its last training value
    and a message is printed so the substitution is visible.

    Args:
        train_data: DataFrame of training values.
        test_data: DataFrame whose length is the forecast horizon and whose
            index labels the returned forecasts.
        series_cols: Names of the columns to model, one fit each.

    Returns:
        DataFrame of forecasts indexed like *test_data*, one column per entry
        of *series_cols*.
    """
    print("📊 Training SARIMA (Fast Mode)...")
    start_time = time.time()
    horizon = len(test_data)

    def train_single_sarima(col):
        try:
            fitted = ARIMA(train_data[col], order=(2, 1, 1)).fit()
            return col, np.asarray(fitted.forecast(steps=horizon))
        except Exception as exc:
            # Deliberate best-effort: a failed fit must not abort the other
            # series, but it should not pass unnoticed either.
            print(f"⚠️ ARIMA failed for {col} ({exc}); repeating last observed value")
            last_val = train_data[col].iloc[-1]
            return col, np.full(horizon, last_val)

    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(train_single_sarima, series_cols))
    forecasts = dict(results)
    forecast_df = pd.DataFrame(forecasts, index=test_data.index)
    print(f"✅ SARIMA completed in {time.time() - start_time:.1f}s")
    return forecast_df
def train_fast_lstm(train_data, test_data, seq_len=24):
    """Train a small LSTM on scaled data and forecast the test horizon recursively.

    The scaler is fitted on, and the network trained on, at most the last
    2000 training rows. Forecasting is recursive: each one-step prediction is
    appended to the input window used for the next step.

    Args:
        train_data: DataFrame of training values, one column per series.
        test_data: DataFrame whose length is the forecast horizon and whose
            index labels the returned forecasts.
        seq_len: Number of past time steps fed to the network per sample.

    Returns:
        Tuple ``(forecast_df, model, scaler)``: the forecasts in original
        units (indexed like *test_data*, columns as in *train_data*), the
        fitted Keras model and the fitted MinMaxScaler.

    Raises:
        ValueError: If the training subset has no more than *seq_len* rows,
            so not a single training sample could be built.
    """
    print("🧠 Training LSTM (Fast Mode)...")
    start_time = time.time()
    # Take the feature count from the data itself instead of relying on the
    # module-level series_cols global matching train_data's columns.
    n_features = train_data.shape[1]
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(train_data.iloc[-2000:])  # Use subset for speed
    if len(train_scaled) <= seq_len:
        raise ValueError(
            f"Need more than seq_len={seq_len} training rows, got {len(train_scaled)}"
        )

    # Each sample is a window of seq_len rows; its target is the row after it.
    X_train = np.array(
        [train_scaled[i - seq_len:i] for i in range(seq_len, len(train_scaled))]
    )
    y_train = train_scaled[seq_len:]

    model = Sequential([
        LSTM(32, return_sequences=False, input_shape=(seq_len, n_features)),  # Smaller units
        Dropout(0.2),
        Dense(16),  # Smaller dense layer
        Dense(n_features)
    ])
    model.compile(optimizer=Adam(0.01), loss='mse')  # Higher learning rate
    model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=0, validation_split=0.1)

    current_seq = scaler.transform(train_data.iloc[-seq_len:])
    predictions = []
    for _ in range(len(test_data)):
        pred = model.predict(current_seq.reshape(1, seq_len, n_features), verbose=0)[0]
        predictions.append(pred)
        # Slide the window: drop the oldest step, append the new prediction.
        current_seq = np.vstack([current_seq[1:], pred])
    predictions_original = scaler.inverse_transform(np.array(predictions))
    forecast_df = pd.DataFrame(
        predictions_original, index=test_data.index, columns=train_data.columns
    )
    print(f"✅ LSTM completed in {time.time() - start_time:.1f}s")
    return forecast_df, model, scaler
# Train the four models in sequence and time the whole run.
print("\n⚡ Training All Models...")
total_start = time.time()
prophet_forecast = train_fast_prophet(train_data, test_data, series_cols)
var_forecast = train_fast_var(train_data, test_data)
sarima_forecast = train_fast_sarima(train_data, test_data, series_cols)
lstm_forecast, lstm_model, lstm_scaler = train_fast_lstm(train_data, test_data)
print(f"\n🎯 Total training time: {time.time() - total_start:.1f}s")

# Score every forecast on the hold-out set and persist it as CSV.
print("\n📊 Model Evaluation...")
models = dict(
    Prophet=prophet_forecast,
    VAR=var_forecast,
    SARIMA=sarima_forecast,
    LSTM=lstm_forecast,
)
results = []
for name, forecast in models.items():
    metrics = calculate_quick_metrics(test_data, forecast, series_cols)
    row = {'Model': name}
    row['RMSE'] = metrics['Overall_RMSE']
    row['MAPE'] = metrics['Overall_MAPE']
    results.append(row)
    forecast.to_csv(f"{OUT_DIR}/{name}_forecast.csv")

# Rank by overall RMSE; the first row after sorting is the winner.
comparison_df = pd.DataFrame(results).sort_values('RMSE')
print("\n🏆 FAST Model Comparison Results:")
print(comparison_df.to_string(index=False, float_format='%.4f'))
best_row = comparison_df.iloc[0]
best_model = best_row['Model']
print(f"\n🥇 Best Model: {best_model}")
print(f"   RMSE: {best_row['RMSE']:.4f}")
print(f"   MAPE: {best_row['MAPE']:.2f}%")
comparison_df.to_csv(f"{OUT_DIR}/model_comparison.csv", index=False)
def plot_best_forecasts():
    """Plot actuals against every model's forecast for the first three series.

    One subplot is drawn per plotted series. The figure is saved to
    ``fast_comparison.png`` inside OUT_DIR and then shown. Nothing is drawn
    when there are no series.
    """
    top_series = series_cols[:3]  # Plot first 3 series for speed
    if not top_series:
        return
    colors = ['red', 'blue', 'green', 'orange']
    # Size the grid to the series actually plotted so fewer than three series
    # do not leave blank panels; squeeze=False keeps `axes` two-dimensional
    # even for a single row.
    fig, axes = plt.subplots(len(top_series), 1, figsize=(12, 8), squeeze=False)
    for ax, col in zip(axes[:, 0], top_series):
        ax.plot(test_data.index, test_data[col], 'black', linewidth=2, label='Actual', alpha=0.8)
        for j, (model_name, forecast_df) in enumerate(models.items()):
            # Wrap around the palette so extra models cannot raise IndexError.
            ax.plot(forecast_df.index, forecast_df[col],
                    color=colors[j % len(colors)], linestyle='--', linewidth=1.5,
                    label=model_name, alpha=0.7)
        ax.set_title(f'{col} - Forecast Comparison (Sample)', fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{OUT_DIR}/fast_comparison.png", dpi=150, bbox_inches='tight')
    plt.show()
plot_best_forecasts()
# Break the winning model's accuracy down by individual series.
print(f"\n💊 Per-Drug Performance ({best_model}):")
best_forecast = models[best_model]
per_drug_results = [
    {
        'Drug': drug,
        'RMSE': rmse(test_data[drug], best_forecast[drug]),
        'MAPE': mape(test_data[drug], best_forecast[drug]),
        'Avg_Actual': test_data[drug].mean(),
        'Avg_Predicted': best_forecast[drug].mean(),
    }
    for drug in series_cols
]
per_drug_df = pd.DataFrame(per_drug_results)
per_drug_df = per_drug_df.sort_values('RMSE')
print(per_drug_df.to_string(index=False, float_format='%.3f'))
per_drug_df.to_csv(f"{OUT_DIR}/per_drug_analysis.csv", index=False)
def predict_future_fast(hours=24, model_name=None, seq_len=24):
    """Return the first *hours* forecast rows that follow the training data.

    For 'LSTM' the forecast is regenerated recursively from the last
    *seq_len* training rows, with an hourly index starting one hour after the
    last training timestamp. For any other model the forecast CSV previously
    written to OUT_DIR is read back and truncated to *hours* rows.

    Args:
        hours: Number of hourly steps to return.
        model_name: Key of the model to use; defaults to the module-level
            ``best_model``.
        seq_len: Input window length of the LSTM. Must equal the ``seq_len``
            the model was trained with (24 is the default used by
            ``train_fast_lstm``).

    Returns:
        DataFrame of forecasts with a datetime index, one column per series.
    """
    if model_name is None:
        model_name = best_model
    if model_name == 'LSTM':
        n_features = len(series_cols)
        window = lstm_scaler.transform(train_data.iloc[-seq_len:])
        predictions = []
        for _ in range(hours):
            step = lstm_model.predict(window.reshape(1, seq_len, n_features), verbose=0)[0]
            predictions.append(step)
            # Slide the window: drop the oldest step, append the new prediction.
            window = np.vstack([window[1:], step])
        predictions_original = lstm_scaler.inverse_transform(np.array(predictions))
        last_date = train_data.index[-1]
        future_index = pd.date_range(start=last_date + pd.Timedelta(hours=1),
                                     periods=hours, freq='H')
        return pd.DataFrame(predictions_original, index=future_index, columns=series_cols)

    forecast_df = pd.read_csv(f"{OUT_DIR}/{model_name}_forecast.csv",
                              index_col=0, parse_dates=True)
    if hours > len(forecast_df):
        # The saved file only covers the evaluation horizon; make the
        # truncation visible instead of silently returning fewer rows.
        print(f"⚠️ Requested {hours} hours but only {len(forecast_df)} are "
              f"stored for {model_name}; returning {len(forecast_df)} rows")
    return forecast_df.iloc[:hours]
def _summarize_drug_sales():
    """Build the per-drug descriptive statistics table, sorted by total sales."""
    rows = []
    for drug in series_cols:
        drug_data = data[drug]
        first_week_mean = drug_data.iloc[:168].mean()
        last_week_mean = drug_data.iloc[-168:].mean()
        # Growth is undefined against a zero baseline; report NaN rather than
        # an infinite percentage.
        if first_week_mean == 0:
            growth_rate = float('nan')
        else:
            growth_rate = ((last_week_mean / first_week_mean) - 1) * 100
        rows.append({
            'Drug': drug,
            'Total_Sales': drug_data.sum(),
            # NOTE(review): this is the mean of per-day *means* of the hourly
            # values, not the mean daily total — confirm which was intended.
            'Average_Daily': drug_data.resample('D').mean().mean(),
            'Peak_Hour': drug_data.groupby(drug_data.index.hour).mean().idxmax(),
            'Peak_Day': drug_data.groupby(drug_data.index.day_name()).mean().idxmax(),
            'Volatility': drug_data.std(),
            'Growth_Rate': growth_rate
        })
    return pd.DataFrame(rows).sort_values('Total_Sales', ascending=False)


def _score_models_per_drug():
    """Return (detailed RMSE/MAPE table, {drug: {model: rmse}}) for all models."""
    detailed_rows = []
    rmse_by_drug = {}
    for drug in series_cols:
        row = {'Drug': drug}
        drug_rmses = {}
        for model_name, forecast_df in models.items():
            drug_rmses[model_name] = rmse(test_data[drug], forecast_df[drug])
            row[f'{model_name}_RMSE'] = drug_rmses[model_name]
            row[f'{model_name}_MAPE'] = mape(test_data[drug], forecast_df[drug])
        detailed_rows.append(row)
        rmse_by_drug[drug] = drug_rmses
    return pd.DataFrame(detailed_rows), rmse_by_drug


def _pick_best_model_per_drug(rmse_by_drug):
    """Build the table naming the lowest-RMSE model for each drug."""
    rows = []
    for drug, drug_rmses in rmse_by_drug.items():
        best_model_drug = min(drug_rmses, key=drug_rmses.get)
        best_rmse = drug_rmses[best_model_drug]
        rows.append({
            'Drug': drug,
            'Best_Model': best_model_drug,
            'Best_RMSE': best_rmse,
            'Improvement_vs_Worst': max(drug_rmses.values()) - best_rmse
        })
    return pd.DataFrame(rows)


def create_analytics_dashboard():
    """Print and save sales statistics and per-drug model accuracy tables.

    Writes ``drug_analytics_summary.csv``, ``detailed_model_performance.csv``
    and ``best_model_per_drug.csv`` into OUT_DIR.

    Returns:
        Tuple ``(summary_df, detailed_df, best_per_drug_df)`` holding the
        three tables that were printed and saved.
    """
    print("\n📊 Creating Analytics Dashboard...")
    summary_df = _summarize_drug_sales()
    print("\n💊 DRUG SALES ANALYTICS SUMMARY:")
    print("=" * 70)
    print(summary_df.to_string(index=False, float_format='%.2f'))
    summary_df.to_csv(f"{OUT_DIR}/drug_analytics_summary.csv", index=False)

    # RMSE per drug and model is computed once here and reused for the
    # best-model table below.
    detailed_df, rmse_by_drug = _score_models_per_drug()
    detailed_df.to_csv(f"{OUT_DIR}/detailed_model_performance.csv", index=False)
    print("\n📈 DETAILED MODEL PERFORMANCE BY DRUG:")
    print("=" * 100)
    rmse_cols = [col for col in detailed_df.columns if 'RMSE' in col]
    display_df = detailed_df[['Drug'] + rmse_cols]
    print(display_df.to_string(index=False, float_format='%.3f'))

    best_per_drug_df = _pick_best_model_per_drug(rmse_by_drug)
    best_per_drug_df.to_csv(f"{OUT_DIR}/best_model_per_drug.csv", index=False)
    print("\n🎯 BEST MODEL PER DRUG:")
    print("=" * 60)
    print(best_per_drug_df.to_string(index=False, float_format='%.3f'))
    return summary_df, detailed_df, best_per_drug_df
summary_stats, detailed_performance, best_per_drug = create_analytics_dashboard()
def create_future_forecast_visualization():
    """Plot, per series, the week before the forecast next to the 168-hour forecast.

    The forecast comes from ``predict_future_fast(168)``. The figure is saved
    as ``07_future_forecast_7days.png`` and the forecast itself as
    ``future_forecast_7days.csv``, both inside OUT_DIR.
    """
    print("\n🔮 Creating Future Forecast Visualization...")
    future_7d = predict_future_fast(168)  # 7 days = 168 hours
    forecast_start = future_7d.index[0]

    # Two panels per row, with as many rows as the series need, instead of a
    # hard-coded 4x2 grid; squeeze=False keeps `axes` two-dimensional.
    n_cols = 2
    n_rows = max(1, (len(series_cols) + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 3 * n_rows), squeeze=False)
    fig.suptitle(f'🔮 7-DAY FUTURE FORECAST - {best_model} Model', fontsize=16, fontweight='bold')
    panels = axes.ravel()
    for ax, drug in zip(panels, series_cols):
        # Take the 168 observations that precede the forecast. The plain tail
        # of `data` covers the same timestamps as the forecast, so the two
        # curves would overlap instead of following one another.
        historical = data.loc[data.index < forecast_start, drug].iloc[-168:]
        ax.plot(historical.index, historical.values,
                color='blue', linewidth=2, label='Historical (7 days)', alpha=0.8)
        ax.plot(future_7d.index, future_7d[drug].values,
                color='red', linewidth=2, linestyle='--',
                label='Future Forecast (7 days)', alpha=0.8)
        ax.axvline(x=forecast_start, color='orange', linestyle=':',
                   alpha=0.7, label='Forecast Start')
        hist_avg = historical.mean()
        forecast_avg = future_7d[drug].mean()
        # A relative change against a zero baseline is undefined.
        if hist_avg == 0:
            change_pct = float('nan')
        else:
            change_pct = ((forecast_avg - hist_avg) / hist_avg) * 100
        ax.set_title(f'{drug}\nHist Avg: {hist_avg:.1f} → Forecast Avg: {forecast_avg:.1f} ({change_pct:+.1f}%)',
                     fontweight='bold', fontsize=10)
        ax.set_ylabel('Sales Volume')
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)
        ax.tick_params(axis='x', rotation=45)
    # Hide panels left over when the number of series is odd.
    for ax in panels[len(series_cols):]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.savefig(f"{OUT_DIR}/07_future_forecast_7days.png", dpi=300, bbox_inches='tight')
    plt.show()
    future_7d.to_csv(f"{OUT_DIR}/future_forecast_7days.csv")
    print("📁 7-day forecast saved to: future_forecast_7days.csv")
create_future_forecast_visualization()
# Files this script writes into OUT_DIR, grouped for the closing summary.
# Only artifacts actually produced above are listed: the two PNGs saved by
# plot_best_forecasts() and create_future_forecast_visualization().
analysis_files = [
    "drug_analytics_summary.csv",
    "model_comparison.csv",
    "per_drug_analysis.csv",
    "detailed_model_performance.csv",
    "best_model_per_drug.csv",
]
forecast_files = [f"{name}_forecast.csv" for name in models] + ["future_forecast_7days.csv"]
visualization_files = ["fast_comparison.png", "07_future_forecast_7days.png"]

print("\n✅ COMPREHENSIVE FORECASTING PIPELINE COMPLETE!")
print("=" * 70)
print(f"⏱️  Total Runtime: {time.time() - total_start:.1f} seconds")
print(f"🏆 Best Overall Model: {best_model}")
print(f"💊 Total Drugs Analyzed: {len(series_cols)}")
print(f"📊 Total Visualizations Created: {len(visualization_files)}")
print(f"📁 All Results Saved to: {OUT_DIR}")
print(f"\n🔮 SAMPLE FORECAST - Next 24 hours using {best_model}:")
future_sample = predict_future_fast(24)
print(future_sample.head(8).to_string(float_format='%.2f'))
print("\n📊 TOP PERFORMING DRUGS BY FORECAST ACCURACY:")
top_accurate = best_per_drug.nsmallest(3, 'Best_RMSE')[['Drug', 'Best_Model', 'Best_RMSE']]
print(top_accurate.to_string(index=False, float_format='%.3f'))

# Render the file groups as a tree; the last group and the last file of each
# group get the closing "└" connector.
print("\n📋 ALL GENERATED FILES:")
file_groups = [
    ("📊 DATA ANALYSIS", analysis_files),
    ("🔮 FORECASTS", forecast_files),
    ("📈 VISUALIZATIONS", visualization_files),
]
for group_idx, (title, file_names) in enumerate(file_groups):
    is_last_group = group_idx == len(file_groups) - 1
    print(f"{'└' if is_last_group else '├'}── {title}:")
    stem = "    " if is_last_group else "│   "
    for file_idx, file_name in enumerate(file_names):
        branch = "└" if file_idx == len(file_names) - 1 else "├"
        print(f"{stem}{branch}── {file_name}")