In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
import torch 
import torch.nn as nn
import warnings
warnings.filterwarnings("ignore")

data_path = pd.read_csv("/Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/Predict Future Sales/merged_data.csv")
df = data_path

print("First five rows of the data:")
print(df.head())
print("\nData Shape: ")
print(df.shape) 



First five rows of the data:
         date  date_block_num  shop_id  item_id  item_price  item_cnt_day  \
0  02.01.2013               0       59    22154      999.00           1.0   
1  03.01.2013               0       25     2552      899.00           1.0   
2  05.01.2013               0       25     2552      899.00          -1.0   
3  06.01.2013               0       25     2554     1709.05           1.0   
4  15.01.2013               0       25     2555     1099.00           1.0   

                                  item_name  item_category_id  \
0                    ANNOUNCEMENT 2012 (BD)                37   
1  DEEP PURPLE  The House Of Blue Light  LP                58   
2  DEEP PURPLE  The House Of Blue Light  LP                58   
3  DEEP PURPLE  Who Do You Think We Are  LP                58   
4            DEEP PURPE 30 Very Best Of 2CD                56   

                 item_category_name            shop_name  
0                 Cinema - Blue-Ray  Jaroslavl TC Altair  

In [6]:
def data_preprocessing(df):
    expected_columns = ['date', 'date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_day',
                       'item_name', 'item_category_id', 'item_category_name', 'shop_name']
    if not all(col in df.columns for col in expected_columns):
        missing = set(expected_columns) - set(df.columns)
        raise ValueError(f"Missing columns: {missing}")
    
    print(f"Preprocessing dataset with {len(df):,} records")
    
    cleaned_df = df.copy()
    # Initial checks
    print("Initial unique counts:")
    print(f"item_id: {cleaned_df['item_id'].nunique()}")
    print(f"item_name: {cleaned_df['item_name'].nunique()}")
    print("item_name nulls before imputation:", cleaned_df['item_name'].isna().sum())
    print("item_id with 'Unknown' item_name:", (cleaned_df['item_name'] == 'Unknown').sum())
    print("Unique item_name per item_id:")
    name_counts = cleaned_df.groupby('item_id')['item_name'].nunique()
    print(name_counts.value_counts())
    if (name_counts > 1).any():
        print("item_id with multiple item_name:", name_counts[name_counts > 1].index.tolist())

    cleaned_df['item_price'] = cleaned_df['item_price'].astype(np.float32)
    cleaned_df['item_cnt_day'] = cleaned_df['item_cnt_day'].astype(np.float32)
    
    print("Initial null counts:")
    for col in cleaned_df.columns:
        nulls = cleaned_df[col].isna().sum()
        print(f"Column '{col}': {nulls:,} nulls ({nulls/len(cleaned_df)*100:.2f}%)")
    
    cleaned_df['Return'] = cleaned_df['item_cnt_day'].where(cleaned_df['item_cnt_day'] < 0, 0).abs().astype(np.float32)
    cleaned_df['item_cnt_day'] = cleaned_df['item_cnt_day'].clip(lower=0)
    print(f"Created Return column. Total Returns: {cleaned_df['Return'].sum():,}")
    
    cleaned_df['date'] = pd.to_datetime(cleaned_df['date'], format='%d.%m.%Y')
    print("Converted date column to datetime format")
    
    # Handle item_name
    if 'item_name' in cleaned_df.columns and cleaned_df['item_name'].isna().any():
        item_name_nulls = cleaned_df['item_name'].isna().sum()
        cleaned_df['item_name'] = cleaned_df['item_name'].fillna('Unknown')
        print(f"Imputed {item_name_nulls:,} missing item names with 'Unknown'")
    
    # Ensure string types
    cleaned_df['shop_id'] = cleaned_df['shop_id'].astype(str)
    cleaned_df['item_id'] = cleaned_df['item_id'].astype(str)
    cleaned_df['item_name'] = cleaned_df['item_name'].astype(str)
    
    # Fix multiple item_name per item_id
    name_counts = cleaned_df.groupby('item_id')['item_name'].nunique()
    if (name_counts > 1).any():
        print(f"Warning: {name_counts[name_counts > 1].count()} item_id(s) have multiple item_name values. Taking most frequent.")
        most_frequent = cleaned_df.groupby('item_id')['item_name'].agg(lambda x: x.mode()[0]).reset_index()
        cleaned_df = cleaned_df.drop(columns='item_name').merge(most_frequent, on='item_id', how='left')
    
    shop_stats = cleaned_df.groupby('shop_id').size().reset_index(name='count')
    shops_to_remove = shop_stats.nsmallest(6, 'count')['shop_id'].tolist()
    remove_records = cleaned_df[cleaned_df['shop_id'].isin(shops_to_remove)].shape[0]
    cleaned_df = cleaned_df[~cleaned_df['shop_id'].isin(shops_to_remove)]
    print(f"Selected {cleaned_df['shop_id'].nunique()} shops after removing {shops_to_remove}")
    print(f"Removed {remove_records:,} records from shop selection")
    
    cleaned_df = cleaned_df.sort_values(['shop_id', 'item_id', 'date'])
    
    def winsorize_with_rolling_stats(group):
        group = group.set_index('date').sort_index()
        rolling_mean = group['item_cnt_day'].rolling(window='30D', min_periods=1).mean()
        rolling_std = group['item_cnt_day'].rolling(window='30D', min_periods=1).std()
        upper_3sigma = rolling_mean + 3 * rolling_std
        
        outliers = group['item_cnt_day'] > upper_3sigma
        outlier_count = outliers.sum()
        
        winsor_limit = group['item_cnt_day'].quantile(0.99)
        clipped = group['item_cnt_day'].clip(upper=winsor_limit)
        clipped_count = (group['item_cnt_day'] > winsor_limit).sum()
        
        return clipped.reset_index(drop=True), outlier_count, clipped_count
        
    total_records = len(cleaned_df)
    item_cnt_day_winsorized = []
    total_outlier = 0
    total_clipped = 0
    
    for (shop_id, item_id), group in cleaned_df.groupby(['shop_id', 'item_id']):
        clipped_series, outliers, clipped = winsorize_with_rolling_stats(group.copy())
        item_cnt_day_winsorized.extend(clipped_series)
        total_outlier += outliers
        total_clipped += clipped
        
    cleaned_df['item_cnt_day'] = item_cnt_day_winsorized
    
    outlier_percentage = (total_outlier / total_records * 100) if total_records > 0 else 0
    clipped_percentage = (total_clipped / total_records * 100) if total_records > 0 else 0

    print("Winsorization applied at 99th percentile for item_cnt_day after 3σ rolling-window detection")
    print(f"Total records: {total_records:,}")
    print(f"Outliers (above 3σ): {total_outlier:,} ({outlier_percentage:.2f}%)")
    print(f"Values clipped (>99th percentile): {total_clipped:,} ({clipped_percentage:.2f}%)")
    
    # Final checks
    print("Final unique counts:")
    print(f"item_id: {cleaned_df['item_id'].nunique()}")
    print(f"item_name: {cleaned_df['item_name'].nunique()}")
    
    return cleaned_df

cleaned_data = data_preprocessing(df)

Preprocessing dataset with 2,935,849 records
Initial unique counts:
item_id: 21807
item_name: 21233
item_name nulls before imputation: 84
item_id with 'Unknown' item_name: 0
Unique item_name per item_id:
item_name
1    21806
0        1
Name: count, dtype: int64
Initial null counts:
Column 'date': 0 nulls (0.00%)
Column 'date_block_num': 0 nulls (0.00%)
Column 'shop_id': 0 nulls (0.00%)
Column 'item_id': 0 nulls (0.00%)
Column 'item_price': 0 nulls (0.00%)
Column 'item_cnt_day': 0 nulls (0.00%)
Column 'item_name': 84 nulls (0.00%)
Column 'item_category_id': 0 nulls (0.00%)
Column 'item_category_name': 0 nulls (0.00%)
Column 'shop_name': 0 nulls (0.00%)
Created Return column. Total Returns: 7,541.0
Converted date column to datetime format
Imputed 84 missing item names with 'Unknown'
Selected 54 shops after removing ['36', '11', '20', '8', '9', '40']
Removed 14,017 records from shop selection
Winsorization applied at 99th percentile for item_cnt_day after 3σ rolling-window detection
Total

In [7]:
cleaned_data.columns.tolist()

['date',
 'date_block_num',
 'shop_id',
 'item_id',
 'item_price',
 'item_cnt_day',
 'item_name',
 'item_category_id',
 'item_category_name',
 'shop_name',
 'Return']

In [8]:
import pandas as pd
import numpy as np
from pathlib import Path # <-- 1. Import the Path object

def aggregate_to_monthly(df_clean):
    # Aggregate to monthly level
    monthly_data = df_clean.groupby(['date_block_num', 'shop_id', 'item_id', 'item_category_id']).agg({
        'item_cnt_day': 'sum',
        'item_price': 'mean',
        'Return': 'sum'
    }).reset_index()
    
    # Rename columns
    monthly_data.rename(columns={'item_cnt_day': 'item_cnt_month'}, inplace=True)
    
    # Clip item_cnt_month
    monthly_data['item_cnt_month'] = monthly_data['item_cnt_month'].clip(0, 20).astype(np.float32)
    
    return monthly_data

# Assuming df_clean is loaded
monthly_data = aggregate_to_monthly(cleaned_data)

# --- FIX STARTS HERE ---

# 2. Define the output directory path
output_dir = Path("lstm_data")

# 3. Create the directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# 4. Save the file inside the now-guaranteed-to-exist directory
monthly_data.to_parquet(output_dir / "monthly_data.parquet", index=False)

# --- FIX ENDS HERE ---


print("Monthly data sample:")
print(monthly_data.head())
print(f"Monthly data shape: {monthly_data.shape}")

Monthly data sample:
   date_block_num shop_id item_id  item_category_id  item_cnt_month  \
0               0       0    1000                67             5.0   
1               0       0    1001                67             2.0   
2               0       0   10012                40             1.0   
3               0       0    1002                67             2.0   
4               0       0    1003                67             2.0   

   item_price  Return  
0        58.0     0.0  
1        58.0     0.0  
2        76.0     0.0  
3        58.0     0.0  
4        58.0     0.0  
Monthly data shape: (1601409, 7)


In [None]:
items_df = cleaned_data[['item_id', 'item_name', 'item_price']].drop_duplicates().reset_index(drop=True)
items_df.head()

Unnamed: 0,item_id,item_name,item_price
0,1000,3D Action Puzzle Zombie Cleaner,58.0
1,10004,SM/F (region),64.0
2,1001,"3D Action Puzzle ""Zomby"" Shahter",58.0
3,10012,WOLT m/f (Region),76.0
4,1002,"3D Action Puzzle ""Technica"" Bomber",58.0


In [32]:
import pandas as pd
from pathlib import Path
import logging

# --- SETUP (Assume these are already loaded in your script) ---
# This setup is for demonstration. In your script, you would have already loaded these.
# monthly_data = pd.read_parquet("lstm_data/monthly_data.parquet")
# items_df = pd.read_csv("raw_data/items.csv")
# item_cats_df = pd.read_csv("raw_data/item_categories.csv")
# logger = logging.getLogger(__name__)
# ----------------------------------------------------------------

# --- START OF THE CODE TO ADD ---

logger.info("Creating and saving complete item metadata...")

# 1. Define the absolute save directory as you specified.
SAVE_DIR = Path("/Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/modelnotebook/training_data")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# 2. Link item names and category names together.
# This starts with your original items and categories data for a complete list.
item_metadata = pd.merge(items_df, item_cats_df, on='item_category_id', how='left')

# 3. Get the average UN SCALED price for each item from your monthly aggregated data.
# This ensures we capture the real-world price for revenue calculations.
avg_price_per_item = monthly_data.groupby('item_id')['item_price'].mean().reset_index()

# 4. Merge the average price into the metadata.
# We use a left merge to ensure we keep all items, even if they had no sales.
item_metadata = pd.merge(item_metadata, avg_price_per_item, on='item_id', how='left')

# 5. Select and rename the final columns to match your requirements.
# We fill any missing prices (for items that never sold) with 0.
item_metadata = item_metadata[[
    'item_id',
    'item_name',
    'item_category_id',
    'item_category_name',
    'item_price'
]].fillna({'item_price': 0})

# 6. Save the final, clean metadata file to the specified directory.
try:
    item_metadata.to_parquet(SAVE_DIR / 'item_metadata.parquet', index=False)
    logger.info(f"✓ Successfully saved item_metadata.parquet to: {SAVE_DIR.resolve()}")
except Exception as e:
    logger.error(f"✗ Failed to save item_metadata.parquet: {e}")
    raise

# --- END OF THE CODE TO ADD ---

# You can add a verification step to see the output.
print("\nSample of the created item_metadata.parquet:")
print(item_metadata.head())
print(f"\nShape of item_metadata: {item_metadata.shape}")

2025-09-14 14:44:59,107 - INFO - Creating and saving complete item metadata...


NameError: name 'items_df' is not defined

In [9]:

monthly_data.columns.tolist()

['date_block_num',
 'shop_id',
 'item_id',
 'item_category_id',
 'item_cnt_month',
 'item_price',
 'Return']

In [10]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

def create_lag_features(df, group_cols, value_cols, lags=[1, 2, 3]):
    df = df.sort_values(group_cols + ['date_block_num']).copy()
    for col in value_cols:
        for lag in lags:
            lag_col = f"{col}_lag_{lag}"
            df[lag_col] = df.groupby(group_cols)[col].shift(lag)
    return df

def impute_nans_with_median_then_mean(df, cols):
    imputation_values = {}
    for col in cols:
        nan_count = df[col].isna().sum()
        if nan_count > 0:
            median_val = df[col].median()
            if median_val != 0:
                fill_val = median_val
                method = 'median'
            else:
                mean_val = df[col].mean()
                if mean_val != 0:
                    fill_val = mean_val
                    method = 'mean'
                else:
                    fill_val = 0
                    method = 'zero'
            df[col] = df[col].fillna(fill_val).astype(np.float32)
            imputation_values[col] = (fill_val, method)
            print(f"Imputed {nan_count:,} NaN values in '{col}' with {method} value {fill_val:.4f}")
        else:
            imputation_values[col] = (None, None)
    return df, imputation_values

# ---- APPLY FEATURE ENGINEERING ----
monthly_data = pd.read_parquet("lstm_data/monthly_data.parquet")
engineered_df = monthly_data.copy()

print(f"Starting feature engineering on monthly_data with {len(engineered_df):,} rows")

group_cols = ['shop_id', 'item_id']
value_cols = ['item_cnt_month']
lags = [1, 2, 3]

# Clip item_cnt_month
engineered_df['item_cnt_month'] = engineered_df['item_cnt_month'].clip(0, 20)

# Make Return binary
engineered_df['Return'] = (engineered_df['Return'] > 0).astype(np.int8)

# Log-transform item_price with tighter clipping
engineered_df['item_price'] = np.log1p(engineered_df['item_price'].clip(lower=0, upper=5000)).astype(np.float32)

# Create lag features
engineered_df = create_lag_features(engineered_df, group_cols, value_cols, lags)

# Add mean encodings
category_means = engineered_df.groupby(['item_category_id', 'date_block_num'])['item_cnt_month'].mean().reset_index()
category_means.rename(columns={'item_cnt_month': 'item_cnt_month_mean_category'}, inplace=True)
engineered_df = engineered_df.merge(category_means, on=['item_category_id', 'date_block_num'], how='left')

shop_means = engineered_df.groupby(['shop_id', 'date_block_num'])['item_cnt_month'].mean().reset_index()
shop_means.rename(columns={'item_cnt_month': 'item_cnt_month_mean_shop'}, inplace=True)
engineered_df = engineered_df.merge(shop_means, on=['shop_id', 'date_block_num'], how='left')

item_means = engineered_df.groupby(['item_id', 'date_block_num'])['item_cnt_month'].mean().reset_index()
item_means.rename(columns={'item_cnt_month': 'item_cnt_month_mean_item'}, inplace=True)
engineered_df = engineered_df.merge(item_means, on=['item_id', 'date_block_num'], how='left')

# Impute missing values
features_to_impute = [
    'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
    'item_cnt_month_mean_category', 'item_cnt_month_mean_shop', 'item_cnt_month_mean_item',
    'item_price'
]
engineered_df, imputation_info = impute_nans_with_median_then_mean(engineered_df, features_to_impute)

# Save
engineered_df.to_parquet("lstm_data/engineered_df.parquet", index=False)

# Verify item_price
print("\nItem_price statistics in engineered_df:")
print(engineered_df['item_price'].describe())

# Preview
print("\nSample of engineered DataFrame:")
print(engineered_df[['shop_id', 'item_id', 'date_block_num', 'item_category_id', 'item_cnt_month', 'Return', 'item_price'] + features_to_impute].head(10))
print(f"Engineered data shape: {engineered_df.shape}")

Starting feature engineering on monthly_data with 1,601,409 rows
Imputed 418,619 NaN values in 'item_cnt_month_lag_1' with median value 1.0000
Imputed 694,941 NaN values in 'item_cnt_month_lag_2' with median value 1.0000
Imputed 890,881 NaN values in 'item_cnt_month_lag_3' with median value 1.0000

Item_price statistics in engineered_df:
count    1.601409e+06
mean     6.066872e+00
std      9.899732e-01
min      8.617770e-02
25%      5.298317e+00
50%      5.991465e+00
75%      6.789816e+00
max      8.517393e+00
Name: item_price, dtype: float64

Sample of engineered DataFrame:
  shop_id item_id  date_block_num  item_category_id  item_cnt_month  Return  \
0       0    1000               0                67             5.0       0   
1       0    1000               1                67             4.0       0   
2       0   10004               1                40             1.0       0   
3       0    1001               0                67             2.0       0   
4       0   10012      

In [11]:
engineered_df.columns.tolist()

['date_block_num',
 'shop_id',
 'item_id',
 'item_category_id',
 'item_cnt_month',
 'item_price',
 'Return',
 'item_cnt_month_lag_1',
 'item_cnt_month_lag_2',
 'item_cnt_month_lag_3',
 'item_cnt_month_mean_category',
 'item_cnt_month_mean_shop',
 'item_cnt_month_mean_item']

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
import joblib

# ---- COPY AND PREP ----
scaled_df = engineered_df.copy()

# Ensure correct data types
scaled_df['shop_id'] = scaled_df['shop_id'].astype('int16')
scaled_df['item_id'] = scaled_df['item_id'].astype('int32')
scaled_df['date_block_num'] = scaled_df['date_block_num'].astype('int8')
scaled_df['item_category_id'] = scaled_df['item_category_id'].astype('int32')
scaled_df['Return'] = scaled_df['Return'].astype('int8')

# ---- DEFINE COLUMNS ----
numerical_cols = [
    'item_cnt_month',
    'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
    'item_cnt_month_mean_shop', 'item_cnt_month_mean_item', 'item_cnt_month_mean_category'
]
price_col = ['item_price']
categorical_cols = ['shop_id', 'item_id', 'item_category_id']

# Verify columns exist
missing_numerical = [col for col in numerical_cols + price_col if col not in scaled_df.columns]
missing_categorical = [col for col in categorical_cols if col not in scaled_df.columns]
if missing_numerical:
    print(f"Warning: Missing numerical columns: {missing_numerical}")
if missing_categorical:
    print(f"Warning: Missing categorical columns: {missing_categorical}")

# ---- CHRONOLOGICAL SPLIT FOR SCALING/ENCODING ----
train_df = scaled_df[scaled_df['date_block_num'] <= 26].copy()
val_df = scaled_df[(scaled_df['date_block_num'] > 26) & (scaled_df['date_block_num'] <= 29)].copy()
test_df = scaled_df[scaled_df['date_block_num'] > 29].copy()

# ---- APPLY STANDARD SCALING TO NUMERICAL COLUMNS ----
scaler = StandardScaler()
scaler.fit(train_df[numerical_cols])  # Fit only on training data
train_df[numerical_cols] = scaler.transform(train_df[numerical_cols]).astype(np.float32)
val_df[numerical_cols] = scaler.transform(val_df[numerical_cols]).astype(np.float32)
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols]).astype(np.float32)

# Save scaler
joblib.dump(scaler, 'lstm_data/scaler.joblib')

# ---- APPLY ROBUST SCALING TO item_price ----
price_scaler = RobustScaler()
price_scaler.fit(train_df[price_col])  # Fit only on training data
train_df['item_price'] = price_scaler.transform(train_df[price_col]).astype(np.float32)
val_df['item_price'] = price_scaler.transform(val_df[price_col]).astype(np.float32)
test_df['item_price'] = price_scaler.transform(test_df[price_col]).astype(np.float32)

# Save price scaler
joblib.dump(price_scaler, 'lstm_data/price_scaler.joblib')

# Mean encoding for shop_id, item_id, item_category_id
for col in categorical_cols:
    mean_encoded = train_df.groupby(col)['item_cnt_month'].mean().to_dict()
    train_df[f'{col}_mean_encode'] = train_df[col].map(mean_encoded).astype(np.float32)
    val_df[f'{col}_mean_encode'] = val_df[col].map(mean_encoded).fillna(train_df['item_cnt_month'].mean()).astype(np.float32)
    test_df[f'{col}_mean_encode'] = test_df[col].map(mean_encoded).fillna(train_df['item_cnt_month'].mean()).astype(np.float32)


scaled_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

# ---- SORT BY date_block_num, shop_id, item_id ----
scaled_df = scaled_df.sort_values(['date_block_num', 'shop_id', 'item_id']).reset_index(drop=True)

# ---- SAVE OUTPUT ----
scaled_df.to_parquet("lstm_data/scaled_df.parquet", index=False)

# ---- VERIFICATION ----
print("Return value counts:")
print(scaled_df['Return'].value_counts())
print("\nDate_block_num value counts (first 10):")
print(scaled_df['date_block_num'].value_counts().sort_index().head(10))
print("\nItem_price statistics in scaled_df:")
print(scaled_df['item_price'].describe())

# ---- PREVIEW OUTPUT ----
print("\nSample of scaled_df after scaling and encoding:")
preview_cols = ['shop_id', 'item_id', 'date_block_num', 'item_category_id', 'item_cnt_month', 'Return', 'item_price', 'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3', 'item_cnt_month_mean_shop', 'item_cnt_month_mean_item', 'item_cnt_month_mean_category', 'shop_id_mean_encode', 'item_id_mean_encode', 'item_category_id_mean_encode']
print(scaled_df[preview_cols].head(10))

print("\nData shape:")
print(f"scaled_df: {scaled_df.shape}")

Return value counts:
Return
0    1594181
1       7228
Name: count, dtype: int64

Date_block_num value counts (first 10):
date_block_num
0    62238
1    59132
2    63302
3    54637
4    53296
5    56196
6    58035
7    58022
8    51575
9    50463
Name: count, dtype: int64

Item_price statistics in scaled_df:
count    1.601409e+06
mean     1.507174e-01
std      7.141147e-01
min     -4.163441e+00
25%     -4.036773e-01
50%      9.632267e-02
75%      6.722114e-01
max      1.918395e+00
Name: item_price, dtype: float64

Sample of scaled_df after scaling and encoding:
   shop_id  item_id  date_block_num  item_category_id  item_cnt_month  Return  \
0        0       32               0                40        1.531888       0   
1        0       33               0                37        0.371638       0   
2        0       35               0                40       -0.401862       0   
3        0       43               0                40       -0.401862       0   
4        0       51         

In [13]:
scaled_df.columns.tolist()

['date_block_num',
 'shop_id',
 'item_id',
 'item_category_id',
 'item_cnt_month',
 'item_price',
 'Return',
 'item_cnt_month_lag_1',
 'item_cnt_month_lag_2',
 'item_cnt_month_lag_3',
 'item_cnt_month_mean_category',
 'item_cnt_month_mean_shop',
 'item_cnt_month_mean_item',
 'shop_id_mean_encode',
 'item_id_mean_encode',
 'item_category_id_mean_encode']

In [26]:

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
import joblib
import polars as pl
import logging
import sys
import traceback
from datetime import datetime
import os

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configuration
SAVE_DIR = Path('/Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/modelnotebook/training_data')
SAVE_DIR.mkdir(exist_ok=True)

SEQUENCE_LENGTH = 3
FEATURE_COLS = [
    'item_cnt_month', 'item_price', 'Return',
    'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
    'item_cnt_month_mean_category', 'item_cnt_month_mean_shop', 'item_cnt_month_mean_item',
    'shop_id_mean_encode', 'item_id_mean_encode', 'item_category_id_mean_encode'
]
TARGET_COL = 'item_cnt_month'
DATE_COL = 'date_block_num'
GROUP_COLS = ['shop_id', 'item_id']

def create_lag_features(df):
    logger.info("Creating lag features...")
    df = df.sort_values([DATE_COL] + GROUP_COLS).copy()
    for lag in [1, 2, 3]:
        df[f'{TARGET_COL}_lag_{lag}'] = df.groupby(GROUP_COLS)[TARGET_COL].shift(lag)
    return df

def create_mean_encoded_features(df):
    logger.info("Creating mean-encoded features...")
    df['item_cnt_month_mean_item'] = df.groupby([DATE_COL, 'item_id'])[TARGET_COL].transform('mean')
    df['item_cnt_month_mean_shop'] = df.groupby([DATE_COL, 'shop_id'])[TARGET_COL].transform('mean')
    df['item_cnt_month_mean_category'] = df.groupby([DATE_COL, 'item_category_id'])[TARGET_COL].transform('mean')
    df['shop_id_mean_encode'] = df.groupby('shop_id')[TARGET_COL].transform('mean')
    df['item_id_mean_encode'] = df.groupby('item_id')[TARGET_COL].transform('mean')
    df['item_category_id_mean_encode'] = df.groupby('item_category_id')[TARGET_COL].transform('mean')
    return df

def create_sequences(df):
    logger.info("Creating sequences...")
    missing_cols = [col for col in FEATURE_COLS if col not in df.columns]
    if missing_cols:
        logger.error(f"Missing columns in dataframe: {missing_cols}")
        raise KeyError(f"Missing required columns: {missing_cols}")
    
    sequences, targets, target_dates, identifiers = [], [], [], []
    grouped = df.groupby(GROUP_COLS)
    for (shop_id, item_id), group in grouped:
        group_data = group[FEATURE_COLS].values
        group_target = group[TARGET_COL].values
        group_dates = group[DATE_COL].values
        for i in range(len(group) - SEQUENCE_LENGTH):
            sequences.append(group_data[i:i + SEQUENCE_LENGTH])
            targets.append(group_target[i + SEQUENCE_LENGTH])
            target_dates.append(group_dates[i + SEQUENCE_LENGTH])
            identifiers.append({'shop_id': shop_id, 'item_id': item_id})
    
    if not sequences:
        logger.warning("No sequences created.")
        return np.empty((0, SEQUENCE_LENGTH, len(FEATURE_COLS))), np.array([]), np.array([]), []
    
    logger.info(f"Created {len(sequences):,} sequences")
    return np.array(sequences, dtype=np.float32), np.array(targets, dtype=np.float32), np.array(target_dates, dtype=np.int32), identifiers

def time_based_split(sequences, targets, target_dates, identifiers, train_end=26, val_end=29):
    logger.info("Splitting data...")
    train_idx = target_dates <= train_end
    val_idx = (target_dates > train_end) & (target_dates <= val_end)
    test_idx = target_dates > val_end

    logger.info(f"Train: {train_idx.sum():,}, Val: {val_idx.sum():,}, Test: {test_idx.sum():,}")
    return (
        (sequences[train_idx], targets[train_idx], [identifiers[i] for i in range(len(identifiers)) if train_idx[i]]),
        (sequences[val_idx], targets[val_idx], [identifiers[i] for i in range(len(identifiers)) if val_idx[i]]),
        (sequences[test_idx], targets[test_idx], [identifiers[i] for i in range(len(identifiers)) if test_idx[i]])
    )

def save_sequences(X, y, prefix):
    if len(X) == 0:
        logger.warning(f"{prefix} set is empty")
        return
    
    X_flat = X.reshape(X.shape[0], -1)
    feature_names = [f"{feat}_t{t}" for t in range(SEQUENCE_LENGTH) for feat in FEATURE_COLS]
    
    try:
        pl.DataFrame(X_flat, schema=feature_names).write_parquet(os.path.join(SAVE_DIR, f"{prefix}_X.parquet"))
        pl.DataFrame(y, schema=[TARGET_COL]).write_parquet(os.path.join(SAVE_DIR, f"{prefix}_y.parquet"))
        logger.info(f"Saved {prefix} sequences to {SAVE_DIR}/{prefix}_[X|y].parquet")
    except Exception as e:
        logger.error(f"Failed to save {prefix} sequences: {e}")
        raise

def main(scaled_df):
    try:
        original_df = scaled_df.copy()
        logger.info(f"Original data shape: {original_df.shape[0]}")
    except NameError:
        logger.error("Error: scaled_df is not defined. Please provide the DataFrame.")
        raise

    # Verify required columns
    required_cols = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'item_cnt_month', 'item_price']
    missing_cols = [col for col in required_cols if col not in original_df.columns]
    if missing_cols:
        logger.error(f"Missing columns in scaled_df: {missing_cols}")
        raise KeyError(f"Missing required columns: {missing_cols}")

    # Feature engineering
    logger.info("Performing feature engineering...")
    engineered_df = original_df.copy()

    # Add Return feature if missing
    if 'Return' not in engineered_df.columns:
        engineered_df['Return'] = (engineered_df[TARGET_COL] < 0).astype(np.int8)
        logger.info("Created Return feature")

    # Clip target
    engineered_df[TARGET_COL] = engineered_df[TARGET_COL].clip(0, 20)

    # Create lag features if missing
    lag_cols = [f'item_cnt_month_lag_{i}' for i in [1, 2, 3]]
    if not all(col in engineered_df.columns for col in lag_cols):
        engineered_df = create_lag_features(engineered_df)
        logger.info("Created lag features")

    # Create mean-encoded features if missing
    mean_cols = [
        'item_cnt_month_mean_item', 'item_cnt_month_mean_shop', 'item_cnt_month_mean_category',
        'shop_id_mean_encode', 'item_id_mean_encode', 'item_category_id_mean_encode'
    ]
    if not all(col in engineered_df.columns for col in mean_cols):
        engineered_df = create_mean_encoded_features(engineered_df)
        logger.info("Created mean-encoded features")

    # Fill NaNs
    engineered_df = engineered_df.fillna(0)

    # Prepare data (filter groups with enough data)
    logger.info("Preparing data...")
    min_required = SEQUENCE_LENGTH + 1
    item_counts = engineered_df.groupby(GROUP_COLS)[DATE_COL].count()
    valid_items = item_counts[item_counts >= min_required].index
    prepared_df = engineered_df[engineered_df.set_index(GROUP_COLS).index.isin(valid_items)]
    logger.info(f"Prepared data shape: {prepared_df.shape[0]:,}, valid groups: {len(valid_items):,}")

    # Scaling
    logger.info("Scaling target and feature variables...")
    feature_cols_to_scale = [
        'item_price', 'Return', 'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
        'item_cnt_month_mean_shop', 'item_cnt_month_mean_item', 'item_cnt_month_mean_category',
        'shop_id_mean_encode', 'item_id_mean_encode', 'item_category_id_mean_encode'
    ]
    
    target_scaler = StandardScaler()
    feature_scaler = StandardScaler()
    
    prepared_df[TARGET_COL] = target_scaler.fit_transform(prepared_df[[TARGET_COL]])
    prepared_df[feature_cols_to_scale] = feature_scaler.fit_transform(prepared_df[feature_cols_to_scale])

    # Save scalers
    joblib.dump(target_scaler, SAVE_DIR / 'target_scaler.pkl')
    joblib.dump(feature_scaler, SAVE_DIR / 'feature_scaler.pkl')
    logger.info(f"✓ Saved target_scaler.pkl and feature_scaler.pkl")

    # Create sequences
    all_sequences, all_targets, target_dates, identifiers = create_sequences(prepared_df)

    # Time-based splitting
    (X_train, y_train, train_ids), (X_val, y_val, val_ids), (X_test, y_test, test_ids) = time_based_split(
        all_sequences, all_targets, target_dates, identifiers
    )

    # Save sequences
    save_sequences(X_train, y_train, 'train')
    save_sequences(X_val, y_val, 'val')
    save_sequences(X_test, y_test, 'test')

    # Save test_X_with_ids.parquet
    if test_ids:
        test_ids_df = pd.DataFrame(test_ids)
        test_ids_df.to_parquet(SAVE_DIR / 'test_X_with_ids.parquet', index=False)
        logger.info(f"✓ Saved test_X_with_ids.parquet")
    else:
        logger.warning("Test set is empty; skipping test_X_with_ids.parquet")

    # Create and save item_metadata.parquet
    logger.info("Creating and saving item_metadata.parquet...")
    item_metadata = prepared_df.groupby('item_id').agg({
        'item_price': 'mean',
        'item_category_id': 'first'
    }).reset_index()
    item_metadata['item_category_name'] = item_metadata['item_category_id'].map(
        {i: f"Category_{i}" for i in item_metadata['item_category_id'].unique()}
    )
    item_metadata.to_parquet(SAVE_DIR / 'item_metadata.parquet', index=False)
    logger.info(f"✓ Saved item_metadata.parquet")

    # Log data shapes and date ranges
    logger.info("\n=== Data Shapes ===")
    logger.info(f"Original: {original_df.shape}")
    logger.info(f"Prepared: {prepared_df.shape}")
    logger.info(f"Train: {X_train.shape}, y: {y_train.shape}")
    logger.info(f"Val: {X_val.shape}, y: {y_val.shape}")
    logger.info(f"Test: {X_test.shape}, y: {y_test.shape}")

    logger.info("\nDate ranges:")
    logger.info(f"Train: {target_dates[target_dates <= 26].min() if target_dates.size > 0 else 'N/A'} to {target_dates[target_dates <= 26].max() if target_dates.size > 0 else 'N/A'}")
    logger.info(f"Val: {target_dates[(target_dates > 26) & (target_dates <= 29)].min() if target_dates[(target_dates > 26) & (target_dates <= 29)].size > 0 else 'N/A'} to {target_dates[(target_dates > 26) & (target_dates <= 29)].max() if target_dates[(target_dates > 26) & (target_dates <= 29)].size > 0 else 'N/A'}")
    logger.info(f"Test: {target_dates[target_dates > 29].min() if target_dates[target_dates > 29].size > 0 else 'N/A'} to {target_dates[target_dates > 29].max() if target_dates[target_dates > 29].size > 0 else 'N/A'}")

    logger.info("\nSaved feature columns:")
    logger.info([f"{feat}_t{t}" for t in range(SEQUENCE_LENGTH) for feat in FEATURE_COLS])

    # Verify saved files
    for prefix in ['train', 'val', 'test']:
        x_path = SAVE_DIR / f"{prefix}_X.parquet"
        y_path = SAVE_DIR / f"{prefix}_y.parquet"
        if x_path.exists() and y_path.exists():
            logger.info(f"✓ Verified: {x_path.name} and {y_path.name} exist")
        else:
            logger.error(f"✗ Missing: {x_path.name} or {y_path.name}")
    for f in ['feature_scaler.pkl', 'target_scaler.pkl', 'test_X_with_ids.parquet', 'item_metadata.parquet']:
        if (SAVE_DIR / f).exists():
            logger.info(f"✓ Verified: {f} exists")
        else:
            logger.error(f"✗ Missing: {f}")

if __name__ == "__main__":
    try:
        # If running standalone, scaled_df must be defined
        main(scaled_df)
    except NameError:
        logger.error("scaled_df is not defined. Please provide the DataFrame when calling main().")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Preprocessing failed: {str(e)}")
        logger.error(traceback.format_exc())
        sys.exit(1)


2025-09-13 19:07:39,387 - INFO - Original data shape: 1601409
2025-09-13 19:07:39,390 - INFO - Performing feature engineering...
2025-09-13 19:07:39,486 - INFO - Preparing data...
2025-09-13 19:07:39,812 - INFO - Prepared data shape: 1,146,719, valid groups: 145,397
2025-09-13 19:07:39,813 - INFO - Scaling target and feature variables...
2025-09-13 19:07:39,961 - INFO - ✓ Saved target_scaler.pkl and feature_scaler.pkl
2025-09-13 19:07:39,962 - INFO - Creating sequences...
2025-09-13 19:08:32,341 - INFO - Created 710,528 sequences
2025-09-13 19:08:33,128 - INFO - Splitting data...
2025-09-13 19:08:33,134 - INFO - Train: 573,640, Val: 59,474, Test: 77,414
2025-09-13 19:08:33,971 - INFO - Saved train sequences to /Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/modelnotebook/training_data/train_[X|y].parquet
2025-09-13 19:08:34,027 - INFO - Saved val sequences to /Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/modelnotebook/training_data/val_[X|y].parquet


In [29]:

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import polars as pl
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import sys
import traceback
from datetime import datetime, timedelta
import joblib

# Setup logging
log_dir = Path('/Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/logs')
log_dir.mkdir(exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_dir / f'run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
logger.info(f"Using device: {device}")

class SalesDataset(Dataset):
    def __init__(self, X_file, y_file, sequence_length=3):
        if not Path(X_file).exists() or not Path(y_file).exists():
            raise FileNotFoundError(f"Data files not found: {X_file}, {y_file}")
        self.X = pl.read_parquet(X_file)
        feature_cols = [
            'item_cnt_month', 'item_price', 'Return',
            'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
            'item_cnt_month_mean_shop', 'item_cnt_month_mean_item', 'item_cnt_month_mean_category',
            'shop_id_mean_encode', 'item_id_mean_encode', 'item_category_id_mean_encode'
        ]
        input_cols = [f"{feat}_t{t}" for t in range(sequence_length) for feat in feature_cols]
        available_cols = self.X.columns
        missing_cols = [col for col in input_cols if col not in available_cols]
        if missing_cols:
            raise ValueError(f"Missing columns in {X_file}: {missing_cols}")
        self.y = pl.read_parquet(y_file).to_numpy().astype(np.float32).reshape(-1, 1)
        self.indices = np.arange(len(self.X))
        if len(self.X) != len(self.y):
            raise ValueError(f"X and y length mismatch: {len(self.X)} vs {len(self.y)}")
        self.sequence_length = sequence_length
        self.numerical_cols = input_cols
        numerical_data = self.X.select(self.numerical_cols).to_numpy().astype(np.float32)
        numerical_data = np.nan_to_num(numerical_data, nan=0.0)
        self.y = np.nan_to_num(self.y, nan=0.0)
        numerical_data = numerical_data.reshape(len(self.X), sequence_length, len(feature_cols))
        self.numerical = numerical_data
        self.identifiers = self.indices
        self.dates = [datetime(2013, 1, 1) + timedelta(days=30 * (int(idx) % 120)) for idx in self.indices]

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        numerical = torch.tensor(self.numerical[idx], dtype=torch.float32)
        target = torch.tensor(self.y[idx], dtype=torch.float32)
        identifiers = torch.tensor([self.identifiers[idx]], dtype=torch.int32)
        dates = self.dates[idx]
        return {'numerical': numerical, 'target': target, 'identifiers': identifiers, 'dates': dates}

class HALSTM(nn.Module):
    def __init__(self, numerical_dim=12, hidden_dim=128, num_layers=2, num_heads=4, dropout=0.3, l2_lambda=0.01):
        super(HALSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.l2_lambda = l2_lambda
        self.input_dim = numerical_dim
        self.lstm = nn.LSTM(self.input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.lstm_norm = nn.LayerNorm(hidden_dim)
        self.mha = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)
        self.mha_norm = nn.LayerNorm(hidden_dim)
        self.gate = nn.Linear(hidden_dim * 2, hidden_dim)
        self.sigmoid = nn.Sigmoid()
        self.fc_shared = nn.Linear(hidden_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.positional_encoding = torch.zeros(3, hidden_dim, device=device)
        position = torch.arange(0, 3, dtype=torch.float, device=device).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_dim, 2, device=device).float() * (-torch.log(torch.tensor(10000.0, device=device)) / hidden_dim))
        self.positional_encoding[:, 0::2] = torch.sin(position * div_term)
        self.positional_encoding[:, 1::2] = torch.cos(position * div_term)
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None: nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LSTM):
                for name, param in m.named_parameters():
                    if 'weight' in name: nn.init.xavier_normal_(param)
                    elif 'bias' in name: nn.init.constant_(param, 0)

    def forward(self, numerical):
        batch_size, seq_len, _ = numerical.size()
        x = self.dropout(numerical)
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=x.device)
        lstm_out, _ = self.lstm(x, (h0, c0))
        lstm_out = self.lstm_norm(lstm_out)
        lstm_out = lstm_out + self.positional_encoding[:seq_len, :].unsqueeze(0)
        mha_out, mha_weights = self.mha(lstm_out, lstm_out, lstm_out)
        mha_out = self.mha_norm(mha_out)
        combined = torch.cat([lstm_out[:, -1, :], mha_out[:, -1, :]], dim=-1)
        gate_val = self.sigmoid(self.gate(combined))
        fused = gate_val * lstm_out[:, -1, :] + (1 - gate_val) * mha_out[:, -1, :]
        shared = self.relu(self.fc_shared(fused))
        output = self.fc_out(shared)
        return output, {'mha_weights': mha_weights, 'gate_weights': gate_val}

def collate_fn(batch):
    if not batch: return {}
    return {
        'numerical': torch.stack([item['numerical'] for item in batch]),
        'target': torch.stack([item['target'] for item in batch]),
        'identifiers': torch.stack([item['identifiers'] for item in batch]),
        'dates': [item['dates'] for item in batch]
    }

def train_model(model, train_loader, val_loader, num_epochs=15, lr=0.0005, accum_steps=2):
    criterion_mse = nn.MSELoss().to(device)
    criterion_mae = nn.L1Loss().to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=model.l2_lambda)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr, epochs=num_epochs, steps_per_epoch=len(train_loader)//accum_steps)
    scaler = torch.cuda.amp.GradScaler() if device.type in ['cuda', 'mps'] else None
    
    metrics = {'epoch': [], 'train_mse': [], 'train_rmse': [], 'train_mae': [], 'val_mse': [], 'val_rmse': [], 'val_mae': []}
    best_val_loss = float('inf')
    output_dir = Path('/Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/results')
    output_dir.mkdir(exist_ok=True)
    patience = 5
    epochs_no_improve = 0
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_mse = 0
        train_mae = 0
        optimizer.zero_grad()
        
        for batch_idx, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}")):
            numerical = batch['numerical'].to(device)
            target = batch['target'].to(device)
            
            if scaler:
                with torch.cuda.amp.autocast():
                    output, _ = model(numerical)
                    mse_loss = criterion_mse(output, target) / accum_steps
                    mae_loss = criterion_mae(output, target) / accum_steps
                    loss = mse_loss
                scaler.scale(loss).backward()
                if (batch_idx + 1) % accum_steps == 0:
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
                    scaler.step(optimizer)
                    scheduler.step()
                    scaler.update()
                    optimizer.zero_grad()
            else:
                output, _ = model(numerical)
                mse_loss = criterion_mse(output, target) / accum_steps
                mae_loss = criterion_mae(output, target) / accum_steps
                loss = mse_loss
                loss.backward()
                if (batch_idx + 1) % accum_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
            
            train_loss += mse_loss.item() * accum_steps
            train_mse += mse_loss.item() * accum_steps
            train_mae += mae_loss.item() * accum_steps
        
        train_loss /= len(train_loader)
        train_mse /= len(train_loader)
        train_rmse = np.sqrt(train_mse)
        train_mae /= len(train_loader)
        
        model.eval()
        val_loss = 0
        val_mse = 0
        val_mae = 0
        with torch.no_grad():
            for batch in val_loader:
                numerical = batch['numerical'].to(device)
                target = batch['target'].to(device)
                output, _ = model(numerical)
                mse_loss = criterion_mse(output, target)
                mae_loss = criterion_mae(output, target)
                val_loss += mse_loss.item()
                val_mse += mse_loss.item()
                val_mae += mae_loss.item()
        
        val_loss /= len(val_loader)
        val_mse /= len(val_loader)
        val_rmse = np.sqrt(val_mse)
        val_mae /= len(val_loader)
        
        metrics['epoch'].append(epoch + 1)
        metrics['train_mse'].append(train_mse)
        metrics['train_rmse'].append(train_rmse)
        metrics['train_mae'].append(train_mae)
        metrics['val_mse'].append(val_mse)
        metrics['val_rmse'].append(val_rmse)
        metrics['val_mae'].append(val_mae)
        logger.info(f"Epoch {epoch+1}, Train MSE: {train_mse:.4f}, RMSE: {train_rmse:.4f}, MAE: {train_mae:.4f}, "
                    f"Val MSE: {val_mse:.4f}, RMSE: {val_rmse:.4f}, MAE: {val_mae:.4f}")
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), output_dir / 'best_ha_lstm.pth')
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                logger.info(f"Early stopping triggered after {epoch+1} epochs")
                break
    
    metrics_df = pd.DataFrame(metrics)
    metrics_df.to_csv(output_dir / 'training_metrics.csv', index=False)
    
    plt.figure(figsize=(10, 6))
    plt.subplot(1, 2, 1)
    plt.plot(metrics['epoch'], metrics['train_rmse'], label='Train RMSE')
    plt.plot(metrics['epoch'], metrics['val_rmse'], label='Val RMSE')
    plt.xlabel('Epoch')
    plt.ylabel('RMSE')
    plt.title('Training and Validation RMSE')
    plt.legend()
    
    plt.subplot(1, 2, 2)
    plt.plot(metrics['epoch'], metrics['train_mae'], label='Train MAE')
    plt.plot(metrics['epoch'], metrics['val_mae'], label='Val MAE')
    plt.xlabel('Epoch')
    plt.ylabel('MAE')
    plt.title('Training and Validation MAE')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig(output_dir / 'metrics_plot.png')
    plt.close()
    
    logger.info(f"Loading best model from epoch with loss: {best_val_loss:.4f}")
    model.load_state_dict(torch.load(output_dir / 'best_ha_lstm.pth'))
    return model, metrics_df

def predict(model, test_loader, test_df_with_ids, item_metadata_df, shops_df, item_category_dict, scaler, feature_scaler=None):
    model.eval()
    predictions = []
    interpret_outputs = []
    successful_shop_lookups = 0
    successful_item_lookups = 0
    successful_price_lookups = 0
    total_samples = 0
    negative_price_count = 0
    
    item_metadata_dict = item_metadata_df.set_index('item_id').to_dict('index') if not item_metadata_df.empty else {}
    
    # Use placeholder shop names if shops_df is unavailable
    if shops_df is None or shops_df.empty:
        logger.warning("shops_df is unavailable. Using placeholder shop names.")
        shop_metadata_dict = {i: f"Shop_{i}" for i in test_df_with_ids['shop_id'].unique()}
    else:
        shop_metadata_dict = shops_df.set_index('shop_id')['shop_name'].to_dict()
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(test_loader, desc="Predicting")):
            numerical = batch['numerical'].to(device)
            identifiers = batch['identifiers']
            dates = batch['dates']
            
            output, attn_dict = model(numerical)
            
            unscaled_preds = scaler.inverse_transform(output.cpu().numpy())
            
            mha_weights = attn_dict['mha_weights'][:, -1, :].cpu().numpy()
            gate_weights = attn_dict['gate_weights'].cpu().numpy()
            
            batch_size = len(unscaled_preds)
            total_samples += batch_size
            
            for i in range(batch_size):
                original_row_index = identifiers[i][0].item()
                row_data = test_df_with_ids.iloc[original_row_index]
                shop_id = int(row_data['shop_id'])
                item_id = int(row_data['item_id'])
                
                predicted_demand = max(0, round(unscaled_preds[i][0]))
                
                category_name = item_category_dict.get(item_id, f"Category_{row_data.get('item_category_id', 'Unknown')}")
                if category_name != 'Unknown':
                    successful_item_lookups += 1
                
                metadata = item_metadata_dict.get(item_id, {})
                unit_price = metadata.get('item_price', 0)
                if unit_price == 0 and feature_scaler is not None:
                    numerical_i = numerical[i].cpu().numpy()
                    prices = numerical_i[:, 1].reshape(-1, 1)
                    unscaled_prices = feature_scaler.inverse_transform(prices)
                    unit_price = unscaled_prices[-1]
                    if unit_price == 0:
                        non_zero = unscaled_prices[unscaled_prices > 0]
                        unit_price = np.mean(non_zero) if len(non_zero) > 0 else 0
                unit_price = max(0, unit_price)
                if unit_price < 0:
                    negative_price_count += 1
                if unit_price > 0:
                    successful_price_lookups += 1
                
                # FIXED: Lookup shop_name from shop_metadata_dict
                shop_name = shop_metadata_dict.get(shop_id, f"Shop_{shop_id}")
                if shop_name != f"Shop_{shop_id}":
                    successful_shop_lookups += 1
                
                total_sales = unit_price * predicted_demand
                
                pred_dict = {
                    'shop_id': shop_id,
                    'shop_name': shop_name,
                    'item_id': item_id,
                    'item_category_name': category_name,
                    'predicted_product_demand': predicted_demand,
                    'predicted_total_sales': total_sales
                }
                predictions.append(pred_dict)
                
                lstm_reliance_score = np.mean(gate_weights[i])
                attention_t2 = mha_weights[i][0]
                attention_t1 = mha_weights[i][1]
                attention_t0 = mha_weights[i][2]
                
                interpret_outputs.append({
                    'timestamp_reference': dates[i].isoformat(),
                    'shop_id': shop_id,
                    'shop_name': shop_name,
                    'item_id': item_id,
                    'forecasted_value_unscaled': unscaled_preds[i][0],
                    'lstm_trend_reliance': lstm_reliance_score,
                    'attention_t_minus_2': attention_t2,
                    'attention_t_minus_1': attention_t1,
                    'attention_t_minus_0': attention_t0
                })
    
    logger.info(f"Lookup success: Shops {successful_shop_lookups/total_samples*100:.2f}%, Items {successful_item_lookups/total_samples*100:.2f}%, Prices {successful_price_lookups/total_samples*100:.2f}%")
    if negative_price_count > 0:
        logger.warning(f"Encountered {negative_price_count} negative prices before clamping to 0")
    
    pred_df = pd.DataFrame(predictions)
    interpret_df = pd.DataFrame(interpret_outputs)
    
    output_dir = Path('/Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/results')
    pred_df.to_csv(output_dir / 'final_predictions.csv', index=False)
    interpret_df.to_csv(output_dir / 'final_interpretability_outputs.csv', index=False)
    
    logger.info(f"Final predictions saved to {output_dir / 'final_predictions.csv'}")
    
    return pred_df, interpret_df

def generate_dashboard_metrics(pred_df):
    if pred_df.empty:
        logger.warning("Prediction dataframe is empty, cannot generate dashboard metrics.")
        return
    logger.info("Generating dashboard metrics by category...")
    category_summary = pred_df.groupby('item_category_name').agg(
        total_demand=('predicted_product_demand', 'sum'),
        total_revenue=('predicted_total_sales', 'sum')
    ).reset_index().sort_values(by='total_revenue', ascending=False)
    output_dir = Path('/Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/results')
    category_summary.to_csv(output_dir / 'dashboard_category_summary.csv', index=False)
    logger.info(f"Dashboard summary saved to {output_dir / 'dashboard_category_summary.csv'}")

def visualize_results(pred_df, y_test_unscaled):
    output_dir = Path('/Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/results')
    plt.figure(figsize=(12, 5))
    sns.kdeplot(pred_df['predicted_product_demand'], label='Predicted Demand', clip=(0, 20))
    sns.kdeplot(y_test_unscaled, label='Actual Demand', clip=(0, 20))
    plt.title('Prediction vs. Actual Distribution (Unscaled)')
    plt.legend()
    plt.savefig(output_dir / 'prediction_visualization.png')
    plt.close()

def main():
    data_dir = Path('/Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/modelnotebook/training_data')
    raw_data_dir = Path('/Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/raw_data')

    train_X_path = data_dir / 'train_X.parquet'
    train_y_path = data_dir / 'train_y.parquet'
    val_X_path = data_dir / 'val_X.parquet'
    val_y_path = data_dir / 'val_y.parquet'
    test_X_path = data_dir / 'test_X.parquet'
    test_y_path = data_dir / 'test_y.parquet'
    test_X_with_ids_path = data_dir / 'test_X_with_ids.parquet'
    item_metadata_path = data_dir / 'item_metadata.parquet'
    scaler_path = data_dir / 'target_scaler.pkl'
    feature_scaler_path = data_dir / 'feature_scaler.pkl'
    shops_path = data_dir / 'shops.csv'
    items_path = data_dir / 'items.csv'
    categories_path = data_dir / 'item_categories.csv'

    required_files = [
        train_X_path, train_y_path, val_X_path, val_y_path,
        test_X_path, test_y_path, test_X_with_ids_path,
        item_metadata_path, scaler_path, feature_scaler_path,
        shops_path, items_path, categories_path
    ]
    for f in required_files:
        if not f.exists():
            raise FileNotFoundError(f"CRITICAL: Required data file not found at {f}. Please ensure all files are present or run preprocess_data.py.")

    logger.info("Loading prerequisite files for prediction...")
    test_df_with_ids = pd.read_parquet(test_X_with_ids_path)
    item_metadata_df = pd.read_parquet(item_metadata_path)
    scaler = joblib.load(scaler_path)
    logger.info("Loading feature_scaler.pkl for price unscaling.")
    feature_scaler = joblib.load(feature_scaler_path)
    
    logger.info("Loading metadata...")
    shops_df = pd.read_csv(shops_path)
    items_df = pd.read_csv(items_path)
    categories_df = pd.read_csv(categories_path)
    item_category_dict = items_df.merge(categories_df, on='item_category_id').set_index('item_id')['item_category_name'].to_dict()

    batch_size = 128
    num_epochs = 15
    lr = 0.0005
    accum_steps = 2
    
    logger.info("Initializing datasets and dataloaders...")
    train_dataset = SalesDataset(train_X_path, train_y_path)
    val_dataset = SalesDataset(val_X_path, val_y_path)
    test_dataset = SalesDataset(test_X_path, test_y_path)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True, collate_fn=collate_fn)
    
    logger.info("Initializing model...")
    model = HALSTM(numerical_dim=12, hidden_dim=128, num_layers=2, num_heads=4, dropout=0.3, l2_lambda=0.01).to(device)
    
    logger.info("Starting model training...")
    model, metrics_df = train_model(model, train_loader, val_loader, num_epochs, lr, accum_steps)
    
    logger.info("Starting prediction and interpretation...")
    pred_df, interpret_df = predict(model, test_loader, test_df_with_ids, item_metadata_df, shops_df, item_category_dict, scaler, feature_scaler)
    
    generate_dashboard_metrics(pred_df)
 
    y_test_scaled = pl.read_parquet(test_y_path).to_numpy()
    y_test_unscaled = scaler.inverse_transform(y_test_scaled).flatten()
    visualize_results(pred_df, y_test_unscaled)

if __name__ == "__main__":
    try:
        logger.info("Initiating program execution")
        main()
        logger.info("Program executed successfully")
    except Exception as e:
        logger.error(f"Program execution failed: {str(e)}")
        logger.error(traceback.format_exc())
        sys.exit(1)

2025-09-14 14:05:08,845 - INFO - Using device: mps
2025-09-14 14:05:08,850 - INFO - Initiating program execution
2025-09-14 14:05:08,852 - INFO - Loading prerequisite files for prediction...
2025-09-14 14:05:08,962 - INFO - Loading feature_scaler.pkl for price unscaling.
2025-09-14 14:05:08,963 - INFO - Loading metadata...
2025-09-14 14:05:09,029 - INFO - Initializing datasets and dataloaders...
2025-09-14 14:05:10,097 - INFO - Initializing model...
2025-09-14 14:05:10,545 - INFO - Starting model training...
Epoch 1: 100%|██████████| 4482/4482 [01:05<00:00, 68.44it/s]
2025-09-14 14:06:18,478 - INFO - Epoch 1, Train MSE: 0.4659, RMSE: 0.6826, MAE: 0.3685, Val MSE: 0.2638, RMSE: 0.5136, MAE: 0.2594
Epoch 2: 100%|██████████| 4482/4482 [01:05<00:00, 68.35it/s]
2025-09-14 14:07:26,532 - INFO - Epoch 2, Train MSE: 0.3765, RMSE: 0.6136, MAE: 0.2892, Val MSE: 0.2684, RMSE: 0.5180, MAE: 0.2520
Epoch 3: 100%|██████████| 4482/4482 [01:05<00:00, 68.61it/s]
2025-09-14 14:08:34,256 - INFO - Epoch 3,

In [30]:
item_metadata_df = pd.read_parquet('/Users/mohammednihal/Desktop/XAI/Cashflow-and-demand-forecating-/modelnotebook/training_data/item_metadata.parquet')
print(item_metadata_df.head())

   item_id  item_price  item_category_id item_category_name
0        1    2.376879                76        Category_76
1       28   -0.431686                30        Category_30
2       30   -0.661427                40        Category_40
3       31    0.099377                37        Category_37
4       32   -0.822099                40        Category_40
