In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
import torch 
import torch.nn as nn
import warnings
warnings.filterwarnings("ignore")

data_path = pd.read_csv("/Users/mohammednihal/XAI-1/Predict Future Sales/merged_data.csv")
df = data_path

print("First five rows of the data:")
print(df.head())
print("\nData Shape: ")
print(df.shape) 



First five rows of the data:
         date  date_block_num  shop_id  item_id  item_price  item_cnt_day  \
0  02.01.2013               0       59    22154      999.00           1.0   
1  03.01.2013               0       25     2552      899.00           1.0   
2  05.01.2013               0       25     2552      899.00          -1.0   
3  06.01.2013               0       25     2554     1709.05           1.0   
4  15.01.2013               0       25     2555     1099.00           1.0   

                                  item_name  item_category_id  \
0                    ANNOUNCEMENT 2012 (BD)                37   
1  DEEP PURPLE  The House Of Blue Light  LP                58   
2  DEEP PURPLE  The House Of Blue Light  LP                58   
3  DEEP PURPLE  Who Do You Think We Are  LP                58   
4            DEEP PURPE 30 Very Best Of 2CD                56   

                 item_category_name            shop_name  
0                 Cinema - Blue-Ray  Jaroslavl TC Altair  

In [2]:
def data_preprocessing(df):
    expected_columns = ['date', 'date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_day',
                       'item_name', 'item_category_id', 'item_category_name', 'shop_name']
    if not all(col in df.columns for col in expected_columns):
        missing = set(expected_columns) - set(df.columns)
        raise ValueError(f"Missing columns: {missing}")
    
    print(f"Preprocessing dataset with {len(df):,} records")
    
    cleaned_df = df.copy()
    # Initial checks
    print("Initial unique counts:")
    print(f"item_id: {cleaned_df['item_id'].nunique()}")
    print(f"item_name: {cleaned_df['item_name'].nunique()}")
    print("item_name nulls before imputation:", cleaned_df['item_name'].isna().sum())
    print("item_id with 'Unknown' item_name:", (cleaned_df['item_name'] == 'Unknown').sum())
    print("Unique item_name per item_id:")
    name_counts = cleaned_df.groupby('item_id')['item_name'].nunique()
    print(name_counts.value_counts())
    if (name_counts > 1).any():
        print("item_id with multiple item_name:", name_counts[name_counts > 1].index.tolist())

    cleaned_df['item_price'] = cleaned_df['item_price'].astype(np.float32)
    cleaned_df['item_cnt_day'] = cleaned_df['item_cnt_day'].astype(np.float32)
    
    print("Initial null counts:")
    for col in cleaned_df.columns:
        nulls = cleaned_df[col].isna().sum()
        print(f"Column '{col}': {nulls:,} nulls ({nulls/len(cleaned_df)*100:.2f}%)")
    
    cleaned_df['Return'] = cleaned_df['item_cnt_day'].where(cleaned_df['item_cnt_day'] < 0, 0).abs().astype(np.float32)
    cleaned_df['item_cnt_day'] = cleaned_df['item_cnt_day'].clip(lower=0)
    print(f"Created Return column. Total Returns: {cleaned_df['Return'].sum():,}")
    
    cleaned_df['date'] = pd.to_datetime(cleaned_df['date'], format='%d.%m.%Y')
    print("Converted date column to datetime format")
    
    # Handle item_name
    if 'item_name' in cleaned_df.columns and cleaned_df['item_name'].isna().any():
        item_name_nulls = cleaned_df['item_name'].isna().sum()
        cleaned_df['item_name'] = cleaned_df['item_name'].fillna('Unknown')
        print(f"Imputed {item_name_nulls:,} missing item names with 'Unknown'")
    
    # Ensure string types
    cleaned_df['shop_id'] = cleaned_df['shop_id'].astype(str)
    cleaned_df['item_id'] = cleaned_df['item_id'].astype(str)
    cleaned_df['item_name'] = cleaned_df['item_name'].astype(str)
    
    # Fix multiple item_name per item_id
    name_counts = cleaned_df.groupby('item_id')['item_name'].nunique()
    if (name_counts > 1).any():
        print(f"Warning: {name_counts[name_counts > 1].count()} item_id(s) have multiple item_name values. Taking most frequent.")
        most_frequent = cleaned_df.groupby('item_id')['item_name'].agg(lambda x: x.mode()[0]).reset_index()
        cleaned_df = cleaned_df.drop(columns='item_name').merge(most_frequent, on='item_id', how='left')
    
    shop_stats = cleaned_df.groupby('shop_id').size().reset_index(name='count')
    shops_to_remove = shop_stats.nsmallest(6, 'count')['shop_id'].tolist()
    remove_records = cleaned_df[cleaned_df['shop_id'].isin(shops_to_remove)].shape[0]
    cleaned_df = cleaned_df[~cleaned_df['shop_id'].isin(shops_to_remove)]
    print(f"Selected {cleaned_df['shop_id'].nunique()} shops after removing {shops_to_remove}")
    print(f"Removed {remove_records:,} records from shop selection")
    
    cleaned_df = cleaned_df.sort_values(['shop_id', 'item_id', 'date'])
    
    def winsorize_with_rolling_stats(group):
        group = group.set_index('date').sort_index()
        rolling_mean = group['item_cnt_day'].rolling(window='30D', min_periods=1).mean()
        rolling_std = group['item_cnt_day'].rolling(window='30D', min_periods=1).std()
        upper_3sigma = rolling_mean + 3 * rolling_std
        
        outliers = group['item_cnt_day'] > upper_3sigma
        outlier_count = outliers.sum()
        
        winsor_limit = group['item_cnt_day'].quantile(0.99)
        clipped = group['item_cnt_day'].clip(upper=winsor_limit)
        clipped_count = (group['item_cnt_day'] > winsor_limit).sum()
        
        return clipped.reset_index(drop=True), outlier_count, clipped_count
        
    total_records = len(cleaned_df)
    item_cnt_day_winsorized = []
    total_outlier = 0
    total_clipped = 0
    
    for (shop_id, item_id), group in cleaned_df.groupby(['shop_id', 'item_id']):
        clipped_series, outliers, clipped = winsorize_with_rolling_stats(group.copy())
        item_cnt_day_winsorized.extend(clipped_series)
        total_outlier += outliers
        total_clipped += clipped
        
    cleaned_df['item_cnt_day'] = item_cnt_day_winsorized
    
    outlier_percentage = (total_outlier / total_records * 100) if total_records > 0 else 0
    clipped_percentage = (total_clipped / total_records * 100) if total_records > 0 else 0

    print("Winsorization applied at 99th percentile for item_cnt_day after 3σ rolling-window detection")
    print(f"Total records: {total_records:,}")
    print(f"Outliers (above 3σ): {total_outlier:,} ({outlier_percentage:.2f}%)")
    print(f"Values clipped (>99th percentile): {total_clipped:,} ({clipped_percentage:.2f}%)")
    
    # Final checks
    print("Final unique counts:")
    print(f"item_id: {cleaned_df['item_id'].nunique()}")
    print(f"item_name: {cleaned_df['item_name'].nunique()}")
    
    return cleaned_df

cleaned_data = data_preprocessing(df)

Preprocessing dataset with 2,935,849 records
Initial unique counts:
item_id: 21807
item_name: 21233
item_name nulls before imputation: 84
item_id with 'Unknown' item_name: 0
Unique item_name per item_id:
item_name
1    21806
0        1
Name: count, dtype: int64
Initial null counts:
Column 'date': 0 nulls (0.00%)
Column 'date_block_num': 0 nulls (0.00%)
Column 'shop_id': 0 nulls (0.00%)
Column 'item_id': 0 nulls (0.00%)
Column 'item_price': 0 nulls (0.00%)
Column 'item_cnt_day': 0 nulls (0.00%)
Column 'item_name': 84 nulls (0.00%)
Column 'item_category_id': 0 nulls (0.00%)
Column 'item_category_name': 0 nulls (0.00%)
Column 'shop_name': 0 nulls (0.00%)
Created Return column. Total Returns: 7,541.0
Converted date column to datetime format
Imputed 84 missing item names with 'Unknown'
Selected 54 shops after removing ['36', '11', '20', '8', '9', '40']
Removed 14,017 records from shop selection
Winsorization applied at 99th percentile for item_cnt_day after 3σ rolling-window detection
Total

In [42]:
cleaned_data.columns.tolist()

['date',
 'date_block_num',
 'shop_id',
 'item_id',
 'item_price',
 'item_cnt_day',
 'item_name',
 'item_category_id',
 'item_category_name',
 'shop_name',
 'Return']

In [45]:
import pandas as pd
import numpy as np

def aggregate_to_monthly(df_clean):
    # Aggregate to monthly level
    monthly_data = df_clean.groupby(['date_block_num', 'shop_id', 'item_id', 'item_category_id']).agg({
        'item_cnt_day': 'sum',
        'item_price': 'mean',
        'Return': 'sum'
    }).reset_index()
    
    # Rename columns
    monthly_data.rename(columns={'item_cnt_day': 'item_cnt_month'}, inplace=True)
    
    # Clip item_cnt_month
    monthly_data['item_cnt_month'] = monthly_data['item_cnt_month'].clip(0, 20).astype(np.float32)
    
    return monthly_data

# Assuming df_clean is loaded
monthly_data = aggregate_to_monthly(cleaned_data)

# Save for debugging
monthly_data.to_parquet("lstm_data/monthly_data.parquet", index=False)

print("Monthly data sample:")
print(monthly_data.head())
print(f"Monthly data shape: {monthly_data.shape}")

Monthly data sample:
   date_block_num shop_id item_id  item_category_id  item_cnt_month  \
0               0       0    1000                67             5.0   
1               0       0    1001                67             2.0   
2               0       0   10012                40             1.0   
3               0       0    1002                67             2.0   
4               0       0    1003                67             2.0   

   item_price  Return  
0        58.0     0.0  
1        58.0     0.0  
2        76.0     0.0  
3        58.0     0.0  
4        58.0     0.0  
Monthly data shape: (1601409, 7)


In [43]:

monthly_data.columns.tolist()

['date_block_num',
 'shop_id',
 'item_id',
 'item_cnt_month',
 'Return',
 'item_price']

In [57]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

def create_lag_features(df, group_cols, value_cols, lags=[1, 2, 3]):
    df = df.sort_values(group_cols + ['date_block_num']).copy()
    for col in value_cols:
        for lag in lags:
            lag_col = f"{col}_lag_{lag}"
            df[lag_col] = df.groupby(group_cols)[col].shift(lag)
    return df

def impute_nans_with_median_then_mean(df, cols):
    imputation_values = {}
    for col in cols:
        nan_count = df[col].isna().sum()
        if nan_count > 0:
            median_val = df[col].median()
            if median_val != 0:
                fill_val = median_val
                method = 'median'
            else:
                mean_val = df[col].mean()
                if mean_val != 0:
                    fill_val = mean_val
                    method = 'mean'
                else:
                    fill_val = 0
                    method = 'zero'
            df[col] = df[col].fillna(fill_val).astype(np.float32)
            imputation_values[col] = (fill_val, method)
            print(f"Imputed {nan_count:,} NaN values in '{col}' with {method} value {fill_val:.4f}")
        else:
            imputation_values[col] = (None, None)
    return df, imputation_values

# ---- APPLY FEATURE ENGINEERING ----
monthly_data = pd.read_parquet("lstm_data/monthly_data.parquet")
engineered_df = monthly_data.copy()

print(f"Starting feature engineering on monthly_data with {len(engineered_df):,} rows")

group_cols = ['shop_id', 'item_id']
value_cols = ['item_cnt_month']
lags = [1, 2, 3]

# Clip item_cnt_month
engineered_df['item_cnt_month'] = engineered_df['item_cnt_month'].clip(0, 20)

# Make Return binary
engineered_df['Return'] = (engineered_df['Return'] > 0).astype(np.int8)

# Log-transform item_price with tighter clipping
engineered_df['item_price'] = np.log1p(engineered_df['item_price'].clip(lower=0, upper=5000)).astype(np.float32)

# Create lag features
engineered_df = create_lag_features(engineered_df, group_cols, value_cols, lags)

# Add mean encodings
category_means = engineered_df.groupby(['item_category_id', 'date_block_num'])['item_cnt_month'].mean().reset_index()
category_means.rename(columns={'item_cnt_month': 'item_cnt_month_mean_category'}, inplace=True)
engineered_df = engineered_df.merge(category_means, on=['item_category_id', 'date_block_num'], how='left')

shop_means = engineered_df.groupby(['shop_id', 'date_block_num'])['item_cnt_month'].mean().reset_index()
shop_means.rename(columns={'item_cnt_month': 'item_cnt_month_mean_shop'}, inplace=True)
engineered_df = engineered_df.merge(shop_means, on=['shop_id', 'date_block_num'], how='left')

item_means = engineered_df.groupby(['item_id', 'date_block_num'])['item_cnt_month'].mean().reset_index()
item_means.rename(columns={'item_cnt_month': 'item_cnt_month_mean_item'}, inplace=True)
engineered_df = engineered_df.merge(item_means, on=['item_id', 'date_block_num'], how='left')

# Impute missing values
features_to_impute = [
    'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
    'item_cnt_month_mean_category', 'item_cnt_month_mean_shop', 'item_cnt_month_mean_item',
    'item_price'
]
engineered_df, imputation_info = impute_nans_with_median_then_mean(engineered_df, features_to_impute)

# Save
engineered_df.to_parquet("lstm_data/engineered_df.parquet", index=False)

# Verify item_price
print("\nItem_price statistics in engineered_df:")
print(engineered_df['item_price'].describe())

# Preview
print("\nSample of engineered DataFrame:")
print(engineered_df[['shop_id', 'item_id', 'date_block_num', 'item_category_id', 'item_cnt_month', 'Return', 'item_price'] + features_to_impute].head(10))
print(f"Engineered data shape: {engineered_df.shape}")

Starting feature engineering on monthly_data with 1,601,409 rows
Imputed 418,619 NaN values in 'item_cnt_month_lag_1' with median value 1.0000
Imputed 694,941 NaN values in 'item_cnt_month_lag_2' with median value 1.0000
Imputed 890,881 NaN values in 'item_cnt_month_lag_3' with median value 1.0000

Item_price statistics in engineered_df:
count    1.601409e+06
mean     6.066872e+00
std      9.899732e-01
min      8.617770e-02
25%      5.298317e+00
50%      5.991465e+00
75%      6.789816e+00
max      8.517393e+00
Name: item_price, dtype: float64

Sample of engineered DataFrame:
  shop_id item_id  date_block_num  item_category_id  item_cnt_month  Return  \
0       0    1000               0                67             5.0       0   
1       0    1000               1                67             4.0       0   
2       0   10004               1                40             1.0       0   
3       0    1001               0                67             2.0       0   
4       0   10012      

In [54]:
engineered_df.columns.tolist()

['date_block_num',
 'shop_id',
 'item_id',
 'item_category_id',
 'item_cnt_month',
 'item_price',
 'Return',
 'item_cnt_month_lag_1',
 'item_cnt_month_lag_2',
 'item_cnt_month_lag_3',
 'item_cnt_month_mean_category',
 'item_cnt_month_mean_shop',
 'item_cnt_month_mean_item']

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
import joblib

# ---- COPY AND PREP ----
scaled_df = engineered_df.copy()

# Ensure correct data types
scaled_df['shop_id'] = scaled_df['shop_id'].astype('int16')
scaled_df['item_id'] = scaled_df['item_id'].astype('int32')
scaled_df['date_block_num'] = scaled_df['date_block_num'].astype('int8')
scaled_df['item_category_id'] = scaled_df['item_category_id'].astype('int32')
scaled_df['Return'] = scaled_df['Return'].astype('int8')

# ---- DEFINE COLUMNS ----
numerical_cols = [
    'item_cnt_month',
    'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
    'item_cnt_month_mean_shop', 'item_cnt_month_mean_item', 'item_cnt_month_mean_category'
]
price_col = ['item_price']
categorical_cols = ['shop_id', 'item_id', 'item_category_id']

# Verify columns exist
missing_numerical = [col for col in numerical_cols + price_col if col not in scaled_df.columns]
missing_categorical = [col for col in categorical_cols if col not in scaled_df.columns]
if missing_numerical:
    print(f"Warning: Missing numerical columns: {missing_numerical}")
if missing_categorical:
    print(f"Warning: Missing categorical columns: {missing_categorical}")

# ---- CHRONOLOGICAL SPLIT FOR SCALING/ENCODING ----
train_df = scaled_df[scaled_df['date_block_num'] <= 26].copy()
val_df = scaled_df[(scaled_df['date_block_num'] > 26) & (scaled_df['date_block_num'] <= 29)].copy()
test_df = scaled_df[scaled_df['date_block_num'] > 29].copy()

# ---- APPLY STANDARD SCALING TO NUMERICAL COLUMNS ----
scaler = StandardScaler()
scaler.fit(train_df[numerical_cols])  # Fit only on training data
train_df[numerical_cols] = scaler.transform(train_df[numerical_cols]).astype(np.float32)
val_df[numerical_cols] = scaler.transform(val_df[numerical_cols]).astype(np.float32)
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols]).astype(np.float32)

# Save scaler
joblib.dump(scaler, 'lstm_data/scaler.joblib')

# ---- APPLY ROBUST SCALING TO item_price ----
price_scaler = RobustScaler()
price_scaler.fit(train_df[price_col])  # Fit only on training data
train_df['item_price'] = price_scaler.transform(train_df[price_col]).astype(np.float32)
val_df['item_price'] = price_scaler.transform(val_df[price_col]).astype(np.float32)
test_df['item_price'] = price_scaler.transform(test_df[price_col]).astype(np.float32)

# Save price scaler
joblib.dump(price_scaler, 'lstm_data/price_scaler.joblib')

# Mean encoding for shop_id, item_id, item_category_id
for col in categorical_cols:
    mean_encoded = train_df.groupby(col)['item_cnt_month'].mean().to_dict()
    train_df[f'{col}_mean_encode'] = train_df[col].map(mean_encoded).astype(np.float32)
    val_df[f'{col}_mean_encode'] = val_df[col].map(mean_encoded).fillna(train_df['item_cnt_month'].mean()).astype(np.float32)
    test_df[f'{col}_mean_encode'] = test_df[col].map(mean_encoded).fillna(train_df['item_cnt_month'].mean()).astype(np.float32)


scaled_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

# ---- SORT BY date_block_num, shop_id, item_id ----
scaled_df = scaled_df.sort_values(['date_block_num', 'shop_id', 'item_id']).reset_index(drop=True)

# ---- SAVE OUTPUT ----
scaled_df.to_parquet("lstm_data/scaled_df.parquet", index=False)

# ---- VERIFICATION ----
print("Return value counts:")
print(scaled_df['Return'].value_counts())
print("\nDate_block_num value counts (first 10):")
print(scaled_df['date_block_num'].value_counts().sort_index().head(10))
print("\nItem_price statistics in scaled_df:")
print(scaled_df['item_price'].describe())

# ---- PREVIEW OUTPUT ----
print("\nSample of scaled_df after scaling and encoding:")
preview_cols = ['shop_id', 'item_id', 'date_block_num', 'item_category_id', 'item_cnt_month', 'Return', 'item_price', 'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3', 'item_cnt_month_mean_shop', 'item_cnt_month_mean_item', 'item_cnt_month_mean_category', 'shop_id_mean_encode', 'item_id_mean_encode', 'item_category_id_mean_encode']
print(scaled_df[preview_cols].head(10))

print("\nData shape:")
print(f"scaled_df: {scaled_df.shape}")

Return value counts:
Return
0    1594181
1       7228
Name: count, dtype: int64

Date_block_num value counts (first 10):
date_block_num
0    62238
1    59132
2    63302
3    54637
4    53296
5    56196
6    58035
7    58022
8    51575
9    50463
Name: count, dtype: int64

Item_price statistics in scaled_df:
count    1.601409e+06
mean     1.507174e-01
std      7.141147e-01
min     -4.163441e+00
25%     -4.036773e-01
50%      9.632267e-02
75%      6.722114e-01
max      1.918395e+00
Name: item_price, dtype: float64

Sample of scaled_df after scaling and encoding:
   shop_id  item_id  date_block_num  item_category_id  item_cnt_month  Return  \
0        0       32               0                40        1.531888       0   
1        0       33               0                37        0.371638       0   
2        0       35               0                40       -0.401862       0   
3        0       43               0                40       -0.401862       0   
4        0       51         

In [63]:
scaled_df.columns.tolist()

['date_block_num',
 'shop_id',
 'item_id',
 'item_category_id',
 'item_cnt_month',
 'item_price',
 'Return',
 'item_cnt_month_lag_1',
 'item_cnt_month_lag_2',
 'item_cnt_month_lag_3',
 'item_cnt_month_mean_category',
 'item_cnt_month_mean_shop',
 'item_cnt_month_mean_item',
 'shop_id_mean_encode',
 'item_id_mean_encode',
 'item_category_id_mean_encode']

In [None]:
import numpy as np
import pandas as pd
import os
import logging
from pathlib import Path

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

SEQUENCE_LENGTH = 3
FEATURE_COLS = [
    'item_cnt_month', 'item_price', 'Return',
    'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
    'item_cnt_month_mean_category', 'item_cnt_month_mean_shop', 'item_cnt_month_mean_item',
    'shop_id_mean_encode', 'item_id_mean_encode', 'item_category_id_mean_encode'
]
TARGET_COL = 'item_cnt_month'
DATE_COL = 'date_block_num'
GROUP_COLS = ['shop_id', 'item_id']
SAVE_DIR = "/Users/mohammednihal/XAI-1/training_data"

# Create output directory
try:
    os.makedirs(SAVE_DIR, exist_ok=True)
    logger.info(f"Created/Initialized directory: {SAVE_DIR}")
except Exception as e:
    logger.error(f"Failed to create directory {SAVE_DIR}: {e}")
    raise


try:
    original_df = scaled_df.copy()
    logger.info(f"Original data shape: {original_df.shape[0]}")
except NameError:
    logger.error("Error: scaled_df is not defined. Please ensure it is loaded.")
    raise


def prepare_data(df):
    logger.info("Preparing data...")
    df = df.sort_values([*GROUP_COLS, DATE_COL]).copy()
    min_required = SEQUENCE_LENGTH + 1
    item_counts = df.groupby(GROUP_COLS)[DATE_COL].count()
    valid_items = item_counts[item_counts >= min_required].index
    filtered_df = df[df.set_index(GROUP_COLS).index.isin(valid_items)]
    logger.info(f"Prepared data shape: {filtered_df.shape[0]:,}, valid groups: {len(valid_items):,}")
    return filtered_df, len(valid_items)

prepared_df, num_valid_groups = prepare_data(original_df)

def create_sequences(df):
    logger.info("Creating sequences...")

    missing_cols = [col for col in FEATURE_COLS if col not in df.columns]
    if missing_cols:
        logger.error(f"Missing columns in dataframe: {missing_cols}")
        raise KeyError(f"Missing required columns: {missing_cols}")
    
    sequences, targets, target_dates = [], [], []
    grouped = df.groupby(GROUP_COLS)

    for _, group in grouped:
        group_data = group[FEATURE_COLS].values
        group_target = group[TARGET_COL].values
        group_dates = group[DATE_COL].values
        
        for i in range(len(group) - SEQUENCE_LENGTH):
            sequences.append(group_data[i:i+SEQUENCE_LENGTH])
            targets.append(group_target[i+SEQUENCE_LENGTH])
            target_dates.append(group_dates[i+SEQUENCE_LENGTH])

    if not sequences:
        logger.warning("No sequences created.")
        return np.empty((0, SEQUENCE_LENGTH, len(FEATURE_COLS))), np.array([]), np.array([])

    logger.info(f"Created {len(sequences):,} sequences")
    return np.array(sequences, dtype=np.float32), np.array(targets, dtype=np.float32), np.array(target_dates, dtype=np.int32)

all_sequences, all_targets, target_dates = create_sequences(prepared_df)

# -----------------------------
# 4. TIME-BASED SPLITTING
# -----------------------------
def time_based_split(sequences, targets, target_dates, train_end=26, val_end=29):
    logger.info("Splitting data...")
    train_idx = target_dates <= train_end
    val_idx = (target_dates > train_end) & (target_dates <= val_end)
    test_idx = target_dates > val_end

    logger.info(f"Train: {train_idx.sum():,}, Val: {val_idx.sum():,}, Test: {test_idx.sum():,}")
    return (
        (sequences[train_idx], targets[train_idx]),
        (sequences[val_idx], targets[val_idx]),
        (sequences[test_idx], targets[test_idx])
    )

(X_train, y_train), (X_val, y_val), (X_test, y_test) = time_based_split(
    all_sequences, all_targets, target_dates
)


def save_sequences(X, y, prefix):
    if len(X) == 0:
        logger.warning(f"{prefix} set is empty")
        return
    
    X_flat = X.reshape(X.shape[0], -1)
    feature_names = [f"{feat}_t{t}" for t in range(SEQUENCE_LENGTH) for feat in FEATURE_COLS]
    
    try:
        pd.DataFrame(X_flat, columns=feature_names).to_parquet(
            os.path.join(SAVE_DIR, f"{prefix}_X.parquet"), index=False
        )
        pd.DataFrame(y, columns=[TARGET_COL]).to_parquet(
            os.path.join(SAVE_DIR, f"{prefix}_y.parquet"), index=False
        )
        logger.info(f"Saved {prefix} sequences to {SAVE_DIR}/{prefix}_[X|y].parquet")
    except Exception as e:
        logger.error(f"Failed to save {prefix} sequences: {e}")
        raise

save_sequences(X_train, y_train, 'train')
save_sequences(X_val, y_val, 'val')
save_sequences(X_test, y_test, 'test')

logger.info("\n=== Data Shapes ===")
logger.info(f"Original: {original_df.shape}")
logger.info(f"Prepared: {prepared_df.shape}")
logger.info(f"Train: {X_train.shape}, y: {y_train.shape}")
logger.info(f"Val: {X_val.shape}, y: {y_val.shape}")
logger.info(f"Test: {X_test.shape}, y: {y_test.shape}")

logger.info("\nDate ranges:")
logger.info(f"Train: {target_dates[target_dates <= 26].min()} to {target_dates[target_dates <= 26].max()}")
logger.info(f"Val: {target_dates[(target_dates > 26) & (target_dates <= 29)].min()} to {target_dates[(target_dates > 26) & (target_dates <= 29)].max()}")
logger.info(f"Test: {target_dates[target_dates > 29].min()} to {target_dates[target_dates > 29].max()}")

logger.info("\nSaved feature columns:")
logger.info([f"{feat}_t{t}" for t in range(SEQUENCE_LENGTH) for feat in FEATURE_COLS])

for prefix in ['train', 'val', 'test']:
    x_path = Path(SAVE_DIR) / f"{prefix}_X.parquet"
    y_path = Path(SAVE_DIR) / f"{prefix}_y.parquet"
    if x_path.exists() and y_path.exists():
        logger.info(f"✓ Verified: {x_path.name} and {y_path.name} exist")
    else:
        logger.error(f"✗ Missing: {x_path.name} or {y_path.name}")


2025-06-02 09:03:39,824 - INFO - Created/Initialized directory: /Users/mohammednihal/XAI-1/training_data
2025-06-02 09:03:39,866 - INFO - Original data shape: 1601409
2025-06-02 09:03:39,872 - INFO - Preparing data...
2025-06-02 09:03:40,176 - INFO - Prepared data shape: 1,146,719, valid groups: 145,397
2025-06-02 09:03:40,179 - INFO - Creating sequences...
2025-06-02 09:04:06,171 - INFO - Created 710,528 sequences
2025-06-02 09:04:06,547 - INFO - Splitting data...
2025-06-02 09:04:06,549 - INFO - Train: 573,640, Val: 59,474, Test: 77,414
2025-06-02 09:04:07,085 - INFO - Saved train sequences to /Users/mohammednihal/XAI-1/training_data/train_[X|y].parquet
2025-06-02 09:04:07,140 - INFO - Saved val sequences to /Users/mohammednihal/XAI-1/training_data/val_[X|y].parquet
2025-06-02 09:04:07,197 - INFO - Saved test sequences to /Users/mohammednihal/XAI-1/training_data/test_[X|y].parquet
2025-06-02 09:04:07,197 - INFO - 
=== Data Shapes ===
2025-06-02 09:04:07,198 - INFO - Original: (160140

In [65]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import polars as pl
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import sys
import traceback
from datetime import datetime, timedelta

# Setup logging
log_dir = Path('logs')
log_dir.mkdir(exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_dir / f'run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
logger.info(f"Using device: {device}")

class SalesDataset(Dataset):
    def __init__(self, X_file, y_file, sequence_length=3):
        if not Path(X_file).exists() or not Path(y_file).exists():
            raise FileNotFoundError(f"Data files not found: {X_file}, {y_file}")
        
        self.X_file = X_file
        self.y_file = y_file
        
        # Load data
        self.X = pl.read_parquet(X_file)
        feature_cols = [
            'item_cnt_month', 'item_price', 'Return',
            'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
            'item_cnt_month_mean_shop', 'item_cnt_month_mean_item', 'item_cnt_month_mean_category',
            'shop_id_mean_encode', 'item_id_mean_encode', 'item_category_id_mean_encode'
        ]
        input_cols = [f"{feat}_t{t}" for t in range(sequence_length) for feat in feature_cols]
        
        # Verify columns
        available_cols = self.X.columns
        logger.info(f"Columns in {X_file}: {available_cols}")
        missing_cols = [col for col in input_cols if col not in available_cols]
        if missing_cols:
            raise ValueError(f"Missing columns in {X_file}: {missing_cols}")
        
        self.y = pl.read_parquet(y_file).to_numpy().astype(np.float32).reshape(-1, 1)
        self.indices = np.arange(len(self.X))
        
        logger.info(f"Dataset size: {len(self.indices)}, X shape: {self.X.shape}, y shape: {self.y.shape}")
        self.dates = [datetime(2013, 1, 1) + timedelta(days=30 * (int(idx) % 120)) for idx in self.indices]
        if len(self.X) != len(self.y):
            raise ValueError(f"X and y length mismatch: {len(self.X)} vs {len(self.y)}")
        
        self.sequence_length = sequence_length
        self.numerical_cols = input_cols
        
        numerical_data = self.X.select(self.numerical_cols).to_numpy().astype(np.float32)
        numerical_data = np.nan_to_num(numerical_data, nan=0.0)
        self.y = np.nan_to_num(self.y, nan=0.0)
        
        numerical_data = numerical_data.reshape(len(self.X), sequence_length, len(feature_cols))
        self.numerical = numerical_data
        
        self.identifiers = self.indices
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        numerical = torch.tensor(self.numerical[idx], dtype=torch.float32)  # Shape: (3, 12)
        target = torch.tensor(self.y[idx], dtype=torch.float32)  # Shape: (1,)
        identifiers = torch.tensor([self.identifiers[idx]], dtype=torch.int32)
        dates = self.dates[idx]
        return {
            'numerical': numerical, 'target': target, 'identifiers': identifiers, 'dates': dates
        }

class HALSTM(nn.Module):
    def __init__(self, numerical_dim=12, hidden_dim=128, num_layers=2, num_heads=4, dropout=0.3, l2_lambda=0.01):
        super(HALSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.l2_lambda = l2_lambda
        
        self.input_dim = numerical_dim
        self.lstm = nn.LSTM(self.input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.lstm_norm = nn.LayerNorm(hidden_dim)
        self.mha = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)
        self.mha_norm = nn.LayerNorm(hidden_dim)
        self.gate = nn.Linear(hidden_dim * 2, hidden_dim)
        self.sigmoid = nn.Sigmoid()
        self.fc_shared = nn.Linear(hidden_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        
        self.positional_encoding = torch.zeros(3, hidden_dim).to(device)
        position = torch.arange(0, 3, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / hidden_dim))
        self.positional_encoding[:, 0::2] = torch.sin(position * div_term)
        self.positional_encoding[:, 1::2] = torch.cos(position * div_term)
        
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LSTM):
                for name, param in m.named_parameters():
                    if 'weight' in name:
                        nn.init.xavier_normal_(param)
                    elif 'bias' in name:
                        nn.init.constant_(param, 0)
    
    def forward(self, numerical):
        batch_size, seq_len, _ = numerical.size()
        x = self.dropout(numerical)
        
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=x.device)
        lstm_out, _ = self.lstm(x, (h0, c0))
        lstm_out = self.lstm_norm(lstm_out)
        
        lstm_out = lstm_out + self.positional_encoding[:seq_len, :].unsqueeze(0)
        
        mha_out, mha_weights = self.mha(lstm_out, lstm_out, lstm_out)
        mha_out = self.mha_norm(mha_out)
        
        combined = torch.cat([lstm_out[:, -1, :], mha_out[:, -1, :]], dim=-1)
        gate = self.sigmoid(self.gate(combined))
        fused = gate * lstm_out[:, -1, :] + (1 - gate) * mha_out[:, -1, :]
        
        shared = self.relu(self.fc_shared(fused))
        output = self.fc_out(shared)
        
        return output, {'mha_weights': mha_weights, 'gate_weights': gate}

def collate_fn(batch):
    if not batch:
        return {}
    return {
        'numerical': torch.stack([item['numerical'] for item in batch]),
        'target': torch.stack([item['target'] for item in batch]),
        'identifiers': torch.stack([item['identifiers'] for item in batch]),
        'dates': [item['dates'] for item in batch]
    }

def train_model(model, train_loader, val_loader, num_epochs=15, lr=0.0005, accum_steps=2):
    criterion_mse = nn.MSELoss().to(device)
    criterion_mae = nn.L1Loss().to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=model.l2_lambda)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr, epochs=num_epochs, steps_per_epoch=len(train_loader)//accum_steps)
    scaler = torch.cuda.amp.GradScaler() if device.type in ['cuda', 'mps'] else None
    
    metrics = {'epoch': [], 'train_mse': [], 'train_rmse': [], 'train_mae': [], 'val_mse': [], 'val_rmse': [], 'val_mae': []}
    best_val_loss = float('inf')
    output_dir = Path('results')
    output_dir.mkdir(exist_ok=True)
    patience = 5
    epochs_no_improve = 0
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_mse = 0
        train_mae = 0
        optimizer.zero_grad()
        
        for batch_idx, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}")):
            numerical = batch['numerical'].to(device)
            target = batch['target'].to(device)
            
            if scaler:
                with torch.cuda.amp.autocast():
                    output, _ = model(numerical)
                    mse_loss = criterion_mse(output, target) / accum_steps
                    mae_loss = criterion_mae(output, target) / accum_steps
                    loss = mse_loss
                scaler.scale(loss).backward()
                if (batch_idx + 1) % accum_steps == 0:
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
                    scaler.step(optimizer)
                    scheduler.step()
                    scaler.update()
                    optimizer.zero_grad()
            else:
                output, _ = model(numerical)
                mse_loss = criterion_mse(output, target) / accum_steps
                mae_loss = criterion_mae(output, target) / accum_steps
                loss = mse_loss
                loss.backward()
                if (batch_idx + 1) % accum_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
            
            train_loss += mse_loss.item() * accum_steps
            train_mse += mse_loss.item() * accum_steps
            train_mae += mae_loss.item() * accum_steps
        
        train_loss /= len(train_loader)
        train_mse /= len(train_loader)
        train_rmse = np.sqrt(train_mse)
        train_mae /= len(train_loader)
        
        model.eval()
        val_loss = 0
        val_mse = 0
        val_mae = 0
        with torch.no_grad():
            for batch in val_loader:
                numerical = batch['numerical'].to(device)
                target = batch['target'].to(device)
                output, _ = model(numerical)
                mse_loss = criterion_mse(output, target)
                mae_loss = criterion_mae(output, target)
                val_loss += mse_loss.item()
                val_mse += mse_loss.item()
                val_mae += mae_loss.item()
        
        val_loss /= len(val_loader)
        val_mse /= len(val_loader)
        val_rmse = np.sqrt(val_mse)
        val_mae /= len(val_loader)
        
        metrics['epoch'].append(epoch + 1)
        metrics['train_mse'].append(train_mse)
        metrics['train_rmse'].append(train_rmse)
        metrics['train_mae'].append(train_mae)
        metrics['val_mse'].append(val_mse)
        metrics['val_rmse'].append(val_rmse)
        metrics['val_mae'].append(val_mae)
        logger.info(f"Epoch {epoch+1}, Train MSE: {train_mse:.4f}, RMSE: {train_rmse:.4f}, MAE: {train_mae:.4f}, "
                    f"Val MSE: {val_mse:.4f}, RMSE: {val_rmse:.4f}, MAE: {val_mae:.4f}")
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), output_dir / 'best_ha_lstm.pth')
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                logger.info(f"Early stopping triggered after {epoch+1} epochs")
                break
    
    metrics_df = pd.DataFrame(metrics)
    metrics_df.to_csv(output_dir / 'training_metrics.csv', index=False)
    
    plt.figure(figsize=(10, 6))
    plt.subplot(1, 2, 1)
    plt.plot(metrics['epoch'], metrics['train_rmse'], label='Train RMSE')
    plt.plot(metrics['epoch'], metrics['val_rmse'], label='Val RMSE')
    plt.xlabel('Epoch')
    plt.ylabel('RMSE')
    plt.title('Training and Validation RMSE')
    plt.legend()
    
    plt.subplot(1, 2, 2)
    plt.plot(metrics['epoch'], metrics['train_mae'], label='Train MAE')
    plt.plot(metrics['epoch'], metrics['val_mae'], label='Val MAE')
    plt.xlabel('Epoch')
    plt.ylabel('MAE')
    plt.title('Training and Validation MAE')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig(output_dir / 'metrics_plot.png')
    plt.close()
    
    model.load_state_dict(torch.load(output_dir / 'best_ha_lstm.pth'))
    return model, metrics_df

def predict(model, test_loader, dataset):
    model.eval()
    predictions = []
    interpret_outputs = []
    modalities = ['numerical']
    
    criterion_mse = nn.MSELoss()
    criterion_mae = nn.L1Loss()
    test_mse = 0
    test_mae = 0
    total_samples = 0
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(test_loader, desc="Predicting")):
            numerical = batch['numerical'].to(device)
            target = batch['target'].to(device)
            identifiers = batch['identifiers']
            dates = batch['dates']
            
            output, attn_dict = model(numerical)
            preds = output.cpu().numpy()
            
            mse_loss = criterion_mse(output, target)
            mae_loss = criterion_mae(output, target)
            batch_size = output.size(0)
            test_mse += mse_loss.item() * batch_size
            test_mae += mae_loss.item() * batch_size
            total_samples += batch_size
            
            mha_weights = attn_dict['mha_weights'][:, -1, :].cpu().numpy()
            gate_weights = attn_dict['gate_weights'].cpu().numpy()
            
            for i in range(len(preds)):
                pred_dict = {
                    'index': identifiers[i][0].item(),
                    'item_cnt_month': preds[i][0]
                }
                predictions.append(pred_dict)
                
                interpret_outputs.append({
                    'timestamp_reference': dates[i].isoformat() if isinstance(dates[i], datetime) else str(dates[i]),
                    'forecasted_value': preds[i].tolist(),
                    'fusion_weights': gate_weights[i].tolist(),
                    'attention_weights': mha_weights[i].tolist(),
                    'input_sequence_dates': [dates[i].isoformat()] if isinstance(dates[i], datetime) else [str(dates[i])],
                    'modalities_used': modalities,
                    'gating_decision_output': gate_weights[i].tolist()
                })
    
    test_mse /= total_samples
    test_rmse = np.sqrt(test_mse)
    test_mae /= total_samples
    
    pred_df = pd.DataFrame(predictions)
    interpret_df = pd.DataFrame(interpret_outputs)
    
    output_dir = Path('results')
    output_dir.mkdir(exist_ok=True)
    pred_df.to_csv(output_dir / 'predictions.csv', index=False)
    interpret_df.to_csv(output_dir / 'interpretability_outputs.csv', index=False)
    
    logger.info(f"Test MSE: {test_mse:.4f}, RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}")
    
    return pred_df, {'mse': test_mse, 'rmse': test_rmse, 'mae': test_mae}, interpret_df

def visualize_results(pred_df, y_test):
    output_dir = Path('results')
    output_dir.mkdir(exist_ok=True)
    
    if len(pred_df) != len(y_test):
        logger.error(f"Size mismatch: pred_df ({len(pred_df)}) vs y_test ({len(y_test)})")
        min_len = min(len(pred_df), len(y_test))
        pred_df = pred_df.iloc[:min_len]
        y_test = y_test[:min_len]
    
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    sns.kdeplot(pred_df['item_cnt_month'], label='Predicted item_cnt_month')
    sns.kdeplot(y_test, label='Actual item_cnt_month')
    plt.title('Prediction vs. Actual Distribution')
    plt.xlabel('Item Count')
    plt.ylabel('Density')
    plt.legend()
    
    plt.subplot(1, 2, 2)
    plt.scatter(y_test, pred_df['item_cnt_month'], alpha=0.5)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    plt.xlabel('Actual item_cnt_month')
    plt.ylabel('Predicted item_cnt_month')
    plt.title('Actual vs. Predicted Sales')
    
    plt.tight_layout()
    plt.savefig(output_dir / 'prediction_visualization.png')
    plt.close()

def main():
    data_dir = Path('/Users/mohammednihal/XAI-1/lstm_data')
    train_X_path = data_dir / '/Users/mohammednihal/XAI-1/training_data/train_X.parquet'
    train_y_path = data_dir / '/Users/mohammednihal/XAI-1/training_data/train_y.parquet'
    val_X_path = data_dir / '/Users/mohammednihal/XAI-1/training_data/val_X.parquet'
    val_y_path = data_dir / '/Users/mohammednihal/XAI-1/training_data/val_y.parquet'
    test_X_path = data_dir / '/Users/mohammednihal/XAI-1/training_data/test_X.parquet'
    test_y_path = data_dir / '/Users/mohammednihal/XAI-1/training_data/test_y.parquet'
    
    batch_size = 128
    num_workers = 0
    num_epochs = 15
    lr = 0.0005
    accum_steps = 2
    
    logger.info("Loading sample of test data for verification...")
    test_sample = pl.read_parquet(test_X_path).head(5)
    logger.info(f"Test data sample:\n{test_sample}")
    logger.info(f"Test data shape: {pl.read_parquet(test_X_path).shape}")
    logger.info(f"Test y shape: {pl.read_parquet(test_y_path).shape}")
    
    train_dataset = SalesDataset(train_X_path, train_y_path, sequence_length=3)
    val_dataset = SalesDataset(val_X_path, val_y_path, sequence_length=3)
    test_dataset = SalesDataset(test_X_path, test_y_path, sequence_length=3)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, collate_fn=collate_fn)
    
    model = HALSTM(numerical_dim=12, hidden_dim=128, num_layers=2, num_heads=4, dropout=0.3, l2_lambda=0.01).to(device)
    
    model, metrics_df = train_model(model, train_loader, val_loader, num_epochs, lr, accum_steps)
    
    pred_df, test_metrics, interpret_df = predict(model, test_loader, test_dataset)
    
    y_test = pl.read_parquet(test_y_path).to_pandas()['item_cnt_month'].values
    visualize_results(pred_df, y_test)

if __name__ == "__main__":
    try:
        logger.info("Initiating program execution")
        main()
        logger.info("Program executed successfully")
    except Exception as e:
        logger.error(f"Program execution failed: {str(e)}")
        logger.error(traceback.format_exc())
        sys.exit(1)

2025-06-02 09:05:24,715 - INFO - Using device: mps
2025-06-02 09:05:24,723 - INFO - Initiating program execution
2025-06-02 09:05:24,725 - INFO - Loading sample of test data for verification...
2025-06-02 09:05:24,855 - INFO - Test data sample:
shape: (5, 36)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ item_cnt_ ┆ item_pric ┆ Return_t0 ┆ item_cnt_ ┆ … ┆ item_cnt_ ┆ shop_id_m ┆ item_id_m ┆ item_cat │
│ month_t0  ┆ e_t0      ┆ ---       ┆ month_lag ┆   ┆ month_mea ┆ ean_encod ┆ ean_encod ┆ egory_id │
│ ---       ┆ ---       ┆ f32       ┆ _1_t0     ┆   ┆ n_item_t2 ┆ e_t2      ┆ e_t2      ┆ _mean_en │
│ f32       ┆ f32       ┆           ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ code_t…  │
│           ┆           ┆           ┆ f32       ┆   ┆ f32       ┆ f32       ┆ f32       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ f32      │
╞═══════════╪═══════════╪════════