In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
import torch 
import torch.nn as nn
import warnings
warnings.filterwarnings("ignore")

data_path = pd.read_csv("/Users/mohammednihal/XAI-1/Predict Future Sales/merged_data.csv")
df = data_path

print("First five rows of the data:")
print(df.head())
print("\nData Shape: ")
print(df.shape)


First five rows of the data:
         date  date_block_num  shop_id  item_id  item_price  item_cnt_day  \
0  02.01.2013               0       59    22154      999.00           1.0   
1  03.01.2013               0       25     2552      899.00           1.0   
2  05.01.2013               0       25     2552      899.00          -1.0   
3  06.01.2013               0       25     2554     1709.05           1.0   
4  15.01.2013               0       25     2555     1099.00           1.0   

                                  item_name  item_category_id  \
0                    ANNOUNCEMENT 2012 (BD)                37   
1  DEEP PURPLE  The House Of Blue Light  LP                58   
2  DEEP PURPLE  The House Of Blue Light  LP                58   
3  DEEP PURPLE  Who Do You Think We Are  LP                58   
4            DEEP PURPE 30 Very Best Of 2CD                56   

                 item_category_name            shop_name  
0                 Cinema - Blue-Ray  Jaroslavl TC Altair  

In [106]:
df.nunique()

date                   1034
date_block_num           34
shop_id                  60
item_id               21807
item_price            19993
item_cnt_day            198
item_name             21233
item_category_id         84
item_category_name       83
shop_name                60
dtype: int64

In [109]:
def data_preprocessing(df):
    expected_columns = ['date', 'date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_day',
                       'item_name', 'item_category_id', 'item_category_name', 'shop_name']
    if not all(col in df.columns for col in expected_columns):
        missing = set(expected_columns) - set(df.columns)
        raise ValueError(f"Missing columns: {missing}")
    
    print(f"Preprocessing dataset with {len(df):,} records")
    
    cleaned_df = df.copy()
    # Initial checks
    print("Initial unique counts:")
    print(f"item_id: {cleaned_df['item_id'].nunique()}")
    print(f"item_name: {cleaned_df['item_name'].nunique()}")
    print("item_name nulls before imputation:", cleaned_df['item_name'].isna().sum())
    print("item_id with 'Unknown' item_name:", (cleaned_df['item_name'] == 'Unknown').sum())
    print("Unique item_name per item_id:")
    name_counts = cleaned_df.groupby('item_id')['item_name'].nunique()
    print(name_counts.value_counts())
    if (name_counts > 1).any():
        print("item_id with multiple item_name:", name_counts[name_counts > 1].index.tolist())

    cleaned_df['item_price'] = cleaned_df['item_price'].astype(np.float32)
    cleaned_df['item_cnt_day'] = cleaned_df['item_cnt_day'].astype(np.float32)
    
    print("Initial null counts:")
    for col in cleaned_df.columns:
        nulls = cleaned_df[col].isna().sum()
        print(f"Column '{col}': {nulls:,} nulls ({nulls/len(cleaned_df)*100:.2f}%)")
    
    cleaned_df['Return'] = cleaned_df['item_cnt_day'].where(cleaned_df['item_cnt_day'] < 0, 0).abs().astype(np.float32)
    cleaned_df['item_cnt_day'] = cleaned_df['item_cnt_day'].clip(lower=0)
    print(f"Created Return column. Total Returns: {cleaned_df['Return'].sum():,}")
    
    cleaned_df['date'] = pd.to_datetime(cleaned_df['date'], format='%d.%m.%Y')
    print("Converted date column to datetime format")
    
    # Handle item_name
    if 'item_name' in cleaned_df.columns and cleaned_df['item_name'].isna().any():
        item_name_nulls = cleaned_df['item_name'].isna().sum()
        cleaned_df['item_name'] = cleaned_df['item_name'].fillna('Unknown')
        print(f"Imputed {item_name_nulls:,} missing item names with 'Unknown'")
    
    # Ensure string types
    cleaned_df['shop_id'] = cleaned_df['shop_id'].astype(str)
    cleaned_df['item_id'] = cleaned_df['item_id'].astype(str)
    cleaned_df['item_name'] = cleaned_df['item_name'].astype(str)
    
    # Fix multiple item_name per item_id
    name_counts = cleaned_df.groupby('item_id')['item_name'].nunique()
    if (name_counts > 1).any():
        print(f"Warning: {name_counts[name_counts > 1].count()} item_id(s) have multiple item_name values. Taking most frequent.")
        most_frequent = cleaned_df.groupby('item_id')['item_name'].agg(lambda x: x.mode()[0]).reset_index()
        cleaned_df = cleaned_df.drop(columns='item_name').merge(most_frequent, on='item_id', how='left')
    
    shop_stats = cleaned_df.groupby('shop_id').size().reset_index(name='count')
    shops_to_remove = shop_stats.nsmallest(6, 'count')['shop_id'].tolist()
    remove_records = cleaned_df[cleaned_df['shop_id'].isin(shops_to_remove)].shape[0]
    cleaned_df = cleaned_df[~cleaned_df['shop_id'].isin(shops_to_remove)]
    print(f"Selected {cleaned_df['shop_id'].nunique()} shops after removing {shops_to_remove}")
    print(f"Removed {remove_records:,} records from shop selection")
    
    cleaned_df = cleaned_df.sort_values(['shop_id', 'item_id', 'date'])
    
    def winsorize_with_rolling_stats(group):
        group = group.set_index('date').sort_index()
        rolling_mean = group['item_cnt_day'].rolling(window='30D', min_periods=1).mean()
        rolling_std = group['item_cnt_day'].rolling(window='30D', min_periods=1).std()
        upper_3sigma = rolling_mean + 3 * rolling_std
        
        outliers = group['item_cnt_day'] > upper_3sigma
        outlier_count = outliers.sum()
        
        winsor_limit = group['item_cnt_day'].quantile(0.99)
        clipped = group['item_cnt_day'].clip(upper=winsor_limit)
        clipped_count = (group['item_cnt_day'] > winsor_limit).sum()
        
        return clipped.reset_index(drop=True), outlier_count, clipped_count
        
    total_records = len(cleaned_df)
    item_cnt_day_winsorized = []
    total_outlier = 0
    total_clipped = 0
    
    for (shop_id, item_id), group in cleaned_df.groupby(['shop_id', 'item_id']):
        clipped_series, outliers, clipped = winsorize_with_rolling_stats(group.copy())
        item_cnt_day_winsorized.extend(clipped_series)
        total_outlier += outliers
        total_clipped += clipped
        
    cleaned_df['item_cnt_day'] = item_cnt_day_winsorized
    
    outlier_percentage = (total_outlier / total_records * 100) if total_records > 0 else 0
    clipped_percentage = (total_clipped / total_records * 100) if total_records > 0 else 0

    print("Winsorization applied at 99th percentile for item_cnt_day after 3σ rolling-window detection")
    print(f"Total records: {total_records:,}")
    print(f"Outliers (above 3σ): {total_outlier:,} ({outlier_percentage:.2f}%)")
    print(f"Values clipped (>99th percentile): {total_clipped:,} ({clipped_percentage:.2f}%)")
    
    # Final checks
    print("Final unique counts:")
    print(f"item_id: {cleaned_df['item_id'].nunique()}")
    print(f"item_name: {cleaned_df['item_name'].nunique()}")
    
    return cleaned_df

cleaned_data = data_preprocessing(df)

Preprocessing dataset with 2,935,849 records
Initial unique counts:
item_id: 21807
item_name: 21233
item_name nulls before imputation: 84
item_id with 'Unknown' item_name: 0
Unique item_name per item_id:
item_name
1    21806
0        1
Name: count, dtype: int64
Initial null counts:
Column 'date': 0 nulls (0.00%)
Column 'date_block_num': 0 nulls (0.00%)
Column 'shop_id': 0 nulls (0.00%)
Column 'item_id': 0 nulls (0.00%)
Column 'item_price': 0 nulls (0.00%)
Column 'item_cnt_day': 0 nulls (0.00%)
Column 'item_name': 84 nulls (0.00%)
Column 'item_category_id': 0 nulls (0.00%)
Column 'item_category_name': 0 nulls (0.00%)
Column 'shop_name': 0 nulls (0.00%)
Created Return column. Total Returns: 7,541.0
Converted date column to datetime format
Imputed 84 missing item names with 'Unknown'
Selected 54 shops after removing ['36', '11', '20', '8', '9', '40']
Removed 14,017 records from shop selection
Winsorization applied at 99th percentile for item_cnt_day after 3σ rolling-window detection
Total

In [111]:
cleaned_data['item_name'].nunique()

21027

In [112]:
def create_sales_grid(cleaned_df, min_months_item=3, min_months_shop=5):
    cleaned_df_copy = cleaned_df.copy()

    active_items = (
        cleaned_df_copy.groupby('item_id')['date_block_num'].nunique()
        .loc[lambda x: x >= min_months_item].index
    )
    active_shops = (
        cleaned_df_copy.groupby('shop_id')['date_block_num'].nunique()
        .loc[lambda x: x >= min_months_shop].index
    )
    filtered_df = cleaned_df_copy[
        cleaned_df_copy['item_id'].isin(active_items) &
        cleaned_df_copy['shop_id'].isin(active_shops)
    ]
    
    # Valid shop-item pairs
    valid_pairs = filtered_df[['shop_id', 'item_id']].drop_duplicates()
    months = sorted(filtered_df['date_block_num'].unique())
    full_grid = []
    for month in months:
        temp = valid_pairs.copy()
        temp['date_block_num'] = month
        full_grid.append(temp)
    full_grid = pd.concat(full_grid, ignore_index=True)

    # Monthly aggregation
    monthly_data = filtered_df.groupby(
        ['shop_id', 'item_id', 'date_block_num']
    ).agg({
        'item_cnt_day': 'sum',
        'item_price': 'mean',
        'Return': 'sum'
    }).reset_index()

    full_grid = full_grid.merge(monthly_data, on=['shop_id', 'item_id', 'date_block_num'], how='left')
    full_grid['item_cnt_day'] = full_grid['item_cnt_day'].fillna(0)
    full_grid['Return'] = full_grid['Return'].fillna(0)
    full_grid['item_price'] = full_grid['item_price'].fillna(cleaned_df_copy['item_price'].median())

    full_grid.rename(columns={'item_cnt_day': 'item_cnt_month'}, inplace=True)
    full_grid = full_grid.sort_values(['shop_id', 'item_id', 'date_block_num'])

    # Add representative date for each date_block_num
    # Assume 'date' is a datetime column; take the first date for each date_block_num
    date_mapping = filtered_df.groupby('date_block_num')['date'].min().reset_index()
    full_grid = full_grid.merge(date_mapping, on='date_block_num', how='left')

    print(f"Created base grid with {len(full_grid):,} shop-item-month combinations")
    return full_grid, filtered_df

# Recreate grid and get filtered data
sales_grid, filtered_df = create_sales_grid(cleaned_data)

# Merge item-related metadata
items_df = cleaned_data[['item_id', 'item_name', 'item_category_id']].drop_duplicates(subset=['item_id'])
sales_grid = sales_grid.merge(items_df, on='item_id', how='left')

# Merge item category name
item_categories_df = cleaned_data[['item_category_id', 'item_category_name']].drop_duplicates(subset=['item_category_id'])
sales_grid = sales_grid.merge(item_categories_df, on='item_category_id', how='left')

# Merge shop name
shops_df = cleaned_data[['shop_id', 'shop_name']].drop_duplicates(subset=['shop_id'])
sales_grid = sales_grid.merge(shops_df, on='shop_id', how='left')

# Check missing values
print("Missing after fix:\n", sales_grid[['date', 'item_name', 'item_category_id', 'item_category_name', 'shop_name']].isna().sum())

# Rename to merged_df for consistency
merged_df = sales_grid

Created base grid with 13,529,586 shop-item-month combinations
Missing after fix:
 date                  0
item_name             0
item_category_id      0
item_category_name    0
shop_name             0
dtype: int64


In [107]:
merged_df.nunique()

shop_id                  51
item_id               17313
date_block_num           34
item_cnt_month         6872
item_price            39712
Return                   12
date                     34
item_name             16837
item_category_id         74
item_category_name       73
shop_name                51
dtype: int64

In [98]:
min_rows = 2_000_000
max_rows = 2_900_000

# Calculate average months per pair
avg_months_per_pair = merged_df.groupby(['shop_id', 'item_id']).size().mean()
min_pairs = int(min_rows / avg_months_per_pair)
max_pairs = int(max_rows / avg_months_per_pair)
print(f"Aim for {min_pairs:,} to {max_pairs:,} shop-item pairs")

# Score pairs with balanced criteria
pair_scores = (
    merged_df.groupby(['shop_id', 'item_id'])
    .agg(
        total_sales=('item_cnt_month', 'sum'),  # Total sales volume
        sales_activity=('item_cnt_month', lambda x: (x > 0).sum()),  # Months with non-zero sales
        recency=('date_block_num', 'max'),  # Most recent month
        category=('item_category_id', 'first')  # For stratification
    )
    .assign(
        score=lambda x: (
            0.4 * (x['total_sales'] / (x['total_sales'].max() + 1)) +  # Normalize sales
            0.4 * (x['sales_activity'] / (x['sales_activity'].max() + 1)) +  # Normalize activity
            0.2 * ((x['recency'] - merged_df['date_block_num'].min()) /
                   (merged_df['date_block_num'].max() - merged_df['date_block_num'].min() + 1))  # Normalize recency
        )
    )
)

# Stratified sampling: Select top pairs per item_category_id to ensure diversity
def stratified_sample(df, n_pairs, group_col='category'):
    # Target pairs per category, proportional to category size
    category_counts = df[group_col].value_counts()
    total_pairs = min(max_pairs, max(min_pairs, n_pairs))
    category_proportions = category_counts / category_counts.sum()
    category_pairs = (category_proportions * total_pairs).round().astype(int)
    
    selected_pairs = []
    for cat, n in category_pairs.items():
        cat_pairs = df[df[group_col] == cat].nlargest(n, 'score').index
        selected_pairs.extend(cat_pairs)
    
    # If under target, add more from top scores
    if len(selected_pairs) < min_pairs:
        remaining = min_pairs - len(selected_pairs)
        extra_pairs = df[~df.index.isin(selected_pairs)].nlargest(remaining, 'score').index
        selected_pairs.extend(extra_pairs)
    # If over target, trim
    selected_pairs = selected_pairs[:max_pairs]
    return selected_pairs

# Select pairs (aim for middle of range, e.g., ~2.45M rows)
target_pairs = int((min_pairs + max_pairs) / 2)
selected_pairs = stratified_sample(pair_scores, target_pairs, group_col='category')

# Apply filter
filtered_data = merged_df[
    merged_df.set_index(['shop_id', 'item_id']).index.isin(selected_pairs)
]

# 4. Verify
print(f"\nFiltered Data Shape: {filtered_data.shape}")
print(f"Unique pairs: {filtered_data[['shop_id', 'item_id']].drop_duplicates().shape[0]:,}")
print(f"Avg months per pair: {filtered_data.groupby(['shop_id', 'item_id']).size().mean():.1f}")
print("Data types:\n", filtered_data[['shop_id', 'item_id', 'item_name']].dtypes)
print("Sample data:\n", filtered_data[['date', 'shop_id', 'item_id', 'item_name', 'item_cnt_month']].head())
print("Category distribution:\n", filtered_data['item_category_id'].value_counts().head(10))

Aim for 58,823 to 85,294 shop-item pairs

Filtered Data Shape: (2449938, 11)
Unique pairs: 72,057
Avg months per pair: 34.0
Data types:
 shop_id       int64
item_id       int64
item_name    object
dtype: object
Sample data:
          date  shop_id  item_id                       item_name  \
68 2013-01-01        2       31  007: SKIPHALL COORDINATES (BD)   
69 2013-02-01        2       31  007: SKIPHALL COORDINATES (BD)   
70 2013-03-01        2       31  007: SKIPHALL COORDINATES (BD)   
71 2013-04-01        2       31  007: SKIPHALL COORDINATES (BD)   
72 2013-05-01        2       31  007: SKIPHALL COORDINATES (BD)   

    item_cnt_month  
68             0.0  
69             4.0  
70             1.0  
71             1.0  
72             0.0  
Category distribution:
 item_category_id
40    645354
55    354348
37    276658
30    114342
19    110704
23     83232
38     57936
63     53754
41     51272
72     50728
Name: count, dtype: int64


In [110]:
filtered_data.nunique()

shop_id                  51
item_id                8908
date_block_num           34
item_cnt_month         4973
item_price            27526
Return                    9
date                     34
item_name              8686
item_category_id         71
item_category_name       70
shop_name                51
dtype: int64

In [101]:
filtered_data['item_name'].nunique()

8686

In [127]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

def create_lag_features(df, group_cols, value_cols, lags=[1, 2, 3]):
    """
    Create lag features for specified value columns grouped by given columns.
    Lag features will have NaN for initial periods where lag does not exist.
    """
    df = df.sort_values(group_cols + ['date_block_num']).copy()
    for col in value_cols:
        for lag in lags:
            lag_col = f"{col}_lag_{lag}"
            # Shift values by lag within group
            df[lag_col] = df.groupby(group_cols)[col].shift(lag)
    return df

def impute_nans_with_median_then_mean(df, cols):
    """
    Impute NaNs in specified columns with median.
    If median is zero, then try mean.
    If mean is also zero, fill with 0.
    Returns the DataFrame and dictionary of imputation values.
    """
    imputation_values = {}
    for col in cols:
        nan_count = df[col].isna().sum()
        if nan_count > 0:
            median_val = df[col].median()
            if median_val != 0:
                fill_val = median_val
                method = 'median'
            else:
                mean_val = df[col].mean()
                if mean_val != 0:
                    fill_val = mean_val
                    method = 'mean'
                else:
                    fill_val = 0
                    method = 'zero'
            df[col] = df[col].fillna(fill_val).astype(np.float32)
            imputation_values[col] = (fill_val, method)
            print(f"Imputed {nan_count:,} NaN values in '{col}' with {method} value {fill_val:.4f}")
        else:
            imputation_values[col] = (None, None)
    return df, imputation_values

# Assuming filtered_data is your input DataFrame
# filtered_data = pd.read_csv('your_data.csv') or defined elsewhere

working_df = filtered_data.copy()

print(f"Starting feature engineering on filtered_data with {len(working_df):,} rows")

group_cols = ['shop_id', 'item_id']
value_cols = ['item_cnt_month']  # Only lag this column
lags = [1, 2, 3]

engineered_df = create_lag_features(working_df, group_cols, value_cols, lags)

lag_features = [f"{col}_lag_{lag}" for col in value_cols for lag in lags]
print(f"Created lag features: {', '.join(lag_features)}")

engineered_df, imputation_values = impute_nans_with_median_then_mean(engineered_df, lag_features)

print("\nMissing values after imputation:")
print(engineered_df[lag_features].isna().sum())

print("\nSample of engineered DataFrame with lag features:")
print(engineered_df[['shop_id', 'item_id', 'date_block_num', 'item_cnt_month'] + lag_features].head(10))


Starting feature engineering on filtered_data with 2,449,938 rows
Created lag features: item_cnt_month_lag_1, item_cnt_month_lag_2, item_cnt_month_lag_3
Imputed 72,057 NaN values in 'item_cnt_month_lag_1' with mean value 0.8498
Imputed 144,114 NaN values in 'item_cnt_month_lag_2' with mean value 0.8620
Imputed 216,171 NaN values in 'item_cnt_month_lag_3' with mean value 0.8762

Missing values after imputation:
item_cnt_month_lag_1    0
item_cnt_month_lag_2    0
item_cnt_month_lag_3    0
dtype: int64

Sample of engineered DataFrame with lag features:
    shop_id  item_id  date_block_num  item_cnt_month  item_cnt_month_lag_1  \
68        2       31               0             0.0              0.849805   
69        2       31               1             4.0              0.000000   
70        2       31               2             1.0              4.000000   
71        2       31               3             1.0              1.000000   
72        2       31               4             0.0  

In [130]:
from sklearn.preprocessing import RobustScaler
import numpy as np

# Step 1: Copy engineered_df
scaled_df = engineered_df.copy()

# Step 2: Encode categorical features as category codes (for entity embeddings later)
categorical_cols = ['item_name', 'item_category_name', 'shop_name']
for col in categorical_cols:
    scaled_df[col] = scaled_df[col].astype('category').cat.codes.astype('int32')

# Keep item_category_id as int32 (assuming numeric categorical ID)
scaled_df['item_category_id'] = scaled_df['item_category_id'].astype('int32')

# Step 3: Identify numerical columns for scaling
numerical_cols = [
    'item_cnt_month', 'item_price', 'Return',
    'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3'
]

# Step 4: Apply RobustScaler to numerical columns
scaler = RobustScaler()
scaled_values = scaler.fit_transform(scaled_df[numerical_cols])
scaled_df[numerical_cols] = scaled_values.astype(np.float32)

# Step 5: Extract year from date (date column remains unchanged)
scaled_df['year'] = scaled_df['date'].dt.year.astype('int16')

# Step 6: Save date lookup for post-processing
dates_for_lookup = scaled_df[['shop_id', 'item_id', 'date_block_num', 'date']].copy()
dates_for_lookup.to_csv("date_lookup.csv", index=False)

# Step 7: Confirm
print("Scaled dataframe sample:")
print(scaled_df.head())
print("\nScaled dataframe dtypes:")
print(scaled_df.dtypes)


Scaled dataframe sample:
    shop_id  item_id  date_block_num  item_cnt_month  item_price  Return  \
68        2       31               0             0.0         0.0     0.0   
69        2       31               1             4.0       300.0     0.0   
70        2       31               2             1.0       299.5     0.0   
71        2       31               3             1.0       300.0     0.0   
72        2       31               4             0.0         0.0     0.0   

         date  item_name  item_category_id  item_category_name  shop_name  \
68 2013-01-01         72                37                  15          5   
69 2013-02-01         72                37                  15          5   
70 2013-03-01         72                37                  15          5   
71 2013-04-01         72                37                  15          5   
72 2013-05-01         72                37                  15          5   

    item_cnt_month_lag_1  item_cnt_month_lag_2  item_cn

In [133]:
scaled_df.shape

(2449938, 15)

In [131]:
import numpy as np

def create_encoder_decoder_sequences(df, group_cols, feature_cols, encoder_seq_len=10, decoder_seq_len=5):
    
    X_list = []
    y_list = []

    # Sort the dataframe to ensure correct time order
    df = df.sort_values(group_cols + ['date_block_num']).reset_index(drop=True)
    
    # Group by shop_id and item_id
    grouped = df.groupby(group_cols)
    
    for _, group in grouped:
        group = group.reset_index(drop=True)
        total_time_steps = len(group)
        
        # Number of sequences we can create from this group
        max_start_idx = total_time_steps - encoder_seq_len - decoder_seq_len + 1
        
        for start_idx in range(max_start_idx):
            encoder_seq = group.loc[start_idx : start_idx + encoder_seq_len - 1, feature_cols].values
            decoder_seq = group.loc[start_idx + encoder_seq_len : start_idx + encoder_seq_len + decoder_seq_len - 1, feature_cols].values
            
            X_list.append(encoder_seq)
            y_list.append(decoder_seq)
    
    X = np.array(X_list)
    y = np.array(y_list)
    
    print(f"Created {len(X)} sequences with encoder length {encoder_seq_len} and decoder length {decoder_seq_len}")
    print(f"X shape: {X.shape}, y shape: {y.shape}")
    return X, y

group_cols = ['shop_id', 'item_id']
feature_cols = ['item_cnt_month', 'item_price', 'Return', 'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3']
encoder_seq_len = 10  # past 10 months
decoder_seq_len = 5   # predict next 5 months

X, y = create_encoder_decoder_sequences(scaled_df, group_cols, feature_cols, encoder_seq_len, decoder_seq_len)


Created 1441140 sequences with encoder length 10 and decoder length 5
X shape: (1441140, 10, 6), y shape: (1441140, 5, 6)


In [144]:
import numpy as np
import pandas as pd

# Your temporal split function
def temporal_train_val_test_split(X, y, sequence_start_dates, val_months=2, test_months=1, total_months=34):
    val_start = total_months - val_months - test_months
    test_start = total_months - test_months
    
    sequence_start_dates = np.array(sequence_start_dates)
    
    train_mask = sequence_start_dates < val_start
    val_mask = (sequence_start_dates >= val_start) & (sequence_start_dates < test_start)
    test_mask = sequence_start_dates >= test_start
    
    X_train, y_train = X[train_mask], y[train_mask]
    X_val, y_val = X[val_mask], y[val_mask]
    X_test, y_test = X[test_mask], y[test_mask]
    
    print(f"Train set: {X_train.shape[0]} sequences")
    print(f"Validation set: {X_val.shape[0]} sequences")
    print(f"Test set: {X_test.shape[0]} sequences")
    
    return X_train, y_train, X_val, y_val, X_test, y_test

sequence_start_dates = np.random.randint(0, 34, size=X.shape[0])

# Step 1: Split data temporally
X_train, y_train, X_val, y_val, X_test, y_test = temporal_train_val_test_split(
    X, y, sequence_start_dates, val_months=2, test_months=1, total_months=34
)



def flatten_sequences(arr, prefix):
    n_samples, n_timesteps, n_features = arr.shape
    # Reshape to (samples, timesteps*features)
    arr_flat = arr.reshape(n_samples, n_timesteps * n_features)
    # Create columns names: feat_0_t0, feat_1_t0, ..., feat_5_t9 (assuming 6 features, 10 timesteps)
    cols = []
    for t in range(n_timesteps):
        for f in range(n_features):
            cols.append(f"{prefix}_t{t}_f{f}")
    df = pd.DataFrame(arr_flat, columns=cols)
    return df

# Flatten
X_train_df = flatten_sequences(X_train, "X")
y_train_df = flatten_sequences(y_train, "y")
X_val_df = flatten_sequences(X_val, "X")
y_val_df = flatten_sequences(y_val, "y")
X_test_df = flatten_sequences(X_test, "X")
y_test_df = flatten_sequences(y_test, "y")

# Step 3: Save each split as separate parquet files
X_train_df.to_parquet("X_train.parquet", index=False)
y_train_df.to_parquet("y_train.parquet", index=False)

X_val_df.to_parquet("X_val.parquet", index=False)
y_val_df.to_parquet("y_val.parquet", index=False)

X_test_df.to_parquet("X_test.parquet", index=False)
y_test_df.to_parquet("y_test.parquet", index=False)

print("Saved all datasets as parquet files.")


Train set: 1313865 sequences
Validation set: 84885 sequences
Test set: 42390 sequences
Saved all datasets as parquet files.


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import polars as pl
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# GPU setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.set_float32_matmul_precision('high')

class SalesDataset(Dataset):
    def __init__(self, X_file, y_file, target_col='target', sequence_length=12, num_shops=51, num_items=8000, num_categories=74):
        self.X = pl.read_parquet(X_file)
        self.y = pl.read_parquet(y_file).select([target_col]).to_numpy().flatten().astype(np.float32)
        self.dates = self.X['date'].to_numpy()  # For interpretability
        if len(self.X) != len(self.y):
            raise ValueError("X and y length mismatch")
        
        self.sequence_length = sequence_length
        self.numerical_cols = ['date_block_num', 'returns', 'lag_1', 'lag_2', 'lag_3', 'shop_mean_lag_1', 'category_mean_lag_1']
        self.categorical_cols = ['shop_id_encoded', 'item_id_encoded', 'item_category_id_encoded']
        
        # Normalize numerical data
        numerical_data = self.X.select(self.numerical_cols).to_numpy().astype(np.float32)
        numerical_data = np.clip(numerical_data, -1e5, 1e5)
        mean = numerical_data.mean(axis=0, keepdims=True)
        std = numerical_data.std(axis=0, keepdims=True) + 1e-6
        self.numerical = (numerical_data - mean) / std
        
        # Normalize target
        self.y = np.clip(self.y, -1e5, 1e5)
        self.y_mean = 0.5224
        self.y_std = self.y.std() + 1e-6
        self.y = (self.y - self.y_mean) / self.y_std
        
        # Categorical data
        self.shop_ids = self.X['shop_id_encoded'].to_numpy().astype(np.int64).clip(0, num_shops - 1)
        self.item_ids = self.X['item_id_encoded'].to_numpy().astype(np.int64).clip(0, num_items - 1)
        self.category_ids = self.X['item_category_id_encoded'].to_numpy().astype(np.int64).clip(0, num_categories - 1)
        self.date_block_num = self.X['date_block_num'].to_numpy().astype(np.int32)
        
    def __len__(self):
        return len(self.X) - self.sequence_length + 1
    
    def __getitem__(self, idx):
        start_idx = idx
        end_idx = idx + self.sequence_length
        numerical = torch.tensor(self.numerical[start_idx:end_idx], dtype=torch.float32)
        shop_ids = torch.tensor(self.shop_ids[start_idx:end_idx], dtype=torch.int64)
        item_ids = torch.tensor(self.item_ids[start_idx:end_idx], dtype=torch.int64)
        category_ids = torch.tensor(self.category_ids[start_idx:end_idx], dtype=torch.int64)
        target = torch.tensor(self.y[end_idx - 1], dtype=torch.float32)
        identifiers = torch.tensor([self.shop_ids[end_idx - 1], self.item_ids[end_idx - 1], self.date_block_num[end_idx - 1]], dtype=torch.int32)
        dates = self.dates[start_idx:end_idx]
        return {
            'numerical': numerical, 'shop_ids': shop_ids, 'item_ids': item_ids, 'category_ids': category_ids,
            'target': target, 'identifiers': identifiers, 'dates': dates
        }

class HALSTM(nn.Module):
    def __init__(self, num_shops=51, num_items=8000, num_categories=74, embed_dim=16, numerical_dim=7,
                 hidden_dim=128, num_layers=2, num_heads=4, dropout=0.35):
        super(HALSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.shop_embed = nn.Embedding(num_shops, embed_dim)
        self.item_embed = nn.Embedding(num_items, embed_dim)
        self.category_embed = nn.Embedding(num_categories, embed_dim)
        nn.init.normal_(self.shop_embed.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.item_embed.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.category_embed.weight, mean=0.0, std=0.02)
        
        self.input_dim = numerical_dim + embed_dim * 3
        self.lstm = nn.LSTM(self.input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.lstm_norm = nn.LayerNorm(hidden_dim)
        self.mha = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)
        self.mha_norm = nn.LayerNorm(hidden_dim)
        self.gate = nn.Linear(hidden_dim * 2, hidden_dim)
        self.sigmoid = nn.Sigmoid()
        self.fc_shared = nn.Linear(hidden_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        
        self.positional_encoding = torch.zeros(100, hidden_dim).to(device)
        position = torch.arange(0, 100, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / hidden_dim))
        self.positional_encoding[:, 0::2] = torch.sin(position * div_term)
        self.positional_encoding[:, 1::2] = torch.cos(position * div_term)
        
    def forward(self, numerical, shop_ids, item_ids, category_ids):
        batch_size, seq_len, _ = numerical.size()
        shop_embed = self.shop_embed(shop_ids)
        item_embed = self.item_embed(item_ids)
        category_embed = self.category_embed(category_ids)
        
        x = torch.cat([numerical, shop_embed, item_embed, category_embed], dim=-1)
        x = self.dropout(x)
        
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=x.device)
        lstm_out, _ = self.lstm(x, (h0, c0))
        lstm_out = self.lstm_norm(lstm_out)
        
        lstm_out = lstm_out + self.positional_encoding[:seq_len, :].unsqueeze(0)
        
        mha_out, mha_weights = self.mha(lstm_out, lstm_out, lstm_out)
        mha_out = self.mha_norm(mha_out)
        
        combined = torch.cat([lstm_out[:, -1, :], mha_out[:, -1, :]], dim=-1)
        gate = self.sigmoid(self.gate(combined))
        fused = gate * lstm_out[:, -1, :] + (1 - gate) * mha_out[:, -1, :]
        
        shared = self.relu(self.fc_shared(fused))
        output = self.fc_out(shared)
        
        return output.squeeze(-1), {'mha_weights': mha_weights, 'gate_weights': gate}

def collate_fn(batch):
    if not batch:
        return {}
    return {
        'numerical': torch.stack([item['numerical'] for item in batch]),
        'shop_ids': torch.stack([item['shop_ids'] for item in batch]),
        'item_ids': torch.stack([item['item_ids'] for item in batch]),
        'category_ids': torch.stack([item['category_ids'] for item in batch]),
        'target': torch.stack([item['target'] for item in batch]),
        'identifiers': torch.stack([item['identifiers'] for item in batch]),
        'dates': [item['dates'] for item in batch]
    }

def train_model(model, train_loader, val_loader, num_epochs=50, lr=0.001, accum_steps=2):
    criterion = nn.MSELoss().to(device)
    scaler = torch.cuda.amp.GradScaler()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.05)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr, epochs=num_epochs, steps_per_epoch=len(train_loader)//accum_steps)
    
    metrics = {'epoch': [], 'train_mse': [], 'train_rmse': [], 'val_mse': [], 'val_rmse': []}
    best_val_loss = float('inf')
    output_dir = Path('/workspace/results')
    output_dir.mkdir(exist_ok=True)
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_mse = 0
        optimizer.zero_grad()
        
        for batch_idx, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}")):
            numerical = batch['numerical'].to(device)
            shop_ids = batch['shop_ids'].to(device)
            item_ids = batch['item_ids'].to(device)
            category_ids = batch['category_ids'].to(device)
            target = batch['target'].to(device)
            
            with torch.cuda.amp.autocast():
                output, _ = model(numerical, shop_ids, item_ids, category_ids)
                mse_loss = criterion(output, target) / accum_steps
                loss = mse_loss
            
            scaler.scale(loss).backward()
            if (batch_idx + 1) % accum_steps == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
                scaler.step(optimizer)
                scheduler.step()
                scaler.update()
                optimizer.zero_grad()
            
            train_loss += mse_loss.item() * accum_steps
            train_mse += mse_loss.item() * accum_steps
        
        train_loss /= len(train_loader)
        train_mse /= len(train_loader)
        train_rmse = np.sqrt(train_mse)
        
        model.eval()
        val_loss = 0
        val_mse = 0
        with torch.no_grad():
            for batch in val_loader:
                numerical = batch['numerical'].to(device)
                shop_ids = batch['shop_ids'].to(device)
                item_ids = batch['item_ids'].to(device)
                category_ids = batch['category_ids'].to(device)
                target = batch['target'].to(device)
                
                with torch.cuda.amp.autocast():
                    output, _ = model(numerical, shop_ids, item_ids, category_ids)
                    loss = criterion(output, target)
                
                val_loss += loss.item()
                val_mse += loss.item()
        
        val_loss /= len(val_loader)
        val_mse /= len(val_loader)
        val_rmse = np.sqrt(val_mse)
        
        metrics['epoch'].append(epoch + 1)
        metrics['train_mse'].append(train_mse)
        metrics['train_rmse'].append(train_rmse)
        metrics['val_mse'].append(val_mse)
        metrics['val_rmse'].append(val_rmse)
        print(f"Epoch {epoch+1}, Train MSE: {train_mse:.4f}, RMSE: {train_rmse:.4f}, Val MSE: {val_mse:.4f}, RMSE: {val_rmse:.4f}")
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), output_dir / 'best_ha_lstm.pth')
    
    metrics_df = pd.DataFrame(metrics)
    metrics_df.to_csv(output_dir / 'training_metrics.csv', index=False)
    
    plt.figure(figsize=(8, 5))
    plt.plot(metrics['epoch'], metrics['val_rmse'], label='Val RMSE')
    plt.xlabel('Epoch')
    plt.ylabel('RMSE')
    plt.title('Validation RMSE')
    plt.legend()
    plt.savefig(output_dir / 'rmse_plot.png')
    plt.close()
    
    model.load_state_dict(torch.load(output_dir / 'best_ha_lstm.pth'))
    return model, metrics_df

def predict(model, test_loader, dataset):
    model.eval()
    predictions = []
    interpret_outputs = []
    modalities = ['numerical', 'shop_id', 'item_id', 'item_category_id']
    
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Predicting"):
            numerical = batch['numerical'].to(device)
            shop_ids = batch['shop_ids'].to(device)
            item_ids = batch['item_ids'].to(device)
            category_ids = batch['category_ids'].to(device)
            identifiers = batch['identifiers']
            dates = batch['dates']
            
            with torch.cuda.amp.autocast():
                output, attn_dict = model(numerical, shop_ids, item_ids, category_ids)
                preds = output.cpu().numpy() * dataset.y_std + dataset.y_mean
            
            mha_weights = attn_dict['mha_weights'][:, -1, :].cpu().numpy()
            gate_weights = attn_dict['gate_weights'].cpu().numpy()
            
            for i in range(len(preds)):
                predictions.append({
                    'shop_id': identifiers[i][0].item(),
                    'item_id': identifiers[i][1].item(),
                    'date_block_num': identifiers[i][2].item(),
                    'forecast_h1': preds[i]
                })
                interpret_outputs.append({
                    'timestamp_reference': dates[i][-1].isoformat(),
                    'forecasted_value': float(preds[i]),
                    'fusion_weights': gate_weights[i].tolist(),
                    'attention_weights': mha_weights[i].tolist(),
                    'input_sequence_dates': [d.isoformat() for d in dates[i]],
                    'modalities_used': modalities,
                    'gating_decision_output': gate_weights[i].tolist()
                })
    
    pred_df = pd.DataFrame(predictions)
    interpret_pred_df = pd.DataFrame(interpret_outputs(pred_df))
    
    output_dir = Path('/workspace/results')
    pred_df.to_csv(output_dir / 'predictions.csv', index=False)
    interpret_df.to_csv(output_dir / 'interpretability_outputs.csv', index=False)
    
    return pred_df, interpret_df

def visualize_results(pred_df):
    output_dir = Path('/workspace/results')
    output_dir.mkdir(exist_ok=True)
    
    plt.figure(figsize=(8, 5))
    sns.kdeplot(pred_df['forecast_h1'], label='Horizon 1')
    plt.title('Prediction Distribution')
    plt.xlabel('Predicted Sales')
    plt.ylabel('Density')
    plt.legend()
    plt.savefig(output_dir / 'prediction_distribution.png')
    plt.close()

def main():
    data_dir = Path('/workspace/data')
    train_X_path = data_dir / 'X_train.parquet'
    train_y_path = data_dir / 'y_train.parquet'
    val_X_path = data_dir / 'X_val.parquet'
    val_y_path = data_dir / 'y_val.parquet'
    test_X_path = data_dir / 'X_test.parquet'
    test_y_path = data_dir / 'y_test.parquet'
    
    batch_size = 4096
    num_workers = 8
    num_epochs = 50
    lr = 0.001
    accum_steps = 2
    
    # Create dataset objects
    train_dataset = SalesDataset(train_X_path, train_y_path, target_col='target')
    val_dataset = SalesDataset(val_X_path, val_y_path, target_col='target')
    test_dataset = SalesDataset(test_X_path, test_y_path, target_col='target')
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, collate_fn=collate_fn)
    
    # Create model object
    model = HALSTM().to(device)
    
    # Train model
    model, metrics_df = train_model(model, train_loader, val_loader, num_epochs, lr, accum_steps)
    
    # Predict and generate interpretability outputs
    pred_df, interpret_df = predict(model, test_loader, test_dataset)
    
    # Visualize results
    visualize_results(pred_df)
    
    
    