In [2]:
import os
import sys
import subprocess
import warnings
import shutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import RobustScaler

warnings.filterwarnings('ignore')

def check_cuda_environment():
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. This notebook requires a GPU.")
    
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU count: {torch.cuda.device_count()}")
    print(f"Current device: {torch.cuda.current_device()}")
    
    try:
        test_tensor = torch.tensor([1.0, 2.0, 3.0], device='cuda')
        test_result = test_tensor + 1
        print(f"CUDA test operation successful: {test_result}")
    except Exception as e:
        print(f"CUDA test operation failed: {e}")
        raise
    
    try:
        nvidia_smi = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
        print("NVIDIA-SMI output:")
        print(nvidia_smi.stdout)
    except Exception as e:
        print(f"Failed to run nvidia-smi: {e}")
    
    print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 2**30:.2f} GiB")
    print(f"Allocated GPU memory: {torch.cuda.memory_allocated(0) / 2**30:.2f} GiB")
    print(f"Reserved GPU memory: {torch.cuda.memory_reserved(0) / 2**30:.2f} GiB")

try:
    check_cuda_environment()
except Exception as e:
    print(f"Error with PyTorch or CUDA setup: {e}")
    print("Try reinstalling PyTorch: pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cu124")
    raise

print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")
print(f"PATH: {os.environ.get('PATH')}")
print(f"Available disk space: {shutil.disk_usage('/').free / (2**30):.2f} GiB")

if os.path.exists('/workspace/XAI/torch.py') or os.path.exists('/workspace/XAI/torch.pyc'):
    print("Warning: Found 'torch.py' or 'torch.pyc' in /workspace/XAI. Please rename or remove it.")

torch.manual_seed(42)
np.random.seed(42)

device = torch.device("cuda")
print(f"Using device: {device}")

torch.cuda.empty_cache()
print("Cleared GPU memory cache")

PyTorch version: 2.2.2+cu121
CUDA available: True
CUDA version: 12.1
GPU device: NVIDIA RTX A6000
GPU count: 1
Current device: 0
CUDA test operation successful: tensor([2., 3., 4.], device='cuda:0')
NVIDIA-SMI output:
Mon May 12 04:37:02 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01              Driver Version: 565.57.01      CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A6000               On  |   00000000:D5:00.0 Off |                    0 |
| 30%   33C    P2             76W /  300W |     315MiB /  46068MiB |      0%      Default |
|             

In [3]:
def load_data(data_dir="/workspace/data", file_name="merged_data.csv"):
    """Load CSV data with fallback path."""
    file_path = os.path.join(data_dir, file_name)
    alt_path = "/workspace/XAI-2/Predict Future Sales/merged_data.csv"
    
    if not os.path.exists(file_path):
        if os.path.exists(alt_path):
            file_path = alt_path
        else:
            raise FileNotFoundError(f"File not found at {file_path} or {alt_path}")
    
    try:
        data = pd.read_csv(file_path)
        print(f"Loaded data from {file_path}")
        print(f"Dataset shape: {data.shape}")
        print(f"Columns: {list(data.columns)}")
    except Exception as e:
        raise RuntimeError(f"Failed to load {file_path}: {e}")
    
    expected_columns = ['date', 'shop_id', 'item_id', 'item_name', 'item_cnt_day', 
                        'item_price', 'item_category_id', 'shop_name', 'item_category_name', 'date_block_num']
    missing_cols = [col for col in expected_columns if col not in data.columns]
    if missing_cols:
        print(f"Warning: Missing expected columns: {missing_cols}")
    
    return data

data = load_data()

Loaded data from /workspace/XAI-2/Predict Future Sales/merged_data.csv
Dataset shape: (2935849, 10)
Columns: ['date', 'date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_day', 'item_name', 'item_category_id', 'item_category_name', 'shop_name']


In [4]:
import polars as pl
from datetime import datetime

def get_russian_holidays():
    """Return DataFrame of Russian holidays and shopping events (2013–2015)."""
    holidays = [
        ("2013-01-01", "New Year"), ("2014-01-01", "New Year"), ("2015-01-01", "New Year"),
        ("2013-02-23", "Defender Day"), ("2014-02-23", "Defender Day"), ("2015-02-23", "Defender Day"),
        ("2013-03-08", "Women's Day"), ("2014-03-08", "Women's Day"), ("2015-03-08", "Women's Day"),
        ("2013-06-12", "Russia Day"), ("2014-06-12", "Russia Day"), ("2015-06-12", "Russia Day"),
        ("2013-11-29", "Black Friday"), ("2014-11-28", "Black Friday"), ("2015-11-27", "Black Friday")
    ]
    holiday_df = pl.DataFrame({
        "date": [datetime.strptime(date, "%Y-%m-%d") for date, _ in holidays],
        "holiday": [name for _, name in holidays]
    }).with_columns([
        pl.col("date").dt.month().cast(pl.Int32).alias("month"),
        pl.col("date").dt.year().cast(pl.Int32).alias("year")
    ])
    return holiday_df

holiday_df = get_russian_holidays()
print(f"Holiday DataFrame created with {len(holiday_df)} entries")

Holiday DataFrame created with 15 entries


In [5]:
import time
import polars as pl

def initial_preprocessing(df):
    """Perform initial data cleaning and filtering, including shop selection."""
    start_time = time.time()
    
    # Convert to Polars
    df = pl.from_pandas(df)
    print(f"Initial dataset size: {len(df)}")
    print(f"Unique shops: {df['shop_id'].n_unique()}, items: {df['item_id'].n_unique()}")
    print(f"Date block range: {df['date_block_num'].min()}–{df['date_block_num'].max()}")
    
    # Parse dates
    df = df.with_columns(
        pl.col('date').str.strptime(pl.Date, "%d.%m.%Y")
    ).with_columns([
        pl.col('date').dt.month().cast(pl.Int32).alias('month'),
        pl.col('date').dt.year().cast(pl.Int32).alias('year')
    ])
    
    # Filter to date_block_num <= 32
    df = df.filter(pl.col('date_block_num') <= 32)
    print(f"Dataset size after date filter: {len(df)}")
    
    # Filter to top 54 shops by total sales volume
    shop_sales = df.group_by('shop_id').agg(
        total_sales=pl.col('item_cnt_day').sum()
    ).sort('total_sales', descending=True).head(54)
    valid_shops = shop_sales['shop_id'].to_list()
    df = df.filter(pl.col('shop_id').is_in(valid_shops))
    print(f"Dataset size after shop filter: {len(df)}")
    print(f"Unique shops after processing: {df['shop_id'].n_unique()}")
    
    # Optimize dtypes
    df = df.with_columns([
        pl.col('date_block_num').cast(pl.Int16),
        pl.col('shop_id').cast(pl.Int32),
        pl.col('item_id').cast(pl.Int32),
        pl.col('item_category_id').cast(pl.Int32),
        pl.col('item_cnt_day').cast(pl.Float32),
        pl.col('item_price').cast(pl.Float32),
        pl.col('month').cast(pl.Int32),
        pl.col('year').cast(pl.Int32)
    ])
    
    print(f"Initial preprocessing time: {time.time() - start_time:.2f} seconds")
    return df

df = initial_preprocessing(data)

Initial dataset size: 2935849
Unique shops: 60, items: 21807
Date block range: 0–33
Dataset size after date filter: 2882335
Dataset size after shop filter: 2868195
Unique shops after processing: 54
Initial preprocessing time: 2.48 seconds


In [6]:
def handle_negative_sales_and_outliers(df):
    """Apply rolling Winsorization and handle negative sales at daily level."""
    start_time = time.time()
    
    # Diagnostics: Inspect item_cnt_day distribution
    print("item_cnt_day distribution before processing:")
    print(df['item_cnt_day'].describe())
    print(f"Rows with item_cnt_day > 100: {df.filter(pl.col('item_cnt_day') > 100).height}")
    print(f"Rows with item_cnt_day > 500: {df.filter(pl.col('item_cnt_day') > 500).height}")
    
    # Sort for rolling operations
    df = df.sort(['shop_id', 'item_id', 'date'])
    
    # Rolling Winsorization (30-day window, 95th percentile) with static cap
    df = df.with_columns(
        rolling_quantile=pl.col('item_cnt_day').rolling_quantile(
            quantile=0.95, window_size=30, min_periods=1
        ).over(['shop_id', 'item_id'])
    ).with_columns(
        item_cnt_day_winsor=pl.col('item_cnt_day').clip(None, pl.min_horizontal(pl.col('rolling_quantile'), 1000))
    )
    outlier_count = df.filter(pl.col('item_cnt_day') > pl.col('rolling_quantile')).height
    print(f"Outliers capped: {outlier_count} ({outlier_count / len(df) * 100:.2f}%)")
    
    # Diagnostics: Inspect rolling quantile and winsorized values
    print("Rolling quantile stats:")
    print(df['rolling_quantile'].describe())
    print(f"Max item_cnt_day_winsor: {df['item_cnt_day_winsor'].max()}")
    
    # Handle negative sales
    df = df.with_columns([
        pl.when(pl.col('item_cnt_day_winsor') < 0)
          .then(pl.col('item_cnt_day_winsor').abs())
          .otherwise(0)
          .alias('returns'),
        pl.col('item_cnt_day_winsor').clip(lower_bound=0).alias('item_cnt_day_winsor')
    ])
    print(f"Negative sales after processing: {df.filter(pl.col('item_cnt_day_winsor') < 0).height}")
    
    # Drop temporary column
    df = df.drop('rolling_quantile')
    
    print(f"Negative sales and outlier handling time: {time.time() - start_time:.2f} seconds")
    return df

df = handle_negative_sales_and_outliers(df)

item_cnt_day distribution before processing:
shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ value      │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 2.868195e6 │
│ null_count ┆ 0.0        │
│ mean       ┆ 1.240954   │
│ std        ┆ 2.28652    │
│ min        ┆ -22.0      │
│ 25%        ┆ 1.0        │
│ 50%        ┆ 1.0        │
│ 75%        ┆ 1.0        │
│ max        ┆ 1000.0     │
└────────────┴────────────┘
Rows with item_cnt_day > 100: 135
Rows with item_cnt_day > 500: 11


Outliers capped: 9314 (0.32%)
Rolling quantile stats:
shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ value      │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 2.868195e6 │
│ null_count ┆ 0.0        │
│ mean       ┆ 2.361271   │
│ std        ┆ 6.042789   │
│ min        ┆ -5.0       │
│ 25%        ┆ 1.0        │
│ 50%        ┆ 1.0        │
│ 75%        ┆ 2.0        │
│ max        ┆ 1000.0     │
└────────────┴────────────┘
Max item_cnt_day_winsor: 1000.0
Negative sales after processing: 0
Negative sales and outlier handling time: 1.00 seconds


In [7]:
def aggregate_to_monthly(df):
    """Aggregate data to monthly level."""
    start_time = time.time()
    
    # Aggregate to monthly
    df = df.group_by(['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'month', 'year']).agg([
        pl.col('item_cnt_day_winsor').sum().alias('item_cnt_day_winsor'),
        pl.col('returns').sum().alias('returns'),
        pl.col('item_price').mean().alias('item_price')
    ])
    
    # Diagnostics: Inspect aggregated item_cnt_day_winsor
    print("item_cnt_day_winsor distribution after aggregation:")
    print(df['item_cnt_day_winsor'].describe())
    
    print(f"Dataset size after aggregation: {len(df)}")
    print(f"Unique shops: {df['shop_id'].n_unique()}, items: {df['item_id'].n_unique()}")
    print(f"Aggregation time: {time.time() - start_time:.2f} seconds")
    return df

df = aggregate_to_monthly(df)

item_cnt_day_winsor distribution after aggregation:
shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ value     │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 1.56857e6 │
│ null_count ┆ 0.0       │
│ mean       ┆ 2.254784  │
│ std        ┆ 8.010804  │
│ min        ┆ 0.0       │
│ 25%        ┆ 1.0       │
│ 50%        ┆ 1.0       │
│ 75%        ┆ 2.0       │
│ max        ┆ 1274.0    │
└────────────┴───────────┘
Dataset size after aggregation: 1568570
Unique shops: 54, items: 21309
Aggregation time: 0.26 seconds


In [8]:
def create_full_grid(df):
    """Create full shop-item-month grid."""
    start_time = time.time()
    
    shops = df['shop_id'].unique().to_list()
    items = df['item_id'].unique().to_list()
    date_blocks = list(range(33))  # 0–32
    
    grid = pl.DataFrame({'shop_id': shops}).join(
        pl.DataFrame({'item_id': items}), how='cross'
    ).join(
        pl.DataFrame({'date_block_num': date_blocks}), how='cross'
    ).with_columns([
        pl.col('shop_id').cast(pl.Int32),
        pl.col('item_id').cast(pl.Int32),
        pl.col('date_block_num').cast(pl.Int16),
        ((pl.col('date_block_num') % 12) + 1).cast(pl.Int32).alias('month'),
        ((pl.col('date_block_num') // 12) + 2013).cast(pl.Int32).alias('year')
    ])
    
    # Merge with aggregated data
    df = grid.join(
        df, on=['shop_id', 'item_id', 'date_block_num', 'month', 'year'], how='left'
    ).with_columns([
        pl.col('item_cnt_day_winsor').fill_null(0),
        pl.col('returns').fill_null(0),
        pl.col('item_price').fill_null(pl.col('item_price').mean().over('item_id')).fill_null(0),
        pl.col('item_category_id').fill_null(pl.col('item_category_id').first().over('item_id')).fill_null(0)
    ]).with_columns(
        pl.datetime(pl.col('year'), pl.col('month'), 1).alias('date')
    )
    
    print(f"Grid size: {len(grid)}, after merge: {len(df)}")
    print(f"Unique shops: {df['shop_id'].n_unique()}, items: {df['item_id'].n_unique()}")
    print(f"Grid creation time: {time.time() - start_time:.2f} seconds")
    return df

# Create full grid
df = create_full_grid(df)

Grid size: 37972638, after merge: 37972638
Unique shops: 54, items: 21309
Grid creation time: 5.13 seconds


In [9]:
def seasonal_imputation(df, col):
    """Apply seasonality-aware imputation to a column."""
    df = df.with_columns(
        pl.col(col).interpolate().over(['shop_id', 'item_id']).alias(f'{col}_interp')
    ).with_columns(
        seasonal_value=pl.col(col).shift(12).over(['shop_id', 'item_id', 'month']),
        ma_value=pl.col(col).rolling_mean(window_size=12, min_periods=1).over(['shop_id', 'item_id'])
    ).with_columns(
        pl.when(pl.col(f'{col}_interp').is_null() & pl.col('seasonal_value').is_not_null())
          .then(pl.col('seasonal_value'))
          .when(pl.col(f'{col}_interp').is_null())
          .then(pl.col('ma_value'))
          .otherwise(pl.col(f'{col}_interp'))
          .alias(col)
    ).drop([f'{col}_interp', 'seasonal_value', 'ma_value'])
    return df

def apply_imputation(df, cols):
    """Impute missing values for specified columns."""
    start_time = time.time()
    
    for col in cols:
        df = seasonal_imputation(df, col)
    df = df.with_columns([pl.col(col).fill_null(0) for col in cols])
    
    print(f"Imputation time: {time.time() - start_time:.2f} seconds")
    return df

# Apply imputation
numerical_cols = ['item_cnt_day_winsor', 'returns', 'item_price']
df = apply_imputation(df, numerical_cols)

Imputation time: 61.02 seconds


In [10]:
def add_holiday_features(df, holiday_df):
    """Add holiday features to the dataset."""
    start_time = time.time()
    
    df = df.join(
        holiday_df.select(['year', 'month', 'holiday']),
        on=['year', 'month'], how='left'
    ).with_columns(
        is_holiday=pl.col('holiday').is_not_null().cast(pl.Int8),
        holiday=pl.col('holiday').fill_null('None')
    )
    
    print(f"Holiday feature addition time: {time.time() - start_time:.2f} seconds")
    return df

# Add holiday features
df = add_holiday_features(df, holiday_df)

Holiday feature addition time: 0.60 seconds


In [11]:
def filter_sparse_products(df):
    """Exclude shop-item pairs with >30% missing data."""
    start_time = time.time()
    
    shop_item_missing = df.group_by(['shop_id', 'item_id']).agg(
        missing_ratio=pl.col('item_cnt_day_winsor').eq(0).mean()
    )
    valid_shop_items = shop_item_missing.filter(pl.col('missing_ratio') <= 0.3).select(['shop_id', 'item_id'])
    initial_size = len(df)
    df = df.join(valid_shop_items, on=['shop_id', 'item_id'], how='inner')
    
    print(f"Records dropped due to >30% missing: {initial_size - len(df)}")
    print(f"Records after filtering: {len(df)}, shop-item pairs: {len(valid_shop_items)}")
    print(f"Filtering time: {time.time() - start_time:.2f} seconds")
    return df

# Filter sparse products
df = filter_sparse_products(df)

Records dropped due to >30% missing: 37883835
Records after filtering: 88803, shop-item pairs: 2691
Filtering time: 0.47 seconds


In [12]:
def create_lag_features(df):
    """Create lag features for 1–3 months."""
    start_time = time.time()
    
    df = df.sort(['shop_id', 'item_id', 'date'])
    for lag in [1, 2, 3]:
        df = df.with_columns([
            pl.col('item_cnt_day_winsor').shift(lag).over(['shop_id', 'item_id']).alias(f'lag_sales_{lag}'),
            pl.col('returns').shift(lag).over(['shop_id', 'item_id']).alias(f'lag_returns_{lag}'),
            pl.col('item_price').shift(lag).over(['shop_id', 'item_id']).alias(f'lag_price_{lag}')
        ])
    
    print(f"Lag feature creation time: {time.time() - start_time:.2f} seconds")
    return df

# Create lag features
df = create_lag_features(df)

Lag feature creation time: 0.09 seconds


In [13]:
# Impute lag features
numerical_cols += ['lag_sales_1', 'lag_sales_2', 'lag_sales_3',
                   'lag_returns_1', 'lag_returns_2', 'lag_returns_3',
                   'lag_price_1', 'lag_price_2', 'lag_price_3']
print("\nMissing values before lag imputation:")
print(df.select(numerical_cols).null_count().to_pandas().to_string())

df = apply_imputation(df, numerical_cols)

print("\nMissing values after lag imputation:")
print(df.select(numerical_cols).null_count().to_pandas().to_string())

# Note: Ensure generate_embeddings.py creates entity embeddings for shop_id, item_id, item_category_id


Missing values before lag imputation:
   item_cnt_day_winsor  returns  item_price  lag_sales_1  lag_sales_2  lag_sales_3  lag_returns_1  lag_returns_2  lag_returns_3  lag_price_1  lag_price_2  lag_price_3
0                    0        0           0         2691         5382         8073           2691           5382           8073         2691         5382         8073


Imputation time: 1.03 seconds

Missing values after lag imputation:
   item_cnt_day_winsor  returns  item_price  lag_sales_1  lag_sales_2  lag_sales_3  lag_returns_1  lag_returns_2  lag_returns_3  lag_price_1  lag_price_2  lag_price_3
0                    0        0           0            0            0            0              0              0              0            0            0            0


In [14]:
import pickle

def scale_and_save_data(df, numerical_cols):
    """Apply robust scaling and save datasets."""
    start_time = time.time()
    
    # Convert to pandas
    monthly_sales = df.to_pandas()
    del df  # Free memory
    
    # Save unscaled data
    monthly_sales.to_parquet('/workspace/XAI-2/processed_data/monthly_sales_unscaled.parquet')
    print("Saved unscaled data")
    
    # Robust scaling
    scaler = RobustScaler()
    train_data = monthly_sales[monthly_sales['date_block_num'] < 30][numerical_cols]
    print(f"Scaler training data shape: {train_data.shape}")
    
    scaler.fit(train_data)
    monthly_sales[numerical_cols] = scaler.transform(monthly_sales[numerical_cols])
    
    if monthly_sales[numerical_cols].isna().any().any():
        raise ValueError("NaNs introduced during scaling")
    
    with open('/workspace/XAI-2/processed_data/scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    
    # Optimize dtypes
    dtypes = {col: 'float32' for col in numerical_cols}
    dtypes.update({
        'shop_id': 'int32', 'item_id': 'int32', 'item_category_id': 'int32',
        'date_block_num': 'int16', 'month': 'int32', 'year': 'int32', 'is_holiday': 'int8'
    })
    monthly_sales = monthly_sales.astype(dtypes, errors='ignore')
    
    print(f"Scaling and saving time: {time.time() - start_time:.2f} seconds")
    return monthly_sales

# Scale and save data
monthly_sales = scale_and_save_data(df, numerical_cols)

Saved unscaled data
Scaler training data shape: (80730, 12)
Scaling and saving time: 0.28 seconds


In [None]:
def split_and_save_sets(df):
    """Split data into train/val/test per paper (months 0–30, 31, 32) and save X and y."""
    start_time = time.time()
    
    # Define splits
    train_df = df[df['date_block_num'] <= 30]  # Inclusive
    val_df = df[df['date_block_num'] == 31]
    test_df = df[df['date_block_num'] == 32]
    
    # Extract features (X) and target (y)
    target_col = 'item_cnt_day_winsor'
    exclude_cols = [target_col, 'holiday']  # Exclude target and non-numeric holiday
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    
    X_train = train_df[feature_cols]
    y_train = train_df[[target_col]]
    X_val = val_df[feature_cols]
    y_val = val_df[[target_col]]
    X_test = test_df[feature_cols]
    y_test = test_df[[target_col]]
    
    # Print shapes
    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")
    print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
    
    # Save splits
    output_dir = '/workspace/XAI-2/processed_data'
    os.makedirs(output_dir, exist_ok=True)
    
    X_train.to_parquet(os.path.join(output_dir, 'X_train_processed.parquet'))
    y_train.to_parquet(os.path.join(output_dir, 'y_train_processed.parquet'))
    X_val.to_parquet(os.path.join(output_dir, 'X_val_processed.parquet'))
    y_val.to_parquet(os.path.join(output_dir, 'y_val_processed.parquet'))
    X_test.to_parquet(os.path.join(output_dir, 'X_test_processed.parquet'))
    y_test.to_parquet(os.path.join(output_dir, 'y_test_processed.parquet'))
    
    # Save full dataset
    df.to_parquet('/workspace/XAI-2/processed_data/')
    
    print(f"Split and save time: {time.time() - start_time:.2f} seconds")
    return X_train, y_train, X_val, y_val, X_test, y_test

# Split and save datasets
X_train, y_train, X_val, y_val, X_test, y_test = split_and_save_sets(monthly_sales)

X_train: (83421, 19), y_train: (83421, 1)
X_val: (2691, 19), y_val: (2691, 1)
X_test: (2691, 19), y_test: (2691, 1)


OSError: Cannot save file into a non-existent directory: '/workspace/XAI-2/raw_data'

In [None]:
def save_metadata(df, numerical_cols):
    """Save feature and date index as JSON."""
    start_time = time.time()
    
    feature_index = {col: i for i, col in enumerate(numerical_cols)}
    date_index = df[['date']].reset_index().rename(columns={'index': 'row_index'}).to_dict(orient='records')
    
    metadata = {
        "feature_index": feature_index,
        "date_index": date_index
    }
    
    with open('/workspace/processed_data/metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2, default=str)
    
    logger.info("Saved metadata to /workspace/processed_data/metadata.json")
    logger.info(f"Metadata save time: {time.time() - start_time:.2f} seconds")

# Save metadata
save_metadata(monthly_sales, numerical_cols)

In [None]:
def validate_statistics(raw_df, processed_df):
    """Compute coefficient of variation for raw and processed data."""
    raw_cv = raw_df['item_cnt_day'].std() / raw_df['item_cnt_day'].mean()
    processed_cv = processed_df['item_cnt_day_winsor'].std() / processed_df['item_cnt_day_winsor'].mean()
    logger.info(f"Raw coefficient of variation: {raw_cv:.2f}")
    logger.info(f"Processed coefficient of variation: {processed_cv:.2f}")
    if abs(raw_cv - 2.8) > 0.1 or abs(processed_cv - 1.9) > 0.1:
        logger.warning("Coefficient of variation deviates from paper's reported values (raw: 2.8, processed: 1.9)")

# Validate statistics
validate_statistics(data, monthly_sales)

In [None]:
def validate_final_dataset(df):
    """Validate final dataset and log statistics."""
    logger.info(f"Final dataset size: {len(df)}")
    logger.info(f"Unique shops: {df['shop_id'].nunique()}, items: {df['item_id'].nunique()}")
    logger.info(f"Date block range: {df['date_block_num'].min()}–{df['date_block_num'].max()}")
    
    expected_size = 2935849  # Raw record count from paper
    if abs(len(df) - expected_size) / expected_size > 0.1:
        logger.warning(f"Dataset size {len(df)} deviates from expected {expected_size}")

# Validate final dataset
validate_final_dataset(monthly_sales)