In [None]:
import os
import sys
import subprocess
import warnings
import shutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import RobustScaler

warnings.filterwarnings('ignore')

def check_cuda_environment():
    """Print a PyTorch/CUDA diagnostic report and fail fast when no GPU is usable.

    The report covers library versions, device identity, a tiny on-device
    arithmetic smoke test, the output of ``nvidia-smi`` (best effort) and the
    current memory counters of device 0.

    Raises:
        RuntimeError: if ``torch.cuda.is_available()`` is false.
        Exception: whatever the on-device smoke test raised, re-raised after
            being logged.
    """
    print(f"PyTorch version: {torch.__version__}")
    cuda_ok = torch.cuda.is_available()
    print(f"CUDA available: {cuda_ok}")
    if not cuda_ok:
        raise RuntimeError("CUDA is not available. This notebook requires a GPU.")

    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU count: {torch.cuda.device_count()}")
    print(f"Current device: {torch.cuda.current_device()}")

    # Smoke test: allocate on the GPU and run one kernel; any failure is fatal.
    try:
        probe = torch.tensor([1.0, 2.0, 3.0], device='cuda')
        bumped = probe + 1
        print(f"CUDA test operation successful: {bumped}")
    except Exception as err:
        print(f"CUDA test operation failed: {err}")
        raise

    # nvidia-smi is informational only, so a missing binary must not abort the run.
    try:
        smi = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
        print("NVIDIA-SMI output:")
        print(smi.stdout)
    except Exception as err:
        print(f"Failed to run nvidia-smi: {err}")

    bytes_per_gib = 2**30
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Total GPU memory: {total_bytes / bytes_per_gib:.2f} GiB")
    allocated_bytes = torch.cuda.memory_allocated(0)
    print(f"Allocated GPU memory: {allocated_bytes / bytes_per_gib:.2f} GiB")
    reserved_bytes = torch.cuda.memory_reserved(0)
    print(f"Reserved GPU memory: {reserved_bytes / bytes_per_gib:.2f} GiB")

# Abort the notebook early, with a reinstall hint, if the GPU stack is unusable.
try:
    check_cuda_environment()
except Exception as exc:
    print(f"Error with PyTorch or CUDA setup: {exc}")
    print("Try reinstalling PyTorch: pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cu124")
    raise

# Interpreter and host diagnostics.
print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")
print(f"PATH: {os.environ.get('PATH')}")
print(f"Available disk space: {shutil.disk_usage('/').free / 2**30:.2f} GiB")

# A local file named torch.py / torch.pyc would shadow the real package on import.
if any(os.path.exists(candidate) for candidate in ('/workspace/XAI/torch.py', '/workspace/XAI/torch.pyc')):
    print("Warning: Found 'torch.py' or 'torch.pyc' in /workspace/XAI. Please rename or remove it.")

# Seed both RNGs used by the notebook.
torch.manual_seed(42)
np.random.seed(42)

device = torch.device("cuda")
print(f"Using device: {device}")

# Release cached allocator blocks left behind by the diagnostics above.
torch.cuda.empty_cache()
print("Cleared GPU memory cache")

In [None]:
def load_data(data_dir="/workspace/data", file_name="merged_data.csv"):
    """Read the merged sales CSV into a pandas DataFrame.

    The file is looked up at ``data_dir/file_name`` first and, if absent, at a
    fixed fallback location.

    Args:
        data_dir: Directory expected to contain the CSV.
        file_name: Name of the CSV file inside ``data_dir``.

    Returns:
        The loaded pandas DataFrame. Missing expected columns only trigger a
        printed warning; they do not raise.

    Raises:
        FileNotFoundError: if neither the primary nor the fallback path exists.
        RuntimeError: if the file exists but reading it fails.
    """
    primary_path = os.path.join(data_dir, file_name)
    fallback_path = "/workspace/XAI-2/Predict Future Sales/merged_data.csv"

    if os.path.exists(primary_path):
        file_path = primary_path
    elif os.path.exists(fallback_path):
        file_path = fallback_path
    else:
        raise FileNotFoundError(f"File not found at {primary_path} or {fallback_path}")

    try:
        frame = pd.read_csv(file_path)
        print(f"Loaded data from {file_path}")
        print(f"Dataset shape: {frame.shape}")
        print(f"Columns: {list(frame.columns)}")
    except Exception as err:
        raise RuntimeError(f"Failed to load {file_path}: {err}")

    # Schema check is advisory: downstream cells fail on their own if a column is absent.
    expected_columns = ['date', 'shop_id', 'item_id', 'item_name', 'item_cnt_day',
                        'item_price', 'item_category_id', 'shop_name', 'item_category_name', 'date_block_num']
    missing_cols = []
    for name in expected_columns:
        if name not in frame.columns:
            missing_cols.append(name)
    if missing_cols:
        print(f"Warning: Missing expected columns: {missing_cols}")

    return frame

# Load the merged sales CSV using the default directory and file name.
data = load_data()

In [None]:
import polars as pl
from datetime import datetime

def get_russian_holidays():
    """Build the holiday / shopping-event lookup table for 2013–2015.

    Returns:
        A Polars DataFrame with one row per event and the columns ``date``,
        ``holiday`` (event label), ``month`` and ``year`` (both Int32, derived
        from ``date``).
    """
    years = (2013, 2014, 2015)
    # Fixed-date holidays as (month, day, label); each one repeats every year.
    fixed_dates = [
        (1, 1, "New Year"),
        (2, 23, "Defender Day"),
        (3, 8, "Women's Day"),
        (6, 12, "Russia Day"),
    ]
    events = [
        (datetime(year, month, day), label)
        for month, day, label in fixed_dates
        for year in years
    ]
    # Black Friday falls on a different day each year, so it is listed explicitly.
    events += [
        (datetime(2013, 11, 29), "Black Friday"),
        (datetime(2014, 11, 28), "Black Friday"),
        (datetime(2015, 11, 27), "Black Friday"),
    ]

    event_dates, event_names = zip(*events)
    base = pl.DataFrame({"date": list(event_dates), "holiday": list(event_names)})
    month_expr = pl.col("date").dt.month().cast(pl.Int32).alias("month")
    year_expr = pl.col("date").dt.year().cast(pl.Int32).alias("year")
    return base.with_columns([month_expr, year_expr])

# Build the holiday lookup table once; add_holiday_features() joins it onto the monthly data.
holiday_df = get_russian_holidays()
print(f"Holiday DataFrame created with {len(holiday_df)} entries")

In [None]:
import time
import polars as pl

def initial_preprocessing(df):
    """Clean and filter the raw daily sales and return them as a Polars frame.

    Steps: convert from pandas, parse ``date`` (``dd.mm.YYYY``), derive
    ``month``/``year``, keep ``date_block_num <= 32``, keep the 54 shops with
    the largest summed ``item_cnt_day``, and cast columns to compact dtypes.

    Args:
        df: pandas DataFrame of daily sales records.

    Returns:
        The filtered Polars DataFrame.
    """
    t0 = time.time()

    # Everything downstream of this cell works on Polars frames.
    sales = pl.from_pandas(df)
    print(f"Initial dataset size: {len(sales)}")
    print(f"Unique shops: {sales['shop_id'].n_unique()}, items: {sales['item_id'].n_unique()}")
    print(f"Date block range: {sales['date_block_num'].min()}–{sales['date_block_num'].max()}")

    # Parse the textual date first so the calendar parts can be derived from it.
    sales = sales.with_columns(pl.col('date').str.strptime(pl.Date, "%d.%m.%Y"))
    calendar_parts = [
        pl.col('date').dt.month().cast(pl.Int32).alias('month'),
        pl.col('date').dt.year().cast(pl.Int32).alias('year'),
    ]
    sales = sales.with_columns(calendar_parts)

    # Keep date blocks 0..32 only.
    sales = sales.filter(pl.col('date_block_num') <= 32)
    print(f"Dataset size after date filter: {len(sales)}")

    # Rank shops by total units sold and keep the top 54.
    shop_totals = sales.group_by('shop_id').agg(total_sales=pl.col('item_cnt_day').sum())
    top_shops = shop_totals.sort('total_sales', descending=True).head(54)
    keep_ids = top_shops['shop_id'].to_list()
    sales = sales.filter(pl.col('shop_id').is_in(keep_ids))
    print(f"Dataset size after shop filter: {len(sales)}")
    print(f"Unique shops after processing: {sales['shop_id'].n_unique()}")

    # Downcast to the dtypes the rest of the pipeline joins on.
    target_dtypes = {
        'date_block_num': pl.Int16,
        'shop_id': pl.Int32,
        'item_id': pl.Int32,
        'item_category_id': pl.Int32,
        'item_cnt_day': pl.Float32,
        'item_price': pl.Float32,
        'month': pl.Int32,
        'year': pl.Int32,
    }
    sales = sales.with_columns([pl.col(name).cast(dtype) for name, dtype in target_dtypes.items()])

    print(f"Initial preprocessing time: {time.time() - t0:.2f} seconds")
    return sales

# `data` is the pandas frame from load_data(); `df` is the filtered Polars frame used from here on.
df = initial_preprocessing(data)

In [None]:
def handle_negative_sales_and_outliers(df):
    """Cap daily outliers and split negative sales out into a ``returns`` column.

    For each (shop_id, item_id) pair, ``item_cnt_day`` is capped at the rolling
    0.95 quantile over the last 30 rows (never above 1000) and stored as
    ``item_cnt_day_winsor``. Negative capped values are moved, as positive
    magnitudes, into ``returns`` and the capped column is floored at 0.

    Args:
        df: Polars DataFrame of daily sales with ``shop_id``, ``item_id``,
            ``date`` and ``item_cnt_day``.

    Returns:
        The frame sorted by shop, item and date with ``item_cnt_day_winsor``
        and ``returns`` added.
    """
    t0 = time.time()
    pair_keys = ['shop_id', 'item_id']
    qty = pl.col('item_cnt_day')

    # Diagnostics on the raw quantity distribution.
    print("item_cnt_day distribution before processing:")
    print(df['item_cnt_day'].describe())
    print(f"Rows with item_cnt_day > 100: {df.filter(qty > 100).height}")
    print(f"Rows with item_cnt_day > 500: {df.filter(qty > 500).height}")

    # How many pairs have fewer rows than one full rolling window?
    rows_per_pair = df.group_by(pair_keys).agg(day_count=pl.col('date').count())
    sparse_pairs = rows_per_pair.filter(pl.col('day_count') < 30).height
    print(f"Shop-item pairs with < 30 days: {sparse_pairs}")

    # Rolling statistics need each pair's rows in chronological order.
    df = df.sort(pair_keys + ['date'])

    # The window counts rows (days with a record), not calendar days.
    rolling_p95 = qty.rolling_quantile(quantile=0.95, window_size=30, min_periods=1).over(pair_keys)
    df = df.with_columns(rolling_quantile=rolling_p95)
    upper_cap = pl.min_horizontal(pl.col('rolling_quantile'), 1000)
    df = df.with_columns(item_cnt_day_winsor=qty.clip(None, upper_cap))

    n_capped = df.filter(qty > pl.col('item_cnt_day_winsor')).height
    print(f"Outliers capped: {n_capped} ({n_capped / len(df) * 100:.2f}%)")

    print("Rolling quantile stats:")
    print(df['rolling_quantile'].describe())
    print(f"Max item_cnt_day_winsor: {df['item_cnt_day_winsor'].max()}")

    # Both expressions read the pre-clip column because they run in one with_columns call.
    capped = pl.col('item_cnt_day_winsor')
    returns_expr = pl.when(capped < 0).then(capped.abs()).otherwise(0).alias('returns')
    floored_expr = capped.clip(lower_bound=0).alias('item_cnt_day_winsor')
    df = df.with_columns([returns_expr, floored_expr])
    print(f"Negative sales after processing: {df.filter(pl.col('item_cnt_day_winsor') < 0).height}")

    # The quantile was only needed to compute the cap.
    df = df.drop('rolling_quantile')

    print(f"Negative sales and outlier handling time: {time.time() - t0:.2f} seconds")
    return df

# Adds `item_cnt_day_winsor` and `returns` to the daily frame.
df = handle_negative_sales_and_outliers(df)

In [None]:
def aggregate_to_monthly(df):
    """Collapse daily rows into one row per shop, item and date block.

    Within each group, ``item_cnt_day_winsor`` and ``returns`` are summed and
    ``item_price`` is averaged.

    Args:
        df: Polars DataFrame of daily sales after outlier handling.

    Returns:
        The aggregated Polars DataFrame.
    """
    t0 = time.time()

    group_keys = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'month', 'year']
    monthly_aggs = [
        pl.col('item_cnt_day_winsor').sum().alias('item_cnt_day_winsor'),
        pl.col('returns').sum().alias('returns'),
        pl.col('item_price').mean().alias('item_price'),
    ]
    monthly = df.group_by(group_keys).agg(monthly_aggs)

    # Diagnostics on the monthly target.
    print("item_cnt_day_winsor distribution after aggregation:")
    print(monthly['item_cnt_day_winsor'].describe())

    print(f"Dataset size after aggregation: {len(monthly)}")
    print(f"Unique shops: {monthly['shop_id'].n_unique()}, items: {monthly['item_id'].n_unique()}")
    print(f"Aggregation time: {time.time() - t0:.2f} seconds")
    return monthly

# From here on `df` holds monthly (per date block) rows instead of daily rows.
df = aggregate_to_monthly(df)

In [None]:
def create_full_grid(df):
    """Expand the monthly data to a dense shop x item x date-block grid.

    Every combination of the shops and items present in ``df`` with date
    blocks 0–32 gets a row. Combinations without sales are filled as follows:
    ``item_cnt_day_winsor`` and ``returns`` become 0, ``item_price`` becomes
    the item's mean observed price (0 if never observed), and
    ``item_category_id`` becomes the item's first non-null category (0 if
    none). A month-start ``date`` column is rebuilt from ``year``/``month``.

    ``month`` and ``year`` are derived from ``date_block_num`` on the
    assumption that block 0 is January 2013.

    Args:
        df: Monthly Polars DataFrame from ``aggregate_to_monthly``.

    Returns:
        The dense Polars DataFrame; date blocks are in ascending order within
        each (shop_id, item_id) pair.
    """
    start_time = time.time()

    shops = df['shop_id'].unique().to_list()
    items = df['item_id'].unique().to_list()
    date_blocks = list(range(33))  # 0–32

    grid = pl.DataFrame({'shop_id': shops}).join(
        pl.DataFrame({'item_id': items}), how='cross'
    ).join(
        pl.DataFrame({'date_block_num': date_blocks}), how='cross'
    ).with_columns([
        # Cast the keys to the dtypes of `df` so the join below matches.
        pl.col('shop_id').cast(pl.Int32),
        pl.col('item_id').cast(pl.Int32),
        pl.col('date_block_num').cast(pl.Int16),
        ((pl.col('date_block_num') % 12) + 1).cast(pl.Int32).alias('month'),
        ((pl.col('date_block_num') // 12) + 2013).cast(pl.Int32).alias('year')
    ])

    df = grid.join(
        df, on=['shop_id', 'item_id', 'date_block_num', 'month', 'year'], how='left'
    ).with_columns([
        pl.col('item_cnt_day_winsor').fill_null(0),
        pl.col('returns').fill_null(0),
        pl.col('item_price').fill_null(pl.col('item_price').mean().over('item_id')).fill_null(0),
        # first() does not skip nulls. After the left join the first row of an
        # item is usually a grid row without sales, so nulls must be dropped
        # before taking the first value or the category falls through to 0.
        pl.col('item_category_id').fill_null(
            pl.col('item_category_id').drop_nulls().first().over('item_id')
        ).fill_null(0)
    ]).with_columns(
        pl.datetime(pl.col('year'), pl.col('month'), 1).alias('date')
    )

    print(f"Grid size: {len(grid)}, after merge: {len(df)}")
    print(f"Unique shops: {df['shop_id'].n_unique()}, items: {df['item_id'].n_unique()}")
    print(f"Grid creation time: {time.time() - start_time:.2f} seconds")
    return df

# Densify: one row per shop, item and date block, including months with no sales.
df = create_full_grid(df)

In [None]:
def seasonal_imputation(df, col):
    """Fill nulls in ``col`` with a seasonality-aware fallback chain.

    For every (shop_id, item_id) pair the value is taken, in order of
    preference, from:

    1. linear interpolation between neighbouring non-null values,
    2. the previous row with the same calendar ``month`` for that pair
       (the same month one year earlier on a dense grid),
    3. the trailing 12-row rolling mean.

    Rows are assumed to be in chronological order within each pair.

    Args:
        df: Polars DataFrame with ``shop_id``, ``item_id``, ``month`` and ``col``.
        col: Name of the column to impute.

    Returns:
        The frame with ``col`` replaced by its imputed version. Values that no
        fallback can supply stay null.
    """
    pair_keys = ['shop_id', 'item_id']
    interp_col = f'{col}_interp'

    df = df.with_columns(
        pl.col(col).interpolate().over(pair_keys).alias(interp_col)
    ).with_columns(
        # The partition already isolates one calendar month per pair, so its
        # rows are one year apart and the previous year is one row back.
        # Shifting by 12 here would always run off the partition and yield null.
        seasonal_value=pl.col(col).shift(1).over(pair_keys + ['month']),
        ma_value=pl.col(col).rolling_mean(window_size=12, min_periods=1).over(pair_keys)
    ).with_columns(
        pl.when(pl.col(interp_col).is_null() & pl.col('seasonal_value').is_not_null())
          .then(pl.col('seasonal_value'))
          .when(pl.col(interp_col).is_null())
          .then(pl.col('ma_value'))
          .otherwise(pl.col(interp_col))
          .alias(col)
    ).drop([interp_col, 'seasonal_value', 'ma_value'])
    return df

def apply_imputation(df, cols):
    """Run seasonal imputation on each listed column, then zero-fill the rest.

    Args:
        df: Polars DataFrame accepted by ``seasonal_imputation``.
        cols: Names of the columns to impute, processed in order.

    Returns:
        The frame with no nulls left in ``cols``.
    """
    t0 = time.time()

    imputed = df
    for name in cols:
        imputed = seasonal_imputation(imputed, name)

    # Whatever the seasonal fallbacks could not supply becomes 0.
    zero_fill = [pl.col(name).fill_null(0) for name in cols]
    imputed = imputed.with_columns(zero_fill)

    print(f"Imputation time: {time.time() - t0:.2f} seconds")
    return imputed

# Columns to impute here; the same list is passed to scale_and_save_data() later.
numerical_cols = ['item_cnt_day_winsor', 'returns', 'item_price']
df = apply_imputation(df, numerical_cols)

In [None]:
def add_holiday_features(df, holiday_df):
    """Attach holiday information to each row by matching on year and month.

    Args:
        df: Polars DataFrame with ``year`` and ``month`` columns.
        holiday_df: Lookup frame with ``year``, ``month`` and ``holiday``.

    Returns:
        The frame with ``holiday`` (the string 'None' where nothing matched)
        and ``is_holiday`` (Int8 flag, 1 where a holiday matched).
    """
    t0 = time.time()

    lookup = holiday_df.select(['year', 'month', 'holiday'])
    joined = df.join(lookup, on=['year', 'month'], how='left')

    # Compute the flag from the raw join result, before nulls are replaced by 'None'.
    flag_expr = pl.col('holiday').is_not_null().cast(pl.Int8)
    label_expr = pl.col('holiday').fill_null('None')
    enriched = joined.with_columns(is_holiday=flag_expr, holiday=label_expr)

    print(f"Holiday feature addition time: {time.time() - t0:.2f} seconds")
    return enriched

# Adds the `holiday` label and `is_holiday` flag using the lookup table built earlier.
df = add_holiday_features(df, holiday_df)

In [None]:
def filter_sparse_products(df):
    """Keep only shop-item pairs whose share of zero-sales rows is at most 30%.

    A row counts as "missing" when ``item_cnt_day_winsor`` equals 0.

    Args:
        df: Polars DataFrame with ``shop_id``, ``item_id`` and ``item_cnt_day_winsor``.

    Returns:
        The frame restricted to the retained pairs.
    """
    t0 = time.time()
    pair_keys = ['shop_id', 'item_id']

    # Mean of the boolean "is zero" mask = fraction of zero rows per pair.
    zero_share = df.group_by(pair_keys).agg(
        missing_ratio=pl.col('item_cnt_day_winsor').eq(0).mean()
    )
    kept_pairs = zero_share.filter(pl.col('missing_ratio') <= 0.3).select(pair_keys)

    rows_before = len(df)
    filtered = df.join(kept_pairs, on=pair_keys, how='inner')

    print(f"Records dropped due to >30% missing: {rows_before - len(filtered)}")
    print(f"Records after filtering: {len(filtered)}, shop-item pairs: {len(kept_pairs)}")
    print(f"Filtering time: {time.time() - t0:.2f} seconds")
    return filtered

# Drop shop-item pairs whose rows are more than 30% zero sales.
df = filter_sparse_products(df)

In [None]:
def create_lag_features(df):
    """Add 1-, 2- and 3-row lags of sales, returns and price per shop-item pair.

    The frame is sorted by shop, item and date first, so one row back is the
    previous row in date order for that pair. The earliest rows of each pair
    have no predecessor and receive nulls.

    Args:
        df: Polars DataFrame with ``shop_id``, ``item_id``, ``date``,
            ``item_cnt_day_winsor``, ``returns`` and ``item_price``.

    Returns:
        The sorted frame with ``lag_sales_k``, ``lag_returns_k`` and
        ``lag_price_k`` added for k in 1..3.
    """
    t0 = time.time()
    pair_keys = ['shop_id', 'item_id']
    # (source column, prefix of the lag column) in output order for each lag.
    sources = [
        ('item_cnt_day_winsor', 'lag_sales'),
        ('returns', 'lag_returns'),
        ('item_price', 'lag_price'),
    ]

    ordered = df.sort(pair_keys + ['date'])
    # Every lag reads only the original columns, so all nine fit in one pass.
    lag_exprs = [
        pl.col(source).shift(step).over(pair_keys).alias(f'{prefix}_{step}')
        for step in (1, 2, 3)
        for source, prefix in sources
    ]
    ordered = ordered.with_columns(lag_exprs)

    print(f"Lag feature creation time: {time.time() - t0:.2f} seconds")
    return ordered

# Adds the lag_sales_*, lag_returns_* and lag_price_* columns for lags 1–3.
df = create_lag_features(df)

In [None]:
import pickle

def scale_and_save_data(df, numerical_cols, output_dir='/workspace/XAI-2/processed_data'):
    """Robust-scale the numerical columns and persist the data and the scaler.

    Writes two files into ``output_dir`` (created if it does not exist):
    ``monthly_sales_unscaled.parquet`` with the data before scaling, and
    ``scaler.pkl`` with the fitted ``RobustScaler``.

    Args:
        df: Polars DataFrame holding the fully engineered monthly data.
        numerical_cols: Columns to scale; all other columns are left as they are.
        output_dir: Directory receiving the parquet file and the pickled
            scaler. Defaults to the location used by the rest of the notebook.

    Returns:
        A pandas DataFrame in which ``numerical_cols`` are scaled and the id
        and calendar columns are downcast.

    Raises:
        ValueError: if scaling leaves NaNs in ``numerical_cols``.
    """
    start_time = time.time()

    # Create the target directory up front: to_parquet() and open() do not create
    # missing parents, and no earlier cell creates this directory.
    os.makedirs(output_dir, exist_ok=True)

    monthly_sales = df.to_pandas()
    del df  # drops only this function's reference to the Polars frame

    monthly_sales.to_parquet(os.path.join(output_dir, 'monthly_sales_unscaled.parquet'))
    print("Saved unscaled data")

    scaler = RobustScaler()
    # Fit on early date blocks only, so later months do not influence the scaling.
    # NOTE(review): the cut-off here is < 30 while split_and_save_sets() trains on <= 30;
    # confirm whether block 30 is meant to be excluded from the fit.
    train_data = monthly_sales[monthly_sales['date_block_num'] < 30][numerical_cols]
    print(f"Scaler training data shape: {train_data.shape}")

    scaler.fit(train_data)
    monthly_sales[numerical_cols] = scaler.transform(monthly_sales[numerical_cols])

    if monthly_sales[numerical_cols].isna().any().any():
        raise ValueError("NaNs introduced during scaling")

    with open(os.path.join(output_dir, 'scaler.pkl'), 'wb') as f:
        pickle.dump(scaler, f)

    dtypes = {col: 'float32' for col in numerical_cols}
    dtypes.update({
        'shop_id': 'int32', 'item_id': 'int32', 'item_category_id': 'int32',
        'date_block_num': 'int16', 'month': 'int32', 'year': 'int32', 'is_holiday': 'int8'
    })
    # errors='ignore' keeps a column's dtype unchanged when its cast fails.
    monthly_sales = monthly_sales.astype(dtypes, errors='ignore')

    print(f"Scaling and saving time: {time.time() - start_time:.2f} seconds")
    return monthly_sales

# `monthly_sales` is a pandas frame with `numerical_cols` scaled; the unscaled copy is on disk.
monthly_sales = scale_and_save_data(df, numerical_cols)

In [None]:
def split_and_save_sets(df):
    """Split the data chronologically into train/val/test and save X and y.

    Date blocks 0–30 form the training set, block 31 the validation set and
    block 32 the test set. The target is ``item_cnt_day_winsor``; the features
    are every other column except ``holiday``.

    Six parquet files (``X_*``/``y_*`` for train, val and test) are written to
    ``/workspace/XAI-2/processed_data`` and the complete frame is written to
    ``/workspace/XAI-2/raw_data/processed_sales.parquet``. Both directories
    are created when missing.

    Args:
        df: pandas DataFrame containing ``date_block_num`` and the target column.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)`` of pandas
        DataFrames; each ``y_*`` is a single-column frame.
    """
    start_time = time.time()

    train_df = df[df['date_block_num'] <= 30]
    val_df = df[df['date_block_num'] == 31]
    test_df = df[df['date_block_num'] == 32]

    target_col = 'item_cnt_day_winsor'
    exclude_cols = [target_col, 'holiday']
    feature_cols = [col for col in df.columns if col not in exclude_cols]

    X_train = train_df[feature_cols]
    y_train = train_df[[target_col]]
    X_val = val_df[feature_cols]
    y_val = val_df[[target_col]]
    X_test = test_df[feature_cols]
    y_test = test_df[[target_col]]

    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")
    print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

    output_dir = '/workspace/XAI-2/processed_data'
    os.makedirs(output_dir, exist_ok=True)

    named_frames = {
        'X_train': X_train, 'y_train': y_train,
        'X_val': X_val, 'y_val': y_val,
        'X_test': X_test, 'y_test': y_test,
    }
    for name, frame in named_frames.items():
        frame.to_parquet(os.path.join(output_dir, f'{name}_processed.parquet'))

    # to_parquet() does not create missing parent directories, so create the
    # separate raw_data directory before writing the full frame.
    full_frame_path = '/workspace/XAI-2/raw_data/processed_sales.parquet'
    os.makedirs(os.path.dirname(full_frame_path), exist_ok=True)
    df.to_parquet(full_frame_path)

    print(f"Split and save time: {time.time() - start_time:.2f} seconds")
    return X_train, y_train, X_val, y_val, X_test, y_test

# Chronological split of the scaled frame: blocks 0–30 train, 31 validation, 32 test.
X_train, y_train, X_val, y_val, X_test, y_test = split_and_save_sets(monthly_sales)

In [None]:
def validate_dataset(processed_df, expected_size=2935849, raw_cv_target=2.8, processed_cv_target=1.9):
    """Run sanity checks on the processed dataset and report PASS/FAIL per check.

    Checks performed on ``item_cnt_day_winsor`` unless stated otherwise:

    * row count within 10% of ``expected_size``
    * share of zero-sales rows between 20% and 50%
    * coefficient of variation within 20% of ``processed_cv_target``
    * no nulls in ``item_cnt_day_winsor``, ``returns`` or ``item_price``
    * no negative values
    * fewer than 10% of rows outside the 1.5 x IQR fences
    * exactly 54 shops and between 10000 and 15000 items

    Args:
        processed_df: Polars DataFrame to validate.
        expected_size: Reference row count for the size check.
        raw_cv_target: Accepted for interface compatibility; not used by any check.
        processed_cv_target: Reference coefficient of variation.

    Returns:
        Dict mapping each check name to a dict holding its measured values and
        a ``'status'`` of ``'PASS'`` or ``'FAIL'``.
    """
    start_time = time.time()
    results = {}

    # Dataset Size ('raw_size' holds the row count of the frame passed in)
    results['dataset_size'] = {
        'raw_size': len(processed_df),
        'expected_processed_size': expected_size,
        'status': 'PASS' if abs(len(processed_df) - expected_size) / expected_size <= 0.1 else 'FAIL'
    }
    print(f"Dataset Size - Raw: {len(processed_df)}, Expected Processed: {expected_size}")
    print(f"Dataset Size - Status: {results['dataset_size']['status']}")
    print("-" * 50)

    # Sparsity
    zero_sales = processed_df.filter(pl.col('item_cnt_day_winsor') == 0).height
    sparsity_ratio = zero_sales / len(processed_df)
    results['sparsity'] = {
        'zero_sales': zero_sales,
        'sparsity_ratio': sparsity_ratio,
        'status': 'PASS' if 0.2 <= sparsity_ratio <= 0.5 else 'FAIL'
    }
    print(f"Sparsity - Zero sales: {zero_sales} ({sparsity_ratio:.2%})")
    print(f"Sparsity - Status: {results['sparsity']['status']}")
    print("-" * 50)

    # Coefficient of Variation
    processed_cv = processed_df['item_cnt_day_winsor'].std() / processed_df['item_cnt_day_winsor'].mean()
    results['processed_cv'] = {
        'processed_cv': processed_cv,
        'target': processed_cv_target,
        'status': 'PASS' if abs(processed_cv - processed_cv_target) / processed_cv_target <= 0.2 else 'FAIL'
    }
    print(f"Processed CV - Value: {processed_cv:.2f}, Target: {processed_cv_target}")
    print(f"Processed CV - Status: {results['processed_cv']['status']}")
    print("-" * 50)

    # Null Values
    # row(0, named=True) yields {column: int}. to_dict() would yield Series
    # values, and comparing those inside all() raises because a Series has no
    # unambiguous truth value.
    null_counts = processed_df.select(
        pl.col('item_cnt_day_winsor', 'returns', 'item_price').is_null().sum()
    ).row(0, named=True)
    results['null_values'] = {
        'null_counts': null_counts,
        'status': 'PASS' if all(count == 0 for count in null_counts.values()) else 'FAIL'
    }
    print(f"Null Values - Counts: {null_counts}")
    print(f"Null Values - Status: {results['null_values']['status']}")
    print("-" * 50)

    # Negative Values
    negative_count = processed_df.filter(pl.col('item_cnt_day_winsor') < 0).height
    results['negative_values'] = {
        'negative_count': negative_count,
        'status': 'PASS' if negative_count == 0 else 'FAIL'
    }
    print(f"Negative Values - Count: {negative_count}")
    print(f"Negative Values - Status: {results['negative_values']['status']}")
    print("-" * 50)

    # Outliers (rows outside the 1.5 x IQR fences)
    q1 = processed_df['item_cnt_day_winsor'].quantile(0.25)
    q3 = processed_df['item_cnt_day_winsor'].quantile(0.75)
    iqr = q3 - q1
    outlier_count = processed_df.filter(
        (pl.col('item_cnt_day_winsor') < q1 - 1.5 * iqr) | (pl.col('item_cnt_day_winsor') > q3 + 1.5 * iqr)
    ).height
    results['outliers'] = {
        'outlier_count': outlier_count,
        'status': 'PASS' if outlier_count / len(processed_df) < 0.1 else 'FAIL'
    }
    print(f"Outliers - Count: {outlier_count} ({outlier_count / len(processed_df):.2%})")
    print(f"Outliers - Status: {results['outliers']['status']}")
    print("-" * 50)

    # Shop and Item Counts
    shop_count = processed_df['shop_id'].n_unique()
    item_count = processed_df['item_id'].n_unique()
    results['shop_item_counts'] = {
        'shops': shop_count,
        'items': item_count,
        'status': 'PASS' if shop_count == 54 and 10000 <= item_count <= 15000 else 'FAIL'
    }
    print(f"Shops: {shop_count}, Items: {item_count}")
    print(f"Shop/Item Counts - Status: {results['shop_item_counts']['status']}")
    print("-" * 50)

    print(f"Validation time: {time.time() - start_time:.2f} seconds")
    return results

# Validate the unscaled monthly data written by scale_and_save_data().
# `monthly_sales` has been through RobustScaler, which re-centres the target, so the
# zero-sales, negative-value and coefficient-of-variation checks are only meaningful
# on the unscaled values.
processed_df = pl.read_parquet('/workspace/XAI-2/processed_data/monthly_sales_unscaled.parquet')
validation_results = validate_dataset(processed_df)