# Data Preprocessing - Missing Value Treatment

**Purpose**: Create two versions of the dataset for performance comparison
- **Original**: Keep missing values as-is (current version)
- **Filled**: Fill each missing open price with the same stock's previous close price (the preceding row by date)

**Missing Value Statistics**: ~1.8M missing open values out of 34.3M total records (5.3%)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Notebook banner: title line followed by a 50-character rule
print("Data Preprocessing - Missing Value Treatment", "=" * 50, sep="\n")

In [None]:
# Read the raw train/validation and test splits from parquet
print("Loading original datasets...")

train_file, test_file = (
    'data/data_1993_2000_train_val.parquet',
    'data/data_2001_2019_test.parquet',
)

# Train is read first, then test
train_df, test_df = map(pd.read_parquet, (train_file, test_file))

# Row counts per split and combined
train_rows, test_rows = len(train_df), len(test_df)
print(f"Train data: {train_rows:,} records")
print(f"Test data: {test_rows:,} records")
print(f"Total: {train_rows + test_rows:,} records")

In [None]:
# Analyze missing values in original data
def analyze_missing_values(df, dataset_name):
    """Print the null count and null percentage of each key price column.

    Only the columns actually present in ``df`` are reported; absent ones
    are skipped without a message. The frame is not modified.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label used in the report heading.

    Returns:
        The same ``df`` object that was passed in.
    """
    print(f"\n{dataset_name} Missing Value Analysis:")
    print("-" * 40)

    total_rows = len(df)
    present = [
        name
        for name in ('open', 'high', 'low', 'close', 'volume', 'ret')
        if name in df.columns
    ]
    for name in present:
        n_null = df[name].isna().sum()
        pct_null = n_null / total_rows * 100
        print(f"  {name:8}: {n_null:,} ({pct_null:.2f}%)")

    return df

# Print the missing-value report for both splits. analyze_missing_values
# returns the frame it was given; the return values are discarded here.
# NOTE(review): if this is the last statement of a notebook cell, the frame
# returned by the second call will presumably be echoed as cell output — confirm.
analyze_missing_values(train_df, "Training Data")
analyze_missing_values(test_df, "Test Data")

In [None]:
# Fill missing open prices with previous day's close price
def fill_missing_open_prices(df, dataset_name):
    """Return a sorted copy of ``df`` with missing opens filled from the prior close.

    Rows are ordered by ``code`` then ``date``; each missing ``open`` is then
    replaced with the ``close`` of the preceding row of the same ``code``.
    Values are never carried across different codes. An ``open`` stays
    missing when there is no preceding row for that code, or when the
    preceding ``close`` is itself missing.

    Args:
        df: DataFrame with ``code``, ``date``, ``open`` and ``close`` columns.
            It is not modified.
        dataset_name: Label used in the progress output.

    Returns:
        A new DataFrame sorted by ``code`` and ``date`` with a fresh
        0..n-1 index and the ``open`` column filled where possible.
    """
    print(f"\nProcessing {dataset_name}...")

    original_missing = df['open'].isnull().sum()

    # sort_values already returns a new frame, so the caller's frame is left
    # untouched without paying for an extra explicit copy.
    df_filled = df.sort_values(['code', 'date']).reset_index(drop=True)

    # Previous row's close within each code, computed in one grouped pass
    # rather than re-scanning the whole frame once per code.
    prev_close = df_filled.groupby('code', sort=False)['close'].shift(1)
    df_filled['open'] = df_filled['open'].fillna(prev_close)

    remaining_missing = df_filled['open'].isnull().sum()
    actually_filled = original_missing - remaining_missing

    print(f"  Original missing: {original_missing:,}")
    print(f"  Filled: {actually_filled:,}")
    print(f"  Still missing: {remaining_missing:,} (first day of each stock)")

    return df_filled

# Build the filled counterpart of each split (train first, then test)
train_filled, test_filled = (
    fill_missing_open_prices(frame, label)
    for frame, label in ((train_df, "Training Data"), (test_df, "Test Data"))
)

In [None]:
# Compare statistics before and after filling
def compare_statistics(original_df, filled_df, dataset_name):
    """Print a side-by-side ``describe()`` summary of the ``open`` column.

    For each summary statistic the table shows the value in the original
    frame, the value in the filled frame, and filled minus original.

    Args:
        original_df: Frame before filling; must have an ``open`` column.
        filled_df: Frame after filling; must have an ``open`` column.
        dataset_name: Label used in the table heading.
    """
    print(f"\n{dataset_name} - Before vs After Comparison:")
    print("-" * 50)

    before = original_df['open'].describe()
    after = filled_df['open'].describe()

    print(f"{'Statistic':<10} {'Original':<12} {'Filled':<12} {'Difference':<12}")
    print("-" * 48)

    for label in ('count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'):
        lhs, rhs = before[label], after[label]
        delta = rhs - lhs
        if label != 'count':
            print(f"{label:<10} {lhs:<12.4f} {rhs:<12.4f} {delta:<12.4f}")
        else:
            # Counts are whole numbers, so no decimals are shown
            print(f"{label:<10} {lhs:<12.0f} {rhs:<12.0f} {delta:<12.0f}")

# Print the before/after table for each split (train first, then test)
for _before, _after, _label in (
    (train_df, train_filled, "Training Data"),
    (test_df, test_filled, "Test Data"),
):
    compare_statistics(_before, _after, _label)

In [None]:
# Write the filled frames to new "_filled" parquet files
print("\nSaving filled datasets...")

train_filled_file, test_filled_file = (
    'data/data_1993_2000_train_val_filled.parquet',
    'data/data_2001_2019_test_filled.parquet',
)

# Write both files first, then confirm each one
for _frame, _target in (
    (train_filled, train_filled_file),
    (test_filled, test_filled_file),
):
    _frame.to_parquet(_target, index=False)

for _target in (train_filled_file, test_filled_file):
    print(f"✅ Saved: {_target}")

# File size comparison
import os

def get_file_size_mb(filepath):
    """Return the size of the file at ``filepath`` in units of 1024**2 bytes."""
    size_in_bytes = os.stat(filepath).st_size
    return size_in_bytes / (1 << 20)

# Report the on-disk size of each original/filled pair, in MB with one
# decimal. Each size is read via get_file_size_mb right before its line is
# printed, so a missing file raises after the preceding lines are shown.
print("\nFile Size Comparison:")
print(f"  Original train: {get_file_size_mb(train_file):.1f}MB")
print(f"  Filled train:   {get_file_size_mb(train_filled_file):.1f}MB")
print(f"  Original test:  {get_file_size_mb(test_file):.1f}MB")
print(f"  Filled test:    {get_file_size_mb(test_filled_file):.1f}MB")

In [None]:
# Sanity checks on the filled frames (counts are reported, nothing is enforced)
print("\nValidation Checks:")
print("-" * 20)

# Check 1: rows whose open price is zero or negative
train_neg_open = train_filled['open'].le(0).sum()
test_neg_open = test_filled['open'].le(0).sum()
print(f"✓ Non-positive open prices: Train={train_neg_open}, Test={test_neg_open}")

# Check 2: rows whose open/close ratio is above 2 or below 0.5.
# NOTE: the ratio is stored as a new column on both frames, so the shapes
# printed in Check 3 include it.
train_filled['open_close_ratio'] = train_filled['open'].div(train_filled['close'])
test_filled['open_close_ratio'] = test_filled['open'].div(test_filled['close'])

extreme_ratios_train = (
    train_filled['open_close_ratio'].gt(2) | train_filled['open_close_ratio'].lt(0.5)
).sum()
extreme_ratios_test = (
    test_filled['open_close_ratio'].gt(2) | test_filled['open_close_ratio'].lt(0.5)
).sum()

print(f"✓ Extreme open/close ratios (>2x or <0.5x): Train={extreme_ratios_train}, Test={extreme_ratios_test}")

# Check 3: report the final frame shapes
print(f"✓ Data integrity: Train shape={train_filled.shape}, Test shape={test_filled.shape}")

# Closing message and pointers to the follow-up work
print(
    "\n🎉 Data preprocessing completed successfully!",
    "\nNext steps:",
    "1. Generate images with both versions using datageneration.py",
    "2. Train models with both datasets",
    "3. Compare performance in data_analysis.ipynb",
    sep="\n",
)

In [None]:
# Final summary: where each dataset version lives and how it will be compared
_summary_lines = (
    "\n" + "="*60,
    "DATASET VERSIONS READY FOR COMPARISON",
    "="*60,
    # Original files
    "\n📁 Original Version (with missing values):",
    "   - data/data_1993_2000_train_val.parquet",
    "   - data/data_2001_2019_test.parquet",
    "   - Use: python datageneration.py --data_version original",
    # Filled files
    "\n📁 Filled Version (missing values filled):",
    "   - data/data_1993_2000_train_val_filled.parquet",
    "   - data/data_2001_2019_test_filled.parquet",
    "   - Use: python datageneration.py --data_version filled",
    # Planned experiment
    "\n🔬 Performance Comparison Plan:",
    "   1. Generate 20d images for both versions",
    "   2. Train CNN20d models separately",
    "   3. Compare Sharpe ratios and portfolio performance",
    "   4. Analyze in data_analysis.ipynb",
    # Expected outcome
    "\n💡 Hypothesis:",
    "   Filled version may show improved performance due to:",
    "   - More complete candlestick patterns",
    "   - Reduced noise from missing data",
    "   - Better gradient flow during training",
)
print(*_summary_lines, sep="\n")