# Dataset Up/Down Distribution Analysis

Analyze how many stocks moved up versus down after 20 days across the entire dataset (training and test splits).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Font settings (the original comment, in Korean, read "Korean font settings").
# NOTE(review): confirm DejaVu Sans covers every glyph the plots need; all
# labels visible in this notebook are English.
plt.rcParams.update({
    'font.family': 'DejaVu Sans',
    # Draw minus signs with the plain hyphen rather than the Unicode minus.
    'axes.unicode_minus': False,
})

## 1. Training Data Analysis (1993-2000)

In [None]:
# Read the 1993-2000 train/validation parquet file and print a short overview:
# row count, column names, date coverage and number of distinct stock codes.
print("=== Training Data (1993-2000) ===")
train_data = pd.read_parquet('data/data_1993_2000_train_val.parquet')

train_rows = len(train_data)
print(f"Total data size: {train_rows:,}")

train_columns = list(train_data.columns)
print(f"Columns: {train_columns}")

train_dates = train_data['date']
print(f"Date range: {train_dates.min()} ~ {train_dates.max()}")

train_stock_count = train_data['code'].nunique()
print(f"Unique stocks: {train_stock_count:,}")

In [None]:
# 20-day up/down distribution (label_20) for the training split:
# prints class counts and shares, reports NA labels, and draws a pie chart.
if 'label_20' in train_data.columns:
    label_20_counts = train_data['label_20'].value_counts().sort_index()

    # Shares are taken over every row (NA labels included), so Down + Up + NA
    # add up to 100%. max() keeps an empty frame from dividing by zero.
    row_total = max(len(train_data), 1)
    down_count = label_20_counts.get(0, 0)
    up_count = label_20_counts.get(1, 0)

    print(f"\n20-day up/down distribution (training data):")
    print(f"  Down (0): {down_count:,} ({down_count/row_total*100:.1f}%)")
    print(f"  Up (1): {up_count:,} ({up_count/row_total*100:.1f}%)")

    # Check NA values
    na_count = train_data['label_20'].isna().sum()
    if na_count > 0:
        print(f"  NA values: {na_count:,} ({na_count/row_total*100:.1f}%)")

    # Visualization
    # value_counts() only lists classes that actually occur, so the wedge
    # labels are derived from its index. A fixed two-item label list makes
    # plt.pie raise whenever a class is absent or an extra value is present.
    if not label_20_counts.empty:
        class_names = {0: 'Down (0)', 1: 'Up (1)'}
        pie_labels = [class_names.get(value, str(value)) for value in label_20_counts.index]

        plt.figure(figsize=(8, 6))
        plt.pie(label_20_counts.values, labels=pie_labels, autopct='%1.1f%%', startangle=90)
        plt.title('Training Data: 20-day Up/Down Distribution')
        plt.show()

In [None]:
# Summarise the 20-day return column (ret20) of the training split and plot
# its histogram twice: full range, and restricted to the 1%-99% quantile band.
if 'ret20' in train_data.columns:
    ret20_stats = train_data['ret20'].describe()
    print("\n20-day actual returns (ret20) statistics:")
    for stat_name, stat_key in (
        ('Mean', 'mean'),
        ('Std', 'std'),
        ('Median', '50%'),
        ('Min', 'min'),
        ('Max', 'max'),
    ):
        stat_value = ret20_stats[stat_key]
        # Shown both as a raw fraction and as a percentage.
        print(f"  {stat_name}: {stat_value:.4f} ({stat_value*100:.2f}%)")

    ret20_values = train_data['ret20'].dropna()

    # Keep only values inside the inclusive [1st, 99th] percentile band so the
    # second histogram is not stretched by extreme returns.
    q1, q99 = ret20_values.quantile([0.01, 0.99])
    ret20_filtered = ret20_values[ret20_values.between(q1, q99)]

    # Return distribution histograms: left = all values, right = trimmed.
    fig, (full_ax, trimmed_ax) = plt.subplots(1, 2, figsize=(12, 5))
    panels = (
        (
            full_ax,
            ret20_values,
            '20-day Returns',
            'Training Data: 20-day Return Distribution',
        ),
        (
            trimmed_ax,
            ret20_filtered,
            '20-day Returns (1-99% range)',
            'Training Data: 20-day Return Distribution (outliers removed)',
        ),
    )
    for ax, values, x_label, panel_title in panels:
        ax.hist(values, bins=100, alpha=0.7, edgecolor='black')
        ax.set_xlabel(x_label)
        ax.set_ylabel('Frequency')
        ax.set_title(panel_title)
        # Vertical reference line separating negative from positive returns.
        ax.axvline(0, color='red', linestyle='--', alpha=0.7, label='0% baseline')
        ax.legend()

    fig.tight_layout()
    plt.show()

## 2. Test Data Analysis (2001-2019)

In [None]:
# Read the 2001-2019 test parquet file and print a short overview:
# row count, column names, date coverage and number of distinct stock codes.
print("=== Test Data (2001-2019) ===")
test_data = pd.read_parquet('data/data_2001_2019_test.parquet')

test_rows = len(test_data)
print(f"Total data size: {test_rows:,}")

test_columns = list(test_data.columns)
print(f"Columns: {test_columns}")

test_dates = test_data['date']
print(f"Date range: {test_dates.min()} ~ {test_dates.max()}")

test_stock_count = test_data['code'].nunique()
print(f"Unique stocks: {test_stock_count:,}")

In [None]:
# 20-day up/down distribution (label_20) for the test split:
# prints class counts and shares, reports NA labels, and draws a pie chart.
if 'label_20' in test_data.columns:
    label_20_counts_test = test_data['label_20'].value_counts().sort_index()

    # Shares are taken over every row (NA labels included), so Down + Up + NA
    # add up to 100%. max() keeps an empty frame from dividing by zero.
    row_total_test = max(len(test_data), 1)
    down_count_test = label_20_counts_test.get(0, 0)
    up_count_test = label_20_counts_test.get(1, 0)

    print(f"\n20-day up/down distribution (test data):")
    print(f"  Down (0): {down_count_test:,} ({down_count_test/row_total_test*100:.1f}%)")
    print(f"  Up (1): {up_count_test:,} ({up_count_test/row_total_test*100:.1f}%)")

    # Check NA values
    na_count_test = test_data['label_20'].isna().sum()
    if na_count_test > 0:
        print(f"  NA values: {na_count_test:,} ({na_count_test/row_total_test*100:.1f}%)")

    # Visualization
    # value_counts() only lists classes that actually occur, so the wedge
    # labels are derived from its index. A fixed two-item label list makes
    # plt.pie raise whenever a class is absent or an extra value is present.
    if not label_20_counts_test.empty:
        class_names_test = {0: 'Down (0)', 1: 'Up (1)'}
        pie_labels_test = [
            class_names_test.get(value, str(value)) for value in label_20_counts_test.index
        ]

        plt.figure(figsize=(8, 6))
        plt.pie(label_20_counts_test.values, labels=pie_labels_test, autopct='%1.1f%%', startangle=90)
        plt.title('Test Data: 20-day Up/Down Distribution')
        plt.show()

In [None]:
# Summarise the 20-day return column (ret20) of the test split and plot its
# histogram twice: full range, and restricted to the 1%-99% quantile band.
if 'ret20' in test_data.columns:
    ret20_stats_test = test_data['ret20'].describe()
    print("\n20-day actual returns (ret20) statistics (test):")
    for stat_name, stat_key in (
        ('Mean', 'mean'),
        ('Std', 'std'),
        ('Median', '50%'),
        ('Min', 'min'),
        ('Max', 'max'),
    ):
        stat_value = ret20_stats_test[stat_key]
        # Shown both as a raw fraction and as a percentage.
        print(f"  {stat_name}: {stat_value:.4f} ({stat_value*100:.2f}%)")

    ret20_values_test = test_data['ret20'].dropna()

    # Keep only values inside the inclusive [1st, 99th] percentile band so the
    # second histogram is not stretched by extreme returns.
    q1_test, q99_test = ret20_values_test.quantile([0.01, 0.99])
    ret20_filtered_test = ret20_values_test[ret20_values_test.between(q1_test, q99_test)]

    # Return distribution histograms: left = all values, right = trimmed.
    fig, (full_ax, trimmed_ax) = plt.subplots(1, 2, figsize=(12, 5))
    panels = (
        (
            full_ax,
            ret20_values_test,
            '20-day Returns',
            'Test Data: 20-day Return Distribution',
        ),
        (
            trimmed_ax,
            ret20_filtered_test,
            '20-day Returns (1-99% range)',
            'Test Data: 20-day Return Distribution (outliers removed)',
        ),
    )
    for ax, values, x_label, panel_title in panels:
        ax.hist(values, bins=100, alpha=0.7, edgecolor='black')
        ax.set_xlabel(x_label)
        ax.set_ylabel('Frequency')
        ax.set_title(panel_title)
        # Vertical reference line separating negative from positive returns.
        ax.axvline(0, color='red', linestyle='--', alpha=0.7, label='0% baseline')
        ax.legend()

    fig.tight_layout()
    plt.show()

## 3. Overall Data Comparison

In [None]:
# Training vs test data comparison: print split sizes, then (when both splits
# carry label_20) the pooled up/down shares plus a bar chart and a pie chart.
print("=== Overall Dataset Comparison ===")
train_size = len(train_data)
print(f"Training data (1993-2000): {train_size:,}")
test_size = len(test_data)
print(f"Test data (2001-2019): {test_size:,}")
print(f"Total data: {train_size + test_size:,}")

if all('label_20' in frame.columns for frame in (train_data, test_data)):
    # Per-split class counts are read from label_20_counts and
    # label_20_counts_test, which this cell expects to exist already.
    total_up_train = label_20_counts.get(1, 0)
    total_down_train = label_20_counts.get(0, 0)
    total_up_test = label_20_counts_test.get(1, 0)
    total_down_test = label_20_counts_test.get(0, 0)

    # Pooled totals across both splits (NA labels are not part of these).
    total_up = total_up_train + total_up_test
    total_down = total_down_train + total_down_test
    total_samples = total_up + total_down

    print("\nOverall 20-day up/down distribution:")
    print(f"  Total down (0): {total_down:,} ({total_down/total_samples*100:.1f}%)")
    print(f"  Total up (1): {total_up:,} ({total_up/total_samples*100:.1f}%)")

    # Left panel: grouped bars per split. Right panel: pooled pie chart.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    categories = ['Training Data\n(1993-2000)', 'Test Data\n(2001-2019)']
    up_counts = [total_up_train, total_up_test]
    down_counts = [total_down_train, total_down_test]

    positions = np.arange(len(categories))
    bar_width = 0.35
    half_width = bar_width / 2

    # Down bars sit half a bar to the left of each tick, up bars to the right.
    for offset, heights, legend_label in (
        (-half_width, down_counts, 'Down (0)'),
        (half_width, up_counts, 'Up (1)'),
    ):
        ax1.bar(positions + offset, heights, bar_width, label=legend_label, alpha=0.8)

    ax1.set(
        xlabel='Dataset',
        ylabel='Number of Stocks',
        title='20-day Up/Down Distribution by Dataset',
    )
    ax1.set_xticks(positions)
    ax1.set_xticklabels(categories)
    ax1.legend()

    # Overall distribution pie chart
    ax2.pie(
        [total_down, total_up],
        labels=['Down (0)', 'Up (1)'],
        autopct='%1.1f%%',
        startangle=90,
    )
    ax2.set_title('Overall Data: 20-day Up/Down Distribution')

    fig.tight_layout()
    plt.show()

## 4. Compare 5-day, 20-day, and 60-day Distributions Side by Side

In [None]:
# Compare 5-day, 20-day, 60-day prediction period up/down distributions.
# For every horizon this draws one pie per split (training on the top row,
# test on the bottom row) and collects the "up" share for a summary table.
periods = [5, 20, 60]
class_names = {0: 'Down (0)', 1: 'Up (1)'}
splits = [('Training', train_data), ('Test', test_data)]

# Grid size follows `periods`; squeeze=False keeps `axes` two-dimensional even
# when only one horizon is listed.
fig, axes = plt.subplots(
    len(splits), len(periods), figsize=(6 * len(periods), 10), squeeze=False
)

summary_data = []
for i, period in enumerate(periods):
    label_col = f'label_{period}'
    summary_row = {'Prediction Period': f'{period} days'}

    for row, (split_name, frame) in enumerate(splits):
        ax = axes[row, i]

        # Count each label once and reuse it for both the pie and the table.
        counts = (
            frame[label_col].value_counts().sort_index()
            if label_col in frame.columns
            else None
        )

        if counts is None or counts.empty:
            # Missing column or no non-NA labels: blank the panel and report
            # N/A rather than a "0.0%" that would read as "no stock went up".
            ax.set_title(f'{split_name} Data: {period}-day Distribution (no data)')
            ax.axis('off')
            summary_row[f'{split_name} Up %'] = 'N/A'
            continue

        # value_counts() only lists classes that occur, so labels come from
        # its index; a fixed two-item list would make pie() raise otherwise.
        pie_labels = [class_names.get(value, str(value)) for value in counts.index]
        ax.pie(counts.values, labels=pie_labels, autopct='%1.1f%%', startangle=90)
        ax.set_title(f'{split_name} Data: {period}-day Distribution')

        # Share of "up" rows over all rows of the split (NA labels included).
        up_pct = counts.get(1, 0) / len(frame) * 100
        summary_row[f'{split_name} Up %'] = f'{up_pct:.1f}%'

    summary_data.append(summary_row)

plt.tight_layout()
plt.show()

# Numerical summary table
summary_df = pd.DataFrame(summary_data)
print("\n=== Up Percentage Summary by Prediction Period ===")
print(summary_df.to_string(index=False))

## 5. Data Version Comparison: Original vs Filled Missing Values

Compare the two data versions to understand the impact of filling missing open prices with previous day's close prices.

In [None]:
# Load filled data versions (if they exist) and compare how many values are
# missing in the original frames versus the filled ones.
import os


def build_missing_value_comparison(original, filled, columns):
    """Tabulate missing-value counts per column before and after filling.

    Args:
        original: DataFrame holding the unfilled data.
        filled: DataFrame holding the filled data.
        columns: Column names to compare. A name that is absent from either
            frame is skipped.

    Returns:
        DataFrame with one row per compared column and the columns
        'Column', 'Original Missing', 'Filled Missing', 'Reduction' and
        'Reduction %' (a string formatted to one decimal place; "0.0%" when
        the original column has no missing values).
    """
    rows = []
    for col in columns:
        if col not in original.columns or col not in filled.columns:
            continue

        orig_missing = original[col].isnull().sum()
        filled_missing = filled[col].isnull().sum()
        reduction = orig_missing - filled_missing

        # Avoid dividing by zero when nothing was missing to begin with.
        if orig_missing > 0:
            reduction_pct = f"{reduction / orig_missing * 100:.1f}%"
        else:
            reduction_pct = "0.0%"

        rows.append({
            'Column': col,
            'Original Missing': orig_missing,
            'Filled Missing': filled_missing,
            'Reduction': reduction,
            'Reduction %': reduction_pct,
        })
    return pd.DataFrame(rows)


filled_train_path = 'data/data_1993_2000_train_val_filled.parquet'
filled_test_path = 'data/data_2001_2019_test_filled.parquet'

if os.path.exists(filled_train_path) and os.path.exists(filled_test_path):
    print("=== Loading Filled Data Versions ===")
    train_data_filled = pd.read_parquet(filled_train_path)
    test_data_filled = pd.read_parquet(filled_test_path)

    print(f"Filled training data: {len(train_data_filled):,}")
    print(f"Filled test data: {len(test_data_filled):,}")

    # Compare missing values between original and filled versions
    print("\n=== Missing Value Comparison ===")
    key_columns = ['open', 'high', 'low', 'close', 'volume']

    comparison_df = build_missing_value_comparison(train_data, train_data_filled, key_columns)
    print("\nTraining Data Missing Value Comparison:")
    print(comparison_df.to_string(index=False))

    # Test data comparison
    comparison_df_test = build_missing_value_comparison(test_data, test_data_filled, key_columns)
    print("\nTest Data Missing Value Comparison:")
    print(comparison_df_test.to_string(index=False))

else:
    print("⚠️  Filled data versions not found.")
    print("Please run data_preprocessing_filled.ipynb first to generate filled datasets.")

## 6. Model Performance Comparison

Compare CNN model performance between original and filled data versions.