# Re-Imaging Price Trends - Image Generation

**Purpose**: Convert stock price data to candlestick chart images and save to disk

**Next Step**: Run `2_model_training.ipynb` after completion

In [None]:
# Mount Google Drive and move into the project folder so that the relative
# paths used by the rest of this notebook resolve against it.
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/ReImaging_Price_Trends')
print(f"Current directory: {os.getcwd()}")
# Dot-files are left out of the listing.
print(f"File list: {[f for f in os.listdir('.') if not f.startswith('.')]}")

# Check Numba JIT performance optimization
try:
    import numba
except ImportError:
    print("Numba installation failed - check requirements.txt")
else:
    # Only the import is guarded; the report runs once it has succeeded.
    print(f"Numba JIT available: {numba.__version__}")
    print("   Image generation speed improved by 50-100x!")

# Check memory status
import psutil
memory = psutil.virtual_memory()
# True division on purpose: floor division would drop the fractional part
# before formatting, so 1.9GB free would be shown as "1.0GB".
print(f"Available memory: {memory.available / (1024**3):.1f}GB")
if memory.available < 2 * (1024**3):  # Less than 2GB
    print("Warning: Low memory - recommend using --parallel 1 option")

In [None]:
# Environment setup: install the packages listed in requirements.txt.
# NOTE(review): the Numba check in the previous cell runs before this install;
# re-run that cell afterwards if it reported Numba as missing.
!pip install -r requirements.txt

In [None]:
# Verify that the expected parquet inputs are on disk (original and filled variants).
# The first two entries are the original files, the last two the filled ones.
data_files = [
    'data/data_1993_2000_train_val.parquet',
    'data/data_2001_2019_test.parquet',
    'data/data_1993_2000_train_val_filled.parquet',
    'data/data_2001_2019_test_filled.parquet'
]


def _report_presence(paths):
    """Print one ✓/✗ line per path and return True only if every path exists."""
    everything_found = True
    for path in paths:
        if not os.path.exists(path):
            print(f"✗ {path} missing")
            everything_found = False
            continue
        megabytes = os.path.getsize(path) / (1024**2)
        print(f"✓ {path} ({megabytes:.1f}MB)")
    return everything_found


print("Data file check:")
print("Original data files:")
original_exist = _report_presence(data_files[:2])

print("\nFilled data files:")
filled_exist = _report_presence(data_files[2:])

original_status = '✓ Available' if original_exist else '✗ Missing'
filled_status = '✓ Available' if filled_exist else '✗ Missing'
print("\nData availability:")
print(f"   Original data: {original_status}")
print(f"   Filled data: {filled_status}")

# Guidance is only printed while the filled files are absent; which message
# depends on whether the original files are there to build them from.
if not filled_exist:
    if not original_exist:
        print("\nWarning: No data files found. Run data/datageneration.ipynb first.")
    else:
        print("\nNote: Only original data available. Run data_preprocessing_filled.ipynb to create filled data.")

## Original Data Image Generation

In [None]:
# Original data, 5-day window, train mode (no --data_version flag is passed).
# NOTE(review): --sample_rate 1.0 presumably keeps every sample - confirm in datageneration.py.
!python datageneration.py --image_days 5 --mode train --sample_rate 1.0

In [None]:
# Original data, 5-day window, test mode.
!python datageneration.py --image_days 5 --mode test --sample_rate 1.0

In [None]:
# Original data, 20-day window, train mode.
!python datageneration.py --image_days 20 --mode train --sample_rate 1.0

In [None]:
# Original data, 20-day window, test mode.
!python datageneration.py --image_days 20 --mode test --sample_rate 1.0

In [None]:
# Original data, 60-day window, train mode.
!python datageneration.py --image_days 60 --mode train --sample_rate 1.0

In [None]:
# Original data, 60-day window, test mode.
!python datageneration.py --image_days 60 --mode test --sample_rate 1.0

In [None]:
# Check generated original format images
import pandas as pd
import os
import numpy as np

print("Generated original format image summary:")

# Check original format directories: sub-directory -> (window length in days, file prefix)
original_dirs = {
    'weekly_5d': (5, '5d_week_has_vb_[5]_ma'),
    'monthly_20d': (20, '20d_month_has_vb_[20]_ma'),
    'quarterly_60d': (60, '60d_quarter_has_vb_[60]_ma')
}

# Bytes taken by one image per window length. The count below divides the
# .dat file size by height x width, i.e. it assumes one byte per pixel.
IMAGE_BYTES_BY_WINDOW = {
    5: 32 * 15,    # 5-day: 32x15
    20: 64 * 60,   # 20-day: 64x60
    60: 96 * 180,  # 60-day: 96x180
}


def summarize_image_dirs(base_dir, dir_specs):
    """Print a per-directory image summary and return the aggregate numbers.

    Kept local to this cell so the cell can be run on its own.

    Args:
        base_dir: Directory holding one sub-directory per image type.
        dir_specs: Mapping of sub-directory name to ``(window_days, prefix)``.
            Only ``window_days`` is used; it selects the per-image byte size.

    Returns:
        ``(success_count, total_images, total_bytes)`` where a directory only
        counts as a success if it exists and holds at least one ``.dat`` file.

    Raises:
        KeyError: If ``window_days`` has no entry in ``IMAGE_BYTES_BY_WINDOW``.
    """
    success_count = 0
    total_images = 0
    total_bytes = 0

    for dir_name, (win_size, _prefix) in dir_specs.items():
        img_dir = os.path.join(base_dir, dir_name)
        if not os.path.isdir(img_dir):
            print(f"✗ {dir_name}: not generated")
            continue

        # One directory listing serves both the .dat and the .feather filter.
        entries = os.listdir(img_dir)
        dat_files = [name for name in entries if name.endswith('.dat')]
        feather_files = [name for name in entries if name.endswith('.feather')]

        # An empty directory is not a successful run: without this guard the
        # summary would claim success while reporting zero images.
        if not dat_files:
            print(f"✗ {dir_name}: directory exists but contains no .dat files")
            continue

        # Loop-invariant for the directory, so it is looked up once.
        bytes_per_image = IMAGE_BYTES_BY_WINDOW[win_size]
        dat_sizes = [os.path.getsize(os.path.join(img_dir, name)) for name in dat_files]
        feather_bytes = sum(
            os.path.getsize(os.path.join(img_dir, name)) for name in feather_files
        )

        # Image count is derived from each .dat file's size.
        dir_images = sum(size // bytes_per_image for size in dat_sizes)
        dir_bytes = sum(dat_sizes) + feather_bytes

        print(f"✓ {dir_name}: {dir_images:,} images, {dir_bytes / (1024**3):.2f}GB")
        print(f"   .dat files: {len(dat_files)}, .feather files: {len(feather_files)}")

        total_images += dir_images
        total_bytes += dir_bytes
        success_count += 1

    return success_count, total_images, total_bytes


base_dir = 'img_data_reconstructed'
if os.path.exists(base_dir):
    success_count, total_images, total_bytes = summarize_image_dirs(base_dir, original_dirs)
    total_size_gb = total_bytes / (1024**3)

    print(f"\nOriginal data results:")
    print(f"   Success: {success_count}/{len(original_dirs)} directories")
    print(f"   Total images: {total_images:,}")
    print(f"   Total size: {total_size_gb:.2f}GB")
    # max(..., 1) avoids a division by zero when nothing was generated.
    print(f"   Average size per image: {total_bytes / 1024 / max(total_images, 1):.1f}KB")

    if success_count == len(original_dirs):
        print(f"\n✅ All original format images generated successfully!")
        print(f"Save path: {base_dir}/")
        print(f"Format: .dat (binary images) + .feather (labels)")
        print(f"\nSaved in same format as original paper authors")
    else:
        print(f"\nWarning: {len(original_dirs)-success_count} directories failed")
        print("   Check datageneration.py error logs.")
else:
    print(f"✗ {base_dir} directory not created.")
    print("   Check datageneration.py execution.")

## Filled Data Image Generation (Missing Values Filled)

In [None]:
# Filled data, 5-day window, train mode.
# NOTE(review): --data_version filled presumably switches the input to the
# *_filled.parquet files - confirm in datageneration.py.
print(f"Generating 5-day images for training data (filled version)...")
!python datageneration.py --image_days 5 --mode train --sample_rate 1.0 --data_version filled

In [None]:
# Filled data, 5-day window, test mode.
print(f"Generating 5-day images for test data (filled version)...")
!python datageneration.py --image_days 5 --mode test --sample_rate 1.0 --data_version filled

In [None]:
# Filled data, 20-day window, train mode.
print(f"Generating 20-day images for training data (filled version)...")
!python datageneration.py --image_days 20 --mode train --sample_rate 1.0 --data_version filled

In [None]:
# Filled data, 20-day window, test mode.
print(f"Generating 20-day images for test data (filled version)...")
!python datageneration.py --image_days 20 --mode test --sample_rate 1.0 --data_version filled

In [None]:
# Filled data, 60-day window, train mode.
print(f"Generating 60-day images for training data (filled version)...")
!python datageneration.py --image_days 60 --mode train --sample_rate 1.0 --data_version filled

In [None]:
# Filled data, 60-day window, test mode.
print(f"Generating 60-day images for test data (filled version)...")
!python datageneration.py --image_days 60 --mode test --sample_rate 1.0 --data_version filled

In [None]:
# Check generated filled format images
import pandas as pd
import os
import numpy as np

print("Generated filled format image summary:")

# Check filled format directories: sub-directory -> (window length in days, file prefix)
filled_dirs = {
    'weekly_5d': (5, '5d_week_has_vb_[5]_ma'),
    'monthly_20d': (20, '20d_month_has_vb_[20]_ma'),
    'quarterly_60d': (60, '60d_quarter_has_vb_[60]_ma')
}

# Bytes taken by one image per window length. The count below divides the
# .dat file size by height x width, i.e. it assumes one byte per pixel.
IMAGE_BYTES_BY_WINDOW = {
    5: 32 * 15,    # 5-day: 32x15
    20: 64 * 60,   # 20-day: 64x60
    60: 96 * 180,  # 60-day: 96x180
}


def summarize_image_dirs(base_dir, dir_specs):
    """Print a per-directory image summary and return the aggregate numbers.

    Kept local to this cell so the cell can be run on its own.

    Args:
        base_dir: Directory holding one sub-directory per image type.
        dir_specs: Mapping of sub-directory name to ``(window_days, prefix)``.
            Only ``window_days`` is used; it selects the per-image byte size.

    Returns:
        ``(success_count, total_images, total_bytes)`` where a directory only
        counts as a success if it exists and holds at least one ``.dat`` file.

    Raises:
        KeyError: If ``window_days`` has no entry in ``IMAGE_BYTES_BY_WINDOW``.
    """
    success_count = 0
    total_images = 0
    total_bytes = 0

    for dir_name, (win_size, _prefix) in dir_specs.items():
        img_dir = os.path.join(base_dir, dir_name)
        if not os.path.isdir(img_dir):
            print(f"✗ {dir_name}: not generated")
            continue

        # One directory listing serves both the .dat and the .feather filter.
        entries = os.listdir(img_dir)
        dat_files = [name for name in entries if name.endswith('.dat')]
        feather_files = [name for name in entries if name.endswith('.feather')]

        # An empty directory is not a successful run: without this guard the
        # summary would claim success while reporting zero images.
        if not dat_files:
            print(f"✗ {dir_name}: directory exists but contains no .dat files")
            continue

        # Loop-invariant for the directory, so it is looked up once.
        bytes_per_image = IMAGE_BYTES_BY_WINDOW[win_size]
        dat_sizes = [os.path.getsize(os.path.join(img_dir, name)) for name in dat_files]
        feather_bytes = sum(
            os.path.getsize(os.path.join(img_dir, name)) for name in feather_files
        )

        # Image count is derived from each .dat file's size.
        dir_images = sum(size // bytes_per_image for size in dat_sizes)
        dir_bytes = sum(dat_sizes) + feather_bytes

        print(f"✓ {dir_name}: {dir_images:,} images, {dir_bytes / (1024**3):.2f}GB")
        print(f"   .dat files: {len(dat_files)}, .feather files: {len(feather_files)}")

        total_images += dir_images
        total_bytes += dir_bytes
        success_count += 1

    return success_count, total_images, total_bytes


base_dir = 'img_data_reconstructed_filled'
if os.path.exists(base_dir):
    success_count, total_images, total_bytes = summarize_image_dirs(base_dir, filled_dirs)
    total_size_gb = total_bytes / (1024**3)

    print(f"\nFilled data results:")
    print(f"   Success: {success_count}/{len(filled_dirs)} directories")
    print(f"   Total images: {total_images:,}")
    print(f"   Total size: {total_size_gb:.2f}GB")
    # max(..., 1) avoids a division by zero when nothing was generated.
    print(f"   Average size per image: {total_bytes / 1024 / max(total_images, 1):.1f}KB")

    if success_count == len(filled_dirs):
        print(f"\n✅ All filled format images generated successfully!")
        print(f"Save path: {base_dir}/")
        print(f"Format: .dat (binary images) + .feather (labels)")
        print(f"\nFilled data: Missing values replaced with previous close prices")
        print(f"This should provide more complete training data with fewer gaps")
    else:
        print(f"\nWarning: {len(filled_dirs)-success_count} directories failed")
        print("   Check datageneration.py error logs.")

    # Compare with original if exists (only the two locations are printed;
    # no counts are compared here).
    original_base_dir = 'img_data_reconstructed'
    if os.path.exists(original_base_dir):
        print(f"\n📊 Comparison with original data:")
        print(f"   Original: {original_base_dir}/")
        print(f"   Filled: {base_dir}/")
        print(f"   Filled data should have equal or more images due to fewer NA gaps")
else:
    print(f"✗ {base_dir} directory not created.")
    print("   Check datageneration.py execution with --data_version filled.")