# FI-2010 Price Sensitivity Prediction

This notebook runs the complete pipeline (setup, data upload, training, and results inspection) on Google Colab or locally.

## Setup

In [None]:
# Clone repository (if running on Colab) or navigate to project root
import os
from pathlib import Path

# Make sure the notebook runs from the project root (the folder containing src/).
on_colab = 'COLAB_GPU' in os.environ or 'COLAB_RELEASE_TAG' in os.environ

# Local: navigate to project root if in notebooks folder. The current directory
# is checked first so the cell never leaves a directory that already is the root.
if not on_colab and not Path('src').exists() and Path('../src').exists():
    os.chdir('..')

# Report the directory in both environments so path problems are visible early.
print(f"Working directory: {os.getcwd()}")

if not Path('src').exists():
    if on_colab:
        # Running on Colab - clone if needed
        print("Clone your repo here or upload files")
        # !git clone https://github.com/YOUR_USERNAME/fi2010-prediction.git
        # %cd fi2010-prediction
    else:
        # The following cells reference src/ and configs/ relative to this directory.
        print("Warning: 'src/' not found here; run this notebook from the project root or its notebooks folder.")

In [None]:
# Install dependencies
!pip install -q numpy pandas scikit-learn xgboost torch pyyaml mlflow matplotlib tqdm

In [None]:
# Verify project structure
!ls -la
print("\n--- src/ ---")
!ls -la src/
print("\n--- configs/ ---")
!ls -la configs/

## Upload Data

Upload the FI-2010 dataset files to `data/raw/fi2010/`.

In [None]:
# Create data directory
# pathlib is used instead of `mkdir -p` so the cell also works where that
# shell command is unavailable (e.g. a local Windows run).
from pathlib import Path

Path('data/raw/fi2010').mkdir(parents=True, exist_ok=True)

# On Colab: upload files manually or use Google Drive
# Uncomment to upload from local machine:
# import shutil
# from google.colab import files
# uploaded = files.upload()
# for fname in uploaded.keys():
#     shutil.move(fname, f"data/raw/fi2010/{fname}")

# Alternative: Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')
# !cp /content/drive/MyDrive/fi2010_data/*.txt data/raw/fi2010/

In [None]:
# Verify data files are present
# Lists every file in the raw data folder with its size and says so explicitly
# when nothing is there, instead of showing an empty listing.
from pathlib import Path

raw_dir = Path('data/raw/fi2010')
data_files = sorted(p for p in raw_dir.glob('*') if p.is_file()) if raw_dir.is_dir() else []

if data_files:
    for data_file in data_files:
        print(f"{data_file.stat().st_size:>14,d} bytes  {data_file.name}")
else:
    print(f"No data files found in {raw_dir}/. Upload the FI-2010 files before running training.")

## Run Training Experiments

In [None]:
# Run XGBoost regression
!python -m src.train --config configs/xgboost_reg.yaml

In [None]:
# Run LSTM regression
!python -m src.train --config configs/lstm_reg.yaml

## View Results

In [None]:
import json
from pathlib import Path


def show_run_metrics(outputs_dir):
    """Print the metrics of every run found under ``outputs_dir``.

    Looks for ``<outputs_dir>/<experiment>/<run>/metrics.json`` and prints a
    banner followed by the key/value pairs of each file, in sorted path order.
    Float values are shown with six decimals; everything else is printed as-is.
    A message is printed when the directory is missing or holds no metrics, and
    a metrics file that is not valid JSON is reported and skipped.

    Args:
        outputs_dir: Directory containing one sub-directory per experiment.
    """
    outputs_dir = Path(outputs_dir)
    if not outputs_dir.exists():
        print("No outputs found. Run training first.")
        return

    # One glob replaces the nested experiment/run loops; sorting by path keeps
    # the experiment-then-run ordering.
    metrics_files = sorted(
        p for p in outputs_dir.glob('*/*/metrics.json') if p.is_file()
    )
    if not metrics_files:
        print(f"No metrics.json files found under {outputs_dir}/.")
        return

    banner = '=' * 60
    for metrics_file in metrics_files:
        run_dir = metrics_file.parent
        print(f"\n{banner}")
        print(f"Experiment: {run_dir.parent.name}")
        print(f"Run: {run_dir.name}")
        print(banner)
        try:
            with open(metrics_file, encoding='utf-8') as f:
                metrics = json.load(f)
        except json.JSONDecodeError as err:
            # Keep going so one broken file does not hide the other runs.
            print(f"  Could not parse {metrics_file.name}: {err}")
            continue
        for k, v in metrics.items():
            if isinstance(v, float):
                print(f"  {k}: {v:.6f}")
            else:
                print(f"  {k}: {v}")


# Find and display results
show_run_metrics(Path('outputs'))

In [None]:
# Display plots
# Shows every PNG saved anywhere under outputs/, labelled with its relative path.
from IPython.display import Image, display
from pathlib import Path

outputs_dir = Path('outputs')
if outputs_dir.exists():
    # rglob does not guarantee an order; sort so plots appear grouped by
    # experiment/run and in the same order on every execution.
    png_files = sorted(outputs_dir.rglob('*.png'))
    if png_files:
        for png_file in png_files:
            print(f"\n{png_file.relative_to(outputs_dir)}")
            display(Image(filename=str(png_file), width=600))
    else:
        print("No plots found.")
else:
    print("No outputs found.")

## MLflow Experiment Tracking

In [None]:
import mlflow
from pathlib import Path

# Print, per MLflow experiment in the local ./mlruns store, a table of its runs
# with run id, status and the first few metric columns.
mlruns_path = Path('mlruns')
if mlruns_path.exists():
    # Path.as_uri() builds a well-formed file URI (file:///...) on every
    # platform; hand-formatting 'file://' + path breaks on Windows drive paths.
    mlflow.set_tracking_uri(mlruns_path.absolute().as_uri())

    for exp in mlflow.search_experiments():
        print(f"\nExperiment: {exp.name}")
        runs = mlflow.search_runs(experiment_ids=[exp.experiment_id])
        if len(runs) == 0:
            continue
        cols = ['run_id', 'status']
        metric_cols = [c for c in runs.columns if c.startswith('metrics.')]
        # Cap the table at five metric columns to keep the printout readable.
        display_cols = cols + metric_cols[:5]
        # Guard against columns that are absent from this result frame.
        display_cols = [c for c in display_cols if c in runs.columns]
        print(runs[display_cols].to_string())
else:
    print("No MLflow runs found. Run training first.")

## Run Classification Task

In [None]:
# Run XGBoost classification
!python -m src.train --config configs/xgboost_cls.yaml

## Custom Configuration Example

In [None]:
import yaml

# Build each section of the example configuration separately, then assemble
# them into the full config dictionary.
data_section = {
    'raw_dir': './data/raw/fi2010',
    'processed_dir': './data/processed',
    'file_patterns': ['*.txt', '*.csv'],
}
label_section = {'tau': 20, 'task': 'regression', 'epsilon': 0.0002}
window_section = {'lookback': 100, 'mode': 'sequence'}
split_section = {
    'train_ratio': 0.70,
    'val_ratio': 0.15,
    'test_ratio': 0.15,
    'purge_boundary': True,
}
normalization_section = {'method': 'zscore'}
model_section = {
    'name': 'gru',
    'params': {
        'hidden_size': 128,
        'num_layers': 2,
        'dropout': 0.3,
        'bidirectional': False,
    },
}
training_section = {
    'batch_size': 128,
    'epochs': 50,
    'learning_rate': 0.0005,
    'early_stopping_patience': 10,
    'device': 'auto',
}

custom_config = {
    'experiment_name': 'custom_experiment',
    'seed': 123,
    'data': data_section,
    'label': label_section,
    'window': window_section,
    'split': split_section,
    'normalization': normalization_section,
    'model': model_section,
    'training': training_section,
}

# Serialize once; the same text is written to disk and echoed below.
config_yaml = yaml.dump(custom_config, default_flow_style=False)

with open('configs/custom.yaml', 'w') as config_file:
    config_file.write(config_yaml)

print("Custom config saved to configs/custom.yaml")
print("\nConfig contents:")
print(config_yaml)

In [None]:
# Run custom config: trains with configs/custom.yaml, the file written by the
# previous cell. Left commented out on purpose; uncomment when data is available.
# !python -m src.train --config configs/custom.yaml