In [2]:
import pandas as pd

def preprocess_and_save(input_file: str, output_file: str) -> None:
    """
    Read a raw price CSV, label its columns, and save the result as a new CSV.

    The first two rows of input_file are skipped, so the third row is parsed
    as the header. The parsed columns are then renamed *by position* to
    Date, Close, High, Low, Open, and the frame is written to output_file
    without the index column.

    Args:
        input_file: Path of the raw CSV file to read.
        output_file: Path the processed CSV file is written to.

    Raises:
        ValueError: If the parsed data does not have exactly five columns.
    """
    expected_columns = ['Date', 'Close', 'High', 'Low', 'Open']

    # Load the data, skipping the first two rows
    df = pd.read_csv(input_file, skiprows=2)

    # Print the original column names for debugging
    print("Original column names:", df.columns.tolist())

    # Names are assigned by position, so the column count has to be checked
    # before renaming: a mismatch would otherwise surface as pandas' generic
    # "Length mismatch" error, and guessing which columns to drop could
    # silently mislabel the data.
    if len(df.columns) != len(expected_columns):
        raise ValueError(
            f"Expected {len(expected_columns)} columns {expected_columns} but "
            f"input data has {len(df.columns)}: {df.columns.tolist()}. "
            "Please check your input file."
        )

    df.columns = expected_columns

    # Save to CSV
    df.to_csv(output_file, index=False)
    print(f"Preprocessed data saved to {output_file}")

if __name__ == "__main__":
    # Sample run: swap in your own paths before executing.
    output_csv = 'USDJPY_processed_data.csv'  # destination for the cleaned data
    input_csv = 'USDJPY_full.csv'  # raw file to read; replace with your actual path

    preprocess_and_save(input_file=input_csv, output_file=output_csv)


Original column names: ['Date', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
Preprocessed data saved to USDJPY_processed_data.csv
