In [3]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import glob

# Define the directory containing all the data files
data_dir = '/home/lr/Documents/FUTURUES_PROJECT/futures_vpoc_backtest/DATA/'
output_dir = '/home/lr/Documents/FUTURUES_PROJECT/futures_vpoc_backtest/RESULTS/'

In [4]:
# Use glob to find all .txt files in the directory
file_paths = glob.glob(os.path.join(data_dir, '*.txt'))
print(f"Found {len(file_paths)} text files in {data_dir}")

# Initialize a list to store dataframes from each file
all_dfs = []

Found 13 text files in /home/lr/Documents/FUTURUES_PROJECT/futures_vpoc_backtest/DATA/


In [8]:
# Process each file
for file_path in file_paths:
    file_name = os.path.basename(file_path)
    print(f"\nProcessing file: {file_name}")
    
    try:
        with open(file_path, 'r') as file:
            data_lines = file.readlines()
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        continue
    except PermissionError:
        print(f"Error: Permission denied when accessing {file_path}")
        continue
    except Exception as e:
        print(f"Unexpected error when opening file: {str(e)}")
        continue
    
    print(f"Successfully loaded file with {len(data_lines)} lines")
    
    # Initialize lists to store data
    timestamps = []
    open_prices = []
    high_prices = []
    low_prices = []
    close_prices = []
    volumes = []
    contract_names = []  # To track which file/contract the data came from
    
    # Parse each line
    for line in data_lines:
        # Split by semicolon
        parts = line.strip().split(';')
        if len(parts) != 6:
            print(f"Skipping malformed line: {line}")
            continue
        
        # Parse timestamp (format: YYYYMMDD HHMMSS)
        date_time_str = parts[0]
        try:
            timestamp = datetime.strptime(date_time_str, '%Y%m%d %H%M%S')
            timestamps.append(timestamp)
        except ValueError:
            print(f"Skipping line with invalid datetime: {line}")
            continue
        
        # Parse OHLC and volume data
        try:
            open_prices.append(float(parts[1]))
            high_prices.append(float(parts[2]))
            low_prices.append(float(parts[3]))
            close_prices.append(float(parts[4]))
            volumes.append(int(parts[5]))
            contract_names.append(file_name.replace('.Last.txt', ''))  # Store contract name
        except ValueError:
            print(f"Skipping line with invalid numeric data: {line}")
            continue
    
    # Skip if no valid data was found
    if not timestamps:
        print(f"No valid data found in {file_name}, skipping...")
        continue
        
    # Create DataFrame for this file
    df = pd.DataFrame({
        'timestamp': timestamps,
        'open': open_prices,
        'high': high_prices,
        'low': low_prices,
        'close': close_prices,
        'volume': volumes,
        'contract': contract_names
    })
    
    # Set timestamp as index
    df.set_index('timestamp', inplace=True)
    
    # Add trading session information
    df['session'] = 'ETH'  # Default to Extended Trading Hours
    # Regular Trading Hours (RTH) is typically 9:30 AM - 4:00 PM ET
    df.loc[df.index.hour.isin(range(9, 16)) & 
           ((df.index.hour != 9) | (df.index.minute >= 30)), 'session'] = 'RTH'
    
    # Check for data quality issues
    print(f"Data shape for {file_name}: {df.shape}")
    
    # Check for duplicated timestamps
    duplicates = df.index.duplicated()
    if duplicates.any():
        print(f"Found {duplicates.sum()} duplicate timestamps")
        # Either keep first occurrence or handle as needed
        df = df[~duplicates]
    
    # Check for missing data
    missing_values = df.isna().sum()
    if missing_values.sum() > 0:
        print(f"Missing values per column:\n{missing_values}")
    
    # Check for zero or negative prices
    if (df[['open', 'high', 'low', 'close']] <= 0).any().any():
        print("Warning: Found zero or negative prices")
    
    # Check for high-low inconsistency
    inconsistent = (df['high'] < df['low']).any()
    if inconsistent:
        print("Warning: Found high < low inconsistencies")
    
    # Check for OHLC inconsistencies
    ohlc_issues = ((df['open'] > df['high']) | 
                   (df['open'] < df['low']) | 
                   (df['close'] > df['high']) | 
                   (df['close'] < df['low']))
    if ohlc_issues.any():
        print(f"Found {ohlc_issues.sum()} OHLC relationship inconsistencies")
    
    # Add derived columns useful for analysis
    df['bar_range'] = df['high'] - df['low']
    df['bar_return'] = df['close'].pct_change()
    
    # Add this dataframe to our list
    all_dfs.append(df)
    print(f"Successfully processed {file_name}")


Processing file: ES 09-24.Last.txt
Successfully loaded file with 92765 lines
Data shape for ES 09-24.Last.txt: (92765, 7)
Successfully processed ES 09-24.Last.txt

Processing file: ES 06-24.Last.txt
Successfully loaded file with 103071 lines
Data shape for ES 06-24.Last.txt: (103071, 7)
Successfully processed ES 06-24.Last.txt

Processing file: ES 09-22.Last.txt
Successfully loaded file with 91763 lines
Data shape for ES 09-22.Last.txt: (91763, 7)
Successfully processed ES 09-22.Last.txt

Processing file: ES_03_22.Last.txt
Successfully loaded file with 92207 lines
Data shape for ES_03_22.Last.txt: (92207, 7)
Successfully processed ES_03_22.Last.txt

Processing file: ES 12-23.Last.txt
Successfully loaded file with 92924 lines
Data shape for ES 12-23.Last.txt: (92924, 7)
Successfully processed ES 12-23.Last.txt

Processing file: ES 06-23.Last.txt
Successfully loaded file with 91705 lines
Data shape for ES 06-23.Last.txt: (91705, 7)
Successfully processed ES 06-23.Last.txt

Processing fi

In [9]:
# Combine all dataframes
if all_dfs:
    combined_df = pd.concat(all_dfs, axis=0)
    
    # Sort by timestamp
    combined_df.sort_index(inplace=True)
    
    print("\n===== Combined Dataset Summary =====")
    print(f"Total records: {len(combined_df)}")
    print(f"Date range: {combined_df.index.min()} to {combined_df.index.max()}")
    print(f"Number of contracts: {combined_df['contract'].nunique()}")
    print(f"Contracts: {', '.join(combined_df['contract'].unique())}")
    print(f"RTH sessions: {(combined_df['session'] == 'RTH').sum()}")
    print(f"ETH sessions: {(combined_df['session'] == 'ETH').sum()}")
    
    print("\nPrice statistics:")
    print(combined_df[['open', 'high', 'low', 'close']].describe())
    
    print("\nVolume statistics:")
    print(combined_df['volume'].describe())
    
    # Save the combined data
    output_path = os.path.join(output_dir, 'combined_es_futures_data.csv')
    combined_df.to_csv(output_path)
    print(f"\nCombined data saved to: {output_path}")
    
    # Optional: Save individual cleaned files
    for contract in combined_df['contract'].unique():
        contract_df = combined_df[combined_df['contract'] == contract]
        contract_output_path = os.path.join(output_dir, f'cleaned_{contract}_data.csv')
        contract_df.to_csv(contract_output_path)
        print(f"Saved {len(contract_df)} records for {contract} to {contract_output_path}")
else:
    print("No valid data was processed from any file.")


===== Combined Dataset Summary =====
Total records: 2373480
Date range: 2021-12-05 16:31:00 to 2025-02-27 16:18:00
Number of contracts: 13
Contracts: ES_03_22, ES 06-22, ES 09-22, ES 12-22, ES 03-23, ES 06-23, ES 09-23, ES 12-23, ES 03-24, ES 06-24, ES 09-24, ES 12-24, ES 03-25
RTH sessions: 675972
ETH sessions: 1697508

Price statistics:
               open          high           low         close
count  2.373480e+06  2.373480e+06  2.373480e+06  2.373480e+06
mean   4.697478e+03  4.698205e+03  4.696745e+03  4.697481e+03
std    7.143690e+02  7.142643e+02  7.144513e+02  7.143553e+02
min    1.250000e+00  3.507500e+03  1.250000e+00  3.503500e+03
25%    4.124250e+03  4.125250e+03  4.123500e+03  4.124250e+03
50%    4.494500e+03  4.495250e+03  4.493750e+03  4.494500e+03
75%    5.282500e+03  5.283000e+03  5.281750e+03  5.282500e+03
max    6.165750e+03  6.166500e+03  6.165250e+03  6.165750e+03

Volume statistics:
count    2.373480e+06
mean     1.086009e+03
std      2.326713e+03
min      1.000