In [1]:
import pandas as pd

# --- 1. Load CSV ---
# Read the candle file, parse 'timestamp' as datetimes, and order the rows
# chronologically with a fresh 0..n-1 index in a single chained expression.
file_path = 'data/processed/GAS/combined_data.csv'  # <- Update this path
df = (
    pd.read_csv(file_path, parse_dates=['timestamp'])
    .sort_values('timestamp')
    .reset_index(drop=True)
)

# --- 2. Sort and Check Columns ---
# Every later check reads these columns, so stop early if any is absent.
required_cols = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
missing_cols = [name for name in required_cols if name not in df.columns]
if len(missing_cols) > 0:
    raise ValueError(f"Missing columns: {missing_cols}")
print("✅ Basic columns present.")

# --- 3. Check for Missing Values ---
# Count the null cells in each required column (one entry per column).
missing_values = df[required_cols].isna().sum()
print("\n🕳️ Missing values per column:", missing_values, sep="\n")

# --- 4. Check for Missing Minutes ---
# Build the complete one-minute grid between the first and last timestamp
# and compare it, as a set, with the timestamps actually present.
start_time = df['timestamp'].min()
end_time = df['timestamp'].max()

# 'min' is the supported alias for a one-minute step; the older 'T' alias
# is deprecated and emits a FutureWarning on pandas >= 2.2.
expected_minutes = pd.date_range(start=start_time, end=end_time, freq='min')
actual_minutes = pd.DatetimeIndex(df['timestamp'])

# Grid points with no matching row are gaps in the data.
missing_minutes = expected_minutes.difference(actual_minutes)
# Rows whose timestamp is not on the grid anchored at start_time
# (for example, a timestamp that is not a whole number of minutes after it).
extra_minutes = actual_minutes.difference(expected_minutes)

print("\n🕒 Time Range Check")
print("Start:", start_time)
print("End:  ", end_time)
print("Expected rows:", len(expected_minutes))
# len(actual_minutes) counts rows, so repeated timestamps are included here
# even though the set differences above ignore repeats.
print("Actual rows:  ", len(actual_minutes))
print("❌ Missing minutes:", len(missing_minutes))
print("⚠️ Extra minutes:", len(extra_minutes))

# --- 5. Price Movement Analysis ---
# A candle shows no movement when high, low and close all equal the open,
# i.e. all four prices are the same value.
df['no_price_change'] = (
    df[['high', 'low', 'close']].eq(df['open'], axis=0).all(axis=1)
)
no_movement = df.loc[df['no_price_change']]
print(f"\n📉 Minutes with no price movement: {len(no_movement)}")

# --- 6. Invalid Prices ---
# Keep every row in which at least one of the four prices is <= 0.
has_bad_price = df[['open', 'high', 'low', 'close']].le(0).any(axis=1)
invalid_prices = df.loc[has_bad_price]
print(f"🚫 Rows with zero or negative prices: {len(invalid_prices)}")

# --- 7. Volume Analysis ---
# Rows where no volume was recorded for the minute.
zero_volume = df[df['volume'] == 0]
print(f"\n🔇 Minutes with zero volume: {len(zero_volume)}")

# Check: zero volume but price moved
# "Price moved" must be the exact negation of the no-movement flag computed
# in section 5 (all four of open/high/low/close equal). The previous inline
# test left out the open == high comparison, so a zero-volume row with
# open == close and high == low at a different level was not reported.
volume_with_price_move = df[
    (df['volume'] == 0) & ~df['no_price_change']
]
print(f"⚠️ Zero volume with price movement: {len(volume_with_price_move)}")

# --- 8. Summary ---
# Each label carries its own trailing padding so the values line up in a
# single column when printed.
summary_rows = [
    ("Total rows in DataFrame:             ", len(df)),
    ("Missing values found:                ", missing_values.sum()),
    ("Missing minutes (timestamp gaps):    ", len(missing_minutes)),
    ("No price movement candles:           ", len(no_movement)),
    ("Zero or negative prices:             ", len(invalid_prices)),
    ("Zero volume minutes:                 ", len(zero_volume)),
    ("Zero volume + price movement:        ", len(volume_with_price_move)),
]
print("\n📋 Summary:")
for label, value in summary_rows:
    print(f"{label}{value}")

# --- Optional Samples ---
# Show at most the first ten offending timestamps of each kind.
if len(missing_minutes) > 0:
    print("\n❗ Sample missing minutes:")
    print(missing_minutes[:10])

if len(extra_minutes) > 0:
    print("\n❗ Sample extra minutes:")
    print(extra_minutes[:10])


✅ Basic columns present.

🕳️ Missing values per column:
timestamp    0
open         0
high         0
low          0
close        0
volume       0
dtype: int64

🕒 Time Range Check
Start: 2020-01-01 00:00:00+00:00
End:   2024-12-31 23:59:00+00:00
Expected rows: 2630880
Actual rows:   2630880
❌ Missing minutes: 0
⚠️ Extra minutes: 0

📉 Minutes with no price movement: 1357264
🚫 Rows with zero or negative prices: 0

🔇 Minutes with zero volume: 1140052
⚠️ Zero volume with price movement: 0

📋 Summary:
Total rows in DataFrame:             2630880
Missing values found:                0
Missing minutes (timestamp gaps):    0
No price movement candles:           1357264
Zero or negative prices:             0
Zero volume minutes:                 1140052
Zero volume + price movement:        0
