# 6. Microstructure Mean Reversion Analysis

This notebook analyzes high-frequency mean reversion patterns in NVDA data, focusing on:
- Mean reversion analysis across different timeframes (1-min, 5-min, 15-min)
- Bid-ask bounce patterns using OHLC spread proxies
- Overreaction/correction cycles after large price moves
- Half-life of price dislocations and optimal entry/exit timing


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.stattools import adfuller

# Load the cleaned data
all_data = pd.read_csv('combined_nvda_ohlc_clean.csv')
all_data['date'] = pd.to_datetime(all_data['date'])
all_data['datetime'] = pd.to_datetime(all_data['datetime'])

# Sort by datetime
all_data_sorted = all_data.sort_values('datetime').reset_index(drop=True)

print(f"Loaded data shape: {all_data.shape}")
print(f"Date range: {all_data['date'].min()} to {all_data['date'].max()}")


Loaded data shape: (865782, 13)
Date range: 2021-01-04 00:00:00 to 2025-05-30 00:00:00


In [2]:
# Create different timeframe datasets
def create_timeframe_data(data, freq):
    """Resample data to different frequencies"""
    data_copy = data.copy()
    data_copy.set_index('datetime', inplace=True)
    
    # Resample OHLC data
    resampled = data_copy.resample(freq).agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
        'transactions': 'sum'
    }).dropna()
    
    # Calculate returns
    resampled['returns'] = resampled['close'].pct_change()
    
    # Calculate HL spread
    resampled['hl_spread'] = resampled['high'] - resampled['low']
    resampled['hl_spread_pct'] = (resampled['hl_spread'] / resampled['close']) * 100
    
    return resampled

# Create datasets for different timeframes
data_1min = all_data_sorted.copy()
data_1min['returns'] = data_1min['close'].pct_change()
data_1min['hl_spread'] = data_1min['high'] - data_1min['low']
data_1min['hl_spread_pct'] = (data_1min['hl_spread'] / data_1min['close']) * 100

data_5min = create_timeframe_data(all_data_sorted, '5T')
data_15min = create_timeframe_data(all_data_sorted, '15T')

print(f"1-minute data: {len(data_1min)} observations")
print(f"5-minute data: {len(data_5min)} observations") 
print(f"15-minute data: {len(data_15min)} observations")


1-minute data: 865782 observations
5-minute data: 199944 observations
15-minute data: 69363 observations


  resampled = data_copy.resample(freq).agg({
  resampled = data_copy.resample(freq).agg({


## Mean Reversion Analysis

Analyze mean reversion characteristics across different timeframes.


In [3]:
def analyze_mean_reversion(data, name):
    """
    Comprehensive mean reversion analysis
    """
    print(f"\n=== Mean Reversion Analysis: {name} ===")
    
    returns = data['returns'].dropna()
    
    if len(returns) < 50:
        print(f"Insufficient data for {name}")
        return None
    
    results = {}
    
    # 1. Autocorrelation analysis
    lags = [1, 2, 3, 5, 10]
    autocorrs = []
    
    for lag in lags:
        autocorr = returns.autocorr(lag=lag)
        autocorrs.append(autocorr)
        print(f"Lag-{lag} Autocorrelation: {autocorr:.4f}")
    
    results['autocorrelations'] = dict(zip(lags, autocorrs))
    
    # 2. Half-life calculation using AR(1) model
    # Create lagged returns
    lagged_returns = returns.shift(1).dropna()
    current_returns = returns[1:].values
    
    if len(current_returns) > 20:
        # Fit AR(1): r_t = α + β*r_{t-1} + ε_t
        X = lagged_returns.values.reshape(-1, 1)
        y = current_returns
        
        model = LinearRegression()
        model.fit(X, y)
        
        beta = model.coef_[0]
        alpha = model.intercept_
        
        # Half-life calculation: HL = -ln(2)/ln(|β|) for |β| < 1
        if abs(beta) < 1 and beta != 0:
            half_life = -np.log(2) / np.log(abs(beta))
            print(f"AR(1) coefficient (β): {beta:.4f}")
            print(f"Half-life: {half_life:.2f} periods")
            
            # Convert to time units
            if 'min' in name.lower():
                if '1' in name:
                    hl_minutes = half_life * 1
                elif '5' in name:
                    hl_minutes = half_life * 5
                elif '15' in name:
                    hl_minutes = half_life * 15
                print(f"Half-life in minutes: {hl_minutes:.1f}")
        else:
            half_life = np.inf
            print("No mean reversion detected (β >= 1)")
        
        results['ar1_beta'] = beta
        results['ar1_alpha'] = alpha
        results['half_life'] = half_life
    
    # 3. ADF test for stationarity
    try:
        adf_result = adfuller(returns.dropna())
        results['adf_statistic'] = adf_result[0]
        results['adf_pvalue'] = adf_result[1]
        print(f"ADF Test - Statistic: {adf_result[0]:.4f}, p-value: {adf_result[1]:.4f}")
        
        if adf_result[1] < 0.05:
            print("Series is stationary (mean-reverting)")
        else:
            print("Series is non-stationary")
    except:
        print("ADF test failed")
    
    return results

# Analyze mean reversion for different timeframes
mr_results = {}
mr_results['1min'] = analyze_mean_reversion(data_1min, "1-Minute")
#mr_results['5min'] = analyze_mean_reversion(data_5min, "5-Minute") 
#mr_results['15min'] = analyze_mean_reversion(data_15min, "15-Minute")



=== Mean Reversion Analysis: 1-Minute ===
Lag-1 Autocorrelation: -0.0395
Lag-2 Autocorrelation: -0.0132
Lag-3 Autocorrelation: -0.0038
Lag-5 Autocorrelation: -0.0044
Lag-10 Autocorrelation: 0.0053
AR(1) coefficient (β): -0.0395
Half-life: 0.21 periods
Half-life in minutes: 0.2


  res = np.dot(np.transpose(vt), np.multiply(s[:, np.newaxis],
  res = np.dot(np.transpose(vt), np.multiply(s[:, np.newaxis],
  res = np.dot(np.transpose(vt), np.multiply(s[:, np.newaxis],
  beta = np.dot(self.pinv_wexog, self.wendog)
  beta = np.dot(self.pinv_wexog, self.wendog)
  beta = np.dot(self.pinv_wexog, self.wendog)


ADF test failed


## Bid-Ask Bounce Analysis

Analyze bid-ask bounce patterns using OHLC spread as proxy for market microstructure effects.


In [4]:
def analyze_bid_ask_bounce(data, name):
    """
    Analyze bid-ask bounce patterns using OHLC spread as proxy
    """
    print(f"\n=== Bid-Ask Bounce Analysis: {name} ===")
    
    clean_data = data.dropna()
    
    if len(clean_data) < 50:
        print(f"Insufficient data for {name}")
        return None
    
    results = {}
    
    # 1. Calculate spread statistics
    hl_spread = clean_data['hl_spread']
    hl_spread_pct = clean_data['hl_spread_pct']
    
    print(f"Mean HL Spread: ${hl_spread.mean():.4f}")
    print(f"Median HL Spread: ${hl_spread.median():.4f}")
    print(f"Std HL Spread: ${hl_spread.std():.4f}")
    print(f"Mean HL Spread %: {hl_spread_pct.mean():.4f}%")
    
    results['spread_stats'] = {
        'mean': hl_spread.mean(),
        'median': hl_spread.median(),
        'std': hl_spread.std(),
        'mean_pct': hl_spread_pct.mean()
    }
    
    # 2. Analyze relationship between spread and volume
    if 'volume' in clean_data.columns:
        correlation = clean_data['hl_spread_pct'].corr(clean_data['volume'])
        print(f"Spread-Volume Correlation: {correlation:.4f}")
        results['spread_volume_corr'] = correlation
    
    # 3. Detect bounce patterns: high spread followed by low spread
    # Define high spread as > 75th percentile, low spread as < 25th percentile
    spread_75th = hl_spread_pct.quantile(0.75)
    spread_25th = hl_spread_pct.quantile(0.25)
    
    print(f"75th percentile spread: {spread_75th:.4f}%")
    print(f"25th percentile spread: {spread_25th:.4f}%")
    
    # Find bounce patterns (high spread followed by low spread within next few periods)
    high_spread_mask = hl_spread_pct > spread_75th
    bounce_count = 0
    total_high_spread = high_spread_mask.sum()
    
    for i in range(len(clean_data) - 3):
        if high_spread_mask.iloc[i]:
            # Check if any of the next 3 periods have low spread
            next_3_periods = hl_spread_pct.iloc[i+1:i+4]
            if (next_3_periods < spread_25th).any():
                bounce_count += 1
    
    bounce_rate = bounce_count / total_high_spread if total_high_spread > 0 else 0
    print(f"Bounce patterns detected: {bounce_count}/{total_high_spread} ({bounce_rate:.2%})")
    
    results['bounce_patterns'] = {
        'bounce_count': bounce_count,
        'total_high_spread': total_high_spread,
        'bounce_rate': bounce_rate
    }
    
    return results

# Analyze bid-ask bounce for different timeframes
bounce_results = {}
bounce_results['1min'] = analyze_bid_ask_bounce(data_1min, "1-Minute")
bounce_results['5min'] = analyze_bid_ask_bounce(data_5min, "5-Minute")
bounce_results['15min'] = analyze_bid_ask_bounce(data_15min, "15-Minute")



=== Bid-Ask Bounce Analysis: 1-Minute ===
Insufficient data for 1-Minute

=== Bid-Ask Bounce Analysis: 5-Minute ===
Mean HL Spread: $0.1526
Median HL Spread: $0.0680
Std HL Spread: $0.2735
Mean HL Spread %: 0.2706%
Spread-Volume Correlation: 0.5800
75th percentile spread: 0.3501%
25th percentile spread: 0.0806%
Bounce patterns detected: 1782/49986 (3.56%)

=== Bid-Ask Bounce Analysis: 15-Minute ===
Mean HL Spread: $0.2678
Median HL Spread: $0.1200
Std HL Spread: $0.4523
Mean HL Spread %: 0.4816%
Spread-Volume Correlation: 0.5887
75th percentile spread: 0.6203%
25th percentile spread: 0.1589%
Bounce patterns detected: 1000/17341 (5.77%)
