In [5]:
# polymarket_arb_scanner.py

import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
import time
from typing import Dict, List, Optional
import json
from itertools import combinations

class PolymarketArbScanner:
    """
    Scan all Polymarket markets for arbitrage opportunities.
    Handles both binary and multi-outcome markets.
    """
    
    GAMMA_API = "https://gamma-api.polymarket.com"
    CLOB_API = "https://clob.polymarket.com"
    
    def __init__(self, min_liquidity: float = 50, min_profit_pct: float = 1.0):
        self.min_liquidity = min_liquidity
        self.min_profit_pct = min_profit_pct
        self.cache_dir = Path("data/arb_scanner")
        self.cache_dir.mkdir(parents=True, exist_ok=True)
    
    def fetch_all_active_markets(self) -> List[Dict]:
        """
        Fetch all active markets from Gamma API.
        """
        try:
            all_markets = []
            offset = 0
            limit = 100
            
            while True:
                resp = requests.get(
                    f"{self.GAMMA_API}/markets",
                    params={
                        "closed": "false",
                        "limit": limit,
                        "offset": offset,
                    },
                    timeout=30
                )
                resp.raise_for_status()
                markets = resp.json()
                
                if not markets:
                    break
                
                all_markets.extend(markets)
                offset += limit
                
                if len(markets) < limit:
                    break
                
                time.sleep(0.2)
            
            print(f"Fetched {len(all_markets)} active markets")
            return all_markets
            
        except Exception as e:
            print(f"Error fetching markets: {e}")
            return []
    
    def fetch_clob_token_book(self, token_id: str) -> Optional[Dict]:
        """
        Fetch order book for a specific CLOB token.
        """
        try:
            resp = requests.get(
                f"{self.CLOB_API}/book",
                params={"token_id": token_id},
                timeout=10
            )
            
            if resp.status_code != 200:
                return None
            
            book = resp.json()
            
            bids = book.get('bids', [])
            asks = book.get('asks', [])
            
            if not bids or not asks:
                return None
            
            best_bid_price = float(bids[0]['price'])
            best_bid_size = float(bids[0]['size'])
            
            best_ask_price = float(asks[0]['price'])
            best_ask_size = float(asks[0]['size'])
            
            return {
                'token_id': token_id,
                'best_bid': best_bid_price,
                'best_ask': best_ask_price,
                'bid_size': best_bid_size,
                'ask_size': best_ask_size,
            }
            
        except Exception as e:
            return None
    
    def check_binary_market_arbitrage(self, market: Dict, token_books: List[Dict]) -> Optional[Dict]:
        """
        Check binary market (2 outcomes) for arbitrage.
        For binary: sum of asks should be < 1.0
        """
        if len(token_books) != 2:
            return None
        
        book_0, book_1 = token_books[0], token_books[1]
        
        ask_0 = book_0['best_ask']
        ask_1 = book_1['best_ask']
        ask_size_0 = book_0['ask_size']
        ask_size_1 = book_1['ask_size']
        
        # Validate prices
        if ask_0 <= 0 or ask_0 >= 1 or ask_1 <= 0 or ask_1 >= 1:
            return None
        
        # Combined ask
        combined_ask = ask_0 + ask_1
        
        # Check for arbitrage
        if combined_ask >= (1.0 - self.min_profit_pct / 100):
            return None
        
        # Available pairs
        available_pairs = min(ask_size_0, ask_size_1)
        capital_required = available_pairs * combined_ask
        
        if capital_required < self.min_liquidity:
            return None
        
        # Calculate profit
        profit_per_pair = 1.0 - combined_ask
        profit_pct = (profit_per_pair / combined_ask) * 100
        total_profit = available_pairs * profit_per_pair
        
        return {
            'type': 'binary',
            'tokens': [book_0['token_id'], book_1['token_id']],
            'asks': [ask_0, ask_1],
            'combined_ask': combined_ask,
            'available_pairs': available_pairs,
            'capital_required': capital_required,
            'profit_per_pair': profit_per_pair,
            'profit_pct': profit_pct,
            'total_profit': total_profit,
        }
    
    def check_multi_outcome_arbitrage(self, market: Dict, token_books: List[Dict]) -> Optional[Dict]:
        """
        Check multi-outcome market (3+ outcomes) for arbitrage.
        For multi-outcome: sum of all asks should be < 1.0
        
        Strategy: Buy ALL outcome tokens. Exactly one will pay $1, others pay $0.
        Cost = sum of all asks
        Payout = $1 (guaranteed)
        Profit = $1 - sum(asks)
        """
        if len(token_books) < 3:
            return None
        
        # Calculate sum of all asks
        asks = [book['best_ask'] for book in token_books]
        ask_sizes = [book['ask_size'] for book in token_books]
        
        # Validate all prices
        if any(ask <= 0 or ask >= 1 for ask in asks):
            return None
        
        combined_ask = sum(asks)
        
        # Check for arbitrage
        if combined_ask >= (1.0 - self.min_profit_pct / 100):
            return None
        
        # Available "sets" - limited by smallest token
        # For multi-outcome, need 1 share of EACH token
        available_sets = min(ask_sizes)
        capital_required = available_sets * combined_ask
        
        if capital_required < self.min_liquidity:
            return None
        
        # Calculate profit
        profit_per_set = 1.0 - combined_ask
        profit_pct = (profit_per_set / combined_ask) * 100
        total_profit = available_sets * profit_per_set
        
        return {
            'type': 'multi_outcome',
            'num_outcomes': len(token_books),
            'tokens': [book['token_id'] for book in token_books],
            'asks': asks,
            'combined_ask': combined_ask,
            'available_sets': available_sets,
            'capital_required': capital_required,
            'profit_per_set': profit_per_set,
            'profit_pct': profit_pct,
            'total_profit': total_profit,
        }
    
    def check_market_for_arbitrage(self, market: Dict) -> Optional[Dict]:
        """
        Check any market type for arbitrage opportunities.
        Works for binary (2 outcomes) and multi-outcome (3+ outcomes).
        """
        clob_token_ids = market.get('clobTokenIds', [])
        
        # Need at least 2 outcomes
        if len(clob_token_ids) < 2:
            return None
        
        # Fetch all token order books
        token_books = []
        for token_id in clob_token_ids:
            book = self.fetch_clob_token_book(token_id)
            if book:
                token_books.append(book)
            time.sleep(0.05)  # Small delay between token fetches
        
        # Need all tokens to have books
        if len(token_books) != len(clob_token_ids):
            return None
        
        # Check for arbitrage based on market type
        if len(token_books) == 2:
            arb = self.check_binary_market_arbitrage(market, token_books)
        else:
            arb = self.check_multi_outcome_arbitrage(market, token_books)
        
        if not arb:
            return None
        
        # Add market metadata
        end_date_iso = market.get('end_date_iso')
        if end_date_iso:
            try:
                end_dt = pd.to_datetime(end_date_iso)
                now = pd.Timestamp.now(tz='UTC')
                hours_to_resolution = (end_dt - now).total_seconds() / 3600
                
                if hours_to_resolution < 0:
                    return None
            except:
                hours_to_resolution = None
        else:
            hours_to_resolution = None
        
        # Capital efficiency
        if hours_to_resolution and hours_to_resolution > 0:
            efficiency_score = arb['total_profit'] / hours_to_resolution
        else:
            efficiency_score = 0
        
        # Combine arbitrage details with market info
        result = {
            'market_slug': market.get('slug', ''),
            'market_title': market.get('question', '')[:80],
            'condition_id': market.get('condition_id', ''),
            'market_type': arb['type'],
            'num_outcomes': arb.get('num_outcomes', 2),
            'hours_to_resolution': hours_to_resolution,
            'efficiency_score': efficiency_score,
            'end_date': end_date_iso,
            **arb  # Merge arbitrage details
        }
        
        return result
    
    def scan_all_markets(self, sample_size: Optional[int] = None) -> pd.DataFrame:
        """
        Scan all (or sample) markets for arbitrage.
        """
        print(f"\n{'='*80}")
        print(f"POLYMARKET ARBITRAGE SCANNER")
        print(f"{'='*80}")
        print(f"Started: {datetime.now()}")
        print(f"Min liquidity: ${self.min_liquidity}")
        print(f"Min profit: {self.min_profit_pct}%\n")
        
        markets = self.fetch_all_active_markets()
        
        if not markets:
            print("No markets found")
            return pd.DataFrame()
        
        if sample_size:
            markets = markets[:sample_size]
            print(f"Scanning sample of {len(markets)} markets\n")
        
        opportunities = []
        errors = 0
        skipped_binary = 0
        skipped_multi = 0
        
        for i, market in enumerate(markets):
            if i % 50 == 0 and i > 0:
                print(f"Progress: {i}/{len(markets)} | Found: {len(opportunities)} | "
                      f"Binary checked: {skipped_binary} | Multi checked: {skipped_multi} | Errors: {errors}")
            
            try:
                num_outcomes = len(market.get('clobTokenIds', []))
                
                if num_outcomes == 2:
                    skipped_binary += 1
                elif num_outcomes > 2:
                    skipped_multi += 1
                
                arb = self.check_market_for_arbitrage(market)
                
                if arb:
                    opportunities.append(arb)
                    print(f"\n  ✓ ARBITRAGE FOUND!")
                    print(f"    Market: {arb['market_slug'][:50]}")
                    print(f"    Type: {arb['market_type']} ({arb['num_outcomes']} outcomes)")
                    print(f"    Combined ask: {arb['combined_ask']:.4f}")
                    print(f"    Profit: {arb['profit_pct']:.2f}% (${arb['total_profit']:.2f})")
                    print(f"    Capital: ${arb['capital_required']:.2f}")
            
            except Exception as e:
                errors += 1
                if errors <= 5:
                    print(f"  ✗ Error: {market.get('slug', 'unknown')[:30]}: {str(e)[:50]}")
            
            time.sleep(0.15)
        
        print(f"\n{'='*80}")
        print(f"SCAN COMPLETE")
        print(f"{'='*80}")
        print(f"Markets scanned: {len(markets)}")
        print(f"  Binary markets: {skipped_binary}")
        print(f"  Multi-outcome markets: {skipped_multi}")
        print(f"Opportunities found: {len(opportunities)}")
        print(f"Errors: {errors}")
        
        if not opportunities:
            return pd.DataFrame()
        
        df = pd.DataFrame(opportunities)
        df = df.sort_values('efficiency_score', ascending=False)
        
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_file = self.cache_dir / f"scan_{timestamp}.csv"
        df.to_csv(output_file, index=False)
        print(f"\nResults saved: {output_file}")
        
        return df
    
    def print_top_opportunities(self, df: pd.DataFrame, top_n: int = 20):
        """
        Print formatted summary of opportunities.
        """
        if df.empty:
            print("\n❌ No arbitrage opportunities found")
            return
        
        print(f"\n{'='*80}")
        print(f"TOP {min(top_n, len(df))} OPPORTUNITIES (by capital efficiency)")
        print(f"{'='*80}\n")
        
        for i, (_, opp) in enumerate(df.head(top_n).iterrows(), 1):
            print(f"{i}. {opp['market_title']}")
            print(f"   Type: {opp['market_type']} ({opp['num_outcomes']} outcomes)")
            print(f"   Slug: {opp['market_slug']}")
            print(f"   Combined Ask: {opp['combined_ask']:.4f}")
            print(f"   Profit: {opp['profit_pct']:.2f}% (${opp['total_profit']:.2f})")
            print(f"   Capital: ${opp['capital_required']:.2f}")
            
            if pd.notna(opp['hours_to_resolution']):
                hours = opp['hours_to_resolution']
                if hours < 1:
                    print(f"   Resolves: {hours*60:.0f} min")
                elif hours < 24:
                    print(f"   Resolves: {hours:.1f} hr")
                else:
                    print(f"   Resolves: {hours/24:.1f} days")
                print(f"   Efficiency: ${opp['efficiency_score']:.2f}/hr")
            print()
        
        # Summary
        print(f"{'='*80}")
        print(f"SUMMARY")
        print(f"{'='*80}")
        print(f"Total opportunities: {len(df)}")
        print(f"  Binary: {len(df[df['market_type'] == 'binary'])}")
        print(f"  Multi-outcome: {len(df[df['market_type'] == 'multi_outcome'])}")
        print(f"\nTotal profit potential: ${df['total_profit'].sum():.2f}")
        print(f"Total capital needed: ${df['capital_required'].sum():.2f}")
        print(f"Weighted avg profit: {(df['total_profit'].sum() / df['capital_required'].sum() * 100):.2f}%")


# Run
if __name__ == "__main__":
    scanner = PolymarketArbScanner(
        min_liquidity=50,
        min_profit_pct=1.0
    )
    
    # Test with sample
    print("Testing with first 100 markets...")
    results = scanner.scan_all_markets(sample_size=100)
    scanner.print_top_opportunities(results, top_n=10)

Testing with first 100 markets...

POLYMARKET ARBITRAGE SCANNER
Started: 2026-02-19 09:11:24.115302
Min liquidity: $50
Min profit: 1.0%

Fetched 29787 active markets
Scanning sample of 100 markets

Progress: 50/100 | Found: 0 | Binary checked: 0 | Multi checked: 50 | Errors: 0

SCAN COMPLETE
Markets scanned: 100
  Binary markets: 0
  Multi-outcome markets: 100
Opportunities found: 0
Errors: 0

❌ No arbitrage opportunities found


In [6]:
# Strategy exploration for BTC 5-minute markets

import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

def analyze_predictive_patterns():
    """
    Look for patterns in order book behavior that predict outcomes.
    The idea: Maybe certain book imbalances or price movements early in the 
    window predict the final outcome better than the current mid-price.
    """
    
    results = []
    
    for slug in list(resolutions.keys())[:100]:  # Sample 100 markets
        # Load book data
        book_path = Path(f"data/book_snapshots/{slug}.parquet")
        if not book_path.exists():
            continue
        
        books = pd.read_parquet(book_path)
        books['timestamp_dt'] = pd.to_datetime(books['exchange_timestamp'], unit='ms', utc=True)
        
        # Get outcome
        if slug not in resolutions:
            continue
        
        token_outcomes = resolutions[slug]['token_outcomes']
        token_ids = list(token_outcomes.keys())
        
        if len(token_ids) != 2:
            continue
        
        # Identify YES/NO tokens
        yes_token = [tid for tid in token_ids if token_outcomes[tid] == 1.0][0]
        no_token = [tid for tid in token_ids if token_outcomes[tid] == 0.0][0]
        
        # Separate by token
        yes_books = books[books['asset_id'] == yes_token].sort_values('timestamp_dt')
        no_books = books[books['asset_id'] == no_token].sort_values('timestamp_dt')
        
        if yes_books.empty or no_books.empty:
            continue
        
        # Calculate features at different time windows
        for window_end_pct in [0.2, 0.4, 0.6, 0.8]:  # First 20%, 40%, 60%, 80%
            cutoff_idx = int(len(yes_books) * window_end_pct)
            
            if cutoff_idx < 10:
                continue
            
            yes_window = yes_books.iloc[:cutoff_idx]
            no_window = no_books.iloc[:cutoff_idx]
            
            # Feature: Average mid price
            yes_mid = ((yes_window['bid_price_1'] + yes_window['ask_price_1']) / 2).mean()
            no_mid = ((no_window['bid_price_1'] + no_window['ask_price_1']) / 2).mean()
            
            # Feature: Order book imbalance (bid size / ask size)
            yes_imbalance = (yes_window['bid_size_1'] / yes_window['ask_size_1']).replace([np.inf, -np.inf], np.nan).mean()
            no_imbalance = (no_window['bid_size_1'] / no_window['ask_size_1']).replace([np.inf, -np.inf], np.nan).mean()
            
            # Feature: Spread
            yes_spread = (yes_window['ask_price_1'] - yes_window['bid_price_1']).mean()
            no_spread = (no_window['ask_price_1'] - no_window['bid_price_1']).mean()
            
            # Feature: Price volatility
            yes_vol = ((yes_window['bid_price_1'] + yes_window['ask_price_1']) / 2).std()
            no_vol = ((no_window['bid_price_1'] + no_window['ask_price_1']) / 2).std()
            
            # Prediction: Which side wins?
            prediction = 1 if yes_mid > no_mid else 0
            actual = 1  # YES token won (by definition since we filtered)
            correct = (prediction == actual)
            
            results.append({
                'slug': slug,
                'window_pct': window_end_pct,
                'yes_mid': yes_mid,
                'no_mid': no_mid,
                'yes_imbalance': yes_imbalance,
                'no_imbalance': no_imbalance,
                'yes_spread': yes_spread,
                'no_spread': no_spread,
                'yes_vol': yes_vol,
                'no_vol': no_vol,
                'prediction': prediction,
                'actual': actual,
                'correct': correct,
                'edge': abs(yes_mid - no_mid),
            })
    
    return pd.DataFrame(results)

# Run pattern analysis
print("Analyzing predictive patterns in order book data...")
patterns_df = analyze_predictive_patterns()

if not patterns_df.empty:
    print(f"\n{'='*80}")
    print(f"PREDICTIVE PATTERN ANALYSIS")
    print(f"{'='*80}")
    
    # Accuracy by window
    print(f"\nPrediction accuracy by time window:")
    for window_pct in sorted(patterns_df['window_pct'].unique()):
        window_data = patterns_df[patterns_df['window_pct'] == window_pct]
        accuracy = window_data['correct'].mean()
        n = len(window_data)
        print(f"  Using first {int(window_pct*100):2d}% of window: {accuracy:5.1%} accuracy (n={n})")
    
    # Can we improve by filtering on "edge"?
    print(f"\nAccuracy when YES clearly ahead (edge > 0.10):")
    for window_pct in sorted(patterns_df['window_pct'].unique()):
        window_data = patterns_df[
            (patterns_df['window_pct'] == window_pct) &
            (patterns_df['edge'] > 0.10)
        ]
        if not window_data.empty:
            accuracy = window_data['correct'].mean()
            n = len(window_data)
            print(f"  At {int(window_pct*100):2d}% mark: {accuracy:5.1%} accuracy (n={n})")
    
    # Order book imbalance signal?
    print(f"\nDoes order book imbalance predict outcomes?")
    for window_pct in [0.2, 0.4, 0.6]:
        window_data = patterns_df[patterns_df['window_pct'] == window_pct].copy()
        
        # Try different imbalance thresholds
        for threshold in [1.2, 1.5, 2.0]:
            strong_buy_pressure = window_data[
                (window_data['yes_imbalance'] > threshold) &
                (window_data['no_imbalance'] < 1/threshold)
            ]
            
            if not strong_buy_pressure.empty:
                accuracy = strong_buy_pressure['correct'].mean()
                n = len(strong_buy_pressure)
                avg_profit = (strong_buy_pressure['yes_mid'] - 0.5).mean()
                print(f"  {int(window_pct*100)}% window, imbalance >{threshold:.1f}x: "
                      f"{accuracy:.1%} accuracy (n={n}), avg entry {strong_buy_pressure['yes_mid'].mean():.3f}")


# Strategy 2: Mean reversion on extremes
def analyze_mean_reversion():
    """
    Check if prices that hit extremes (0.10 or 0.90) tend to revert.
    """
    
    reversions = []
    
    for slug in list(resolutions.keys())[:100]:
        book_path = Path(f"data/book_snapshots/{slug}.parquet")
        if not book_path.exists():
            continue
        
        books = pd.read_parquet(book_path)
        books['timestamp_dt'] = pd.to_datetime(books['exchange_timestamp'], unit='ms', utc=True)
        
        if slug not in resolutions:
            continue
        
        token_outcomes = resolutions[slug]['token_outcomes']
        token_ids = list(token_outcomes.keys())
        
        if len(token_ids) != 2:
            continue
        
        yes_token = [tid for tid in token_ids if token_outcomes[tid] == 1.0]
        if not yes_token:
            continue
        yes_token = yes_token[0]
        
        yes_books = books[books['asset_id'] == yes_token].sort_values('timestamp_dt')
        
        if yes_books.empty or len(yes_books) < 20:
            continue
        
        yes_books['mid'] = (yes_books['bid_price_1'] + yes_books['ask_price_1']) / 2
        
        # Find if price hit extreme in first 80%
        cutoff_idx = int(len(yes_books) * 0.8)
        early_books = yes_books.iloc[:cutoff_idx]
        late_books = yes_books.iloc[cutoff_idx:]
        
        # Check for extremes
        hit_low = (early_books['mid'] < 0.15).any()
        hit_high = (early_books['mid'] > 0.85).any()
        
        if hit_low:
            entry_price = early_books[early_books['mid'] < 0.15]['mid'].iloc[0]
            exit_price = late_books['mid'].mean() if not late_books.empty else entry_price
            outcome = 1.0  # YES won
            
            # If we bought at extreme low, what happened?
            reversions.append({
                'slug': slug,
                'extreme': 'low',
                'entry': entry_price,
                'exit': exit_price,
                'outcome': outcome,
                'profit': outcome - entry_price,
            })
        
        if hit_high:
            # Sold YES (bought NO) when YES was expensive
            entry_price = early_books[early_books['mid'] > 0.85]['mid'].iloc[0]
            exit_price = late_books['mid'].mean() if not late_books.empty else entry_price
            outcome = 1.0  # YES won
            
            # We bet AGAINST YES at high prices
            reversions.append({
                'slug': slug,
                'extreme': 'high',
                'entry': entry_price,
                'exit': exit_price,
                'outcome': outcome,
                'profit': entry_price - outcome,  # We bet against, so profit inverted
            })
    
    return pd.DataFrame(reversions)

print(f"\n\n{'='*80}")
print("MEAN REVERSION ANALYSIS")
print(f"{'='*80}")

reversion_df = analyze_mean_reversion()

if not reversion_df.empty:
    print(f"\nBuying at extreme lows (<0.15):")
    low_buys = reversion_df[reversion_df['extreme'] == 'low']
    if not low_buys.empty:
        print(f"  Opportunities: {len(low_buys)}")
        print(f"  Avg entry: {low_buys['entry'].mean():.3f}")
        print(f"  Avg profit: ${low_buys['profit'].mean():.2f}")
        print(f"  Win rate: {(low_buys['profit'] > 0).mean():.1%}")
        print(f"  Total P&L: ${low_buys['profit'].sum():.2f}")
    
    print(f"\nSelling at extreme highs (>0.85):")
    high_sells = reversion_df[reversion_df['extreme'] == 'high']
    if not high_sells.empty:
        print(f"  Opportunities: {len(high_sells)}")
        print(f"  Avg entry: {high_sells['entry'].mean():.3f}")
        print(f"  Avg profit: ${high_sells['profit'].mean():.2f}")
        print(f"  Win rate: {(high_sells['profit'] > 0).mean():.1%}")
        print(f"  Total P&L: ${high_sells['profit'].sum():.2f}")

Analyzing predictive patterns in order book data...


NameError: name 'resolutions' is not defined