# Scrape Stathead Data

### Import Libraries

In [None]:
/additional_web_scraping_scripts
/data
/stathead_credentials
/.ipynb_checkpoints


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import time
import os
import getpass

### Read in Stathead Credentials

In [1]:
# Load every line of the credentials file; the lines are parsed in a later cell.
with open('./stathead_credentials/stathead_credentials.txt', "r") as credentials_file:
    creds = list(credentials_file)

In [8]:
# Each credential line is read as "<key>=<value>"; the first line supplies the
# username and the second the password (file layout assumed from this
# indexing -- TODO confirm against the actual credentials file).
# split('=', 1) keeps any '=' that is part of the value itself, and strip()
# removes the trailing newline left by readlines() -- otherwise the newline is
# typed into the login form along with the credential.
stats_username = creds[0].split('=', 1)[1].strip()
stats_password = creds[1].split('=', 1)[1].strip()

### Scrape from Stathead

In [None]:
def login_to_stathead(driver, username, password):
    """Log in to Stathead through its login form.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        username: Value typed into the username/e-mail field.
        password: Value typed into the password field.

    Returns:
        True if the page shown after submitting contains "logout" or
        "account" (case-insensitive); False otherwise, or if any step raised
        an exception (the error is printed, not re-raised).
    """
    print("Attempting to log in to Stathead...")

    wait = WebDriverWait(driver, 15)

    try:
        # Go to the login page first
        login_url = "https://stathead.com/users/login.cgi"
        driver.get(login_url)
        time.sleep(3)

        print("Looking for login form...")

        # Candidate selectors for the username field, in priority order
        username_selectors = [
            "input[name='username']",
            "input[name='email']",
            "input[type='email']",
            "input[id='username']",
            "input[id='email']",
        ]

        # Wait once for *any* candidate to appear instead of blocking for the
        # full timeout on each selector in turn.
        try:
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ", ".join(username_selectors))
                )
            )
        except Exception:
            raise Exception("Username field not found") from None

        # Pick the match belonging to the highest-priority selector.
        username_field = None
        for selector in username_selectors:
            matches = driver.find_elements(By.CSS_SELECTOR, selector)
            if matches:
                username_field = matches[0]
                break

        if username_field is None:
            raise Exception("Username field not found")

        # Find password field
        password_field = driver.find_element(By.CSS_SELECTOR, "input[type='password']")

        # Enter credentials
        username_field.clear()
        username_field.send_keys(username)
        password_field.clear()
        password_field.send_keys(password)

        # Submit the form by pressing Enter in the password field
        password_field.send_keys(Keys.RETURN)
        time.sleep(5)

        # Heuristic success check on the resulting page.
        # NOTE(review): "account" may also appear on a failed-login page;
        # confirm this check against the live site.
        page_text = driver.page_source.lower()
        if 'logout' in page_text or 'account' in page_text:
            print("✅ Login successful!")
            return True

        print("❌ Login may have failed")
        return False

    except Exception as e:
        print(f"❌ Login error: {e}")
        return False

def find_next_page_button(driver):
    """Locate a usable "next page" control on the current page.

    A list of XPath expressions is tried in order, from specific (link text,
    title) to generic (any link whose href contains ``offset=`` or ``page=``).
    The first element that is displayed, enabled, not marked ``disabled`` in
    its class, and does not look like a "previous page" control is returned.

    Args:
        driver: Selenium WebDriver positioned on a results page.

    Returns:
        The matching WebElement, or None when no candidate qualifies.
    """
    next_page_selectors = [
        # Text-based searches
        "//a[contains(text(), 'Next')]",
        "//button[contains(text(), 'Next')]",
        "//a[contains(text(), 'next')]",
        "//button[contains(text(), 'next')]",
        "//a[contains(text(), 'Next Page')]",
        "//button[contains(text(), 'Next Page')]",

        # Common pagination patterns
        "//a[@title='Next Page']",
        "//button[@title='Next Page']",
        "//a[@title='Next']",
        "//button[@title='Next']",

        # Class-based searches (common pagination classes)
        "//a[contains(@class, 'next')]",
        "//button[contains(@class, 'next')]",
        "//a[contains(@class, 'page-next')]",
        "//button[contains(@class, 'page-next')]",

        # Arrow symbols
        "//a[contains(text(), '→')]",
        "//button[contains(text(), '→')]",
        "//a[contains(text(), '>')]",
        "//button[contains(text(), '>')]",

        # Rel attribute (HTML standard for pagination)
        "//a[@rel='next']",

        # Generic fallbacks: any link carrying a paging query parameter
        "//a[contains(@href, 'offset=')]",
        "//a[contains(@href, 'page=')]",
    ]

    for selector in next_page_selectors:
        try:
            for element in driver.find_elements(By.XPATH, selector):
                # Must be visible and clickable
                if not (element.is_displayed() and element.is_enabled()):
                    continue

                classes = (element.get_attribute('class') or '').lower()
                if 'disabled' in classes:
                    continue

                # The generic fallbacks also match "Previous" links. On a last
                # page that has no Next link, returning one would send the
                # caller backwards and keep it paging until its page limit.
                # (Presumably Stathead's Previous link carries offset= in its
                # href -- confirm against the live page.)
                label = " ".join([
                    element.text or '',
                    element.get_attribute('title') or '',
                    element.get_attribute('rel') or '',
                    classes,
                ]).lower()
                if 'prev' in label:
                    continue

                return element
        except Exception:
            # A failing selector or stale element: move on to the next selector.
            continue

    return None

def scrape_page_table(driver, page_num):
    """Extract the data rows of the main table on the current page.

    The table is located by trying several CSS selectors in order; when a
    selector matches more than one element, the one with the most ``<tr>``
    descendants is used. Rows without any ``<td>`` cell (header rows) are
    skipped.

    Args:
        driver: Selenium WebDriver positioned on a results page.
        page_num: Page number, used only in progress/error messages.

    Returns:
        A list of rows, each a list of stripped cell strings. Empty when no
        table is found.
    """
    print(f"Scraping page {page_num}...")

    time.sleep(2)  # Give page time to fully load

    # Find the main data table
    table_selectors = [
        "#stats",
        "#results",
        ".stats_table",
        ".sortable",
        "table[id*='stats']",
        "table[class*='stats']",
    ]

    table = None
    for selector in table_selectors:
        try:
            tables = driver.find_elements(By.CSS_SELECTOR, selector)
            if tables:
                table = max(tables, key=lambda t: len(t.find_elements(By.TAG_NAME, "tr")))
                break
        except Exception:
            continue

    if not table:
        print(f"❌ No table found on page {page_num}")
        return []

    # Prefer the rows inside <tbody>; fall back to every row of the table.
    # Header-only rows are filtered out in the loop below either way.
    try:
        tbody = table.find_element(By.TAG_NAME, "tbody")
        rows = tbody.find_elements(By.TAG_NAME, "tr")
    except Exception:
        rows = table.find_elements(By.TAG_NAME, "tr")

    data_rows = []
    for i, row in enumerate(rows):
        try:
            # Skip rows that carry no data cells (e.g. repeated header rows)
            if not row.find_elements(By.TAG_NAME, "td"):
                continue

            # Take <th> and <td> children together, in document order. A
            # leading row-header <th> (presumably Stathead's rank column --
            # confirm against the live page) is counted among the header
            # cells, so dropping it would shift every value one column left.
            cells = row.find_elements(By.XPATH, "./th | ./td")

            row_data = []
            for cell in cells:
                cell_text = cell.text.strip()

                # Fall back to the text of the first link in the cell
                if not cell_text:
                    links = cell.find_elements(By.TAG_NAME, "a")
                    if links:
                        cell_text = links[0].text.strip()

                # An empty cell stays empty. The 'data-stat' attribute is not
                # used as a value: get_table_headers uses it as a column
                # *name*, so it identifies the column, not its content.
                row_data.append(cell_text)

            if row_data:
                data_rows.append(row_data)

        except Exception as e:
            print(f"Error processing row {i} on page {page_num}: {e}")
            continue

    print(f"✅ Page {page_num}: extracted {len(data_rows)} rows")
    return data_rows

def get_table_headers(driver):
    """Extract column names from the main table on the current page.

    The table is located with the same selector list used for scraping rows.
    Names come from the ``<th>`` cells of the last row in ``<thead>``; a cell
    with no visible text falls back to its ``data-stat`` or ``aria-label``
    attribute. If the header cannot be read, generic ``Column_N`` names are
    generated from the width of the first data row.

    Args:
        driver: Selenium WebDriver positioned on a results page.

    Returns:
        A list of header strings; empty when no table or no columns are found.
    """
    table_selectors = [
        "#stats",
        "#results",
        ".stats_table",
        ".sortable",
        "table[id*='stats']",
        "table[class*='stats']",
    ]

    table = None
    for selector in table_selectors:
        try:
            tables = driver.find_elements(By.CSS_SELECTOR, selector)
            if tables:
                # Several matches: keep the table with the most rows
                table = max(tables, key=lambda t: len(t.find_elements(By.TAG_NAME, "tr")))
                break
        except Exception:
            continue

    if not table:
        return []

    headers = []
    try:
        thead = table.find_element(By.TAG_NAME, "thead")
        header_rows = thead.find_elements(By.TAG_NAME, "tr")
        if not header_rows:
            # Treat an empty <thead> like a missing one: use the fallback below
            raise LookupError("thead contains no rows")

        header_row = header_rows[-1]  # Last header row usually has column names
        for cell in header_row.find_elements(By.TAG_NAME, "th"):
            header_text = cell.text.strip()
            if not header_text:
                header_text = cell.get_attribute('data-stat') or cell.get_attribute('aria-label') or ''
            headers.append(header_text)
    except Exception:
        # Fallback: use first data row structure to determine column count
        try:
            first_data_row = table.find_element(By.XPATH, ".//tr[td]")
            cells = first_data_row.find_elements(By.TAG_NAME, "td")
            headers = [f"Column_{i+1}" for i in range(len(cells))]
        except Exception:
            headers = []

    return headers

def _dedupe_column_names(names):
    """Return names with repeated entries made unique by a ``.N`` suffix."""
    seen = {}
    unique = []
    for name in names:
        count = seen.get(name, 0)
        unique.append(name if count == 0 else f"{name}.{count}")
        seen[name] = count + 1
    return unique


def _rows_to_dataframe(all_data, headers):
    """Build a cleaned, rectangular DataFrame from scraped rows and headers."""
    # Size the frame to the widest of headers and rows. Sizing to the header
    # list alone makes the DataFrame constructor fail on any longer row.
    max_cols = max(len(headers), max(len(row) for row in all_data))

    # Pad headers if necessary
    columns = list(headers)
    while len(columns) < max_cols:
        columns.append(f"Column_{len(columns) + 1}")

    # Repeated names would make df[name] return a DataFrame, which has no
    # .str accessor, so make every name unique before building the frame.
    columns = _dedupe_column_names(columns)

    # Pad data rows if necessary
    padded_rows = [row + [""] * (max_cols - len(row)) for row in all_data]

    df = pd.DataFrame(padded_rows, columns=columns)

    print("Cleaning data...")
    # dropna only removes columns/rows that are entirely NaN/None; cells
    # holding an empty string are kept.
    df = df.dropna(axis=1, how='all')
    df = df.dropna(axis=0, how='all')

    # Strip whitespace
    for col in df.select_dtypes(include=['object']):
        df[col] = df[col].astype(str).str.strip()

    return df


def scrape_all_pages(username, password, url,
                     output_file="nfl_qb_stats_all_pages.csv", max_pages=1000):
    """Log in, then scrape every results page by clicking Next until it is gone.

    Args:
        username: Stathead username passed to the login step.
        password: Stathead password passed to the login step.
        url: URL of the first results page.
        output_file: CSV path the full result is written to.
        max_pages: Safety limit on the number of pages visited.

    Returns:
        A pandas DataFrame with all scraped rows, or None when login fails,
        no headers or rows are found, or an error occurs. On error, rows
        collected so far are written to ``nfl_qb_stats_partial.csv`` and a
        screenshot to ``debug_screenshot.png`` (both best-effort).

    Side effects:
        Opens a visible Chrome window, prints progress, writes the CSV, and
        waits for Enter on stdin before closing the browser.
    """
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    # Browser is left visible (not headless) for debugging
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

    all_data = []
    headers = []

    try:
        # Step 1: Login
        if not login_to_stathead(driver, username, password):
            print("Login failed, cannot continue")
            return None

        # Step 2: Navigate to first page
        print("Navigating to data page...")
        driver.get(url)
        time.sleep(5)

        # Step 3: Get headers from first page
        print("Extracting table headers...")
        headers = get_table_headers(driver)
        if not headers:
            print("❌ Could not extract table headers")
            return None

        shown_headers = f"{headers[:5]}..." if len(headers) > 5 else f"{headers}"
        print(f"Found {len(headers)} columns: {shown_headers}")

        # Step 4: Scrape all pages
        page_num = 1
        pages_scraped = 0  # counted separately so the summary is exact

        while page_num <= max_pages:
            print(f"\n--- PAGE {page_num} ---")

            # Scrape current page
            page_data = scrape_page_table(driver, page_num)
            pages_scraped += 1
            if page_data:
                all_data.extend(page_data)
                print(f"Total rows collected so far: {len(all_data)}")
            else:
                print(f"No data found on page {page_num}")

            # Look for Next Page button
            print("Looking for Next Page button...")
            next_button = find_next_page_button(driver)

            if not next_button:
                print("🏁 No Next Page button found - reached end of data")
                break

            try:
                button_text = next_button.text.strip() or next_button.get_attribute('title') or 'Next'
                print(f"✅ Found Next Page button: '{button_text}'")

                # Scroll to button and click
                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                time.sleep(1)

                next_button.click()
                print("✅ Clicked Next Page button")

                # Wait for next page to load
                time.sleep(3)

                page_num += 1

            except Exception as e:
                print(f"❌ Error clicking Next Page button: {e}")
                break

        if page_num > max_pages:
            print(f"⚠️ Reached maximum page limit ({max_pages})")

        print(f"\n✅ Scraping complete! Total pages: {pages_scraped}")
        print(f"Total rows collected: {len(all_data)}")

        # Step 5: Create DataFrame
        if not all_data:
            print("❌ No data collected")
            return None

        df = _rows_to_dataframe(all_data, headers)

        print(f"✅ Final dataset: {len(df)} rows × {len(df.columns)} columns")

        # Display sample
        print("\nFirst 5 rows:")
        print(df.head())

        print("\nLast 5 rows:")
        print(df.tail())

        # Save to CSV
        df.to_csv(output_file, index=False)
        print(f"\n💾 All data saved to: {output_file}")

        return df

    except Exception as e:
        print(f"❌ Error during multi-page scraping: {e}")

        # Save whatever data we have (best-effort; report why if it fails)
        if all_data:
            try:
                pd.DataFrame(all_data).to_csv("nfl_qb_stats_partial.csv", index=False)
                print(f"💾 Partial data saved (rows: {len(all_data)})")
            except Exception as save_error:
                print(f"Could not save partial data: {save_error}")

        # Screenshot for debugging (best-effort)
        try:
            driver.save_screenshot("debug_screenshot.png")
            print("Screenshot saved: debug_screenshot.png")
        except Exception as screenshot_error:
            print(f"Could not save screenshot: {screenshot_error}")

        return None

    finally:
        # Without a usable stdin, input() raises EOFError; the browser must
        # still be closed in that case.
        try:
            input("\nPress Enter to close browser...")
        except EOFError:
            pass
        driver.quit()

def main():
    """Scrape the hard-coded Stathead query with the loaded credentials.

    Reads the module-level ``stats_username`` / ``stats_password``, runs
    ``scrape_all_pages`` on the player-game-finder URL below, and prints a
    summary of the result (or a failure message when nothing was returned).
    """
    print("=== Stathead Multi-Page Table Scraper ===")

    # Query to scrape
    url = "https://stathead.com/football/player-game-finder.cgi?request=1&match=player_game&order_by=pass_rating&year_min=2006&year_max=2024&week_num_season_min=1&week_num_season_max=18&ccomp%5B2%5D=gt&cval%5B2%5D=1&cstat%5B2%5D=pass_att"

    # Credentials parsed earlier at module level
    username, password = stats_username, stats_password

    # Guard clause: both credentials must be non-empty
    if not (username and password):
        print("Username and password are required!")
        return

    print(f"\nStarting multi-page scrape for user: {username}")
    print("This may take several minutes depending on the number of pages...")

    df = scrape_all_pages(username, password, url)

    # Guard clause: the scraper returns None on any failure
    if df is None:
        print("\n❌ Multi-page scraping failed")
        return

    print(f"\n🎉 SUCCESS! Scraped {len(df)} total rows across all pages")
    print(f"Data shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Extra statistic, only when a 'Player' column is present
    if 'Player' in df.columns:
        print(f"Unique players: {df['Player'].nunique()}")

# Entry point: run the scraper only when this module's __name__ is "__main__".
if __name__ == "__main__":
    main()