In [None]:
import os
import re
import time
from datetime import datetime
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

print("---------- Now running 8-scrape-odds.py ----------")

### Setup Selenium WebDriver ###
# Run Chrome without a visible window.
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

# Pin the macOS Chrome binary only when it is actually present; on any other
# machine leave binary_location unset so the driver falls back to its default
# browser lookup instead of failing on a non-existent path.
chrome_binary = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
if os.path.exists(chrome_binary):
    options.binary_location = chrome_binary

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

### Load the odds page ###
url = "https://www.wagertalk.com/odds"
try:
    driver.get(url)

    # Fixed pause before reading the page source
    # (presumably to let script-rendered tables appear -- TODO confirm).
    time.sleep(5)

    page_html = driver.page_source
finally:
    # Always shut the browser down, even if loading the page raised,
    # so no orphaned Chrome process is left behind.
    driver.quit()

soup = BeautifulSoup(page_html, "lxml")
# Passing literal HTML to read_html is deprecated; hand it a file-like object.
tables = pd.read_html(StringIO(str(soup)), flavor="lxml")

if not tables:
    raise ValueError("No tables found on the webpage.")

# Only the first table on the page is used.
df = tables[0]
df.reset_index(drop=True, inplace=True)

### Find and extract MLB sections (AL/NL/Interleague) ###
section_titles = ["AMERICAN LEAGUE", "NATIONAL LEAGUE", "INTERLEAGUE"]
section_starts = []

# A section header is a row whose text contains the "[-]" marker together
# with one of the MLB section titles (compared case-insensitively).
for i, row in df.iterrows():
    row_str = str(row.values).upper()
    if "[-]" not in row_str:
        continue
    matched_title = next(
        (title for title in section_titles if title in row_str), None
    )
    if matched_title is not None:
        section_starts.append((i, matched_title))

# Check before concatenating: pd.concat raises its own "No objects to
# concatenate" error on an empty list, which would hide this message.
if not section_starts:
    raise ValueError("No MLB sections (AL/NL/Interleague) found in the scraped data.")

# Each section runs from its header row up to the next MLB header row,
# or to the end of the table for the last one.
# NOTE(review): the last MLB section therefore extends to the end of the
# table; if other sports follow on the page their rows are included too --
# confirm against the page layout.
start_rows = [start_idx for start_idx, _ in section_starts]
end_rows = start_rows[1:] + [len(df)]
section_ranges = list(zip(start_rows, end_rows))

mlb_sections = [df.iloc[start:end] for start, end in section_ranges]
# ignore_index=True already gives the result a fresh 0..n-1 index.
mlb_odds_df = pd.concat(mlb_sections, ignore_index=True)

if mlb_odds_df.empty:
    raise ValueError("No MLB sections (AL/NL/Interleague) found in the scraped data.")

### Extract and Format Date from "Time" Column ###
def parse_date(time_str, year=2025):
    """Turn the leading ``MM/DD`` of *time_str* into a ``YYYY-MM-DD`` string.

    Only the first five characters of *time_str* are examined; anything after
    them is ignored.

    Args:
        time_str: Text starting with a zero-padded month/day such as
            ``"04/15 7:05p"``.
        year: Calendar year to attach to the parsed month and day.
            Defaults to 2025.

    Returns:
        The ISO-formatted date string, or ``None`` when *time_str* is not a
        string, is shorter than five characters, or does not start with a
        valid ``MM/DD`` for the given year.
    """
    if not isinstance(time_str, str) or len(time_str) < 5:
        return None

    date_part = time_str[:5]
    # Parse the year together with the month/day. Parsing "%m/%d" alone
    # validates against strptime's default year (1900), so "02/29" would be
    # rejected even when the target year is a leap year.
    dated_text = f"{date_part}/{year:04d}"
    try:
        parsed = datetime.strptime(dated_text, "%m/%d/%Y")
    except ValueError:
        return None
    return parsed.strftime("%Y-%m-%d")

# Derive the "Date" column only when the scraped table has a "Time" column.
has_time_column = "Time" in mlb_odds_df.columns
if has_time_column:
    game_times = mlb_odds_df["Time"]
    mlb_odds_df["Date"] = game_times.apply(parse_date)

### Extract Runline from "Consensus" Column ###
def extract_runline(consensus_str):
    """Pull the first total-style number out of a consensus cell.

    The number is one or two digits, optionally followed by "½", and must be
    followed (after optional whitespace) by "o", "u", whitespace, or the end
    of the string. A trailing "½" is converted to ".5".

    Returns:
        The number as a float, or 0 when the input is not a string, equals
        "unknown" (case-insensitive), or contains no such number.
    """
    usable = isinstance(consensus_str, str) and consensus_str.lower() != "unknown"
    if not usable:
        return 0

    first_hit = re.search(r'\b(\d{1,2}½?)\s*(?=[ou]|\s|$)', consensus_str)
    if first_hit is None:
        return 0

    number_text = first_hit.group(1)
    return float(number_text.replace("½", ".5"))

# Derive the "Runline" column only when the scraped table has a "Consensus" column.
has_consensus_column = "Consensus" in mlb_odds_df.columns
if has_consensus_column:
    consensus_cells = mlb_odds_df["Consensus"]
    mlb_odds_df["Runline"] = consensus_cells.apply(extract_runline)

### Extract and Normalize Team Names ###
'''team_name_mapping_spring_training = {
    "NY Yankees": "New York Yankees", "NY Mets": "New York Mets",
    "LA Dodgers": "Los Angeles Dodgers", "LA Angels": "Los Angeles Angels",
    "Chi. Cubs": "Chicago Cubs", "Chi. White Sox": "Chicago White Sox",
    "Atlanta": "Atlanta Braves", "Arizona": "Arizona Diamondbacks",
    "Baltimore": "Baltimore Orioles", "Boston": "Boston Red Sox",
    "Cincinnati": "Cincinnati Reds", "Cleveland": "Cleveland Guardians",
    "Colorado": "Colorado Rockies", "Detroit": "Detroit Tigers",
    "Houston": "Houston Astros", "Kansas City": "Kansas City Royals",
    "Milwaukee": "Milwaukee Brewers", "Minnesota": "Minnesota Twins",
    "Oakland": "Oakland Athletics", "Philadelphia": "Philadelphia Phillies",
    "Pittsburgh": "Pittsburgh Pirates", "San Diego": "San Diego Padres",
    "San Francisco": "San Francisco Giants", "Seattle": "Seattle Mariners",
    "St. Louis": "St. Louis Cardinals", "Tampa Bay": "Tampa Bay Rays",
    "Texas": "Texas Rangers", "Toronto": "Toronto Blue Jays",
    "Washington": "Washington Nationals", "Miami": "Miami Marlins"
}'''

# Team abbreviation -> full team name.
team_name_mapping = {
    "NYY": "New York Yankees", "NYM": "New York Mets",
    "LAD": "Los Angeles Dodgers", "LAA": "Los Angeles Angels",
    "CHC": "Chicago Cubs", "CWS": "Chicago White Sox",
    "ATL": "Atlanta Braves", "ARI": "Arizona Diamondbacks",
    "BAL": "Baltimore Orioles", "BOS": "Boston Red Sox",
    "CIN": "Cincinnati Reds", "CLE": "Cleveland Guardians",
    "COL": "Colorado Rockies", "DET": "Detroit Tigers",
    "HOU": "Houston Astros", "KC": "Kansas City Royals",
    "MIL": "Milwaukee Brewers", "MIN": "Minnesota Twins",
    "OAK": "Oakland Athletics", "PHI": "Philadelphia Phillies",
    "PIT": "Pittsburgh Pirates", "SD": "San Diego Padres",
    "SF": "San Francisco Giants", "SEA": "Seattle Mariners",
    "STL": "St. Louis Cardinals", "TB": "Tampa Bay Rays",
    "TEX": "Texas Rangers", "TOR": "Toronto Blue Jays",
    "WSH": "Washington Nationals", "MIA": "Miami Marlins"
}


def extract_teams(teams_str):
    """Split a combined teams cell into (away team, home team) full names.

    The home team is identified as the abbreviation from
    ``team_name_mapping`` that *teams_str* ends with; whatever precedes it
    (with surrounding whitespace removed) is treated as the away team.

    Args:
        teams_str: Text whose trailing characters are the home team's
            abbreviation, e.g. ``"NYY BOS"``.

    Returns:
        A ``(away, home)`` tuple. ``home`` is the mapped full name. ``away``
        is the mapped full name when the leading text is a known
        abbreviation, otherwise the leading text unchanged. Returns
        ``(None, None)`` when *teams_str* is not a string or does not end
        with a known abbreviation.
    """
    if not isinstance(teams_str, str):
        return None, None

    suffix_matches = [abbr for abbr in team_name_mapping if teams_str.endswith(abbr)]
    if not suffix_matches:
        return None, None

    # Prefer the longest matching abbreviation so a short key can never
    # shadow a longer one that shares its ending.
    home_team = max(suffix_matches, key=len)

    # Drop only the trailing abbreviation. str.replace would also delete any
    # earlier occurrence of the same letters inside the away team's text.
    away_team = teams_str[:-len(home_team)].strip()
    return team_name_mapping.get(away_team, away_team), team_name_mapping[home_team]

# Derive the "Away Team" / "Home Team" columns only when a "Teams" column exists.
has_teams_column = "Teams" in mlb_odds_df.columns
if has_teams_column:
    # Wrapping each (away, home) tuple in a Series makes apply() return a
    # two-column frame that is assigned to the two target columns.
    team_pairs = mlb_odds_df["Teams"].apply(
        lambda teams_cell: pd.Series(extract_teams(teams_cell))
    )
    mlb_odds_df[["Away Team", "Home Team"]] = team_pairs

### Load and Merge with test currentdata.csv ###
current_data_path = "model/currentdata.csv"

# Stop with an explicit message when the input CSV is missing.
if not os.path.exists(current_data_path):
    raise FileNotFoundError(f"File not found: {current_data_path}")

current_df = pd.read_csv(current_data_path)

# Convert both date columns to datetimes so they compare on like types below.
current_df["game_date"] = pd.to_datetime(current_df["game_date"])
mlb_odds_df["Date"] = pd.to_datetime(mlb_odds_df["Date"])

# (date, home, away) -> row labels in current_df that have not yet received
# a runline; used when one odds row matches more than one game row.
double_headers = {}

for index, row in mlb_odds_df.iterrows():
    same_date = current_df["game_date"] == row["Date"]
    same_home = current_df["home_name"] == row["Home Team"]
    same_away = current_df["away_name"] == row["Away Team"]
    matches = current_df[same_date & same_home & same_away]

    if matches.empty:
        continue

    if len(matches) == 1:
        current_df.at[matches.index[0], "over_under_runline"] = row["Runline"]
        continue

    # Several games share this date and matchup: hand out the matching rows
    # one at a time, in order, to successive odds rows with the same key.
    key = (row["Date"], row["Home Team"], row["Away Team"])
    remaining = double_headers.setdefault(key, list(matches.index))
    if remaining:
        match_index = remaining.pop(0)
        current_df.at[match_index, "over_under_runline"] = row["Runline"]

out_data_path= "currentdata-test.csv"
current_df.to_csv(out_data_path, index=False)
print("Live runlines successfully merged into currentdata-test.csv!")


---------- Now running 8-scrape-odds.py ----------


  tables = pd.read_html(str(soup), flavor="lxml")


Live runlines successfully merged into currentdata-test.csv!
