# ESPN NCAA In-State Minutes Scraper

This notebook:
1. Retrieves the list of Division I men’s basketball teams from ESPN.
2. Constructs the roster and stats URLs for the **2024–2025** season.
3. Scrapes each player’s hometown and total minutes played.
4. Outputs a combined DataFrame.

Note: ESPN may not yet provide data for the 2024–2025 season; if it does not, adjust the season year (`2025`) used in the roster and stats URLs.

In [None]:
# !conda install --yes numexpr=2.8.4 bottleneck=1.3.6
!conda install -c conda-forge bottleneck=1.3.6 numexpr=2.8.4


Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: | 
Found conflicts! Looking for incompatible packages.
This can take several minutes.  Press CTRL-C to abort.
Examining conflict for bokeh panel hvplot dask datashader anaconda intake holo/ 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

def get_soup(url, timeout=10):
    """
    Download a page and return it parsed as a BeautifulSoup object.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so one stalled connection
            would hang the whole scrape.

    Returns:
        BeautifulSoup: the response text parsed with the "html.parser" backend.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection errors or timeouts.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/88.0.4324.150 Safari/537.36"
        )
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def parse_hometown(text):
    """
    Extract a (city, state) pair from a hometown string.

    Anything from the first " (" onward is ignored, and the remainder is
    divided at its first comma, e.g.
    "Durham, NC (Northern HS)" -> ("Durham", "NC").

    Returns:
        Tuple (city, state), both stripped of surrounding whitespace, or
        ("", "") when the location part contains no comma.
    """
    # Keep only what precedes the parenthetical, e.g. "Durham, NC".
    location = text.partition(" (")[0]
    city, comma, state = location.partition(",")
    if not comma:
        return "", ""
    return city.strip(), state.strip()

In [None]:
def parse_team_main_page():
    """
    Parse the ESPN teams page and collect each team's name and page URL.

    City/state information is not extracted from this page, so the
    "team_city" and "team_state" fields are always empty strings.

    Returns:
        List of dicts, one per distinct team URL, in page order:
        [
            {
                "team_name": str,
                "team_url": str (absolute URL),
                "team_city": "",
                "team_state": ""
            },
            ...
        ]

    Raises:
        Whatever get_soup raises when the teams page cannot be fetched.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    base_url = "https://www.espn.com"
    teams_page_url = "https://www.espn.com/mens-college-basketball/teams"
    soup = get_soup(teams_page_url)

    teams_data = []
    seen_urls = set()
    for block in soup.select("section.TeamLinks"):
        for link in block.select("a.AnchorLink"):
            href = link.get("href", "")
            # The main team link looks like
            # "/mens-college-basketball/team/_/id/XXX/team-name". Requiring the
            # "/team/_/" segment keeps other anchors under ".../team/" from
            # being recorded as teams.
            # NOTE(review): assumes per-team sub-page links use
            # "/team/<page>/_/id/..." -- confirm against the live markup.
            if "/mens-college-basketball/team/_/" not in href:
                continue

            # An anchor without text (e.g. one wrapping only an image) cannot
            # supply a team name, so it is skipped.
            team_name = link.get_text(strip=True)
            if not team_name:
                continue

            # urljoin handles both site-relative and already-absolute hrefs,
            # whereas plain concatenation would corrupt the latter.
            full_url = urljoin(base_url, href)
            if full_url in seen_urls:
                # Scrape each team only once even if it is linked repeatedly.
                continue
            seen_urls.add(full_url)

            teams_data.append({
                "team_name": team_name,
                "team_url": full_url,
                "team_city": "",
                "team_state": ""
            })
    return teams_data

def parse_roster_page(team_url, season=2025):
    """
    Scrape a team's roster page and map each player to a hometown.

    Args:
        team_url: The team's main page URL, with or without a trailing slash.
        season: Season value placed in the roster URL. Defaults to 2025,
            which this notebook uses for the 2024-25 season.

    Returns:
        Dict[str, (str, str)] -> {player_name: (home_city, home_state)}.
        Players that share a name overwrite one another; the last row wins.

    Raises:
        Whatever get_soup raises when the roster page cannot be fetched.
    """
    # NOTE(review): the URL layout below is a guess at ESPN's structure and
    # could change -- confirm it still resolves to the roster page.
    roster_url = f"{team_url.rstrip('/')}/roster/_/season/{season}"

    soup = get_soup(roster_url)

    player_map = {}
    for row in soup.select("table.Table tbody tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue

        # The player name is expected as a link in the second cell.
        player_name_link = cells[1].find("a")
        if not player_name_link:
            continue
        player_name = player_name_link.get_text(strip=True)

        # Hometown is assumed to be at index 6; rows with fewer cells fall
        # back to their last cell.
        hometown_cell = cells[6] if len(cells) > 6 else cells[-1]
        hometown_cell_text = hometown_cell.get_text(strip=True)

        player_map[player_name] = parse_hometown(hometown_cell_text)

    return player_map

def parse_stats_page(team_url, season=2025):
    """
    Scrape a team's stats page and map each player to minutes played.

    Every table with a "MIN" column header is read. When a player appears in
    more than one such table, the value from the last table wins.

    Args:
        team_url: The team's main page URL, with or without a trailing slash.
        season: Season value placed in the stats URL. Defaults to 2025,
            which this notebook uses for the 2024-25 season.

    Returns:
        Dict[str, float] -> {player_name: minutes_played}. A value that
        cannot be parsed as a number is recorded as 0.0.

    Raises:
        Whatever get_soup raises when the stats page cannot be fetched.
    """
    # NOTE(review): the URL layout below is a guess at ESPN's structure --
    # confirm it still resolves to the stats page.
    stats_url = f"{team_url.rstrip('/')}/stats/_/season/{season}"

    soup = get_soup(stats_url)
    player_minutes = {}

    for table in soup.select("table.Table"):
        headers = [
            th.get_text(strip=True).upper()
            for th in table.select("thead tr th")
        ]
        if "MIN" not in headers:
            continue
        min_index = headers.index("MIN")

        for row in table.select("tbody tr"):
            cells = row.find_all("td")
            if len(cells) <= min_index:
                continue
            # Rows without a player link in the same row are skipped.
            # NOTE(review): if the page puts names and numbers in separate
            # tables, no row here will carry a link -- verify on a live page.
            name_link = row.select_one("a.AnchorLink")
            if not name_link:
                continue
            player_name = name_link.get_text(strip=True)

            # Drop thousands separators so a value such as "1,024" parses
            # instead of silently falling through to 0.0.
            minutes_text = cells[min_index].get_text(strip=True).replace(",", "")
            try:
                minutes_val = float(minutes_text)
            except ValueError:
                # Best effort: placeholders such as "--" or "" count as zero.
                minutes_val = 0.0

            player_minutes[player_name] = minutes_val

    return player_minutes

In [None]:
def main():
    """
    Scrape every team's roster and stats and show the combined table.

    For each team returned by parse_team_main_page, the roster (hometowns)
    and stats (minutes) are fetched and joined on player name; players with
    no stats entry get 0.0 minutes. A team whose roster or stats page fails
    is reported and skipped. The first 50 rows of the resulting DataFrame
    are displayed.

    Returns:
        None
    """
    teams_data = parse_team_main_page()
    all_rows = []

    for index, team_info in enumerate(teams_data):
        # Be kind to ESPN's servers: pause before every team after the first.
        # Sleeping here rather than at the end of the loop body means teams
        # skipped on error are throttled too.
        if index:
            time.sleep(1)

        team_name = team_info["team_name"]
        team_url = team_info["team_url"]
        team_city = team_info["team_city"]
        team_state = team_info["team_state"]

        # Scrape roster. The broad catch is deliberate: one team's failure
        # (network or parsing) must not abort the whole run.
        try:
            roster_map = parse_roster_page(team_url)
        except Exception as e:
            print(f"Could not parse roster for {team_name}: {e}")
            continue

        # Scrape stats
        try:
            stats_map = parse_stats_page(team_url)
        except Exception as e:
            print(f"Could not parse stats for {team_name}: {e}")
            continue

        # Combine data: one output row per rostered player.
        for player_name, (hometown, home_state) in roster_map.items():
            all_rows.append({
                "team_name": team_name,
                "team_city": team_city,
                "team_state": team_state,
                "player": player_name,
                "minutes": stats_map.get(player_name, 0.0),
                "hometown": hometown,
                "home_state": home_state
            })

        print(f"Completed scraping for team: {team_name}")

    # Create final DataFrame; explicit columns keep the layout stable even
    # when no rows were collected.
    df = pd.DataFrame(all_rows, columns=[
        "team_name",
        "team_city",
        "team_state",
        "player",
        "minutes",
        "hometown",
        "home_state"
    ])

    preview = df.head(50)
    try:
        # `display` is provided by IPython/Jupyter only.
        display(preview)
    except NameError:
        # Plain Python interpreter: fall back to a text rendering.
        print(preview)

    # Optional: save to CSV
    # df.to_csv("espn_ncaa_instate_minutes.csv", index=False)

# Run the scraper when this cell executes. This performs live HTTP requests
# to ESPN for the teams page and for every team's roster and stats pages.
main()