# Scraper

In [172]:
# Main Python Imports
import duckdb as dd
import pandas as pd
import time
from io import StringIO

# Selenium Imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

# Scraping Imports
from bs4 import BeautifulSoup
import requests

# Warning filters: raise DeprecationWarning as an exception and silence FutureWarning (intended to quiet pd.read_html deprecation notices)
import warnings
warnings.filterwarnings('error', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)


# Progress bar
import tqdm

In [173]:
# Open the DuckDB database file that holds the project tables.
con = dd.connect(database="project.db")

In [3]:
# Pull every row of the DIMTEAM table into a pandas DataFrame.
team_query = "SELECT * FROM DIMTEAM"
teams = con.sql(team_query).df()

In [4]:
# Machine-specific locations of geckodriver and the Firefox binary.
# NOTE(review): these are absolute paths on one workstation; adjust them when
# running the notebook anywhere else.
GECKODRIVER_PATH = "C:/Users/matth/geckodriver-v0.35.0-win32/geckodriver.exe"
FIREFOX_BINARY = r'C:\Program Files\Mozilla Firefox\firefox.exe'

# The `Service` name imported at the top of the notebook is Chrome's service
# class; geckodriver must be launched through the Firefox service class.
service = webdriver.FirefoxService(executable_path=GECKODRIVER_PATH)

options = webdriver.FirefoxOptions()
options.binary_location = FIREFOX_BINARY
options.add_argument('-headless')  # run without opening a browser window

driver = webdriver.Firefox(service=service, options=options)

In [5]:
# Season labels in "YYYY-YY" form, one per season from 2014-15 through 2024-25.
years = [f"{start}-{(start + 1) % 100:02d}" for start in range(2014, 2025)]

In [6]:
# Roster URL templates for teams whose site does not follow the default
# "<website>sports/mens-basketball/roster/<season>" pattern built by the
# scraping loop. Keys are team names as they appear in the "Name" column of
# `teams`. "YEAR" is a placeholder: the scraping loop replaces it with
# "/" + season, except for Arkansas, where it is replaced with the bare season
# because that template already ends in "?season=".
custom_urls = {
    "University of Kentucky": "https://ukathletics.com/sports/mbball/roster/seasonYEAR",
    "Auburn University": "https://auburntigers.com/sports/mens-basketball/roster/seasonYEAR",
    "Georgia Institute of Technology": "https://ramblinwreck.com/sports/m-baskbl/roster/seasonYEAR",
    "University of Arkansas, Fayetteville": "https://arkansasrazorbacks.com/sport/m-baskbl/roster/?season=YEAR",
    "University of New Mexico": "https://golobos.com/sports/mbball/roster/seasonYEAR",
    "University of Notre Dame": "https://fightingirish.com/sports/mbball/roster/seasonYEAR",
    "University of San Francisco": "https://usfdons.com/sports/mens-basketball/rosterYEAR",
    "Vanderbilt University": "https://vucommodores.com/sports/mbball/roster/seasonYEAR",
    "Pennsylvania State University": "https://gopsusports.com/sports/mens-basketball/roster/seasonYEAR?view=table",
    "San Diego State University": "https://goaztecs.com/sports/mens-basketball/roster/seasonYEAR?view=table",
    "San Jose State University": "https://sjsuspartans.com/sports/mens-basketball/roster/seasonYEAR?view=table",
    "University of South Carolina, Columbia": "https://gamecocksonline.com/sports/mbball/roster/seasonYEAR",
    "Virginia Polytechnic Institute and State University": "https://hokiesports.com/sports/mens-basketball/roster/seasonYEAR?view=table",
    "Brigham Young University": "https://byucougars.com/sports/mens-basketball/roster/seasonYEAR?view=table", 
    "University of Central Florida": "https://ucfknights.com/sports/mens-basketball/roster/seasonYEAR?view=table",
    "Clemson University": "https://clemsontigers.com/sports/mens-basketball/roster/seasonYEAR",
    "University of Iowa": "https://hawkeyesports.com/sports/mbball/roster/seasonYEAR",
    "Louisiana State University": "https://lsusports.net/sports/mb/roster/seasonYEAR",
    "Stanford University": "https://gostanford.com/sports/mens-basketball/roster/seasonYEAR",
    "University of Miami (Florida)": "https://miamihurricanes.com/sports/mbball/roster/seasonYEAR",
    "University of Nebraska-Lincoln": "https://huskers.com/sports/mens-basketball/roster/seasonYEAR",
    "University of Virginia": "https://virginiasports.com/sports/mbball/roster/seasonYEAR"
}

In [7]:
%%time

rosters = {}

pbar = tqdm.tqdm(total = len(teams))

# Regular roster urls
for index, row in teams.iterrows():

    for year in years:

        if row["Name"] in ["Clemson University"]:
            year = year[:4]

        if row["Name"] in ["Brigham Young University"]:
            year = year[:5] + "20" + year[5:]
        
        if row["Name"] in custom_urls.keys():
            if row["Name"] == "University of Arkansas, Fayetteville":
                url = custom_urls[row["Name"]].replace("YEAR", year)
            else:
                url = custom_urls[row["Name"]].replace("YEAR", "/" + year)
        else:
            url = row["website"] + "sports/mens-basketball/roster" + "/" + year
    
        try:
            r = requests.get(url)
            df = pd.read_html(StringIO(r.text))
        except:
            # Try with selenium
            try:
                driver.get(url)
                WebDriverWait(driver, 100).until(lambda driver: driver.execute_script('return document.readyState') == 'complete')
    
                if row["Name"] in ["University of Wyoming", "University of San Diego", "Iowa State University", "Utah State University"]:
    
                    action = ActionChains(driver)
                    action.send_keys(Keys.ENTER).perform()
    
                    time.sleep(2)
                    
                    dropdown = ""
                    if row["Name"] == "University of San Diego":
                        dropdown = "-dropdown"
                        
                    sel_el = driver.find_element(By.XPATH, f"//*[@id=\"sidearm-roster-select-template{dropdown}\"]")
                    selector = Select(sel_el)
                    selector.select_by_visible_text("Roster View - Grid")
    
                    time.sleep(2)
    
                    # Click go to apply
                    driver.find_element(By.XPATH, "//*[@id=\"sidearm-roster-select-template-button\"]").click()
                    time.sleep(2)
                
                df = pd.read_html(StringIO(driver.page_source))
            except:
                try:
                    # Click the grid view button
                    time.sleep(2)
                    driver.find_element(By.XPATH, "//*[@id=\"_viewType_table\"]").click()
                    time.sleep(2)
                    df = pd.read_html(StringIO(driver.page_source))
                except:
                        
                    print("Scraping failed. Url: " + url)
                    
        if row["Name"] not in rosters.keys():
            rosters[row["Name"]] = {}
        
        rosters[row["Name"]][year] = df
        
    pbar.update(1)

pbar.close()

  7%|▋         | 7/101 [03:20<37:45, 24.10s/it]  

Scraping failed. Url: https://www.gobulldogs.com/sports/mens-basketball/roster/2014-15


 41%|████      | 41/101 [33:48<50:02, 50.05s/it]  

Scraping failed. Url: https://gostanford.com/sports/mens-basketball/roster/season/2023-24


 42%|████▏     | 42/101 [34:07<40:01, 40.70s/it]

Scraping failed. Url: https://cuse.com/sports/mens-basketball/roster/2023-24


 45%|████▍     | 45/101 [37:45<51:16, 54.94s/it]  

Scraping failed. Url: https://www.texastech.com/sports/mens-basketball/roster/2014-15


 57%|█████▋    | 58/101 [49:03<24:20, 33.96s/it]  

Scraping failed. Url: https://www.georgiadogs.com/sports/mens-basketball/roster/2014-15


100%|██████████| 101/101 [1:20:14<00:00, 47.67s/it]

CPU times: total: 54.8 s
Wall time: 1h 20min 14s





In [8]:
# Scraping is finished: shut down the Selenium driver session.
driver.quit()

In [147]:
# Column labels whose presence marks a scraped table as the roster table
# (as opposed to coaching-staff or other tables on the same page).
ROSTER_MARKER_COLUMNS = ("Ht.", "Ht", "Height", "HT.", "Pos", "Pos.", "HT", "HGT.")

# rosters_scraped[team] -> one DataFrame with every season's roster stacked,
# tagged with a "Season" column.
rosters_scraped = {}
for team, tables_by_year in rosters.items():
    season_frames = []
    for year, tables in tables_by_year.items():
        # Keep non-empty tables that carry at least one marker column.
        candidates = [
            table for table in tables
            if len(table) > 0
            and any(marker in table.columns for marker in ROSTER_MARKER_COLUMNS)
        ]
        if not candidates:
            continue
        # The first matching table is taken as the roster. `assign` returns a
        # copy, so the frames stored in `rosters` are not modified and this
        # cell can be re-run safely.
        season_frames.append(candidates[0].assign(Season=year))

    if not season_frames:
        # Nothing usable for this team; skip it instead of failing on an
        # empty list lookup.
        print("No roster table found for team: " + team)
        continue

    # One concat per team instead of growing a frame inside the loop.
    rosters_scraped[team] = pd.concat(season_frames)

In [148]:
# Substrings (matched case-insensitively) that mark a column as worth keeping:
# player name, position, jersey number, height, weight, class year and season.
KEEP_COLUMN_KEYWORDS = (
    "name", "pos", "#", "no", "num", "ht", "height",
    "weight", "wt", "yr", "year", "cl", "season",
)

# rosters_update[team] -> the team's stacked roster restricted to kept columns.
rosters_update = {
    team: frame[
        [
            label for label in frame.columns
            if any(keyword in label.lower() for keyword in KEEP_COLUMN_KEYWORDS)
        ]
    ]
    for team, frame in rosters_scraped.items()
}

In [149]:
def coalesced_cols(df, cols):
    """Return, per row, the first non-null value among `cols` as a list.

    Parameters:
        df: DataFrame containing the columns.
        cols: list of column labels, in priority order (leftmost wins).

    Returns:
        A list with one entry per row of `df`. A row whose listed columns are
        all null yields a null entry. When `cols` is empty the list is all
        None rather than raising IndexError.
    """
    if len(cols) == 0:
        return [None for _ in range(len(df))]
    # Back-filling along the row pulls the first non-null value into the
    # leftmost column, which is then read off.
    return list(df[cols].bfill(axis=1).iloc[:, 0])

In [150]:
def _columns_matching(frame, keywords, exclude=()):
    """Return the column labels of `frame` that contain any of `keywords`
    (case-insensitive) and none of the `exclude` substrings."""
    matches = []
    for label in frame:
        lowered = label.lower()
        if any(k in lowered for k in keywords) and not any(x in lowered for x in exclude):
            matches.append(label)
    return matches


def _single_field(frame, columns):
    """Collapse `columns` of `frame` into one value per row.

    No column -> all None; one column -> that column as-is; several columns ->
    first non-null value per row via `coalesced_cols`.
    """
    if len(columns) == 0:
        return [None for _ in range(len(frame))]
    if len(columns) == 1:
        return frame[columns[0]]
    return coalesced_cols(frame, columns)


# rosters_coalesced[team] -> DataFrame with the uniform columns
# Name, #, Height, Weight, Year, Season, Team.
rosters_coalesced = {}
for team, df in rosters_update.items():
    new_df = {
        # Names
        "Name": _single_field(df, _columns_matching(df, ("name",))),
        # Nums
        "#": _single_field(df, _columns_matching(df, ("num", "no", "#"))),
        # Height: "weight" contains the substring "ht", so weight columns are
        # excluded explicitly to keep weights out of the Height field.
        "Height": _single_field(
            df, _columns_matching(df, ("ht", "height"), exclude=("weight",))
        ),
        # Weight
        "Weight": _single_field(df, _columns_matching(df, ("weight", "wt"))),
        # Year (class year, e.g. columns labelled Yr / Year / Cl.)
        "Year": _single_field(df, _columns_matching(df, ("year", "yr", "cl"))),
        "Season": df["Season"],
    }

    team_df = pd.DataFrame(new_df)
    team_df["Team"] = team

    rosters_coalesced[team] = team_df

In [175]:
# Final column order of the combined roster table.
cols = ["Name", "#", "Height", "Weight", "Season", "Year", "Team"]

# Stack every team's roster in one concat. The previous loop concatenated
# against an undefined `test_df` and overwrote `roster_df` on each pass, so at
# most the last team survived.
team_frames = list(rosters_coalesced.values())
if team_frames:
    roster_df = pd.concat(team_frames, ignore_index=True).reindex(columns=cols)
else:
    # pd.concat rejects an empty list; fall back to an empty frame.
    roster_df = pd.DataFrame(columns=cols)

# Drop rows without a player name.
roster_df = roster_df[roster_df["Name"].notna()]

In [174]:
# Persist the combined roster as the ROSTERS table, adding a sequential row id.
# DuckDB resolves `roster_df` to the in-scope pandas DataFrame of that name.
# NOTE(review): the original statement was cut off after `ROW_NUMBER(`; the
# `ID` alias is an assumption - rename it to match the intended schema.
con.sql(
    """
    CREATE OR REPLACE TABLE ROSTERS AS
    SELECT *, ROW_NUMBER() OVER () AS ID FROM roster_df
    """
)

In [170]:
# Close the DuckDB connection opened at the top of the notebook.
con.close()