# Database Setup

In [1]:
# Main Python Imports
import os
import time
from io import StringIO

import duckdb as dd
import pandas as pd

# Selenium Imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait

# Scraping Imports
import requests
from bs4 import BeautifulSoup

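# Escalate DeprecationWarning to an exception so deprecated API usage fails loudly.
# NOTE(review): 'error' raises rather than silences; use 'ignore' if suppression was the intent.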
import warnings
warnings.filterwarnings('error', category=DeprecationWarning)

In [2]:
# Open the DuckDB database file that will hold the project's tables
DATABASE_FILE = 'project.db'
con = dd.connect(DATABASE_FILE)

In [3]:
# Launch a headless Firefox session through geckodriver.
# The driver and browser locations default to the original local install paths but can be
# overridden with the GECKODRIVER_PATH / FIREFOX_BINARY environment variables, so the
# notebook can run on another machine without editing this cell.
geckodriver_path = os.environ.get(
    "GECKODRIVER_PATH",
    "C:/Users/matth/geckodriver-v0.35.0-win32/geckodriver.exe",
)
firefox_binary = os.environ.get(
    "FIREFOX_BINARY",
    r'C:\Program Files\Mozilla Firefox\firefox.exe',
)

# Use the Firefox Service class: geckodriver is being driven here, so the Chrome
# Service wrapper is the wrong match for webdriver.Firefox.
service = FirefoxService(executable_path=geckodriver_path)
options = webdriver.FirefoxOptions()
options.binary_location = firefox_binary
options.add_argument('-headless')  # run without opening a browser window
driver = webdriver.Firefox(service=service, options=options)

In [4]:
# Load the NCAA member directory page (division=I, sportCode=MBB) that lists the team links
NCAA_MEMBER_LIST_URL = "https://web3.ncaa.org/directory/memberList?type=12&division=I&sportCode=MBB"
driver.get(NCAA_MEMBER_LIST_URL)

In [5]:
# Walk every page of the paginated member table, collecting one DataFrame per page.
colleges = []

while True:
    # Pause before reading so the table has time to (re)render
    time.sleep(1)
    next_element = driver.find_element(By.ID, "memberListTable_next")

    # Snapshot the DOM once so the parsed table and the scraped links are guaranteed
    # to come from the same page state (reading page_source twice could straddle a redraw)
    page_html = driver.page_source

    # Get the table
    full_table = pd.read_html(StringIO(page_html))[0]

    # Get links for each college.
    # The parser is named explicitly, matching the other BeautifulSoup call in this notebook.
    # The [::2] stride keeps every other anchor of the first table
    # (presumably each row carries two links -- verify against the page markup).
    soup = BeautifulSoup(page_html, 'html.parser')
    full_table["relative_link"] = [a['href'] for a in soup.find('table').find_all('a')][::2]

    colleges.append(full_table)

    # Stop once the "next" control is marked disabled; guard against a missing class attribute
    if "disabled" in (next_element.get_attribute("class") or ""):
        break

    # Advance to the next page with a JavaScript click on the "next" control
    driver.execute_script("arguments[0].click();", next_element)
    time.sleep(1)


In [6]:
# Conferences whose teams are kept
TARGET_CONFERENCES = [
    "Big East Conference",
    "Big Ten Conference",
    "Big 12 Conference",
    "Atlantic Coast Conference",
    "Southeastern Conference",
    "Mountain West Conference",
    "West Coast Conference",
]

# Directory columns removed before the table is stored
UNUSED_COLUMNS = ["Division", "Reclass Division", "Public/Private", "HBCU", "Region"]

# Stack the per-page tables. ignore_index=True gives every row a unique label;
# without it each page's own 0..k labels repeat in the combined frame.
power_bball = pd.concat(colleges, ignore_index=True)
power_bball = power_bball[power_bball["Conference"].isin(TARGET_CONFERENCES)]

power_bball = power_bball.drop(columns=UNUSED_COLUMNS)

In [7]:
# Shut down the Firefox WebDriver session started earlier in the notebook
driver.quit()

In [8]:
# Get all links to university athletic websites
base_url = "https://web3.ncaa.org/directory/"

website_links = []
for link in power_bball["relative_link"]:
    # A timeout stops one unresponsive request from hanging the whole notebook, and
    # raise_for_status surfaces HTTP errors here instead of as an obscure parse failure below
    response = requests.get(base_url + link, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # The website URL is taken from the second anchor inside the second "list-group" div
    scraped_link = soup.find_all("div", {"class": "list-group"})[1].find_all('a')[1]['href']
    website_links.append(scraped_link)

# Make sure we are using https.
# Only a leading "http://" is rewritten: a blanket replace('http', 'https') would turn
# links that are already https into "httpss://..." and also alter "http" elsewhere in the URL.
website_links = [
    "https://" + s[len("http://"):] if s.startswith("http://") else s
    for s in website_links
]

# Add slash at end to link if it doesn't have one
website_links = [s if s.endswith("/") else s + "/" for s in website_links]

# Attach the normalized URLs; the list is in the same row order as power_bball
power_bball['website'] = website_links

In [9]:
# Store the selected teams as the DIMTEAM table, replacing any earlier version.
# The query reads from the power_bball DataFrame by referring to its variable name.
create_dimteam_sql = """
    CREATE OR REPLACE TABLE DIMTEAM as
    SELECT * FROM power_bball
"""
con.sql(create_dimteam_sql)

In [10]:
# Close the connection to project.db now that the DIMTEAM table has been created
con.close()