In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Create initial dataframe
We will create an initial dataframe with the name of the school and City, State from the list of men's college basketball schools as listed on sportsreference

In [2]:
url = 'https://www.sports-reference.com/cbb/schools/'
html = requests.get(url).content

df_list = pd.read_html(html)
df = df_list[0] # first table is men's, second is women's

# drop repeating header rows from table
df.drop_duplicates(keep=False, inplace=True)

df = df.loc[:, ['School', 'City, State']]

## Add SR key
We will parse the urls for each of the schools listed on sportsreference and add the key to our dataframe

In [3]:
# create a BeautifulSoup object
soup = BeautifulSoup(html, 'html.parser')

# find the table element
table = soup.find('table')

# find all rows in the table
rows = table.find_all('tr')

for row in rows[1:]:
    try:
        # extract the hyperlink
        link = row.find_all('td')[0].find('a')
        if link:
            # create a BeautifulSoup object
            soup = BeautifulSoup(str(link), 'html.parser')

            # extract the text and hyperlink
            text = soup.get_text().strip()
            link = soup.a['href']

            # extract the school name from the hyperlink using regex
            key = re.search(r'/schools/(.*)/men', link).group(1)

            df.loc[df['School'] == text, 'SR key'] = key
    except IndexError:
        pass

## Add NCAA key
We will parse the schools listed in the NCAA website and add the key to our dataframe

In [4]:
hardcoded_schools_dict = {
    "Augusta State Jaguars": "", # Does not exist on NCAA website
    "Baker University Wildcats": "", # Does not exist on NCAA website
    "Baltimore Super Bees": "", # Does not exist on NCAA website
    "Birmingham-Southern Panthers": "birmingham-so",
    "Boston University": "boston-u",
    "Brigham Young College": "", # Does not exist on NCAA website
    "Cal State Bakersfield": "bakersfield",
    "Cal State Fullerton": "cal-st-fullerton",
    "Cal State Los Angeles Golden Eagles": "cal-st-la",
    "Cal State Northridge": "cal-st-northridge",
    "Canterbury College": "", # Does not exist on NCAA website
    "Carleton College Knights": "carleton",
    "Case Western Reserve Spartans": "case",
    "Central Missouri Mules": "central-mo",
    "Central Pennsylvania College Knights": "", # Does not exist on NCAA website
    "Centre (KY) Colonels": "centre",
    "Cheyenne Business College": "", # Does not exist on NCAA website
    "City College of New York Beavers": "ccny",
    "College of New Jersey Lions": "tcnj",
    "Colorado College Tigers": "colorado-col",
    "Colorado School of Mines Orediggers": "colorado-mines",
    "Concordia Seminary Preachers": "", # Does not exist on NCAA website
    "Cotner College": "", # Does not exist on NCAA website
    "Cumberland": "", # Does not exist on NCAA website
    "Dakota Wesleyan Tigers": "", # Does not exist on NCAA website
    "Dickinson College Red Devils": "dickinson",
    "Emporia State Hornets": "emporia-st",
    "Ensign College": "", # Does not exist on NCAA website
    "Haskell (KS) Fighting Indians": "", # Does not exist on NCAA website
    "Illinois Wesleyan Titans": "ill-wesleyan",
    "Kentucky Wesleyan Panthers": "ky-wesleyan",
    "Lawrence Tech": "", # Does not exist on NCAA website
    "Long Island University": "long-island",
    "Louisiana": "la-lafayette",
    "Louisiana-Monroe": "la-monroe",
    "Loyola (LA) Wolfpack": "", # Does not exist on NCAA website
    "Loyola (MD)": "loyola-maryland",
    "Maryland-Baltimore County": "umbc",
    "Maryland-Eastern Shore": "umes",
    "Massachusetts Institute of Technology Engineers": "mit",
    "Massachusetts-Lowell": "umass-lowell",
    "Merchant Marine Mariners": "merchant-marine",
    "Middle Tennessee": "middle-tenn",
    "Minnesota A&M Aggies": "", # Does not exist on NCAA website
    "Montana State": "montana-st",
    "NC State": "north-carolina-st",
    "Nebraska Wesleyan Prairie Wolves": "neb-wesleyan",
    "Nevada-Las Vegas": "unlv",
    "New York University Violets": "new-york-u",
    "North Central Cardinals": "north-central-il",
    "Northeastern Illinois Golden Eagles": "northeastern-ill",
    "Northwest Missouri State Bearcats": "northwest-mo-st",
    "Ohio State": "ohio-st",
    "Oklahoma City Chiefs": "", # Does not exist on NCAA website
    "Phillips Haymakers": "", # Does not exist on NCAA website
    "Pittsburg State Gorillas": "pittsburg-st",
    "Saint Francis (PA)": "saint-francis-pa",
    "Saint Mary's (CA)": "st-marys-ca",
    "Sam Houston": "sam-houston-st",
    "Savage School of Physical Education": "", # Does not exist on NCAA website
    "Savannah State Tigers": "savannah-st",
    "Southern Illinois-Edwardsville": "siu-edwardsville",
    "Southern Mississippi": "southern-miss",
    "Southwestern (KS) Moundbuilders": "", # Does not exist on NCAA website
    "St. John's College (OH)": "", # Does not exist on NCAA website
    "Stevens Institute Ducks": "stevens",
    "Tennessee-Martin": "ut-martin",
    "Texas-Rio Grande Valley": "utrgv",
    "U.S. International Gulls": "", # Does not exist on NCAA website
    "UTEP": "utep",
    "UTSA": "utsa",
    "Washington & Jefferson Presidents": "wash-jeff",
    "Washington & Lee Generals": "wash-lee",
    "Washington (MO) Bears": "", # Does not exist on NCAA website
    "Washington College Shoremen": "washington-col",
    "Wayne State (MI) Warriors": "wayne-st-mi",
    "West Texas A&M Buffaloes": "west-tex-am",
    "Western Colorado Mountaineers": "western-st",
    "Wisconsin-Stevens Point Pointers": "wis-stevens-point",
    "Wisconsin-Superior Yellowjackets": "wis-superior",
    "WPI Engineers": "wpi"
}

In [5]:
for school in hardcoded_schools_dict:
    df.loc[df['School'] == school, 'NCAA key'] = hardcoded_schools_dict[school]

In [6]:
ncaa_url = 'https://www.ncaa.com/schools-index/'
MAX_SCHOOLS_INDEX = 23

for i in range(0, MAX_SCHOOLS_INDEX + 1):
    html = requests.get(f'{ncaa_url}{i}').content

    # create a BeautifulSoup object
    soup = BeautifulSoup(html, 'html.parser')

    # find the table element
    table = soup.find('table')

    # find all rows in the table
    rows = table.find_all('tr')

    for row in rows[1:]:
        try:
            # extract the hyperlink
            link = row.find_all('td')[1].find('a')
            if link:
                # create a BeautifulSoup object
                soup = BeautifulSoup(str(link), 'html.parser')

                # extract the text and hyperlink
                text = soup.get_text().strip()
                link = soup.a['href']

                # extract the full name
                full_name = row.find_all('td')[2].get_text().strip()

                parsed_name = re.sub(r"\bUniversity(?: of)?\b", "", full_name)
                parsed_name = full_name.replace('  ', '').strip()
                print(full_name, parsed_name)

                # extract the school name from the hyperlink using regex
                key = re.search(r'/schools/(.*)', link).group(1)
                
                row = df.loc[(df['School'] == text) | (df['SR key'] == key) | (df['School'] == parsed_name)]
                if not row.empty:
                    df.loc[(df['School'] == text) | (df['SR key'] == key) | (df['School'] == parsed_name), 'NCAA key'] = key
        except IndexError:
            pass

In [7]:
import json

print(f"Total schools: {len(df)}")
print(f"No. of schools where NCAA key is NaN (we didn't add to hardcoded values or found in automation) {len(df.loc[df['NCAA key'].isnull()])}")
schools = df.loc[df['NCAA key'].duplicated()]
for index, row in schools.iterrows():
    dict = row.to_dict()
    print(f"Duplicated NCAA Key or not found in NCAA {dict}")


Total schools: 491
No. of schools where NCAA key is NaN (we didn't add to hardcoded values or found in automation) 93
Duplicated NCAA Key or not found in NCAA {'School': 'Alcorn State', 'City, State': 'Alcorn State, Mississippi', 'SR key': 'alcorn-state', 'NCAA key': nan}
Duplicated NCAA Key or not found in NCAA {'School': 'Appalachian State', 'City, State': 'Boone, North Carolina', 'SR key': 'appalachian-state', 'NCAA key': nan}
Duplicated NCAA Key or not found in NCAA {'School': 'Arizona State', 'City, State': 'Tempe, Arizona', 'SR key': 'arizona-state', 'NCAA key': nan}
Duplicated NCAA Key or not found in NCAA {'School': 'Arkansas State', 'City, State': 'State University, Arkansas', 'SR key': 'arkansas-state', 'NCAA key': nan}
Duplicated NCAA Key or not found in NCAA {'School': 'Arkansas-Pine Bluff', 'City, State': 'Pine Bluff, Arkansas', 'SR key': 'arkansas-pine-bluff', 'NCAA key': nan}
Duplicated NCAA Key or not found in NCAA {'School': 'Baker University Wildcats', 'City, State': 

## Save data
We will save the data in the `data` folder as `teams.csv`

In [8]:
path = os.path.abspath('../data/teams.csv')
df.to_csv(path, index=False)