### Import necessary packages

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json
import numpy as np

&nbsp;
### Check [2021 Canadians in College](https://www.canadianbaseballnetwork.com/canadians-in-college/2021-canadians-in-college) for current players on site

In [2]:
# Season whose "Canadians in College" page is requested.
year = 2020
# Bound the request so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of silently parsing an error page.
page = requests.get(
    'https://www.canadianbaseballnetwork.com/canadians-in-college/{}-canadians-in-college'.format(year),
    timeout=30,
)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')
# Block whose <p> elements are parsed below; find() returns None when the id is absent.
div = soup.find("div", {"id": "block-yui_3_17_2_1_1604547079964_31164"})

In [3]:
# Players collected per class year; every entry is a dict holding the raw entry text.
class_list = ['freshmen', 'sophomores', 'juniors', 'seniors']
df_dict = {class_name: [] for class_name in class_list}

# Paragraph texts recognised as section headings, mapped to the key they are
# filed under. 'freshman' is accepted as an alternate spelling of 'freshmen'.
heading_to_class = {class_name: class_name for class_name in class_list}
heading_to_class['freshman'] = 'freshmen'

# A two-character token in parentheses marks the end of one entry.
# Raw string: "\(" in a normal string literal is an invalid escape sequence.
entry_end_pattern = re.compile(r"\(.{2}\)")

if div is None:
    raise ValueError('Player list block not found on the page; the block id may have changed.')

current_class = ''
for p in div.find_all("p"):
    text = p.getText()
    if current_class != '':
        entry_start = 0
        for m in entry_end_pattern.finditer(text):
            # Slice from the end of the previous entry so a paragraph holding
            # several entries yields each of them, instead of repeating the
            # first one once per match. strip() drops separator whitespace.
            player_dict = {'text': text[entry_start:m.end()].strip()}
            df_dict[current_class].append(player_dict)
            entry_start = m.end()
    # A paragraph that is exactly a class name starts a new section.
    heading = text.lower()
    if heading in heading_to_class:
        current_class = heading_to_class[heading]

# Loop variable is deliberately not called `year`, so the season defined in the
# previous cell is not overwritten.
for class_name in class_list:
    print('{}: {} people'.format(class_name, len(df_dict[class_name])))

freshmen: 252 people
sophomores: 159 people
juniors: 177 people
seniors: 224 people


&nbsp;
### Get Team Websites

In [4]:
# Endpoint of the collegebaseballhub.com data API queried in the next cell.
baseUrl = (
    'https://www.collegebaseballhub.com'
    '/_api/wix-code-public-dispatcher/siteview/wix/data-web.jsw/find.ajax'
)

# Values sent as query-string parameters with every request.
# NOTE(review): `instance` looks like a site-issued token with an embedded sign
# date -- confirm it is meant to be public and is still accepted by the API.
viewMode = 'site'
gridAppId = '4544fd6a-5fe8-4482-a5fc-1fafab59ad5a'
instance = 'wixcode-pub.b1c45a24c1814ed732fc488a1b3ad90c65889da3.eyJpbnN0YW5jZUlkIjoiZjFiMGI3NjUtMzM1OC00Mjc1LThkODItY2NmZWJiNTAyMGIwIiwiaHRtbFNpdGVJZCI6IjFkNmQxYzg5LTRlNTAtNGNiMy1hM2M3LTJkNzFmYjg0MDU5NCIsInVpZCI6bnVsbCwicGVybWlzc2lvbnMiOm51bGwsImlzVGVtcGxhdGUiOmZhbHNlLCJzaWduRGF0ZSI6MTYxMzY1OTY1MzE2NiwiYWlkIjoiMjVhOTNlMTAtM2NiYS00Y2IzLWExYTItZmRiMjEwY2U2OWE4IiwiYXBwRGVmSWQiOiJDbG91ZFNpdGVFeHRlbnNpb24iLCJpc0FkbWluIjpmYWxzZSwibWV0YVNpdGVJZCI6IjA3NzA4OGEwLWU2ODAtNDg4Ni1hMWY4LThmMjYxMGZjMDQwZiIsImNhY2hlIjpudWxsLCJleHBpcmF0aW9uRGF0ZSI6bnVsbCwicHJlbWl1bUFzc2V0cyI6IlNob3dXaXhXaGlsZUxvYWRpbmcsQWRzRnJlZSxIYXNEb21haW4sSGFzRUNvbW1lcmNlIiwidGVuYW50IjpudWxsLCJzaXRlT3duZXJJZCI6IjczODNhZGJlLTE3OGUtNDhhNS1hYTFiLTYyN2JmMTA1MWJmYiIsImluc3RhbmNlVHlwZSI6InB1YiIsInNpdGVNZW1iZXJJZCI6bnVsbH0='

params = dict(gridAppId=gridAppId, instance=instance, viewMode=viewMode)

In [5]:
# Columns kept from each school record returned by the API.
school_columns = ['title', 'division', 'conference', 'state', 'location', 'link']

# Collect one frame per division and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside a loop re-copies the accumulated frame on every iteration.
division_frames = []
for division in ['D1', 'D2', 'D3']:
    # set request body: filter on the division, ordered by orderId ascending.
    # NOTE(review): the trailing 0, 500 are presumably skip/limit -- confirm
    # against the API, since a division with more than 500 rows would be cut off.
    request_body = ["Division1", {"$and": [{"$and": []}, {"$and": []}, {"$and": []}, {"orderId": {"$gt": 0}}, {"division": {"$eq": division}}]}, [{"orderId": "asc"}], 0, 500]

    # send post request; time out instead of hanging and surface HTTP errors
    r = requests.post(url=baseUrl, params=params, json=request_body, timeout=30)
    r.raise_for_status()

    # decode the JSON response and keep only the columns of interest
    items = r.json()['result']['items']
    division_frames.append(pd.DataFrame(items, columns=school_columns))

schools_df = (
    pd.concat(division_frames, ignore_index=True)
    .sort_values(by=['division', 'title'], ignore_index=True)
)
schools_df

Unnamed: 0,title,division,conference,state,location,link
0,Abilene Christian University,D1,Southland,TX,"Abilene, TX",http://www.acusports.com/index.aspx?path=baseball
1,Alabama A&M University,D1,Southwestern,AL,"Huntsville, AL",http://www.aamusports.com/index.aspx?path=base...
2,Alabama State University,D1,Southwestern,AL,"Montgomery, AL",http://bamastatesports.com/index.aspx?path=bas...
3,Alcorn State University,D1,Southwestern,MS,"Lorman, MS",http://www.alcornsports.com/index.aspx?path=ba...
4,Appalachian State University,D1,Sun Belt,NC,"Boone, NC",https://appstatesports.com/index.aspx?path=bas...
...,...,...,...,...,...,...
942,Wittenberg University,D3,North Coast Athletic Conference,OH,"Springfield, OH",https://www.wittenbergtigers.com/sports/bsb/index
943,Worcester Polytechnic Institute,D3,New England Women's and Men's Athletic Conference,MA,"Worcester, MA",https://athletics.wpi.edu/sports/bsb/index
944,Worcester State University,D3,Massachusetts State Collegiate Athletic Confer...,MA,"Worcester, MA",https://www.wsulancers.com/sports/bsb/index
945,Yeshiva University,D3,Skyline Conference,NY,"New York, NY",https://yumacs.com/index.aspx?path=baseball


&nbsp;
### Get Roster Pages

In [6]:
# Ordered rules that turn a team home-page URL into a roster-page URL.
# Each rule is (example, match_pattern, sub_pattern, replacement):
#   - a rule applies to a row when `match_pattern` is found in the link AND no
#     earlier rule has already produced a roster link for that row;
#   - the roster link is the link with `sub_pattern` replaced by `replacement`.
# Patterns are raw strings: the previous '\/', '\.' and '\?' inside normal
# string literals were invalid escape sequences (the regexes are unchanged).
ROSTER_LINK_RULES = [
    # Case 1
    ('http://www.______.com/index.aspx?path=...',
     r'/(?:index|roster|schedule)\.aspx\?path=(?:baseball|base|bball|bb|bs|bsb|mbase)',
     r'/(?:index|roster|schedule)\.aspx\?path=(?:baseball|base|bball|bb|bs|bsb|mbase).*',
     '/sports/baseball/roster'),
    # Case 2
    ('https://www.______.com/sports/bsb/index',
     r'(?:/landing|/sports/bsb)/index$',
     r'(?:/landing|/sports/bsb)/index',
     '/sports/bsb/2020-21/roster'),
    # Case 3
    ('https://______.com/sports/baseball',
     r'/baseball/*$',
     r'/baseball/*',
     '/baseball/roster'),
    # Case 4
    ('https://______.com/sports/m-basebl',
     r'/sports*/m-basebl/*.*$',
     r'/m-basebl/*.*',
     '/m-basebl/roster'),
    # Case 5
    # NOTE(review): every link matching this rule also matches Case 4, which
    # runs first, so this rule never fires. Kept in place to preserve the
    # existing results -- confirm which of the two URLs is actually wanted.
    ('https://______.com/sports/m-basebl/index',
     r'/sports/m-basebl/index$',
     r'/index',
     '/2020-21/roster'),
    # Case 6
    ('https://______.com/SportSelect.dbml...',
     r'SportSelect\.dbml.*',
     r'/SportSelect\.dbml.*',
     '/sports/baseball/roster'),
    # Case 7
    ('https://______.com/sport/0/3',
     r'/sport/0/3$',
     r'/sport/0/3',
     '/roster/0/3'),
    # Case 8
    ('https://______.com/athletics/bb/',
     r'/athletics/bb/*$',
     r'/athletics/bb/*',
     '/athletics/bb/roster'),
    # Case 9
    ('http://______.com/sport.asp?sportID=1',
     r'/sport.asp',
     r'/sport.asp',
     '/roster.asp'),
]

school_links = schools_df['link']
# '' means "no roster link determined yet"; later cells rely on that marker.
roster_links = pd.Series('', index=schools_df.index, dtype=object)
for _example, match_pattern, sub_pattern, replacement in ROSTER_LINK_RULES:
    # na=False: a row with a missing link simply matches no rule and stays ''.
    applies = school_links.str.contains(match_pattern, regex=True, na=False) & (roster_links == '')
    roster_links = roster_links.mask(
        applies,
        school_links.str.replace(sub_pattern, replacement, regex=True),
    )
schools_df['roster_link'] = roster_links

schools_df

Unnamed: 0,title,division,conference,state,location,link,roster_link
0,Abilene Christian University,D1,Southland,TX,"Abilene, TX",http://www.acusports.com/index.aspx?path=baseball,http://www.acusports.com/sports/baseball/roster
1,Alabama A&M University,D1,Southwestern,AL,"Huntsville, AL",http://www.aamusports.com/index.aspx?path=base...,http://www.aamusports.com/sports/baseball/roster
2,Alabama State University,D1,Southwestern,AL,"Montgomery, AL",http://bamastatesports.com/index.aspx?path=bas...,http://bamastatesports.com/sports/baseball/roster
3,Alcorn State University,D1,Southwestern,MS,"Lorman, MS",http://www.alcornsports.com/index.aspx?path=ba...,http://www.alcornsports.com/sports/baseball/ro...
4,Appalachian State University,D1,Sun Belt,NC,"Boone, NC",https://appstatesports.com/index.aspx?path=bas...,https://appstatesports.com/sports/baseball/roster
...,...,...,...,...,...,...,...
942,Wittenberg University,D3,North Coast Athletic Conference,OH,"Springfield, OH",https://www.wittenbergtigers.com/sports/bsb/index,https://www.wittenbergtigers.com/sports/bsb/20...
943,Worcester Polytechnic Institute,D3,New England Women's and Men's Athletic Conference,MA,"Worcester, MA",https://athletics.wpi.edu/sports/bsb/index,https://athletics.wpi.edu/sports/bsb/2020-21/r...
944,Worcester State University,D3,Massachusetts State Collegiate Athletic Confer...,MA,"Worcester, MA",https://www.wsulancers.com/sports/bsb/index,https://www.wsulancers.com/sports/bsb/2020-21/...
945,Yeshiva University,D3,Skyline Conference,NY,"New York, NY",https://yumacs.com/index.aspx?path=baseball,https://yumacs.com/sports/baseball/roster


In [7]:
# Flag rows that already have a roster URL ('' marks "not determined yet").
schools_df['roster_link_flg'] = schools_df['roster_link'] != ''
value_counts = schools_df['roster_link_flg'].value_counts()
# .get(..., 0): plain indexing raises KeyError when one of the two outcomes
# does not occur at all (e.g. every school already has a roster URL).
print('{} schools have a known roster URL, and {} schools have yet to be determined.'.format(value_counts.get(True, 0), value_counts.get(False, 0)))

941 schools have a known roster URL, and 6 schools have yet to be determined.


In [8]:
# Case 10: Manual edits -- roster URLs set by hand, keyed by school title.
# A title listed here overrides whatever roster link the row currently has.
manual_roster_links = {
    'Liberty University': 'https://www.liberty.edu/flames/index.cfm?PID=36959&teamID=1',
    'Keystone College': 'https://www.gokcgiants.com/sports/baseball/roster',
    'University of Dubuque': 'https://udspartans.com/sports/baseball/roster',
    'University of St. Thomas, Texas': 'https://www.ustcelts.com/sports/bsb/2020-21/roster',
    'Utica College': 'https://ucpioneers.com/sports/baseball/roster',
}

# Apply all overrides in a single pass over the titles instead of one full
# np.where scan per school.
has_manual_link = schools_df['title'].isin(list(manual_roster_links))
schools_df.loc[has_manual_link, 'roster_link'] = schools_df.loc[has_manual_link, 'title'].map(manual_roster_links)

# Schools that still have no roster URL after the manual edits.
missing_roster_link_df = schools_df[schools_df['roster_link'] == '']
# 'min-width' is the CSS property name; the previous 'width-min' is not a CSS
# property, so it had no effect on the rendered table.
missing_roster_link_df.style.set_properties(subset=['link'], **{'min-width': '500px'})

Unnamed: 0,title,division,conference,state,location,link,roster_link,roster_link_flg
772,Newbury College,D3,New England Collegiate Conference,MA,"Brookline, MA",http://newburynighthawks.com/,,False


In [9]:
# Export to CSV: rows that have a roster link, without the helper flag column.
# NOTE(review): this export is commented out, yet the next section reads
# 'roster_pages.csv' -- on a fresh checkout without that file the notebook
# cannot run top to bottom. Presumably disabled to avoid overwriting an
# existing file; confirm and consider guarding it instead.
# schools_df[schools_df['roster_link'] != ''].drop(columns=['roster_link_flg']).to_csv('roster_pages.csv', index=False)

&nbsp;
### Check Roster Sites

In [10]:
# Reload the school list from the CSV file; this replaces the in-memory
# schools_df built in the cells above.
roster_pages_csv = 'roster_pages.csv'
schools_df = pd.read_csv(roster_pages_csv)
schools_df

Unnamed: 0,title,division,conference,state,location,link,roster_link
0,Abilene Christian University,D1,Southland,TX,"Abilene, TX",http://www.acusports.com/index.aspx?path=baseball,http://www.acusports.com/sports/baseball/roster
1,Alabama A&M University,D1,Southwestern,AL,"Huntsville, AL",http://www.aamusports.com/index.aspx?path=base...,http://www.aamusports.com/sports/baseball/roster
2,Alabama State University,D1,Southwestern,AL,"Montgomery, AL",http://bamastatesports.com/index.aspx?path=bas...,http://bamastatesports.com/sports/baseball/roster
3,Alcorn State University,D1,Southwestern,MS,"Lorman, MS",http://www.alcornsports.com/index.aspx?path=ba...,http://www.alcornsports.com/sports/baseball/ro...
4,Appalachian State University,D1,Sun Belt,NC,"Boone, NC",https://appstatesports.com/index.aspx?path=bas...,https://appstatesports.com/sports/baseball/roster
...,...,...,...,...,...,...,...
941,Wittenberg University,D3,North Coast Athletic Conference,OH,"Springfield, OH",https://www.wittenbergtigers.com/sports/bsb/index,https://www.wittenbergtigers.com/sports/bsb/20...
942,Worcester Polytechnic Institute,D3,New England Women's and Men's Athletic Conference,MA,"Worcester, MA",https://athletics.wpi.edu/sports/bsb/index,https://athletics.wpi.edu/sports/bsb/2020-21/r...
943,Worcester State University,D3,Massachusetts State Collegiate Athletic Confer...,MA,"Worcester, MA",https://www.wsulancers.com/sports/bsb/index,https://www.wsulancers.com/sports/bsb/2020-21/...
944,Yeshiva University,D3,Skyline Conference,NY,"New York, NY",https://yumacs.com/index.aspx?path=baseball,https://yumacs.com/sports/baseball/roster


In [11]:
# Widths of the four columns of the printed progress table.
index_col_length, title_col_length, players_col_length, roster_link_col_length = 5, 50, 9, 80

# Horizontal rule and row layout, built once and reused.
rule_line = '|{}|{}|{}|{}|'.format('-'*index_col_length, '-'*title_col_length, '-'*players_col_length, '-'*roster_link_col_length)
row_template = '| {} | {} | {} | {} |'

print('\nIterating through {} schools...\n'.format(len(schools_df.index)))
print(rule_line)
print('|{}|{}|{}|{}|'.format('#'.center(index_col_length), 'school'.center(title_col_length), 'players'.center(players_col_length), 'roster_link'.center(roster_link_col_length)))
print(rule_line)

success_count = 0
fail_count = 0

for index, school in schools_df.iterrows():
    # Read the row outside the try block so the failure branch can always print
    # it; str() keeps a missing (NaN) title or link from breaking .ljust().
    title = str(school['title'])
    roster_link = str(school['roster_link'])
    try:
        html = pd.read_html(roster_link)
        # Treat the table with the most rows as the roster; on a tie max()
        # keeps the first such table, like the previous explicit loop did.
        df = max(html, key=lambda table: len(table.index))
        player_count = len(df.index)
    except Exception:
        # Best-effort scrape: any fetch or parse error counts as a failure for
        # this school. Catching Exception rather than using a bare `except`
        # lets KeyboardInterrupt stop this long-running loop.
        player_count = None

    if player_count is None:
        players_cell = '-'*(players_col_length-2)
    else:
        players_cell = str(player_count).center(players_col_length-2)
    print(row_template.format(str(index).ljust(index_col_length-2), title.ljust(title_col_length-2), players_cell, roster_link.ljust(roster_link_col_length-2)))

    # A page that parses but yields an empty table is still a failure.
    if player_count:
        success_count += 1
    else:
        fail_count += 1

print(rule_line)
print('\n{} successes... {} failures\n'.format(success_count, fail_count))


Iterating through 946 schools...

|-----|--------------------------------------------------|---------|--------------------------------------------------------------------------------|
|  #  |                      school                      | players |                                  roster_link                                   |
|-----|--------------------------------------------------|---------|--------------------------------------------------------------------------------|
| 0   | Abilene Christian University                     |    38   | http://www.acusports.com/sports/baseball/roster                                |
| 1   | Alabama A&M University                           |    26   | http://www.aamusports.com/sports/baseball/roster                               |
| 2   | Alabama State University                         | ------- | http://bamastatesports.com/sports/baseball/roster                              |
| 3   | Alcorn State University                          |    33 