In [98]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os

headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

In [120]:

dnames = ["sets", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "retatt", "rerr", "b_solo", "b_assist", "b_error", "pts", "bhe"]

def row_to_dict(row, column_names):
    items = {}
    
    first3 = row.find_all("td", attrs={'class':'smtext'})
    # rest = row.find_all(lambda el: el.tag == "td" and el["class"] in [None, "game_high"])
    rest = row.find_all("td", attrs={'style':"text-align:right"})
    
    if len(first3) != 3:
        print(row)
    
    if first3[2].text.strip() == "-":
        return None
    
    items["date"] = first3[0].text.strip()
    items["opponent/venue"] = first3[1].text.strip()
    items["result"] = first3[2].text.strip()
    
    for i, element in enumerate(rest):
        items[column_names[i]] = float(element.attrs['data-order']) if element.attrs['data-order'] != "-" else 0
        
    return items


def tr_to_df(tbody, column_names):
    rows = tbody.find_all("tr", attrs={'class':None, 'style':None, 'id':None, 'colspan':None})

    return pd.DataFrame(filter(lambda x: x is not None, [row_to_dict(row, column_names) for row in rows]))


def link_to_df(link, column_names):
    response = requests.get(link, headers=headers)
    # response.raise_for_status()
    if response.status_code != 200:
        raise ConnectionRefusedError("request blocked")
    
    soup = BeautifulSoup(response.content)
    if len(soup.find_all("table", attrs={'class': 'mytable'})) < 2:
        return None
    tr = soup.find_all("table", attrs={'class': 'mytable'})[1]
    
    return tr_to_df(tr, column_names)

In [121]:
team_id = {}
with open("team_id.csv", "r") as f:
    for line in f.readlines()[1:]:
        team, id = line.split(",")
        team_id[int(id.strip())] = team
        


def get_all_schedules_by_year(year_code = 16480, year = 2023, column_names = []):
    # os.makedirs(f"schedules-{year}", exist_ok=True)
    for id, team in team_id.items():
        # if os.path.exists(f"schedules-{year}/{team}-schedule-{year}.csv"): continue
        print("Finding schedule for", team, year)
        try:
            df = link_to_df(f"https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id={year_code}&org_id={id}&stats_player_seq=-100", column_names)
        except ConnectionRefusedError as e:
            print(team, year, "not found on website")
        if df is None:
            print("Couldn't find page for", team, year)
            continue
        
        df.to_csv(f"all_schedules/schedules-{year}/{team}-schedule-{year}.csv", index=False)
        print(team, year, "schedule found, writing to file")

In [112]:
import threading
years = {
    2022: [16000, ["sets", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "retatt", "rerr", "b_solo", "b_assist", "b_error", "tb", "pts", "bhe"]],
    2021: [15820, ["sets", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "rerr", "b_solo", "b_assist", "b_error", "pts", "bhe"]],
    2020: [15460, ["sets", "ms", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "rerr", "b_solo", "b_assist", "b_error", "tb", "pts", "bhe"]],
    2019: [14942, ["sets", "mp", "attend", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "rerr", "b_solo", "b_assist", "b_error", "pts", "bhe"]],
    2018: [14242, ["attend", "sets", "mp", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "rerr", "b_solo", "b_assist", "b_error", "pts", "bhe"]],
    2017: [12622, ["sets", "mp", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "rerr", "b_solo", "b_assist", "b_error", "pts", "bhe"]],
    2016: [12426, ["sets", "mp", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "rerr", "b_solo", "b_assist", "b_error", "pts", "bhe"]],
    2015: [12221, ["sets", "mp", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "rerr", "b_solo", "b_assist", "b_error", "pts", "bhe"]],
    2014: [11760, ["sets", "mp", "ms", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "rerr", "b_solo", "b_assist", "b_error", "pts", "bhe"]],
    2013: [11440, ["sets", "ms", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "rerr", "b_solo", "b_assist", "b_error", "pts", "bhe"]],
    2012: [11180, ["sets", "ms", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "rerr", "b_solo", "b_assist", "b_error", "pts", "bhe"]],
    2011: [10700, ["sets", "ms", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "b_solo", "b_assist", "b_error", "pts", "bhe"]],
}

In [126]:
for year, (code, column_names) in years.items():
    if year in [2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014]: continue
    get_all_schedules_by_year(code, year, column_names)

Finding schedule for Nebraska 2013
Nebraska 2013 schedule found, writing to file
Finding schedule for Dayton 2013
Dayton 2013 schedule found, writing to file
Finding schedule for Western Mich. 2013
Western Mich. 2013 schedule found, writing to file
Finding schedule for Wisconsin 2013
Wisconsin 2013 schedule found, writing to file
Finding schedule for Pittsburgh 2013
Pittsburgh 2013 schedule found, writing to file
Finding schedule for Stanford 2013
Stanford 2013 schedule found, writing to file
Finding schedule for The Citadel 2013
The Citadel 2013 schedule found, writing to file
Finding schedule for Texas 2013
Texas 2013 schedule found, writing to file
Finding schedule for Western Ky. 2013
Western Ky. 2013 schedule found, writing to file
Finding schedule for Creighton 2013
Creighton 2013 schedule found, writing to file
Finding schedule for SFA 2013
SFA 2013 schedule found, writing to file
Finding schedule for Eastern Ill. 2013
Eastern Ill. 2013 schedule found, writing to file
Finding sc

In [127]:
for folder in os.listdir("all_schedules"):
    print(folder, len(os.listdir(os.path.join("all_schedules", folder))))

schedules-2023 331
schedules-2015 331
schedules-2012 331
schedules-2013 331
schedules-2014 331
schedules-2022 331
schedules-2020 331
schedules-2018 331
schedules-2011 331
schedules-2016 331
schedules-2017 331
schedules-2019 331
schedules-2021 331


In [124]:
year = 2019
for id in [639, 688, 691, 346, 17, 295, 459, 647, 270, 617]:
    team = team_id[id]
    year_code, column_names = [14942, ["sets", "mp", "attend", "kills", "errors", "total_attacks", "hit_pct", "assists", "aces", "serr", "digs", "rerr", "b_solo", "b_assist", "b_error", "pts", "bhe"]]
    try:
        df = link_to_df(f"https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id={year_code}&org_id={id}&stats_player_seq=-100", column_names)
    except ConnectionRefusedError as e:
        print(team, year, "not found on website")
    if df is None:
        print("Couldn't find page for", team, year)
        continue
    df.to_csv(f"all_schedules/schedules-{year}/{team}-schedule-{year}.csv", index=False)
    print(team, year, "schedule found, writing to file")

Siena 2019 schedule found, writing to file
Syracuse 2019 schedule found, writing to file
Tennessee St. 2019 schedule found, writing to file
Lamar University 2019 schedule found, writing to file
Alcorn 2019 schedule found, writing to file
Idaho 2019 schedule found, writing to file
UNC Greensboro 2019 schedule found, writing to file
South Carolina St. 2019 schedule found, writing to file
Hampton 2019 schedule found, writing to file
Saint Peter's 2019 schedule found, writing to file


In [110]:
for i in range(2011, 2024):
    print(len(os.listdir(f"schedules-{i}")), i)

331 2011
331 2012
331 2013
331 2014
331 2015
331 2016
331 2017
331 2018
331 2019
331 2020
331 2021
331 2022
331 2023


In [108]:
threads = []
for year, (code, column_names) in years.items():
    t = threading.Thread(target=get_all_schedules_by_year, args=[code, year, column_names])
    threads.append(t)
    t.start()

Finding schedule forFinding schedule for Nebraska 2022
Finding schedule for Nebraska 2020
 Nebraska 2021
Finding schedule for Nebraska 2019
Finding schedule for Nebraska 2018
Finding schedule for Nebraska 2017
Finding schedule for Nebraska 2016
Finding schedule for Nebraska 2015
Finding schedule for Nebraska 2014
Finding schedule for Nebraska 2013
Finding schedule for Nebraska 2012
Finding schedule for Nebraska 2011


Exception in thread Thread-5 (get_all_schedules_by_year):
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/q1/7dmp8q9j1k56svnw3n21281m0000gn/T/ipykernel_83414/342338581.py", line 18, in get_all_schedules_by_year
UnboundLocalError: cannot access local variable 'df' where it is not associated with a value


Nebraska 2022 not found on website
Nebraska 2020 schedule found, writing to file
Finding schedule for Dayton 2020
Dayton 2020 not found on website
Dayton 2020 schedule found, writing to file
Finding schedule for Western Mich. 2020
Western Mich. 2020 not found on website
Western Mich. 2020 schedule found, writing to file
Finding schedule for Wisconsin 2020
Wisconsin 2020 not found on website
Wisconsin 2020 schedule found, writing to file
Finding schedule for Pittsburgh 2020
Pittsburgh 2020 not found on website
Pittsburgh 2020 schedule found, writing to file
Finding schedule for Stanford 2020
Stanford 2020 not found on website
Stanford 2020 schedule found, writing to file
Finding schedule for The Citadel 2020
The Citadel 2020 not found on website
The Citadel 2020 schedule found, writing to file
Finding schedule for Texas 2020
Texas 2020 not found on website
Texas 2020 schedule found, writing to file
Finding schedule for Western Ky. 2020
Western Ky. 2020 not found on website
Western Ky. 2

In [100]:
sp = "/Users/nato/Desktop/Volleyball/schedules-2023"
for filename in os.listdir(sp):
    os.rename(os.path.join(sp, filename), os.path.join(sp, filename + ".csv"))

In [99]:
for id, team in team_id.items():
    if os.path.exists(f"schedules-2023/{team}-schedule-2023"): continue
    print("Finding schedule for", team)
    df = link_to_df(f"https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=16480&org_id={id}&stats_player_seq=-100")
    df.to_csv(f"schedules-2023/{team}-schedule-2023", index=False)
    print(team, "schedule found, writing to file then sleeping")

Finding schedule for Coppin St.
Coppin St. schedule found, writing to file then sleeping
Finding schedule for Louisville
Louisville schedule found, writing to file then sleeping
Finding schedule for UC Santa Barbara
UC Santa Barbara schedule found, writing to file then sleeping
Finding schedule for Yale
Yale schedule found, writing to file then sleeping
Finding schedule for Tennessee
Tennessee schedule found, writing to file then sleeping
Finding schedule for Oregon
Oregon schedule found, writing to file then sleeping
Finding schedule for Delaware
Delaware schedule found, writing to file then sleeping
Finding schedule for Arkansas
Arkansas schedule found, writing to file then sleeping
Finding schedule for Arizona St.
Arizona St. schedule found, writing to file then sleeping
Finding schedule for Kansas
Kansas schedule found, writing to file then sleeping
Finding schedule for Towson
Towson schedule found, writing to file then sleeping
Finding schedule for FGCU
FGCU schedule found, writin

In [92]:
test = link_to_df("https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=16480&org_id=796&stats_player_seq=-100")

In [93]:
test

Unnamed: 0,date,opponent/venue,result,sets,kills,errors,total_attacks,hit_pct,assists,aces,serr,digs,retatt,rerr,b_solo,b_assist,b_error,pts,bhe
0,08/25/2023,"Baylor @ Minneapolis, MN",3-1,4.0,52.0,10.0,122.0,0.344,48.0,6.0,17.0,44.0,67.0,7.0,1.0,14.0,0.0,66.0,1.0
1,08/26/2023,"TCU @ Minneapolis, MN",3-0,3.0,46.0,13.0,96.0,0.344,40.0,5.0,10.0,39.0,51.0,4.0,2.0,16.0,1.0,61.0,0.0
2,08/30/2023,@ Arkansas,3-2,5.0,70.0,18.0,176.0,0.295,67.0,2.0,10.0,77.0,95.0,9.0,2.0,30.0,0.0,89.0,1.0
3,08/31/2023,@ Arkansas,3-0,3.0,50.0,11.0,113.0,0.345,48.0,5.0,4.0,67.0,50.0,4.0,3.0,8.0,1.0,62.0,0.0
4,09/03/2023,Tennessee,3-2,5.0,62.0,17.0,147.0,0.306,60.0,3.0,7.0,57.0,96.0,12.0,3.0,26.0,2.0,81.0,0.0
5,09/07/2023,Arizona,3-0,3.0,47.0,14.0,104.0,0.317,44.0,11.0,10.0,37.0,54.0,2.0,0.0,14.0,1.0,65.0,2.0
6,09/08/2023,Miami (FL),3-0,3.0,39.0,8.0,91.0,0.341,37.0,3.0,7.0,36.0,45.0,1.0,1.0,14.0,1.0,50.0,0.0
7,09/13/2023,@ Marquette,3-1,4.0,60.0,19.0,168.0,0.244,58.0,5.0,18.0,72.0,92.0,8.0,1.0,22.0,1.0,77.0,1.0
8,09/17/2023,@ Florida,3-2,5.0,56.0,23.0,166.0,0.199,49.0,4.0,12.0,70.0,93.0,7.0,1.0,14.0,2.0,68.0,5.0
9,09/22/2023,@ Northwestern,3-0,3.0,41.0,8.0,90.0,0.367,36.0,8.0,10.0,42.0,38.0,1.0,3.0,10.0,0.0,57.0,0.0
