In [57]:
import urllib3
import pandas as pd
import numpy as np
import string
from datetime import datetime
from bs4 import BeautifulSoup

pd.set_option('display.max_columns', None)
pd.set_option('chained_assignment', None)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
def get_season_schools(year):
    http = urllib3.PoolManager()
    URL = f'https://www.sports-reference.com/cbb/seasons/men/{year}-school-stats.html'
    r = http.request('GET', URL)
    soup = BeautifulSoup(r.data, 'html.parser')

    filename = 'data_file.txt'
    with open(filename, 'w') as f:
        header = ','.join([item.get_text() for item in soup.select("table[id = 'basic_school_stats'] > thead > tr > th")][12:])
        print(header, file=f)
        for item in soup.select("table[id='basic_school_stats'] > tbody > tr"):
            row_data = ','.join( [ item.get_text() for item in item.select("td") ] )
            print(row_data, file=f)

    df = pd.read_csv(filename)
    cols = df.columns
    df.drop(columns = 'PF', inplace = True)
    df.columns = cols[1:]
    
    df.drop(columns = ['\xa0','\xa0.1','\xa0.2','\xa0.3','\xa0.4'], inplace = True)
    
    column_names = {
        'School':'school',
        'G':'games',
        'W':'wins',
        'L':'losses',
        'W-L%':'win_loss_percent',
        'SRS':'simple_rating_system',
        'SOS':'strength_of_schedule',
        'W.1':'conference_wins',
        'L.1':'conference_losses',
        'W.2':'home_wins',
        'L.2':'home_losses',
        'W.3':'away_wins',
        'L.3':'away_losses',
        'Tm.':'points_for',
        'Opp.':'points_against',
        'MP':'minutes_played',
        'FG':'field_goals',
        'FGA':'field_goals_attempted',
        'FG%':'field_goal_percent',
        '3P':'three_pointers',
        '3PA':'three_pointers_attempted',
        '3P%':'three_pointer_percentage',
        'FT':'free_throws',
        'FTA':'free_throws_attempted',
        'FT%':'free_throw_percentage',
        'ORB':'offensive_rebounds',
        'TRB':'total_rebounds',
        'AST':'assists',
        'STL':'steals',
        'BLK':'blocks',
        'TOV':'turnovers',
        'PF':'personal_fouls'
        }

    df.columns = df.columns.map(column_names)
    
    schools = df['school'].unique()
    school_dict = {}
    for school in schools:
        school_dict[school] = school.lower().translate(str.maketrans('','',string.punctuation)).replace(' ','-')
    
    return df, school_dict

In [3]:
df, school_dict = get_season_schools(2025)
df

Unnamed: 0,school,games,wins,losses,win_loss_percent,simple_rating_system,strength_of_schedule,conference_wins,conference_losses,home_wins,home_losses,away_wins,away_losses,points_for,points_against,minutes_played,field_goals,field_goals_attempted,field_goal_percent,three_pointers,three_pointers_attempted,three_pointer_percentage,free_throws,free_throws_attempted,free_throw_percentage,offensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls
0,Abilene Christian,22,9,13,0.409,-8.25,-0.56,1,6,6,4,2,9,1533,1569,885,554,1248,0.444,103,342,0.301,322,464,0.694,245,742,285,216,62,333,441
1,Air Force,23,3,20,0.130,-6.73,1.80,0,12,2,10,0,10,1454,1650,925,496,1144,0.434,185,535,0.346,277,436,0.635,178,691,303,131,70,308,401
2,Akron,22,17,5,0.773,4.16,-3.74,10,0,11,0,5,3,1846,1604,885,670,1454,0.461,248,691,0.359,258,344,0.750,280,895,407,170,81,273,402
3,Alabama,22,19,3,0.864,25.22,13.31,8,1,10,1,6,1,1985,1723,885,669,1405,0.476,224,667,0.336,423,597,0.709,307,985,375,137,114,291,406
4,Alabama A&M,22,7,15,0.318,-19.66,-7.72,3,6,7,6,0,9,1694,1798,900,585,1423,0.411,185,574,0.322,339,508,0.667,339,848,317,192,95,362,478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,Wright State,25,12,13,0.480,-3.45,-2.58,6,8,8,3,3,8,1910,1865,1010,709,1472,0.482,206,550,0.375,286,407,0.703,236,888,366,147,73,300,434
360,Wyoming,23,11,12,0.478,1.16,4.07,4,8,7,4,3,6,1581,1622,925,562,1271,0.442,182,516,0.353,275,406,0.677,242,807,271,100,64,301,437
361,Xavier,23,14,9,0.609,14.31,7.62,6,6,11,2,2,6,1767,1613,930,594,1306,0.455,187,488,0.383,392,501,0.782,199,784,380,172,67,266,371
362,Yale,19,13,6,0.684,7.82,-3.40,6,0,7,0,4,5,1567,1311,760,567,1161,0.488,150,377,0.398,283,379,0.747,229,765,327,112,72,214,326


In [4]:
def players_per_game(year):
    http = urllib3.PoolManager()
    URL = f'https://www.sports-reference.com/cbb/schools/abilene-christian/men/{year}.html'
    r = http.request('GET', URL)
    soup = BeautifulSoup(r.data, 'html.parser')

    filename = 'players_per_game.txt'
    with open(filename, 'w') as f:
        header = ','.join([item.get_text() for item in soup.select("table[id = 'players_per_game'] > thead > tr > th")])
        print(header, file=f)
        for item in soup.select("table[id='players_per_game'] > tbody > tr"):
            row_data = ','.join( [ item.get_text() for item in item.select("td") ] )
            print(row_data, file=f)

    df = pd.read_csv(filename)
    cols = df.columns
    df.drop(columns = 'Awards', inplace = True)
    df.columns = cols[1:]
    
    return df

In [5]:
player_df = players_per_game(2025)

In [6]:
player_df

Unnamed: 0,Player,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards
0,Quion Williams,G,20,20,29.7,5.3,11.0,0.479,0.7,1.8,0.361,4.6,9.2,0.503,0.509,2.7,4.0,0.671,1.9,3.8,5.6,2.8,0.9,0.7,3.0,2.2,13.8,
1,Leonardo Bettiol,F,22,21,26.0,5.2,9.3,0.556,0.0,0.2,0.2,5.1,9.1,0.565,0.559,3.3,4.7,0.709,1.6,2.3,3.9,1.2,1.1,0.4,2.0,3.2,13.7,
2,Bradyn Hubbard,F,21,13,21.4,3.2,6.4,0.5,0.5,1.2,0.44,2.7,5.2,0.514,0.541,1.8,2.1,0.822,1.9,3.4,5.3,0.8,1.3,0.2,1.1,2.3,8.7,
3,Hunter-Jack Madden,G,22,19,26.0,2.4,7.4,0.321,1.5,5.1,0.295,0.9,2.3,0.38,0.423,1.6,1.8,0.923,0.5,1.4,2.0,2.0,1.0,0.1,2.1,1.0,7.9,
4,Dontrez Williams,G,22,0,16.1,2.1,5.0,0.414,0.4,1.6,0.229,1.7,3.5,0.5,0.45,1.5,2.3,0.647,1.0,1.8,2.8,0.5,1.2,0.3,1.1,1.8,6.0,
5,Rich Smith,G,21,13,19.8,1.5,3.0,0.516,0.0,0.0,0.0,1.5,2.9,0.525,0.516,1.7,2.8,0.593,0.2,1.8,2.0,2.4,1.1,0.2,1.5,2.5,4.7,
6,Christian Alston,G,14,0,9.5,1.4,3.7,0.385,0.5,1.3,0.389,0.9,2.4,0.382,0.452,1.2,1.6,0.739,0.9,1.1,2.0,0.6,0.7,0.3,0.8,0.9,4.6,
7,Nasir DeGruy,G,22,9,19.0,1.5,4.4,0.344,0.1,1.0,0.143,1.4,3.4,0.4,0.359,0.6,0.8,0.722,0.3,1.4,1.7,1.4,1.4,0.1,1.7,1.9,3.7,
8,Cade Hornecker,C,21,1,9.0,1.4,3.0,0.469,0.1,0.4,0.222,1.3,2.6,0.509,0.484,0.3,0.6,0.583,0.8,1.2,2.0,0.4,0.2,0.2,0.5,1.6,3.3,
9,Yaniel Rivera,G,22,7,13.4,1.0,3.2,0.3,0.6,2.3,0.275,0.3,0.9,0.368,0.4,0.2,0.4,0.625,0.2,1.4,1.5,1.0,0.6,0.0,0.5,0.9,2.8,


In [7]:
# opp_df = df.copy()
# opp_df.columns = ['opp_' + item for item in df.columns]

# pd.concat([df[df['school'] == 'Abilene Christian'].reset_index(drop = True), opp_df[opp_df['opp_school'] == 'Akron'].reset_index(drop = True)], axis = 1)

In [8]:
def add_player_stats_to_team(team_df):
    top_ten_points = list(player_df.sort_values('PTS', ascending = False)['PTS'][0:10])
    top_ten_rebounds = list(player_df.sort_values('TRB', ascending = False)['TRB'][0:10])
    top_ten_fg_per = list(player_df.sort_values('FG%', ascending = False)['FG%'][0:10])
    
    for i in range(10):
        team_df[f'top_points_{i+1}'] = top_ten_points[i]
    for i in range(10):
        team_df[f'top_rebounds_{i+1}'] = top_ten_rebounds[i]
    for i in range(10):
        team_df[f'top_fg_per_{i+1}'] = top_ten_fg_per[i]
        
    return team_df

In [9]:
abi_df = df[df['school'] == 'Abilene Christian']
add_player_stats_to_team(abi_df)

Unnamed: 0,school,games,wins,losses,win_loss_percent,simple_rating_system,strength_of_schedule,conference_wins,conference_losses,home_wins,home_losses,away_wins,away_losses,points_for,points_against,minutes_played,field_goals,field_goals_attempted,field_goal_percent,three_pointers,three_pointers_attempted,three_pointer_percentage,free_throws,free_throws_attempted,free_throw_percentage,offensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,top_points_1,top_points_2,top_points_3,top_points_4,top_points_5,top_points_6,top_points_7,top_points_8,top_points_9,top_points_10,top_rebounds_1,top_rebounds_2,top_rebounds_3,top_rebounds_4,top_rebounds_5,top_rebounds_6,top_rebounds_7,top_rebounds_8,top_rebounds_9,top_rebounds_10,top_fg_per_1,top_fg_per_2,top_fg_per_3,top_fg_per_4,top_fg_per_5,top_fg_per_6,top_fg_per_7,top_fg_per_8,top_fg_per_9,top_fg_per_10
0,Abilene Christian,22,9,13,0.409,-8.25,-0.56,1,6,6,4,2,9,1533,1569,885,554,1248,0.444,103,342,0.301,322,464,0.694,245,742,285,216,62,333,441,13.8,13.7,8.7,7.9,6.0,4.7,4.6,3.7,3.3,2.8,5.6,5.3,4.0,3.9,2.8,2.0,2.0,2.0,2.0,1.7,0.611,0.556,0.516,0.5,0.5,0.479,0.469,0.414,0.4,0.387


In [96]:
def get_home_team_results():
    http = urllib3.PoolManager()
    URL = f'https://www.sports-reference.com/cbb/schools/abilene-christian/men/2025-schedule.html'
    r = http.request('GET', URL)
    soup = BeautifulSoup(r.data, 'html.parser')

    filename = 'schedule.txt'
    with open(filename, 'w') as f:
        header = ','.join([item.get_text() for item in soup.select("table[id = 'schedule'] > thead > tr > th")])
        print(header, file=f)
        for item in soup.select("table[id='schedule'] > tbody > tr"):
            row_data = ','.join( [ item.get_text() for item in item.select("td") ] )
            print(row_data, file=f)

    df = pd.read_csv(filename)
    df.rename(columns = {'\xa0':'home_away',
                         '\xa0.1':'w_l',
                        'Tm':'home_team_pts',
                        'Opp':'road_team_pts',
                        'Opponent':'road_team'}, inplace = True)
    df = df.loc[(df['home_away'].isin([np.nan,'N'])) & (pd.isna(df['w_l']) == False)]
    df['home_team'] = abi_df['school'][0]
    df = df[['home_team','road_team','Conf','w_l','home_team_pts','road_team_pts']].reset_index(drop = True)
    
    return df

In [99]:
df = get_home_team_results()
df

Unnamed: 0,home_team,road_team,Conf,w_l,home_team_pts,road_team_pts
0,Abilene Christian,Howard Payne,,W,107.0,74.0
1,Abilene Christian,Middle Tennessee,CUSA,L,56.0,79.0
2,Abilene Christian,McMurry,,W,101.0,55.0
3,Abilene Christian,Texas State,Sun Belt,W,72.0,60.0
4,Abilene Christian,Southern Mississippi,Sun Belt,W,82.0,74.0
5,Abilene Christian,Hardin-Simmons,,W,93.0,62.0
6,Abilene Christian,Texas Southern,SWAC,W,69.0,65.0
7,Abilene Christian,Stephen F. Austin,Southland,L,57.0,62.0
8,Abilene Christian,California Baptist,WAC,L,54.0,60.0
9,Abilene Christian,Tarleton State,WAC,W,67.0,56.0


In [101]:
http = urllib3.PoolManager()
URL = f'https://www.sports-reference.com/cbb/boxscores/index.cgi?month=02&day=7&year=2025'
r = http.request('GET', URL)
soup = BeautifulSoup(r.data, 'html.parser')

filename = 'today_games.txt'
with open(filename, 'w') as f:
    header = ','.join([item.get_text() for item in soup.select("table[class = 'teams']")])
    print(header, file=f)
    for item in soup.select("table[class = 'teams'] > tbody > tr"):
        row_data = ','.join( [ item.get_text() for item in item.select("td") ] )
        print(row_data, file=f)

df = pd.read_csv(filename)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 17, saw 2


In [102]:
with open(filename, 'w') as f:
    header = ','.join([item.get_text() for item in soup.select("table[class = 'teams']")])

In [111]:
header.replace('\n','').split(',')[0].replace('\xa0','')

"USCPurdue(7)7:00p\t\t\tMen's"

In [None]:
!pip install cbbpy
