## This code looks up basketball stats for all college basketball teams for a given span of years and combines them into a data file for modeling. Predicting sports data has long been something that intrigues me as I am a sports fan and especially enjoy basketball

In [1]:
#import packages
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [4]:
#list year for data
years = list(range(2023,2024))
print(years)

[2023]


In [5]:
#set starting URL
url_start= "https://www.sports-reference.com/cbb/seasons/men/{}-school-stats.html"

In [185]:
#Download and save URL for overall stats for given year
for year in years:
    url = url_start.format(year)
    data = requests.get(url)
    
    with open('College Data/Overall/{}.html'.format(year), "w+") as f:
        f.write(data.text)

In [186]:
#open overall stats URL
with open("College Data/Overall/2023.html") as f:
    page = f.read()
    

In [187]:
#Parse URL and pull out data as a pandas data frame
soup = BeautifulSoup(page, "html.parser")
soup.find('tr', class_="over_header").decompose()
stats_table = soup.find(id="basic_school_stats")
all_schools = pd.read_html(str(stats_table))[0]


In [190]:
all_schools["School"][318]

'St Thomas MN'

In [189]:
#remove symbols from school names
all_schools['School'] = all_schools['School'].str.replace("(", "", regex = False)\
.str.replace(")","", regex = False)\
.str.replace("&","")\
.str.replace(".","", regex = False)\
.str.replace("'","", regex = False)\
.str.replace("--", "-", regex = False)\
.str.replace("  ", " ")\
.str.replace("The Citadel" , "Citadel")\
.str.replace("Houston Christian" , "Houston Baptist")\
.str.replace("Kansas City" , "Missouri Kansas City")\
.str.replace("Little Rock" , "Arkansas Little Rock")\
.str.replace("Louisiana" , "Louisiana Lafayette")\
.str.replace("NC State" , "North Carolina State")\
.str.replace("Omaha" , "Nebraska Omaha")\
.str.replace("Purdue-Fort Wayne" , "IPFW")\
.str.replace("SIU Edwardsville" , "Southern Illinois Edwardsville")\
.str.replace("TCU" , "Texas Christian")\
.str.replace("Texas-Rio Grande Valley" , "Texas Pan American")\
.str.replace("UAB" , "Alabama Birmingham")\
.str.replace("UC" , "California")\
.str.replace("UT Arlington" , "Texas Arlington")\
.str.replace("Utah Tech" , "Dixie State")\
.str.replace("UTEP" , "Texas El Paso")\
.str.replace("UTSA" , "Texas San Antonio")\
.str.replace("VMI" , "Virginia Military Institute")\
.str.replace("UNC", "North Carolina")\
.str.replace("CaliforniaLA", "UCLA")\
.str.replace("Louisiana Lafayette State", "Louisiana State")\
.str.replace("Louisiana Lafayette Tech", "Louisiana Tech")\
.str.replace("Southeastern Louisiana Lafayette", "Southeastern Louisiana")\
.str.replace("Louisiana Lafayette-Monroe", "Louisiana Monroe")\
.str.replace("St Thomas", "St Thomas MN")\
.str.replace("\xa0NCAA", "", regex = False)




In [191]:
#drop NAs, convert names to all lowercase, drop name "school"
school_names = all_schools['School']
school_names = school_names.dropna()

school_names = school_names.str.replace(' ', '-').str.lower()
school_names.drop(school_names[school_names == 'school'].index, inplace = True)

In [193]:
#set starting url for obtaining individual school data
school_url_start = "https://www.sports-reference.com/cbb/schools/{}/men/2023-gamelogs.html"

#download URLs for each indiviudal schools season data
for name in school_names:
    url = school_url_start.format(name)
    
    time.sleep(2)
    
    data = requests.get(url)
    
    with open('College Data/Schools/{}.html'.format(name), "w+") as f:
        f.write(data.text)

In [194]:
#Check if any pages have errors, if so correct school names above
with open("College Data/Error.html") as f:
    pageE = f.read()
     

soupE = BeautifulSoup(pageE, "html.parser")


In [195]:
for name in school_names:   
    
    with open("College Data/Schools/{}.html".format(name)) as f:
        page3 = f.read()
        
    soup3 = BeautifulSoup(page3, "html.parser")
    
    if soup3 == soupE:
        print(name)

In [196]:
#setting empty list
full_data = []

In [199]:
# soup = BeautifulSoup(page2, "html.parser")
# soup.find('tr', class_="over_header").decompose()
# stats_table = soup.find(id="div_sgl-basic_NCAAM")
# test = pd.read_html(str(stats_table))[0]

Index(['G', 'Date', 'Unnamed: 2', 'Opp', 'W/L', 'Tm', 'Opp.1', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'TRB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'Unnamed: 23', 'FG.1', 'FGA.1', 'FG%.1',
       '3P.1', '3PA.1', '3P%.1', 'FT.1', 'FTA.1', 'FT%.1', 'ORB.1', 'TRB.1',
       'AST.1', 'STL.1', 'BLK.1', 'TOV.1', 'PF.1'],
      dtype='object')

In [201]:
#using school URLs to obtain school season game data and append it together
for name in school_names:
    
    
    with open("College Data/Schools/{}.html".format(name)) as f:
        page2 = f.read()
        
    soup = BeautifulSoup(page2, "html.parser")
    soup.find('tr', class_="over_header").decompose()
    stats_table = soup.find(id="div_sgl-basic_NCAAM")
    data = pd.read_html(str(stats_table))[0]
    
    data.drop(data[(data['FG%'] == "School") | (data['FG%'] == "FG%")].index, inplace=True)
    data['Unnamed: 2'] = data['Unnamed: 2'].replace("@","A").fillna("H")
    data = data.drop(columns=['Unnamed: 23']).dropna()
    data = data.drop(columns=['Unnamed: 2'])
    
    full_data.append(data)



In [203]:
full_data = pd.concat(full_data)

In [232]:
full_data.columns

Index(['G', 'Date', 'Opp', 'W/L', 'Tm', 'Opp.1', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'FG.1', 'FGA.1', 'FG%.1', '3P.1', '3PA.1', '3P%.1', 'FT.1',
       'FTA.1', 'FT%.1', 'ORB.1', 'TRB.1', 'AST.1', 'STL.1', 'BLK.1', 'TOV.1',
       'PF.1'],
      dtype='object')

In [234]:
full_data.dtypes

G        object
Date     object
Opp      object
W/L      object
Tm       object
Opp.1    object
FG       object
FGA      object
FG%      object
3P       object
3PA      object
3P%      object
FT       object
FTA      object
FT%      object
ORB      object
TRB      object
AST      object
STL      object
BLK      object
TOV      object
PF       object
FG.1     object
FGA.1    object
FG%.1    object
3P.1     object
3PA.1    object
3P%.1    object
FT.1     object
FTA.1    object
FT%.1    object
ORB.1    object
TRB.1    object
AST.1    object
STL.1    object
BLK.1    object
TOV.1    object
PF.1     object
dtype: object

In [233]:
full_data['eFG'] = (full_data["FG"] + (0.5 * full_data['3P'])) / full_data['FGA']

TypeError: can't multiply sequence by non-int of type 'float'

In [204]:
full_data.to_csv("College Data/All Teams Data.csv", index = False)

Index(['G', 'Date', 'Opp', 'W/L', 'Tm', 'Opp.1', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'FG.1', 'FGA.1', 'FG%.1', '3P.1', '3PA.1', '3P%.1', 'FT.1',
       'FTA.1', 'FT%.1', 'ORB.1', 'TRB.1', 'AST.1', 'STL.1', 'BLK.1', 'TOV.1',
       'PF.1'],
      dtype='object')

Index(['G', 'Date', 'Unnamed: 2', 'Opp', 'W/L', 'Tm', 'Opp.1', 'ORtg', 'DRtg',
       'Pace', 'FTr', '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'Unnamed: 17', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'Unnamed: 22',
       'eFG%.1', 'TOV%.1', 'DRB%', 'FT/FGA.1'],
      dtype='object')