In [28]:
# Import Packages
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Fetches the HTML content from the given URL
def get_html(url, timeout=10):
    """Download the page at ``url`` and return its HTML as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as a string when the server answers with status
        code 200; otherwise ``None`` (a message describing the failure is
        printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code so callers only have to handle the ``None`` case.
        print(f"Failed to fetch content from {url}. Error: {exc}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to fetch content from {url}. Status code: {response.status_code}")
    return None

# Finds all tables on page, returns list 
def find_all_tables(html_content):
    """Parse ``html_content`` and convert every ``<table>`` element found in it.

    Returns:
        A list with one entry per ``<table>`` element, each entry being the
        result of ``extract_table_data`` for that element.
    """
    parsed_page = BeautifulSoup(html_content, 'html.parser')
    frames = []
    for table_tag in parsed_page.find_all('table'):
        frames.append(extract_table_data(table_tag))
    return frames
    
# Extracts the contents of the given table and returns as a list of lists  
def extract_table_data(table):
    """Turn one table element into a DataFrame of stripped cell text.

    Each ``tr`` found under ``table`` becomes one row, holding the
    whitespace-stripped text of every ``th``/``td`` found under that row.
    A falsy ``table`` yields an empty DataFrame.
    """
    if not table:
        return pd.DataFrame([])

    extracted_rows = [
        [cell.text.strip() for cell in table_row.find_all(['th', 'td'])]
        for table_row in table.find_all('tr')
    ]
    return pd.DataFrame(extracted_rows)

# Fetches the page at the given URL and returns the tables found on it (nothing is written to .csv here)
def export_barttovic(url: str):
    """Fetch ``url`` and return every table on the page.

    Despite the name, nothing is written to disk: the tables are only
    parsed and handed back to the caller.

    Args:
        url: Address of the page to scrape.

    Returns:
        The list produced by ``find_all_tables`` for the page. When the page
        could not be fetched (or came back empty) a message is printed and an
        empty list is returned, so callers can safely take ``len()`` of the
        result or iterate over it.
    """
    html_content = get_html(url)
    if not html_content:
        print("Export of barttovic data did not yield table.")
        # Return an empty list rather than falling off the end with None.
        return []
    return find_all_tables(html_content)

In [186]:
# Scrape the tables from the Houston 2023 team page.
# Plain string literal: the URL contains nothing to interpolate, so no f-prefix is needed.
dfs = export_barttovic('https://barttorvik.com/team.php?team=Houston&year=2023')

In [209]:
# Loop Through DFs On Team Page
import numpy as np
import re
 
# Raise the pandas display limit to 500 columns for DataFrame output in this notebook.
pd.set_option('display.max_columns', 500)

def replace_non_numeric(value):
    """Convert ``value`` to ``float``, mapping anything non-numeric to NaN.

    Args:
        value: A scalar cell value (typically the text of a table cell).

    Returns:
        ``float(value)`` when the conversion succeeds; otherwise NaN. This
        covers empty strings, ``None`` and arbitrary text. NaN (not
        ``pd.NaT``, which is the datetime missing marker) keeps the resulting
        column numeric.
    """
    try:
        # No truthiness check first: 0 is a valid number and must not be
        # turned into a missing value.
        return float(value)
    except (ValueError, TypeError):
        return float("nan")

def extract_between_parentheses(input_string):
    """Return the text inside the first ``(...)`` pair of ``input_string``.

    The input is converted with ``str()`` before searching. When no
    parenthesised group is present the original argument is returned
    unchanged (not its string form).
    """
    found = re.search( r'\((.*?)\)', str(input_string))
    return input_string if found is None else found.group(1)
        
def percent_str_to_float(value):
    """Convert a percent string such as ``'45.2%'`` to a fraction (``0.452``).

    Args:
        value: A scalar cell value.

    Returns:
        The numeric part divided by 100 when ``value`` is a string containing
        ``'%'`` whose remainder parses as a float. Any other input is returned
        unchanged: non-strings, strings without ``'%'``, and strings such as
        ``'eFG%'`` that do not parse.
    """
    # Guard non-strings first: ``'%' in value`` raises TypeError for None,
    # NaN and plain numbers, which can all appear in scraped tables.
    if not isinstance(value, str) or '%' not in value:
        return value
    try:
        return float(value.rstrip('%')) / 100
    except ValueError:
        return value
 
def clean_schedule(value):
    """Remove newline characters from a schedule cell.

    Args:
        value: A scalar cell value.

    Returns:
        ``value`` with every ``'\\n'`` removed when it is a string; any other
        input is returned unchanged.
    """
    # The previous version tested for the two characters "/n" instead of a
    # newline and returned None for every value that did not contain them.
    if isinstance(value, str):
        return value.replace('\n', '')
    return value
         
# Column layouts for the sub-tables sliced out below. They are defined once,
# outside the loop, because they do not change between iterations.
cols_od = ['Category', 'Offense_Val', 'Offense_Rank',  'Defense_Val', 'Defense_Rank']
cols_val_rank = ['Category', 'Val', 'Rank']
cols_conf = ['Category', 'Non_Con_Val', 'Non_Con_Rank',  'Con_Val', 'Con_Rank']
# The two "*_Val" columns of cols_conf (positions 1 and 3) are run through percent_str_to_float.
cols_conf_val = [cols_conf[1], cols_conf[3]]

# Fail early with a clear message instead of a NameError (or a stale
# df_schedule left over from an earlier run) when the page did not yield the
# three tables this cell relies on.
if dfs is None or len(dfs) < 3:
    raise ValueError("Expected at least 3 tables (team stats, fun stuff, schedule) on the team page.")

# Loop Through Received Tables
# Tables are identified purely by their position in dfs: 0, 1, 2.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour of
# DataFrame.map; it is kept here so the cell also runs on older pandas.
for i, df in enumerate(dfs):
    # Team Stats Table
    if i == 0:
        # Drop the first two rows (presumably header rows — confirm against the page).
        df = df.iloc[2:]
        # Keep three row ranges of the first five columns and stack them.
        df_team_stats = pd.concat([df.iloc[:5, :5], df.iloc[7:10, :5], df.iloc[12:14, :5]])
        df_team_stats.columns = cols_od
        # Every column except 'Category' is converted to a number.
        df_team_stats[cols_od[-4:]] = df_team_stats[cols_od[-4:]].applymap(replace_non_numeric)
    # Fun Stuff Table
    elif i == 1:
        # Rows 1-2: offense/defense layout.
        df_team_fun_od = df.iloc[1:3, :5].copy()
        df_team_fun_od.columns = cols_od

        # Rows 3-5: value/rank layout; keep only the parenthesised part of 'Val' when present.
        df_team_fun_luck = df.iloc[3:6, :3].copy()
        df_team_fun_luck.columns = cols_val_rank
        df_team_fun_luck[[cols_val_rank[1]]] = df_team_fun_luck[[cols_val_rank[1]]].applymap(extract_between_parentheses)

        # Rows 8-9: non-conference/conference layout; percent strings become fractions.
        df_team_fun_sos = df.iloc[8:10, :5].copy()
        df_team_fun_sos.columns = cols_conf
        df_team_fun_sos[cols_conf_val] = df_team_fun_sos[cols_conf_val].applymap(percent_str_to_float)
    # Schedule Table
    elif i == 2:
        # Copy the slice so later column assignments act on an independent frame.
        df_schedule = df.iloc[2:].copy()
        # Row 1 of the raw table supplies the column labels.
        df_schedule.columns = df.iloc[1]
        # df_schedule[["Date"]] = df_schedule[["Date"]].applymap(clean_schedule)

df_schedule.head(1)

1,Date,Unnamed: 2,Opponent,Result/Line,Record,WAB,AdjO,AdjD,EFF,eFG%,TO%,OR%,FTR,2P,3P,EFF.1,eFG%.1,TO%.1,OR%.1,FTR.1,2P.1,3P.1,G-Sc,+/-,None,None.1,None.2,None.3,None.4,None.5
2,Mon 11-07\n11-07,H,6,232 (Ⅳ),,Northern Colorado,NColo,"W, 83-36",64,1-0,,0.1,111.5,66.5,130.3,52.1,12.6,45.2,13.9,21-43,11-29,56.5,32.1,36.1,25.0,41.0,5-18,5-21,100,14.3
3,Fri 11-11\n11-11,N,5,179 (Ⅲ),,Saint Joseph's,St.Joes,"W, 81-55",69,2-0,,0.2,117.4,80.3,117.9,54.6,13.1,32.4,24.6,28-39,5-26,80.0,34.3,23.3,34.1,50.0,11-30,5-24,99,15.8
4,Mon 11-14\n11-14,H,4,83 (Ⅱ),,Oral Roberts,ORU,"W, 83-45",67,3-0,,0.5,121.9,66.6,123.9,56.9,17.9,46.9,43.1,24-41,6-17,67.2,29.0,16.4,26.0,24.2,6-24,8-38,100,18.2


In [229]:
# Readable labels for the 30 schedule columns, in table order.
# NOTE(review): 'None' is used for two columns, so df_schedule ends up with a
# duplicated column label — confirm this is intended before selecting by that name.
cols_schedule = [
    'Date', 'Location', 'Team_Rank', 'Opponent_Rank', 'None', 'Opponent', 'Opponent_Short',
    'Result', 'Tempo', 'Record', 'None', 'Wab', 'AdjO', 'AdjD',
    # The same seven stat names appear twice: an 'Off_' block followed by a 'Def_' block.
    *[f'{side}_{stat}'
      for side in ('Off', 'Def')
      for stat in ('EFF', 'eFG%', 'TO%', 'OR%', 'FTR', '2P', '3P')],
    'Game_Score', 'Avg_Lead_Deficit',
]

df_schedule.columns = cols_schedule
df_schedule.head(4)

Unnamed: 0,Date,Location,Team_Rank,Opponent_Rank,None,Opponent,Opponent_Short,Result,Tempo,Record,None.1,Wab,AdjO,AdjD,Off_EFF,Off_eFG%,Off_TO%,Off_OR%,Off_FTR,Off_2P,Off_3P,Def_EFF,Def_eFG%,Def_TO%,Def_OR%,Def_FTR,Def_2P,Def_3P,Game_Score,Avg_Lead_Deficit
2,Mon 11-07\n11-07,H,6,232 (Ⅳ),,Northern Colorado,NColo,"W, 83-36",64,1-0,,0.1,111.5,66.5,130.3,52.1,12.6,45.2,13.9,21-43,11-29,56.5,32.1,36.1,25.0,41.0,5-18,5-21,100,14.3
3,Fri 11-11\n11-11,N,5,179 (Ⅲ),,Saint Joseph's,St.Joes,"W, 81-55",69,2-0,,0.2,117.4,80.3,117.9,54.6,13.1,32.4,24.6,28-39,5-26,80.0,34.3,23.3,34.1,50.0,11-30,5-24,99,15.8
4,Mon 11-14\n11-14,H,4,83 (Ⅱ),,Oral Roberts,ORU,"W, 83-45",67,3-0,,0.5,121.9,66.6,123.9,56.9,17.9,46.9,43.1,24-41,6-17,67.2,29.0,16.4,26.0,24.2,6-24,8-38,100,18.2
5,Wed 11-16\n11-16,H,2,293 (Ⅳ),,Texas Southern,TXSO,"W, 83-48",66,4-0,,0.5,118.6,85.6,126.1,55.8,12.2,34.4,33.3,26-41,5-19,72.9,39.8,27.4,27.3,30.6,15-33,3-16,98,14.8
