In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

In [4]:
# Scrape the data

# URL of the webpage you want to scrape
url = 'https://umsltritons.com/sports/mens-basketball/stats/2023-24'

# Fetch the content of the page. The timeout keeps the cell from hanging
# indefinitely on an unresponsive server, and raise_for_status() prevents an
# HTTP error page from being parsed as if it were the stats page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Locate the section with id 'individual-overall'
section = soup.find('section', {'id': 'individual-overall'})

# Column names, in the order the <td> cells appear in each table row.
# 'Minutes Played' must match the label used by the cleaning step below.
columns = [
    'Number', 'Player', 'Games Played', 'GS', 'Minutes Played', 'Min Per Game', 'FGM', 'FGA',
    'FG%', '3PTM', '3PTA', '3PT%',
    'FTM', 'FTA', 'FT%', 'PTS', 'PPG', 'OREB',
    'DREB', 'REB', 'AVG REB', 'PF', 'A', 'TO', 'STL', 'BLK', 'View Bio'
]

# Collect the rows in a plain list and build the DataFrame once at the end;
# concatenating inside the loop re-copies the whole frame for every row.
rows = []

# If the section is found, locate the table within it
if section:
    table = section.find('table', {'class': 'sidearm-table'})

    # If the table is found, extract the data
    if table:
        for row in table.find_all('tr'):
            columns_data = [col.get_text(strip=True) for col in row.find_all('td')]
            if not columns_data:
                # Rows without any <td> cells carry no data
                continue
            if len(columns_data) != len(columns):
                # Fail loudly instead of silently misaligning values and labels
                raise ValueError(
                    f"Expected {len(columns)} cells per row but found {len(columns_data)}: {columns_data}"
                )
            rows.append(columns_data)

# One row per scraped table row; an empty frame (with the expected columns)
# when the section or table was not found.
df = pd.DataFrame(rows, columns=columns)

# Display the DataFrame
#df


In [7]:
# Clean the Player Stats table
def process_stats(df):
    """Return a cleaned copy of the scraped stats table with derived metrics added.

    The counting-stat columns used in the formulas are converted to float, five
    derived columns are appended ('eFG%', 'TS%', 'A/TO', 'Usage Rate',
    'Box +/-', each rounded to 2 decimals), and the result is restricted to a
    fixed column order (which drops 'View Bio').

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing every column listed in ``ordered_columns`` below.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the original stat columns plus the derived metrics.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If a counting-stat value cannot be converted to float.
    """
    # Columns that take part in the arithmetic below
    numeric_columns = [
        'FGM', '3PTM', 'FGA', 'PTS', 'FTA', 'A', 'TO',
        'REB', 'STL', 'BLK', 'PF', 'Minutes Played',
    ]
    # astype returns a new frame, so the caller's DataFrame is left untouched
    df = df.astype({column: float for column in numeric_columns})

    # Calculate Advanced Statistics
    # eFG% (Effective Field Goal Percentage): Measures shooting efficiency, taking into account 3-pointers. Formula: (FGM + 0.5*3PM) / FGA
    # 0 attempts gives 0/0 = NaN, which is reported as 0.0
    df['eFG%'] = ((df['FGM'] + 0.5 * df['3PTM']) / df['FGA']).fillna(0.0).round(2)

    # TS% (True Shooting Percentage): Measures shooting efficiency, taking into account 3-pointers and free throws. Formula: PTS / (2*(FGA + 0.44*FTA))
    df['TS%'] = (df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA']))).fillna(0.0).round(2)

    # Assist-to-Turnover Ratio: Measures the number of assists per turnover
    # Reported as 0 where no turnovers were committed (avoids division by zero)
    df['A/TO'] = np.where(df['TO'] == 0, 0, df['A'] / df['TO'])
    df['A/TO'] = df['A/TO'].round(2)

    # Usage Rate (as defined in this notebook): plays used per 40 minutes played.
    # Reported as 0 where no minutes were recorded, consistent with the other
    # ratios, instead of leaking NaN/inf into the output.
    plays_used = df['FGA'] + df['FTA'] + df['A'] + df['TO']
    df['Usage Rate'] = np.where(
        df['Minutes Played'] == 0, 0, plays_used / (df['Minutes Played'] / 40)
    )
    df['Usage Rate'] = df['Usage Rate'].round(2)

    # Box Plus/Minus (as defined in this notebook): positive contributions minus
    # possessions used and fouls, per minute played.
    # Reported as 0 where no minutes were recorded (avoids division by zero)
    positives = df['PTS'] + df['REB'] + df['A'] + df['STL'] + df['BLK']
    negatives = df['FGA'] + df['FTA'] + df['TO'] + df['PF']
    df['Box +/-'] = np.where(
        df['Minutes Played'] == 0, 0, (positives - negatives) / df['Minutes Played']
    )
    df['Box +/-'] = df['Box +/-'].round(2)

    # Place columns in a specific order
    ordered_columns = [
        'Number', 'Player', 'Games Played', 'GS', 'Minutes Played', 'Min Per Game', 'FGM', 'FGA',
        'FG%', '3PTM', '3PTA', '3PT%',
        'FTM', 'FTA', 'FT%', 'PTS', 'PPG', 'OREB',
        'DREB', 'REB', 'AVG REB', 'PF', 'A', 'TO', 'STL', 'BLK',
        'eFG%', 'TS%', 'A/TO', 'Usage Rate', 'Box +/-',
    ]
    df = df[ordered_columns]

    return df

def clean_name(name):
    """Convert a scraped player label to "First Last".

    Everything from the first digit onward is discarded, and the remaining
    "Last, First" text is reordered.

    Parameters
    ----------
    name : str
        Raw text of the player cell.

    Returns
    -------
    str
        "First Last" when the label contains a digit preceded by a
        comma-separated name. When a digit is present but the text before it
        has no comma, that leading text is returned (or the original label if
        nothing precedes the digit). Labels without any digit are returned
        unchanged.
    """
    # Find the position of the first digit
    match = re.search(r'\d', name)
    if match is None:
        # In case no number is found, just return the original name
        return name

    # Keep only the text in front of the number
    leading = name[:match.start()].strip()

    # partition never raises, unlike unpacking split(',', 1) on comma-less text
    last_name, comma, first_name = leading.partition(',')
    if not comma:
        return leading or name

    # Format as "First Last", skipping an empty half so no stray space is emitted
    return " ".join(part for part in (first_name.strip(), last_name.strip()) if part)

# Run the cleaning step on the scraped table
seasonStats = process_stats(df)
# Rewrite every entry of the 'Player' column with clean_name
seasonStats['Player'] = seasonStats['Player'].map(clean_name)
# The final two rows become the overall summary table
overallSeasonStats = seasonStats.tail(2)
# The season stats table keeps everything except the final three rows
seasonStats = seasonStats.head(-3)
# Show the cleaned dataset
seasonStats

Unnamed: 0,Number,Player,Games Played,GS,Minutes Played,Min Per Game,FGM,FGA,FG%,3PTM,...,PF,A,TO,STL,BLK,eFG%,TS%,A/TO,Usage Rate,Box +/-
0,3,Matt Enright,29,29,1037.0,35.8,149.0,334.0,0.446,57.0,...,56.0,94.0,50.0,35.0,4.0,0.53,0.56,1.88,20.68,0.14
1,15,Mayson Quartlebaum,27,24,776.0,28.7,114.0,219.0,0.521,26.0,...,46.0,36.0,41.0,16.0,10.0,0.58,0.61,0.88,19.79,0.14
2,12,Emanuel Prospere II,28,14,790.0,28.2,127.0,238.0,0.534,13.0,...,87.0,66.0,84.0,30.0,8.0,0.56,0.58,0.79,23.19,0.05
3,35,Troy Glover II,29,21,781.0,26.9,123.0,221.0,0.557,0.0,...,58.0,37.0,42.0,25.0,49.0,0.56,0.58,0.88,19.31,0.31
4,20,Savon Wykle,25,21,767.0,30.7,82.0,184.0,0.446,42.0,...,50.0,34.0,37.0,13.0,16.0,0.56,0.59,0.92,15.75,0.1
5,10,Janeir Harris,4,4,100.0,25.0,13.0,34.0,0.382,3.0,...,9.0,7.0,9.0,1.0,3.0,0.43,0.48,0.78,23.2,0.04
6,21,Terrell Kabala,29,24,796.0,27.4,81.0,184.0,0.44,36.0,...,71.0,42.0,41.0,24.0,2.0,0.54,0.56,1.02,15.73,0.03
7,5,Kris O'Neal II,29,7,540.0,18.6,33.0,72.0,0.458,9.0,...,42.0,35.0,27.0,18.0,2.0,0.52,0.56,1.3,11.85,0.07
8,4,Sam Bledsoe,20,1,151.0,7.6,8.0,36.0,0.222,8.0,...,14.0,3.0,6.0,0.0,1.0,0.33,0.33,0.5,11.92,-0.13
9,13,Georden Rogers,12,0,63.0,5.3,5.0,21.0,0.238,3.0,...,4.0,0.0,6.0,2.0,1.0,0.31,0.34,0.0,18.41,-0.06


In [8]:
# The summary rows have no jersey number, so remove that column.
# errors='ignore' keeps this cell re-runnable: once 'Number' is gone, a second
# execution is a no-op instead of raising KeyError.
overallSeasonStats = overallSeasonStats.drop(columns='Number', errors='ignore')
overallSeasonStats

Unnamed: 0,Player,Games Played,GS,Minutes Played,Min Per Game,FGM,FGA,FG%,3PTM,3PTA,...,PF,A,TO,STL,BLK,eFG%,TS%,A/TO,Usage Rate,Box +/-
12,Total,29,,5875.0,202.6,736.0,1551.0,0.475,197.0,559,...,447.0,354.0,364.0,166.0,96.0,0.54,0.57,0.97,18.4,0.12
13,Opponents,29,,5875.0,202.6,747.0,1585.0,0.471,247.0,641,...,408.0,372.0,318.0,184.0,80.0,0.55,0.58,1.17,18.46,0.14


In [10]:
# Write both cleaned tables to disk as CSV, without the index column
for frame, destination in (
    (seasonStats, '../data/men/clean/23-24/seasonStats.csv'),
    (overallSeasonStats, '../data/men/clean/23-24/overallSeasonStats.csv'),
):
    frame.to_csv(destination, index=False)
