In [None]:
# Install the project's dependencies. The explicit %pip magic is used instead of a
# bare `pip ...` line, which only works through IPython's automagic (single-line
# cells, and only while no variable named `pip` exists in the namespace).
%pip install -r ../requirements.txt

In [2]:
import pandas as pd
from tabula import read_pdf
import numpy as np
import os

In [41]:
# Clean the Player Stats tables for both teams
def process_stats(df):
    """Clean one team's raw box-score rows and add derived per-player metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the table parsed from a box-score PDF. The columns
        '# Player', 'GS MIN', 'FG', '3PT', 'FT ORB-DRB', 'REB', 'PF',
        'A TO BLK', 'STL' and 'PTS' are required. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row (original index kept) with the columns
        'Player', 'MIN', 'PTS', 'FGM', 'FGA', '3PTM', '3PTA', 'FTM', 'FTA',
        'A', 'REB', 'ORB', 'DRB', 'TO', 'BLK', 'STL', 'PF', 'eFG%', 'TS%',
        'A/TO', 'Usage Rate', 'Box +/-', in that order. All stats are floats.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a value cannot be converted to a number, or a combined cell does
        not split into the expected number of pieces.
    """
    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    # Remove columns that contain only NaN values
    df = df.dropna(axis=1, how='all')

    # 'FT ORB-DRB' holds two space-separated fields; the second one is a
    # dash-separated offensive/defensive rebound pair
    df[['FT', 'ORB-DRB']] = df['FT ORB-DRB'].str.split(' ', expand=True)
    df[['ORB', 'DRB']] = df['ORB-DRB'].str.split('-', expand=True).astype(float)

    # Split FG, 3PT, and FT ("made-attempted") into made and attempted columns
    for source, made, attempted in (
        ('FG', 'FGM', 'FGA'),
        ('3PT', '3PTM', '3PTA'),
        ('FT', 'FTM', 'FTA'),
    ):
        df[[made, attempted]] = df[source].str.split('-', expand=True).astype(float)

    # Split 'A TO BLK' column into 3 separate columns
    df[['A', 'TO', 'BLK']] = df['A TO BLK'].str.split(' ', expand=True).astype(float)

    # Extract the minutes from 'GS MIN': drop the '*' marker (presumably the
    # "game started" flag - confirm against the PDF) and any spaces.
    # regex=False makes the literal match explicit on every pandas version.
    df['MIN'] = (
        df['GS MIN']
        .str.replace('*', '', regex=False)
        .str.replace(' ', '', regex=False)
        .astype(float)
    )

    # Split '# Player' into the jersey number and the name
    df[['#', 'Player']] = df['# Player'].str.split(' ', n=1, expand=True)
    # Turn "Last,First" into "First Last"; the final strip removes the leading
    # space that would otherwise remain when the source reads "Last, First"
    df['Player'] = df['Player'].str.split(',').str[::-1].str.join(' ').str.strip()

    # Convert rest of relevant columns to the correct type
    for column in ('PTS', 'REB', 'STL', 'PF'):
        df[column] = df[column].astype(float)

    # Calculate Advanced Statistics
    # eFG% (Effective Field Goal Percentage): (FGM + 0.5*3PTM) / FGA.
    # 0/0 (no attempts) produces NaN, which is reported as 0.0.
    df['eFG%'] = ((df['FGM'] + (0.5 * df['3PTM'])) / df['FGA']).fillna(0.0).round(2)

    # TS% (True Shooting Percentage): PTS / (2*(FGA + 0.44*FTA)), NaN -> 0.0
    df['TS%'] = (df['PTS'] / (2 * (df['FGA'] + (0.44 * df['FTA'])))).fillna(0.0).round(2)

    # Assist-to-Turnover Ratio: reported as 0 when no turnovers were committed
    df['A/TO'] = np.where(df['TO'] == 0, 0, df['A'] / df['TO'])
    df['A/TO'] = df['A/TO'].round(2)

    # Usage Rate: (FGA + FTA + A + TO) scaled to a full game of 40 minutes.
    # Guarded like 'Box +/-' so a player with 0 recorded minutes gets 0
    # instead of NaN/inf from a division by zero.
    game_minutes = 40
    plays_involved = df['FGA'] + df['FTA'] + df['A'] + df['TO']
    df['Usage Rate'] = np.where(df['MIN'] == 0, 0, plays_involved / (df['MIN'] / game_minutes))
    df['Usage Rate'] = df['Usage Rate'].round(2)

    # Box +/-: per-minute difference between positive box-score events
    # (PTS, REB, A, STL, BLK) and FGA, FTA, TO, PF; 0 when no minutes recorded
    positives = df['PTS'] + df['REB'] + df['A'] + df['STL'] + df['BLK']
    negatives = df['FGA'] + df['FTA'] + df['TO'] + df['PF']
    df['Box +/-'] = np.where(df['MIN'] == 0, 0, (positives - negatives) / df['MIN'])
    df['Box +/-'] = df['Box +/-'].round(2)

    # Keep only the cleaned columns, in a specific order. Selecting by name also
    # discards the raw combined columns, so no separate drop step is needed.
    df = df[
        ['Player', 'MIN', 'PTS', 'FGM', 'FGA', '3PTM', '3PTA', 'FTM',
         'FTA', 'A', 'REB', 'ORB', 'DRB', 'TO', 'BLK', 'STL', 'PF', 'eFG%', 'TS%', 'A/TO', 'Usage Rate', 'Box +/-']]

    return df

# Bundle the parameters describing one box-score file
def generate_files_info(input_path, output_path, team_name, opp_team_name, date, team_range, opp_team_range):
    """Print the output path and pack the per-game settings into a dict.

    The returned dict maps each parameter name to the value passed in, in the
    same order as the signature: 'input_path', 'output_path', 'team_name',
    'opp_team_name', 'date', 'team_range', 'opp_team_range'.
    """
    print(f"Output path: {output_path}")

    field_names = (
        'input_path',
        'output_path',
        'team_name',
        'opp_team_name',
        'date',
        'team_range',
        'opp_team_range',
    )
    field_values = (
        input_path,
        output_path,
        team_name,
        opp_team_name,
        date,
        team_range,
        opp_team_range,
    )
    return dict(zip(field_names, field_values))


def process_files(files_info):
    """Parse every listed box-score PDF and combine the cleaned player stats.

    Parameters
    ----------
    files_info : iterable of dict
        One entry per game, shaped like the dicts built by
        ``generate_files_info``. The keys read here are 'input_path',
        'team_name', 'opp_team_name', 'date' (a '%m-%d-%Y' string),
        'team_range' and 'opp_team_range' (positional row selections).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(all_playerStats, all_oppPlayerStats)``: the cleaned rows of the
        team and of its opponents across all games, each with added 'Team',
        'Opponent' and 'Date' (datetime) columns. Row labels are those of the
        source tables, so they can repeat across games. Two empty DataFrames
        are returned when ``files_info`` is empty.
    """
    # Collect the per-game frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows on
    # every iteration and starts from an empty frame.
    team_frames = []
    opp_frames = []

    for file_info in files_info:
        team_name = file_info['team_name']
        opp_team_name = file_info['opp_team_name']
        date = file_info['date']

        tables = read_pdf(file_info['input_path'], pages="all", multiple_tables=True)

        # The player rows of both teams are taken from the second extracted table
        box_score = tables[1]

        # Process the stats for each side
        playerStats = process_stats(box_score.iloc[file_info['team_range']])
        oppPlayerStats = process_stats(box_score.iloc[file_info['opp_team_range']])

        # Assign additional columns
        playerStats = playerStats.assign(Team=team_name, Opponent=opp_team_name, Date=date)
        oppPlayerStats = oppPlayerStats.assign(Team=opp_team_name, Opponent=team_name, Date=date)

        playerStats['Date'] = pd.to_datetime(playerStats['Date'], format='%m-%d-%Y')
        oppPlayerStats['Date'] = pd.to_datetime(oppPlayerStats['Date'], format='%m-%d-%Y')

        team_frames.append(playerStats)
        opp_frames.append(oppPlayerStats)

    # pd.concat raises on an empty list, so fall back to empty frames
    all_playerStats = pd.concat(team_frames) if team_frames else pd.DataFrame()
    all_oppPlayerStats = pd.concat(opp_frames) if opp_frames else pd.DataFrame()

    return all_playerStats, all_oppPlayerStats

In [42]:
# One entry per game: source PDF, output folder, team names, game date and the
# row positions of each team inside the extracted table.
files_info = [
    generate_files_info(
        "../data/men/raw/CED-11-04-23.pdf", "../data/men/clean/23-24/",
        'UMSL', 'Cedarville', '11-04-2023', range(20, 29), range(0, 10),
    ),
    generate_files_info(
        "../data/men/raw/BENT-11-05-23.pdf", "../data/men/clean/23-24/",
        'UMSL', 'Bentley', '11-05-2023', range(0, 10), range(20, 28),
    ),
    generate_files_info(
        "../data/men/raw/WINO-11-16-23.pdf", "../data/men/clean/23-24/",
        'UMSL', 'Winona St.', '11-16-2023', range(0, 9), range(19, 30),
    ),
    # Add more files here...
]

all_playerStats, all_oppPlayerStats = process_files(files_info)

# Export DataFrames
# Both CSVs go to the output folder of the first entry (every entry in the list
# above uses the same folder).
output_dir = files_info[0]['output_path']
# to_csv does not create missing folders, so make sure the target exists first
os.makedirs(output_dir, exist_ok=True)
all_playerStats.to_csv(os.path.join(output_dir, 'player_stats.csv'), index=False)
all_oppPlayerStats.to_csv(os.path.join(output_dir, 'opp_player_stats.csv'), index=False)

Output path: ../data/men/clean/23-24/
Output path: ../data/men/clean/23-24/
Output path: ../data/men/clean/23-24/


Got stderr: Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 01, 2024 9:40:47 PM or

In [38]:
# Preview the first rows rather than rendering the whole combined frame
all_playerStats.head(10)

Unnamed: 0,Player,MIN,PTS,FGM,FGA,3PTM,3PTA,FTM,FTA,A,...,STL,PF,eFG%,TS%,A/TO,Usage Rate,Box +/-,Team,Opponent,Date
20,Matt Enright,36.0,22.0,9.0,15.0,0.0,5.0,4.0,5.0,2.0,...,2.0,4.0,0.6,0.64,2.0,25.56,0.14,UMSL,Cedarville,11-04-2023
21,Janeir Harris,38.0,14.0,5.0,11.0,0.0,3.0,4.0,5.0,3.0,...,1.0,2.0,0.45,0.53,3.0,21.05,0.13,UMSL,Cedarville,11-04-2023
22,Mayson Quartlebaum,24.0,13.0,6.0,10.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.65,0.65,0.0,18.33,0.29,UMSL,Cedarville,11-04-2023
23,Emanuel Prospere II,37.0,9.0,4.0,7.0,1.0,2.0,0.0,2.0,4.0,...,2.0,2.0,0.64,0.57,2.0,16.22,0.16,UMSL,Cedarville,11-04-2023
24,Terrell Kabala,23.0,6.0,2.0,6.0,2.0,4.0,0.0,0.0,2.0,...,1.0,3.0,0.5,0.5,2.0,15.65,0.04,UMSL,Cedarville,11-04-2023
25,Troy Glover II,16.0,6.0,2.0,3.0,0.0,0.0,2.0,4.0,0.0,...,1.0,2.0,0.67,0.63,0.0,20.0,0.25,UMSL,Cedarville,11-04-2023
26,Kris O'Neal II,11.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,3.0,1.0,0.0,0.0,0.33,18.18,0.09,UMSL,Cedarville,11-04-2023
27,Christian Meeks,8.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,15.0,-0.12,UMSL,Cedarville,11-04-2023
28,Sam Bledsoe,7.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,17.14,-0.43,UMSL,Cedarville,11-04-2023
0,Emanuel Prospere II,32.0,18.0,7.0,13.0,2.0,3.0,2.0,3.0,2.0,...,1.0,3.0,0.62,0.63,0.5,27.5,0.09,UMSL,Bentley,11-05-2023


In [25]:
# Preview the first rows rather than rendering the whole combined frame
all_oppPlayerStats.head(10)

Unnamed: 0,Player,MIN,PTS,FGM,FGA,3PTM,3PTA,FTM,FTA,A,...,STL,PF,eFG%,TS%,A/TO,Usage Rate,Box +/-,Team,Opponent,Date
0,Jayvon Maughmer,33.0,31.0,11.0,18.0,3.0,5.0,6.0,6.0,0.0,...,0.0,2.0,0.69,0.75,0.0,31.52,0.33,Cedarville,UMSL,11-04-2023
1,Chris Rogers,30.0,14.0,5.0,8.0,2.0,5.0,2.0,2.0,3.0,...,0.0,2.0,0.75,0.79,3.0,18.67,0.13,Cedarville,UMSL,11-04-2023
2,Grant Whisman,31.0,11.0,4.0,9.0,3.0,7.0,0.0,0.0,1.0,...,0.0,3.0,0.61,0.61,0.0,12.9,0.06,Cedarville,UMSL,11-04-2023
3,Jacob Drees,24.0,4.0,2.0,5.0,0.0,1.0,0.0,0.0,1.0,...,0.0,2.0,0.4,0.4,0.25,16.67,0.21,Cedarville,UMSL,11-04-2023
4,Timothy Davis,6.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,26.67,-1.17,Cedarville,UMSL,11-04-2023
5,Kyle Thomas,29.0,9.0,3.0,7.0,3.0,7.0,0.0,0.0,2.0,...,1.0,0.0,0.64,0.64,2.0,13.79,0.28,Cedarville,UMSL,11-04-2023
6,Anthony Ruffolo,16.0,4.0,2.0,4.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.5,0.5,1.0,15.0,0.12,Cedarville,UMSL,11-04-2023
7,Tymoteusz Pszczola,14.0,2.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.25,0.25,0.0,14.29,-0.07,Cedarville,UMSL,11-04-2023
8,Ethan Sellars,16.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,2.0,...,0.0,1.0,0.0,0.0,1.0,15.0,-0.06,Cedarville,UMSL,11-04-2023
9,David Okpara,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,Cedarville,UMSL,11-04-2023
