In [None]:
pip install -r ../requirements.txt

In [27]:
import pandas as pd
from tabula import read_pdf
import numpy as np
import os

In [28]:
# Clean the Player Stats tables for both teams
def process_stats(df):
    """Clean one team's raw box-score rows and derive shooting/efficiency stats.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single team taken from the extracted box-score table. The
        columns '# Player', 'GS MIN', 'FG', '3PT', 'FT ORB-DRB', 'A TO BLK',
        'PTS', 'REB', 'STL' and 'PF' are read. The combined columns are split
        on ' ' and '-' (e.g. '9-15', '4-5 1-3', '2 1 0').

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the columns, in order:
        Player, MIN, PTS, FGM, FGA, FG%, 3PTM, 3PTA, 3PT%, FTM, FTA, FT%, A,
        TO, A/TO, REB, ORB, DRB, BLK, STL, PF, eFG%, TS%, Usage Rate, Box +/-.
        Counting stats are floats; ratios are rounded to 2 decimals.

    Raises
    ------
    KeyError
        If one of the required columns is missing (including when it was
        dropped for being entirely NaN).
    ValueError
        If a value that should be numeric cannot be converted to float.
    """
    # Work on a copy so the frame (or slice) handed in by the caller is untouched
    df = df.copy()

    # Remove columns that contain only NaN values (axis=1 targets columns)
    df = df.dropna(axis=1, how='all')

    # 'FT ORB-DRB' packs two fields separated by a space: free throws and rebounds
    df[['FT', 'ORB-DRB']] = df['FT ORB-DRB'].str.split(' ', expand=True)

    # Offensive / defensive rebounds
    df[['ORB', 'DRB']] = df['ORB-DRB'].str.split('-', expand=True).astype(float)

    # Made-attempted pairs for field goals, three pointers and free throws
    df[['FGM', 'FGA']] = df['FG'].str.split('-', expand=True).astype(float)
    df[['3PTM', '3PTA']] = df['3PT'].str.split('-', expand=True).astype(float)
    df[['FTM', 'FTA']] = df['FT'].str.split('-', expand=True).astype(float)

    # Assists, turnovers and blocks share one space-separated column
    df[['A', 'TO', 'BLK']] = df['A TO BLK'].str.split(' ', expand=True).astype(float)

    # Keep only the minutes from 'GS MIN': strip the '*' marker and any spaces.
    # regex=False makes '*' a literal on every pandas version ('*' alone is not
    # a valid regular expression).
    df['MIN'] = (
        df['GS MIN']
        .str.replace('*', '', regex=False)
        .str.replace(' ', '', regex=False)
        .astype(float)
    )

    # '# Player' is "<jersey number> <Last,First>": split off the number once,
    # then flip the comma-separated name parts into "First Last". The final
    # strip removes the leading blank left behind by "Last, First" inputs.
    df[['#', 'Player']] = df['# Player'].str.split(' ', n=1, expand=True)
    df['Player'] = df['Player'].str.split(',').str[::-1].str.join(' ').str.strip()

    # Remaining counting stats arrive as text; convert them to floats
    for column in ('PTS', 'REB', 'STL', 'PF'):
        df[column] = df[column].astype(float)

    # Basic shooting percentages. 0 makes / 0 attempts yields NaN, reported as 0.0.
    # FG% (Field Goal Percentage): general shooting efficiency
    df['FG%'] = (df['FGM'] / df['FGA']).fillna(0.0).round(2)

    # 3PT% (3 Point Percentage): three-point shooting efficiency
    df['3PT%'] = (df['3PTM'] / df['3PTA']).fillna(0.0).round(2)

    # FT% (Free Throw Percentage): free-throw shooting efficiency
    df['FT%'] = (df['FTM'] / df['FTA']).fillna(0.0).round(2)

    # Advanced statistics
    # eFG% (Effective Field Goal Percentage): (FGM + 0.5 * 3PTM) / FGA
    df['eFG%'] = ((df['FGM'] + (0.5 * df['3PTM'])) / df['FGA']).fillna(0.0).round(2)

    # TS% (True Shooting Percentage): PTS / (2 * (FGA + 0.44 * FTA))
    df['TS%'] = (df['PTS'] / (2 * (df['FGA'] + (0.44 * df['FTA'])))).fillna(0.0).round(2)

    # Assist-to-turnover ratio; reported as 0 when no turnovers were committed
    df['A/TO'] = np.where(df['TO'] == 0, 0, df['A'] / df['TO'])
    df['A/TO'] = df['A/TO'].round(2)

    # Usage Rate: (FGA + FTA + A + TO) scaled to a 40-minute game. Rows with 0
    # minutes are reported as 0 instead of inf/NaN, matching 'Box +/-' below.
    involvement = df['FGA'] + df['FTA'] + df['A'] + df['TO']
    df['Usage Rate'] = np.where(df['MIN'] == 0, 0, involvement / (df['MIN'] / 40))
    df['Usage Rate'] = df['Usage Rate'].round(2)

    # Box +/-: (PTS + REB + A + STL + BLK) minus (FGA + FTA + TO + PF), per minute
    # played; reported as 0 when no minutes were recorded
    net_events = (
        (df['PTS'] + df['REB'] + df['A'] + df['STL'] + df['BLK'])
        - (df['FGA'] + df['FTA'] + df['TO'] + df['PF'])
    )
    df['Box +/-'] = np.where(df['MIN'] == 0, 0, net_events / df['MIN'])
    df['Box +/-'] = df['Box +/-'].round(2)

    # Keep only the cleaned columns, in a fixed order. Selecting them also drops
    # the raw combined columns and any other leftovers from the extraction.
    df = df[
        ['Player', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PTM', '3PTA', '3PT%', 'FTM',
         'FTA', 'FT%', 'A', 'TO', 'A/TO', 'REB', 'ORB', 'DRB', 'BLK', 'STL', 'PF', 'eFG%', 'TS%', 'Usage Rate', 'Box +/-']]

    return df

# Add parameters for each file
def generate_files_info(input_path, output_path, team_name, opp_team_name, date, team_range, opp_team_range):
    """Bundle the settings for one box-score file into a dictionary.

    The output path is echoed to stdout, then every argument is returned under
    a key of the same name, in the order of the parameters.
    """
    file_info = dict(
        input_path=input_path,
        output_path=output_path,
        team_name=team_name,
        opp_team_name=opp_team_name,
        date=date,
        team_range=team_range,
        opp_team_range=opp_team_range,
    )
    print(f"Output path: {file_info['output_path']}")
    return file_info


def process_files(files_info):
    """Extract and clean the player stats of every configured box-score PDF.

    Parameters
    ----------
    files_info : iterable of dict
        One dictionary per PDF with the keys 'input_path', 'team_name',
        'opp_team_name', 'date' (string in '%m-%d-%Y' format), 'team_range'
        and 'opp_team_range'. The two ranges are positional row selections
        (used with ``.iloc``) into the second table extracted from the PDF.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The cleaned rows of the team and of its opponents, each with the extra
        columns 'Team', 'Opponent' and 'Date' (datetime). Both frames are empty
        when ``files_info`` is empty.

    Raises
    ------
    IndexError
        If fewer than two tables are extracted from a PDF.
    """
    team_frames = []
    opp_frames = []

    for file_info in files_info:
        input_path = file_info['input_path']
        team_name = file_info['team_name']
        opp_team_name = file_info['opp_team_name']
        date = file_info['date']
        team_range = file_info['team_range']
        opp_team_range = file_info['opp_team_range']

        tables = read_pdf(input_path, pages="all", multiple_tables=True)

        # The player rows are read from the second extracted table; the first
        # table is not used here.
        if len(tables) < 2:
            raise IndexError(
                f"Expected at least 2 tables in {input_path}, found {len(tables)}"
            )
        box_score = tables[1]

        # Clean each team's slice of the table
        playerStats = process_stats(box_score.iloc[team_range])
        oppPlayerStats = process_stats(box_score.iloc[opp_team_range])

        # Tag every row with the game it belongs to
        playerStats = playerStats.assign(Team=team_name, Opponent=opp_team_name, Date=date)
        oppPlayerStats = oppPlayerStats.assign(Team=opp_team_name, Opponent=team_name, Date=date)

        playerStats['Date'] = pd.to_datetime(playerStats['Date'], format='%m-%d-%Y')
        oppPlayerStats['Date'] = pd.to_datetime(oppPlayerStats['Date'], format='%m-%d-%Y')

        team_frames.append(playerStats)
        opp_frames.append(oppPlayerStats)

    # Concatenate once at the end instead of re-copying the accumulated frame on
    # every iteration; pd.concat rejects an empty list, so keep the empty-frame
    # result for empty input.
    all_playerStats = pd.concat(team_frames) if team_frames else pd.DataFrame()
    all_oppPlayerStats = pd.concat(opp_frames) if opp_frames else pd.DataFrame()

    return all_playerStats, all_oppPlayerStats

In [None]:
def read_pdf_as_dataframe(pdf_path):
    """Return what tabula's read_pdf extracts from all pages of `pdf_path` with multiple_tables=True."""
    return read_pdf(pdf_path, pages="all", multiple_tables=True)

# Ad-hoc inspection of a single raw box score: extract its tables so the row
# positions of each team can be looked up by eye.
dfs = read_pdf_as_dataframe("../data/men/raw/MICH-11-18-2023.pdf")
dfs[1]  # display the second extracted table (index 1), the one process_files reads player rows from

In [35]:
# Folders shared by every game entry below
RAW_DIR = "../data/men/raw/"
CLEAN_DIR = "../data/men/clean/23-24/"

# One entry per box-score PDF. Dates use MM-DD-YYYY (parsed with '%m-%d-%Y' in
# process_files). The two ranges are the positional rows of the player table that
# belong to UMSL and to the opponent; they differ from PDF to PDF.
files_info = [
    generate_files_info(RAW_DIR + "CED-11-04-23.pdf", CLEAN_DIR, 'UMSL', 'Cedarville', '11-04-2023', range(20, 29), range(0, 10)),
    generate_files_info(RAW_DIR + "BENT-11-05-23.pdf", CLEAN_DIR, 'UMSL', 'Bentley', '11-05-2023', range(0, 10), range(20, 28)),
    generate_files_info(RAW_DIR + "WINO-11-16-23.pdf", CLEAN_DIR, 'UMSL', 'Winona St.', '11-16-2023', range(0, 9), range(19, 30)),
    generate_files_info(RAW_DIR + "DRURY-2-15-24.pdf", CLEAN_DIR, 'UMSL', 'Drury', '2-15-2024', range(0, 8), range(19, 29)),
]

# TODO: box scores in RAW_DIR that still need an entry above. Each one needs its
# own opponent name, game date and row ranges looked up in the PDF. Apart from
# CENMO, the earlier draft entries for these files all reused the Winona St. values
# ('Winona St.', '11-16-2023', range(0, 9), range(19, 30)) and must not be enabled as-is.
#   CENMO-12-18-23.pdf  (draft: 'Central Missouri', '12-18-2023', range(0, 1), range(19, 20))
#   CENOK-12-19-23.pdf, ILSPR-12-09-23.pdf, ISPRG-2-8-24.pdf,
#   LEWIS-1-20-24.pdf, LEWIS-3-2-24.pdf, MARY-01-03-24.pdf, MARY-1-27-24.pdf,
#   MCKEN-01-13-24.pdf, MCKEN-2-24-24.pdf, MICH-11-18-2023.pdf, MOST-1-25-24.pdf,
#   NWMS-11-22-23.pdf, QUIN-2-10-24.pdf, QUIN-12-07-23.pdf, ROCK-12-02-23.pdf,
#   SWBP-2-17-24.pdf, TRUM-01-06-24.pdf, TRUM-2-3-24.pdf, UINDY-1-18-24.pdf,
#   UINDY-2-29-24.pdf, UIOWA-2-1-24.pdf, UIOWA-3-7-24.pdf, WJ-11-30-23.pdf,
#   WPARK-11-30-23.pdf

all_playerStats, all_oppPlayerStats = process_files(files_info)

Output path: ../data/men/clean/23-24/
Output path: ../data/men/clean/23-24/
Output path: ../data/men/clean/23-24/
Output path: ../data/men/clean/23-24/


Got stderr: Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Aug 14, 2024 2:52:05 PM or

In [32]:
# Show the combined rows for the `team_name` side of every processed game
all_playerStats

Unnamed: 0,Player,MIN,PTS,FGM,FGA,FG%,3PTM,3PTA,3PT%,FTM,...,BLK,STL,PF,eFG%,TS%,Usage Rate,Box +/-,Team,Opponent,Date
20,Matt Enright,36.0,22.0,9.0,15.0,0.6,0.0,5.0,0.0,4.0,...,0.0,2.0,4.0,0.6,0.64,25.56,0.14,UMSL,Cedarville,2023-11-04
21,Janeir Harris,38.0,14.0,5.0,11.0,0.45,0.0,3.0,0.0,4.0,...,2.0,1.0,2.0,0.45,0.53,21.05,0.13,UMSL,Cedarville,2023-11-04
22,Mayson Quartlebaum,24.0,13.0,6.0,10.0,0.6,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.65,0.65,18.33,0.29,UMSL,Cedarville,2023-11-04
23,Emanuel Prospere II,37.0,9.0,4.0,7.0,0.57,1.0,2.0,0.5,0.0,...,0.0,2.0,2.0,0.64,0.57,16.22,0.16,UMSL,Cedarville,2023-11-04
24,Terrell Kabala,23.0,6.0,2.0,6.0,0.33,2.0,4.0,0.5,0.0,...,0.0,1.0,3.0,0.5,0.5,15.65,0.04,UMSL,Cedarville,2023-11-04
25,Troy Glover II,16.0,6.0,2.0,3.0,0.67,0.0,0.0,0.0,2.0,...,1.0,1.0,2.0,0.67,0.63,20.0,0.25,UMSL,Cedarville,2023-11-04
26,Kris O'Neal II,11.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,1.0,0.0,0.0,18.18,0.09,UMSL,Cedarville,2023-11-04
27,Christian Meeks,8.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,15.0,-0.12,UMSL,Cedarville,2023-11-04
28,Sam Bledsoe,7.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,17.14,-0.43,UMSL,Cedarville,2023-11-04
0,Emanuel Prospere II,32.0,18.0,7.0,13.0,0.54,2.0,3.0,0.67,2.0,...,0.0,1.0,3.0,0.62,0.63,27.5,0.09,UMSL,Bentley,2023-11-05


In [33]:
# Show the combined rows for the opponents of every processed game
all_oppPlayerStats

Unnamed: 0,Player,MIN,PTS,FGM,FGA,FG%,3PTM,3PTA,3PT%,FTM,...,BLK,STL,PF,eFG%,TS%,Usage Rate,Box +/-,Team,Opponent,Date
0,Jayvon Maughmer,33.0,31.0,11.0,18.0,0.61,3.0,5.0,0.6,6.0,...,1.0,0.0,2.0,0.69,0.75,31.52,0.33,Cedarville,UMSL,2023-11-04
1,Chris Rogers,30.0,14.0,5.0,8.0,0.62,2.0,5.0,0.4,2.0,...,0.0,0.0,2.0,0.75,0.79,18.67,0.13,Cedarville,UMSL,2023-11-04
2,Grant Whisman,31.0,11.0,4.0,9.0,0.44,3.0,7.0,0.43,0.0,...,0.0,0.0,3.0,0.61,0.61,12.9,0.06,Cedarville,UMSL,2023-11-04
3,Jacob Drees,24.0,4.0,2.0,5.0,0.4,0.0,1.0,0.0,0.0,...,0.0,0.0,2.0,0.4,0.4,16.67,0.21,Cedarville,UMSL,2023-11-04
4,Timothy Davis,6.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,26.67,-1.17,Cedarville,UMSL,2023-11-04
5,Kyle Thomas,29.0,9.0,3.0,7.0,0.43,3.0,7.0,0.43,0.0,...,0.0,1.0,0.0,0.64,0.64,13.79,0.28,Cedarville,UMSL,2023-11-04
6,Anthony Ruffolo,16.0,4.0,2.0,4.0,0.5,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.5,0.5,15.0,0.12,Cedarville,UMSL,2023-11-04
7,Tymoteusz Pszczola,14.0,2.0,1.0,4.0,0.25,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.25,0.25,14.29,-0.07,Cedarville,UMSL,2023-11-04
8,Ethan Sellars,16.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,15.0,-0.06,Cedarville,UMSL,2023-11-04
9,David Okpara,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,Cedarville,UMSL,2023-11-04


In [34]:
# Export DataFrames
# Both CSVs go to the output folder configured on the first entry of files_info.
output_dir = files_info[0]['output_path']
# to_csv does not create missing folders, so make sure the target exists first
os.makedirs(output_dir, exist_ok=True)
# os.path.join yields the same path whether or not output_dir ends with a slash
all_playerStats.to_csv(os.path.join(output_dir, 'player_stats.csv'), index=False)
all_oppPlayerStats.to_csv(os.path.join(output_dir, 'opp_player_stats.csv'), index=False)