In [None]:
pip install -r ../requirements.txt

In [4]:
import pandas as pd
from tabula import read_pdf

In [None]:
# Read every table found in the box-score PDF into a list of DataFrames
pdf_path = "../data/men/raw/Cedarvile-11-04-23.pdf"
dfs = read_pdf(pdf_path, pages="all", multiple_tables=True)

# Bind the first three extracted tables to their own names
df0, df1, df2 = dfs[0], dfs[1], dfs[2]

In [50]:
# Clean the Score By Period table
def process_score(df):
    """Reduce the raw 'Score By Period' table to its two team rows.

    Keeps the rows at positions 0 and 2, discards the first column, and
    renames the extractor's placeholder headers ('Unnamed: N') to readable
    labels. Returns a new DataFrame; the original row labels are preserved.
    """
    readable_headers = {
        'Unnamed: 0': 'Team',
        'Unnamed: 1': '1st Half',
        'Unnamed: 2': '2nd Half',
        'Unnamed: 3': 'Final',
    }
    # Purely positional selection: rows 0 and 2, every column after the first
    team_rows = df.iloc[[0, 2], 1:]
    return team_rows.rename(columns=readable_headers)



# Clean the Player Stats tables for both teams
def process_stats(df):
    """Turn a raw slice of the player-stats table into a tidy numeric table.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the extracted box score. The function reads the columns
        '# Player', 'GS MIN', 'FG', '3PT', 'FT ORB-DRB', 'REB', 'A TO BLK',
        'STL', 'PF' and 'PTS'. Several of them hold more than one stat fused
        into a single string (e.g. 'made-attempted', or values separated by
        a space) and are split apart here.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the columns
        Player, MIN, PTS, FGM, FGA, 3PTM, 3PTA, FTM, FTA, A, REB, ORB, DRB,
        TO, BLK, STL, PF, eFG%, TS%. All stats are floats; eFG% and TS% are
        rounded to two decimals and are NaN where the division is 0/0.
    """
    # Work on a copy so the caller's frame (typically a slice) is left untouched
    df = df.copy()

    # Remove columns (not rows) that contain only NaN values
    df = df.dropna(axis=1, how='all')

    # Split 'FT ORB-DRB' column into separate columns
    df[['FT', 'ORB-DRB']] = df['FT ORB-DRB'].str.split(' ', expand=True)

    # Split FG, 3PT, and FT columns into made and attempted
    df[['FGM', 'FGA']] = df['FG'].str.split('-', expand=True).astype(float)
    df[['3PTM', '3PTA']] = df['3PT'].str.split('-', expand=True).astype(float)
    df[['FTM', 'FTA']] = df['FT'].str.split('-', expand=True).astype(float)

    # Split 'A TO BLK' column into separate columns
    df[['A', 'TO', 'BLK']] = df['A TO BLK'].str.split(' ', expand=True).astype(float)

    # Extract minutes from 'GS MIN': strip the '*' marker and any spaces.
    # regex=False is explicit because '*' on its own is not a valid regular
    # expression and the default of `regex` differs between pandas versions.
    df['MIN'] = (
        df['GS MIN']
        .str.replace('*', '', regex=False)
        .str.replace(' ', '', regex=False)
        .astype(float)
    )

    # Split 'ORB-DRB' column into separate columns
    df[['ORB', 'DRB']] = df['ORB-DRB'].str.split('-', expand=True).astype(float)

    # Split '# Player' into the leading number and the name
    df[['#', 'Player']] = df['# Player'].str.split(' ', n=1, expand=True)

    # Reorder 'Last,First' into 'First Last'; collapse/trim whitespace so a
    # source value such as 'Last, First' does not yield a leading space
    df['Player'] = (
        df['Player']
        .str.split(',')
        .str[::-1]
        .str.join(' ')
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # Convert rest of columns to float type
    df = df.astype({'PTS': float, 'REB': float, 'STL': float, 'PF': float})

    # Calculate eFG% and TS%
    # eFG% (Effective Field Goal Percentage): Measures shooting efficiency, taking into account 3-pointers. Formula: (FGM + 0.5*3PM) / FGA
    # TS% (True Shooting Percentage): Measures shooting efficiency, taking into account 3-pointers and free throws. Formula: PTS / (2*(FGA + 0.44*FTA))
    df['eFG%'] = ((df['FGM'] + (0.5 * df['3PTM'])) / df['FGA']).round(2)
    df['TS%'] = (df['PTS'] / (2 * (df['FGA'] + (0.44 * df['FTA'])))).round(2)

    # Keep only the cleaned columns, in a fixed order; this selection also
    # discards the raw/fused source columns and the helper columns made above
    df = df[
        ['Player', 'MIN', 'PTS', 'FGM', 'FGA', '3PTM', '3PTA', 'FTM',
         'FTA', 'A', 'REB', 'ORB', 'DRB', 'TO', 'BLK', 'STL', 'PF', 'eFG%', 'TS%']]

    return df


# Run the cleaners on the extracted tables. The row ranges are hard-coded for
# this PDF's layout: rows 0-9 and rows 20-28 of df1 are cleaned as two
# separate player tables.
score = process_score(df0)
oppPlayerStats = process_stats(df1[0:10])
playerStats = process_stats(df1[20:29])


In [51]:
# Display the cleaned score-by-period table
score

Unnamed: 0,Team,1st Half,2nd Half,Final
0,Cedarville,40.0,35.0,75.0
2,Mo.-St. Louis,27.0,43.0,70.0


In [52]:
# Display the cleaned player stats built from rows 20-28 of df1
playerStats

Unnamed: 0,Player,MIN,PTS,FGM,FGA,3PTM,3PTA,FTM,FTA,A,REB,ORB,DRB,TO,BLK,STL,PF,eFG%,TS%
20,Matt Enright,36.0,22.0,9.0,15.0,0.0,5.0,4.0,5.0,2.0,4.0,1.0,3.0,1.0,0.0,2.0,4.0,0.6,0.64
21,Janeir Harris,38.0,14.0,5.0,11.0,0.0,3.0,4.0,5.0,3.0,4.0,0.0,4.0,1.0,2.0,1.0,2.0,0.45,0.53
22,Mayson Quartlebaum,24.0,13.0,6.0,10.0,1.0,1.0,0.0,0.0,1.0,4.0,1.0,3.0,0.0,0.0,0.0,1.0,0.65,0.65
23,Emanuel Prospere II,37.0,9.0,4.0,7.0,1.0,2.0,0.0,2.0,4.0,4.0,1.0,3.0,2.0,0.0,2.0,2.0,0.64,0.57
24,Terrell Kabala,23.0,6.0,2.0,6.0,2.0,4.0,0.0,0.0,2.0,2.0,0.0,2.0,1.0,0.0,1.0,3.0,0.5,0.5
25,Troy Glover II,16.0,6.0,2.0,3.0,0.0,0.0,2.0,4.0,0.0,6.0,0.0,6.0,1.0,1.0,1.0,2.0,0.67,0.63
26,Kris O'Neal II,11.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,2.0,3.0,0.0,3.0,1.0,0.0,0.0
27,Christian Meeks,8.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
28,Sam Bledsoe,7.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Display the cleaned player stats built from rows 0-9 of df1
oppPlayerStats

Unnamed: 0,Player,MIN,PTS,FGM,FGA,3PTM,3PTA,FTM,FTA,A,REB,ORB,DRB,TO,BLK,STL,PF,eFG%,TS%
0,Jayvon Maughmer,33.0,31.0,11.0,18.0,3.0,5.0,6.0,6.0,0.0,7.0,1.0,6.0,2.0,1.0,0.0,2.0,0.69,0.75
1,Chris Rogers,30.0,14.0,5.0,8.0,2.0,5.0,2.0,2.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.75,0.79
2,Grant Whisman,31.0,11.0,4.0,9.0,3.0,7.0,0.0,0.0,1.0,2.0,0.0,2.0,0.0,0.0,0.0,3.0,0.61,0.61
3,Jacob Drees,24.0,4.0,2.0,5.0,0.0,1.0,0.0,0.0,1.0,11.0,1.0,10.0,4.0,0.0,0.0,2.0,0.4,0.4
4,Timothy Davis,6.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0
5,Kyle Thomas,29.0,9.0,3.0,7.0,3.0,7.0,0.0,0.0,2.0,4.0,1.0,3.0,1.0,0.0,1.0,0.0,0.64,0.64
6,Anthony Ruffolo,16.0,4.0,2.0,4.0,0.0,1.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,0.5,0.5
7,Tymoteusz Pszczola,14.0,2.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,0.25,0.25
8,Ethan Sellars,16.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,2.0,2.0,1.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0
9,David Okpara,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
