In [None]:
pip install tabula-py
import pandas as pd
from tabula import read_pdf

In [None]:
# Read every table tabula finds in the PDF; `dfs` is a list of DataFrames
pdf_path = "./data/Cedarvile-11-04-23.pdf"
dfs = read_pdf(
    pdf_path,
    pages="all",
    multiple_tables=True,
)

# Give the first three extracted tables their own names
df0, df1, df2 = dfs[0], dfs[1], dfs[2]

In [48]:
# Clean the Score By Period table
def process_score(df):
    """Return the two team rows of the score table with readable column names.

    Keeps the rows at positions 0 and 2, discards the first column, and
    renames the 'Unnamed: N' columns to 'Team', '1st Half', '2nd Half' and
    'Final'. The input frame is not modified.
    """
    readable_names = {
        'Unnamed: 0': 'Team',
        'Unnamed: 1': '1st Half',
        'Unnamed: 2': '2nd Half',
        'Unnamed: 3': 'Final',
    }
    # Rows 0 and 2, every column except the first
    team_rows = df.iloc[[0, 2], 1:]
    return team_rows.rename(columns=readable_names)



# Clean the Player Stats tables for both teams
def process_stats(df):
    """Tidy one team's rows of the player-stats table extracted from the PDF.

    The extraction fuses several stats into single text columns
    ('FT ORB-DRB', 'A TO BLK', 'GS MIN'). This function splits them apart,
    separates the "made-attempted" pairs in 'FG', '3PT' and 'FT', and converts
    every derived column to int so the result can be used for arithmetic.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single team. Must contain the text columns 'FG', '3PT',
        'FT ORB-DRB', 'A TO BLK' and 'GS MIN'. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the fused/paired columns are replaced by the
        integer columns 'FGM', 'FGA', '3PTM', '3PTA', 'FTM', 'FTA', 'A', 'TO',
        'BLK', 'MIN', 'ORB' and 'DRB' (appended in that order); all other
        columns are passed through unchanged.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a piece of a split value cannot be parsed as an integer, or a
        fused value does not split into the expected number of parts.
    """
    # Work on a copy so the caller's frame (often a slice) is left untouched
    df = df.copy()

    # Remove columns (axis=1) that contain only NaN values
    df = df.dropna(axis=1, how='all')

    # Split the fused 'FT ORB-DRB' text into its free-throw and rebound parts
    df[['FT', 'ORB-DRB']] = df['FT ORB-DRB'].str.split(' ', expand=True)

    # Split FG, 3PT, and FT "made-attempted" pairs into integer columns
    df[['FGM', 'FGA']] = df['FG'].str.split('-', expand=True).astype(int)
    df[['3PTM', '3PTA']] = df['3PT'].str.split('-', expand=True).astype(int)
    df[['FTM', 'FTA']] = df['FT'].str.split('-', expand=True).astype(int)

    # Split the fused 'A TO BLK' text into three integer columns
    df[['A', 'TO', 'BLK']] = df['A TO BLK'].str.split(' ', expand=True).astype(int)

    # Keep only the minutes from 'GS MIN': strip the '*' marker (presumably the
    # games-started flag -- confirm) and any spaces. regex=False makes '*' a
    # literal on every pandas version; the default for `regex` changed in 2.0.
    df['MIN'] = (
        df['GS MIN']
        .str.replace('*', '', regex=False)
        .str.replace(' ', '', regex=False)
        .astype(int)
    )

    # Split the 'ORB-DRB' pair into two integer columns
    df[['ORB', 'DRB']] = df['ORB-DRB'].str.split('-', expand=True).astype(int)

    # Drop the original fused/paired columns now that they have been expanded
    df = df.drop(['FT ORB-DRB', 'A TO BLK', 'ORB-DRB', 'FG', '3PT', 'FT', 'GS MIN'], axis=1)

    return df


# Build the cleaned tables: the two row ranges of df1 each go through
# process_stats, and df0 goes through process_score.
oppPlayerStats = process_stats(df1[:10])
playerStats = process_stats(df1[20:29])
score = process_score(df0)


In [49]:
# Display the cleaned score table produced by process_score
score

Unnamed: 0,Team,1st Half,2nd Half,Final
0,Cedarville,40.0,35.0,75.0
2,Mo.-St. Louis,27.0,43.0,70.0


In [46]:
# Display the cleaned player stats built from rows 20-28 of df1
playerStats

Unnamed: 0,# Player,REB,PF,STL,PTS,FGM,FGA,3PTM,3PTA,FTM,FTA,A,TO,BLK,MIN,ORB,DRB
20,"03 Enright,Matt",4,4,2,22,9,15,0,5,4,5,2,1,0,36,1,3
21,"10 Harris,Janeir",4,2,1,14,5,11,0,3,4,5,3,1,2,38,0,4
22,"15 Quartlebaum,Mayson",4,1,0,13,6,10,1,1,0,0,1,0,0,24,1,3
23,"12 Prospere II,Emanuel",4,2,2,9,4,7,1,2,0,2,4,2,0,37,1,3
24,"21 Kabala,Terrell",2,3,1,6,2,6,2,4,0,0,2,1,0,23,0,2
25,"35 Glover II,Troy",6,2,1,6,2,3,0,0,2,4,0,1,1,16,0,6
26,"05 O'Neal II,Kris",2,1,3,0,0,1,0,0,0,0,1,3,0,11,0,2
27,"02 Meeks,Christian",2,0,0,0,0,2,0,1,0,0,0,1,0,8,2,0
28,"04 Bledsoe,Sam",0,0,0,0,0,2,0,0,0,0,0,1,0,7,0,0


In [47]:
# Display the cleaned player stats built from rows 0-9 of df1
oppPlayerStats

Unnamed: 0,# Player,REB,PF,STL,PTS,FGM,FGA,3PTM,3PTA,FTM,FTA,A,TO,BLK,MIN,ORB,DRB
0,"32 Maughmer,Jayvon",7,2,0,31,11,18,3,5,6,6,0,2,1,33,1,6
1,"00 Rogers,Chris",0,2,0,14,5,8,2,5,2,2,3,1,0,30,0,0
2,"05 Whisman,Grant",2,3,0,11,4,9,3,7,0,0,1,0,0,31,0,2
3,"01 Drees,Jacob",11,2,0,4,2,5,0,1,0,0,1,4,0,24,1,10
4,"11 Davis,Timothy",0,3,0,0,0,3,0,3,0,0,0,1,0,6,0,0
5,"03 Thomas,Kyle",4,0,1,9,3,7,3,7,0,0,2,1,0,29,1,3
6,"02 Ruffolo,Anthony",2,1,1,4,2,4,0,1,0,0,1,1,0,16,1,1
7,"22 Pszczola,Tymoteusz",2,1,1,2,1,4,0,0,0,0,0,1,0,14,1,1
8,"12 Sellars,Ethan",2,1,0,0,0,2,0,1,0,0,2,2,0,16,1,1
9,"10 Okpara,David",0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
