In [1]:
import json
import re
import fitz
from datetime import datetime, timezone
import pandas as pd
import os

In [2]:
from pathlib import Path

def find_all_pdfs(root_ordner):
    return [str(p) for p in Path(root_ordner).rglob('*.pdf')]

main_folder = os.path.join("..", "sample_roster", "bbtc_pl_2025")
bbtc_pl_2025_roster = find_all_pdfs(main_folder)

main_folder = os.path.join("..", "sample_roster", "bbtc_pl_2025_matched_played")
bbtc_pl_2025_matched_played5_roster = find_all_pdfs(main_folder)

main_folder = os.path.join("..", "sample_roster", "bbtc_pl_eurobowl_2025")
eurobowl_2025_roster = find_all_pdfs(main_folder)


test_roster_paths = bbtc_pl_2025_roster + bbtc_pl_2025_matched_played5_roster + eurobowl_2025_roster

In [3]:
test_roster_paths[-1].split("\\")

['..', 'sample_roster', 'bbtc_pl_eurobowl_2025', 'EB - Orcs - Akorus.pdf']

In [4]:
# Python
import fitz  # PyMuPDF

# Open the PDF
doc = fitz.open(test_roster_paths[4])

team_data = {}

extraction_step = 'Race'
for page_number, page in enumerate(doc, start=1):
    # Extract text in blocks/spans with details
    blocks = page.get_text("dict")["blocks"]
    
    for block in blocks:
        if "lines" in block:
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"]
                    color = span["color"]  # RGB, float 0-1 representation
                    # print(f"Page {page_number}: '{text}' Color: {color}")
                    
                    if extraction_step == 'Race':
                        if text == 'COACH NAME':
                            extraction_step = 'Coach'
                        if 'Race' in team_data:
                            team_data


In [47]:
# Python
import fitz  # PyMuPDF

def load_roster(roster_path):
    doc = fitz.open(roster_path)

    roster_path_split = roster_path.split("\\")
    pdf_name = roster_path_split[-1]
    
    return {
        "full_path": roster_path,
        "pdf_name": pdf_name,
        "loaded_pdf": doc
    }

def detect_roster_type(loaded_roster):
    doc = loaded_roster["loaded_pdf"]
    for page_number, page in enumerate(doc, start=1):
        # Extract text in blocks/spans with details
        blocks = page.get_text("dict")["blocks"]
        started_summary = False
    
        for block in blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"]
                        if text == "SUMMARY":
                            started_summary = True
                        elif started_summary:
                            print(text)
                            if text == "Skill Points":
                                return "bbtc_pl_2025_matched_played"
                            elif text == "Players cost":
                                return "bbtc_pl_2025"
                            elif "Option:" in text:
                                return "bbtc_pl_eurobowl_2025"

        raise RuntimeError(f"Could not identify roster_type for {loaded_roster['full_path']}")


SUMMARY_STEP_MAPPING = {
    "bbtc_pl_2025_matched_played": [
        'Skill Points',
        None,
        'Secondary skills',
        None,
        'Star players',
        None,
    ],
    "bbtc_pl_2025": [
        'Players cost',
        None,
        'Skills cost',
        None,
        'Inducement cost',
        None,
        'Sideline cost',
        None,
        'Primary skills',
        None,
        'Secondary skills',
        None
    ],
    "bbtc_pl_eurobowl_2025": [
        'Players cost',
        None,
        'Skills cost',
        None,
        'Inducement cost',
        None,
        'Sideline cost',
        None,
        'Primary skills',
        None,
        'Secondary skills',
        None
    ],
}


def process_team_pdf(roster_path):
    # Open the PDF
    loaded_roster = load_roster(roster_path)
    doc = loaded_roster["loaded_pdf"]
    pdf_type = detect_roster_type(loaded_roster)
    team_data = {
        'pdf_name': loaded_roster["pdf_name"],
        'pdf_type': pdf_type
    }

    print(team_data)
    
    extraction_step = 'Race'
    for page_number, page in enumerate(doc, start=1):
        # Extract text in blocks/spans with details
        blocks = page.get_text("dict")["blocks"]

        for block in blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"]
                        color = span["color"]  # RGB, float 0-1 representation
                        # print(f"Page {page_number}: '{text}' Color: {color}")
                        # Adjust extraction step
                        if text == 'SIDELINE':
                            extraction_step = 'Sideline'
                            sideline_ctr = 0
                        elif text == 'INDUCEMENTS':
                            extraction_step = 'Inducements'
                            next_name = None
                        elif text == 'SUMMARY':
                            summary_ctr = 0
                            extraction_step = 'Summary'
                            summary_ctr = -1
    
                        if extraction_step == 'Race':
                            if text == 'COACH NAME':
                                extraction_step = 'Coach'
                                continue
                            if 'Race' in team_data:
                                team_data['Race'] += ' ' + text
                            else:
                                team_data['Race'] = text

                        elif extraction_step == 'Coach':
                            team_data['Coach'] = text
                            extraction_step = 'Team'

                        elif extraction_step == 'Team':
                            if text == 'TEAM NAME':
                                continue
                            elif text == 'SIDELINE':
                                extraction_step = 'Sideline'
                                sideline_ctr = 0
                                
                            elif 'Team' in team_data:
                                team_data['Team'] += ' ' + text
                            else:
                                team_data['Team'] = text

                        elif extraction_step == 'Sideline':
                            sideline_properties = [
                                'Apothecary',
                                'Assistant coaches',
                                'Cheerleaders',
                                'Dedicated fans',
                                'Re-rolls',
                            ]

                            if sideline_ctr >= len(sideline_properties):
                                extraction_step = 'Inducements'
                                continue
                            next_sideline = sideline_properties[sideline_ctr]
                            
                            if text == 'SIDELINE':
                                continue
                            elif text in sideline_properties:
                                if (sideline_ctr == 0) and (text != 'Apothecary'):
                                    sideline_ctr += 1
                                continue
                            else:
                                # print('SAVE', next_sideline, text)
                                team_data[f'Sideline - {next_sideline}'] = text
                                sideline_ctr += 1

                        elif extraction_step == 'Inducements':
                            if text in ['SUMMARY', 'No inducements', 'LEAGUES & SPECIAL']:
                                summary_ctr = 0
                                extraction_step = 'Summary'
                                summary_ctr = -1
                                continue
                            if text == 'INDUCEMENTS':
                                continue
                            if next_name is None:
                                next_name = text
                            else:
                                team_data[f'Inducement - {next_name}'] = text
                                next_name = None

                        elif extraction_step == 'Summary':
                            print(f"SUMMARY {summary_ctr} | {text}")
                            summary_steps = SUMMARY_STEP_MAPPING[pdf_type]
                            if summary_ctr == len(summary_steps):
                                extraction_step = 'Players'
                                player_ctr = -1
                                continue

                            if text == 'SUMMARY':
                                summary_ctr = 0
                                continue
                            elif summary_ctr == -1:
                                continue

                            if pdf_type == "bbtc_pl_eurobowl_2025":
                                if summary_ctr == 0:
                                    team_data[f'Summary - Option'] = text.split(": ")[0]
                                    extraction_step = 'Players'
                                    player_ctr = -1
                                    continue
                                else:
                                    raise NotImplementedError()
                            else:
                                if (summary_ctr % 2) == 1:
                                    team_data[f'Summary - {summary_steps[summary_ctr - 1]}'] = text
                            summary_ctr += 1
    
                        elif extraction_step == 'Players':
                            if text == 'COST':
                                team_data['Players'] = []
                                player_ctr = 1
                                next_player_property = 'Name'
                                current_player = {
                                    'ctr': player_ctr,
                                    'position_name': None,
                                    'primary_1': None,
                                    'primary_2': None,
                                    'secondary_1': None,
                                    'secondary_2': None,
                                    'star': False,
                                }
                                continue
                            elif player_ctr == -1:
                                continue

                            if re.fullmatch(r"\b\d+k\b", text):
                                team_data['Players'].append(current_player)
                                player_ctr += 1
                                current_player = {
                                    'ctr': player_ctr,
                                    'position_name': None,
                                    'primary_1': None,
                                    'primary_2': None,
                                    'secondary_1': None,
                                    'secondary_2': None,
                                    'star': False,
                                }
                                next_player_property = 'Name'

                            elif next_player_property == 'Name':
                                current_player['position_name'] = " ".join(text.split()[1:])
                                next_player_property = 'Skills'

                            elif next_player_property == 'Skills':
                                if color == 681912:
                                    if current_player['primary_1'] is None:
                                        current_player['primary_1'] = text.strip().strip(',')
                                    elif current_player['primary_2'] is None:
                                        current_player['primary_2'] = text.strip().strip(',')
                                    else:
                                        raise RuntimeError(f'Unexpected Skill | Color: {color} - Text: {text}')
                                if color == 4822027:
                                    if current_player['secondary_1'] is None:
                                        current_player['secondary_1'] = text.strip().strip(',')
                                    elif current_player['secondary_2'] is None:
                                        current_player['secondary_2'] = text.strip().strip(',')
                                    else:
                                        raise RuntimeError(f'Unexpected Skill | Color: {color} - Text: {text}')
                            
                            if text == 'Special skill: ':
                                current_player['star'] = True
                                      
    return team_data
                                    
                                
                            
processed_roster = []
for roster_path in test_roster_paths[-1:]:
    team_data = process_team_pdf(roster_path)
    processed_roster.append(team_data)
    # print(json.dumps(team_data, indent=4))

Option: Hiring Legends
{'pdf_name': 'EB - Orcs - Akorus.pdf', 'pdf_type': 'bbtc_pl_eurobowl_2025'}
SUMMARY -1 | RULES
SUMMARY -1 | • Badlands Brawl
SUMMARY -1 | • Brawlin' Brutes
SUMMARY -1 | • Team Captain
SUMMARY -1 | SUMMARY
SUMMARY 0 | Option: Hiring Legends


In [48]:
for roster_path in test_roster_paths:
    loaded_roster = load_roster(roster_path)
    doc = loaded_roster["loaded_pdf"]
    result = detect_roster_type(loaded_roster)
    print(roster_path, "|", result)

Players cost
..\sample_roster\bbtc_pl_2025\High Elfs - Akorus.pdf | bbtc_pl_2025
Players cost
..\sample_roster\bbtc_pl_2025\Khorne - Akorus.pdf | bbtc_pl_2025
Players cost
..\sample_roster\bbtc_pl_2025\Necro - Schlachtenlenker.pdf | bbtc_pl_2025
Players cost
..\sample_roster\bbtc_pl_2025\Nurgle - Akorus.pdf | bbtc_pl_2025
Players cost
..\sample_roster\bbtc_pl_2025\Vamps - Akorus.pdf | bbtc_pl_2025
Skill Points
..\sample_roster\bbtc_pl_2025_matched_played\Zons - Test.pdf | bbtc_pl_2025_matched_played
Option: Hiring Legends
..\sample_roster\bbtc_pl_eurobowl_2025\EB - Orcs - Akorus.pdf | bbtc_pl_eurobowl_2025


In [49]:
processed_roster[0]

{'pdf_name': 'EB - Orcs - Akorus.pdf',
 'pdf_type': 'bbtc_pl_eurobowl_2025',
 'Race': 'Orc',
 'Coach': 'Akorus',
 'Team': 'EB - Orcs Test',
 'Sideline - Apothecary': 'No',
 'Sideline - Assistant coaches': '0',
 'Sideline - Cheerleaders': '0',
 'Sideline - Dedicated fans': '0',
 'Sideline - Re-rolls': '0',
 "Inducement - Blitzer's Best Kegs": '1',
 'Inducement - Weather Mage': '1',
 'Inducement - Dodgy League Rep': '1',
 'Summary - Option': 'Option',
 'Players': [{'ctr': 1,
   'position_name': 'Big Un Blocker',
   'primary_1': 'Block',
   'primary_2': None,
   'secondary_1': None,
   'secondary_2': None,
   'star': False},
  {'ctr': 2,
   'position_name': 'Big Un Blocker',
   'primary_1': None,
   'primary_2': None,
   'secondary_1': None,
   'secondary_2': None,
   'star': False},
  {'ctr': 3,
   'position_name': 'Goblin Lineman',
   'primary_1': None,
   'primary_2': None,
   'secondary_1': None,
   'secondary_2': None,
   'star': False},
  {'ctr': 4,
   'position_name': 'Orc Blitzer'

In [46]:
rows = []
error_rows = []
for team_data in processed_roster:
    if 'Players' in team_data:
        for player in team_data['Players']:
            prefix = f'player_{player["ctr"]}'
            for key, value in player.items():
                if key == 'ctr':
                    continue
                
                team_data[f'{prefix}_{key}'] = value
        
        del team_data['Players']
        rows.append(team_data)
    elif 'player_11_star' in team_data:
        rows.append(team_data)
    else:
        error_rows.append(team_data)

print('Rows', len(rows))
print('Errors', len(error_rows))

Rows 1
Errors 0


In [None]:
df = pd.DataFrame(rows)
df.head(5)

In [8]:
df_error = pd.DataFrame(error_rows)
df_error

Unnamed: 0,PDF Name,Race,Coach,Team,Sideline - Apothecary,Sideline - Assistant coaches,Sideline - Cheerleaders,Sideline - Dedicated fans,Sideline - Re-rolls,Summary - Players cost,Summary - Skills cost,Summary - Inducement cost,Summary - Sideline cost,Summary - Primary skills,Summary - Secondary skills,Inducement - Blitzer's Best Kegs,Inducement - Weather Mage,Inducement - Dodgy League Rep
0,Zons - Test.pdf,Amazon,Akorus,Matched Played Zons,Yes,1,0,0,3,6/6,0/1,1/1,MA ST AG PA AV SKILLS,1 Eagle Warrior,3,,,
1,EB - Orcs - Akorus.pdf,Orc,Akorus,EB - Orcs Test,No,0,0,0,0,Team,1040k/1050k +,Skill Gold,# POSITION,COST,5,1.0,1.0,1.0


In [None]:
df_emerald_bowl_processing = df[['PDF Name', 'Race', 'Coach', 'Team',
       'Summary - Players cost', 'Summary - Skills cost',
       'Summary - Inducement cost', 'Summary - Sideline cost',
       'Summary - Primary skills', 'Summary - Secondary skills']]
df_emerald_bowl_processing['emerald_bowl_skill_cost'] = df_emerald_bowl_processing['Summary - Primary skills'].astype(int) * 20  + df_emerald_bowl_processing['Summary - Secondary skills'].astype(int) * 30
df_emerald_bowl_processing

In [None]:
now_utc = datetime.now(timezone.utc)
timestamp_utc_str = now_utc.strftime("%Y%m%dT%H%M%S")
df.to_csv(f'roster_extracted_{timestamp_utc_str}.csv')

In [None]:
now_utc = datetime.now(timezone.utc)
timestamp_utc_str = now_utc.strftime("%Y%m%dT%H%M%S")
df_update.to_csv(f'eb_roster_update_{timestamp_utc_str}.csv')

In [None]:
df_error = pd.DataFrame(error_rows)
now_utc = datetime.now(timezone.utc)
timestamp_utc_str = now_utc.strftime("%Y%m%dT%H%M%S")
df_error = pd.DataFrame(error_rows).to_csv(f'eb_roster_errors_{timestamp_utc_str}.csv')
len(error_rows)