In [329]:
from PyPDF2 import PdfFileWriter, PdfFileReader
import re, string, csv, unicodedata, os, json


In [330]:
# Helper Functions

# get all the text from a pdf file
def get_text_from_pdf(url):
    """Return the text of every page of a PDF as a single ASCII-only string.

    url -- path of the PDF file; it is handed straight to open() in binary mode.

    The per-page texts are joined with single spaces, NFKD-normalised, and any
    character that still has no ASCII representation is dropped.
    """
    # NOTE(review): PdfFileReader / extractText are the legacy PyPDF2 names --
    # confirm the installed PyPDF2 version still provides them.
    with open(url, 'rb') as pdf_stream:
        reader = PdfFileReader(pdf_stream)
        # pull the text out while the underlying file is still open
        page_texts = [page.extractText() for page in reader.pages]
    joined = " ".join(page_texts)
    # decompose accented characters, then discard whatever is not plain ASCII
    ascii_bytes = unicodedata.normalize('NFKD', joined).encode('ascii', 'ignore')
    return ascii_bytes.decode()

# Strip newlines and fix hyphenated names
def fix_text(text):
    """Collapse line breaks and re-join hyphenated words, then trim.

    text -- the raw string captured from the extracted PDF text.

    Every newline (with any whitespace around it) becomes one space, and any
    hyphen sitting between two word characters loses the whitespace on either
    side of it, so "Lea - Riley" and "Taylor-\\nPayne" both come out joined.
    Returns the result with leading and trailing whitespace removed.
    """
    single_line = re.sub(r"\s*\n\s*", " ", text)
    rejoined = re.sub(r"(\w)\s*\-\s*(\w)", r"\1-\2", single_line)
    return rejoined.strip()

In [359]:
# Generic Regex Components

# \n889\n
# \n1384\n
TOT_SCORE = r"(\d{3,4}|\d{3,4}\.\d)\n?"

# Stars Fell On Alabama\nFly Me To The Moon\n
SONGS = r"([^\n]+)\n" * 2

# If You Love Me Really Love MeCome Follow The Band
BABS_SONGS = r"([A-Z][^\n]+[a-z])\n?" * 2

# 228\n223\n229\n225\n231\n227\n
# 252250247\n244\n247243111\n
CAT_SCORES = r"(\d{3})\n?" * 6 

# Previous (balanced):\n459.0\n458.0\n461.0\n
PREV_SCORES = r"Previous \([Bb]alanced\):\n?" + "(\d\d\d\.\d)\n?" * 3

# 1\n1\n1\nCategory rankings:\n
# 111\nCategory rankings:\n
CAT_RANKS = r"(\d{1,2})\n?" * 3 + "Category [Rr]ankings:\n"

# 1: 
PLACE = r"(\d+):\s*" 

NAME = r"([^\(]+?)\s*" # Assumes name is followed by a (

# (Sarah Hicks, Gill \nIrwin, Julie Robinson,  Monica \nFunnell)
SINGERS = r"\(" + r"(.+?)[\.,]\s*" * 3 + r"(.+?)[\.,]?\)\s*"

# (Jo Braham) 
DIRECTOR = r"\((.+?)\)[\n\s]*"

# (47)\n
PAX = r"\((\d{1,3})\)\n?"

# 82.3
PC_SCORE = r"(\d\d\.\d)" 

In [332]:
# LABBS QF (showing scores from previous round)
# Sample of the extracted text one contestant entry is matched against:
# \n1363\n2741.0\nStars Fell On Alabama\nFly Me To The Moon\n
# 228\n223\n229\n225\n231\n227\n
# Previous (balanced):\n459.0\n458.0\n461.0\n1\n1\n1\nCategory rankings:\n
# 1: Pzazz  (Sarah Hicks, Gill \nIrwin, Julie Robinson,  Monica \nFunnell)\n76.1\n
# Fragments in order: two totals, two songs, six category scores, three
# previous scores, three ranks, place, name, four singers, percentage.
labbs_qf_re = re.compile(
    "".join((
        TOT_SCORE, TOT_SCORE,
        BABS_SONGS,
        CAT_SCORES,
        PREV_SCORES,
        CAT_RANKS,
        PLACE,
        NAME,
        SINGERS,
        PC_SCORE,
    )),
    flags=re.DOTALL,
)

def get_quartets_final(text):
    """Yield one dict per quartet found by labbs_qf_re in the contest text.

    text -- the full extracted text of a results sheet.

    Every captured group is cleaned with fix_text. The first captured total
    and the three category ranks are read but not included in the output;
    'tot_score' is the second captured total. The previous-round scores are
    emitted as a third "song" entry titled 'Previous (balanced)'.
    """
    part_names = ('tenor', 'lead', 'bari', 'bass')
    for match in labbs_qf_re.finditer(text):
        cleaned = [fix_text(raw) for raw in match.groups()]
        (_round_total, total, song_a, song_b,
         mus_a, mus_b, prs_a, prs_b, sng_a, sng_b,
         mus_before, prs_before, sng_before,
         _mus_rank, _prs_rank, _sng_rank,
         placing, quartet, tenor, lead, bari, bass, percent) = cleaned

        voices = (tenor, lead, bari, bass)
        song_rows = (
            (song_a, mus_a, prs_a, sng_a),
            (song_b, mus_b, prs_b, sng_b),
            ('Previous (balanced)', mus_before, prs_before, sng_before),
        )
        yield {
            'name': quartet,
            'type': 'quartet',
            'place': placing,
            'tot_score': total,
            'pc_score': percent,
            'singers': [
                {'name': singer, 'part': part}
                for singer, part in zip(voices, part_names)
            ],
            'songs': [
                {'song': title, 'mus': mus, 'prs': prs, 'sng': sng}
                for title, mus, prs, sng in song_rows
            ],
        }

In [333]:
# LABBS QP / MQF
# Sample entry (titles and scores each on their own line):
# \n889\nIf Ever I Would Leave You\nLook At Me Now\n
# 149\n146\n154\n153\n143\n144\n1\n1\n1\nCategory rankings:\n
# 1: The Mix  (Andrea Day, Jo \nBraham, Nancy Kelsall, Sandra \nLea-Riley)\n74.1\n
labbs_q_re = re.compile(
    "".join((TOT_SCORE, SONGS, CAT_SCORES, CAT_RANKS, PLACE, NAME, SINGERS, PC_SCORE)),
    flags=re.DOTALL,
)

# BABS quartet entries; sample text where titles and scores run together:
# \n1418\nThere I've Said It AgainWouldn't It Be Lovely234242233\n236\n235238112\n
# Category rankings:\n1: Portobello Road  (Ian James, Brian Schofield, Steve Emery, \n\nStuart Owen)\n78.8

# 936A Fool Such As IOnce Upon A Time154158156\n157\n153158122\n
# Category rankings:\n2: The Locksmiths  (Richard Fisher, Simon Lubkowski, Zac \n\nBooles, Andrew Walker)\n78.0

# Same layout as labbs_q_re except that BABS_SONGS replaces SONGS.
babs_q_re = re.compile(
    "".join((TOT_SCORE, BABS_SONGS, CAT_SCORES, CAT_RANKS, PLACE, NAME, SINGERS, PC_SCORE)),
    flags=re.DOTALL,
)

def get_quartets(text):
    """Yield one dict per quartet found in the contest text.

    text -- the full extracted text of a results sheet.

    labbs_q_re is used when the district line (get_district) contains
    'LADIES', babs_q_re otherwise. Every captured group is cleaned with
    fix_text; the three category ranks are read but not included in the output.
    """
    is_ladies = re.search('LADIES', get_district(text))
    pattern = labbs_q_re if is_ladies else babs_q_re
    part_names = ('tenor', 'lead', 'bari', 'bass')

    for match in pattern.finditer(text):
        cleaned = [fix_text(raw) for raw in match.groups()]
        (total, song_a, song_b,
         mus_a, mus_b, prs_a, prs_b, sng_a, sng_b,
         _mus_rank, _prs_rank, _sng_rank,
         placing, quartet, tenor, lead, bari, bass, percent) = cleaned

        voices = (tenor, lead, bari, bass)
        song_rows = (
            (song_a, mus_a, prs_a, sng_a),
            (song_b, mus_b, prs_b, sng_b),
        )
        yield {
            'name': quartet,
            'type': 'quartet',
            'place': placing,
            'tot_score': total,
            'pc_score': percent,
            'singers': [
                {'name': singer, 'part': part}
                for singer, part in zip(voices, part_names)
            ],
            'songs': [
                {'song': title, 'mus': mus, 'prs': prs, 'sng': sng}
                for title, mus, prs, sng in song_rows
            ],
        }

In [334]:
# LABBS CF
# Sample entry (titles and scores each on their own line):
# \n1448\nHow'dja Like To Love Me?\nIf You Love Me Really Love Me\n
# 241\n241\n242\n240\n242\n242\n1\n1\n1\nCategory rankings:\n
# 1: Cheshire Chord Company  \n(Jo Braham) (47)\n80.4\n
labbs_c_re = re.compile(
    "".join((TOT_SCORE, SONGS, CAT_SCORES, CAT_RANKS, PLACE, NAME, DIRECTOR, PAX, PC_SCORE)),
    flags=re.DOTALL,
)

# BABS CF
# Sample entry where titles and scores run together:
# \n1483\nIf You Love Me Really Love MeCome Follow The Band252250247\n244\n247243111\n
# Category rankings:\n1: The Cottontown Chorus  \n(Neil Firth) (62)82.4
# Same layout as labbs_c_re except that BABS_SONGS replaces SONGS.
babs_c_re = re.compile(
    "".join((TOT_SCORE, BABS_SONGS, CAT_SCORES, CAT_RANKS, PLACE, NAME, DIRECTOR, PAX, PC_SCORE)),
    flags=re.DOTALL,
)

def get_choruses(text):
    """Yield one dict per chorus found in the contest text.

    text -- the full extracted text of a results sheet.

    labbs_c_re is used when the district line (get_district) contains
    'LADIES', babs_c_re otherwise. Every captured group is cleaned with
    fix_text; the three category ranks are read but not included in the output.
    'singers' holds the parenthesised number captured by PAX, as a string.
    """
    is_ladies = re.search('LADIES', get_district(text))
    pattern = labbs_c_re if is_ladies else babs_c_re

    for match in pattern.finditer(text):
        cleaned = [fix_text(raw) for raw in match.groups()]
        (total, song_a, song_b,
         mus_a, mus_b, prs_a, prs_b, sng_a, sng_b,
         _mus_rank, _prs_rank, _sng_rank,
         placing, chorus, conductor, headcount, percent) = cleaned

        song_rows = (
            (song_a, mus_a, prs_a, sng_a),
            (song_b, mus_b, prs_b, sng_b),
        )
        # NOTE(review): the score keys here are 'm'/'p'/'s' while the quartet
        # parsers emit 'mus'/'prs'/'sng' -- confirm whether that is intended.
        yield {
            'name': chorus,
            'type': 'chorus',
            'place': placing,
            'tot_score': total,
            'pc_score': percent,
            'director': conductor,
            'singers': headcount,
            'songs': [
                {'song': title, 'm': mus, 'p': prs, 's': sng}
                for title, mus, prs, sng in song_rows
            ],
        }

In [335]:
# Extract judges and contest admin (CA) names from text
def get_judges(text):
    """Return the judging panel as a list of {'cat': ..., 'name': ...} dicts.

    text -- the full extracted text of a results sheet.

    The panel is read from the "Music:", "Performance:"/"Presentation:",
    "Singing:" and "Admin:"/"CA:" labels, in that order. Each comma-separated
    name is cleaned with fix_text and tagged 'm', 'p', 's' or 'ca'.
    Blank entries (for example after a trailing comma) are skipped.

    Raises ValueError if the panel labels cannot be found in the text.
    """
    panel_re = re.compile(
        r"Music:(.*)\n?(?:Performance|Presentation):(.*)\n?Singing:(.*)\n?(?:Admin|CA):(.*?)(?:\n|Signed)",
        re.DOTALL)
    m = panel_re.search(text)
    if m is None:
        # search() returns None on failure; report that instead of letting
        # m.group() fail with an unrelated-looking AttributeError
        raise ValueError("judging panel (Music/Performance/Singing/CA) not found in contest text")

    judges = []
    # the four captured groups line up with the four category codes
    for cat, names in zip(('m', 'p', 's', 'ca'), m.groups()):
        for n in names.split(','):
            name = fix_text(n.strip())
            if name:  # a trailing comma before a line wrap leaves an empty piece
                judges.append({'cat': cat, 'name': name})
    return judges

# Extract contest date from text
def get_date(text):
    """Return the remainder of the line that follows "Contest date:".

    text -- the full extracted text of a results sheet.

    The captured value is cleaned with fix_text before being returned.
    """
    found = re.search(r"Contest date:\s*(.*)\s*", text)
    raw_date = found.group(1)
    return fix_text(raw_date)

# Extract contest name from text
def get_contest(text):
    """Return the second line of the text, cleaned with fix_text.

    text -- the full extracted text of a results sheet.
    """
    found = re.match(r".*\n(.*)", text)
    second_line = found.group(1)
    return fix_text(second_line)

# Extract district name from text
def get_district(text):
    """Return the first line of the text, cleaned with fix_text.

    text -- the full extracted text of a results sheet.
    """
    found = re.match(r"(.*)", text)
    first_line = found.group(1)
    return fix_text(first_line)

In [336]:
# Scrape all pdf files in the same directory
#
# For every *.pdf in each listed directory: extract the text, pick the parser
# that fits the contest type, and write the parsed contest next to the PDF as
# a .json file with the same base name.

directories = [#'LABBS/',
               'BABS/']

for directory in directories:
    for filename in os.listdir(directory):
        if not filename.endswith(".pdf"):
            continue
        # extract the raw text
        text = get_text_from_pdf(os.path.join(directory, filename))
        # the header lines are needed both for parser selection and for the
        # output record, so parse them once
        district_name = get_district(text)
        contest_name = get_contest(text)
        # choose the appropriate parsing functions depending on type of contest
        if re.search('CHORUS', contest_name):
            get_contestants = get_choruses
        elif re.search(r"Previous \([Bb]alanced\)", text):
            get_contestants = get_quartets_final
        else:
            get_contestants = get_quartets
        # parse the text, create object
        contest = {
            'district': district_name,
            'contest': contest_name,
            'date': get_date(text),
            'contestants': list(get_contestants(text)),
            'judges': get_judges(text)
        }
        # save object as JSON; swap only the extension, so a "pdf" elsewhere
        # in the file name is left untouched
        json_name = os.path.splitext(filename)[0] + '.json'
        with open(os.path.join(directory, json_name), 'w') as outfile:
            json.dump(contest, outfile, indent=2)

In [358]:
# Debugging
# Load the extracted text of a single PDF so the parsers and regexes can be
# tried against it by hand.
text = get_text_from_pdf('BABS/2014QP-1.pdf')
# Hand-written samples that can be swapped in for the PDF text by uncommenting:
# a judging-panel snippet, and a single quartet entry.
# text = "Music: Helen Brown, Rob Campbell, Paul Davies\nPresentation: Pat Deeble, Nickie James, Mike WarnerSinging: Cherry Hartshorn,\n David King, Mike Taylor\nCA: Chris Tideman, Samantha Roberts, Helen Ring, Yvonne Hughes, Phil Mobsby, Alastair Taylor-Payne\n"
# text = "936A Fool Such As iOnce Upon A Time154158156\n157\n153158122\nCategory rankings:\n2: The Locksmiths  (Richard Fisher, Simon Lubkowski, Zac Booles, Andrew Walker)\n78.0"


# Bare expression: shows the current value of text as the cell output.
text

"THE BRITISH ASSOCIATION OF BARBERSHOP SINGERS\nQUARTET PRELIM CONTEST 1  -  BIRMINGHAM: 2014/2015\nOFFICIAL CONTEST RESULT\nScores from semi-finals are normalised to take account of any panel size differences\n. Decimal fractions may therefore exist.\nCategory rankings are calculated on the category to\ntals including previous scores where these exist.\nCategory scores reflect any score reduc\ntions. Reductions are shown in bracke\nts under column headers MR, PR and SR.MusPresSingTotals\n%Ch.Pen\nSongsContestantMRPR\n948Over The RainbowSouth Rampart Street Parade150155165\n163\n158157211\nCategory rankings:\n1: Finest Hour  (James Williams, Eddie Williams, Nick \nWilliams, Phil Cuthbert)79.0936A Fool Such As IOnce Upon A Time154158156\n157\n153158122\nCategory rankings:\n2: The Locksmiths  (Richard Fisher, Simon Lubkowski, Zac \n\nBooles, Andrew Walker)\n78.0900You Make Me Feel So YoungLove Me And The World Is Mine152153150\n149\n147149245\nCategory rankings:\n3: Quantum  (Simon Hunt,