In [541]:
from PyPDF2 import PdfFileWriter, PdfFileReader
import re, string, csv, unicodedata, os, json, subprocess, pprint

# Shorthand pretty-printer (4-space indent) used by the inspection cells below
_printer = pprint.PrettyPrinter(indent=4)
pp = _printer.pprint



In [559]:
def pdftotext(url, exe=r'C:\Program Files\Xpdf\bin64\pdftotext.exe'):
    """Convert a PDF file to text with ``pdftotext -raw`` and return the text.

    The text file is written next to the PDF (same path with the extension
    replaced by ``.txt``) and then read back.

    url: path of the PDF file to convert.
    exe: path of the pdftotext executable; the default is the location that
         was previously hard-coded.

    Raises subprocess.CalledProcessError if the converter exits with a
    non-zero status.
    """
    # Swap only the extension: str.replace('pdf', 'txt') would also rewrite
    # any other 'pdf' in the path (e.g. a directory called 'pdfs').
    texturl = os.path.splitext(url)[0] + '.txt'
    # check=True so a failed conversion is reported here rather than silently
    # reading a stale .txt file left over from an earlier run.
    subprocess.run([exe, '-raw', url, texturl], check=True)
    with open(texturl) as f:
        return f.read()

def pdftohtml(url, exe=r'C:\Program Files\Xpdf\bin64\pdftohtml.exe'):
    """Run the pdftohtml executable on a PDF file; always returns None.

    The output path passed to the tool is the PDF path with its extension
    replaced by ``.html``.

    url: path of the PDF file to convert.
    exe: path of the pdftohtml executable; the default is the location that
         was previously hard-coded.
    """
    # Swap only the extension: str.replace('pdf', 'html') would also rewrite
    # any other 'pdf' in the path (e.g. a directory called 'pdfs').
    htmlurl = os.path.splitext(url)[0] + '.html'
    subprocess.run([exe, url, htmlurl])
    return None
    
def fix_text(text):
    """Collapse line breaks and re-join hyphenated words; None passes through.

    Each newline (with any surrounding whitespace) becomes a single space,
    whitespace around a hyphen between two word characters is removed, and
    the result is trimmed at both ends.
    """
    if text is not None:
        single_line = re.sub(r"\s*\n\s*", " ", text)
        rejoined = re.sub(r"(\w)\s*\-\s*(\w)", r"\1-\2", single_line)
        return rejoined.strip()
    return None

def dict_from_regex_match(m, keys):
    """Map each name in `keys` to the cleaned-up text of that named group of match `m`."""
    cleaned = {}
    for group_name in keys:
        cleaned[group_name] = fix_text(m.group(group_name))
    return cleaned

def getfiles(directories, ext):
    """Yield the joined path of every entry whose name ends with `ext`.

    directories: iterable of directory paths; each is listed non-recursively.
    ext: filename suffix to keep (e.g. 'pdf').

    Directories are visited in the order given; within a directory the
    entries are yielded in sorted name order.
    """
    for directory in directories:
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # processing (and therefore export) order reproducible between runs.
        for filename in sorted(os.listdir(directory)):
            if filename.endswith(ext):
                yield os.path.join(directory, filename)
    

In [543]:
# DEBUG
#get_contest_details(texts['BABS/2007QSF.pdf'])
#texts['BABS/2007QSF.pdf']

In [544]:
def get_contest_details(text):
    """Extract the header details of a scoresheet from its text.

    The text must start with the association line, followed (optionally after
    an 'OFFICIAL CONTEST RESULT' line) by '<contest> - <location>: <year>',
    and must contain a date later on.

    Returns a dict with the keys 'assoc', 'contest', 'location', 'year' and
    'date', each holding the raw matched string.

    Raises ValueError if the text does not have that layout.
    """
    # (OFFICIAL CONTEST RESULT\n)? and (\d{2} \w{3} 20\d{2}) are unique to the 2009 files
    r = r'^(?P<assoc>.*)\n(OFFICIAL CONTEST RESULT\n)?(?P<contest>.*) - (?P<location>.*): (?P<year>.*)\n[\w\W]+(?P<date>\d{2}/\d{2}/20\d{2}|\d{2} \w{3} 20\d{2})'
    m = re.search(r, text)
    if m is None:
        # Without this guard the failure surfaces as
        # "'NoneType' object has no attribute 'group'", which hides the cause.
        raise ValueError('contest header (association / contest - location: year / date) not found in text')
    return {key: m.group(key) for key in ('assoc', 'contest', 'location', 'year', 'date')}
    
def get_contest_type(text):
    """Classify a scoresheet as 'chorus', 'quartet final' or 'quartet'.

    'chorus' if the text contains 'CHORUS'; otherwise 'quartet final' if it
    contains a 'Previous (balanced)' / 'Previous (Balanced)' entry; otherwise
    'quartet'.
    """
    if re.search('CHORUS', text) is not None:
        return 'chorus'
    has_previous_round = re.search(r"Previous \([Bb]alanced\)", text) is not None
    return 'quartet final' if has_previous_round else 'quartet'
    
def get_judges(text):
    """Extract the judges and contest administrator names from scoresheet text.

    Looks for consecutive 'Music:', 'Performance:'/'Presentation:',
    'Singing:' and 'Admin:'/'CA:'/'CoJ:' entries (each optionally prefixed
    with ' Rolling Panel:-') and splits each entry on commas.

    Returns a list of {'cat': <code>, 'name': <name>} dicts, where the code is
    'm', 'p', 's' or 'a' for the four entries above, in that order.

    Raises ValueError if the panel entries are not found.
    """
    keys = ('m', 'p', 's', 'a')
    r =  r'Music:(?: Rolling Panel:-)?(?P<m>.*)\n?'
    r += r'(?:Performance|Presentation):(?: Rolling Panel:-)?(?P<p>.*)\n?'
    r += r'Singing:(?: Rolling Panel:-)?(?P<s>.*)\n?'
    r += r'(?:Admin|CA|CoJ):(?: Rolling Panel:-)?(?P<a>.*?)(?:\n|Signed)'
    # DOTALL lets '.' cross line breaks, so a name list wrapped over several
    # lines is still captured as one group.
    m = re.compile(r, re.DOTALL).search(text)
    if m is None:
        # Without this guard the failure surfaces as
        # "'NoneType' object has no attribute 'group'", which hides the cause.
        raise ValueError('judging panel (Music / Performance / Singing / Admin entries) not found in text')
    return [
        {'cat': key, 'name': fix_text(n.strip())}
        for key in keys
        for n in m.group(key).split(',')
    ]

In [545]:
# Generic Regex Components
#
# Each constant below is a regex fragment with named groups; the parsing
# functions concatenate them into one pattern per contestant entry.

def _named_groups(template, keys):
    """Repeat `template` once per key, substituting the key as the group name."""
    return "".join(template % key for key in keys)

# Total score, optionally with one decimal place:
#   \n889\n   \n1384\n   \n1384.0\n
TOT_SCORE = r"(?P<tot_score>\d{3,4})(?:\.\d)?\s*"
PREV_TOT_SCORE = r"(?P<prev_tot_score>\d{3,4})(?:\.\d)?\s*"

# Two song titles, one per line:
#   Stars Fell On Alabama\nFly Me To The Moon\n
SONGS = _named_groups(r"(?P<%s>[^\n]+)\s", ('name1', 'name2'))

# Six three-digit category scores, with or without whitespace between them:
#   228\n223\n229\n225\n231\n227\n
#   252250247\n244\n247243111\n
CAT_SCORES = _named_groups(r"(?P<%s>\d{3})\s*", ('m1', 'p1', 's1', 'm2', 'p2', 's2'))

# Scores carried over from the previous round:
#   Previous (balanced):\n459.0\n458.0\n461.0\n
PREV_SCORES = (
    r"(?P<nameprev>Previous) \([Bb]alanced\):\s?"
    + _named_groups(r"(?P<%s>\d{3})\.\d\s*", ('mprev', 'pprev', 'sprev'))
)

# Three category ranks followed by their label:
#   1\n1\n1\nCategory rankings:\n
#   111\nCategory rankings:\n
CAT_RANKS = (
    _named_groups(r"(?P<%s>\d{1,2})\s*", ('rank_m', 'rank_p', 'rank_s'))
    + "Category [Rr]ankings:\n"
)

# Overall rank:
#   1:
RANK = r"(?P<rank>\d+):\s*"

# Chorus/Quartet name; assumes the name is followed by a (
NAME = r"(?P<name>[^\(]+?)\s*"

# Optional list of four singers (some quartets don't give one):
#   (Sarah Hicks, Gill \nIrwin, Julie Robinson,  Monica \nFunnell)
SINGERS = (
    r"(?:"
    r"\((?P<tenor>[^\.,]+?)[\.,]\s*"
    r"(?P<lead>[^\.,]+?)[\.,]\s*"
    r"(?P<bari>[^\.,]+?)[\.,]\s*"
    r"(?P<bass>[^\.,]+?)[\.,]?\)\s*"
    r")?"
)

# Chorus director:
#   (Jo Braham)
DIRECTOR = r"\((?P<director>[^\.,]+?)\)\s*"

# Chorus size:
#   (47)\n
SIZE = r"\((?P<size>\d{1,3})\)\s?"

# Percentage score:
#   82.3
PC_SCORE = r"(?P<pc_score>\d\d\.\d)"

In [546]:
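# TODO: get_contestants (below) is the merged parser that takes keys, regex, song keys, and member keys as arguments; remove the per-type functions get_choruses / get_quartets / get_finals_quartets once they are no longer needed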

def get_contestants(text, keys, song_keys, member_keys, regex):
    """Parse every contestant entry in `text` that matches `regex`.

    text: scoresheet text to scan.
    keys: named groups copied straight onto each contestant dict.
    song_keys: suffixes identifying each song; the groups 'name', 'm', 's'
               and 'p' followed by the suffix must exist in `regex`.
    member_keys: named groups holding member names; the group name is stored
                 as the member's 'part'.
    regex: pattern matching one contestant entry.

    Returns a list with one dict per match, holding the cleaned `keys` values
    plus a 'songs' list and a 'members' list.
    """
    def build_entry(match):
        entry = {key: fix_text(match.group(key)) for key in keys}
        entry['songs'] = [
            {field: fix_text(match.group(field + str(suffix))) for field in ('name', 'm', 's', 'p')}
            for suffix in song_keys
        ]
        entry['members'] = [
            {'part': part, 'name': fix_text(match.group(part))}
            for part in member_keys
        ]
        return entry

    pattern = re.compile(regex)
    return [build_entry(match) for match in pattern.finditer(text)]

def get_choruses(text):
    """Get details of the chorus contestants found in `text`.

    Returns a list of contestant dicts (see get_contestants) with two songs
    each and a single 'director' member.
    """
    keys = ('tot_score', 'rank_m', 'rank_s', 'rank_p', 'rank', 'name', 'size', 'pc_score')
    r = TOT_SCORE + SONGS + CAT_SCORES + CAT_RANKS + RANK + NAME + DIRECTOR + SIZE + PC_SCORE
    # Delegate to the shared parser rather than repeating its loop here.
    return get_contestants(text, keys, song_keys=(1, 2), member_keys=('director',), regex=r)

def get_quartets(text):
    """Get details of the quartet contestants found in `text`.

    Returns a list of contestant dicts (see get_contestants) with two songs
    each and 'tenor', 'lead', 'bari' and 'bass' members.
    """
    keys = ('tot_score', 'rank_m', 'rank_s', 'rank_p', 'rank', 'name', 'pc_score')
    r = TOT_SCORE + SONGS + CAT_SCORES + CAT_RANKS + RANK + NAME + SINGERS + PC_SCORE
    # Delegate to the shared parser rather than repeating its loop here.
    return get_contestants(text, keys, song_keys=(1, 2), member_keys=('tenor', 'lead', 'bari', 'bass'), regex=r)

def get_finals_quartets(text):
    """Get details of the quartet finals contestants found in `text`.

    Returns a list of contestant dicts (see get_contestants) that also carry
    'prev_tot_score'; each has three 'songs' entries (songs 1 and 2 plus the
    'prev' entry built from the "Previous (balanced)" scores) and 'tenor',
    'lead', 'bari' and 'bass' members.
    """
    keys = ('prev_tot_score', 'tot_score', 'rank_m', 'rank_s', 'rank_p', 'rank', 'name', 'pc_score')
    r = PREV_TOT_SCORE + TOT_SCORE + SONGS + CAT_SCORES + PREV_SCORES + CAT_RANKS + RANK + NAME + SINGERS + PC_SCORE
    # Delegate to the shared parser rather than repeating its loop here.
    return get_contestants(text, keys, song_keys=(1, 2, 'prev'), member_keys=('tenor', 'lead', 'bari', 'bass'), regex=r)

def get_contests(texts):
    """Parse scoresheet texts, yielding one contest dict per scoresheet.

    texts: mapping of filename -> scoresheet text.

    Each yielded dict holds the header details from get_contest_details plus
    'judges', 'type', 'contestants' and 'filename'.
    """
    quartet_members = ('tenor', 'lead', 'bari', 'bass')
    # contest type -> (keys, song_keys, member_keys, regex) for get_contestants
    layouts = {
        'chorus': (
            ('tot_score', 'rank_m', 'rank_s', 'rank_p', 'rank', 'name', 'size', 'pc_score'),
            (1, 2),
            ('director',),
            TOT_SCORE + SONGS + CAT_SCORES + CAT_RANKS + RANK + NAME + DIRECTOR + SIZE + PC_SCORE,
        ),
        'quartet final': (
            ('prev_tot_score', 'tot_score', 'rank_m', 'rank_s', 'rank_p', 'rank', 'name', 'pc_score'),
            (1, 2, 'prev'),
            quartet_members,
            PREV_TOT_SCORE + TOT_SCORE + SONGS + CAT_SCORES + PREV_SCORES + CAT_RANKS + RANK + NAME + SINGERS + PC_SCORE,
        ),
    }
    # Any other type is parsed as an ordinary quartet contest
    quartet_layout = (
        ('tot_score', 'rank_m', 'rank_s', 'rank_p', 'rank', 'name', 'pc_score'),
        (1, 2),
        quartet_members,
        TOT_SCORE + SONGS + CAT_SCORES + CAT_RANKS + RANK + NAME + SINGERS + PC_SCORE,
    )

    for filename, text in texts.items():
        contest = get_contest_details(text)
        contest['judges'] = get_judges(text)
        contest['type'] = get_contest_type(text)
        keys, song_keys, member_keys, r = layouts.get(contest['type'], quartet_layout)
        contest['contestants'] = get_contestants(text, keys, song_keys, member_keys, r)
        contest['filename'] = filename
        yield contest

In [560]:
# Convert every PDF in the source directories to text, keyed by PDF path
directories = ['BABS/', 'LABBS/']
texts = {pdf_path: pdftotext(pdf_path) for pdf_path in getfiles(directories, 'pdf')}

In [548]:
# Run the parser over every text; materialise the generator into a list so
# the cells below can iterate over the contests more than once
contests = [*get_contests(texts)]

In [549]:
# List every contestant whose parsed name is longer than 30 characters,
# together with the file it came from
long_names = (
    (contest['filename'], contestant['name'])
    for contest in contests
    for contestant in contest['contestants']
    if len(contestant['name']) > 30
)
for filename, name in long_names:
    print(filename, name, '', sep='\n')

BABS/2010CF.pdf
The Great Western Chorus Of Bristol

BABS/2014CF.pdf
The Great Western Chorus Of Bristol

BABS/2015CF.pdf
The Great Western Chorus Of Bristol



In [550]:
# export to excel
import pandas as pd

# pandas.io.json.json_normalize has been deprecated since pandas 1.0 in favour
# of the top-level pandas.json_normalize; only fall back to the old location
# on a pandas that predates the new name.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize

# Contest-level fields repeated on every exported row
CONTEST_META = ['assoc', 'contest', 'year', 'location', 'type', 'date', 'filename']

# One row per contestant
dfs = [json_normalize(contest, 'contestants', CONTEST_META) for contest in contests]
pd.concat(dfs).to_excel('by contestants.xlsx')

# One row per song, carrying the owning contestant's name, scores and rank
dfs = [
    json_normalize(contest,
                   ['contestants', 'songs'],
                   CONTEST_META + [
                       ['contestant', 'name'],
                       ['contestant', 'tot_score'],
                       ['contestant', 'pc_score'],
                       ['contestant', 'rank'],
                   ])
    for contest in contests
]
pd.concat(dfs).to_excel('by songs.xlsx')

In [551]:
# export to individual JSON files (one per contest, next to the source PDF)
for contest in contests:
    # Swap only the extension: str.replace('pdf', 'json') would also rewrite
    # any other 'pdf' in the path (e.g. a directory called 'pdfs').
    json_path = os.path.splitext(contest['filename'])[0] + '.json'
    with open(json_path, 'w') as outfile:
        json.dump(contest, outfile, indent=2)

In [552]:
# Write every contest into a single combined JSON file
with open('all_contests.json', 'w') as combined_file:
    json.dump(contests, combined_file, indent=2)

In [553]:
# export to mongodb

#import pymongo
#MONGODB_PW = ''
#client = pymongo.MongoClient("mongodb://admin:%s@scores-shard-00-00-h0xbs.mongodb.net:27017,scores-shard-00-01-h0xbs.mongodb.net:27017,scores-shard-00-02-h0xbs.mongodb.net:27017/<DATABASE>?ssl=true&replicaSet=scores-shard-0&authSource=admin" % MONGODB_PW)
#db = client.barbershop

#for contest in contests:
#    with open(file.replace('pdf', 'json'), 'w') as outfile:
#        db.scores.insert_one(contest)
        
        

In [554]:
# Pretty-print the parsed result for one scoresheet as a spot check
selected = (contest for contest in contests if contest['filename'] == 'BABS/2008QSF.pdf')
for contest in selected:
    pp(contest)

{   'assoc': 'THE BRITISH ASSOCIATION OF BARBERSHOP SINGERS',
    'contest': 'QUARTET SEMI-FINAL (NATIONAL STREAM)',
    'contestants': [   {   'members': [   {'name': 'Alan', 'part': 'tenor'},
                                          {'name': 'Zac', 'part': 'lead'},
                                          {'name': 'Joe', 'part': 'bari'},
                                          {'name': 'Duncan', 'part': 'bass'}],
                           'name': 'Monkey Magic',
                           'pc_score': '76.7',
                           'rank': '1',
                           'rank_m': '1',
                           'rank_p': '1',
                           'rank_s': '1',
                           'songs': [   {   'm': '229',
                                            'name': "What'll I Do",
                                            'p': '236',
                                            's': '228'},
                                        {   'm': '226',
                    