In [392]:
from PyPDF2 import PdfFileWriter, PdfFileReader
import re, string, csv, unicodedata, os, json, subprocess, pprint

# Shorthand for ad-hoc inspection: pp(obj) pretty-prints obj with a 4-space indent
_pretty_printer = pprint.PrettyPrinter(indent=4)
pp = _pretty_printer.pprint



In [393]:
# Convert PDF file to text; returns text as string
def pdftotext(url, exe=r'C:\Program Files\Xpdf\bin64\pdftotext.exe'):
    """Convert a PDF file to text with pdftotext and return the text.

    The text is written next to the PDF, with the file extension swapped
    for ``.txt``, and then read back.

    Args:
        url: Path of the PDF file to convert.
        exe: Path of the pdftotext executable (invoked with the ``-raw`` flag).

    Returns:
        The contents of the generated text file as a string.

    Raises:
        subprocess.CalledProcessError: If pdftotext exits with a non-zero status.
    """
    # Swap only the extension. str.replace('pdf', 'txt') would also rewrite any
    # other 'pdf' in the path (e.g. a 'pdfs/' folder) and, for a name without a
    # lowercase 'pdf', would make the output path equal to the input path.
    texturl = os.path.splitext(url)[0] + '.txt'
    # check=True: fail loudly instead of silently reading a stale or missing .txt
    subprocess.run([exe, '-raw', url, texturl], check=True)
    with open(texturl) as f:
        return f.read()

# Strip newlines and fix hyphenated names
def fix_text(text):
    """Collapse line breaks to single spaces and re-join broken hyphenations.

    Args:
        text: String extracted from a scoresheet (may span several lines).

    Returns:
        The text on one line, with whitespace around any hyphen that sits
        between two word characters removed, and outer whitespace trimmed.
    """
    text = re.sub(r"\s*\n\s*", " ", text)  # strip newlines
    # Lookarounds keep the neighbouring word characters out of the match, so
    # consecutive hyphens that share a one-character word ("A - B - C") are all
    # fixed; consuming the characters would skip every second hyphen.
    text = re.sub(r"(?<=\w)\s*-\s*(?=\w)", "-", text)
    return text.strip()  # trim whitespace

# Iterate through files in a list of directories
def getfiles(directories, ext):
    """Lazily yield the path of every directory entry whose name ends with ext.

    Directories are listed one at a time, in the order given; each yielded
    path is the directory joined with the entry name.
    """
    for folder in directories:
        yield from (
            os.path.join(folder, entry)
            for entry in os.listdir(folder)
            if entry.endswith(ext)
        )
    

In [394]:
# DEBUG
#get_contest_details(texts['BABS/2007QSF.pdf'])
#texts['BABS/2007QSF.pdf']

In [395]:
# Extract contest details from text
CONTEST_DETAILS = ('assoc', 'contest', 'location', 'year', 'date')
def get_contest_details(text):
    """Pull the header fields of a scoresheet out of its text.

    The first line is taken as the association; the next header line is
    expected to read ``<contest> - <location>: <year>``. Because the filler
    ``[\\w\\W]+`` is greedy, the date captured is the last date-like string
    found after the header.

    Args:
        text: Full text of one scoresheet.

    Returns:
        Dict with one string per key in CONTEST_DETAILS.

    Raises:
        ValueError: If the text does not have the expected header layout.
    """
    # (OFFICIAL CONTEST RESULT\n)? and (\d{2} \w{3} 20\d{2}) are unique to the 2009 files
    r = r'^(?P<assoc>.*)\n(OFFICIAL CONTEST RESULT\n)?(?P<contest>.*) - (?P<location>.*): (?P<year>.*)\n[\w\W]+(?P<date>\d{2}/\d{2}/20\d{2}|\d{2} \w{3} 20\d{2})'
    m = re.search(r, text)
    if m is None:
        # Without this guard the failure surfaces as an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError('contest details not found in text starting %r' % text[:60])
    return {key: m.group(key) for key in CONTEST_DETAILS}
    
# Check contest type
def get_contest_type(text):
    """Classify a scoresheet as 'chorus', 'quartet final' or 'quartet'.

    Any sheet containing 'CHORUS' is a chorus contest; otherwise a
    "Previous (balanced)" entry marks a quartet final, and everything else
    is a plain quartet contest.
    """
    if 'CHORUS' in text:
        return 'chorus'
    has_previous_scores = re.search(r"Previous \([Bb]alanced\)", text) is not None
    return 'quartet final' if has_previous_scores else 'quartet'
    
# Extract judges and contest admin (CA) names from text
def get_judges(text):
    """Extract the judging panel and contest administrators from a scoresheet.

    Looks for the consecutive "Music:", "Performance:"/"Presentation:",
    "Singing:" and "Admin:"/"CA:"/"CoJ:" entries. Each entry holds a
    comma-separated list of names; re.DOTALL lets an entry run over a line
    break, and fix_text puts each name back on one line.

    Args:
        text: Full text of one scoresheet.

    Returns:
        List of ``{'cat': ..., 'name': ...}`` dicts, where cat is 'm', 'p',
        's' or 'ca', in that category order.

    Raises:
        ValueError: If the panel entries cannot be found in the text.
    """
    keys = ('m', 'p', 's', 'ca')
    r =  r'Music:(?: Rolling Panel:-)?(?P<m>.*)\n?'
    r += r'(?:Performance|Presentation):(?: Rolling Panel:-)?(?P<p>.*)\n?'
    r += r'Singing:(?: Rolling Panel:-)?(?P<s>.*)\n?'
    r += r'(?:Admin|CA|CoJ):(?: Rolling Panel:-)?(?P<ca>.*?)(?:\n|Signed)'
    m = re.search(r, text, re.DOTALL)
    if m is None:
        # Without this guard the failure surfaces as an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError('judging panel not found in scoresheet text')
    judges = []
    for key in keys:
        judges.extend({'cat': key, 'name': fix_text(n.strip())} for n in m.group(key).split(','))
    return judges

In [396]:
# Generic Regex Components
#
# Each constant is one fragment of a scoresheet-row pattern. The sample above
# a fragment shows the kind of text it is meant to match.

def _named_groups(template, names):
    # Fill the single %s slot of `template` once per group name and concatenate
    return "".join(template % name for name in names)

# One person's name inside brackets: anything up to the next '.' or ','
_PERSON = r"(?P<%s>[^\.,]+?)"
# Separator between names ('.' is accepted as well as ',')
_NAME_SEP = r"[\.,]"

# Total score, e.g.
# \n889\n
# \n1384\n
TOT_SCORE = r"(?P<tot_score>\d{3,4})\s*"

# Finals total with one decimal place, e.g.
# \n1384.0\n
FINAL_TOT_SCORE = r"(?P<final_tot_score>\d{3,4})\.\d\s*"

# Two song titles, one per line, e.g.
# Stars Fell On Alabama\nFly Me To The Moon\n
SONGS = _named_groups(r"(?P<%s>[^\n]+)\s", ('title1', 'title2'))

# Six 3-digit category scores, possibly run together, e.g.
# 228\n223\n229\n225\n231\n227\n
# 252250247\n244\n247243111\n
CAT_SCORES = _named_groups(r"(?P<%s>\d{3})\s*", ('m1', 'p1', 's1', 'm2', 'p2', 's2'))

# Scores carried over from the previous round, e.g.
# Previous (balanced):\n459.0\n458.0\n461.0\n
PREV_SCORES = (
    r"(?P<titleprev>Previous) \([Bb]alanced\):\s?"
    + _named_groups(r"(?P<%s>\d{3})\.\d\s*", ('mprev', 'pprev', 'sprev'))
)

# Three category ranks followed by their label, e.g.
# 1\n1\n1\nCategory rankings:\n
# 111\nCategory rankings:\n
CAT_RANKS = (
    _named_groups(r"(?P<%s>\d{1,2})\s*", ('rank_m', 'rank_p', 'rank_s'))
    + "Category [Rr]ankings:\n"
)

# Overall placing, e.g.
# 1: 
RANK = r"(?P<rank>\d+):\s*"

# Chorus/Quartet name
NAME = r"(?P<name>[^\(]+?)\s*" # Assumes name is followed by a (

# Four bracketed singers in tenor, lead, bari, bass order, e.g.
# (Sarah Hicks, Gill \nIrwin, Julie Robinson,  Monica \nFunnell)
SINGERS = (
    r"\("
    + (_NAME_SEP + r"\s*").join(_PERSON % part for part in ('tenor', 'lead', 'bari', 'bass'))
    + _NAME_SEP + r"?\)\s*"
)

# Bracketed director name, e.g.
# (Jo Braham) 
DIRECTOR = r"\(" + (_PERSON % 'director') + r"\)\s*"

# Bracketed chorus size, e.g.
# (47)\n
SIZE = r"\((?P<size>\d{1,3})\)\s?"

# Percentage score, e.g.
# 82.3
PC_SCORE = r"(?P<pc_score>\d\d\.\d)"

In [397]:
# Get details of chorus contestants
def _parse_contestants(text, pattern, keys, song_suffixes, with_singers):
    # Shared worker for the three contestant parsers: builds one dict per
    # match of `pattern` in `text`. Every captured value goes through fix_text.
    contestants = []
    for m in re.compile(pattern).finditer(text):
        # Flat fields copied straight from the named groups
        entry = {key: fix_text(m.group(key)) for key in keys}
        # One dict per song: groups title<n>, m<n>, s<n>, p<n> -> keys title, m, s, p
        entry['songs'] = [
            {key: fix_text(m.group(key + str(n))) for key in ('title', 'm', 's', 'p')}
            for n in song_suffixes
        ]
        if with_singers:
            entry['singers'] = [
                {'part': key, 'name': fix_text(m.group(key))}
                for key in ('tenor', 'lead', 'bari', 'bass')
            ]
        contestants.append(entry)
    return contestants

def get_choruses(text):
    """Get details of chorus contestants.

    Args:
        text: Full text of a chorus scoresheet.

    Returns:
        List with one dict per matched chorus: total score, category ranks,
        rank, name, director, size and percentage score (all strings), plus
        'songs', a list of two dicts with keys 'title', 'm', 's' and 'p'.
        Empty if nothing in the text matches.
    """
    keys = ('tot_score', 'rank_m', 'rank_s', 'rank_p', 'rank', 'name', 'director', 'size', 'pc_score')
    r = TOT_SCORE + SONGS + CAT_SCORES + CAT_RANKS + RANK + NAME + DIRECTOR + SIZE + PC_SCORE
    return _parse_contestants(text, r, keys, (1, 2), with_singers=False)

# Get details of quartet contestants
def get_quartets(text):
    """Get details of quartet contestants.

    Args:
        text: Full text of a quartet scoresheet.

    Returns:
        List with one dict per matched quartet: total score, category ranks,
        rank, name and percentage score (all strings), plus 'songs' (two
        dicts with keys 'title', 'm', 's', 'p') and 'singers' (four
        ``{'part': ..., 'name': ...}`` dicts in tenor, lead, bari, bass
        order). Empty if nothing in the text matches.
    """
    keys = ('tot_score', 'rank_m', 'rank_s', 'rank_p', 'rank', 'name', 'pc_score')
    r = TOT_SCORE + SONGS + CAT_SCORES + CAT_RANKS + RANK + NAME + SINGERS + PC_SCORE
    return _parse_contestants(text, r, keys, (1, 2), with_singers=True)

# Get details of quartet finals contestants
def get_finals_quartets(text):
    """Get details of quartet finals contestants.

    Same shape as get_quartets, with an extra 'final_tot_score' field and a
    third 'songs' entry built from the "Previous (balanced)" scores (its
    'title' is the literal word "Previous").

    Args:
        text: Full text of a quartet final scoresheet.

    Returns:
        List with one dict per matched quartet; empty if nothing matches.
    """
    keys = ('tot_score', 'final_tot_score', 'rank_m', 'rank_s', 'rank_p', 'rank', 'name', 'pc_score')
    r = TOT_SCORE + FINAL_TOT_SCORE + SONGS + CAT_SCORES + PREV_SCORES + CAT_RANKS + RANK + NAME + SINGERS + PC_SCORE
    return _parse_contestants(text, r, keys, (1, 2, 'prev'), with_singers=True)

# Parse texts, return one dict for each scoresheet
def get_contests(texts):
    """Yield one parsed contest dict per scoresheet.

    texts maps a filename to the scoresheet's text. Each yielded dict holds
    the contest details, then 'judges', 'type', 'contestants' and 'filename'.
    """
    for file, text in texts.items():
        contest = get_contest_details(text)
        contest['judges'] = get_judges(text)
        contest['type'] = get_contest_type(text)
        # Choose the contestant parser for this contest type; plain quartet
        # contests are the fallback
        parsers = {'chorus': get_choruses, 'quartet final': get_finals_quartets}
        parse_contestants = parsers.get(contest['type'], get_quartets)
        contest['contestants'] = parse_contestants(text)
        contest['filename'] = file
        yield contest

In [398]:
# Read all the PDFs into a dict of strings
# One folder of scoresheet PDFs per association
directories = ['BABS/', 'LABBS/']
# Maps each PDF path to the text extracted from it
texts = dict()
for file in getfiles(directories, 'pdf'):
    texts.update({file: pdftotext(file)})

In [399]:
# Parse strings to list of contests
# Drain the generator so the parsed contests can be iterated more than once
contests = [*get_contests(texts)]

In [401]:
# export to excel
import pandas as pd

# pandas 1.0 exposes json_normalize at the top level and deprecates the
# pandas.io.json import path (which may be unavailable in newer releases);
# fall back to the old location only on older pandas.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize

# Contest-level fields repeated on every exported row
CONTEST_META = ['assoc', 'contest', 'year', 'location', 'type', 'date', 'filename']

# Sheet 1: one row per contestant
dfs = [json_normalize(contest, 'contestants', CONTEST_META) for contest in contests]
pd.concat(dfs).to_excel('by contestants.xlsx')

# Sheet 2: one row per song, with selected contestant fields alongside.
# NOTE(review): the nested meta paths below appear to be resolved against each
# contestant record - confirm against the json_normalize documentation.
SONG_META = CONTEST_META + [
    ['contestant', 'name'],
    ['contestant', 'tot_score'],
    ['contestant', 'pc_score'],
    ['contestant', 'rank'],
]
dfs = [json_normalize(contest, ['contestants', 'songs'], SONG_META) for contest in contests]
pd.concat(dfs).to_excel('by songs.xlsx')

In [404]:
# export to JSON files
for contest in contests:
    # Swap only the extension: str.replace('pdf', 'json') would also rewrite
    # any other 'pdf' in the path (e.g. a 'pdfs/' folder).
    json_path = os.path.splitext(contest['filename'])[0] + '.json'
    # Each contest is written next to its source PDF
    with open(json_path, 'w') as outfile:
        json.dump(contest, outfile, indent=2)

In [None]:
# export to mongodb

#import pymongo
#MONGODB_PW = ''
#client = pymongo.MongoClient("mongodb://admin:%s@scores-shard-00-00-h0xbs.mongodb.net:27017,scores-shard-00-01-h0xbs.mongodb.net:27017,scores-shard-00-02-h0xbs.mongodb.net:27017/<DATABASE>?ssl=true&replicaSet=scores-shard-0&authSource=admin" % MONGODB_PW)
#db = client.barbershop

#for contest in contests:
#    with open(file.replace('pdf', 'json'), 'w') as outfile:
#        db.scores.insert_one(contest)
        
        