In [199]:
from urllib2 import Request, urlopen
from PyPDF2 import PdfFileWriter, PdfFileReader
from StringIO import StringIO
from tabulate import tabulate
import re, string, csv, unicodedata
import pandas as pd


In [196]:
# Helper Functions

def wikilink(text):
    return text.join('[[',']]')

def get_text_from_pdf(url):
    remoteFile = urlopen(Request(url)).read()
    memoryFile = StringIO(remoteFile)
    pdfFile = PdfFileReader(memoryFile)
    # extract text
    text = " ".join(page.extractText() for page in pdfFile.pages) # extract text from all pages
    text = unicodedata.normalize('NFKD', text).encode('ascii','ignore')  # normalise unicode
    return text


# list of corrections
corrections = {
    r"Pete Nugent": "Nooj",
    r"Peter Nugent": "Nooj",
    r"Li Wen Yip": "Li-Wen Yip",
    r"Simon Lubowski": "Simon Lubkowski",
    r"Alexander Sanctuary": "Alex Sanctuary",
    r"Alexander de Bruin": "Alex de Bruin",
    r"Boo Lewis": "Boo de Bruin",
    r"Alistair Hay\-Plumb": "Alastair Hay-Plumb",
}
corrections = {re.escape(k): v for k, v in corrections.items()}
corrections_re = re.compile("|".join(corrections.keys()))

def fix_text(text):
    text = re.sub(r"\s*\n\s*", " ", text) # strip newlines
    text = re.sub(r"(\w)\-\s(\w)", r"\1-\2", text)  # fix hyphenated names that have broken
    text = corrections_re.sub(lambda m: corrections[re.escape(m.group(0))], text)
    return text

In [153]:
# list of URLS to scrape
urlss = {
    "BinH Mixed": {
        2015:"http://www.labbs.org.uk/convention/2015/MQF2015.pdf",
        2014:"http://www.labbs.org.uk/convention/2014/MQF2014.pdf",
        2013:"http://www.labbs.org.uk/convention/2013/MQF2013.pdf",
        2012:"http://www.labbs.org.uk/convention/2012/MQ2012.pdf"        
    },
    
#    "LABBS": {
#        2015:"http://www.labbs.org.uk/convention/2015/QP-2015.pdf",
#        2014:"http://www.labbs.org.uk/convention/2014/QP-2014.pdf",
#        2013:"http://www.labbs.org.uk/convention/2013/QtPrelims2013.pdf",
#        2012:"http://www.labbs.org.uk/convention/2012/QP2012-Detail.pdf",
#        2011:"http://www.labbs.org.uk/convention/2011/QP-2011-Detail.pdf",
#        2010:"http://www.labbs.org.uk/convention/2010/QSF2010.pdf"        
#    },
}


    
textss = {affiliation: {year: get_text_from_pdf(url) for year, url in urls.items()} for affiliation, urls in urlss.items()}

In [179]:
test="Highest Placed  Quartet(s) 2015\n922\nThe Nearness Of You\nYou Took Advantage Of Me\n152\n154\n153\n153\n154\n156\n1\n1\n1\nCategory rankings:\n1: Hannah and the Hurricanes  \n(Andrea Day, Hannah Braham, \nTim Briggs, Duncan Whinyates)\n76.8"
test2="Second Placed  Quartet(s) 2015\n889\nSteppin' Out With My Baby\nYou Are So Beautiful\n144\n147\n145\n147\n154\n152\n3\n3\n2\nCategory rankings:\n2: MasterMix  (Cherie Morgan, \nSarah Cole, Brian Shaw, \nKenneth Nilsson)\n74.1"
quartet_re = re.compile(
    r"(\d{1,3})\n" +  # total score
    r"([^\n]+)\n" * 2 +  # song titles
    r"(\d{1,3})\n" * 9 +  # category scores + rankings
    r"Category [Rr]ankings:\n" +
    r"(\d+):\s+" +  # "1: "
    r"([^\(]+?)\s+\(" +  # "Quartet Name ("
    r"(.+?)[\.,]\s*" * 3 + r"(.+?)" + # Singers   
    r"\)\s*([\d\.]+)",  # ") score"
    re.DOTALL
)

for m in quartet_re.finditer(test):
    print m.groups()

    
    

('922', 'The Nearness Of You', 'You Took Advantage Of Me', '152', '154', '153', '153', '154', '156', '1', '1', '1', '1', 'Hannah and the Hurricanes', 'Andrea Day', 'Hannah Braham', 'Tim Briggs', 'Duncan Whinyates', '76.8')


In [197]:
quartetss = []



def parse(textss):
    for affiliation, texts in textss.items():
        for year, text in texts.items():
            for m in quartet_re.finditer(text):
                total_score, title1, title2, \
                mus1, mus2, prs1, prs2, sng1, sng2, \
                mus_rank, prs_rank, sng_rank, \
                place, name, tenor, lead, bari, bass, pc_score = (fix_text(t) for t in m.groups())
                
                yield year, place, wikilink(name), wikilink(tenor), \
                wikilink(lead), wikilink(bari), wikilink(bass), pc_score




In [207]:
headers = ['Year','Place','Name','Tenor','Lead','Bari','Bass','Score (%)']
print tabulate(parse(textss), headers, tablefmt="mediawiki") 



{| class="wikitable" style="text-align: left;"
|+ <!-- caption -->
|-
! align="right"|   Year !! align="right"|   Place !! Name                           !! Tenor           !! Lead                 !! Bari            !! Bass             !! align="right"|   Score (%)
|-
| align="right"|   2012 || align="right"|       1 || Double Trouble                 || Jacqui Foetu    || Monica Funnell       || Andy Foster     || Andy Funnell     || align="right"|        75.3
|-
| align="right"|   2012 || align="right"|       2 || K4                             || Delyth Knight   || Dick Knight          || Joe Knight      || Tom Knight       || align="right"|        73.1
|-
| align="right"|   2012 || align="right"|       3 || Rhythmix                       || Cherie Morgan   || Brian Shaw           || Helen Lappert   || Stuart Owen      || align="right"|        72.2
|-
| align="right"|   2012 || align="right"|       4 || Bobcatz                        || Veryan Zimber   || Gemma Netherton-Hind || Sara