In [432]:
import re, string, csv, unicodedata, os, json, subprocess, pprint, pickle, decimal
from extract_rtf import striprtf

# Run all Decimal arithmetic in this notebook under decimal's predefined BasicContext
decimal.setcontext(decimal.BasicContext)

# Shorthand for pretty-printing nested structures with a 4-space indent
pp = pprint.PrettyPrinter(indent=4).pprint

# Labels assigned, in this order, to the four names of a quartet
PARTS = ['tenor', 'lead', 'bari', 'bass']

# Judging category name as written in the results files -> one-letter code
CATS = dict(
    Music='m',
    Performance='p',
    Singing='s',
    CA='a',
)

In [433]:
class DecimalEncoder(json.JSONEncoder):
    """JSON encoder that writes decimal.Decimal values as floats."""

    def default(self, o):
        """Convert Decimal objects to float; hand anything else to the base encoder."""
        if not isinstance(o, decimal.Decimal):
            # Not ours to handle: defer to JSONEncoder.default
            return super(DecimalEncoder, self).default(o)
        return float(o)

In [434]:
# Iterate through files in a list of directories
def getfiles(directories, ext):
    """
    Yield the path of every directory entry whose name ends with the extension `ext`.

    directories: iterable of directory paths to scan (not recursive).
    ext: file extension, with or without the leading dot ('rtf' or '.rtf').
         An empty string matches every entry.

    Paths are yielded as os.path.join(directory, filename), directory by directory,
    with the names inside each directory in sorted order.
    """
    # Compare against ".ext" rather than "ext", so that 'rtf' does not also
    # match names such as "notrtf" or "x.nortf"
    suffix = '.' + ext.lstrip('.') if ext else ''
    for directory in directories:
        # os.listdir returns names in arbitrary order; sort for reproducible runs
        for filename in sorted(os.listdir(directory)):
            if filename.endswith(suffix):
                yield os.path.join(directory, filename)

# test
#list(getfiles(['BABS RTF'],'rtf'))

In [435]:
# Change a file extension
def change_ext(filename, ext):
    """Return `filename` with its extension replaced by `ext` (given without the dot)."""
    stem, _old_ext = os.path.splitext(filename)
    return '{}.{}'.format(stem, ext)
    
# test
#print(change_ext('BABS RTF\\Results-1996-97-QF-NAT-Detail.anyarbitraryextension', 'rtf'))

In [436]:
# Convert all files in a directory from rtf to plain text
def convert_all_rft2txt(directory):
    """
    Write a .txt version next to every rtf file found in `directory`.

    Each file is read as bytes, passed through striprtf, and the result is written
    to the same path with the extension changed to 'txt'. A failure on one file is
    reported (file path, then the exception) and the remaining files are still processed.
    """
    for rtf_path in getfiles([directory], 'rtf'):
        txt_path = change_ext(rtf_path, 'txt')
        try:
            with open(rtf_path, 'rb') as src:
                plain_text = striprtf(src.read())
            with open(txt_path, 'w') as dst:
                dst.write(plain_text)
        except Exception as err:
            # Best effort: report the problem file and carry on with the next one
            print(rtf_path)
            print(err)

# test        
#convert_all_rft2txt('BABS RTF')        

In [450]:
def parse_contestant(text, n_judges):
    """
    Parse the block of text for one contestant and return a contestant dict.

    Each line of the block is tab-separated. The text before the first tab holds
    (part of) the rank / name / members, and the text after it holds one song entry:
    <TAB> is a tab character in the layout below.
    1: Cambridge Blues<TAB>Dear Old Girl<TAB>235<TAB>211<TAB>218
    <TAB>I Got Rhythm<TAB>211<TAB>206<TAB>210<TAB>1291<TAB>71.7
    <TAB>Previous (balanced):<TAB>440.0<TAB>440.0<TAB>411.0<TAB>2582.0

    n_judges is the number of judges used as the divisor for the percentage scores.
    """
    # Rank, name and members are usually spread over several lines: collect the
    # leading non-tab text of each line and glue the pieces together
    label = "".join(re.findall(r'^[^\t]+', text, re.MULTILINE))
    contestant = parse_rank_name_members(label)

    # Everything after the first tab on a line is one song entry
    song_texts = re.findall(r'\t(.+)$', text, re.MULTILINE)
    contestant['songs'] = [parse_song(song_text, n_judges) for song_text in song_texts]
    songs = contestant['songs']

    # Number of songs (each song entry carries its own count in 'n')
    contestant['n'] = sum(song['n'] for song in songs)

    # Category totals first, then the category percentages derived from them
    categories = ('m', 'p', 's')
    for cat in categories:
        contestant[cat] = sum(song[cat] for song in songs)
    for cat in categories:
        cat_total = decimal.Decimal(contestant[cat])
        contestant[cat + '_pc'] = round(cat_total * 3 / contestant['n'] / n_judges, 1)

    # Overall total and overall percentage
    contestant['tot_score'] = sum(song['tot_score'] for song in songs)
    overall_total = decimal.Decimal(contestant['tot_score'])
    contestant['pc_score'] = round(overall_total / contestant['n'] / n_judges, 1)

    return contestant

In [438]:
def parse_num(text):
    """
    Convert a score string to a number.

    Surrounding whitespace is ignored. Whole numbers ("235") are returned as int;
    anything else ("440.0") is returned as decimal.Decimal.
    """
    cleaned = text.strip()
    try:
        return int(cleaned)
    except ValueError:
        # Not a whole number. Only the `decimal` module is imported in this
        # notebook, so the class has to be referenced as decimal.Decimal.
        return decimal.Decimal(cleaned)

In [445]:
def parse_song(text, n_judges):
    """
    Extract song title and scores from a tab-separated string in one of these formats
    (<TAB> is a tab character):
    Dear Old Girl<TAB>235<TAB>211<TAB>218
    I Got Rhythm<TAB>211<TAB>206<TAB>210<TAB>1291<TAB>71.7
    Previous (balanced):<TAB>440.0<TAB>440.0<TAB>411.0<TAB>2582.0
    My Wife The Dancer<TAB>175<TAB>167<TAB>(- 13)<TAB>164<TAB>1015<TAB>56.4

    n_judges is the number of judges used as the divisor for the percentage scores.

    Returns a dict with keys:
      n          number of songs this entry stands for (1, or 2/0 for a "Previous" line)
      title      song title, or None when the line starts with a score
      m, p, s    music / performance / singing scores
      mr, pr     negative penalty following the music / performance score (only if present)
      tot_score  m + p + s
      m_pc, p_pc, s_pc, pc_score
                 percentage scores rounded to 1 decimal place, or None when n is 0
    Any trailing fields after the singing score are ignored.
    """
    penalty = re.compile(r'\(- (\d+)\)')
    number = re.compile(r'\d+(?:\.\d+)?$')
    song = {'n': 1}

    # Split to a list of strings, then pop items off the list as it is parsed
    split = text.split('\t')

    # If the first item in the list is a number (whole or decimal), there is no title
    song['title'] = None if number.match(split[0].strip()) else split.pop(0)

    # Music and performance scores may each be followed by a penalty such as "(- 13)"
    for cat in ('m', 'p'):
        song[cat] = parse_num(split.pop(0))
        found = penalty.match(split[0])
        if found:
            song[cat + 'r'] = -parse_num(found.group(1))
            split.pop(0)

    # Get singing score
    song['s'] = parse_num(split.pop(0))

    # Calculate total score
    song['tot_score'] = sum(song[cat] for cat in ('m', 'p', 's'))

    if song['title']:
        # Workaround to make sure that "Previous" scores are counted as two songs
        # when calculating percentages (and as none when there is no previous score)
        if song['title'] == "Previous (balanced):":
            song['n'] = 2 if song['tot_score'] > 0 else 0
    else:
        print("warning - no song title")

    if song['n']:
        # Calculate category pc scores
        for cat in ('m', 'p', 's'):
            song[cat + '_pc'] = round(decimal.Decimal(song[cat]) * 3 / song['n'] / n_judges, 1)
        # Calculate pc score
        song['pc_score'] = round(decimal.Decimal(song['tot_score']) / song['n'] / n_judges, 1)
    else:
        # An entry that stands for zero songs has no meaningful percentage;
        # dividing by n here would raise instead of producing a value
        for cat in ('m', 'p', 's'):
            song[cat + '_pc'] = None
        song['pc_score'] = None

    return song

In [440]:
def parse_rank_name_members(text):
    """
    Build a contestant dict from a string in one of these formats:
    1: Cambridge Blues
    1: Hallmark Of Harmony (81)
    1: The Great Western Chorus Of Bristol  (Linda Corcoran) (52)
    1: RECKLESS  (Andy Foster, Duncan Whinyates, Dale Kynaston, Andy Funnell)

    The result always has 'rank' (int) and 'name'. A bracketed integer becomes 'size';
    a bracketed list of names becomes 'members' (a list of {'part', 'name'} dicts).
    """
    # Rank and name have the same layout for every type of contest.
    # The name is taken to run up to the first opening bracket.
    head = re.match(r'(\d+): ([^\(]+)', text)
    contestant = {'rank': int(head.group(1)), 'name': head.group(2).strip()}

    # Look at each bracketed piece of text in turn
    for bracketed in re.findall(r'\((.+?)\)', text):
        bracketed = bracketed.strip()
        try:
            # An integer in brackets is the chorus size
            contestant['size'] = int(bracketed)
            continue
        except ValueError:
            pass

        # Otherwise it is a list of people, separated by "and", "&", ",", ";" or "/"
        names = re.split(r'\s*(?: and |&|,|;|/)\s*', bracketed)
        if len(names) == 4:
            # Exactly four names: quartet members, labelled in PARTS order
            contestant['members'] = [{'part': part, 'name': name} for part, name in zip(PARTS, names)]
        else:
            # Any other number of names: chorus director(s)
            contestant['members'] = [{'part': 'director', 'name': name} for name in names]

    return contestant

In [451]:
#Contestant	Songs	Mus	MR	Perf	PR	Sing	Ch.Pen	Totals	%
#1: Hallmark Of Harmony (81)	Let The End Of The World Come Tomorrow	233	231	235
#	I'm Looking Over A Four Leaf Clover	235	246	236	1416	78.7
#	Category rankings:	1	1	1

#Contestant	Songs	Mus	MR	Perf	PR	Sing	Ch.Pen	Totals	%
#1: Cambridge Blues	Dear Old Girl	235	211	218
#	I Got Rhythm	211	206	210	1291	71.7
#	Previous (balanced):	440.0	440.0	411.0	2582.0
#	Category rankings:	1	1	1

# Parse every converted results file into a contest dict, save each one as
# pickle and json next to its source file, and collect them all in `contests`.
contests = []

# Third line of each file: "<contest>  -  <location>: <year>"
header_re = re.compile(r'(?P<contest>.*)  -  (?P<location>.*): (?P<year>[\d/]*)')
date_re = re.compile(r'Contest date: (\d{2}/\d{2}/\d{4})')
judges_re = re.compile(r'(Music|Performance|Singing|CA): (.+)')
# One contestant block: from its "<rank>:" up to the following "Category" line
contestant_re = re.compile(r'(\d+:.*?)\n\tCategory', re.DOTALL)

for filename in getfiles(['BABS RTF'], 'txt'):
    contest = {
        'assoc': 'BABS',
        'filename': filename
    }
    print('parsing %s' % filename)

    with open(filename, 'r') as f:
        plain_text = f.read()

    # Get the third line of text; a short file or an unexpected header only
    # produces warnings instead of aborting the whole run
    lines = plain_text.splitlines()
    header = header_re.match(lines[2]) if len(lines) > 2 else None
    for key in ('contest', 'location', 'year'):
        contest[key] = (header.group(key) or None) if header else None
        if not contest[key]:
            print('warning - no %s' % key)

    # Chorus or quartet contest?
    # (a membership test, because str.find returns -1 - which is truthy - when
    # the text is absent and 0 - which is falsy - when it is found at the start)
    contest['type'] = 'c' if 'CHORUS' in (contest['contest'] or '').upper() else 'q'

    # Parse the contest date
    m = date_re.search(plain_text)
    contest['date'] = m.group(1) if m else None
    if not contest['date']:
        print('warning - no contest date')

    # Parse the judges
    contest['judges'] = []
    for m in judges_re.finditer(plain_text):
        # Convert 'Performance' to 'p' etc
        cat = CATS[m.group(1)]
        # Split the comma separated list of judges' names and add them to the list
        for name in re.split(', *', m.group(2)):
            contest['judges'].append({'cat': cat, 'name': name.strip()})

    # Count the scoring judges: the three scoring categories must have equal panels
    judges_per_cat = {cat: sum(1 for j in contest['judges'] if j['cat'] == cat) for cat in ('m', 'p', 's')}
    assert judges_per_cat['m'] == judges_per_cat['p'] == judges_per_cat['s'], \
        'unequal judge counts in %s: %r' % (filename, judges_per_cat)
    n_judges = sum(judges_per_cat.values())

    # Parse the contestants
    contest['contestants'] = [parse_contestant(m.group(1), n_judges) for m in contestant_re.finditer(plain_text)]

    # Save as pickle
    with open(change_ext(filename, 'pickle'), 'wb') as outfile:
        pickle.dump(contest, outfile)

    # Save as json (Decimal scores are written as floats)
    with open(change_ext(filename, 'json'), 'w') as outfile:
        json.dump(contest, outfile, indent=2, cls=DecimalEncoder)

    # Append to the list of contests
    contests.append(contest)
    

parsing BABS RTF\Results-1996-97-CF-NAT-Detail.txt
parsing BABS RTF\Results-1996-97-CP-MID-Detail.txt
parsing BABS RTF\Results-1996-97-CP-NTH-Detail.txt
parsing BABS RTF\Results-1996-97-CP-STH-Detail.txt
parsing BABS RTF\Results-1996-97-QF-NAT-Detail.txt
parsing BABS RTF\Results-1996-97-QP-MID-Detail.txt
parsing BABS RTF\Results-1996-97-QP-NTH-Detail.txt
parsing BABS RTF\Results-1996-97-QP-STH-Detail.txt
parsing BABS RTF\Results-1996-97-QS-NAT-Detail.txt
parsing BABS RTF\Results-1997-98-CF-NAT-Detail.txt
parsing BABS RTF\Results-1997-98-CP-MID-Detail.txt
parsing BABS RTF\Results-1997-98-CP-NTH-Detail.txt
parsing BABS RTF\Results-1997-98-CP-STH-Detail.txt
parsing BABS RTF\Results-1997-98-QF-NAT-Detail.txt
parsing BABS RTF\Results-1997-98-QP-MID-Detail.txt
parsing BABS RTF\Results-1997-98-QP-NTH-Detail.txt
parsing BABS RTF\Results-1997-98-QP-STH-Detail.txt
parsing BABS RTF\Results-1997-98-QPM-SEN-Detail.txt
parsing BABS RTF\Results-1997-98-QPN-SEN-Detail.txt
parsing BABS RTF\Results-1997

In [418]:
# export to excel
import pandas as pd

# json_normalize is a top-level pandas function since pandas 1.0, and importing it
# from pandas.io.json is deprecated; fall back to the old location on older versions
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize

# Contest-level fields repeated on every exported row
CONTEST_META = ['assoc', 'contest', 'year', 'location', 'type', 'date', 'filename']

# One row per contestant
dfs = [json_normalize(contest, 'contestants', CONTEST_META) for contest in contests]
pd.concat(dfs).to_excel('babs by contestants.xlsx')

# One row per song, carrying a few fields of the owning contestant as well
SONG_META = CONTEST_META + [
    ['contestant', 'name'],
    ['contestant', 'tot_score'],
    ['contestant', 'pc_score'],
    ['contestant', 'rank'],
]
dfs = [json_normalize(contest, ['contestants', 'songs'], SONG_META) for contest in contests]
pd.concat(dfs).to_excel('babs by songs.xlsx')