In [1]:
import os
import re
import pandas as pd
import docx
import warnings

def italic_markup(r):
    """Wrap the non-whitespace core of *r* in ``<i>...</i>`` tags.

    Leading and trailing whitespace stay outside the tags. A string that is
    empty or consists only of whitespace is returned unchanged (no tags).
    """
    core = r.strip()
    if not core:
        # Nothing visible to italicise.
        return r
    # Number of whitespace characters in front of the core text.
    lead_len = len(r) - len(r.lstrip())
    tail_start = lead_len + len(core)
    return "".join((r[:lead_len], "<i>", core, "</i>", r[tail_start:]))

def getText(filename,markup=False):
    """Return the text of a .docx file, one line per paragraph.

    filename -- path handed to ``docx.Document``.
    markup   -- when true, the text of every run whose ``italic`` attribute
                is truthy is rewritten through ``italic_markup`` before the
                paragraph text is read, so italics appear as ``<i>...</i>``.

    The paragraphs are joined with newline characters. The rewrite only
    touches the in-memory document object; nothing is saved here.
    """
    document = docx.Document(filename)
    paragraphs = []
    for paragraph in document.paragraphs:
        if markup:
            italic_runs = filter(lambda candidate: candidate.italic, paragraph.runs)
            for italic_run in italic_runs:
                italic_run.text = italic_markup(italic_run.text)
        paragraphs.append(paragraph.text)
    return '\n'.join(paragraphs)

import datetime

class musiclist:
    """Parse a service booklet (.docx) into a table of services.

    The document text is scanned for a term heading ("Lent Term 2019"), for
    date lines ("SUNDAY 20 JANUARY") and, inside each day, for service lines
    that start with a time ("6.00 pm<TAB>Choral Evensong"). The tab-separated
    "Key<TAB>value" lines following a service line become its columns.

    Attributes set on the instance:
        term, year   -- taken from the last matching term heading.
        services     -- list of dicts, one per service found by ``parse``.
        musiclist_df -- DataFrame of ``services`` with columns ``keys``,
                        indexed by ``index`` (built in ``__init__``).
    """

    # Class-level defaults. ``parse`` rebinds ``services`` on the instance, so
    # this shared list object is never mutated.
    term = None
    year = None
    services = []

    keys = ['serviceid',
     'servicetype',
     'servicedate',
     'liturgicalday',
     'hymns',
     'introit',
     'responses',
     'psalm',
     'canticles',
     'anthem',
     'requiem',
     'setting',
     'motet',
     'cantata',
     'preacher',
     'sermon topic',
     'voluntary',
     'organist',
     'extranotes']

    # Term heading; the groups capture the term name and the 2-4 digit year.
    _TERM_PATTERN = re.compile(r'^\s?(Michaelmas|Lent|Easter) Term ([0-9]{2,4})\s?$')
    # Service line: clock time, am/pm, whitespace, then the service type.
    _SERVICE_PATTERN = re.compile(r'^\s?([0-9]{1,2}\.[0-9]{2}) ?(am|pm)\s+(.+)$')

    def __init__(self,inputfile,keys=None,index=None,firstid=None,verbose=True):
        """Read *inputfile*, parse it and build ``musiclist_df``.

        inputfile -- path of the .docx booklet (passed to ``getText``).
        keys      -- optional replacement for the class-level column list.
        index     -- column used as DataFrame index; defaults to ``keys[0]``.
        firstid   -- if given, the index is replaced by consecutive integers
                     starting at this value.
        verbose   -- forwarded to ``parse``; ``True`` keeps the progress
                     printing.
        """
        self.inputfile = inputfile

        if keys is not None:
            self.keys = keys

        self.index = index if index is not None else self.keys[0]

        # The document is read twice: once plain and once with <i> markup.
        self.rawtext = getText(inputfile,markup=False).split('\n')
        self.rawtext_markup = getText(inputfile,markup=True).split('\n')
        self.parse(self.rawtext,self.rawtext_markup,verbose=verbose)

        self.musiclist_df = pd.DataFrame(self.services,columns=self.keys).set_index(self.index)
        if firstid is not None:
            self.musiclist_df.index = list(range(firstid, firstid+len(self.musiclist_df)))
            self.musiclist_df.index.name = self.index

    def parse(self,rawtext,rawtext_markup,verbose=True):
        """Fill ``term``, ``year`` and ``services`` from the document lines.

        rawtext        -- list of plain-text lines.
        rawtext_markup -- the same lines with italics marked as ``<i>..</i>``;
                          must be index-aligned with *rawtext*.
        verbose        -- when true, print the progress output (dates found,
                          raw text of each day, each service dict).

        ``services`` is rebound to a fresh list on every call, so instances do
        not share results and re-parsing does not duplicate entries.

        Raises ValueError if a date line is found but no term heading supplied
        a year, or if a service time cannot be parsed as a 12-hour clock time.
        """
        say = print if verbose else (lambda *args, **kwargs: None)

        self.services = []
        self._find_term_and_year(rawtext)

        days = self._find_service_days(rawtext, say)
        day_starts = [start for start, _, _ in days]
        # Each day runs up to the next date line; the last one to the end.
        day_stops = day_starts[1:] + [None]

        for (start, date, liturgicalday), stop in zip(days, day_stops):
            dayraw = rawtext[start:stop]
            dayraw_markup = rawtext_markup[start:stop]

            say('\n'.join(dayraw))

            # There may be more than one service on a given day.
            times = self._find_service_times(dayraw, date, say)
            time_starts = [first for first, _, _ in times]
            time_stops = time_starts[1:] + [None]

            for (first, t, servicetype), last in zip(times, time_stops):
                # Skip the time line itself; the rest are "Key<TAB>value" lines.
                musiclistraw = dayraw[first + 1:last]
                musiclistraw_markup = dayraw_markup[first + 1:last]

                musicdict = self.parseservicemusic(musiclistraw,musiclistraw_markup)

                service_dict = {
                    'servicedate': t.strftime("%Y-%m-%d %H:%M:%S"),
                    'servicetype': servicetype,
                    'liturgicalday': liturgicalday,
                }
                service_dict.update(musicdict)

                self.services.append(service_dict)

                say("..........................")
                say(t)
                say(service_dict)

    def _find_term_and_year(self, rawtext):
        """Set ``term`` and ``year`` from the last term heading in *rawtext*."""
        for line in rawtext:
            match = self._TERM_PATTERN.search(line)
            if match is None:
                continue
            self.term = match.group(1)
            year = int(match.group(2))
            if year < 100:
                # The pattern accepts two-digit years ("Lent Term 19").
                year += 2000
            self.year = year

    def _find_service_days(self, rawtext, say):
        """Return ``(line index, date, liturgical day)`` for every date line."""
        days = []
        for idx, line in enumerate(rawtext):
            text = line.title().strip()
            try:
                # Parse against a leap year so "29 February" is recognised;
                # the real year is substituted below.
                dt = datetime.datetime.strptime(text + " 2000", "%A %d %B %Y")
            except ValueError:
                continue  # not a date line
            if self.year is None:
                raise ValueError(
                    "Found the service date '{:}' but no term heading giving the year.".format(text))
            try:
                dt = dt.replace(year=self.year)
            except ValueError:
                # e.g. 29 February in a year that is not a leap year
                warnings.warn("The date '{:}' does not exist in {:}.".format(text, self.year))
                continue
            # The line after the date is taken as the liturgical day, if any.
            liturgicalday = rawtext[idx + 1].title() if idx + 1 < len(rawtext) else ""
            days.append((idx, dt, liturgicalday))
            say(dt, idx)
        return days

    def _find_service_times(self, dayraw, date, say):
        """Return ``(line index, datetime, service type)`` for each service line of a day."""
        found = []
        for idx, line in enumerate(dayraw):
            match = self._SERVICE_PATTERN.search(line)
            if match is None:
                continue
            clock, meridiem, rest = match.groups()
            fields = [field for field in (part.strip() for part in rest.split("\t")) if field]

            say([line[:match.start(3)].strip()] + fields)

            # Build the time from the captured groups so that "9.00 am" and
            # "9.00am" are handled alike.
            t = datetime.datetime.strptime(clock + " " + meridiem, '%I.%M %p')
            t = t.replace(year=date.year, month=date.month, day=date.day)
            # Only the first field after the time is the service type.
            found.append((idx, t, fields[0] if fields else ""))
        return found

    def parseservicemusic(self,musiclistraw,musiclistraw_markup=None):
        """Turn the "Key<TAB>value" lines of one service into a dict.

        musiclistraw        -- plain-text lines of the service.
        musiclistraw_markup -- the same lines with ``<i>`` markup; defaults to
                               *musiclistraw*.

        Only lines with exactly two non-empty tab-separated fields are used.
        If the lower-cased key is in ``self.keys`` the marked-up value is
        stored under it. A "Sermon" line is split into 'preacher' (the text
        before the first ``<`` of the marked-up value) and 'sermon topic'
        (the remainder of the plain value). Any other key triggers a warning.
        """
        if musiclistraw_markup is None:
            musiclistraw_markup = musiclistraw
        musicdict = {}
        for line,line_markup in zip(musiclistraw,musiclistraw_markup):
            keyval = [field for field in (part.strip() for part in line.split("\t")) if field]
            if len(keyval) != 2:
                continue
            # Strip the markup fields too, so whitespace-only fields are dropped
            # from both lists and the value index stays aligned.
            keyval_markup = [field for field in
                             (part.strip() for part in line_markup.split("\t")) if field]
            value_markup = keyval_markup[1] if len(keyval_markup) > 1 else keyval[1]

            key = keyval[0].lower()
            if key in self.keys:
                musicdict[key] = value_markup
            elif key == 'sermon':
                s = keyval[1]
                # The preacher is the non-italic text in front of the topic.
                preacher = value_markup.split('<')[0].strip()
                sermontopic = s[len(preacher):].strip()
                musicdict['preacher'] = preacher
                musicdict['sermon topic'] = sermontopic
            else:
                warnings.warn("The key '{:}' was not recognised.".format(key))

        return musicdict

    def export(self,outfile):
        """Write ``musiclist_df`` to *outfile* as an Excel sheet named 'magdservices'."""
        self.musiclist_df.to_excel(outfile,sheet_name='magdservices')

In [2]:
# Path of the service booklet that the following cells read with getText()
# and musiclist(). Alternative input, kept for reference:
# inputfile = "data/Service booklet E19v2 changes accepted.docx"
inputfile = "data/Service booklet L19 v3.docx"

In [3]:
# Read the marked-up document text once and reuse it for both the line list
# and the printed preview, instead of opening and parsing the file twice.
text_markup = getText(inputfile,markup=True)
rawtext = text_markup.split('\n')
print(text_markup)







Magdalene College






Chapel Music List





Lent Term 2019



<i>Please do not remove</i>








The Chaplain:    
The Precentor and Director of Music:
The Assistant Organist:        
The Junior Organ Scholar:           
The Honorary Assistant Organist:                                                                                                                                                             	 	
	
The Rev’d Sarah Atkins
Mr Graham Walker
Mariëtta van der Tol
 Ivo Macdonald
Mr Jonathan Hellyer Jones
                                                       




Times of Services

	Sunday:	9.00 am Holy Communion
		6.00 pm Choral Evensong
	Monday:	8.30 am Morning Prayer
		
	Tuesday:	8.30 am Morning Prayer
		
	Wednesday:	8.30 am Morning Prayer
		
	Thursday:	8.30 am Morning Prayer
		6.15 pm Choral Evensong 
<i>except 7, 21 February, 7 March</i>
	Friday:	8.30 am Morning Prayer
		

holy communion follows a printed order of service in contemporary language. There are three

In [4]:
# Parse the booklet, echo what was found, then export the table to
# output/<booklet name>.xlsx.
ml = musiclist(inputfile,firstid=None)
print(ml.term)
print(ml.year)
for service in ml.services:
    print(service)
    print()
outdir = 'output'
# Create the target directory first; writing the Excel file fails if it is missing.
os.makedirs(outdir, exist_ok=True)
outfile = os.path.join(outdir, os.path.splitext(os.path.basename(inputfile))[0]+'.xlsx')
print(outfile)
ml.export(outfile)

2019-01-20 00:00:00 78
2019-01-24 00:00:00 96
2019-01-27 00:00:00 109
2019-01-30 00:00:00 127
2019-01-31 00:00:00 132
2019-02-03 00:00:00 145
2019-02-06 00:00:00 166
2019-02-10 00:00:00 174
2019-02-14 00:00:00 193
2019-02-17 00:00:00 207
2019-02-22 00:00:00 227
2019-02-24 00:00:00 241
2019-02-27 00:00:00 261
2019-02-28 00:00:00 266
2019-03-03 00:00:00 275
2019-03-06 00:00:00 298
2019-03-10 00:00:00 316
2019-03-14 00:00:00 334
2019-03-18 00:00:00 349
SUNDAY 20 JANUARY
SECOND SUNDAY AFTER THE EPIPHANY


9.00 am	College Eucharist

Hymns		137, Christ is our light t. AM97, 47	

6.00 pm	Choral Evensong with the blessing of the new Icon of St Mary Magdalene

Responses	Oxley
Psalm		102 vv 1-11
Canticles	Plainsong Magnificat; Wood Nunc Dimittis
Anthem		Britten New Year Carol
Sermon		The Master Blessed are the poor in spirit
Hymns		174 t. 490, 52


['9.00 am', 'College Eucharist']
['6.00 pm', 'Choral Evensong with the blessing of the new Icon of St Mary Magdalene']
..........................
201

In [5]:
# Display the services table that musiclist.__init__ built from the parsed booklet.
ml.musiclist_df

Unnamed: 0_level_0,servicetype,servicedate,liturgicalday,hymns,introit,responses,psalm,canticles,anthem,requiem,setting,motet,cantata,preacher,sermon topic,voluntary,organist,extranotes
serviceid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
,College Eucharist,2019-01-20 09:00:00,Second Sunday After The Epiphany,"137, Christ is our light t. AM97, 47",,,,,,,,,,,,,,
,Choral Evensong with the blessing of the new I...,2019-01-20 18:00:00,Second Sunday After The Epiphany,"174 t. 490, 52",,Oxley,102 vv 1-11,Plainsong <i>Magnifica</i><i>t</i>; Wood <i>Nu...,Britten <i>New Year Carol</i>,,,,,The Master,Blessed are the poor in spirit,,,
,Choral Evensong,2019-01-24 18:15:00,,"48, 57",,Oxley,118 vv 19-29,Sumsion in G,Byrd <i>Kyrie Eleison</i> <i>(4-part Mass)</i>,,,,,,,,,
,College Eucharist,2019-01-27 09:00:00,Third Sunday After The Epiphany,"55, The kingdom of God t. AM 569i, 297",,,,,,,,,,,,,,
,Choral Evensong,2019-01-27 18:00:00,Third Sunday After The Epiphany,"472ii, 82, 331",,Oxley,120,Sumsion in G,Byrd <i>Miserere Mei</i>,,,,,The Chaplain,Blessed are those who mourn,,,
,Choral Compline,2019-01-30 21:30:00,,,,,,,,,,,,,,,,
,Choral Evensong,2019-01-31 18:15:00,,"358ii, 377",,Oxley,145 vv 1-7,Howells <i>Collegium Regale</i>,"Wesley <i>Lead Me, Lord</i>",,,,,,,,,
,College Eucharist,2019-02-03 09:00:00,Candlemas,"408i, Like a candle flame t. AM74, Christ be o...",,,,,,,,,,,,,,
,Choral Evensong,2019-02-03 18:00:00,Candlemas,"33 (vv 1,4,5,7), 44, 437",,Rose,18 vv 1-15,Howells <i>Collegium Regale</i>,"Wood <i>Hail, Gladdening Light</i>",,,,,The Reverend Jon Canessa,Blessed are the meek,,,
,Corporate Communion,2019-02-06 18:15:00,,"427, 295, 415 t. 346",,,,,,,Perosi <i>Missa</i> <i>Secunda</i> <i>Pontific...,"Tallis <i>Verily, verily</i>",,,,,,
