Skip to content


Added new USFM file compare
Browse files Browse the repository at this point in the history
  • Loading branch information
RobH123 committed May 9, 2016
1 parent bd315e5 commit f3b64d5
Show file tree
Hide file tree
Showing 3 changed files with 259 additions and 5 deletions.
246 changes: 246 additions & 0 deletions
@@ -0,0 +1,246 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Module comparing USFM Bible book files
# Copyright (C) 2016 Robert Hunt
# Author: Robert Hunt <>
# License: See gpl-3.0.txt
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
Module comparing USFM Bible book files

Given two file paths, reads and compares the USFM Bible books
    and returns or prints a dictionary of results.
"""

from gettext import gettext as _

# Program identification strings (used for banners and by the BOS set-up/closedown calls)
LastModifiedDate = '2016-05-09' # by RJH
ShortProgName = "USFMCompare"
ProgName = "USFM file comparator"
ProgVersion = '0.01'
# e.g., "USFMCompare v0.01"
ProgNameVersion = ShortProgName + ' v' + ProgVersion
# e.g., "USFMCompare v0.01 last modified 2016-05-09" (middle phrase is translatable)
ProgNameVersionDate = ' '.join( (ProgNameVersion, _("last modified"), LastModifiedDate) )

# Module-level switch for extra debugging behaviour
debuggingThisModule = True

import os, logging
from collections import OrderedDict
from datetime import datetime

#from singleton import singleton
import BibleOrgSysGlobals
from USFMFile import USFMFile

def _noteComparison( resultDict, fieldName, value1, value2, moreText=None ):
    """
    Store a pair of per-file values and file them under 'Same' or 'Different'.

    The values are always saved in resultDict['File1'] and resultDict['File2'].
    If they are equal, the value is also saved in resultDict['Same'],
        otherwise the (value1,value2) tuple is saved in resultDict['Different'].

    If moreText is given and the values differ, a note like
        "File1 has more verse markers" is saved in resultDict['Summary'].
    """
    resultDict['File1'][fieldName], resultDict['File2'][fieldName] = value1, value2
    if value1 == value2:
        resultDict['Same'][fieldName] = value1
    else:
        resultDict['Different'][fieldName] = (value1,value2)
        if moreText:
            resultDict['Summary'][fieldName] = "{} has more {}".format( 'File1' if value1 > value2 else 'File2', moreText )
# end of _noteComparison


def _countLineTypes( lines ):
    """
    Count the different kinds of lines in a list of (marker,text) pairs.

    Returns a 4-tuple:
        introLineCount: lines before the first c or v marker
        chapterMarkerCount: number of c markers
        verseMarkerCount: number of v markers
        hasContentCount: lines whose text is not empty or just whitespace
    """
    introLineCount = chapterMarkerCount = verseMarkerCount = hasContentCount = 0
    hadCV = False # Goes True once we're past the introduction
    for marker,text in lines:
        if marker == 'c':
            chapterMarkerCount += 1
            hadCV = True
        elif marker == 'v':
            verseMarkerCount += 1
            hadCV = True
        if not hadCV: introLineCount += 1
        if text and text.strip(): hasContentCount += 1
    return introLineCount, chapterMarkerCount, verseMarkerCount, hasContentCount
# end of _countLineTypes


def USFMCompare( filepath1, filepath2 ):
    """
    Compare two USFM Bible book files and return an OrderedDict of results.

    The returned dictionary contains five OrderedDicts:
        'File1', 'File2': values for each individual file
            (path, size, modification date/time, line and marker counts)
        'Same': fields where both files have the same value
            plus SameMarkerCount and SameLineCount from the line-by-line compare
        'Different': fields where the values differ as (file1Value,file2Value) tuples
            plus DifferentMarkerCount and DifferentLineCount
        'Summary': short notes such as "File1 is bigger" or "File2 is newer"

    The line-by-line compare pairs up lines by position and stops at the end
        of the shorter file -- no resyncing is attempted yet.
    The per-file counts cover every line of each file.

    os.stat() will raise an OSError if either filepath can't be accessed.
    """
    if BibleOrgSysGlobals.verbosityLevel > 0: print( "\nUSFMCompare() for USFM Bible books" )
    if BibleOrgSysGlobals.verbosityLevel > 1:
        print( " comparing {}".format( filepath1 ) )
        print( " and {}".format( filepath2 ) )

    # Set up empty results dictionaries
    resultDict = OrderedDict()
    for division in ('File1','File2','Same','Different','Summary'):
        resultDict[division] = OrderedDict()

    # Note paths and folders
    resultDict['File1']['Filepath'], resultDict['File2']['Filepath'] = filepath1, filepath2
    folder1, filename1 = os.path.split( filepath1 )
    folder2, filename2 = os.path.split( filepath2 )
    resultDict['File1']['Folder'], resultDict['File1']['Filename'] = folder1, filename1
    resultDict['File2']['Folder'], resultDict['File2']['Filename'] = folder2, filename2
    if filename2 == filename1:
        resultDict['Same']['Filename'] = filename1
    if folder2 == folder1:
        resultDict['Same']['Folder'] = folder1

    # Note file dates and sizes
    s1, s2 = os.stat( filepath1 ), os.stat( filepath2 )
    _noteComparison( resultDict, 'Filesize', s1.st_size, s2.st_size )
    if s1.st_size > s2.st_size:
        resultDict['Summary']['Filesize'] = "File1 is bigger"
    elif s1.st_size < s2.st_size:
        resultDict['Summary']['Filesize'] = "File2 is bigger"
    resultDict['File1']['ModifiedTimeStamp'], resultDict['File2']['ModifiedTimeStamp'] = s1.st_mtime, s2.st_mtime
    if s1.st_mtime > s2.st_mtime:
        resultDict['Summary']['ModifiedTime'] = "File1 is newer"
    elif s1.st_mtime < s2.st_mtime:
        resultDict['Summary']['ModifiedTime'] = "File2 is newer"
    t1, t2 = datetime.fromtimestamp( s1.st_mtime ), datetime.fromtimestamp( s2.st_mtime )
    _noteComparison( resultDict, 'ModifiedDate', t1.strftime('%Y-%m-%d'), t2.strftime('%Y-%m-%d') )
    _noteComparison( resultDict, 'ModifiedTime', t1.strftime('%H:%M:%S'), t2.strftime('%H:%M:%S') )

    # Read the files (read() puts the list of (marker,text) pairs into .lines)
    uf1, uf2 = USFMFile(), USFMFile()
    uf1.read( filepath1 )
    uf2.read( filepath2 )
    #print( 'f1', uf1.lines )
    #print( 'f2', uf2.lines )

    # Note line counts
    _noteComparison( resultDict, 'LineCount', len(uf1.lines), len(uf2.lines) )

    # Compare the files line by line
    #   zip() stops at the end of the shorter file so we can never index past the end
    # TODO: Try to resync if there's a different number of lines
    resultDict['Same']['SameMarkerCount'] = resultDict['Different']['DifferentMarkerCount'] = 0
    resultDict['Same']['SameLineCount'] = resultDict['Different']['DifferentLineCount'] = 0
    for (m1,l1), (m2,l2) in zip( uf1.lines, uf2.lines ):
        #print( m1, m2 )
        sameMarker = m1 == m2
        sameLine = sameMarker and l1 == l2
        if not sameLine and BibleOrgSysGlobals.debugFlag: print( "Diff", m1, m2, l1, l2 )
        if sameMarker: resultDict['Same']['SameMarkerCount'] += 1
        else: resultDict['Different']['DifferentMarkerCount'] += 1
        if sameLine: resultDict['Same']['SameLineCount'] += 1
        else: resultDict['Different']['DifferentLineCount'] += 1

    # Count intro lines, chapters and verses, etc. over the whole of each file
    counts1, counts2 = _countLineTypes( uf1.lines ), _countLineTypes( uf2.lines )
    countFields = ( ('IntroLineCount',"intro markers"), ('ChapterMarkerCount',"chapter markers"),
                    ('VerseMarkerCount',"verse markers"), ('HasContentCount',"content lines") )
    for (fieldName,moreText), count1, count2 in zip( countFields, counts1, counts2 ):
        # Each field gets its own summary entry (so they don't overwrite each other)
        _noteComparison( resultDict, fieldName, count1, count2, moreText )

    return resultDict
# end of USFMCompare

def main():
    """
    Main program to handle command line parameters and then run what they want.

    Takes the two filepaths from the parsed command line arguments (file1 and file2),
        checks that they both exist, runs USFMCompare on them,
        and then prints the results.

    How much is printed depends on the verbosity level:
        the 'Same' and 'Different' results are skipped below level 3
        and the individual 'File1' and 'File2' results are skipped below level 4
        (unless the debug flag is set).
    """
    if BibleOrgSysGlobals.verbosityLevel > 1: print( ProgNameVersion )
    #if BibleOrgSysGlobals.print( BibleOrgSysGlobals.commandLineArguments )

    fp1, fp2 = BibleOrgSysGlobals.commandLineArguments.file1, BibleOrgSysGlobals.commandLineArguments.file2
    allOkay = True
    # Check both paths (rather than stopping at the first bad one) so the user sees every problem
    if not os.path.exists( fp1 ):
        logging.critical( "Filepath1 {!r} is invalid -- aborting".format( fp1 ) )
        allOkay = False
    if not os.path.exists( fp2 ):
        logging.critical( "Filepath2 {!r} is invalid -- aborting".format( fp2 ) )
        allOkay = False

    if allOkay:
        result = USFMCompare( fp1, fp2 )
        if BibleOrgSysGlobals.verbosityLevel > 0:
            print( "\nResults:" )
            for division,dResults in result.items():
                if division in ('File1','File2') and BibleOrgSysGlobals.verbosityLevel < 4 \
                and not BibleOrgSysGlobals.debugFlag:
                    continue # Too much detail for this verbosity level
                if division in ('Same','Different') and BibleOrgSysGlobals.verbosityLevel < 3 \
                and not BibleOrgSysGlobals.debugFlag:
                    continue # Too much detail for this verbosity level
                print( " {} results are:".format( division ) )
                for field,fResult in dResults.items():
                    print( " {}: {}".format( field,fResult ) )
# end of main

if __name__ == '__main__':
    #multiprocessing.freeze_support() # Multiprocessing support for frozen Windows executables

    import sys
    # Convert stdout so we don't get zillions of UnicodeEncodeErrors
    #   NOTE: An exact match is used because the substring test 'win' in sys.platform
    #           is also True for 'darwin' (i.e., Mac OS X)
    if sys.platform in ('win32','cygwin'):
        from io import TextIOWrapper
        sys.stdout = TextIOWrapper( sys.stdout.detach(), sys.stdout.encoding, 'namereplace' if sys.version_info >= (3,5) else 'backslashreplace' )

    # Configure basic Bible Organisational System (BOS) set-up
    parser = BibleOrgSysGlobals.setup( ProgName, ProgVersion )
    parser.add_argument( '-z', '--sleep', action='store_true', dest='sleep', default=False, help="don't do anything!" )
    parser.add_argument('file1', help="USFM Bible book file 1" ) #, metavar='XXX')
    parser.add_argument('file2', help="USFM Bible book file 2" ) #, metavar='XXX')
    BibleOrgSysGlobals.addStandardOptionsAndProcess( parser )

    # Now that the command line has been processed, do the actual comparison
    main()

    BibleOrgSysGlobals.closedown( ProgName, ProgVersion )
# end of USFMCompare.py
5 changes: 3 additions & 2 deletions
Expand Up @@ -34,7 +34,7 @@

from gettext import gettext as _

LastModifiedDate = '2016-03-07' # by RJH
LastModifiedDate = '2016-05-09' # by RJH
ShortProgName = "USFMFile"
ProgName = "USFM File loader"
ProgVersion = '0.85'
Expand Down Expand Up @@ -127,7 +127,8 @@ def read( self, USFMFilepath, ignoreSFMs=None, encoding=None ):
@param key: The SFM record marker (not including the backslash)
@type encoding: string
@rtype: list
@return: list of lists containing the records
Puts the result into self.lines
#print( " {!r}, {!r}, {!r} )".format( USFMFilepath, ignoreSFMs, encoding ) )

Expand Down
13 changes: 10 additions & 3 deletions
Expand Up @@ -36,7 +36,7 @@

from gettext import gettext as _

LastModifiedDate = '2016-03-11' # by RJH
LastModifiedDate = '2016-05-09' # by RJH
ShortProgName = "USFMMarkers"
ProgName = "USFM Markers handler"
ProgVersion = '0.68'
Expand Down Expand Up @@ -704,7 +704,7 @@ def XXXgetMarkerDictFromText( self, text, includeInitialText=False, verifyMarker

def demo():
Main program to handle command line parameters and then run what they want.
Demonstration program to handle command line parameters and then run what they want.
if BibleOrgSysGlobals.verbosityLevel > 1: print( ProgNameVersion )

Expand Down Expand Up @@ -775,7 +775,14 @@ def demo():
# end of demo

if __name__ == '__main__':
# Configure basic set-up
#multiprocessing.freeze_support() # Multiprocessing support for frozen Windows executables

import sys
if 'win' in sys.platform: # Convert stdout so we don't get zillions of UnicodeEncodeErrors
from io import TextIOWrapper
sys.stdout = TextIOWrapper( sys.stdout.detach(), sys.stdout.encoding, 'namereplace' if sys.version_info >= (3,5) else 'backslashreplace' )

# Configure basic Bible Organisational System (BOS) set-up
parser = BibleOrgSysGlobals.setup( ProgName, ProgVersion )
BibleOrgSysGlobals.addStandardOptionsAndProcess( parser )

Expand Down

0 comments on commit f3b64d5

Please sign in to comment.