Permalink
Browse files

- example-extracttext now outputs unicode

  • Loading branch information...
1 parent e3da8ad commit 75aa42c93a86e497a7c4313bd996f70bae7fc7d4 @mikemaccana committed Jan 4, 2010
Showing with 8 additions and 10 deletions.
  1. +8 −10 example-extracttext.py
View
18 example-extracttext.py
@@ -12,21 +12,19 @@
if __name__ == '__main__':
try:
document = opendocx(sys.argv[1])
+ newfile = open(sys.argv[2],'w')
except:
- print('Please supply a filename. For example:')
- print(''' example-extracttext.py 'My Office 2007 document.docx' ''')
+ print('Please supply an input and output file. For example:')
+ print(''' example-extracttext.py 'My Office 2007 document.docx' 'outputfile.txt' ''')
exit()
## Fetch all the text out of the document we just created
paratextlist = getdocumenttext(document)
- # Note that if using shell redirection (&>, 1>, 2>, etc.), Python tries to
- # change the unicode into ASCII and fails - even with a UTF-8 $LANG
- # As a workaround, create our own ASCII copy of the list.
- asciiparatextlist = []
+ # Make an explicit UTF-8 encoded version
+ newparatextlist = []
for paratext in paratextlist:
- asciiparatextlist.append(paratext.encode("ascii", "backslashreplace"))
+ newparatextlist.append(paratext.encode("utf-8"))
## Print our document's text with two newlines under each paragraph
- print '\n\n'.join(paratextlist)
-
-
+ newfile.write('\n\n'.join(newparatextlist))
+ #print '\n\n'.join(newparatextlist)

0 comments on commit 75aa42c

Please sign in to comment.