# Custom Markdown Parser

Inputs markdown, outputs XML based on a custom schema using regex.

Next steps:
- Code batch processing (i.e. open all files in one directory, output to another).
- Automate transfer of process files to new directory to simplify workflow.

In [196]:
# Notes on Version 2 (From General Hanley):

# Advocates transcending the "line-by-line approach":
# If you give re.sub the flags=re.MULTILINE option, then ^ and $ will match the
# beginning and end of lines, rather than the whole string. That way you
# can do it for the whole file at once. This should in principle be faster too.

In [1]:
import re, os, os.path, shutil

# custom functions:
import parser_functions as pf

### File Paths

Nice [[explanation of using the os library]](https://automatetheboringstuff.com/chapter8/).

In [4]:
# Input Folder
hdir = os.path.expanduser('~')
md_rel_path = "/Box/Notes/Primary_Sources/transcription_markdown_drafting_stage1"
md_path = hdir + md_rel_path

# Destination Folder
xml_rel_path = r"/Box/Notes/Primary_sources/xml_notes_stage2/parser_depository"
xml_path = hdir + xml_rel_path

# Folder to archive old Markdown files
archive_rel_path = "/Box/Notes/Primary_Sources/transcription_markdown_drafting_stage1/archive_docs_now_at_xml_stage_do_not_use"
archive_path = hdir + archive_rel_path

print ("Files currently in input folder ", os.path.dirname(md_path), ":")
os.listdir(md_path)


Files currently in input folder  /Users/kribblesworth/Box/Notes/Primary_Sources :


['archive_docs_now_at_xml_stage_do_not_use',
 'document_conversion_backlog',
 'parser_output.xml',
 'ser560.txt',
 'ser560.xml',
 'ser72.md',
 'ser808.md',
 'ser809.md',
 'ser811.md',
 'ser812.md',
 'ser813.md',
 'ser814.md',
 'ser815.md',
 'ser816.md',
 'ser817.md',
 'ser818.md',
 'ser842.md',
 'ser843.md',
 'ser857.md',
 'ser876.md',
 'ser877.md']

In [4]:
# Minor: note that os.path.dirname lists the name of the parent folder, not the targeted one
print ("Files currently in destination folder ", os.path.dirname(xml_path), ":")

os.listdir(xml_path)

Files currently in destination folder  /Users/kribblesworth/Box/Notes/Primary_sources/xml_notes_stage2 :


[]

## The Parser

In [11]:
# Making sure in correct directory:
os.chdir(md_path)

# Test if it is all functioning properly:
print (pf.parse_md(os.listdir(md_path)[5]))


<?xml-model href="../../../../../Projects/xml_development_eurasia/schemas/persian_documents_schema_basic.rnc" type="application/relax-ng-compact-syntax"?>
    <document serial = "72">
    	<?xml-model href="/Users/Enkidu/Documents/digital_humanities/xml_development/schemas/persian_documents_schema_basic.rnc" type="application/relax-ng-compact-syntax"?>
	    <document serial = "560">
	    	
		<div> 
		
		
		<!-- Model MarkDown document with new schema. -->
		
		\# 
		
		\#\# 
		<!-- top marginalia -->
		
			<lb/>جناب <flag>عالیحضرتمولایم</flag> الله ظله
			<lb/>جناب شریعت و شرافت امارت و وزارتپناهان
			<lb/>دام عافیتکم
		
		\#\# 
		<!-- vertical section -->
		<!-- beginning of honorific section -->
		
			<lb/>روزی و نصیب آنشریعت و شرافت و وزارتپناهان
			<lb/>رفعت و منزلت جایگاهان دولت و حکومت و دستگاهان
			<lb/>بوده باد بعد از اظهار مراهم دعا بوده میدارت که
			<lb/>الله الحمد و المنه بمهربانی و شرفدولت خداداد ؟ مراتب حالات
			<lb/>قرین شکر و رضا بوده همواره سلامتی و تعالی
			<lb/>و عافی

### Run parser on every file in the input directory, copy to output directory

In [5]:
for filename in os.listdir(md_path):
    if filename.endswith(".txt") or filename.endswith(".md"):
        # Make sure in input directory
        os.chdir(md_path)

        # Export filename
        output_file = "ser" + pf.serial_no(filename) + ".xml"
        
        # MD parsed into XML text for output
        output_text = pf.parse_md(filename)
        
        # Send file
        with open(xml_path + "/" + output_file, 'w+') as fout:
            fout.write(output_text)
            
        # Archived filename
        archive_file = "archived_no" + pf.serial_no(filename) + ".txt"
                       
        # Move active file to the archive folder
        shutil.move(md_path + "/" + filename, archive_path + "/" + archive_file)
       