# Code to clean Ramayanam in Python 3

This notebook cleans the source HTML files so that they can be published as a clean, well-formatted EPUB document.

### Use beautiful soup for trial

In [1]:
# Source HTML document to clean, and the file the cleaned result is written to.
ORIG_FILE = "Vaalmeeki-AranyaKaanda-Sargas-1-75-Devanagari-WM.html"
CLEAN_FILE = "Aranyakaanda.html"

# Inline `style` attribute values used to recognise each kind of <div>.
# They are compared with `==` against the attribute text, so every character
# counts -- including the trailing space in MEANING_TXT and the leading space
# in COMMENT_TXT.
SHLOKA_TXT = (
    "color:#BD0000;"
    "font-size:11pt;"
    "line-height:1.5em;"
)
MEANING_TXT = (
    "text-align:center;"
    "margin-top:8px;"
    "color:#000099;"
    "font-size:12px;"
    "line-height:1.5em; "
)
COMMENT_TXT = (
    " text-align:center;"
    "margin-top:8px;"
    "color:#866;"
    "font-size:11px;"
    "line-height:1.2em;"
)

In [2]:
from bs4 import BeautifulSoup

# Read the whole source document as UTF-8. The context manager closes the
# file handle as soon as the text is in memory instead of leaving it open
# for the lifetime of the kernel.
with open(ORIG_FILE, "r", encoding="utf-8") as orig_file:
    html_text = orig_file.read()

# Parse with the lxml backend; the later cells modify this tree in place.
orig_soup = BeautifulSoup(html_text, "lxml")

### Strategy 1

We divide the document into rows. When a row has only one column, it is a chapter heading. Otherwise the first column is the shloka number, the second is the shloka and its meaning, and the last column is the commentary.

In [3]:
def process_rows(row):
    """Tag the cells of one table row with CSS classes, in place.

    A row with exactly one ``td`` is treated as a chapter row: the first
    ``div`` found inside that cell gets the class ``chapter`` (nothing is
    changed if the cell contains no ``div``).

    For any other row the cells are classed by position: the first gets
    ``shloka-number``, the second ``shloka-meaning``, and every later cell
    ``commentary``.

    Returns None; ``row`` and its descendants are modified directly.
    """
    cells = row.find_all('td')

    # Chapter rows consist of a single cell; only its first div is marked.
    if len(cells) == 1:
        first_div = cells[0].find('div')
        if first_div is not None:
            first_div['class'] = "chapter"
        return

    # Classes for the leading columns; anything past them is commentary.
    leading_classes = ("shloka-number", "shloka-meaning")
    for position, cell in enumerate(cells):
        if position < len(leading_classes):
            cell['class'] = leading_classes[position]
        else:
            cell['class'] = "commentary"

# Collect every <tr> in the parsed document and classify its cells.
my_rows = orig_soup.find_all('tr')

for table_row in my_rows:
    process_rows(table_row)

### Strategy 2

Our strategy is to first convert all the table-like elements (_tbody_, _td_ and _tr_) to _div_, and then use each element's inline style to work out which of our CSS classes it belongs to.

In [4]:
# Look up the first <table> element in the parsed document.
table = orig_soup.find("table")

In [5]:
# Display the current tag name of `table` as the cell output.
table.name

'table'

In [6]:
# Rename every <table> to <div>, not just the first one: the following cells
# convert every <tr> and <td> in the document, so any table left behind would
# end up holding <div> children. Iterating also avoids an AttributeError when
# the document contains no <table> at all.
for table_tag in orig_soup.find_all("table"):
    table_tag.name = 'div'

In [7]:
# Collect every <tr> element in the document.
rows = orig_soup.find_all("tr")

In [8]:
# Turn each collected row element into a plain <div>.
for row_tag in rows:
    row_tag.name = 'div'

In [9]:
# Collect every <td> element in the document.
cols = orig_soup.find_all("td")

In [10]:
# Turn each collected cell element into a plain <div>.
for cell_tag in cols:
    cell_tag.name = "div"

In [11]:
# Look up the first <tbody> element; `find` returns None when there is none.
tbody = orig_soup.find("tbody")

In [12]:
# Rename every <tbody> to <div>. Looping over find_all handles documents with
# several table bodies, and is a no-op (rather than an AttributeError on
# None) when the parsed tree has no <tbody>.
for tbody_tag in orig_soup.find_all("tbody"):
    tbody_tag.name = "div"

In [13]:
# Lookup from an exact inline-style string to the CSS class that stands for it.
class_for_style = {
    SHLOKA_TXT: "shloka",              # Shloka
    MEANING_TXT: "english-meaning",    # English meaning
    COMMENT_TXT: "english-footnote",   # Footnote
}

divs = orig_soup.find_all('div')

for div_tag in divs:
    # Divs without a style attribute, or with an unrecognised style, are
    # left untouched.
    css_class = class_for_style.get(div_tag.get('style'))
    if css_class is not None:
        div_tag['class'] = css_class

### Deleting all the unnecessary attributes

In [14]:
def remove_attrs(soup, whitelist=tuple()):
    """Strip every attribute not named in ``whitelist`` from all tags in ``soup``.

    Parameters:
        soup: parsed document (or tag) exposing ``find_all``; its tags are
            modified in place.
        whitelist: iterable of attribute names to keep. A single name given
            as a plain string (e.g. ``'class'`` or the accidental
            ``('class')``) is treated as a one-element whitelist; without
            this, ``attr not in whitelist`` would be a substring test and
            attributes such as ``a`` or ``s`` would be kept by mistake.

    Returns:
        The same ``soup`` object that was passed in.
    """
    if isinstance(whitelist, str):
        whitelist = (whitelist,)
    # Materialise once so one-shot iterables work for every tag.
    keep = frozenset(whitelist)

    for tag in soup.find_all(True):
        # Snapshot the names to drop first: deleting while iterating over
        # tag.attrs would change the dict during iteration.
        for attr in [name for name in tag.attrs if name not in keep]:
            del tag[attr]
    return soup


In [15]:
# Keep only the `class` attribute on every tag. The whitelist has to be a
# real one-element tuple: ('class') is just the string 'class', which would
# turn the membership test into a substring check.
clean_soup = remove_attrs(orig_soup, whitelist=('class',))

# Link the external stylesheet. This is done after the attribute stripping so
# that the new tag's rel/href attributes are not removed.
head = clean_soup.head
new_tag = clean_soup.new_tag("link", rel="stylesheet", href="ramayanam.css")
head.insert(0, new_tag)

Trying to clean up the shloka part with the _br_ tag when not needed

In [16]:
# Write the cleaned document as UTF-8. The `with` block flushes and closes
# the file even if the write raises, so the output is complete on disk.
with open(CLEAN_FILE, "w", encoding="utf-8") as f:
    chars_written = f.write(str(clean_soup))

# Number of characters written, shown as the cell output.
chars_written

1173305