Fix get_site_map() and improve parse_tr()
mikeqfu committed Sep 14, 2023
1 parent c6e5ba3 commit c306b10
Showing 1 changed file with 69 additions and 63 deletions.
132 changes: 69 additions & 63 deletions pyrcs/parser.py
@@ -62,50 +62,9 @@ def _move_element_to_end(text_, char='\t\t'):
         # break


-def parse_tr(trs, ths, sep=' / ', as_dataframe=False):
-    """
-    Parse a list of parsed HTML ``<tr>`` elements.
-
-    See also [`PT-1 <https://stackoverflow.com/questions/28763891/>`_].
-
-    :param trs: contents under ``<tr>`` tags of a web page.
-    :type trs: bs4.ResultSet | list
-    :param ths: list of column names (usually under a ``<th>`` tag) of a requested table.
-    :type ths: list | bs4.element.Tag
-    :param sep: separator that replaces the one in the raw data.
-    :type sep: str | None
-    :param as_dataframe: whether to return the parsed data in tabular form
-    :type as_dataframe: bool
-    :return: a list of lists that each comprises a row of the requested table
-    :rtype: pandas.DataFrame | typing.List[list]
-
-    **Example**::
-
-        >>> from pyrcs.parser import parse_tr
-        >>> import requests
-        >>> import bs4
-
-        >>> example_url = 'http://www.railwaycodes.org.uk/elrs/elra.shtm'
-        >>> source = requests.get(example_url)
-        >>> parsed_text = bs4.BeautifulSoup(markup=source.content, features='html.parser')
-
-        >>> ths_dat = [th.text for th in parsed_text.find_all('th')]
-        >>> trs_dat = parsed_text.find_all(name='tr')
-
-        >>> tables_list = parse_tr(trs=trs_dat, ths=ths_dat)  # returns a list of lists
-        >>> type(tables_list)
-        list
-        >>> len(tables_list) // 100
-        1
-        >>> tables_list[0]
-        ['AAL',
-         'Ashendon and Aynho Line',
-         '0.00 - 18.29',
-         'Ashendon Junction',
-         'Now NAJ3']
-    """
-
+def _prep_records(trs, ths, sep=' / '):
     ths_len = len(ths)

     records = []
     row_spanned = []

@@ -136,7 +95,13 @@ def parse_tr(trs, ths, sep=' / ', as_dataframe=False):

         records.append(data)

+    return records, row_spanned
+
+
+def _check_row_spanned(records, row_spanned):
     if row_spanned:
+        records_ = records.copy()
+
         row_spanned_dict = collections.defaultdict(list)
         for i, *to_repeat in row_spanned:
             row_spanned_dict[i].append(to_repeat)
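For orientation: the helper introduced here back-fills cells that use rowspan. _prep_records turns each <tr> into a list and records any spanned cells, and _check_row_spanned repeats each spanned value into the rows it covers, so every record ends up the same length as the header. A minimal standalone sketch of that back-fill idea (illustrative only, not pyrcs code; the markup is hypothetical, shaped like a Railway Codes table):

    import bs4

    html = '''<table>
    <tr><th>ELR</th><th>Line name</th><th>Mileages</th></tr>
    <tr><td rowspan="2">AAL</td><td>Ashendon and Aynho Line</td><td>0.00 - 18.29</td></tr>
    <tr><td>Aynho Junction</td><td>18.29 - 21.11</td></tr>
    </table>'''

    soup = bs4.BeautifulSoup(html, 'html.parser')
    # Parsed naively, the third row is short because 'AAL' spans two rows.
    rows = [[td.get_text() for td in tr.find_all('td')] for tr in soup.find_all('tr')]

    # Back-fill: repeat each spanned value into the rows it covers.
    for i, tr in enumerate(soup.find_all('tr')):
        for idx, td in enumerate(tr.find_all('td')):
            for j in range(1, int(td.get('rowspan', 1))):
                rows[i + j].insert(idx, td.get_text())

    print(rows[2])  # ['AAL', 'Aynho Junction', '18.29 - 21.11']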
@@ -147,28 +112,68 @@ def parse_tr(trs, ths, sep=' / ', as_dataframe=False):
                 k = i + j
                 # if (dat in records[i]) and (dat != '\xa0'):  # and (idx < len(records[i]) - 1):
                 #     idx += np.abs(records[i].index(dat) - idx, dtype='int64')
-                k_len = len(records[k])
-                if k_len < len(records[i]):
+                k_len = len(records_[k])
+                if k_len < len(records_[i]):
                     if k_len == idx:
-                        records[k].insert(idx, dat)
+                        records_[k].insert(idx, dat)
                     elif k_len > idx:
-                        if records[k][idx] != '':
-                            records[k].insert(idx, dat)
+                        if records_[k][idx] != '':
+                            records_[k].insert(idx, dat)
                         else:  # records[k][idx] == '':
-                            records[k][idx] = dat
-
-        # if row_spanned:
-        #     for x in row_spanned:
-        #         for j in range(1, x[2]):
-        #             # Add value in next tr
-        #             idx = x[0] + j
-        #             # assert isinstance(idx, int)
-        #             if x[1] >= len(tbl_lst[idx]):
-        #                 tbl_lst[idx].insert(x[1], x[3])
-        #             elif x[3] in tbl_lst[x[0]]:
-        #                 tbl_lst[idx].insert(tbl_lst[x[0]].index(x[3]), x[3])
-        #             else:
-        #                 tbl_lst[idx].insert(x[1] + 1, x[3])
+                            records_[k][idx] = dat
+
+    else:
+        records_ = records
+
+    return records_
+
+
+def parse_tr(trs, ths, sep=' / ', as_dataframe=False):
+    """
+    Parse a list of parsed HTML ``<tr>`` elements.
+
+    See also [`PT-1 <https://stackoverflow.com/questions/28763891/>`_].
+
+    :param trs: contents under ``<tr>`` tags of a web page.
+    :type trs: bs4.ResultSet | list
+    :param ths: list of column names (usually under a ``<th>`` tag) of a requested table.
+    :type ths: list | bs4.element.Tag
+    :param sep: separator that replaces the one in the raw data.
+    :type sep: str | None
+    :param as_dataframe: whether to return the parsed data in tabular form
+    :type as_dataframe: bool
+    :return: a list of lists that each comprises a row of the requested table
+    :rtype: pandas.DataFrame | typing.List[list]
+
+    **Example**::
+
+        >>> from pyrcs.parser import parse_tr
+        >>> import requests
+        >>> import bs4
+
+        >>> example_url = 'http://www.railwaycodes.org.uk/elrs/elra.shtm'
+        >>> source = requests.get(example_url)
+        >>> parsed_text = bs4.BeautifulSoup(markup=source.content, features='html.parser')
+
+        >>> ths_dat = [th.text for th in parsed_text.find_all('th')]
+        >>> trs_dat = parsed_text.find_all(name='tr')
+
+        >>> tables_list = parse_tr(trs=trs_dat, ths=ths_dat)  # returns a list of lists
+        >>> type(tables_list)
+        list
+        >>> len(tables_list) // 100
+        1
+        >>> tables_list[0]
+        ['AAL',
+         'Ashendon and Aynho Line',
+         '0.00 - 18.29',
+         'Ashendon Junction',
+         'Now NAJ3']
+    """
+
+    records, row_spanned = _prep_records(trs=trs, ths=ths, sep=sep)
+
+    records = _check_row_spanned(records, row_spanned)
+
     if isinstance(ths, bs4.Tag):
         column_names = [th.text.strip() for th in ths.find_all('th')]
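A natural companion to the docstring's doctest is the ``as_dataframe=True`` path that the refactored parse_tr keeps. A hedged sketch continuing the same session (the checks below follow from the documented signature and the column-name handling shown above, not from any output in this commit):

    >>> import pandas as pd
    >>> tables_df = parse_tr(trs=trs_dat, ths=ths_dat, as_dataframe=True)
    >>> isinstance(tables_df, pd.DataFrame)
    True
    >>> len(tables_df.columns) == len(ths_dat)
    True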
@@ -401,7 +406,8 @@ def _get_site_map(source):
     for h3 in h3s:
         h3_title = h3.get_text(strip=True)

-        h3_dl = h3.find_next_sibling(name='dl')
+        # h3_dl = h3.find_next_sibling(name='dl')
+        h3_dl = h3.find_next(name='dl')

         h3_dl_dts = h3_dl.find_all(name='dt')

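The one-line fix above is what repairs get_site_map(): find_next_sibling(name='dl') only inspects elements at the same level of the tree, so it returns None whenever the <dl> sits inside a wrapper element after the heading, whereas find_next(name='dl') keeps walking the rest of the parse tree. A minimal sketch of the difference (hypothetical markup; the real site-map structure is not shown in this diff):

    import bs4

    html = '<h3>Section</h3><div><dl><dt>Entry</dt></dl></div>'
    soup = bs4.BeautifulSoup(html, 'html.parser')
    h3 = soup.find('h3')

    print(h3.find_next_sibling(name='dl'))  # None -- the <dl> is inside a sibling <div>
    print(h3.find_next(name='dl'))          # <dl><dt>Entry</dt></dl>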
