In [1]:
import re

import torch

from transformers import MarkupLMFeatureExtractor
from transformers import AutoProcessor, AutoModelForSequenceClassification

import pandas as pd 
import numpy as np

In [172]:
# Notebook-wide pandas display settings, kept in one table so they are easy to scan.
_DISPLAY_OPTIONS = {
    'display.max_rows': None,
    'display.max_columns': 500,
    'display.width': None,
    'max_colwidth': 100,
}

for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [10]:
def filter_p_tags(node_xpath_pairs, max_length=30):
    """Keep short text nodes that sit under a ``<p>`` inside ``<body>``, outside tables.

    The xpath is split into steps and compared by whole tag name (any
    ``[n]`` position suffix is ignored). A plain substring test would treat
    every path containing the letter "p" (for example ``/document/type``) as
    a paragraph, and would treat ``tbody`` as ``body``.

    Args:
        node_xpath_pairs: iterable of tuples whose first item is the node text
            and whose second item is the node's xpath string.
        max_length: maximum number of whitespace-separated words the node text
            may contain to be kept.

    Returns:
        A list with one entry per kept pair: the original tuple extended with
        an ``"idx=<position>"`` string giving its position in the input.
    """
    filtered_pairs = []
    for idx, pair in enumerate(node_xpath_pairs):
        node_text, xpath = pair[0], pair[1]

        # Tag names along the path, with positional suffixes such as "[6]" removed.
        tag_names = {step.split("[", 1)[0] for step in xpath.split("/") if step}

        if "body" not in tag_names or "p" not in tag_names or "table" in tag_names:
            continue

        # The length limit is counted in words, not characters.
        if len(node_text.split()) <= max_length:
            filtered_pairs.append((*pair, f"idx={idx}"))

    return filtered_pairs


def get_text_tag(node_xpath_pairs, text):
    """Collect the pairs whose lower-cased node text matches a regex.

    ``text`` is used as a regular-expression pattern and searched for in the
    lower-cased node text (``pair[0]``), so a pattern containing upper-case
    literals will not match. Each hit is returned as the original tuple
    extended with an ``"idx=<position>"`` string giving its position in the
    input.
    """
    pattern = fr"{text}"
    return [
        (*entry, f"idx={position}")
        for position, entry in enumerate(node_xpath_pairs)
        if re.search(pattern, entry[0].lower())
    ]

def get_table_tags(node_xpath_pairs):
    """Return the pairs whose xpath mentions ``table`` near its end.

    Only the part of the xpath after its fifth-from-last ``/`` is inspected
    (at most the last five steps), using a substring test for ``'table'``.
    Each hit is the original tuple extended with an ``"idx=<position>"``
    string.
    """
    def _path_tail(xpath):
        # Drop everything before the last five '/'-separated pieces.
        return "/".join(xpath.rsplit("/", 5)[1:])

    return [
        (*entry, f"idx={position}")
        for position, entry in enumerate(node_xpath_pairs)
        if 'table' in _path_tail(entry[1])
    ]
    


def get_div_tags(node_xpath_pairs):
    """Return the pairs whose xpath mentions ``div`` near its end.

    Only the part of the xpath after its fifth-from-last ``/`` is inspected
    (at most the last five steps), using a substring test for ``'div'``.
    Each hit is the original tuple extended with an ``"idx=<position>"``
    string.
    """
    def _path_tail(xpath):
        # Drop everything before the last five '/'-separated pieces.
        return "/".join(xpath.rsplit("/", 5)[1:])

    return [
        (*entry, f"idx={position}")
        for position, entry in enumerate(node_xpath_pairs)
        if 'div' in _path_tail(entry[1])
    ]


In [11]:
# Directory that holds the sample HTML documents.
# NOTE(review): this is an absolute, machine-specific path; consider making it configurable.
_PLAYGROUND_DIR = "/Users/sukritrao/Documents/NYU/Coursework/Spring2023/Independent-Study/project/playground"

# File names inside _PLAYGROUND_DIR; the order matters because cells select by position.
_HTML_FILENAMES = (
    "alaska_project8.htm",
    "Community Bankers Trust Corporation_United Bankshares, Inc.htm",
    "County Bancorp, Inc._Nicolet Bankshares, Inc.html",
    "Perspecta Inc._Veritas Capital.htm",
    "MTS Systems Corporation_Amphenol Corporation.htm",
    "Select_Bancorp_Inc_First_Bancorp.htm",
)

input_html_filelist = [f"{_PLAYGROUND_DIR}/{_filename}" for _filename in _HTML_FILENAMES]

In [152]:
# Position in input_html_filelist of the document to analyse in this run.
file_idx = 1

In [153]:
# Path of the selected document.
input_html = input_html_filelist[file_idx]

In [154]:
# Read the whole document into memory as text. No encoding is passed to open(),
# so the platform default encoding is used.
with open(input_html) as f:
      html_code = f.read()

In [155]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cpu


In [156]:
# Run the MarkupLM feature extractor over the raw HTML string. The returned encoding
# is read below through its 'nodes' and 'xpaths' keys.
feature_extractor = MarkupLMFeatureExtractor()
encoding = feature_extractor(html_code)

In [157]:
# Take the first entry of each field (presumably the only one, since a single
# HTML string was passed in -- TODO confirm).
nodes = encoding['nodes'][0]
xpaths = encoding['xpaths'][0]

# Earlier experiment: keep only the tail of each xpath (its last five steps).
# cropped_xpaths = ["/".join(x.rsplit("/", 5)[1:]) for x in xpaths]

# node_xpath_pairs = list(zip(nodes, cropped_xpaths))
# Pair every node text with its full xpath; positions in this list are the
# "idx" values reported by the helper functions.
node_xpath_pairs = list(zip(nodes, xpaths))

In [158]:
# Spot-check a handful of (node text, xpath) pairs, one per paragraph of output.
for x in node_xpath_pairs[10:15]:
    print(x, '\n')

('UNITED BANKSHARES, INC.', '/document/type/sequence/filename/description/text/html/body/center/div/p[6]/b') 

('and', '/document/type/sequence/filename/description/text/html/body/center/div/p[7]') 

('COMMUNITY BANKERS TRUST\nCORPORATION', '/document/type/sequence/filename/description/text/html/body/center/div/p[8]/b') 

('Table of Contents', '/document/type/sequence/filename/description/text/html/body/p/center/div/p[1]/u') 

('Page', '/document/type/sequence/filename/description/text/html/body/p/center/div/table/tr[2]/td[5]') 



In [159]:
# Short paragraph-level nodes (default word limit of filter_p_tags).
filtered_p_tags = filter_p_tags(node_xpath_pairs)
# Manual lookup used while exploring:
# filtered_p_tags[2305]

# Alaska Project

## Section Title
"the following terms have the following meanings" (node idx 379)


## Signature Page

"in witness whereof" (node idx 2305)

## Table of Contents

"table of contents" (node idx 13)

# Community Bankers

## Section Title

"following terms are used in this agreement" (node idx 247)


### Signature

"in witness" (node idx 1701)

# Perspecta

### Section Title
"the following terms have the meanings set forth" (node idx 349)


### Signature 

"in witness whereof" (node idx 2809)

In [160]:
# get_text_tag(filtered_p_tags, 'section')
# get_text_tag(get_div_tags(node_xpath_pairs), '1')

In [161]:
# Locate the signature-page marker; the match is against the lower-cased node text.
get_text_tag(node_xpath_pairs, "in witness")

[('IN WITNESS WHEREOF',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/center/div/p[1]/b',
  'idx=1701'),
 ('IN WITNESS WHEREOF, Essex Bank and United Bank have each caused this Agreement and Plan of\nMerger to be executed as of the date first above written.',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/center/div/p[1]',
  'idx=2059'),
 ('IN WITNESS WHEREOF, the parties hereto have caused this Agreement to be executed as of the\nday first above written.',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/

In [162]:
# Show the nodes around a position of interest (2 before, 10 from it onwards).
# NOTE(review): the notes in this notebook list 379 as the Alaska Project section-title
# index, while file_idx = 1 selects the Community Bankers document (section title noted
# at 247) -- confirm which index is intended here.
node_idx = 379
node_xpath_pairs[node_idx - 2 : node_idx + 10]

[('“',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/center/div/p[15]'),
 ('Effective Date',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/center/div/p[15]/i[1]'),
 ('”',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/center/div/p[15]'),
 ('has the meaning set forth in Section\xa02.02.',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/center/div/p[15]'),
 ('“',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/center/div/p[16]'),
 ('Effective Time',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/center/div/p[16]/i[1]'),
 ('”',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/center/div/p[16]'),
 ('has the meaning set forth in Section\xa02.02.',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/center/div/p[16]'),
 ('“',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/

In [163]:
def _split_xpath_step(step):
    """Split one xpath step into ``(tag_name, index)``.

    ``'p[6]'`` becomes ``('p', 6)`` and ``'div'`` becomes ``('div', None)``.
    A step that does not look like ``name`` or ``name[n]`` (including the
    empty string) is returned unchanged with a ``None`` index.
    """
    match = re.fullmatch(r"([^\[\]]+)(?:\[(\d+)\])?", step)
    if match is None:
        return step, None
    name, index = match.groups()
    return name, (int(index) if index is not None else None)


def get_tag_level(full_xpath, debug=True):
    """Return the positional index of the ``p`` or ``div`` step closing an xpath.

    Only the last two steps of ``full_xpath`` are examined. A ``p`` step takes
    priority over a ``div`` step, and within the same tag the last step takes
    priority over the second-to-last one. The index is the number in square
    brackets, e.g. ``.../div/p[6]/b`` gives ``6``.

    Tag names are compared exactly, so tags that merely start with "p"
    (``pre``, ``param``, ...) or with "div" are not mistaken for ``p``/``div``,
    and malformed or empty steps are ignored instead of raising.

    Args:
        full_xpath: xpath string of a node.
        debug: when true, print the intermediate decisions.

    Returns:
        The integer index, or ``None`` when the xpath belongs to a table, when
        neither of the last two steps is a ``p``/``div``, or when the selected
        step carries no index.
    """
    table_tags = ['table', 'tr', 'td']

    if debug:
        print(f"full_xpath={full_xpath}")

    # if all the above tags are in the full_xpath then
    # it is part of a table, skip (substring test on the whole path)
    if all(x in full_xpath for x in table_tags):
        if debug:
            print(f'given xpath {full_xpath} corresponds to a table, skipping! ')
        return None

    # check the last two tags for a p or div tag
    last_two_tags = full_xpath.rsplit('/', 2)[1:]

    if debug:
        print(f"last_two_tags: {last_two_tags}")

    steps = [_split_xpath_step(step) for step in last_two_tags]

    for tag_name in ('p', 'div'):
        # Walk from the last step backwards so the last step wins over the second-to-last.
        indices = [index for name, index in reversed(steps) if name == tag_name]
        if indices:
            tag_level = indices[0]
            if debug:
                print(f"{tag_name} tag present in last two tags. last_two_tags: {last_two_tags}. Index={tag_level}")
            return tag_level

    return None
       
        
        

In [212]:
# One row per extracted node: its text and its full xpath, in document order.
df = pd.DataFrame(node_xpath_pairs, columns=['node_text', 'full_xpath'])

In [213]:
# Derive each node's tag level from its xpath; keyword arguments given to
# Series.apply are forwarded to get_tag_level, so debug printing stays off.
df['tag_level'] = df['full_xpath'].apply(get_tag_level, debug=False)

In [214]:
def _make_tag_levels_non_decreasing(levels):
    """Shift repeated numbering runs so the known levels never decrease.

    Missing levels stay missing. Among the known levels, every drop relative
    to the previous known level starts a new "run"; each run is shifted by the
    sum of the maxima of all earlier runs. Equal consecutive levels stay equal.

    Example: ``1, 2, 3, NaN, 1, 1, 2, 1`` -> ``1, 2, 3, NaN, 4, 4, 5, 6``.

    Adding only the running maximum is not enough: for ``1..5, 1, 2, 1, 2`` it
    yields ``6, 7, 6, 7``, so the second and third runs collide.
    """
    numeric_levels = pd.to_numeric(levels, errors='coerce')
    known = numeric_levels.dropna()

    # 0 for the first run, incremented at every drop in the known levels.
    run_id = known.diff().lt(0).cumsum()

    # Offset of a run = sum of the maxima of all runs before it (0 for the first run).
    run_offset = known.groupby(run_id).max().cumsum().shift(fill_value=0)

    return (known + run_id.map(run_offset)).reindex(levels.index)


# make sure the tag_levels are unique. In case of repetitions, shift them to make
# the tag_levels non-decreasing

# make a copy of the original tag level for debugging
df['original_tag_level'] = df['tag_level']

# debugging column kept from the earlier approach: running maximum of the original
# levels (missing levels counted as 0.0), reset to 0.0 on rows without a level
_filled_tag_level = pd.to_numeric(df['original_tag_level'], errors='coerce').fillna(0.0)
df['tag_level_accumulate'] = _filled_tag_level.cummax()
df.loc[_filled_tag_level == 0.0, 'tag_level_accumulate'] = 0.0

# rows without a level stay NaN; every repeated numbering run is moved past the
# runs before it, so identical tag_level values only occur within the same run
df['tag_level'] = _make_tag_levels_non_decreasing(df['original_tag_level'])

In [215]:
def merge_text(group):
    """Collapse one group of rows into a single space-separated string.

    Reads the ``node_text`` column of ``group`` and returns a one-element
    Series labelled ``concat_node_text`` holding the joined text.
    """
    joined_text = " ".join(group['node_text'])
    return pd.Series([joined_text], index=['concat_node_text'])

In [216]:
# Join the node texts of each tag_level group into one string. Aggregating only the
# 'node_text' column avoids GroupBy.apply over a frame that still contains the
# grouping column, and yields the same two columns: 'tag_level', 'concat_node_text'.
merged_text_df = (
    df.groupby('tag_level')['node_text']
    .agg(" ".join)
    .rename('concat_node_text')
    .reset_index(drop=False)
)

In [217]:
# Attach the merged text of each tag_level group back onto every row of that group.
df = df.assign(concat_node_text=df['tag_level'].map(merged_text_df.set_index('tag_level')['concat_node_text']))
# Rows the mapping left empty (no matching tag_level) fall back to their own node text.
df['concat_node_text'] = df['concat_node_text'].fillna(df['node_text'])

In [231]:
df

Unnamed: 0,node_text,full_xpath,tag_level,original_tag_level,tag_level_accumulate,concat_node_text
0,EX-2.1,/document/type,,,0.0,EX-2.1
1,2,/document/type/sequence,,,0.0,2
2,d158023dex21.htm,/document/type/sequence/filename,,,0.0,d158023dex21.htm
3,EX-2.1,/document/type/sequence/filename/description,,,0.0,EX-2.1
4,EX-2.1,/document/type/sequence/filename/description/text/html/head/title,,,0.0,EX-2.1
5,Exhibit 2.1,/document/type/sequence/filename/description/text/html/body/center/div/p[1]/b,1.0,1.0,1.0,Exhibit 2.1
6,Execution Version,/document/type/sequence/filename/description/text/html/body/center/div/p[2]/b/i,,,0.0,Execution Version
7,AGREEMENT AND PLAN OF REORGANIZATION,/document/type/sequence/filename/description/text/html/body/center/div/p[3]/b,3.0,3.0,3.0,AGREEMENT AND PLAN OF REORGANIZATION
8,"dated as of June 2, 2021",/document/type/sequence/filename/description/text/html/body/center/div/p[4],4.0,4.0,4.0,"dated as of June 2, 2021"
9,by\nand between,/document/type/sequence/filename/description/text/html/body/center/div/p[5],5.0,5.0,5.0,by\nand between


# Errors

1. Community Bankers Trust Corporation_United Bankshares, Inc.htm - Tag levels repeat in different parts of the document, so nodes get merged with the wrong tag level. Try to find an identifying pattern in the xpath.
    1.1 For example, /document/type/sequence/filename/description/text/html/body/center/div/p[1]/b and /document/type/sequence/filename/description/text/html/body/p/center/div/p[1]/u both yield tag level 1.
    
    **-> SOLVED - using the tag_level accumulate logic**

In [177]:
np.maximum.accumulate([1, 1, 1, 2, 3, 4, 5, 1, 2])


array([1, 1, 1, 2, 3, 4, 5, 5, 5])

In [178]:
# Adding the running maximum to every value unconditionally also changes the values
# that never repeated (compare the output), hence the conditional np.where used elsewhere.
np.maximum.accumulate([1, 1, 1, 2, 3, 4, 5, 1, 2])  + np.array([1, 1, 1, 2, 3, 4, 5, 1, 2]) 

array([ 2,  2,  2,  4,  6,  8, 10,  6,  7])

In [223]:
sample_df = pd.DataFrame({'col1': [1, 1, None, 2, 3, 3, 5, 1, 1, None, 2]})

In [224]:
sample_df

Unnamed: 0,col1
0,1.0
1,1.0
2,
3,2.0
4,3.0
5,3.0
6,5.0
7,1.0
8,1.0
9,


In [225]:
sample_df['col1'] = sample_df['col1'].fillna(0)

In [226]:
sample_df

Unnamed: 0,col1
0,1.0
1,1.0
2,0.0
3,2.0
4,3.0
5,3.0
6,5.0
7,1.0
8,1.0
9,0.0


In [227]:
sample_df['col2'] = np.maximum.accumulate(sample_df['col1'])

In [228]:
sample_df

Unnamed: 0,col1,col2
0,1.0,1.0
1,1.0,1.0
2,0.0,1.0
3,2.0,2.0
4,3.0,3.0
5,3.0,3.0
6,5.0,5.0
7,1.0,5.0
8,1.0,5.0
9,0.0,5.0


In [229]:
sample_df.loc[sample_df['col1'] == 0, 'col2'] = 0

In [230]:
sample_df

Unnamed: 0,col1,col2
0,1.0,1.0
1,1.0,1.0
2,0.0,0.0
3,2.0,2.0
4,3.0,3.0
5,3.0,3.0
6,5.0,5.0
7,1.0,5.0
8,1.0,5.0
9,0.0,0.0


In [190]:
# Only where the running maximum (col2) exceeds the value itself (col1) is the maximum
# added; all other rows keep col1 unchanged.
sample_df['col3'] = np.where(sample_df['col2'] > sample_df['col1'], sample_df['col2'] + sample_df['col1'], sample_df['col1'])

In [191]:
sample_df

Unnamed: 0,col1,col2,col3
0,1.0,1.0,1.0
1,1.0,1.0,1.0
2,0.0,0.0,0.0
3,2.0,2.0,2.0
4,3.0,3.0,3.0
5,3.0,3.0,3.0
6,5.0,5.0,5.0
7,1.0,5.0,6.0
8,1.0,5.0,6.0
9,0.0,0.0,0.0
