# Library


## Import


In [16]:
from typing import Literal
import os
import sys
import gspread
import re


sys.path.append("../code")
from util import extract_doi_to_txt, get_string_from_text_file

## LLM


In [8]:
def apply_llm(string_data: str):
    """Placeholder for LLM-based extraction.

    The input text is currently ignored; the function always returns the
    constant 1.
    """
    placeholder_result = 1
    return placeholder_result

## Regex


### define regex


In [20]:
# Heading patterns for the "method" section of a paper.
#
# The pattern is written for re.VERBOSE (apply_regex compiles it with that
# flag): layout whitespace is ignored and each trailing `# "..."` inside the
# string is a regex comment listing example headings. Every alternative is
# anchored with ^...$ and accepts optional leading digits and a dot before
# the heading words. The `(\s)?` after a word's first letter tolerates one
# whitespace character there (presumably a text-extraction artifact --
# TODO confirm).
#
# NOTE(review): the second alternative also matches a bare "Background"
# heading because the "and Method(s)" suffix is optional; regex_extra has its
# own "Background" alternative -- confirm this overlap is intended.
#
# The string is later joined to the other pattern groups with "|", so:
# ***Last line should not contain "|"***
regex_method = r"""
    ^\s*\d*[.]?\s?(((O(\s)?ur)|P(\s)?roposed)(\s))?M(\s)?ethod(s)?(ology)?\s*$|# "method" "methods" "methodology"
    ^\s*\d*[.]?\s?(M(\s)?aterial(s)?|B(\s)?ackground)((\s)and(\s)Method(s)?)?\s*$| # "materials and methods" "material" "materials"
    ^\s*\d*[.]?\s?(P(\s)?roposed(\s))?A(\s)?pproach(es)?\s*$| # "approach" "proposed approach"
    ^\s*\d*[.]?\s?(O(\s)?ur(\s))?m(\s)?odel(s)?\s*$| # "our model" "model"
    ^\s*\d*[.]?\s?T(\s)?raining\s*$ # "training"
    
"""

# Heading patterns treated as an exact "results" section. apply_regex gives
# these priority over the looser names in regex_result_others.
#
# Written for re.VERBOSE: the trailing `# "..."` parts inside the string are
# regex comments, and whitespace used for layout is ignored.
#
# NOTE(review): because "Performance " is optional, the second alternative
# also matches a bare "Analysis" heading, which regex_extra lists as well --
# confirm this overlap is intended.
#
# The string is later joined to the other pattern groups with "|", so:
# ***Last line should not contain "|"***
regex_result_only = r"""
    ^\s*\d*[.]?\s?R(\s)?esult(s)?((\s)and(\s)(discussion|analysis))?\s*$| # "result" "results" "results and discussion"
    ^\s*\d*[.]?\s?(P(\s)?erformance(\s))?A(\s)?nalysis\s*$| # "performance analysis"
    ^\s*\d*[.]?\s?((E(\s)?mpirical|E(\s)?xperimental)(\s))?Result(s)?\s*$| # "empirical result" "experimental results"
    ^\s*\d*[.]?\s?(E(\s)?xperiment|D(\s)?ownstream(\s)task)(s)?(\s)and(\s)Result(s)?\s*$ # "experiment and results"
    
"""
# Alternative names for a results-like section ("experiments", "evaluation",
# "comparison", "application"). apply_regex only uses a match from this group
# while no regex_result_only heading has been seen.
#
# Written for re.VERBOSE: the trailing `# "..."` parts inside the string are
# regex comments, and whitespace used for layout is ignored.
#
# The string is later joined to the other pattern groups with "|", so:
# ***Last line should not contain "|"***
regex_result_others = r"""
    ^\s*\d*[.]?\s?E(\s)?xperiment(s)?((\s)and(\s)(discussion|application)(s)?)?\s*$| # "experiment" "experiments" "experiments and discussions"
    ^\s*\d*[.]?\s?E(\s)?valuation(s)?((\s)and(\s)comparison(s)?)?\s*$| # "evaluation" "evaluations" "evaluation and comparisons"
    ^\s*\d*[.]?\s?C(\s)?omparison(s)?((\s)((with(\s)State-of-the-arts)|(to(\s)Human(\s)Performance)))?\s*$| # "comparison" "comparison with state-of-the-arts"
    ^\s*\d*[.]?\s?A(\s)?pplication(s)?\s*$ # "application"
    
"""

# Every other section heading that should be recognised. These are not method
# or result sections themselves, but apply_regex uses each detected heading as
# a boundary: a section's text ends where the next matched heading starts.
#
# Written for re.VERBOSE: the trailing `# "..."` parts inside the string are
# regex comments, and whitespace used for layout is ignored.
#
# The string is later joined to the other pattern groups with "|", so:
# ***Last line should not contain "|"***
regex_extra = r"""
    ^\s*\d*[.]?\s?(F(\s)?urther(\s))?D(\s)?iscussion(s)?\s*$| # "discussion" "discussions"
    ^\s*\d*[.]?\s?R(\s)?eference(s)?\s*$| # "reference" "references"
    ^\s*\d*[.]?\s?(Discussion(s)?(\s)and(\s))?C(\s)?onclusion(s)?\s*$| # "conclusion"
    ^\s*\d*[.]?\s?L(\s)?iterature(\s)Review\s*$| # "literature review"
    ^\s*\d*[.]?\s?I(\s)?ntroduction((\s)and(\s)Motivating(\s)Work)?\s*$| # "introduction"
    ^\s*\d*[.]?\s?A(\s)?bstract\s*$| # "abstract"
    ^\s*\d*[.]?\s?L(\s)?imitation(s)?((\s)(and|&)(\s)(Discussion(s)|(Societal(\s)Impact)|(Future(\s)Work(s)?))?)?\s*$| # "limitation" "limitations"
    ^\s*\d*[.]?\s?A(\s)?ppendi(x)?(ces)?\s*$| # "appendix" "appendices"
    ^\s*\d*[.]?\s?B(\s)?ibliography\s*$| # "bibliography"
    ^\s*\d*[.]?\s?A(\s)?cknowledgement(s)?\s*$| # "acknowledgement" "acknowledgements"
    ^\s*\d*[.]?\s?T(\s)?ables(\s)and(\s)Figures\s*$| # "tables and figures"
    ^\s*\d*[.]?\s?(R(\s)?elated|P(\s)?revious)(\s)W(\s)?ork(s)?\s*$| # "related work"
    ^\s*\d*[.]?\s?A(\s)?nalysis\s*$| # "analysis"
    ^\s*\d*[.]?\s?(A(\s)?dditional(\s))?A(\s)?blation(\s)S(\s)?tud(y|ies)\s*$| # "analysis" "ablation study" "additional ablation study"
    ^\s*\d*[.]?\s?P(\s)?reliminar(y|ies)\s*$| # "preliminary"
    ^\s*\d*[.]?\s?E(\s)?xperiment(\s)Setup\s*$| # "experiment setup"
    ^\s*\d*[.]?\s?B(\s)?ackground\s*$| # "background"
    ^\s*\d*[.]?\s?(B(\s)?roader(\s))?I(\s)?mpact(s)?\s*$| # "broader impact"
    ^\s*\d*[.]?\s?(C(\s)?hallenges(\s)and(\s))?F(\s)?uture(\s)Direction(s)?\s*$| # "challenges and future direction"
    ^\s*\d*[.]?\s?A(\s)?cknowledgment(s)?\s*$| # "Acknowledgments"
    ^\s*\d*[.]?\s?(E(\s)?xperimental(\s))?S(\s)?etup\s*$| # "experimental setup"
    ^\s*\d*[.]?\s?O(\s)?verview\s*$| # "overview"
    ^\s*\d*[.]?\s?(S(\s)?afety(\s))?C(\s)?onsideration(s)?\s*$| # "Safety Considerations"
    ^\s*\d*[.]?\s?F(\s)?uture(\s)work(s)?\s*$| # "future work"
    ^\s*\d*[.]?\s?R(\s)?esource(s)?\s*$ # "resources"
"""

# One alternation covering every heading family. Each group ends without a
# trailing "|", so joining them with "|" keeps the combined pattern well formed.
regex_total = "|".join(
    (
        regex_method,
        regex_result_only,
        regex_result_others,
        regex_extra,
    )
)

### apply regex


In [23]:
def _slice_section(text, start_points, idx, max_chars):
    """Return the text of the section at *idx*, or "" when *idx* is -1.

    The section runs from its own start offset to the start of the next
    detected heading (or to the end of *text* for the last heading) and is
    truncated to at most *max_chars* characters.
    """
    if idx == -1:
        return ""
    begin = start_points[idx]
    if idx >= len(start_points) - 1:
        end = len(text)
    else:
        end = start_points[idx + 1]
    return text[begin : min(end, begin + max_chars)]


def apply_regex(string_data: str, *, max_chars: int = 40000):
    """Locate the method and result sections of a paper and print them.

    Every heading matched by ``regex_total`` is collected in document order.
    The method section is the last heading that matches ``regex_method``.
    The result section is the last heading that matches
    ``regex_result_only``; headings matching ``regex_result_others`` are only
    considered while no ``regex_result_only`` heading has been seen.

    Each section's text starts at its heading match and ends where the next
    detected heading starts (or at the end of the input).

    Args:
        string_data: Full plain text of the paper.
        max_chars: Maximum number of characters kept per section. The default
            of 40000 is the original per-section limit (the original comment
            cites a 50000 char limit on the gs api).

    Returns:
        None. Each matched heading is printed as it is found, followed by the
        method text, the result text and the list of detected headings. If no
        heading is found, "No sections found..." is printed instead.
    """
    base_flags = re.IGNORECASE | re.VERBOSE
    # MULTILINE lets ^/$ anchor on every line when scanning the whole document.
    regex_total_cp = re.compile(regex_total, base_flags | re.MULTILINE)
    # The classifiers run on a single matched heading, so they do not need it.
    regex_method_cp = re.compile(regex_method, base_flags)
    regex_result_others_cp = re.compile(regex_result_others, base_flags)
    regex_result_only_cp = re.compile(regex_result_only, base_flags)

    method_idx = -1
    result_idx = -1
    result_flag = False  # because "Results" section can be called many names
    section_start_point_list = []
    extracted_section_name_list = []
    for i, match in enumerate(regex_total_cp.finditer(string_data)):
        section = match.group()
        if regex_method_cp.search(section):
            method_idx = i

        if regex_result_only_cp.search(section):
            # An exact "Results"-style heading always wins.
            result_idx = i
            result_flag = True
        elif not result_flag and regex_result_others_cp.search(section):
            # Looser names only count until an exact match has been found.
            result_idx = i

        section_start_point_list.append(match.start())
        extracted_section_name_list.append(section.strip())
        print([section])

    if not section_start_point_list:
        print("No sections found...")
        return

    method_text = _slice_section(
        string_data, section_start_point_list, method_idx, max_chars
    )
    result_text = _slice_section(
        string_data, section_start_point_list, result_idx, max_chars
    )

    print("############ METHOD TEXT:\n", method_text)
    print("############ RESULT TEXT:\n", result_text)
    print(
        "############ Extracted Section List:\n",
        "\n".join(extracted_section_name_list),
    )

## Extract


In [19]:
def extract(string_data: str, extraction_method: Literal["regex", "llm"] = "regex"):
    """Run the selected extraction backend on *string_data*.

    Args:
        string_data: Full plain text of the paper.
        extraction_method: ``"regex"`` dispatches to ``apply_regex`` and
            ``"llm"`` dispatches to ``apply_llm``.

    Returns:
        Whatever the selected backend returns.

    Raises:
        ValueError: If *extraction_method* is not one of the supported names.
            Previously this case failed with an UnboundLocalError on ``res``.
    """
    # TODO: return result dictionary
    if extraction_method == "regex":
        return apply_regex(string_data)
    if extraction_method == "llm":
        return apply_llm(string_data)
    raise ValueError(
        f"Unknown extraction_method {extraction_method!r}; expected 'regex' or 'llm'"
    )

## google spreadsheet


In [15]:
# Authenticate with the service-account credentials file.
gc = gspread.service_account(filename="../creds.json")

# Open the evaluation spreadsheet and keep a handle on its first worksheet.
evaluation_spreadsheet = gc.open("Paper Extraction Evaluation")
sh = evaluation_spreadsheet.sheet1

In [18]:
# TODO: get the result dictionary from extract() and write it to the Google Sheet

# Test


In [21]:
# Load the extracted plain text of the sample paper. The encoding is pinned so
# the read does not depend on the platform's default locale encoding.
with open(
    "/Users/ms/cs/ML/MLab/paper-extraction/text/Alayrac et al. - 2022 - Flamingo a Visual Language Model for Few-Shot Lea.pdf.txt",
    "r",
    encoding="utf-8",
) as file:
    content = file.read()

In [24]:
extract(content, extraction_method="regex")

1