# Main

Using the notebook allows you to load the model once and use it as many times as you
want. It also makes the workflow more resilient to errors, such as a bad path.

In [1]:
from pathlib import Path

# Corpus tag: names the per-corpus sub-directory used under every stage folder.
# Other tags that have been used here:
#   "persuasive_essays_paragraph"
#   "persuasive_essays_paragraph_all_linked"
#   "cdcp"
#   "drinventor"
INFO_TAG = "abstrct"

# Batch tag: names the sub-directory of documents fed to inference and the
# Brat export folder.
PROCESS_TAG = "granma_letters"

# Root folder that every data path below is built from.
BASE_DATA = Path("data")

SOURCE_LANGUAGE = "english"
TARGET_LANGUAGE = "spanish"

# --- Corpus Projection: one folder per stage, each keyed by INFO_TAG ---
CORPUS = BASE_DATA.joinpath("corpus", INFO_TAG)
PROCESSED_CORPUS = BASE_DATA.joinpath("parsed_to_conll", INFO_TAG)
SENTENCE_ALIGN = BASE_DATA.joinpath("sentence_alignment", INFO_TAG)
BIDIRECTIONAL_ALIGN = BASE_DATA.joinpath("bidirectional_alignment", INFO_TAG)
PROJECTION = BASE_DATA.joinpath("projection", INFO_TAG)

# --- Link Prediction: input batch, then outputs keyed by corpus and batch ---
TO_PROCESS = BASE_DATA.joinpath("to_process", PROCESS_TAG)
SEGMENTER = BASE_DATA.joinpath("segmenter_processed", INFO_TAG, PROCESS_TAG)
LINK_PREDICTION = BASE_DATA.joinpath("link_prediction_processed", INFO_TAG, PROCESS_TAG)

# --- Export to Brat: lives outside BASE_DATA, keyed by batch only ---
BRAT = Path("brat") / "data" / PROCESS_TAG


# Corpus Projection

In [2]:
# !pip install deep_translator

from pipelines.corpus_pipelines import full_corpus_processing_pipeline, make_alignemnts_pipeline

from aligner.aligner import AwesomeAlignAligner as Aligner
from corpus_parser.unified_parser import UnifiedParser as Parser
from projector.projector import CrossLingualAnnotationProjector as Projector
from sentence_aligner.sentence_aligner import SentenceAligner
from sentence_aligner.translator import GoogleDeepTranslator as Translator
from data_augmentation.translation_augmentation import TranslateDataAugmentator as DataAugmentator

# Build every pipeline component once and share it across the splits instead
# of re-creating all of them on each loop iteration; their construction does
# not depend on the split.
# NOTE(review): assumes none of these components keeps per-split state that
# must be reset between runs - confirm against their implementations.
corpus_parser = Parser()
sentence_aligner = SentenceAligner(Translator())
aligner = Aligner()
projector = Projector()
data_augmentator = DataAugmentator()

for split in ("dev", "test", "train"):

    # Split name followed by a blank line, to separate each split's output.
    print(split)
    print()

    # Run the full pipeline for this split; every input and output folder is
    # the corresponding stage folder with the split name appended.
    full_corpus_processing_pipeline(
        corpus_dir=CORPUS / split,
        standard_corpus_dest_dir=PROCESSED_CORPUS / split,
        sentence_alignment_dest_dir=SENTENCE_ALIGN / split,
        bidirectional_alignment_dest_dir=BIDIRECTIONAL_ALIGN / split,
        projection_dest_dir=PROJECTION / split,
        corpus_parser=corpus_parser,
        sentence_aligner=sentence_aligner,
        aligner=aligner,
        projector=projector,
        data_augmentator=data_augmentator,
        source_language=SOURCE_LANGUAGE,
        target_language=TARGET_LANGUAGE,
        # The middle language is deliberately the same value as the target.
        middle_language=TARGET_LANGUAGE,
        use_spacy=True,
    )

    # Alternative kept for manual use: appears to start from the corpus
    # already written to PROCESSED_CORPUS rather than from the raw corpus.
    # Uncomment it (and comment out the call above) to use it.
#     make_alignemnts_pipeline(
#         standard_corpus_dir=PROCESSED_CORPUS / split,
#         sentence_alignment_dest_dir=SENTENCE_ALIGN / split,
#         bidirectional_alignment_dest_dir=BIDIRECTIONAL_ALIGN / split,
#         projection_dest_dir=PROJECTION / split,
#         sentence_aligner=sentence_aligner,
#         aligner=aligner,
#         projector=projector,
#         data_augmentator=data_augmentator,
#         source_language=SOURCE_LANGUAGE,
#         target_language=TARGET_LANGUAGE,
#         use_spacy=True,
#     )
    
    
    

Collecting deep_translator
  Downloading deep_translator-1.9.0-py3-none-any.whl (29 kB)
Installing collected packages: deep_translator
Successfully installed deep_translator-1.9.0
[0m--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/utils/logging.py", line 177, in emit
    self.console.print(renderable, overflow="ignore", crop=False, style=style)
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1673, in print
    extend(render(renderable, render_options))
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1305, in render
    for render_output in iter_render:
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/utils/logging.py", line 134, in __rich_console__
    for line in lines:
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/segment.py", line 249, in split_lines
    for segment in segments:
  File "/usr/local/lib/python3.8/dist-

Token indices sequence length is longer than the specified maximum sequence length for this model (1496 > 512). Running this sequence through the model will result in indexing errors


test















train

















# Link Prediction

In [None]:
from pipelines.segmenter_pipelines import perform_full_inference_pipeline, perform_segmentation_pipeline, perform_link_prediction_pipeline


In [None]:
from segmenter.tf_segmenter import TensorflowArgumentSegmenter as Segmenter

# Create the segmenter for the configured corpus tag and target language.
# It lives in its own cell so it can be built once and reused by the inference
# cell further down.
# NOTE(review): presumably this loads the trained model for that
# corpus/language pair - confirm in TensorflowArgumentSegmenter.
segmenter = Segmenter(INFO_TAG, TARGET_LANGUAGE)


In [None]:
from link_prediction.tf_link_predictor import TensorflowLinkPredictor as LinkPredictor

# Create the link predictor for the configured corpus tag and target language.
# It lives in its own cell so it can be built once and reused by the inference
# cell further down.
# NOTE(review): presumably this loads the trained model for that
# corpus/language pair - confirm in TensorflowLinkPredictor.
link_predictor = LinkPredictor(INFO_TAG, TARGET_LANGUAGE)

In [None]:

# This cell holds three alternative inference runs. As written, only the
# link-prediction-only variant (option 2) is active; the others are commented out.

# Option 1 - segmentation only: TO_PROCESS -> SEGMENTER.

# perform_segmentation_pipeline(
#     segmenter=segmenter,
#     source_dir=TO_PROCESS,
#     destination_dir=SEGMENTER,
#     language=TARGET_LANGUAGE,
# )

# Option 2 - link prediction only: SEGMENTER -> LINK_PREDICTION.
# The input is the segmenter's output folder, so segmentation must already
# have been run for this batch.

perform_link_prediction_pipeline(
    source_dir=SEGMENTER,
    destination_dir=LINK_PREDICTION,
    link_predictor=link_predictor,
    source_language=TARGET_LANGUAGE,
)

# Option 3 - segmentation and link prediction in one call:
# TO_PROCESS -> SEGMENTER -> LINK_PREDICTION.

# perform_full_inference_pipeline(
#     segmenter=segmenter,
#     link_predictor=link_predictor,
#     source_dir=TO_PROCESS,
#     segmenter_destination_dir=SEGMENTER,
#     destination_dir=LINK_PREDICTION,
#     source_language=TARGET_LANGUAGE
# )

# Export to Brat

In [None]:
from corpus_parser.brat_parser import BratParser
from corpus_parser.conll_parser import ConllParser

# Parse the link-prediction output folder with a BIOES-enabled CoNLL parser.
conll_parser = ConllParser(bioes=True)
dataframes_dict = conll_parser.parse_dir(LINK_PREDICTION)

# Export the parsed result to the Brat folder for this batch.
brat_exporter = BratParser()
brat_exporter.export_from_dataframes(BRAT, dataframes_dict)
