# Main

Using the notebook allows you to load the model once and use it as many times as you
want. It also makes the code more resilient to errors, such as a bad path.

In [1]:
from pathlib import Path

# Tag of the corpus in use: every corpus-specific directory below is keyed by it.
# Keep exactly one of these assignments active.
INFO_TAG = "persuasive_essays_paragraph_all_linked"
# INFO_TAG = "cdcp"
# INFO_TAG = "abstrct"

# Tag of the document set found under to_process/ and of the outputs derived from it.
PROCESS_TAG = "granma_letters"
# Root folder for all pipeline inputs and outputs (relative path).
BASE_DATA = Path("data")

SOURCE_LANGUAGE = "english"
TARGET_LANGUAGE = "spanish"

# Corpus Projection: one directory per stage, each keyed by INFO_TAG
CORPUS = BASE_DATA.joinpath("corpus", INFO_TAG)
PROCESSED_CORPUS = BASE_DATA.joinpath("parsed_to_conll", INFO_TAG)
SENTENCE_ALIGN = BASE_DATA.joinpath("sentence_alignment", INFO_TAG)
BIDIRECTIONAL_ALIGN = BASE_DATA.joinpath("bidirectional_alignment", INFO_TAG)
PROJECTION = BASE_DATA.joinpath("projection", INFO_TAG)

# Link Prediction: input keyed by PROCESS_TAG, outputs keyed by INFO_TAG and PROCESS_TAG
TO_PROCESS = BASE_DATA.joinpath("to_process", PROCESS_TAG)
SEGMENTER = BASE_DATA.joinpath("segmenter_processed", INFO_TAG, PROCESS_TAG)
LINK_PREDICTION = BASE_DATA.joinpath("link_prediction_processed", INFO_TAG, PROCESS_TAG)

# Export to Brat: this one is not placed under BASE_DATA
BRAT = Path("brat") / "data" / PROCESS_TAG


# Corpus Projection

In [None]:
# !pip install deep_translator

from pipelines.corpus_pipelines import full_corpus_processing_pipeline, make_alignemnts_pipeline

from aligner.aligner import AwesomeAlignAligner as Aligner
from corpus_parser.unified_parser import UnifiedParser as Parser
from projector.projector import CrossLingualAnnotationProjector as Projector
from sentence_aligner.sentence_aligner import SentenceAligner
from sentence_aligner.translator import GoogleDeepTranslator as Translator
from data_augmentation.translation_augmentation import TranslateDataAugmentator as DataAugmentator

# Run the full corpus-processing pipeline once for each dataset split.
for split in ("dev", "test", "train"):
    # Print the split name followed by an empty line.
    print(split, end="\n\n")

    # Every directory is the per-stage root with the split name appended.
    # New component instances are constructed on every iteration.
    full_corpus_processing_pipeline(
        corpus_dir=CORPUS.joinpath(split),
        standard_corpus_dest_dir=PROCESSED_CORPUS.joinpath(split),
        sentence_alignment_dest_dir=SENTENCE_ALIGN.joinpath(split),
        bidirectional_alignment_dest_dir=BIDIRECTIONAL_ALIGN.joinpath(split),
        projection_dest_dir=PROJECTION.joinpath(split),
        corpus_parser=Parser(),
        sentence_aligner=SentenceAligner(Translator()),
        aligner=Aligner(),
        projector=Projector(),
        data_augmentator=DataAugmentator(),
        source_language=SOURCE_LANGUAGE,
        target_language=TARGET_LANGUAGE,
        # The middle language is set to the target language here.
        middle_language=TARGET_LANGUAGE,
        use_spacy=True,
    )
    
#     make_alignemnts_pipeline(
#         standard_corpus_dir=PROCESSED_CORPUS/ split,
#         sentence_alignment_dest_dir=SENTENCE_ALIGN/ split,
#         bidirectional_alignment_dest_dir=BIDIRECTIONAL_ALIGN/ split,
#         projection_dest_dir=PROJECTION/ split,
#         sentence_aligner=SentenceAligner(Translator()),
#         aligner=Aligner(),
#         projector=Projector(),
#         data_augmentator=DataAugmentator(),
#         source_language=SOURCE_LANGUAGE,
#         target_language=TARGET_LANGUAGE,
#         use_spacy=True,
#     )
    
    
    

# Link Prediction

In [2]:
from pipelines.segmenter_pipelines import perform_full_inference_pipeline, perform_segmentation_pipeline, perform_link_prediction_pipeline


In [3]:
from segmenter.tf_segmenter import TensorflowArgumentSegmenter as Segmenter

# Build the segmenter once for the selected corpus tag and target language so that
# later cells can reuse the same `segmenter` object. Per the recorded cell output,
# construction prints vocabulary sizes and the first entries of the index mappings.
# NOTE(review): presumably this is where the trained model is loaded — confirm.
segmenter = Segmenter(INFO_TAG, TARGET_LANGUAGE)


pos_amount 12
tag_amount 11
char_amount 90
word_amount 9280
max_word_size 19
max_seq_size 579

word_to_index
Length: 9282
First 20: [('', 0), ('[UNK]', 1), ('noun', 2), ('det', 3), ('verb', 4), ('adp', 5), ('.', 6), ('adj', 7), ('conj', 8), (',', 9), ('de', 10), ('adv', 11), ('pron', 12), ('la', 13), ('que', 14), ('en', 15), ('los', 16), ('y', 17), ('a', 18), ('el', 19)]

tag_to_index
Length: 13
First 20: [('', 0), ('[UNK]', 1), ('I-Premise', 2), ('O', 3), ('I-Claim', 4), ('I-MajorClaim', 5), ('E-Premise', 6), ('B-Premise', 7), ('E-Claim', 8), ('B-Claim', 9), ('E-MajorClaim', 10), ('B-MajorClaim', 11), ('S-MajorClaim', 12)]

char_to_index
Length: 92
First 20: [('', 0), ('[UNK]', 1), (' ', 2), ('e', 3), ('N', 4), ('a', 5), ('D', 6), ('s', 7), ('o', 8), ('n', 9), ('O', 10), ('E', 11), ('r', 12), ('i', 13), ('A', 14), ('U', 15), ('d', 16), ('l', 17), ('t', 18), ('V', 19)]

pos_to_index
Length: 14
First 20: [('', 0), ('[UNK]', 1), ('NOUN', 2), ('DET', 3), ('VERB', 4), ('ADP', 5), ('.', 6),

In [None]:
from link_prediction.tf_link_predictor import TensorflowLinkPredictor as LinkPredictor

# Build the link predictor once for the selected corpus tag and target language; the
# resulting `link_predictor` is the object passed to the link-prediction pipelines.
# NOTE(review): presumably this is where the trained model is loaded — confirm.
link_predictor = LinkPredictor(INFO_TAG, TARGET_LANGUAGE)

In [5]:

# Only segmentation: run the already-built `segmenter` with TO_PROCESS as the source
# directory and SEGMENTER as the destination directory, using TARGET_LANGUAGE.
# NOTE(review): presumably reads documents from source_dir and writes results to
# destination_dir — confirm against pipelines.segmenter_pipelines.

perform_segmentation_pipeline(
    segmenter=segmenter,
    source_dir=TO_PROCESS,
    destination_dir=SEGMENTER,
    language=TARGET_LANGUAGE,
)

# Only link prediction (Segmentation must be done first)

# perform_link_prediction_pipeline(
#     link_predictor=link_predictor,
#     source_dir=SEGMENTER,
#     destination_dir=LINK_PREDICTION,
#     source_language=TARGET_LANGUAGE
# )

# Both processes Segmentation and Link prediction

# perform_full_inference_pipeline(
#     segmenter=segmenter,
#     link_predictor=link_predictor,
#     source_dir=TO_PROCESS,
#     segmenter_destination_dir=SEGMENTER,
#     destination_dir=LINK_PREDICTION,
#     source_language=TARGET_LANGUAGE
# )

In [None]:
from link_prediction.tf_link_predictor import TensorflowLinkPredictor as LinkPredictor

# Only link prediction (segmentation must have been run first, since its output
# directory SEGMENTER is the source directory here).
#
# Reuse the `link_predictor` built in the earlier link-predictor cell of this section
# instead of constructing LinkPredictor(INFO_TAG, TARGET_LANGUAGE) a second time with
# the same arguments. This keeps a single place where the predictor is created, in
# line with the notebook's purpose of loading each model once and reusing it. Run
# that earlier cell before this one.
perform_link_prediction_pipeline(
    link_predictor=link_predictor,
    source_dir=SEGMENTER,
    destination_dir=LINK_PREDICTION,
    source_language=TARGET_LANGUAGE,
)


dev relations 2236
dev source argumentative units 912
dev target argumentative units 418
test relations 5580
test source argumentative units 2226
test target argumentative units 1036
train relations 18874
train source argumentative units 7537
train target argumentative units 3460
Vocab size 10305
Relation tags ['attacks_Inverse', 'supports', 'supports_Inverse', 'attacks']
Proposition tags ['MajorClaim', 'Claim', 'Premise']
max_size_prop 70
max_amount_doc 26
[('supports', 'Premise', 'Claim'), ('supports', 'Premise', 'Claim'), ('supports', 'Premise', 'Claim')]
3


# Export to Brat

In [None]:
from corpus_parser.brat_parser import BratParser
from corpus_parser.conll_parser import ConllParser

# Parse the link-prediction output directory with the CoNLL parser (bioes=True).
conll_parser = ConllParser(bioes=True)
dataframes_dict = conll_parser.parse_dir(LINK_PREDICTION)

# Hand the parsed dataframes to the Brat parser for export, with BRAT as the target.
brat_exporter = BratParser()
brat_exporter.export_from_dataframes(BRAT, dataframes_dict)
